diff --git a/DEPS b/DEPS index 21025f25e0..dcf924f9fc 100644 --- a/DEPS +++ b/DEPS @@ -43,7 +43,7 @@ deps = { # Make sure the version matches the one in # src/packager/third_party/boringssl, which contains perl generated files. "src/packager/third_party/boringssl/src": - Var("github") + "/google/boringssl@fc9c67599d9bdeb2e0467085133b81a8e28f77a4", + Var("github") + "/google/boringssl@76918d016414bf1d71a86d28239566fbcf8aacf0", "src/packager/third_party/curl/source": Var("github") + "/curl/curl@62c07b5743490ce373910f469abc8cdc759bec2b", #7.57.0 diff --git a/packager/third_party/boringssl/BUILD.generated.gni b/packager/third_party/boringssl/BUILD.generated.gni index f59afac5c3..56bc2574ea 100644 --- a/packager/third_party/boringssl/BUILD.generated.gni +++ b/packager/third_party/boringssl/BUILD.generated.gni @@ -57,16 +57,18 @@ crypto_sources = [ "src/crypto/bytestring/cbb.c", "src/crypto/bytestring/cbs.c", "src/crypto/bytestring/internal.h", + "src/crypto/bytestring/unicode.c", "src/crypto/chacha/chacha.c", + "src/crypto/chacha/internal.h", "src/crypto/cipher_extra/cipher_extra.c", "src/crypto/cipher_extra/derive_key.c", + "src/crypto/cipher_extra/e_aesccm.c", "src/crypto/cipher_extra/e_aesctrhmac.c", "src/crypto/cipher_extra/e_aesgcmsiv.c", "src/crypto/cipher_extra/e_chacha20poly1305.c", "src/crypto/cipher_extra/e_null.c", "src/crypto/cipher_extra/e_rc2.c", "src/crypto/cipher_extra/e_rc4.c", - "src/crypto/cipher_extra/e_ssl3.c", "src/crypto/cipher_extra/e_tls.c", "src/crypto/cipher_extra/internal.h", "src/crypto/cipher_extra/tls_cbc.c", @@ -74,14 +76,15 @@ crypto_sources = [ "src/crypto/conf/conf.c", "src/crypto/conf/conf_def.h", "src/crypto/conf/internal.h", + "src/crypto/cpu-aarch64-fuchsia.c", "src/crypto/cpu-aarch64-linux.c", "src/crypto/cpu-arm-linux.c", + "src/crypto/cpu-arm-linux.h", "src/crypto/cpu-arm.c", "src/crypto/cpu-intel.c", "src/crypto/cpu-ppc64le.c", "src/crypto/crypto.c", "src/crypto/curve25519/spake25519.c", - "src/crypto/curve25519/x25519-x86_64.c", "src/crypto/dh/check.c", "src/crypto/dh/dh.c", "src/crypto/dh/dh_asn1.c", @@ -90,7 +93,8 @@ crypto_sources = [ "src/crypto/dsa/dsa.c", "src/crypto/dsa/dsa_asn1.c", "src/crypto/ec_extra/ec_asn1.c", - "src/crypto/ecdh/ecdh.c", + "src/crypto/ec_extra/ec_derive.c", + "src/crypto/ecdh_extra/ecdh_extra.c", "src/crypto/ecdsa_extra/ecdsa_asn1.c", "src/crypto/engine/engine.c", "src/crypto/err/err.c", @@ -107,6 +111,8 @@ crypto_sources = [ "src/crypto/evp/p_ed25519_asn1.c", "src/crypto/evp/p_rsa.c", "src/crypto/evp/p_rsa_asn1.c", + "src/crypto/evp/p_x25519.c", + "src/crypto/evp/p_x25519_asn1.c", "src/crypto/evp/pbkdf.c", "src/crypto/evp/print.c", "src/crypto/evp/scrypt.c", @@ -124,11 +130,17 @@ crypto_sources = [ "src/crypto/fipsmodule/ec/internal.h", "src/crypto/fipsmodule/ec/p256-x86_64-table.h", "src/crypto/fipsmodule/ec/p256-x86_64.h", + "src/crypto/fipsmodule/fips_shared_support.c", "src/crypto/fipsmodule/is_fips.c", + "src/crypto/fipsmodule/md5/internal.h", "src/crypto/fipsmodule/modes/internal.h", "src/crypto/fipsmodule/rand/internal.h", "src/crypto/fipsmodule/rsa/internal.h", + "src/crypto/fipsmodule/sha/internal.h", + "src/crypto/fipsmodule/tls/internal.h", "src/crypto/hkdf/hkdf.c", + "src/crypto/hrss/hrss.c", + "src/crypto/hrss/internal.h", "src/crypto/internal.h", "src/crypto/lhash/lhash.c", "src/crypto/mem.c", @@ -165,6 +177,8 @@ crypto_sources = [ "src/crypto/refcount_c11.c", "src/crypto/refcount_lock.c", "src/crypto/rsa_extra/rsa_asn1.c", + "src/crypto/rsa_extra/rsa_print.c", + "src/crypto/siphash/siphash.c", 
"src/crypto/stack/stack.c", "src/crypto/thread.c", "src/crypto/thread_none.c", @@ -223,6 +237,7 @@ crypto_sources = [ "src/crypto/x509/x_x509.c", "src/crypto/x509/x_x509a.c", "src/crypto/x509v3/ext_dat.h", + "src/crypto/x509v3/internal.h", "src/crypto/x509v3/pcy_cache.c", "src/crypto/x509v3/pcy_data.c", "src/crypto/x509v3/pcy_int.h", @@ -246,6 +261,7 @@ crypto_sources = [ "src/crypto/x509v3/v3_int.c", "src/crypto/x509v3/v3_lib.c", "src/crypto/x509v3/v3_ncons.c", + "src/crypto/x509v3/v3_ocsp.c", "src/crypto/x509v3/v3_pci.c", "src/crypto/x509v3/v3_pcia.c", "src/crypto/x509v3/v3_pcons.c", @@ -256,6 +272,25 @@ crypto_sources = [ "src/crypto/x509v3/v3_skey.c", "src/crypto/x509v3/v3_sxnet.c", "src/crypto/x509v3/v3_utl.c", + "src/third_party/fiat/curve25519.c", + "src/third_party/fiat/curve25519_32.h", + "src/third_party/fiat/curve25519_64.h", + "src/third_party/fiat/curve25519_tables.h", + "src/third_party/fiat/internal.h", + "src/third_party/fiat/p256_32.h", + "src/third_party/fiat/p256_64.h", + "src/third_party/sike/asm/fp_generic.c", + "src/third_party/sike/curve_params.c", + "src/third_party/sike/fpx.c", + "src/third_party/sike/fpx.h", + "src/third_party/sike/isogeny.c", + "src/third_party/sike/isogeny.h", + "src/third_party/sike/sike.c", + "src/third_party/sike/sike.h", + "src/third_party/sike/utils.h", +] + +crypto_headers = [ "src/include/openssl/aead.h", "src/include/openssl/aes.h", "src/include/openssl/arm_arch.h", @@ -282,6 +317,7 @@ crypto_sources = [ "src/include/openssl/dh.h", "src/include/openssl/digest.h", "src/include/openssl/dsa.h", + "src/include/openssl/e_os2.h", "src/include/openssl/ec.h", "src/include/openssl/ec_key.h", "src/include/openssl/ecdh.h", @@ -292,9 +328,9 @@ crypto_sources = [ "src/include/openssl/ex_data.h", "src/include/openssl/hkdf.h", "src/include/openssl/hmac.h", + "src/include/openssl/hrss.h", "src/include/openssl/is_boringssl.h", "src/include/openssl/lhash.h", - "src/include/openssl/lhash_macros.h", "src/include/openssl/md4.h", "src/include/openssl/md5.h", "src/include/openssl/mem.h", @@ -317,31 +353,25 @@ crypto_sources = [ "src/include/openssl/rsa.h", "src/include/openssl/safestack.h", "src/include/openssl/sha.h", + "src/include/openssl/siphash.h", "src/include/openssl/span.h", - "src/include/openssl/srtp.h", "src/include/openssl/stack.h", "src/include/openssl/thread.h", "src/include/openssl/type_check.h", "src/include/openssl/x509.h", "src/include/openssl/x509_vfy.h", "src/include/openssl/x509v3.h", - "src/third_party/fiat/curve25519.c", - "src/third_party/fiat/internal.h", ] ssl_sources = [ - "src/include/openssl/dtls1.h", - "src/include/openssl/ssl.h", - "src/include/openssl/ssl3.h", - "src/include/openssl/tls1.h", "src/ssl/bio_ssl.cc", - "src/ssl/custom_extensions.cc", "src/ssl/d1_both.cc", "src/ssl/d1_lib.cc", "src/ssl/d1_pkt.cc", "src/ssl/d1_srtp.cc", "src/ssl/dtls_method.cc", "src/ssl/dtls_record.cc", + "src/ssl/handoff.cc", "src/ssl/handshake.cc", "src/ssl/handshake_client.cc", "src/ssl/handshake_server.cc", @@ -373,14 +403,26 @@ ssl_sources = [ "src/ssl/tls_record.cc", ] +ssl_headers = [ + "src/include/openssl/dtls1.h", + "src/include/openssl/srtp.h", + "src/include/openssl/ssl.h", + "src/include/openssl/ssl3.h", + "src/include/openssl/tls1.h", +] + crypto_sources_ios_aarch64 = [ "ios-aarch64/crypto/chacha/chacha-armv8.S", "ios-aarch64/crypto/fipsmodule/aesv8-armx64.S", "ios-aarch64/crypto/fipsmodule/armv8-mont.S", + "ios-aarch64/crypto/fipsmodule/ghash-neon-armv8.S", "ios-aarch64/crypto/fipsmodule/ghashv8-armx64.S", 
"ios-aarch64/crypto/fipsmodule/sha1-armv8.S", "ios-aarch64/crypto/fipsmodule/sha256-armv8.S", "ios-aarch64/crypto/fipsmodule/sha512-armv8.S", + "ios-aarch64/crypto/fipsmodule/vpaes-armv8.S", + "ios-aarch64/crypto/test/trampoline-armv8.S", + "ios-aarch64/crypto/third_party/sike/asm/fp-armv8.S", ] crypto_sources_ios_arm = [ @@ -394,16 +436,22 @@ crypto_sources_ios_arm = [ "ios-arm/crypto/fipsmodule/sha1-armv4-large.S", "ios-arm/crypto/fipsmodule/sha256-armv4.S", "ios-arm/crypto/fipsmodule/sha512-armv4.S", + "ios-arm/crypto/fipsmodule/vpaes-armv7.S", + "ios-arm/crypto/test/trampoline-armv4.S", ] crypto_sources_linux_aarch64 = [ "linux-aarch64/crypto/chacha/chacha-armv8.S", "linux-aarch64/crypto/fipsmodule/aesv8-armx64.S", "linux-aarch64/crypto/fipsmodule/armv8-mont.S", + "linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S", "linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S", "linux-aarch64/crypto/fipsmodule/sha1-armv8.S", "linux-aarch64/crypto/fipsmodule/sha256-armv8.S", "linux-aarch64/crypto/fipsmodule/sha512-armv8.S", + "linux-aarch64/crypto/fipsmodule/vpaes-armv8.S", + "linux-aarch64/crypto/test/trampoline-armv8.S", + "linux-aarch64/crypto/third_party/sike/asm/fp-armv8.S", ] crypto_sources_linux_arm = [ @@ -417,6 +465,8 @@ crypto_sources_linux_arm = [ "linux-arm/crypto/fipsmodule/sha1-armv4-large.S", "linux-arm/crypto/fipsmodule/sha256-armv4.S", "linux-arm/crypto/fipsmodule/sha512-armv4.S", + "linux-arm/crypto/fipsmodule/vpaes-armv7.S", + "linux-arm/crypto/test/trampoline-armv4.S", "src/crypto/curve25519/asm/x25519-asm-arm.S", "src/crypto/poly1305/poly1305_arm_asm.S", ] @@ -432,6 +482,7 @@ crypto_sources_linux_x86 = [ "linux-x86/crypto/fipsmodule/aesni-x86.S", "linux-x86/crypto/fipsmodule/bn-586.S", "linux-x86/crypto/fipsmodule/co-586.S", + "linux-x86/crypto/fipsmodule/ghash-ssse3-x86.S", "linux-x86/crypto/fipsmodule/ghash-x86.S", "linux-x86/crypto/fipsmodule/md5-586.S", "linux-x86/crypto/fipsmodule/sha1-586.S", @@ -439,6 +490,7 @@ crypto_sources_linux_x86 = [ "linux-x86/crypto/fipsmodule/sha512-586.S", "linux-x86/crypto/fipsmodule/vpaes-x86.S", "linux-x86/crypto/fipsmodule/x86-mont.S", + "linux-x86/crypto/test/trampoline-x86.S", ] crypto_sources_linux_x86_64 = [ @@ -448,10 +500,11 @@ crypto_sources_linux_x86_64 = [ "linux-x86_64/crypto/fipsmodule/aes-x86_64.S", "linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S", "linux-x86_64/crypto/fipsmodule/aesni-x86_64.S", - "linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S", + "linux-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S", "linux-x86_64/crypto/fipsmodule/ghash-x86_64.S", "linux-x86_64/crypto/fipsmodule/md5-x86_64.S", "linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S", + "linux-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S", "linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S", "linux-x86_64/crypto/fipsmodule/rsaz-avx2.S", "linux-x86_64/crypto/fipsmodule/sha1-x86_64.S", @@ -460,7 +513,9 @@ crypto_sources_linux_x86_64 = [ "linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S", "linux-x86_64/crypto/fipsmodule/x86_64-mont.S", "linux-x86_64/crypto/fipsmodule/x86_64-mont5.S", - "src/crypto/curve25519/asm/x25519-asm-x86_64.S", + "linux-x86_64/crypto/test/trampoline-x86_64.S", + "linux-x86_64/crypto/third_party/sike/asm/fp-x86_64.S", + "src/crypto/hrss/asm/poly_rq_mul.S", ] crypto_sources_mac_x86 = [ @@ -469,6 +524,7 @@ crypto_sources_mac_x86 = [ "mac-x86/crypto/fipsmodule/aesni-x86.S", "mac-x86/crypto/fipsmodule/bn-586.S", "mac-x86/crypto/fipsmodule/co-586.S", + "mac-x86/crypto/fipsmodule/ghash-ssse3-x86.S", "mac-x86/crypto/fipsmodule/ghash-x86.S", 
"mac-x86/crypto/fipsmodule/md5-586.S", "mac-x86/crypto/fipsmodule/sha1-586.S", @@ -476,6 +532,7 @@ crypto_sources_mac_x86 = [ "mac-x86/crypto/fipsmodule/sha512-586.S", "mac-x86/crypto/fipsmodule/vpaes-x86.S", "mac-x86/crypto/fipsmodule/x86-mont.S", + "mac-x86/crypto/test/trampoline-x86.S", ] crypto_sources_mac_x86_64 = [ @@ -485,10 +542,11 @@ crypto_sources_mac_x86_64 = [ "mac-x86_64/crypto/fipsmodule/aes-x86_64.S", "mac-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S", "mac-x86_64/crypto/fipsmodule/aesni-x86_64.S", - "mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S", + "mac-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S", "mac-x86_64/crypto/fipsmodule/ghash-x86_64.S", "mac-x86_64/crypto/fipsmodule/md5-x86_64.S", "mac-x86_64/crypto/fipsmodule/p256-x86_64-asm.S", + "mac-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S", "mac-x86_64/crypto/fipsmodule/rdrand-x86_64.S", "mac-x86_64/crypto/fipsmodule/rsaz-avx2.S", "mac-x86_64/crypto/fipsmodule/sha1-x86_64.S", @@ -497,7 +555,8 @@ crypto_sources_mac_x86_64 = [ "mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S", "mac-x86_64/crypto/fipsmodule/x86_64-mont.S", "mac-x86_64/crypto/fipsmodule/x86_64-mont5.S", - "src/crypto/curve25519/asm/x25519-asm-x86_64.S", + "mac-x86_64/crypto/test/trampoline-x86_64.S", + "mac-x86_64/crypto/third_party/sike/asm/fp-x86_64.S", ] crypto_sources_win_x86 = [ @@ -506,6 +565,7 @@ crypto_sources_win_x86 = [ "win-x86/crypto/fipsmodule/aesni-x86.asm", "win-x86/crypto/fipsmodule/bn-586.asm", "win-x86/crypto/fipsmodule/co-586.asm", + "win-x86/crypto/fipsmodule/ghash-ssse3-x86.asm", "win-x86/crypto/fipsmodule/ghash-x86.asm", "win-x86/crypto/fipsmodule/md5-586.asm", "win-x86/crypto/fipsmodule/sha1-586.asm", @@ -513,6 +573,7 @@ crypto_sources_win_x86 = [ "win-x86/crypto/fipsmodule/sha512-586.asm", "win-x86/crypto/fipsmodule/vpaes-x86.asm", "win-x86/crypto/fipsmodule/x86-mont.asm", + "win-x86/crypto/test/trampoline-x86.asm", ] crypto_sources_win_x86_64 = [ @@ -522,10 +583,11 @@ crypto_sources_win_x86_64 = [ "win-x86_64/crypto/fipsmodule/aes-x86_64.asm", "win-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.asm", "win-x86_64/crypto/fipsmodule/aesni-x86_64.asm", - "win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm", + "win-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.asm", "win-x86_64/crypto/fipsmodule/ghash-x86_64.asm", "win-x86_64/crypto/fipsmodule/md5-x86_64.asm", "win-x86_64/crypto/fipsmodule/p256-x86_64-asm.asm", + "win-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.asm", "win-x86_64/crypto/fipsmodule/rdrand-x86_64.asm", "win-x86_64/crypto/fipsmodule/rsaz-avx2.asm", "win-x86_64/crypto/fipsmodule/sha1-x86_64.asm", @@ -534,15 +596,19 @@ crypto_sources_win_x86_64 = [ "win-x86_64/crypto/fipsmodule/vpaes-x86_64.asm", "win-x86_64/crypto/fipsmodule/x86_64-mont.asm", "win-x86_64/crypto/fipsmodule/x86_64-mont5.asm", + "win-x86_64/crypto/test/trampoline-x86_64.asm", + "win-x86_64/crypto/third_party/sike/asm/fp-x86_64.asm", ] fuzzers = [ + "arm_cpuinfo", "bn_div", "bn_mod_exp", "cert", "client", "dtls_client", "dtls_server", + "pkcs12", "pkcs8", "privkey", "read_pem", diff --git a/packager/third_party/boringssl/BUILD.generated_tests.gni b/packager/third_party/boringssl/BUILD.generated_tests.gni index 44d653d281..c6d2db8a6b 100644 --- a/packager/third_party/boringssl/BUILD.generated_tests.gni +++ b/packager/third_party/boringssl/BUILD.generated_tests.gni @@ -5,21 +5,28 @@ # This file is created by generate_build_files.py. Do not edit manually. 
test_support_sources = [ + "src/crypto/test/abi_test.h", "src/crypto/test/file_test.cc", "src/crypto/test/file_test.h", "src/crypto/test/gtest_main.h", "src/crypto/test/malloc.cc", "src/crypto/test/test_util.cc", "src/crypto/test/test_util.h", + "src/crypto/test/wycheproof_util.cc", + "src/crypto/test/wycheproof_util.h", "src/ssl/test/async_bio.h", "src/ssl/test/fuzzer.h", "src/ssl/test/fuzzer_tags.h", + "src/ssl/test/handshake_util.h", "src/ssl/test/packeted_bio.h", + "src/ssl/test/settings_writer.h", "src/ssl/test/test_config.h", + "src/ssl/test/test_state.h", ] crypto_test_sources = [ "crypto_test_data.cc", + "src/crypto/abi_self_test.cc", "src/crypto/asn1/asn1_test.cc", "src/crypto/base64/base64_test.cc", "src/crypto/bio/bio_test.cc", @@ -31,13 +38,14 @@ crypto_test_sources = [ "src/crypto/cmac/cmac_test.cc", "src/crypto/compiler_test.cc", "src/crypto/constant_time_test.cc", + "src/crypto/cpu-arm-linux_test.cc", "src/crypto/curve25519/ed25519_test.cc", "src/crypto/curve25519/spake25519_test.cc", "src/crypto/curve25519/x25519_test.cc", "src/crypto/dh/dh_test.cc", "src/crypto/digest_extra/digest_test.cc", "src/crypto/dsa/dsa_test.cc", - "src/crypto/ecdh/ecdh_test.cc", + "src/crypto/ecdh_extra/ecdh_test.cc", "src/crypto/err/err_test.cc", "src/crypto/evp/evp_extra_test.cc", "src/crypto/evp/evp_test.cc", @@ -48,29 +56,133 @@ crypto_test_sources = [ "src/crypto/fipsmodule/ec/ec_test.cc", "src/crypto/fipsmodule/ec/p256-x86_64_test.cc", "src/crypto/fipsmodule/ecdsa/ecdsa_test.cc", + "src/crypto/fipsmodule/md5/md5_test.cc", "src/crypto/fipsmodule/modes/gcm_test.cc", "src/crypto/fipsmodule/rand/ctrdrbg_test.cc", + "src/crypto/fipsmodule/sha/sha_test.cc", "src/crypto/hkdf/hkdf_test.cc", "src/crypto/hmac_extra/hmac_test.cc", + "src/crypto/hrss/hrss_test.cc", + "src/crypto/impl_dispatch_test.cc", "src/crypto/lhash/lhash_test.cc", "src/crypto/obj/obj_test.cc", + "src/crypto/pem/pem_test.cc", "src/crypto/pkcs7/pkcs7_test.cc", "src/crypto/pkcs8/pkcs12_test.cc", "src/crypto/pkcs8/pkcs8_test.cc", "src/crypto/poly1305/poly1305_test.cc", "src/crypto/pool/pool_test.cc", + "src/crypto/rand_extra/rand_test.cc", "src/crypto/refcount_test.cc", "src/crypto/rsa_extra/rsa_test.cc", + "src/crypto/self_test.cc", + "src/crypto/siphash/siphash_test.cc", + "src/crypto/stack/stack_test.cc", + "src/crypto/test/abi_test.cc", "src/crypto/test/file_test_gtest.cc", "src/crypto/test/gtest_main.cc", "src/crypto/thread_test.cc", "src/crypto/x509/x509_test.cc", + "src/crypto/x509/x509_time_test.cc", "src/crypto/x509v3/tab_test.cc", "src/crypto/x509v3/v3name_test.cc", ] +crypto_test_data = [ + "src/crypto/cipher_extra/test/aes_128_cbc_sha1_tls_implicit_iv_tests.txt", + "src/crypto/cipher_extra/test/aes_128_cbc_sha1_tls_tests.txt", + "src/crypto/cipher_extra/test/aes_128_cbc_sha256_tls_tests.txt", + "src/crypto/cipher_extra/test/aes_128_ccm_bluetooth_8_tests.txt", + "src/crypto/cipher_extra/test/aes_128_ccm_bluetooth_tests.txt", + "src/crypto/cipher_extra/test/aes_128_ctr_hmac_sha256.txt", + "src/crypto/cipher_extra/test/aes_128_gcm_siv_tests.txt", + "src/crypto/cipher_extra/test/aes_128_gcm_tests.txt", + "src/crypto/cipher_extra/test/aes_192_gcm_tests.txt", + "src/crypto/cipher_extra/test/aes_256_cbc_sha1_tls_implicit_iv_tests.txt", + "src/crypto/cipher_extra/test/aes_256_cbc_sha1_tls_tests.txt", + "src/crypto/cipher_extra/test/aes_256_cbc_sha256_tls_tests.txt", + "src/crypto/cipher_extra/test/aes_256_cbc_sha384_tls_tests.txt", + "src/crypto/cipher_extra/test/aes_256_ctr_hmac_sha256.txt", + 
"src/crypto/cipher_extra/test/aes_256_gcm_siv_tests.txt", + "src/crypto/cipher_extra/test/aes_256_gcm_tests.txt", + "src/crypto/cipher_extra/test/chacha20_poly1305_tests.txt", + "src/crypto/cipher_extra/test/cipher_tests.txt", + "src/crypto/cipher_extra/test/des_ede3_cbc_sha1_tls_implicit_iv_tests.txt", + "src/crypto/cipher_extra/test/des_ede3_cbc_sha1_tls_tests.txt", + "src/crypto/cipher_extra/test/nist_cavp/aes_128_cbc.txt", + "src/crypto/cipher_extra/test/nist_cavp/aes_128_ctr.txt", + "src/crypto/cipher_extra/test/nist_cavp/aes_128_gcm.txt", + "src/crypto/cipher_extra/test/nist_cavp/aes_192_cbc.txt", + "src/crypto/cipher_extra/test/nist_cavp/aes_192_ctr.txt", + "src/crypto/cipher_extra/test/nist_cavp/aes_256_cbc.txt", + "src/crypto/cipher_extra/test/nist_cavp/aes_256_ctr.txt", + "src/crypto/cipher_extra/test/nist_cavp/aes_256_gcm.txt", + "src/crypto/cipher_extra/test/nist_cavp/tdes_cbc.txt", + "src/crypto/cipher_extra/test/nist_cavp/tdes_ecb.txt", + "src/crypto/cipher_extra/test/xchacha20_poly1305_tests.txt", + "src/crypto/cmac/cavp_3des_cmac_tests.txt", + "src/crypto/cmac/cavp_aes128_cmac_tests.txt", + "src/crypto/cmac/cavp_aes192_cmac_tests.txt", + "src/crypto/cmac/cavp_aes256_cmac_tests.txt", + "src/crypto/curve25519/ed25519_tests.txt", + "src/crypto/ecdh_extra/ecdh_tests.txt", + "src/crypto/evp/evp_tests.txt", + "src/crypto/evp/scrypt_tests.txt", + "src/crypto/fipsmodule/aes/aes_tests.txt", + "src/crypto/fipsmodule/bn/bn_tests.txt", + "src/crypto/fipsmodule/bn/miller_rabin_tests.txt", + "src/crypto/fipsmodule/ec/ec_scalar_base_mult_tests.txt", + "src/crypto/fipsmodule/ec/p256-x86_64_tests.txt", + "src/crypto/fipsmodule/ecdsa/ecdsa_sign_tests.txt", + "src/crypto/fipsmodule/ecdsa/ecdsa_verify_tests.txt", + "src/crypto/fipsmodule/modes/gcm_tests.txt", + "src/crypto/fipsmodule/rand/ctrdrbg_vectors.txt", + "src/crypto/hmac_extra/hmac_tests.txt", + "src/crypto/poly1305/poly1305_tests.txt", + "src/crypto/siphash/siphash_tests.txt", + "src/crypto/x509/many_constraints.pem", + "src/crypto/x509/many_names1.pem", + "src/crypto/x509/many_names2.pem", + "src/crypto/x509/many_names3.pem", + "src/crypto/x509/some_names1.pem", + "src/crypto/x509/some_names2.pem", + "src/crypto/x509/some_names3.pem", + "src/third_party/wycheproof_testvectors/aes_cbc_pkcs5_test.txt", + "src/third_party/wycheproof_testvectors/aes_cmac_test.txt", + "src/third_party/wycheproof_testvectors/aes_gcm_siv_test.txt", + "src/third_party/wycheproof_testvectors/aes_gcm_test.txt", + "src/third_party/wycheproof_testvectors/chacha20_poly1305_test.txt", + "src/third_party/wycheproof_testvectors/dsa_test.txt", + "src/third_party/wycheproof_testvectors/ecdh_secp224r1_test.txt", + "src/third_party/wycheproof_testvectors/ecdh_secp256r1_test.txt", + "src/third_party/wycheproof_testvectors/ecdh_secp384r1_test.txt", + "src/third_party/wycheproof_testvectors/ecdh_secp521r1_test.txt", + "src/third_party/wycheproof_testvectors/ecdsa_secp224r1_sha224_test.txt", + "src/third_party/wycheproof_testvectors/ecdsa_secp224r1_sha256_test.txt", + "src/third_party/wycheproof_testvectors/ecdsa_secp224r1_sha512_test.txt", + "src/third_party/wycheproof_testvectors/ecdsa_secp256r1_sha256_test.txt", + "src/third_party/wycheproof_testvectors/ecdsa_secp256r1_sha512_test.txt", + "src/third_party/wycheproof_testvectors/ecdsa_secp384r1_sha384_test.txt", + "src/third_party/wycheproof_testvectors/ecdsa_secp384r1_sha512_test.txt", + "src/third_party/wycheproof_testvectors/ecdsa_secp521r1_sha512_test.txt", + "src/third_party/wycheproof_testvectors/eddsa_test.txt", + 
"src/third_party/wycheproof_testvectors/kw_test.txt", + "src/third_party/wycheproof_testvectors/kwp_test.txt", + "src/third_party/wycheproof_testvectors/rsa_pss_2048_sha1_mgf1_20_test.txt", + "src/third_party/wycheproof_testvectors/rsa_pss_2048_sha256_mgf1_0_test.txt", + "src/third_party/wycheproof_testvectors/rsa_pss_2048_sha256_mgf1_32_test.txt", + "src/third_party/wycheproof_testvectors/rsa_pss_3072_sha256_mgf1_32_test.txt", + "src/third_party/wycheproof_testvectors/rsa_pss_4096_sha256_mgf1_32_test.txt", + "src/third_party/wycheproof_testvectors/rsa_pss_4096_sha512_mgf1_32_test.txt", + "src/third_party/wycheproof_testvectors/rsa_pss_misc_test.txt", + "src/third_party/wycheproof_testvectors/rsa_signature_test.txt", + "src/third_party/wycheproof_testvectors/x25519_test.txt", +] + ssl_test_sources = [ + "src/crypto/test/abi_test.cc", "src/crypto/test/gtest_main.cc", "src/ssl/span_test.cc", + "src/ssl/ssl_c_test.c", "src/ssl/ssl_test.cc", ] diff --git a/packager/third_party/boringssl/boringssl.gyp b/packager/third_party/boringssl/boringssl.gyp index 75d0829685..cd5d88c834 100644 --- a/packager/third_party/boringssl/boringssl.gyp +++ b/packager/third_party/boringssl/boringssl.gyp @@ -175,15 +175,10 @@ 'sources': [ '<@(boringssl_linux_x86_64_sources)' ], }], ['OS == "win"', { - 'sources': [ '<@(boringssl_win_x86_64_sources)' ], - # Windows' assembly is built with Yasm. The other platforms use - # the platform assembler. - 'variables': { - 'yasm_output_path': '<(SHARED_INTERMEDIATE_DIR)/third_party/boringssl', + # NOTES(kqyang): Somehow ASM fails to compile. Disable ASM. + 'direct_dependent_settings': { + 'defines': [ 'OPENSSL_NO_ASM' ], }, - 'includes': [ - '../yasm/yasm_compile.gypi', - ], }], ['OS != "mac" and OS != "linux" and OS != "win" and OS != "android"', { 'direct_dependent_settings': { diff --git a/packager/third_party/boringssl/boringssl.gypi b/packager/third_party/boringssl/boringssl.gypi index 17f7a6c5ab..e7cb8f0ba2 100644 --- a/packager/third_party/boringssl/boringssl.gypi +++ b/packager/third_party/boringssl/boringssl.gypi @@ -8,17 +8,18 @@ 'variables': { 'boringssl_ssl_sources': [ 'src/include/openssl/dtls1.h', + 'src/include/openssl/srtp.h', 'src/include/openssl/ssl.h', 'src/include/openssl/ssl3.h', 'src/include/openssl/tls1.h', 'src/ssl/bio_ssl.cc', - 'src/ssl/custom_extensions.cc', 'src/ssl/d1_both.cc', 'src/ssl/d1_lib.cc', 'src/ssl/d1_pkt.cc', 'src/ssl/d1_srtp.cc', 'src/ssl/dtls_method.cc', 'src/ssl/dtls_record.cc', + 'src/ssl/handoff.cc', 'src/ssl/handshake.cc', 'src/ssl/handshake_client.cc', 'src/ssl/handshake_server.cc', @@ -102,16 +103,18 @@ 'src/crypto/bytestring/cbb.c', 'src/crypto/bytestring/cbs.c', 'src/crypto/bytestring/internal.h', + 'src/crypto/bytestring/unicode.c', 'src/crypto/chacha/chacha.c', + 'src/crypto/chacha/internal.h', 'src/crypto/cipher_extra/cipher_extra.c', 'src/crypto/cipher_extra/derive_key.c', + 'src/crypto/cipher_extra/e_aesccm.c', 'src/crypto/cipher_extra/e_aesctrhmac.c', 'src/crypto/cipher_extra/e_aesgcmsiv.c', 'src/crypto/cipher_extra/e_chacha20poly1305.c', 'src/crypto/cipher_extra/e_null.c', 'src/crypto/cipher_extra/e_rc2.c', 'src/crypto/cipher_extra/e_rc4.c', - 'src/crypto/cipher_extra/e_ssl3.c', 'src/crypto/cipher_extra/e_tls.c', 'src/crypto/cipher_extra/internal.h', 'src/crypto/cipher_extra/tls_cbc.c', @@ -119,14 +122,15 @@ 'src/crypto/conf/conf.c', 'src/crypto/conf/conf_def.h', 'src/crypto/conf/internal.h', + 'src/crypto/cpu-aarch64-fuchsia.c', 'src/crypto/cpu-aarch64-linux.c', 'src/crypto/cpu-arm-linux.c', + 
'src/crypto/cpu-arm-linux.h', 'src/crypto/cpu-arm.c', 'src/crypto/cpu-intel.c', 'src/crypto/cpu-ppc64le.c', 'src/crypto/crypto.c', 'src/crypto/curve25519/spake25519.c', - 'src/crypto/curve25519/x25519-x86_64.c', 'src/crypto/dh/check.c', 'src/crypto/dh/dh.c', 'src/crypto/dh/dh_asn1.c', @@ -135,7 +139,8 @@ 'src/crypto/dsa/dsa.c', 'src/crypto/dsa/dsa_asn1.c', 'src/crypto/ec_extra/ec_asn1.c', - 'src/crypto/ecdh/ecdh.c', + 'src/crypto/ec_extra/ec_derive.c', + 'src/crypto/ecdh_extra/ecdh_extra.c', 'src/crypto/ecdsa_extra/ecdsa_asn1.c', 'src/crypto/engine/engine.c', 'src/crypto/err/err.c', @@ -152,6 +157,8 @@ 'src/crypto/evp/p_ed25519_asn1.c', 'src/crypto/evp/p_rsa.c', 'src/crypto/evp/p_rsa_asn1.c', + 'src/crypto/evp/p_x25519.c', + 'src/crypto/evp/p_x25519_asn1.c', 'src/crypto/evp/pbkdf.c', 'src/crypto/evp/print.c', 'src/crypto/evp/scrypt.c', @@ -169,11 +176,17 @@ 'src/crypto/fipsmodule/ec/internal.h', 'src/crypto/fipsmodule/ec/p256-x86_64-table.h', 'src/crypto/fipsmodule/ec/p256-x86_64.h', + 'src/crypto/fipsmodule/fips_shared_support.c', 'src/crypto/fipsmodule/is_fips.c', + 'src/crypto/fipsmodule/md5/internal.h', 'src/crypto/fipsmodule/modes/internal.h', 'src/crypto/fipsmodule/rand/internal.h', 'src/crypto/fipsmodule/rsa/internal.h', + 'src/crypto/fipsmodule/sha/internal.h', + 'src/crypto/fipsmodule/tls/internal.h', 'src/crypto/hkdf/hkdf.c', + 'src/crypto/hrss/hrss.c', + 'src/crypto/hrss/internal.h', 'src/crypto/internal.h', 'src/crypto/lhash/lhash.c', 'src/crypto/mem.c', @@ -210,6 +223,8 @@ 'src/crypto/refcount_c11.c', 'src/crypto/refcount_lock.c', 'src/crypto/rsa_extra/rsa_asn1.c', + 'src/crypto/rsa_extra/rsa_print.c', + 'src/crypto/siphash/siphash.c', 'src/crypto/stack/stack.c', 'src/crypto/thread.c', 'src/crypto/thread_none.c', @@ -268,6 +283,7 @@ 'src/crypto/x509/x_x509.c', 'src/crypto/x509/x_x509a.c', 'src/crypto/x509v3/ext_dat.h', + 'src/crypto/x509v3/internal.h', 'src/crypto/x509v3/pcy_cache.c', 'src/crypto/x509v3/pcy_data.c', 'src/crypto/x509v3/pcy_int.h', @@ -291,6 +307,7 @@ 'src/crypto/x509v3/v3_int.c', 'src/crypto/x509v3/v3_lib.c', 'src/crypto/x509v3/v3_ncons.c', + 'src/crypto/x509v3/v3_ocsp.c', 'src/crypto/x509v3/v3_pci.c', 'src/crypto/x509v3/v3_pcia.c', 'src/crypto/x509v3/v3_pcons.c', @@ -327,6 +344,7 @@ 'src/include/openssl/dh.h', 'src/include/openssl/digest.h', 'src/include/openssl/dsa.h', + 'src/include/openssl/e_os2.h', 'src/include/openssl/ec.h', 'src/include/openssl/ec_key.h', 'src/include/openssl/ecdh.h', @@ -337,9 +355,9 @@ 'src/include/openssl/ex_data.h', 'src/include/openssl/hkdf.h', 'src/include/openssl/hmac.h', + 'src/include/openssl/hrss.h', 'src/include/openssl/is_boringssl.h', 'src/include/openssl/lhash.h', - 'src/include/openssl/lhash_macros.h', 'src/include/openssl/md4.h', 'src/include/openssl/md5.h', 'src/include/openssl/mem.h', @@ -362,8 +380,8 @@ 'src/include/openssl/rsa.h', 'src/include/openssl/safestack.h', 'src/include/openssl/sha.h', + 'src/include/openssl/siphash.h', 'src/include/openssl/span.h', - 'src/include/openssl/srtp.h', 'src/include/openssl/stack.h', 'src/include/openssl/thread.h', 'src/include/openssl/type_check.h', @@ -371,16 +389,34 @@ 'src/include/openssl/x509_vfy.h', 'src/include/openssl/x509v3.h', 'src/third_party/fiat/curve25519.c', + 'src/third_party/fiat/curve25519_32.h', + 'src/third_party/fiat/curve25519_64.h', + 'src/third_party/fiat/curve25519_tables.h', 'src/third_party/fiat/internal.h', + 'src/third_party/fiat/p256_32.h', + 'src/third_party/fiat/p256_64.h', + 'src/third_party/sike/asm/fp_generic.c', + 
'src/third_party/sike/curve_params.c', + 'src/third_party/sike/fpx.c', + 'src/third_party/sike/fpx.h', + 'src/third_party/sike/isogeny.c', + 'src/third_party/sike/isogeny.h', + 'src/third_party/sike/sike.c', + 'src/third_party/sike/sike.h', + 'src/third_party/sike/utils.h', ], 'boringssl_ios_aarch64_sources': [ 'ios-aarch64/crypto/chacha/chacha-armv8.S', 'ios-aarch64/crypto/fipsmodule/aesv8-armx64.S', 'ios-aarch64/crypto/fipsmodule/armv8-mont.S', + 'ios-aarch64/crypto/fipsmodule/ghash-neon-armv8.S', 'ios-aarch64/crypto/fipsmodule/ghashv8-armx64.S', 'ios-aarch64/crypto/fipsmodule/sha1-armv8.S', 'ios-aarch64/crypto/fipsmodule/sha256-armv8.S', 'ios-aarch64/crypto/fipsmodule/sha512-armv8.S', + 'ios-aarch64/crypto/fipsmodule/vpaes-armv8.S', + 'ios-aarch64/crypto/test/trampoline-armv8.S', + 'ios-aarch64/crypto/third_party/sike/asm/fp-armv8.S', ], 'boringssl_ios_arm_sources': [ 'ios-arm/crypto/chacha/chacha-armv4.S', @@ -393,15 +429,21 @@ 'ios-arm/crypto/fipsmodule/sha1-armv4-large.S', 'ios-arm/crypto/fipsmodule/sha256-armv4.S', 'ios-arm/crypto/fipsmodule/sha512-armv4.S', + 'ios-arm/crypto/fipsmodule/vpaes-armv7.S', + 'ios-arm/crypto/test/trampoline-armv4.S', ], 'boringssl_linux_aarch64_sources': [ 'linux-aarch64/crypto/chacha/chacha-armv8.S', 'linux-aarch64/crypto/fipsmodule/aesv8-armx64.S', 'linux-aarch64/crypto/fipsmodule/armv8-mont.S', + 'linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S', 'linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S', 'linux-aarch64/crypto/fipsmodule/sha1-armv8.S', 'linux-aarch64/crypto/fipsmodule/sha256-armv8.S', 'linux-aarch64/crypto/fipsmodule/sha512-armv8.S', + 'linux-aarch64/crypto/fipsmodule/vpaes-armv8.S', + 'linux-aarch64/crypto/test/trampoline-armv8.S', + 'linux-aarch64/crypto/third_party/sike/asm/fp-armv8.S', ], 'boringssl_linux_arm_sources': [ 'linux-arm/crypto/chacha/chacha-armv4.S', @@ -414,6 +456,8 @@ 'linux-arm/crypto/fipsmodule/sha1-armv4-large.S', 'linux-arm/crypto/fipsmodule/sha256-armv4.S', 'linux-arm/crypto/fipsmodule/sha512-armv4.S', + 'linux-arm/crypto/fipsmodule/vpaes-armv7.S', + 'linux-arm/crypto/test/trampoline-armv4.S', 'src/crypto/curve25519/asm/x25519-asm-arm.S', 'src/crypto/poly1305/poly1305_arm_asm.S', ], @@ -427,6 +471,7 @@ 'linux-x86/crypto/fipsmodule/aesni-x86.S', 'linux-x86/crypto/fipsmodule/bn-586.S', 'linux-x86/crypto/fipsmodule/co-586.S', + 'linux-x86/crypto/fipsmodule/ghash-ssse3-x86.S', 'linux-x86/crypto/fipsmodule/ghash-x86.S', 'linux-x86/crypto/fipsmodule/md5-586.S', 'linux-x86/crypto/fipsmodule/sha1-586.S', @@ -434,6 +479,7 @@ 'linux-x86/crypto/fipsmodule/sha512-586.S', 'linux-x86/crypto/fipsmodule/vpaes-x86.S', 'linux-x86/crypto/fipsmodule/x86-mont.S', + 'linux-x86/crypto/test/trampoline-x86.S', ], 'boringssl_linux_x86_64_sources': [ 'linux-x86_64/crypto/chacha/chacha-x86_64.S', @@ -442,10 +488,11 @@ 'linux-x86_64/crypto/fipsmodule/aes-x86_64.S', 'linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S', 'linux-x86_64/crypto/fipsmodule/aesni-x86_64.S', - 'linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S', + 'linux-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S', 'linux-x86_64/crypto/fipsmodule/ghash-x86_64.S', 'linux-x86_64/crypto/fipsmodule/md5-x86_64.S', 'linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S', + 'linux-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S', 'linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S', 'linux-x86_64/crypto/fipsmodule/rsaz-avx2.S', 'linux-x86_64/crypto/fipsmodule/sha1-x86_64.S', @@ -454,7 +501,9 @@ 'linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S', 'linux-x86_64/crypto/fipsmodule/x86_64-mont.S', 
'linux-x86_64/crypto/fipsmodule/x86_64-mont5.S', - 'src/crypto/curve25519/asm/x25519-asm-x86_64.S', + 'linux-x86_64/crypto/test/trampoline-x86_64.S', + 'linux-x86_64/crypto/third_party/sike/asm/fp-x86_64.S', + 'src/crypto/hrss/asm/poly_rq_mul.S', ], 'boringssl_mac_x86_sources': [ 'mac-x86/crypto/chacha/chacha-x86.S', @@ -462,6 +511,7 @@ 'mac-x86/crypto/fipsmodule/aesni-x86.S', 'mac-x86/crypto/fipsmodule/bn-586.S', 'mac-x86/crypto/fipsmodule/co-586.S', + 'mac-x86/crypto/fipsmodule/ghash-ssse3-x86.S', 'mac-x86/crypto/fipsmodule/ghash-x86.S', 'mac-x86/crypto/fipsmodule/md5-586.S', 'mac-x86/crypto/fipsmodule/sha1-586.S', @@ -469,6 +519,7 @@ 'mac-x86/crypto/fipsmodule/sha512-586.S', 'mac-x86/crypto/fipsmodule/vpaes-x86.S', 'mac-x86/crypto/fipsmodule/x86-mont.S', + 'mac-x86/crypto/test/trampoline-x86.S', ], 'boringssl_mac_x86_64_sources': [ 'mac-x86_64/crypto/chacha/chacha-x86_64.S', @@ -477,10 +528,11 @@ 'mac-x86_64/crypto/fipsmodule/aes-x86_64.S', 'mac-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S', 'mac-x86_64/crypto/fipsmodule/aesni-x86_64.S', - 'mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S', + 'mac-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S', 'mac-x86_64/crypto/fipsmodule/ghash-x86_64.S', 'mac-x86_64/crypto/fipsmodule/md5-x86_64.S', 'mac-x86_64/crypto/fipsmodule/p256-x86_64-asm.S', + 'mac-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S', 'mac-x86_64/crypto/fipsmodule/rdrand-x86_64.S', 'mac-x86_64/crypto/fipsmodule/rsaz-avx2.S', 'mac-x86_64/crypto/fipsmodule/sha1-x86_64.S', @@ -489,7 +541,8 @@ 'mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S', 'mac-x86_64/crypto/fipsmodule/x86_64-mont.S', 'mac-x86_64/crypto/fipsmodule/x86_64-mont5.S', - 'src/crypto/curve25519/asm/x25519-asm-x86_64.S', + 'mac-x86_64/crypto/test/trampoline-x86_64.S', + 'mac-x86_64/crypto/third_party/sike/asm/fp-x86_64.S', ], 'boringssl_win_x86_sources': [ 'win-x86/crypto/chacha/chacha-x86.asm', @@ -497,6 +550,7 @@ 'win-x86/crypto/fipsmodule/aesni-x86.asm', 'win-x86/crypto/fipsmodule/bn-586.asm', 'win-x86/crypto/fipsmodule/co-586.asm', + 'win-x86/crypto/fipsmodule/ghash-ssse3-x86.asm', 'win-x86/crypto/fipsmodule/ghash-x86.asm', 'win-x86/crypto/fipsmodule/md5-586.asm', 'win-x86/crypto/fipsmodule/sha1-586.asm', @@ -504,6 +558,7 @@ 'win-x86/crypto/fipsmodule/sha512-586.asm', 'win-x86/crypto/fipsmodule/vpaes-x86.asm', 'win-x86/crypto/fipsmodule/x86-mont.asm', + 'win-x86/crypto/test/trampoline-x86.asm', ], 'boringssl_win_x86_64_sources': [ 'win-x86_64/crypto/chacha/chacha-x86_64.asm', @@ -512,10 +567,11 @@ 'win-x86_64/crypto/fipsmodule/aes-x86_64.asm', 'win-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.asm', 'win-x86_64/crypto/fipsmodule/aesni-x86_64.asm', - 'win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm', + 'win-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.asm', 'win-x86_64/crypto/fipsmodule/ghash-x86_64.asm', 'win-x86_64/crypto/fipsmodule/md5-x86_64.asm', 'win-x86_64/crypto/fipsmodule/p256-x86_64-asm.asm', + 'win-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.asm', 'win-x86_64/crypto/fipsmodule/rdrand-x86_64.asm', 'win-x86_64/crypto/fipsmodule/rsaz-avx2.asm', 'win-x86_64/crypto/fipsmodule/sha1-x86_64.asm', @@ -524,6 +580,8 @@ 'win-x86_64/crypto/fipsmodule/vpaes-x86_64.asm', 'win-x86_64/crypto/fipsmodule/x86_64-mont.asm', 'win-x86_64/crypto/fipsmodule/x86_64-mont5.asm', + 'win-x86_64/crypto/test/trampoline-x86_64.asm', + 'win-x86_64/crypto/third_party/sike/asm/fp-x86_64.asm', ], } } diff --git a/packager/third_party/boringssl/err_data.c b/packager/third_party/boringssl/err_data.c index 931a44b643..7fcbc56ac0 100644 --- 
a/packager/third_party/boringssl/err_data.c +++ b/packager/third_party/boringssl/err_data.c @@ -18,716 +18,159 @@ #include #include +OPENSSL_STATIC_ASSERT(ERR_LIB_NONE == 1, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_SYS == 2, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_BN == 3, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_RSA == 4, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_DH == 5, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_EVP == 6, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_BUF == 7, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_OBJ == 8, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_PEM == 9, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_DSA == 10, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_X509 == 11, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_ASN1 == 12, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_CONF == 13, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_CRYPTO == 14, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_EC == 15, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_SSL == 16, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_BIO == 17, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_PKCS7 == 18, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_PKCS8 == 19, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_X509V3 == 20, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_RAND == 21, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_ENGINE == 22, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_OCSP == 23, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_UI == 24, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_COMP == 25, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_ECDSA == 26, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_ECDH == 27, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_HMAC == 28, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_DIGEST == 29, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_CIPHER == 30, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_HKDF == 31, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_USER == 32, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_NUM_LIBS == 33, "number of libraries changed"); -OPENSSL_COMPILE_ASSERT(ERR_LIB_NONE == 1, library_values_changed_1); -OPENSSL_COMPILE_ASSERT(ERR_LIB_SYS == 2, library_values_changed_2); -OPENSSL_COMPILE_ASSERT(ERR_LIB_BN == 3, library_values_changed_3); -OPENSSL_COMPILE_ASSERT(ERR_LIB_RSA == 4, library_values_changed_4); -OPENSSL_COMPILE_ASSERT(ERR_LIB_DH == 5, library_values_changed_5); -OPENSSL_COMPILE_ASSERT(ERR_LIB_EVP == 6, library_values_changed_6); -OPENSSL_COMPILE_ASSERT(ERR_LIB_BUF == 7, library_values_changed_7); -OPENSSL_COMPILE_ASSERT(ERR_LIB_OBJ == 8, library_values_changed_8); -OPENSSL_COMPILE_ASSERT(ERR_LIB_PEM == 9, library_values_changed_9); -OPENSSL_COMPILE_ASSERT(ERR_LIB_DSA == 10, library_values_changed_10); -OPENSSL_COMPILE_ASSERT(ERR_LIB_X509 == 11, library_values_changed_11); -OPENSSL_COMPILE_ASSERT(ERR_LIB_ASN1 == 12, library_values_changed_12); -OPENSSL_COMPILE_ASSERT(ERR_LIB_CONF == 13, library_values_changed_13); -OPENSSL_COMPILE_ASSERT(ERR_LIB_CRYPTO == 14, library_values_changed_14); -OPENSSL_COMPILE_ASSERT(ERR_LIB_EC == 15, library_values_changed_15); -OPENSSL_COMPILE_ASSERT(ERR_LIB_SSL == 16, library_values_changed_16); 
-OPENSSL_COMPILE_ASSERT(ERR_LIB_BIO == 17, library_values_changed_17); -OPENSSL_COMPILE_ASSERT(ERR_LIB_PKCS7 == 18, library_values_changed_18); -OPENSSL_COMPILE_ASSERT(ERR_LIB_PKCS8 == 19, library_values_changed_19); -OPENSSL_COMPILE_ASSERT(ERR_LIB_X509V3 == 20, library_values_changed_20); -OPENSSL_COMPILE_ASSERT(ERR_LIB_RAND == 21, library_values_changed_21); -OPENSSL_COMPILE_ASSERT(ERR_LIB_ENGINE == 22, library_values_changed_22); -OPENSSL_COMPILE_ASSERT(ERR_LIB_OCSP == 23, library_values_changed_23); -OPENSSL_COMPILE_ASSERT(ERR_LIB_UI == 24, library_values_changed_24); -OPENSSL_COMPILE_ASSERT(ERR_LIB_COMP == 25, library_values_changed_25); -OPENSSL_COMPILE_ASSERT(ERR_LIB_ECDSA == 26, library_values_changed_26); -OPENSSL_COMPILE_ASSERT(ERR_LIB_ECDH == 27, library_values_changed_27); -OPENSSL_COMPILE_ASSERT(ERR_LIB_HMAC == 28, library_values_changed_28); -OPENSSL_COMPILE_ASSERT(ERR_LIB_DIGEST == 29, library_values_changed_29); -OPENSSL_COMPILE_ASSERT(ERR_LIB_CIPHER == 30, library_values_changed_30); -OPENSSL_COMPILE_ASSERT(ERR_LIB_HKDF == 31, library_values_changed_31); -OPENSSL_COMPILE_ASSERT(ERR_LIB_USER == 32, library_values_changed_32); -OPENSSL_COMPILE_ASSERT(ERR_NUM_LIBS == 33, library_values_changed_num); - -// clang-format off const uint32_t kOpenSSLReasonValues[] = { - 0xc320838, - 0xc328852, - 0xc330861, - 0xc338871, - 0xc340880, - 0xc348899, - 0xc3508a5, - 0xc3588c2, - 0xc3608e2, - 0xc3688f0, - 0xc370900, - 0xc37890d, - 0xc38091d, - 0xc388928, - 0xc39093e, - 0xc39894d, - 0xc3a0961, - 0xc3a8845, - 0xc3b00ea, - 0xc3b88d4, - 0x10320845, - 0x10329535, - 0x10331541, - 0x1033955a, - 0x1034156d, - 0x10348efc, - 0x10350c5e, - 0x10359580, - 0x10361595, - 0x103695a8, - 0x103715c7, - 0x103795e0, - 0x103815f5, - 0x10389613, - 0x10391622, - 0x1039963e, - 0x103a1659, - 0x103a9668, - 0x103b1684, - 0x103b969f, - 0x103c16b6, - 0x103c80ea, - 0x103d16c7, - 0x103d96db, - 0x103e16fa, - 0x103e9709, - 0x103f1720, - 0x103f9733, - 0x10400c22, - 0x10409746, - 0x10411764, - 0x10419777, - 0x10421791, - 0x104297a1, - 0x104317b5, - 0x104397cb, - 0x104417e3, - 0x104497f8, - 0x1045180c, - 0x1045981e, - 0x104605fb, - 0x1046894d, - 0x10471833, - 0x1047984a, - 0x1048185f, - 0x1048986d, - 0x10490e5e, - 0x14320c05, - 0x14328c13, - 0x14330c22, - 0x14338c34, - 0x143400ac, - 0x143480ea, - 0x18320083, - 0x18328f52, - 0x183300ac, - 0x18338f68, - 0x18340f7c, - 0x183480ea, - 0x18350f91, - 0x18358fa9, - 0x18360fbe, - 0x18368fd2, - 0x18370ff6, - 0x1837900c, - 0x18381020, - 0x18389030, - 0x18390a73, - 0x18399040, - 0x183a1068, - 0x183a908e, - 0x183b0c6a, - 0x183b90c3, - 0x183c10d5, - 0x183c90e0, - 0x183d10f0, - 0x183d9101, - 0x183e1112, - 0x183e9124, - 0x183f114d, - 0x183f9166, - 0x1840117e, - 0x184086d3, - 0x184110b1, - 0x1841907c, - 0x1842109b, - 0x18429055, - 0x203211b8, - 0x203291a5, - 0x243211c4, - 0x24328993, - 0x243311d6, - 0x243391e3, - 0x243411f0, - 0x24349202, - 0x24351211, - 0x2435922e, - 0x2436123b, - 0x24369249, - 0x24371257, - 0x24379265, - 0x2438126e, - 0x2438927b, - 0x2439128e, - 0x28320c52, - 0x28328c6a, - 0x28330c22, - 0x28338c7d, - 0x28340c5e, - 0x283480ac, - 0x283500ea, - 0x2c322c6c, - 0x2c3292a5, - 0x2c332c7a, - 0x2c33ac8c, - 0x2c342ca0, - 0x2c34acb2, - 0x2c352ccd, - 0x2c35acdf, - 0x2c362cf2, - 0x2c36832d, - 0x2c372cff, - 0x2c37ad11, - 0x2c382d36, - 0x2c38ad4d, - 0x2c392d5b, - 0x2c39ad6b, - 0x2c3a2d7d, - 0x2c3aad91, - 0x2c3b2da2, - 0x2c3badc1, - 0x2c3c12b7, - 0x2c3c92cd, - 0x2c3d2dd5, - 0x2c3d92e6, - 0x2c3e2df2, - 0x2c3eae00, - 0x2c3f2e18, - 0x2c3fae30, - 0x2c402e3d, - 0x2c4091b8, - 0x2c412e4e, - 
0x2c41ae61, - 0x2c42117e, - 0x2c42ae72, - 0x2c430720, - 0x2c43adb3, - 0x2c442d24, - 0x30320000, - 0x30328015, - 0x3033001f, - 0x30338038, - 0x3034004a, - 0x30348064, - 0x3035006b, - 0x30358083, - 0x30360094, - 0x303680ac, - 0x303700b9, - 0x303780c8, - 0x303800ea, - 0x303880f7, - 0x3039010a, - 0x30398125, - 0x303a013a, - 0x303a814e, - 0x303b0162, - 0x303b8173, - 0x303c018c, - 0x303c81a9, - 0x303d01b7, - 0x303d81cb, - 0x303e01db, - 0x303e81f4, - 0x303f0204, - 0x303f8217, - 0x30400226, - 0x30408232, - 0x30410247, - 0x30418257, - 0x3042026e, - 0x3042827b, - 0x3043028e, - 0x3043829d, - 0x304402b2, - 0x304482d3, - 0x304502e6, - 0x304582f9, - 0x30460312, - 0x3046832d, - 0x3047034a, - 0x30478363, - 0x30480371, - 0x30488382, - 0x30490391, - 0x304983a9, - 0x304a03bb, - 0x304a83cf, - 0x304b03ee, - 0x304b8401, - 0x304c040c, - 0x304c841d, - 0x304d0429, - 0x304d843f, - 0x304e044d, - 0x304e8463, - 0x304f0475, - 0x304f8487, - 0x3050049a, - 0x305084ad, - 0x305104be, - 0x305184ce, - 0x305204e6, - 0x305284fb, - 0x30530513, - 0x30538527, - 0x3054053f, - 0x30548558, - 0x30550571, - 0x3055858e, - 0x30560599, - 0x305685b1, - 0x305705c1, - 0x305785d2, - 0x305805e5, - 0x305885fb, - 0x30590604, - 0x30598619, - 0x305a062c, - 0x305a863b, - 0x305b065b, - 0x305b866a, - 0x305c068b, - 0x305c86a7, - 0x305d06b3, - 0x305d86d3, - 0x305e06ef, - 0x305e8700, - 0x305f0716, - 0x305f8720, - 0x34320b63, - 0x34328b77, - 0x34330b94, - 0x34338ba7, - 0x34340bb6, - 0x34348bef, - 0x34350bd3, - 0x3c320083, - 0x3c328ca7, - 0x3c330cc0, - 0x3c338cdb, - 0x3c340cf8, - 0x3c348d22, - 0x3c350d3d, - 0x3c358d63, - 0x3c360d7c, - 0x3c368d94, - 0x3c370da5, - 0x3c378db3, - 0x3c380dc0, - 0x3c388dd4, - 0x3c390c6a, - 0x3c398df7, - 0x3c3a0e0b, - 0x3c3a890d, - 0x3c3b0e1b, - 0x3c3b8e36, - 0x3c3c0e48, - 0x3c3c8e7b, - 0x3c3d0e85, - 0x3c3d8e99, - 0x3c3e0ea7, - 0x3c3e8ecc, - 0x3c3f0c93, - 0x3c3f8eb5, - 0x3c4000ac, - 0x3c4080ea, - 0x3c410d13, - 0x3c418d52, - 0x3c420e5e, - 0x3c428de8, - 0x403218c6, - 0x403298dc, - 0x4033190a, - 0x40339914, - 0x4034192b, - 0x40349949, - 0x40351959, - 0x4035996b, - 0x40361978, - 0x40369984, - 0x40371999, - 0x403799ab, - 0x403819b6, - 0x403899c8, - 0x40390efc, - 0x403999d8, - 0x403a19eb, - 0x403a9a0c, - 0x403b1a1d, - 0x403b9a2d, - 0x403c0064, - 0x403c8083, - 0x403d1ab1, - 0x403d9ac7, - 0x403e1ad6, - 0x403e9b0e, - 0x403f1b28, - 0x403f9b36, - 0x40401b4b, - 0x40409b5f, - 0x40411b7c, - 0x40419b97, - 0x40421bb0, - 0x40429bc3, - 0x40431bd7, - 0x40439bef, - 0x40441c06, - 0x404480ac, - 0x40451c1b, - 0x40459c2d, - 0x40461c51, - 0x40469c71, - 0x40471c7f, - 0x40479ca6, - 0x40481ce3, - 0x40489d16, - 0x40491d2d, - 0x40499d47, - 0x404a1d5e, - 0x404a9d7c, - 0x404b1d94, - 0x404b9dab, - 0x404c1dc1, - 0x404c9dd3, - 0x404d1df4, - 0x404d9e16, - 0x404e1e2a, - 0x404e9e37, - 0x404f1e64, - 0x404f9e8d, - 0x40501ec8, - 0x40509edc, - 0x40511ef7, - 0x40521f07, - 0x40529f2b, - 0x40531f43, - 0x40539f56, - 0x40541f6b, - 0x40549f8e, - 0x40551f9c, - 0x40559fb9, - 0x40561fc6, - 0x40569fdf, - 0x40571ff7, - 0x4057a00a, - 0x4058201f, - 0x4058a046, - 0x40592075, - 0x4059a0a2, - 0x405a20b6, - 0x405aa0c6, - 0x405b20de, - 0x405ba0ef, - 0x405c2102, - 0x405ca141, - 0x405d214e, - 0x405da165, - 0x405e21a3, - 0x405e8ab1, - 0x405f21c4, - 0x405fa1d1, - 0x406021df, - 0x4060a201, - 0x40612245, - 0x4061a27d, - 0x40622294, - 0x4062a2a5, - 0x406322b6, - 0x4063a2cb, - 0x406422e2, - 0x4064a30e, - 0x40652329, - 0x4065a340, - 0x40662358, - 0x4066a382, - 0x406723ad, - 0x4067a3ce, - 0x406823f5, - 0x4068a416, - 0x40692448, - 0x4069a476, - 0x406a2497, - 0x406aa4b7, - 0x406b263f, - 0x406ba662, 
- 0x406c2678, - 0x406ca8f3, - 0x406d2922, - 0x406da94a, - 0x406e2978, - 0x406ea9c5, - 0x406f29e4, - 0x406faa1c, - 0x40702a2f, - 0x4070aa4c, - 0x40710800, - 0x4071aa5e, - 0x40722a71, - 0x4072aa8a, - 0x40732aa2, - 0x407394a4, - 0x40742ab6, - 0x4074aad0, - 0x40752ae1, - 0x4075aaf5, - 0x40762b03, - 0x4076927b, - 0x40772b28, - 0x4077ab4a, - 0x40782b65, - 0x4078ab9e, - 0x40792bb5, - 0x4079abcb, - 0x407a2bd7, - 0x407aabea, - 0x407b2bff, - 0x407bac11, - 0x407c2c42, - 0x407cac4b, - 0x407d2431, - 0x407d9e9d, - 0x407e2b7a, - 0x407ea056, - 0x407f1c93, - 0x407f9a53, - 0x40801e74, - 0x40809cbb, - 0x40811f19, - 0x40819e4e, - 0x40822963, - 0x40829a39, - 0x40832031, - 0x4083a2f3, - 0x40841ccf, - 0x4084a08e, - 0x40852113, - 0x4085a229, - 0x40862185, - 0x40869eb7, - 0x408729a9, - 0x4087a25a, - 0x40881a9a, - 0x4088a3e1, - 0x40891ae9, - 0x40899a76, - 0x408a2698, - 0x408a9884, - 0x408b2c26, - 0x408ba9f9, - 0x408c2123, - 0x408c98a0, - 0x408d1cfc, - 0x41f4256a, - 0x41f925fc, - 0x41fe24ef, - 0x41fea6e4, - 0x41ff27d5, - 0x42032583, - 0x420825a5, - 0x4208a5e1, - 0x420924d3, - 0x4209a61b, - 0x420a252a, - 0x420aa50a, - 0x420b254a, - 0x420ba5c3, - 0x420c27f1, - 0x420ca6b1, - 0x420d26cb, - 0x420da702, - 0x4212271c, - 0x421727b8, - 0x4217a75e, - 0x421c2780, - 0x421f273b, - 0x42212808, - 0x4226279b, - 0x422b28d7, - 0x422ba885, - 0x422c28bf, - 0x422ca844, - 0x422d2823, - 0x422da8a4, - 0x422e286a, - 0x422ea990, - 0x4432072b, - 0x4432873a, - 0x44330746, - 0x44338754, - 0x44340767, - 0x44348778, - 0x4435077f, - 0x44358789, - 0x4436079c, - 0x443687b2, - 0x443707c4, - 0x443787d1, - 0x443807e0, - 0x443887e8, - 0x44390800, - 0x4439880e, - 0x443a0821, - 0x483212a5, - 0x483292b7, - 0x483312cd, - 0x483392e6, - 0x4c32130b, - 0x4c32931b, - 0x4c33132e, - 0x4c33934e, - 0x4c3400ac, - 0x4c3480ea, - 0x4c35135a, - 0x4c359368, - 0x4c361384, - 0x4c369397, - 0x4c3713a6, - 0x4c3793b4, - 0x4c3813c9, - 0x4c3893d5, - 0x4c3913f5, - 0x4c39941f, - 0x4c3a1438, - 0x4c3a9451, - 0x4c3b05fb, - 0x4c3b946a, - 0x4c3c147c, - 0x4c3c948b, - 0x4c3d14a4, - 0x4c3d8c45, - 0x4c3e14fd, - 0x4c3e94b3, - 0x4c3f151f, - 0x4c3f927b, - 0x4c4014c9, - 0x4c4092f7, - 0x4c4114ed, - 0x50322e84, - 0x5032ae93, - 0x50332e9e, - 0x5033aeae, - 0x50342ec7, - 0x5034aee1, - 0x50352eef, - 0x5035af05, - 0x50362f17, - 0x5036af2d, - 0x50372f46, - 0x5037af59, - 0x50382f71, - 0x5038af82, - 0x50392f97, - 0x5039afab, - 0x503a2fcb, - 0x503aafe1, - 0x503b2ff9, - 0x503bb00b, - 0x503c3027, - 0x503cb03e, - 0x503d3057, - 0x503db06d, - 0x503e307a, - 0x503eb090, - 0x503f30a2, - 0x503f8382, - 0x504030b5, - 0x5040b0c5, - 0x504130df, - 0x5041b0ee, - 0x50423108, - 0x5042b125, - 0x50433135, - 0x5043b145, - 0x50443154, - 0x5044843f, - 0x50453168, - 0x5045b186, - 0x50463199, - 0x5046b1af, - 0x504731c1, - 0x5047b1d6, - 0x504831fc, - 0x5048b20a, - 0x5049321d, - 0x5049b232, - 0x504a3248, - 0x504ab258, - 0x504b3278, - 0x504bb28b, - 0x504c32ae, - 0x504cb2dc, - 0x504d32ee, - 0x504db30b, - 0x504e3326, - 0x504eb342, - 0x504f3354, - 0x504fb36b, - 0x5050337a, - 0x505086ef, - 0x5051338d, - 0x58320f3a, - 0x68320efc, - 0x68328c6a, - 0x68330c7d, - 0x68338f0a, - 0x68340f1a, - 0x683480ea, - 0x6c320ed8, - 0x6c328c34, - 0x6c330ee3, - 0x74320a19, - 0x743280ac, - 0x74330c45, - 0x7832097e, - 0x78328993, - 0x7833099f, - 0x78338083, - 0x783409ae, - 0x783489c3, - 0x783509e2, - 0x78358a04, - 0x78360a19, - 0x78368a2f, - 0x78370a3f, - 0x78378a60, - 0x78380a73, - 0x78388a85, - 0x78390a92, - 0x78398ab1, - 0x783a0ac6, - 0x783a8ad4, - 0x783b0ade, - 0x783b8af2, - 0x783c0b09, - 0x783c8b1e, - 0x783d0b35, - 0x783d8b4a, - 0x783e0aa0, - 
0x783e8a52, - 0x7c321194, + 0xc32083a, 0xc328854, 0xc330863, 0xc338873, 0xc340882, 0xc34889b, + 0xc3508a7, 0xc3588c4, 0xc3608e4, 0xc3688f2, 0xc370902, 0xc37890f, + 0xc38091f, 0xc38892a, 0xc390940, 0xc39894f, 0xc3a0963, 0xc3a8847, + 0xc3b00ea, 0xc3b88d6, 0x10320847, 0x1032959f, 0x103315ab, 0x103395c4, + 0x103415d7, 0x10348f27, 0x10350c60, 0x103595ea, 0x10361614, 0x10369627, + 0x10371646, 0x1037965f, 0x10381674, 0x10389692, 0x103916a1, 0x103996bd, + 0x103a16d8, 0x103a96e7, 0x103b1703, 0x103b971e, 0x103c1744, 0x103c80ea, + 0x103d1755, 0x103d9769, 0x103e1788, 0x103e9797, 0x103f17ae, 0x103f97c1, + 0x10400c24, 0x104097d4, 0x104117f2, 0x10419805, 0x1042181f, 0x1042982f, + 0x10431843, 0x10439859, 0x10441871, 0x10449886, 0x1045189a, 0x104598ac, + 0x104605fd, 0x1046894f, 0x104718c1, 0x104798d8, 0x104818ed, 0x104898fb, + 0x10490e73, 0x10499735, 0x104a15ff, 0x14320c07, 0x14328c15, 0x14330c24, + 0x14338c36, 0x143400ac, 0x143480ea, 0x18320083, 0x18328f7d, 0x183300ac, + 0x18338f93, 0x18340fa7, 0x183480ea, 0x18350fbc, 0x18358fd4, 0x18360fe9, + 0x18368ffd, 0x18371021, 0x18379037, 0x1838104b, 0x1838905b, 0x18390a75, + 0x1839906b, 0x183a1091, 0x183a90b7, 0x183b0c7f, 0x183b9106, 0x183c1118, + 0x183c9123, 0x183d1133, 0x183d9144, 0x183e1155, 0x183e9167, 0x183f1190, + 0x183f91a9, 0x184011c1, 0x184086d5, 0x184110da, 0x184190a5, 0x184210c4, + 0x18428c6c, 0x18431080, 0x184390ec, 0x203211fb, 0x203291e8, 0x24321207, + 0x24328995, 0x24331219, 0x24339226, 0x24341233, 0x24349245, 0x24351254, + 0x24359271, 0x2436127e, 0x2436928c, 0x2437129a, 0x243792a8, 0x243812b1, + 0x243892be, 0x243912d1, 0x28320c54, 0x28328c7f, 0x28330c24, 0x28338c92, + 0x28340c60, 0x283480ac, 0x283500ea, 0x28358c6c, 0x2c322f0c, 0x2c3292e8, + 0x2c332f1a, 0x2c33af2c, 0x2c342f40, 0x2c34af52, 0x2c352f6d, 0x2c35af7f, + 0x2c362f92, 0x2c36832d, 0x2c372f9f, 0x2c37afb1, 0x2c382fd6, 0x2c38afed, + 0x2c392ffb, 0x2c39b00b, 0x2c3a301d, 0x2c3ab031, 0x2c3b3042, 0x2c3bb061, + 0x2c3c12fa, 0x2c3c9310, 0x2c3d3075, 0x2c3d9329, 0x2c3e3092, 0x2c3eb0a0, + 0x2c3f30b8, 0x2c3fb0d0, 0x2c4030fa, 0x2c4091fb, 0x2c41310b, 0x2c41b11e, + 0x2c4211c1, 0x2c42b12f, 0x2c430722, 0x2c43b053, 0x2c442fc4, 0x2c44b0dd, + 0x30320000, 0x30328015, 0x3033001f, 0x30338038, 0x3034004a, 0x30348064, + 0x3035006b, 0x30358083, 0x30360094, 0x303680ac, 0x303700b9, 0x303780c8, + 0x303800ea, 0x303880f7, 0x3039010a, 0x30398125, 0x303a013a, 0x303a814e, + 0x303b0162, 0x303b8173, 0x303c018c, 0x303c81a9, 0x303d01b7, 0x303d81cb, + 0x303e01db, 0x303e81f4, 0x303f0204, 0x303f8217, 0x30400226, 0x30408232, + 0x30410247, 0x30418257, 0x3042026e, 0x3042827b, 0x3043028e, 0x3043829d, + 0x304402b2, 0x304482d3, 0x304502e6, 0x304582f9, 0x30460312, 0x3046832d, + 0x3047034a, 0x3047835c, 0x3048036a, 0x3048837b, 0x3049038a, 0x304983a2, + 0x304a03b4, 0x304a83c8, 0x304b03e0, 0x304b83f3, 0x304c03fe, 0x304c840f, + 0x304d041b, 0x304d8431, 0x304e043f, 0x304e8455, 0x304f0467, 0x304f8479, + 0x3050049c, 0x305084af, 0x305104c0, 0x305184d0, 0x305204e8, 0x305284fd, + 0x30530515, 0x30538529, 0x30540541, 0x3054855a, 0x30550573, 0x30558590, + 0x3056059b, 0x305685b3, 0x305705c3, 0x305785d4, 0x305805e7, 0x305885fd, + 0x30590606, 0x3059861b, 0x305a062e, 0x305a863d, 0x305b065d, 0x305b866c, + 0x305c068d, 0x305c86a9, 0x305d06b5, 0x305d86d5, 0x305e06f1, 0x305e8702, + 0x305f0718, 0x305f8722, 0x3060048c, 0x34320b65, 0x34328b79, 0x34330b96, + 0x34338ba9, 0x34340bb8, 0x34348bf1, 0x34350bd5, 0x3c320083, 0x3c328cbc, + 0x3c330cd5, 0x3c338cf0, 0x3c340d0d, 0x3c348d37, 0x3c350d52, 0x3c358d78, + 0x3c360d91, 0x3c368da9, 0x3c370dba, 0x3c378dc8, 0x3c380dd5, 
0x3c388de9, + 0x3c390c7f, 0x3c398e0c, 0x3c3a0e20, 0x3c3a890f, 0x3c3b0e30, 0x3c3b8e4b, + 0x3c3c0e5d, 0x3c3c8e90, 0x3c3d0e9a, 0x3c3d8eae, 0x3c3e0ebc, 0x3c3e8ee1, + 0x3c3f0ca8, 0x3c3f8eca, 0x3c4000ac, 0x3c4080ea, 0x3c410d28, 0x3c418d67, + 0x3c420e73, 0x3c428dfd, 0x40321971, 0x40329987, 0x403319b5, 0x403399bf, + 0x403419d6, 0x403499f4, 0x40351a04, 0x40359a16, 0x40361a23, 0x40369a2f, + 0x40371a44, 0x40379a56, 0x40381a61, 0x40389a73, 0x40390f27, 0x40399a83, + 0x403a1a96, 0x403a9ab7, 0x403b1ac8, 0x403b9ad8, 0x403c0064, 0x403c8083, + 0x403d1b5c, 0x403d9b72, 0x403e1b81, 0x403e9bb9, 0x403f1bd3, 0x403f9bfb, + 0x40401c10, 0x40409c24, 0x40411c41, 0x40419c5c, 0x40421c75, 0x40429c88, + 0x40431c9c, 0x40439cb4, 0x40441ccb, 0x404480ac, 0x40451ce0, 0x40459cf2, + 0x40461d16, 0x40469d36, 0x40471d44, 0x40479d6b, 0x40481ddc, 0x40489e0f, + 0x40491e26, 0x40499e40, 0x404a1e57, 0x404a9e75, 0x404b1e8d, 0x404b9ea4, + 0x404c1eba, 0x404c9ecc, 0x404d1eed, 0x404d9f26, 0x404e1f3a, 0x404e9f47, + 0x404f1f8e, 0x404f9fd4, 0x4050202b, 0x4050a03f, 0x40512072, 0x40522082, + 0x4052a0a6, 0x405320be, 0x4053a0d1, 0x405420e6, 0x4054a109, 0x40552117, + 0x4055a154, 0x40562161, 0x4056a17a, 0x40572192, 0x4057a1a5, 0x405821ba, + 0x4058a1e1, 0x40592210, 0x4059a23d, 0x405a2251, 0x405aa261, 0x405b2279, + 0x405ba28a, 0x405c229d, 0x405ca2dc, 0x405d22e9, 0x405da30e, 0x405e234c, + 0x405e8ab3, 0x405f236d, 0x405fa37a, 0x40602388, 0x4060a3aa, 0x4061240b, + 0x4061a443, 0x4062245a, 0x4062a46b, 0x40632490, 0x4063a4a5, 0x406424bc, + 0x4064a4e8, 0x40652503, 0x4065a51a, 0x40662532, 0x4066a55c, 0x40672587, + 0x4067a5cc, 0x40682614, 0x4068a635, 0x40692667, 0x4069a695, 0x406a26b6, + 0x406aa6d6, 0x406b285e, 0x406ba881, 0x406c2897, 0x406cab3a, 0x406d2b69, + 0x406dab91, 0x406e2bbf, 0x406eac0c, 0x406f2c47, 0x406fac7f, 0x40702c92, + 0x4070acaf, 0x40710802, 0x4071acc1, 0x40722cd4, 0x4072ad0a, 0x40732d22, + 0x407394fa, 0x40742d36, 0x4074ad50, 0x40752d61, 0x4075ad75, 0x40762d83, + 0x407692be, 0x40772da8, 0x4077adca, 0x40782de5, 0x4078ae1e, 0x40792e35, + 0x4079ae4b, 0x407a2e77, 0x407aae8a, 0x407b2e9f, 0x407baeb1, 0x407c2ee2, + 0x407caeeb, 0x407d2650, 0x407d9fe4, 0x407e2dfa, 0x407ea1f1, 0x407f1d58, + 0x407f9afe, 0x40801f9e, 0x40809d80, 0x40812094, 0x40819f78, 0x40822baa, + 0x40829ae4, 0x408321cc, 0x4083a4cd, 0x40841d94, 0x4084a229, 0x408522ae, + 0x4085a3d2, 0x4086232e, 0x40869ffe, 0x40872bf0, 0x4087a420, 0x40881b45, + 0x4088a5df, 0x40891b94, 0x40899b21, 0x408a28cf, 0x408a9912, 0x408b2ec6, + 0x408bac5c, 0x408c22be, 0x408c992e, 0x408d1df5, 0x408d9dc6, 0x408e1f0f, + 0x408ea134, 0x408f25f3, 0x408fa3ee, 0x409025a8, 0x4090a300, 0x409128b7, + 0x40919954, 0x40921be1, 0x4092ac2b, 0x40932ced, 0x4093a00f, 0x40941da8, + 0x4094a8e8, 0x4095247c, 0x4095ae57, 0x40962bd7, 0x40969fb7, 0x4097205a, + 0x40979f5e, 0x41f42789, 0x41f9281b, 0x41fe270e, 0x41fea92b, 0x41ff2a1c, + 0x420327a2, 0x420827c4, 0x4208a800, 0x420926f2, 0x4209a83a, 0x420a2749, + 0x420aa729, 0x420b2769, 0x420ba7e2, 0x420c2a38, 0x420ca8f8, 0x420d2912, + 0x420da949, 0x42122963, 0x421729ff, 0x4217a9a5, 0x421c29c7, 0x421f2982, + 0x42212a4f, 0x422629e2, 0x422b2b1e, 0x422baacc, 0x422c2b06, 0x422caa8b, + 0x422d2a6a, 0x422daaeb, 0x422e2ab1, 0x4432072d, 0x4432873c, 0x44330748, + 0x44338756, 0x44340769, 0x4434877a, 0x44350781, 0x4435878b, 0x4436079e, + 0x443687b4, 0x443707c6, 0x443787d3, 0x443807e2, 0x443887ea, 0x44390802, + 0x44398810, 0x443a0823, 0x483212e8, 0x483292fa, 0x48331310, 0x48339329, + 0x4c32134e, 0x4c32935e, 0x4c331371, 0x4c339391, 0x4c3400ac, 0x4c3480ea, + 0x4c35139d, 0x4c3593ab, 0x4c3613c7, 0x4c3693ed, 0x4c3713fc, 
0x4c37940a, + 0x4c38141f, 0x4c38942b, 0x4c39144b, 0x4c399475, 0x4c3a148e, 0x4c3a94a7, + 0x4c3b05fd, 0x4c3b94c0, 0x4c3c14d2, 0x4c3c94e1, 0x4c3d14fa, 0x4c3d8c47, + 0x4c3e1567, 0x4c3e9509, 0x4c3f1589, 0x4c3f92be, 0x4c40151f, 0x4c40933a, + 0x4c411557, 0x4c4193da, 0x4c421543, 0x50323141, 0x5032b150, 0x5033315b, + 0x5033b16b, 0x50343184, 0x5034b19e, 0x503531ac, 0x5035b1c2, 0x503631d4, + 0x5036b1ea, 0x50373203, 0x5037b216, 0x5038322e, 0x5038b23f, 0x50393254, + 0x5039b268, 0x503a3288, 0x503ab29e, 0x503b32b6, 0x503bb2c8, 0x503c32e4, + 0x503cb2fb, 0x503d3314, 0x503db32a, 0x503e3337, 0x503eb34d, 0x503f335f, + 0x503f837b, 0x50403372, 0x5040b382, 0x5041339c, 0x5041b3ab, 0x504233c5, + 0x5042b3e2, 0x504333f2, 0x5043b402, 0x50443411, 0x50448431, 0x50453425, + 0x5045b443, 0x50463456, 0x5046b46c, 0x5047347e, 0x5047b493, 0x504834b9, + 0x5048b4c7, 0x504934da, 0x5049b4ef, 0x504a3505, 0x504ab515, 0x504b3535, + 0x504bb548, 0x504c356b, 0x504cb599, 0x504d35ab, 0x504db5c8, 0x504e35e3, + 0x504eb5ff, 0x504f3611, 0x504fb628, 0x50503637, 0x505086f1, 0x5051364a, + 0x58320f65, 0x68320f27, 0x68328c7f, 0x68330c92, 0x68338f35, 0x68340f45, + 0x683480ea, 0x6c320eed, 0x6c328c36, 0x6c330ef8, 0x6c338f11, 0x74320a1b, + 0x743280ac, 0x74330c47, 0x78320980, 0x78328995, 0x783309a1, 0x78338083, + 0x783409b0, 0x783489c5, 0x783509e4, 0x78358a06, 0x78360a1b, 0x78368a31, + 0x78370a41, 0x78378a62, 0x78380a75, 0x78388a87, 0x78390a94, 0x78398ab3, + 0x783a0ac8, 0x783a8ad6, 0x783b0ae0, 0x783b8af4, 0x783c0b0b, 0x783c8b20, + 0x783d0b37, 0x783d8b4c, 0x783e0aa2, 0x783e8a54, 0x7c3211d7, }; -// clang-format on const size_t kOpenSSLReasonValuesLen = sizeof(kOpenSSLReasonValues) / sizeof(kOpenSSLReasonValues[0]); @@ -774,14 +217,14 @@ const char kOpenSSLReasonStringData[] = "INTEGER_NOT_ASCII_FORMAT\0" "INTEGER_TOO_LARGE_FOR_LONG\0" "INVALID_BIT_STRING_BITS_LEFT\0" - "INVALID_BMPSTRING_LENGTH\0" + "INVALID_BMPSTRING\0" "INVALID_DIGIT\0" "INVALID_MODIFIER\0" "INVALID_NUMBER\0" "INVALID_OBJECT_ENCODING\0" "INVALID_SEPARATOR\0" "INVALID_TIME_FORMAT\0" - "INVALID_UNIVERSALSTRING_LENGTH\0" + "INVALID_UNIVERSALSTRING\0" "INVALID_UTF8STRING\0" "LIST_ERROR\0" "MISSING_ASN1_EOS\0" @@ -792,6 +235,7 @@ const char kOpenSSLReasonStringData[] = "MSTRING_WRONG_TAG\0" "NESTED_ASN1_ERROR\0" "NESTED_ASN1_STRING\0" + "NESTED_TOO_DEEP\0" "NON_HEX_CHARACTERS\0" "NOT_ASCII_FORMAT\0" "NOT_ENOUGH_DATA\0" @@ -899,6 +343,7 @@ const char kOpenSSLReasonStringData[] = "UNKNOWN_HASH\0" "BAD_Q_VALUE\0" "BAD_VERSION\0" + "INVALID_PARAMETERS\0" "MISSING_PARAMETERS\0" "NEED_NEW_SETUP_VALUES\0" "BIGNUM_OUT_OF_RANGE\0" @@ -932,6 +377,7 @@ const char kOpenSSLReasonStringData[] = "WRONG_ORDER\0" "KDF_FAILED\0" "POINT_ARITHMETIC_FAILURE\0" + "UNKNOWN_DIGEST_LENGTH\0" "BAD_SIGNATURE\0" "NOT_IMPLEMENTED\0" "RANDOM_NUMBER_GENERATION_FAILED\0" @@ -948,12 +394,13 @@ const char kOpenSSLReasonStringData[] = "INVALID_KEYBITS\0" "INVALID_MGF1_MD\0" "INVALID_PADDING_MODE\0" - "INVALID_PARAMETERS\0" + "INVALID_PEER_KEY\0" "INVALID_PSS_SALTLEN\0" "INVALID_SIGNATURE\0" "KEYS_NOT_SET\0" "MEMORY_LIMIT_EXCEEDED\0" "NOT_A_PRIVATE_KEY\0" + "NOT_XOF_OR_INVALID_LENGTH\0" "NO_DEFAULT_DIGEST\0" "NO_KEY_SET\0" "NO_MDC2_SUPPORT\0" @@ -993,6 +440,7 @@ const char kOpenSSLReasonStringData[] = "ENCRYPT_ERROR\0" "ERROR_SETTING_CIPHER_PARAMS\0" "INCORRECT_PASSWORD\0" + "INVALID_CHARACTERS\0" "KEYGEN_FAILURE\0" "KEY_GEN_ERROR\0" "METHOD_NOT_SUPPORTED\0" @@ -1008,6 +456,7 @@ const char kOpenSSLReasonStringData[] = "UNKNOWN_DIGEST\0" "UNSUPPORTED_KEYLENGTH\0" "UNSUPPORTED_KEY_DERIVATION_FUNCTION\0" + 
"UNSUPPORTED_OPTIONS\0" "UNSUPPORTED_PRF\0" "UNSUPPORTED_PRIVATE_KEY_ALGORITHM\0" "UNSUPPORTED_SALT_TYPE\0" @@ -1016,6 +465,7 @@ const char kOpenSSLReasonStringData[] = "BAD_PAD_BYTE_COUNT\0" "BAD_RSA_PARAMETERS\0" "BLOCK_TYPE_IS_NOT_01\0" + "BLOCK_TYPE_IS_NOT_02\0" "BN_NOT_INITIALIZED\0" "CANNOT_RECOVER_MULTI_PRIME_KEY\0" "CRT_PARAMS_ALREADY_GIVEN\0" @@ -1028,6 +478,7 @@ const char kOpenSSLReasonStringData[] = "DATA_TOO_SMALL_FOR_KEY_SIZE\0" "DIGEST_TOO_BIG_FOR_RSA_KEY\0" "D_E_NOT_CONGRUENT_TO_1\0" + "D_OUT_OF_RANGE\0" "EMPTY_PUBLIC_KEY\0" "FIRST_OCTET_INVALID\0" "INCONSISTENT_SET_OF_CRT_VALUES\0" @@ -1052,6 +503,7 @@ const char kOpenSSLReasonStringData[] = "WRONG_SIGNATURE_LENGTH\0" "ALPN_MISMATCH_ON_EARLY_DATA\0" "APPLICATION_DATA_INSTEAD_OF_HANDSHAKE\0" + "APPLICATION_DATA_ON_SHUTDOWN\0" "APP_DATA_IN_HANDSHAKE\0" "ATTEMPT_TO_REUSE_SESSION_IN_DIFFERENT_CONTEXT\0" "BAD_ALERT\0" @@ -1081,6 +533,7 @@ const char kOpenSSLReasonStringData[] = "CERTIFICATE_AND_PRIVATE_KEY_MISMATCH\0" "CERTIFICATE_VERIFY_FAILED\0" "CERT_CB_ERROR\0" + "CERT_DECOMPRESSION_FAILED\0" "CERT_LENGTH_MISMATCH\0" "CHANNEL_ID_NOT_P256\0" "CHANNEL_ID_SIGNATURE_INVALID\0" @@ -1100,6 +553,8 @@ const char kOpenSSLReasonStringData[] = "DTLS_MESSAGE_TOO_BIG\0" "DUPLICATE_EXTENSION\0" "DUPLICATE_KEY_SHARE\0" + "DUPLICATE_SIGNATURE_ALGORITHM\0" + "EARLY_DATA_NOT_IN_USE\0" "ECC_CERT_NOT_FOR_SIGNING\0" "EMPTY_HELLO_RETRY_REQUEST\0" "EMS_STATE_INCONSISTENT\0" @@ -1112,17 +567,22 @@ const char kOpenSSLReasonStringData[] = "FRAGMENT_MISMATCH\0" "GOT_NEXT_PROTO_WITHOUT_EXTENSION\0" "HANDSHAKE_FAILURE_ON_CLIENT_HELLO\0" + "HANDSHAKE_NOT_COMPLETE\0" "HTTPS_PROXY_REQUEST\0" "HTTP_REQUEST\0" "INAPPROPRIATE_FALLBACK\0" + "INCONSISTENT_CLIENT_HELLO\0" "INVALID_ALPN_PROTOCOL\0" "INVALID_COMMAND\0" "INVALID_COMPRESSION_LIST\0" + "INVALID_DELEGATED_CREDENTIAL\0" "INVALID_MESSAGE\0" "INVALID_OUTER_RECORD_TYPE\0" "INVALID_SCT_LIST\0" + "INVALID_SIGNATURE_ALGORITHM\0" "INVALID_SSL_SESSION\0" "INVALID_TICKET_KEYS_LENGTH\0" + "KEY_USAGE_BIT_INCORRECT\0" "LENGTH_MISMATCH\0" "MISSING_EXTENSION\0" "MISSING_KEY_SHARE\0" @@ -1132,6 +592,7 @@ const char kOpenSSLReasonStringData[] = "MIXED_SPECIAL_OPERATOR_WITH_GROUPS\0" "MTU_TOO_SMALL\0" "NEGOTIATED_BOTH_NPN_AND_ALPN\0" + "NEGOTIATED_TB_WITHOUT_EMS_OR_RI\0" "NESTED_GROUP\0" "NO_CERTIFICATES_RETURNED\0" "NO_CERTIFICATE_ASSIGNED\0" @@ -1153,6 +614,7 @@ const char kOpenSSLReasonStringData[] = "NO_SUPPORTED_VERSIONS_ENABLED\0" "NULL_SSL_CTX\0" "NULL_SSL_METHOD_PASSED\0" + "OCSP_CB_ERROR\0" "OLD_SESSION_CIPHER_NOT_RETURNED\0" "OLD_SESSION_PRF_HASH_MISMATCH\0" "OLD_SESSION_VERSION_NOT_RETURNED\0" @@ -1161,11 +623,13 @@ const char kOpenSSLReasonStringData[] = "PEER_DID_NOT_RETURN_A_CERTIFICATE\0" "PEER_ERROR_UNSUPPORTED_CERTIFICATE_TYPE\0" "PRE_SHARED_KEY_MUST_BE_LAST\0" + "PRIVATE_KEY_OPERATION_FAILED\0" "PROTOCOL_IS_SHUTDOWN\0" "PSK_IDENTITY_BINDER_COUNT_MISMATCH\0" "PSK_IDENTITY_NOT_FOUND\0" "PSK_NO_CLIENT_CB\0" "PSK_NO_SERVER_CB\0" + "QUIC_INTERNAL_ERROR\0" "READ_TIMEOUT_EXPIRED\0" "RECORD_LENGTH_MISMATCH\0" "RECORD_TOO_LARGE\0" @@ -1176,8 +640,10 @@ const char kOpenSSLReasonStringData[] = "RESUMED_EMS_SESSION_WITHOUT_EMS_EXTENSION\0" "RESUMED_NON_EMS_SESSION_WITH_EMS_EXTENSION\0" "SCSV_RECEIVED_WHEN_RENEGOTIATING\0" + "SECOND_SERVERHELLO_VERSION_MISMATCH\0" "SERVERHELLO_TLSEXT\0" "SERVER_CERT_CHANGED\0" + "SERVER_ECHOED_INVALID_SESSION_ID\0" "SESSION_ID_CONTEXT_UNINITIALIZED\0" "SESSION_MAY_NOT_BE_CREATED\0" "SHUTDOWN_WHILE_IN_INIT\0" @@ -1200,7 +666,9 @@ const char kOpenSSLReasonStringData[] = 
"SSL_CTX_HAS_NO_DEFAULT_SSL_VERSION\0" "SSL_HANDSHAKE_FAILURE\0" "SSL_SESSION_ID_CONTEXT_TOO_LONG\0" + "SSL_SESSION_ID_TOO_LONG\0" "TICKET_ENCRYPTION_FAILED\0" + "TLS13_DOWNGRADE\0" "TLSV1_ALERT_ACCESS_DENIED\0" "TLSV1_ALERT_DECODE_ERROR\0" "TLSV1_ALERT_DECRYPTION_FAILED\0" @@ -1229,6 +697,7 @@ const char kOpenSSLReasonStringData[] = "TOO_MUCH_READ_EARLY_DATA\0" "TOO_MUCH_SKIPPED_EARLY_DATA\0" "UNABLE_TO_FIND_ECDH_PARAMETERS\0" + "UNCOMPRESSED_CERT_TOO_LARGE\0" "UNEXPECTED_EXTENSION\0" "UNEXPECTED_EXTENSION_ON_EARLY_DATA\0" "UNEXPECTED_MESSAGE\0" @@ -1236,6 +705,7 @@ const char kOpenSSLReasonStringData[] = "UNEXPECTED_RECORD\0" "UNKNOWN_ALERT_TYPE\0" "UNKNOWN_CERTIFICATE_TYPE\0" + "UNKNOWN_CERT_COMPRESSION_ALG\0" "UNKNOWN_CIPHER_RETURNED\0" "UNKNOWN_CIPHER_TYPE\0" "UNKNOWN_KEY_EXCHANGE_TYPE\0" @@ -1250,6 +720,7 @@ const char kOpenSSLReasonStringData[] = "WRONG_CERTIFICATE_TYPE\0" "WRONG_CIPHER_RETURNED\0" "WRONG_CURVE\0" + "WRONG_ENCRYPTION_LEVEL_RECEIVED\0" "WRONG_MESSAGE_TYPE\0" "WRONG_SIGNATURE_TYPE\0" "WRONG_SSL_VERSION\0" @@ -1282,6 +753,7 @@ const char kOpenSSLReasonStringData[] = "PUBLIC_KEY_DECODE_ERROR\0" "PUBLIC_KEY_ENCODE_ERROR\0" "SHOULD_RETRY\0" + "SIGNATURE_ALGORITHM_MISMATCH\0" "UNKNOWN_KEY_TYPE\0" "UNKNOWN_PURPOSE_ID\0" "UNKNOWN_TRUST_ID\0" diff --git a/packager/third_party/boringssl/ios-aarch64/crypto/chacha/chacha-armv8.S b/packager/third_party/boringssl/ios-aarch64/crypto/chacha/chacha-armv8.S new file mode 100644 index 0000000000..b14466ddd7 --- /dev/null +++ b/packager/third_party/boringssl/ios-aarch64/crypto/chacha/chacha-armv8.S @@ -0,0 +1,1982 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include + + + +.section __TEXT,__const + +.align 5 +Lsigma: +.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral +Lone: +.long 1,0,0,0 +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 + +.text + +.globl _ChaCha20_ctr32 +.private_extern _ChaCha20_ctr32 + +.align 5 +_ChaCha20_ctr32: + cbz x2,Labort +#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10 + adrp x5,:pg_hi21_nc:_OPENSSL_armcap_P +#else + adrp x5,_OPENSSL_armcap_P@PAGE +#endif + cmp x2,#192 + b.lo Lshort + ldr w17,[x5,_OPENSSL_armcap_P@PAGEOFF] + tst w17,#ARMV7_NEON + b.ne ChaCha20_neon + +Lshort: + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + + adrp x5,Lsigma@PAGE + add x5,x5,Lsigma@PAGEOFF + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#64 + + ldp x22,x23,[x5] // load sigma + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ldp x28,x30,[x4] // load counter +#ifdef __ARMEB__ + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + +Loop_outer: + mov w5,w22 // unpack key block + lsr x6,x22,#32 + mov w7,w23 + lsr x8,x23,#32 + mov w9,w24 + lsr x10,x24,#32 + mov w11,w25 + lsr x12,x25,#32 + mov w13,w26 + lsr x14,x26,#32 + mov w15,w27 + lsr x16,x27,#32 + mov w17,w28 + lsr x19,x28,#32 + mov w20,w30 + lsr x21,x30,#32 + + mov x4,#10 + subs x2,x2,#64 +Loop: + sub x4,x4,#1 + add w5,w5,w9 + add w6,w6,w10 + add w7,w7,w11 + add w8,w8,w12 + eor w17,w17,w5 + eor w19,w19,w6 + eor w20,w20,w7 + eor w21,w21,w8 + ror w17,w17,#16 + ror w19,w19,#16 + ror w20,w20,#16 + ror w21,w21,#16 + add w13,w13,w17 + add w14,w14,w19 + add w15,w15,w20 + add w16,w16,w21 + eor w9,w9,w13 + eor w10,w10,w14 + eor w11,w11,w15 + eor w12,w12,w16 + ror w9,w9,#20 + ror w10,w10,#20 + ror w11,w11,#20 + ror w12,w12,#20 + add w5,w5,w9 + add w6,w6,w10 + add w7,w7,w11 + add w8,w8,w12 + eor w17,w17,w5 + eor w19,w19,w6 + eor w20,w20,w7 + eor w21,w21,w8 + ror w17,w17,#24 + ror w19,w19,#24 + ror w20,w20,#24 + ror w21,w21,#24 + add w13,w13,w17 + add w14,w14,w19 + add w15,w15,w20 + add w16,w16,w21 + eor w9,w9,w13 + eor w10,w10,w14 + eor w11,w11,w15 + eor w12,w12,w16 + ror w9,w9,#25 + ror w10,w10,#25 + ror w11,w11,#25 + ror w12,w12,#25 + add w5,w5,w10 + add w6,w6,w11 + add w7,w7,w12 + add w8,w8,w9 + eor w21,w21,w5 + eor w17,w17,w6 + eor w19,w19,w7 + eor w20,w20,w8 + ror w21,w21,#16 + ror w17,w17,#16 + ror w19,w19,#16 + ror w20,w20,#16 + add w15,w15,w21 + add w16,w16,w17 + add w13,w13,w19 + add w14,w14,w20 + eor w10,w10,w15 + eor w11,w11,w16 + eor w12,w12,w13 + eor w9,w9,w14 + ror w10,w10,#20 + ror w11,w11,#20 + ror w12,w12,#20 + ror w9,w9,#20 + add w5,w5,w10 + add w6,w6,w11 + add w7,w7,w12 + add w8,w8,w9 + eor w21,w21,w5 + eor w17,w17,w6 + eor w19,w19,w7 + eor w20,w20,w8 + ror w21,w21,#24 + ror w17,w17,#24 + ror w19,w19,#24 + ror w20,w20,#24 + add w15,w15,w21 + add w16,w16,w17 + add w13,w13,w19 + add w14,w14,w20 + eor w10,w10,w15 + eor w11,w11,w16 + eor w12,w12,w13 + eor w9,w9,w14 + ror w10,w10,#25 + ror w11,w11,#25 + ror w12,w12,#25 + ror w9,w9,#25 + cbnz x4,Loop + + add w5,w5,w22 // accumulate key block + add x6,x6,x22,lsr#32 + add w7,w7,w23 + add x8,x8,x23,lsr#32 + add w9,w9,w24 + add x10,x10,x24,lsr#32 + add w11,w11,w25 + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add x21,x21,x30,lsr#32 + + b.lo Ltail + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __ARMEB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#1 // increment counter + stp x9,x11,[x0,#16] + stp 
x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + + b.hi Loop_outer + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 +Labort: + ret + +.align 4 +Ltail: + add x2,x2,#64 +Less_than_64: + sub x0,x0,#1 + add x1,x1,x2 + add x0,x0,x2 + add x4,sp,x2 + neg x2,x2 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 +#ifdef __ARMEB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + stp x5,x7,[sp,#0] + stp x9,x11,[sp,#16] + stp x13,x15,[sp,#32] + stp x17,x20,[sp,#48] + +Loop_tail: + ldrb w10,[x1,x2] + ldrb w11,[x4,x2] + add x2,x2,#1 + eor w10,w10,w11 + strb w10,[x0,x2] + cbnz x2,Loop_tail + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + ret + + + +.align 5 +ChaCha20_neon: + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adrp x5,Lsigma@PAGE + add x5,x5,Lsigma@PAGEOFF + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + cmp x2,#512 + b.hs L512_or_more_neon + + sub sp,sp,#64 + + ldp x22,x23,[x5] // load sigma + ld1 {v24.4s},[x5],#16 + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ld1 {v25.4s,v26.4s},[x3] + ldp x28,x30,[x4] // load counter + ld1 {v27.4s},[x4] + ld1 {v31.4s},[x5] +#ifdef __ARMEB__ + rev64 v24.4s,v24.4s + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + add v27.4s,v27.4s,v31.4s // += 1 + add v28.4s,v27.4s,v31.4s + add v29.4s,v28.4s,v31.4s + shl v31.4s,v31.4s,#2 // 1 -> 4 + +Loop_outer_neon: + mov w5,w22 // unpack key block + lsr x6,x22,#32 + mov v0.16b,v24.16b + mov w7,w23 + lsr x8,x23,#32 + mov v4.16b,v24.16b + mov w9,w24 + lsr x10,x24,#32 + mov v16.16b,v24.16b + mov w11,w25 + mov v1.16b,v25.16b + lsr x12,x25,#32 + mov v5.16b,v25.16b + mov w13,w26 + mov v17.16b,v25.16b + lsr x14,x26,#32 + mov v3.16b,v27.16b + mov w15,w27 + mov v7.16b,v28.16b + lsr x16,x27,#32 + mov v19.16b,v29.16b + mov w17,w28 + mov v2.16b,v26.16b + lsr x19,x28,#32 + mov v6.16b,v26.16b + mov w20,w30 + mov v18.16b,v26.16b + lsr x21,x30,#32 + + mov x4,#10 + subs x2,x2,#256 +Loop_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v16.4s,v16.4s,v17.4s + add w7,w7,w11 + eor v3.16b,v3.16b,v0.16b + add w8,w8,w12 + eor v7.16b,v7.16b,v4.16b + eor w17,w17,w5 + eor v19.16b,v19.16b,v16.16b + eor w19,w19,w6 + rev32 v3.8h,v3.8h + eor w20,w20,w7 + rev32 v7.8h,v7.8h + eor w21,w21,w8 + rev32 v19.8h,v19.8h + ror w17,w17,#16 + add v2.4s,v2.4s,v3.4s + ror w19,w19,#16 + add v6.4s,v6.4s,v7.4s + ror w20,w20,#16 + add v18.4s,v18.4s,v19.4s + ror w21,w21,#16 + eor v20.16b,v1.16b,v2.16b + add w13,w13,w17 + eor v21.16b,v5.16b,v6.16b + add w14,w14,w19 + eor v22.16b,v17.16b,v18.16b + add w15,w15,w20 + ushr v1.4s,v20.4s,#20 + add w16,w16,w21 + ushr v5.4s,v21.4s,#20 + eor w9,w9,w13 + ushr v17.4s,v22.4s,#20 + eor w10,w10,w14 + sli v1.4s,v20.4s,#12 + eor w11,w11,w15 + sli v5.4s,v21.4s,#12 + eor w12,w12,w16 + sli v17.4s,v22.4s,#12 + ror w9,w9,#20 + add v0.4s,v0.4s,v1.4s + ror w10,w10,#20 + add v4.4s,v4.4s,v5.4s + ror w11,w11,#20 + add 
v16.4s,v16.4s,v17.4s + ror w12,w12,#20 + eor v20.16b,v3.16b,v0.16b + add w5,w5,w9 + eor v21.16b,v7.16b,v4.16b + add w6,w6,w10 + eor v22.16b,v19.16b,v16.16b + add w7,w7,w11 + ushr v3.4s,v20.4s,#24 + add w8,w8,w12 + ushr v7.4s,v21.4s,#24 + eor w17,w17,w5 + ushr v19.4s,v22.4s,#24 + eor w19,w19,w6 + sli v3.4s,v20.4s,#8 + eor w20,w20,w7 + sli v7.4s,v21.4s,#8 + eor w21,w21,w8 + sli v19.4s,v22.4s,#8 + ror w17,w17,#24 + add v2.4s,v2.4s,v3.4s + ror w19,w19,#24 + add v6.4s,v6.4s,v7.4s + ror w20,w20,#24 + add v18.4s,v18.4s,v19.4s + ror w21,w21,#24 + eor v20.16b,v1.16b,v2.16b + add w13,w13,w17 + eor v21.16b,v5.16b,v6.16b + add w14,w14,w19 + eor v22.16b,v17.16b,v18.16b + add w15,w15,w20 + ushr v1.4s,v20.4s,#25 + add w16,w16,w21 + ushr v5.4s,v21.4s,#25 + eor w9,w9,w13 + ushr v17.4s,v22.4s,#25 + eor w10,w10,w14 + sli v1.4s,v20.4s,#7 + eor w11,w11,w15 + sli v5.4s,v21.4s,#7 + eor w12,w12,w16 + sli v17.4s,v22.4s,#7 + ror w9,w9,#25 + ext v2.16b,v2.16b,v2.16b,#8 + ror w10,w10,#25 + ext v6.16b,v6.16b,v6.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w10 + add v4.4s,v4.4s,v5.4s + add w6,w6,w11 + add v16.4s,v16.4s,v17.4s + add w7,w7,w12 + eor v3.16b,v3.16b,v0.16b + add w8,w8,w9 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w5 + eor v19.16b,v19.16b,v16.16b + eor w17,w17,w6 + rev32 v3.8h,v3.8h + eor w19,w19,w7 + rev32 v7.8h,v7.8h + eor w20,w20,w8 + rev32 v19.8h,v19.8h + ror w21,w21,#16 + add v2.4s,v2.4s,v3.4s + ror w17,w17,#16 + add v6.4s,v6.4s,v7.4s + ror w19,w19,#16 + add v18.4s,v18.4s,v19.4s + ror w20,w20,#16 + eor v20.16b,v1.16b,v2.16b + add w15,w15,w21 + eor v21.16b,v5.16b,v6.16b + add w16,w16,w17 + eor v22.16b,v17.16b,v18.16b + add w13,w13,w19 + ushr v1.4s,v20.4s,#20 + add w14,w14,w20 + ushr v5.4s,v21.4s,#20 + eor w10,w10,w15 + ushr v17.4s,v22.4s,#20 + eor w11,w11,w16 + sli v1.4s,v20.4s,#12 + eor w12,w12,w13 + sli v5.4s,v21.4s,#12 + eor w9,w9,w14 + sli v17.4s,v22.4s,#12 + ror w10,w10,#20 + add v0.4s,v0.4s,v1.4s + ror w11,w11,#20 + add v4.4s,v4.4s,v5.4s + ror w12,w12,#20 + add v16.4s,v16.4s,v17.4s + ror w9,w9,#20 + eor v20.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v21.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v22.16b,v19.16b,v16.16b + add w7,w7,w12 + ushr v3.4s,v20.4s,#24 + add w8,w8,w9 + ushr v7.4s,v21.4s,#24 + eor w21,w21,w5 + ushr v19.4s,v22.4s,#24 + eor w17,w17,w6 + sli v3.4s,v20.4s,#8 + eor w19,w19,w7 + sli v7.4s,v21.4s,#8 + eor w20,w20,w8 + sli v19.4s,v22.4s,#8 + ror w21,w21,#24 + add v2.4s,v2.4s,v3.4s + ror w17,w17,#24 + add v6.4s,v6.4s,v7.4s + ror w19,w19,#24 + add v18.4s,v18.4s,v19.4s + ror w20,w20,#24 + eor v20.16b,v1.16b,v2.16b + add w15,w15,w21 + eor v21.16b,v5.16b,v6.16b + add w16,w16,w17 + eor v22.16b,v17.16b,v18.16b + add w13,w13,w19 + ushr v1.4s,v20.4s,#25 + add w14,w14,w20 + ushr v5.4s,v21.4s,#25 + eor w10,w10,w15 + ushr v17.4s,v22.4s,#25 + eor w11,w11,w16 + sli v1.4s,v20.4s,#7 + eor w12,w12,w13 + sli v5.4s,v21.4s,#7 + eor w9,w9,w14 + sli v17.4s,v22.4s,#7 + ror w10,w10,#25 + ext v2.16b,v2.16b,v2.16b,#8 + ror w11,w11,#25 + ext v6.16b,v6.16b,v6.16b,#8 + ror w12,w12,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + cbnz x4,Loop_neon + + add w5,w5,w22 // accumulate 
key block + add v0.4s,v0.4s,v24.4s + add x6,x6,x22,lsr#32 + add v4.4s,v4.4s,v24.4s + add w7,w7,w23 + add v16.4s,v16.4s,v24.4s + add x8,x8,x23,lsr#32 + add v2.4s,v2.4s,v26.4s + add w9,w9,w24 + add v6.4s,v6.4s,v26.4s + add x10,x10,x24,lsr#32 + add v18.4s,v18.4s,v26.4s + add w11,w11,w25 + add v3.4s,v3.4s,v27.4s + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add v7.4s,v7.4s,v28.4s + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add v19.4s,v19.4s,v29.4s + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add v1.4s,v1.4s,v25.4s + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add v5.4s,v5.4s,v25.4s + add x21,x21,x30,lsr#32 + add v17.4s,v17.4s,v25.4s + + b.lo Ltail_neon + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __ARMEB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor v0.16b,v0.16b,v20.16b + eor x15,x15,x16 + eor v1.16b,v1.16b,v21.16b + eor x17,x17,x19 + eor v2.16b,v2.16b,v22.16b + eor x20,x20,x21 + eor v3.16b,v3.16b,v23.16b + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#4 // increment counter + stp x9,x11,[x0,#16] + add v27.4s,v27.4s,v31.4s // += 4 + stp x13,x15,[x0,#32] + add v28.4s,v28.4s,v31.4s + stp x17,x20,[x0,#48] + add v29.4s,v29.4s,v31.4s + add x0,x0,#64 + + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + + eor v4.16b,v4.16b,v20.16b + eor v5.16b,v5.16b,v21.16b + eor v6.16b,v6.16b,v22.16b + eor v7.16b,v7.16b,v23.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + + eor v16.16b,v16.16b,v0.16b + eor v17.16b,v17.16b,v1.16b + eor v18.16b,v18.16b,v2.16b + eor v19.16b,v19.16b,v3.16b + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + + b.hi Loop_outer_neon + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + ret + +Ltail_neon: + add x2,x2,#256 + cmp x2,#64 + b.lo Less_than_64 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __ARMEB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#4 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + b.eq Ldone_neon + sub x2,x2,#64 + cmp x2,#64 + b.lo Less_than_128 + + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v0.16b,v0.16b,v20.16b + eor v1.16b,v1.16b,v21.16b + eor v2.16b,v2.16b,v22.16b + eor v3.16b,v3.16b,v23.16b + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + b.eq Ldone_neon + sub x2,x2,#64 + cmp x2,#64 + b.lo Less_than_192 + + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v4.16b,v4.16b,v20.16b + 
eor v5.16b,v5.16b,v21.16b + eor v6.16b,v6.16b,v22.16b + eor v7.16b,v7.16b,v23.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + b.eq Ldone_neon + sub x2,x2,#64 + + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] + b Last_neon + +Less_than_128: + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] + b Last_neon +Less_than_192: + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] + b Last_neon + +.align 4 +Last_neon: + sub x0,x0,#1 + add x1,x1,x2 + add x0,x0,x2 + add x4,sp,x2 + neg x2,x2 + +Loop_tail_neon: + ldrb w10,[x1,x2] + ldrb w11,[x4,x2] + add x2,x2,#1 + eor w10,w10,w11 + strb w10,[x0,x2] + cbnz x2,Loop_tail_neon + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + +Ldone_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + ret + + +.align 5 +ChaCha20_512_neon: + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adrp x5,Lsigma@PAGE + add x5,x5,Lsigma@PAGEOFF + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + +L512_or_more_neon: + sub sp,sp,#128+64 + + ldp x22,x23,[x5] // load sigma + ld1 {v24.4s},[x5],#16 + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ld1 {v25.4s,v26.4s},[x3] + ldp x28,x30,[x4] // load counter + ld1 {v27.4s},[x4] + ld1 {v31.4s},[x5] +#ifdef __ARMEB__ + rev64 v24.4s,v24.4s + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + add v27.4s,v27.4s,v31.4s // += 1 + stp q24,q25,[sp,#0] // off-load key block, invariant part + add v27.4s,v27.4s,v31.4s // not typo + str q26,[sp,#32] + add v28.4s,v27.4s,v31.4s + add v29.4s,v28.4s,v31.4s + add v30.4s,v29.4s,v31.4s + shl v31.4s,v31.4s,#2 // 1 -> 4 + + stp d8,d9,[sp,#128+0] // meet ABI requirements + stp d10,d11,[sp,#128+16] + stp d12,d13,[sp,#128+32] + stp d14,d15,[sp,#128+48] + + sub x2,x2,#512 // not typo + +Loop_outer_512_neon: + mov v0.16b,v24.16b + mov v4.16b,v24.16b + mov v8.16b,v24.16b + mov v12.16b,v24.16b + mov v16.16b,v24.16b + mov v20.16b,v24.16b + mov v1.16b,v25.16b + mov w5,w22 // unpack key block + mov v5.16b,v25.16b + lsr x6,x22,#32 + mov v9.16b,v25.16b + mov w7,w23 + mov v13.16b,v25.16b + lsr x8,x23,#32 + mov v17.16b,v25.16b + mov w9,w24 + mov v21.16b,v25.16b + lsr x10,x24,#32 + mov v3.16b,v27.16b + mov w11,w25 + mov v7.16b,v28.16b + lsr x12,x25,#32 + mov v11.16b,v29.16b + mov w13,w26 + mov v15.16b,v30.16b + lsr x14,x26,#32 + mov v2.16b,v26.16b + mov w15,w27 + mov v6.16b,v26.16b + lsr x16,x27,#32 + add v19.4s,v3.4s,v31.4s // +4 + mov w17,w28 + add v23.4s,v7.4s,v31.4s // +4 + lsr x19,x28,#32 + mov v10.16b,v26.16b + mov w20,w30 + mov v14.16b,v26.16b + lsr x21,x30,#32 + mov v18.16b,v26.16b + stp q27,q28,[sp,#48] // off-load key block, variable part + mov v22.16b,v26.16b + str q29,[sp,#80] + + mov x4,#5 + subs x2,x2,#512 +Loop_upper_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 
v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext 
v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v11.16b,v11.16b,v11.16b,#12 + ext v15.16b,v15.16b,v15.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v23.16b,v23.16b,v23.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v9.16b,v9.16b,v9.16b,#4 + ext v13.16b,v13.16b,v13.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + ext v21.16b,v21.16b,v21.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 
+ eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v11.16b,v11.16b,v11.16b,#4 + ext v15.16b,v15.16b,v15.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v23.16b,v23.16b,v23.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v9.16b,v9.16b,v9.16b,#12 + ext v13.16b,v13.16b,v13.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + ext v21.16b,v21.16b,v21.16b,#12 + cbnz x4,Loop_upper_neon + + add w5,w5,w22 // accumulate key block + add x6,x6,x22,lsr#32 + add w7,w7,w23 + add x8,x8,x23,lsr#32 + add w9,w9,w24 + add x10,x10,x24,lsr#32 + add w11,w11,w25 + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add x21,x21,x30,lsr#32 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __ARMEB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#1 // increment counter + mov w5,w22 // unpack key block + lsr x6,x22,#32 + stp x9,x11,[x0,#16] + mov w7,w23 + lsr x8,x23,#32 + stp x13,x15,[x0,#32] + mov w9,w24 + lsr x10,x24,#32 + stp x17,x20,[x0,#48] + add x0,x0,#64 + mov w11,w25 + lsr x12,x25,#32 + mov w13,w26 + lsr x14,x26,#32 + mov w15,w27 + lsr x16,x27,#32 + mov w17,w28 + lsr x19,x28,#32 + mov w20,w30 + lsr x21,x30,#32 + + mov x4,#5 +Loop_lower_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 
v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v11.16b,v11.16b,v11.16b,#12 + ext v15.16b,v15.16b,v15.16b,#12 + ext 
v19.16b,v19.16b,v19.16b,#12 + ext v23.16b,v23.16b,v23.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v9.16b,v9.16b,v9.16b,#4 + ext v13.16b,v13.16b,v13.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + ext v21.16b,v21.16b,v21.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor 
w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v11.16b,v11.16b,v11.16b,#4 + ext v15.16b,v15.16b,v15.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v23.16b,v23.16b,v23.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v9.16b,v9.16b,v9.16b,#12 + ext v13.16b,v13.16b,v13.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + ext v21.16b,v21.16b,v21.16b,#12 + cbnz x4,Loop_lower_neon + + add w5,w5,w22 // accumulate key block + ldp q24,q25,[sp,#0] + add x6,x6,x22,lsr#32 + ldp q26,q27,[sp,#32] + add w7,w7,w23 + ldp q28,q29,[sp,#64] + add x8,x8,x23,lsr#32 + add v0.4s,v0.4s,v24.4s + add w9,w9,w24 + add v4.4s,v4.4s,v24.4s + add x10,x10,x24,lsr#32 + add v8.4s,v8.4s,v24.4s + add w11,w11,w25 + add v12.4s,v12.4s,v24.4s + add x12,x12,x25,lsr#32 + add v16.4s,v16.4s,v24.4s + add w13,w13,w26 + add v20.4s,v20.4s,v24.4s + add x14,x14,x26,lsr#32 + add v2.4s,v2.4s,v26.4s + add w15,w15,w27 + add v6.4s,v6.4s,v26.4s + add x16,x16,x27,lsr#32 + add v10.4s,v10.4s,v26.4s + add w17,w17,w28 + add v14.4s,v14.4s,v26.4s + add x19,x19,x28,lsr#32 + add v18.4s,v18.4s,v26.4s + add w20,w20,w30 + add v22.4s,v22.4s,v26.4s + add x21,x21,x30,lsr#32 + add v19.4s,v19.4s,v31.4s // +4 + add x5,x5,x6,lsl#32 // pack + add v23.4s,v23.4s,v31.4s // +4 + add x7,x7,x8,lsl#32 + add v3.4s,v3.4s,v27.4s + ldp x6,x8,[x1,#0] // load input + add v7.4s,v7.4s,v28.4s + add x9,x9,x10,lsl#32 + add v11.4s,v11.4s,v29.4s + add x11,x11,x12,lsl#32 + add v15.4s,v15.4s,v30.4s + ldp x10,x12,[x1,#16] + add v19.4s,v19.4s,v27.4s + add x13,x13,x14,lsl#32 + add v23.4s,v23.4s,v28.4s + add x15,x15,x16,lsl#32 + add v1.4s,v1.4s,v25.4s + ldp x14,x16,[x1,#32] + add v5.4s,v5.4s,v25.4s + add x17,x17,x19,lsl#32 + add v9.4s,v9.4s,v25.4s + add x20,x20,x21,lsl#32 + add v13.4s,v13.4s,v25.4s + ldp x19,x21,[x1,#48] + add v17.4s,v17.4s,v25.4s + add x1,x1,#64 + add v21.4s,v21.4s,v25.4s + +#ifdef __ARMEB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor v0.16b,v0.16b,v24.16b + eor x15,x15,x16 + eor v1.16b,v1.16b,v25.16b + eor x17,x17,x19 + eor v2.16b,v2.16b,v26.16b + eor x20,x20,x21 + eor v3.16b,v3.16b,v27.16b + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#7 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + eor v4.16b,v4.16b,v24.16b + eor v5.16b,v5.16b,v25.16b + eor v6.16b,v6.16b,v26.16b + eor v7.16b,v7.16b,v27.16b + st1 
{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + eor v8.16b,v8.16b,v0.16b + ldp q24,q25,[sp,#0] + eor v9.16b,v9.16b,v1.16b + ldp q26,q27,[sp,#32] + eor v10.16b,v10.16b,v2.16b + eor v11.16b,v11.16b,v3.16b + st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 + + ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 + eor v12.16b,v12.16b,v4.16b + eor v13.16b,v13.16b,v5.16b + eor v14.16b,v14.16b,v6.16b + eor v15.16b,v15.16b,v7.16b + st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 + + ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 + eor v16.16b,v16.16b,v8.16b + eor v17.16b,v17.16b,v9.16b + eor v18.16b,v18.16b,v10.16b + eor v19.16b,v19.16b,v11.16b + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + + shl v0.4s,v31.4s,#1 // 4 -> 8 + eor v20.16b,v20.16b,v12.16b + eor v21.16b,v21.16b,v13.16b + eor v22.16b,v22.16b,v14.16b + eor v23.16b,v23.16b,v15.16b + st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 + + add v27.4s,v27.4s,v0.4s // += 8 + add v28.4s,v28.4s,v0.4s + add v29.4s,v29.4s,v0.4s + add v30.4s,v30.4s,v0.4s + + b.hs Loop_outer_512_neon + + adds x2,x2,#512 + ushr v0.4s,v31.4s,#2 // 4 -> 1 + + ldp d8,d9,[sp,#128+0] // meet ABI requirements + ldp d10,d11,[sp,#128+16] + ldp d12,d13,[sp,#128+32] + ldp d14,d15,[sp,#128+48] + + stp q24,q31,[sp,#0] // wipe off-load area + stp q24,q31,[sp,#32] + stp q24,q31,[sp,#64] + + b.eq Ldone_512_neon + + cmp x2,#192 + sub v27.4s,v27.4s,v0.4s // -= 1 + sub v28.4s,v28.4s,v0.4s + sub v29.4s,v29.4s,v0.4s + add sp,sp,#128 + b.hs Loop_outer_neon + + eor v25.16b,v25.16b,v25.16b + eor v26.16b,v26.16b,v26.16b + eor v27.16b,v27.16b,v27.16b + eor v28.16b,v28.16b,v28.16b + eor v29.16b,v29.16b,v29.16b + eor v30.16b,v30.16b,v30.16b + b Loop_outer + +Ldone_512_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#128+64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + ret + +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/aesv8-armx64.S b/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/aesv8-armx64.S new file mode 100644 index 0000000000..dc2d6e432c --- /dev/null +++ b/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/aesv8-armx64.S @@ -0,0 +1,772 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include + +#if __ARM_MAX_ARCH__>=7 +.text + +.section __TEXT,__const +.align 5 +Lrcon: +.long 0x01,0x01,0x01,0x01 +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat +.long 0x1b,0x1b,0x1b,0x1b + +.text + +.globl _aes_hw_set_encrypt_key +.private_extern _aes_hw_set_encrypt_key + +.align 5 +_aes_hw_set_encrypt_key: +Lenc_key: + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + mov x3,#-1 + cmp x0,#0 + b.eq Lenc_key_abort + cmp x2,#0 + b.eq Lenc_key_abort + mov x3,#-2 + cmp w1,#128 + b.lt Lenc_key_abort + cmp w1,#256 + b.gt Lenc_key_abort + tst w1,#0x3f + b.ne Lenc_key_abort + + adrp x3,Lrcon@PAGE + add x3,x3,Lrcon@PAGEOFF + cmp w1,#192 + + eor v0.16b,v0.16b,v0.16b + ld1 {v3.16b},[x0],#16 + mov w1,#8 // reuse w1 + ld1 {v1.4s,v2.4s},[x3],#32 + + b.lt Loop128 + b.eq L192 + b L256 + +.align 4 +Loop128: + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + b.ne Loop128 + + ld1 {v1.4s},[x3] + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2] + add x2,x2,#0x50 + + mov w12,#10 + b Ldone + +.align 4 +L192: + ld1 {v4.8b},[x0],#8 + movi v6.16b,#8 // borrow v6.16b + st1 {v3.4s},[x2],#16 + sub v2.16b,v2.16b,v6.16b // adjust the mask + +Loop192: + tbl v6.16b,{v4.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v4.8b},[x2],#8 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + + dup v5.4s,v3.s[3] + eor v5.16b,v5.16b,v4.16b + eor v6.16b,v6.16b,v1.16b + ext v4.16b,v0.16b,v4.16b,#12 + shl v1.16b,v1.16b,#1 + eor v4.16b,v4.16b,v5.16b + eor v3.16b,v3.16b,v6.16b + eor v4.16b,v4.16b,v6.16b + st1 {v3.4s},[x2],#16 + b.ne Loop192 + + mov w12,#12 + add x2,x2,#0x20 + b Ldone + +.align 4 +L256: + ld1 {v4.16b},[x0] + mov w1,#7 + mov w12,#14 + st1 {v3.4s},[x2],#16 + +Loop256: + tbl v6.16b,{v4.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v4.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2],#16 + b.eq Ldone + + dup v6.4s,v3.s[3] // just splat + ext v5.16b,v0.16b,v4.16b,#12 + aese v6.16b,v0.16b + + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + + eor v4.16b,v4.16b,v6.16b + b Loop256 + +Ldone: + str w12,[x2] + mov x3,#0 + +Lenc_key_abort: + mov x0,x3 // return value + ldr x29,[sp],#16 + ret + + +.globl _aes_hw_set_decrypt_key +.private_extern _aes_hw_set_decrypt_key + +.align 5 +_aes_hw_set_decrypt_key: + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + bl Lenc_key + + cmp x0,#0 + b.ne Ldec_key_abort + + sub x2,x2,#240 // restore original x2 + mov x4,#-16 + add x0,x2,x12,lsl#4 // end of key schedule + + ld1 {v0.4s},[x2] + ld1 {v1.4s},[x0] + st1 {v0.4s},[x0],x4 + st1 {v1.4s},[x2],#16 + +Loop_imc: + ld1 {v0.4s},[x2] + ld1 {v1.4s},[x0] + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + st1 {v0.4s},[x0],x4 + st1 {v1.4s},[x2],#16 + cmp x0,x2 + b.hi Loop_imc + + ld1 {v0.4s},[x2] + aesimc v0.16b,v0.16b + st1 {v0.4s},[x0] + + eor x0,x0,x0 // return value +Ldec_key_abort: + ldp x29,x30,[sp],#16 + ret + +.globl _aes_hw_encrypt +.private_extern _aes_hw_encrypt + +.align 5 +_aes_hw_encrypt: + ldr w3,[x2,#240] + ld1 {v0.4s},[x2],#16 + ld1 {v2.16b},[x0] + sub w3,w3,#2 + ld1 {v1.4s},[x2],#16 + +Loop_enc: + aese v2.16b,v0.16b + aesmc v2.16b,v2.16b + ld1 {v0.4s},[x2],#16 + subs w3,w3,#2 + aese v2.16b,v1.16b + aesmc v2.16b,v2.16b + ld1 {v1.4s},[x2],#16 + b.gt Loop_enc + + aese v2.16b,v0.16b + aesmc v2.16b,v2.16b + ld1 {v0.4s},[x2] + aese v2.16b,v1.16b + eor v2.16b,v2.16b,v0.16b + + st1 {v2.16b},[x1] + ret + +.globl _aes_hw_decrypt +.private_extern _aes_hw_decrypt + +.align 5 +_aes_hw_decrypt: + ldr w3,[x2,#240] + ld1 {v0.4s},[x2],#16 + ld1 {v2.16b},[x0] + sub w3,w3,#2 + ld1 {v1.4s},[x2],#16 + +Loop_dec: + aesd v2.16b,v0.16b + aesimc v2.16b,v2.16b + ld1 {v0.4s},[x2],#16 + subs w3,w3,#2 + aesd v2.16b,v1.16b + aesimc v2.16b,v2.16b + ld1 {v1.4s},[x2],#16 + b.gt Loop_dec + + aesd v2.16b,v0.16b + aesimc v2.16b,v2.16b + ld1 {v0.4s},[x2] + aesd v2.16b,v1.16b + eor v2.16b,v2.16b,v0.16b + + st1 {v2.16b},[x1] + ret + +.globl _aes_hw_cbc_encrypt +.private_extern _aes_hw_cbc_encrypt + +.align 5 +_aes_hw_cbc_encrypt: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + subs x2,x2,#16 + mov x8,#16 + b.lo Lcbc_abort + csel x8,xzr,x8,eq + + cmp w5,#0 // en- or decrypting? + ldr w5,[x3,#240] + and x2,x2,#-16 + ld1 {v6.16b},[x4] + ld1 {v0.16b},[x0],x8 + + ld1 {v16.4s,v17.4s},[x3] // load key schedule... 
+ sub w5,w5,#6 + add x7,x3,x5,lsl#4 // pointer to last 7 round keys + sub w5,w5,#2 + ld1 {v18.4s,v19.4s},[x7],#32 + ld1 {v20.4s,v21.4s},[x7],#32 + ld1 {v22.4s,v23.4s},[x7],#32 + ld1 {v7.4s},[x7] + + add x7,x3,#32 + mov w6,w5 + b.eq Lcbc_dec + + cmp w5,#2 + eor v0.16b,v0.16b,v6.16b + eor v5.16b,v16.16b,v7.16b + b.eq Lcbc_enc128 + + ld1 {v2.4s,v3.4s},[x7] + add x7,x3,#16 + add x6,x3,#16*4 + add x12,x3,#16*5 + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + add x14,x3,#16*6 + add x3,x3,#16*7 + b Lenter_cbc_enc + +.align 4 +Loop_cbc_enc: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + st1 {v6.16b},[x1],#16 +Lenter_cbc_enc: + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v0.16b,v2.16b + aesmc v0.16b,v0.16b + ld1 {v16.4s},[x6] + cmp w5,#4 + aese v0.16b,v3.16b + aesmc v0.16b,v0.16b + ld1 {v17.4s},[x12] + b.eq Lcbc_enc192 + + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + ld1 {v16.4s},[x14] + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + ld1 {v17.4s},[x3] + nop + +Lcbc_enc192: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + subs x2,x2,#16 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + csel x8,xzr,x8,eq + aese v0.16b,v18.16b + aesmc v0.16b,v0.16b + aese v0.16b,v19.16b + aesmc v0.16b,v0.16b + ld1 {v16.16b},[x0],x8 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + eor v16.16b,v16.16b,v5.16b + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + ld1 {v17.4s},[x7] // re-pre-load rndkey[1] + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + aese v0.16b,v23.16b + eor v6.16b,v0.16b,v7.16b + b.hs Loop_cbc_enc + + st1 {v6.16b},[x1],#16 + b Lcbc_done + +.align 5 +Lcbc_enc128: + ld1 {v2.4s,v3.4s},[x7] + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + b Lenter_cbc_enc128 +Loop_cbc_enc128: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + st1 {v6.16b},[x1],#16 +Lenter_cbc_enc128: + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + subs x2,x2,#16 + aese v0.16b,v2.16b + aesmc v0.16b,v0.16b + csel x8,xzr,x8,eq + aese v0.16b,v3.16b + aesmc v0.16b,v0.16b + aese v0.16b,v18.16b + aesmc v0.16b,v0.16b + aese v0.16b,v19.16b + aesmc v0.16b,v0.16b + ld1 {v16.16b},[x0],x8 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + eor v16.16b,v16.16b,v5.16b + aese v0.16b,v23.16b + eor v6.16b,v0.16b,v7.16b + b.hs Loop_cbc_enc128 + + st1 {v6.16b},[x1],#16 + b Lcbc_done +.align 5 +Lcbc_dec: + ld1 {v18.16b},[x0],#16 + subs x2,x2,#32 // bias + add w6,w5,#2 + orr v3.16b,v0.16b,v0.16b + orr v1.16b,v0.16b,v0.16b + orr v19.16b,v18.16b,v18.16b + b.lo Lcbc_dec_tail + + orr v1.16b,v18.16b,v18.16b + ld1 {v18.16b},[x0],#16 + orr v2.16b,v0.16b,v0.16b + orr v3.16b,v1.16b,v1.16b + orr v19.16b,v18.16b,v18.16b + +Loop3x_cbc_dec: + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b + aesimc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aesd v0.16b,v17.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b + aesimc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 + b.gt Loop3x_cbc_dec + + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b + aesimc v18.16b,v18.16b + eor v4.16b,v6.16b,v7.16b + subs x2,x2,#0x30 + eor v5.16b,v2.16b,v7.16b + csel x6,x2,x6,lo // x6, w6, is zero at this point + aesd v0.16b,v17.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b + aesimc v18.16b,v18.16b + eor v17.16b,v3.16b,v7.16b + add x0,x0,x6 // x0 is adjusted in such way that + // at exit from the loop v1.16b-v18.16b + // are loaded 
with last "words" + orr v6.16b,v19.16b,v19.16b + mov x7,x3 + aesd v0.16b,v20.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v20.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v20.16b + aesimc v18.16b,v18.16b + ld1 {v2.16b},[x0],#16 + aesd v0.16b,v21.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v21.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v21.16b + aesimc v18.16b,v18.16b + ld1 {v3.16b},[x0],#16 + aesd v0.16b,v22.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v22.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v22.16b + aesimc v18.16b,v18.16b + ld1 {v19.16b},[x0],#16 + aesd v0.16b,v23.16b + aesd v1.16b,v23.16b + aesd v18.16b,v23.16b + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + add w6,w5,#2 + eor v4.16b,v4.16b,v0.16b + eor v5.16b,v5.16b,v1.16b + eor v18.16b,v18.16b,v17.16b + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + st1 {v4.16b},[x1],#16 + orr v0.16b,v2.16b,v2.16b + st1 {v5.16b},[x1],#16 + orr v1.16b,v3.16b,v3.16b + st1 {v18.16b},[x1],#16 + orr v18.16b,v19.16b,v19.16b + b.hs Loop3x_cbc_dec + + cmn x2,#0x30 + b.eq Lcbc_done + nop + +Lcbc_dec_tail: + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b + aesimc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b + aesimc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 + b.gt Lcbc_dec_tail + + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b + aesimc v18.16b,v18.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b + aesimc v18.16b,v18.16b + aesd v1.16b,v20.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v20.16b + aesimc v18.16b,v18.16b + cmn x2,#0x20 + aesd v1.16b,v21.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v21.16b + aesimc v18.16b,v18.16b + eor v5.16b,v6.16b,v7.16b + aesd v1.16b,v22.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v22.16b + aesimc v18.16b,v18.16b + eor v17.16b,v3.16b,v7.16b + aesd v1.16b,v23.16b + aesd v18.16b,v23.16b + b.eq Lcbc_dec_one + eor v5.16b,v5.16b,v1.16b + eor v17.16b,v17.16b,v18.16b + orr v6.16b,v19.16b,v19.16b + st1 {v5.16b},[x1],#16 + st1 {v17.16b},[x1],#16 + b Lcbc_done + +Lcbc_dec_one: + eor v5.16b,v5.16b,v18.16b + orr v6.16b,v19.16b,v19.16b + st1 {v5.16b},[x1],#16 + +Lcbc_done: + st1 {v6.16b},[x4] +Lcbc_abort: + ldr x29,[sp],#16 + ret + +.globl _aes_hw_ctr32_encrypt_blocks +.private_extern _aes_hw_ctr32_encrypt_blocks + +.align 5 +_aes_hw_ctr32_encrypt_blocks: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + ldr w5,[x3,#240] + + ldr w8, [x4, #12] + ld1 {v0.4s},[x4] + + ld1 {v16.4s,v17.4s},[x3] // load key schedule... 
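(Editorial note, not part of the generated file.) The CTR path that follows treats only the last 32 bits of the 16-byte IV as a big-endian block counter: w8 is loaded from offset 12, byte-swapped on little-endian hosts, bumped for the extra lanes, and swapped back into the vector registers. A minimal C sketch of that counter contract, assuming a byte-wise increment is an acceptable stand-in for the in-register rev/add/rev sequence:

    /* Hypothetical helper for illustration only: increment the 32-bit
     * big-endian counter in bytes 12..15 of the IV, with wrap-around and
     * no carry into the nonce, which is the contract the ctr32 routine
     * below relies on. */
    #include <stdint.h>

    static void ctr32_increment(uint8_t ivec[16]) {
      for (int i = 15; i >= 12; i--) {
        if (++ivec[i] != 0) {
          break;               /* stop once a byte did not wrap to zero */
        }
      }
    }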
+ sub w5,w5,#4 + mov x12,#16 + cmp x2,#2 + add x7,x3,x5,lsl#4 // pointer to last 5 round keys + sub w5,w5,#2 + ld1 {v20.4s,v21.4s},[x7],#32 + ld1 {v22.4s,v23.4s},[x7],#32 + ld1 {v7.4s},[x7] + add x7,x3,#32 + mov w6,w5 + csel x12,xzr,x12,lo +#ifndef __ARMEB__ + rev w8, w8 +#endif + orr v1.16b,v0.16b,v0.16b + add w10, w8, #1 + orr v18.16b,v0.16b,v0.16b + add w8, w8, #2 + orr v6.16b,v0.16b,v0.16b + rev w10, w10 + mov v1.s[3],w10 + b.ls Lctr32_tail + rev w12, w8 + sub x2,x2,#3 // bias + mov v18.s[3],w12 + b Loop3x_ctr32 + +.align 4 +Loop3x_ctr32: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + aese v18.16b,v17.16b + aesmc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 + b.gt Loop3x_ctr32 + + aese v0.16b,v16.16b + aesmc v4.16b,v0.16b + aese v1.16b,v16.16b + aesmc v5.16b,v1.16b + ld1 {v2.16b},[x0],#16 + orr v0.16b,v6.16b,v6.16b + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + ld1 {v3.16b},[x0],#16 + orr v1.16b,v6.16b,v6.16b + aese v4.16b,v17.16b + aesmc v4.16b,v4.16b + aese v5.16b,v17.16b + aesmc v5.16b,v5.16b + ld1 {v19.16b},[x0],#16 + mov x7,x3 + aese v18.16b,v17.16b + aesmc v17.16b,v18.16b + orr v18.16b,v6.16b,v6.16b + add w9,w8,#1 + aese v4.16b,v20.16b + aesmc v4.16b,v4.16b + aese v5.16b,v20.16b + aesmc v5.16b,v5.16b + eor v2.16b,v2.16b,v7.16b + add w10,w8,#2 + aese v17.16b,v20.16b + aesmc v17.16b,v17.16b + eor v3.16b,v3.16b,v7.16b + add w8,w8,#3 + aese v4.16b,v21.16b + aesmc v4.16b,v4.16b + aese v5.16b,v21.16b + aesmc v5.16b,v5.16b + eor v19.16b,v19.16b,v7.16b + rev w9,w9 + aese v17.16b,v21.16b + aesmc v17.16b,v17.16b + mov v0.s[3], w9 + rev w10,w10 + aese v4.16b,v22.16b + aesmc v4.16b,v4.16b + aese v5.16b,v22.16b + aesmc v5.16b,v5.16b + mov v1.s[3], w10 + rev w12,w8 + aese v17.16b,v22.16b + aesmc v17.16b,v17.16b + mov v18.s[3], w12 + subs x2,x2,#3 + aese v4.16b,v23.16b + aese v5.16b,v23.16b + aese v17.16b,v23.16b + + eor v2.16b,v2.16b,v4.16b + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + st1 {v2.16b},[x1],#16 + eor v3.16b,v3.16b,v5.16b + mov w6,w5 + st1 {v3.16b},[x1],#16 + eor v19.16b,v19.16b,v17.16b + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + st1 {v19.16b},[x1],#16 + b.hs Loop3x_ctr32 + + adds x2,x2,#3 + b.eq Lctr32_done + cmp x2,#1 + mov x12,#16 + csel x12,xzr,x12,eq + +Lctr32_tail: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + ld1 {v17.4s},[x7],#16 + b.gt Lctr32_tail + + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + ld1 {v2.16b},[x0],x12 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + aese v1.16b,v20.16b + aesmc v1.16b,v1.16b + ld1 {v3.16b},[x0] + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + aese v1.16b,v21.16b + aesmc v1.16b,v1.16b + eor v2.16b,v2.16b,v7.16b + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + aese v1.16b,v22.16b + aesmc v1.16b,v1.16b + eor v3.16b,v3.16b,v7.16b + aese v0.16b,v23.16b + aese v1.16b,v23.16b + + cmp x2,#1 + eor v2.16b,v2.16b,v0.16b + eor v3.16b,v3.16b,v1.16b + st1 {v2.16b},[x1],#16 + b.eq Lctr32_done + st1 {v3.16b},[x1] + +Lctr32_done: + ldr x29,[sp],#16 + ret + +#endif +#endif // !OPENSSL_NO_ASM diff --git 
a/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/armv8-mont.S b/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/armv8-mont.S new file mode 100644 index 0000000000..3d83f4d8d6 --- /dev/null +++ b/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/armv8-mont.S @@ -0,0 +1,1420 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text + +.globl _bn_mul_mont +.private_extern _bn_mul_mont + +.align 5 +_bn_mul_mont: + tst x5,#7 + b.eq __bn_sqr8x_mont + tst x5,#3 + b.eq __bn_mul4x_mont +Lmul_mont: + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldr x9,[x2],#8 // bp[0] + sub x22,sp,x5,lsl#3 + ldp x7,x8,[x1],#16 // ap[0..1] + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + and x22,x22,#-16 // ABI says so + ldp x13,x14,[x3],#16 // np[0..1] + + mul x6,x7,x9 // ap[0]*bp[0] + sub x21,x5,#16 // j=num-2 + umulh x7,x7,x9 + mul x10,x8,x9 // ap[1]*bp[0] + umulh x11,x8,x9 + + mul x15,x6,x4 // "tp[0]"*n0 + mov sp,x22 // alloca + + // (*) mul x12,x13,x15 // np[0]*m1 + umulh x13,x13,x15 + mul x16,x14,x15 // np[1]*m1 + // (*) adds x12,x12,x6 // discarded + // (*) As for removal of first multiplication and addition + // instructions. The outcome of first addition is + // guaranteed to be zero, which leaves two computationally + // significant outcomes: it either carries or not. Then + // question is when does it carry? Is there alternative + // way to deduce it? If you follow operations, you can + // observe that condition for carry is quite simple: + // x6 being non-zero. So that carry can be calculated + // by adding -1 to x6. That's what next instruction does. 
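(Editorial note, not part of the generated file.) The carry shortcut described in the comment above can be stated in a couple of lines of C; tp0 stands for the value held in x6, and the point is that the discarded first addition's only observable effect is a carry equal to tp0 != 0:

    /* Sketch of the first-column carry trick: lo(np[0]*m1) + tp0 is known
     * to wrap to zero, so its carry-out is 1 exactly when tp0 is non-zero.
     * subs xzr,x6,#1 produces the same flag without the multiplication. */
    #include <stdint.h>

    static uint64_t first_column_carry(uint64_t tp0) {
      return tp0 != 0;   /* carry-out of the discarded adds x12,x12,x6 */
    }

The adc that follows then folds this flag into the high half, which is all the discarded addition would have contributed.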
+ subs xzr,x6,#1 // (*) + umulh x17,x14,x15 + adc x13,x13,xzr + cbz x21,L1st_skip + +L1st: + ldr x8,[x1],#8 + adds x6,x10,x7 + sub x21,x21,#8 // j-- + adc x7,x11,xzr + + ldr x14,[x3],#8 + adds x12,x16,x13 + mul x10,x8,x9 // ap[j]*bp[0] + adc x13,x17,xzr + umulh x11,x8,x9 + + adds x12,x12,x6 + mul x16,x14,x15 // np[j]*m1 + adc x13,x13,xzr + umulh x17,x14,x15 + str x12,[x22],#8 // tp[j-1] + cbnz x21,L1st + +L1st_skip: + adds x6,x10,x7 + sub x1,x1,x5 // rewind x1 + adc x7,x11,xzr + + adds x12,x16,x13 + sub x3,x3,x5 // rewind x3 + adc x13,x17,xzr + + adds x12,x12,x6 + sub x20,x5,#8 // i=num-1 + adcs x13,x13,x7 + + adc x19,xzr,xzr // upmost overflow bit + stp x12,x13,[x22] + +Louter: + ldr x9,[x2],#8 // bp[i] + ldp x7,x8,[x1],#16 + ldr x23,[sp] // tp[0] + add x22,sp,#8 + + mul x6,x7,x9 // ap[0]*bp[i] + sub x21,x5,#16 // j=num-2 + umulh x7,x7,x9 + ldp x13,x14,[x3],#16 + mul x10,x8,x9 // ap[1]*bp[i] + adds x6,x6,x23 + umulh x11,x8,x9 + adc x7,x7,xzr + + mul x15,x6,x4 + sub x20,x20,#8 // i-- + + // (*) mul x12,x13,x15 // np[0]*m1 + umulh x13,x13,x15 + mul x16,x14,x15 // np[1]*m1 + // (*) adds x12,x12,x6 + subs xzr,x6,#1 // (*) + umulh x17,x14,x15 + cbz x21,Linner_skip + +Linner: + ldr x8,[x1],#8 + adc x13,x13,xzr + ldr x23,[x22],#8 // tp[j] + adds x6,x10,x7 + sub x21,x21,#8 // j-- + adc x7,x11,xzr + + adds x12,x16,x13 + ldr x14,[x3],#8 + adc x13,x17,xzr + + mul x10,x8,x9 // ap[j]*bp[i] + adds x6,x6,x23 + umulh x11,x8,x9 + adc x7,x7,xzr + + mul x16,x14,x15 // np[j]*m1 + adds x12,x12,x6 + umulh x17,x14,x15 + str x12,[x22,#-16] // tp[j-1] + cbnz x21,Linner + +Linner_skip: + ldr x23,[x22],#8 // tp[j] + adc x13,x13,xzr + adds x6,x10,x7 + sub x1,x1,x5 // rewind x1 + adc x7,x11,xzr + + adds x12,x16,x13 + sub x3,x3,x5 // rewind x3 + adcs x13,x17,x19 + adc x19,xzr,xzr + + adds x6,x6,x23 + adc x7,x7,xzr + + adds x12,x12,x6 + adcs x13,x13,x7 + adc x19,x19,xzr // upmost overflow bit + stp x12,x13,[x22,#-16] + + cbnz x20,Louter + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + ldr x23,[sp] // tp[0] + add x22,sp,#8 + ldr x14,[x3],#8 // np[0] + subs x21,x5,#8 // j=num-1 and clear borrow + mov x1,x0 +Lsub: + sbcs x8,x23,x14 // tp[j]-np[j] + ldr x23,[x22],#8 + sub x21,x21,#8 // j-- + ldr x14,[x3],#8 + str x8,[x1],#8 // rp[j]=tp[j]-np[j] + cbnz x21,Lsub + + sbcs x8,x23,x14 + sbcs x19,x19,xzr // did it borrow? + str x8,[x1],#8 // rp[num-1] + + ldr x23,[sp] // tp[0] + add x22,sp,#8 + ldr x8,[x0],#8 // rp[0] + sub x5,x5,#8 // num-- + nop +Lcond_copy: + sub x5,x5,#8 // num-- + csel x14,x23,x8,lo // did it borrow? + ldr x23,[x22],#8 + ldr x8,[x0],#8 + str xzr,[x22,#-16] // wipe tp + str x14,[x0,#-16] + cbnz x5,Lcond_copy + + csel x14,x23,x8,lo + str xzr,[x22,#-8] // wipe tp + str x14,[x0,#-8] + + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + ret + + +.align 5 +__bn_sqr8x_mont: + cmp x1,x2 + b.ne __bn_mul4x_mont +Lsqr8x_mont: + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x0,x3,[sp,#96] // offload rp and np + + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + ldp x10,x11,[x1,#8*4] + ldp x12,x13,[x1,#8*6] + + sub x2,sp,x5,lsl#4 + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + mov sp,x2 // alloca + sub x27,x5,#8*8 + b Lsqr8x_zero_start + +Lsqr8x_zero: + sub x27,x27,#8*8 + stp xzr,xzr,[x2,#8*0] + stp xzr,xzr,[x2,#8*2] + stp xzr,xzr,[x2,#8*4] + stp xzr,xzr,[x2,#8*6] +Lsqr8x_zero_start: + stp xzr,xzr,[x2,#8*8] + stp xzr,xzr,[x2,#8*10] + stp xzr,xzr,[x2,#8*12] + stp xzr,xzr,[x2,#8*14] + add x2,x2,#8*16 + cbnz x27,Lsqr8x_zero + + add x3,x1,x5 + add x1,x1,#8*8 + mov x19,xzr + mov x20,xzr + mov x21,xzr + mov x22,xzr + mov x23,xzr + mov x24,xzr + mov x25,xzr + mov x26,xzr + mov x2,sp + str x4,[x29,#112] // offload n0 + + // Multiply everything but a[i]*a[i] +.align 4 +Lsqr8x_outer_loop: + // a[1]a[0] (i) + // a[2]a[0] + // a[3]a[0] + // a[4]a[0] + // a[5]a[0] + // a[6]a[0] + // a[7]a[0] + // a[2]a[1] (ii) + // a[3]a[1] + // a[4]a[1] + // a[5]a[1] + // a[6]a[1] + // a[7]a[1] + // a[3]a[2] (iii) + // a[4]a[2] + // a[5]a[2] + // a[6]a[2] + // a[7]a[2] + // a[4]a[3] (iv) + // a[5]a[3] + // a[6]a[3] + // a[7]a[3] + // a[5]a[4] (v) + // a[6]a[4] + // a[7]a[4] + // a[6]a[5] (vi) + // a[7]a[5] + // a[7]a[6] (vii) + + mul x14,x7,x6 // lo(a[1..7]*a[0]) (i) + mul x15,x8,x6 + mul x16,x9,x6 + mul x17,x10,x6 + adds x20,x20,x14 // t[1]+lo(a[1]*a[0]) + mul x14,x11,x6 + adcs x21,x21,x15 + mul x15,x12,x6 + adcs x22,x22,x16 + mul x16,x13,x6 + adcs x23,x23,x17 + umulh x17,x7,x6 // hi(a[1..7]*a[0]) + adcs x24,x24,x14 + umulh x14,x8,x6 + adcs x25,x25,x15 + umulh x15,x9,x6 + adcs x26,x26,x16 + umulh x16,x10,x6 + stp x19,x20,[x2],#8*2 // t[0..1] + adc x19,xzr,xzr // t[8] + adds x21,x21,x17 // t[2]+lo(a[1]*a[0]) + umulh x17,x11,x6 + adcs x22,x22,x14 + umulh x14,x12,x6 + adcs x23,x23,x15 + umulh x15,x13,x6 + adcs x24,x24,x16 + mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii) + adcs x25,x25,x17 + mul x17,x9,x7 + adcs x26,x26,x14 + mul x14,x10,x7 + adc x19,x19,x15 + + mul x15,x11,x7 + adds x22,x22,x16 + mul x16,x12,x7 + adcs x23,x23,x17 + mul x17,x13,x7 + adcs x24,x24,x14 + umulh x14,x8,x7 // hi(a[2..7]*a[1]) + adcs x25,x25,x15 + umulh x15,x9,x7 + adcs x26,x26,x16 + umulh x16,x10,x7 + adcs x19,x19,x17 + umulh x17,x11,x7 + stp x21,x22,[x2],#8*2 // t[2..3] + adc x20,xzr,xzr // t[9] + adds x23,x23,x14 + umulh x14,x12,x7 + adcs x24,x24,x15 + umulh x15,x13,x7 + adcs x25,x25,x16 + mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii) + adcs x26,x26,x17 + mul x17,x10,x8 + adcs x19,x19,x14 + mul x14,x11,x8 + adc x20,x20,x15 + + mul x15,x12,x8 + adds x24,x24,x16 + mul x16,x13,x8 + adcs x25,x25,x17 + umulh x17,x9,x8 // hi(a[3..7]*a[2]) + adcs x26,x26,x14 + umulh x14,x10,x8 + adcs x19,x19,x15 + umulh x15,x11,x8 + adcs x20,x20,x16 + umulh x16,x12,x8 + stp x23,x24,[x2],#8*2 // t[4..5] + adc x21,xzr,xzr // t[10] + adds x25,x25,x17 + umulh x17,x13,x8 + adcs x26,x26,x14 + mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv) + adcs x19,x19,x15 + mul x15,x11,x9 + adcs x20,x20,x16 + mul x16,x12,x9 + adc x21,x21,x17 + + mul x17,x13,x9 + adds x26,x26,x14 + umulh x14,x10,x9 // hi(a[4..7]*a[3]) + adcs x19,x19,x15 + umulh x15,x11,x9 + adcs x20,x20,x16 + umulh x16,x12,x9 + adcs x21,x21,x17 + umulh x17,x13,x9 + stp x25,x26,[x2],#8*2 // t[6..7] + adc x22,xzr,xzr // t[11] + adds x19,x19,x14 + mul x14,x11,x10 // lo(a[5..7]*a[4]) (v) + adcs x20,x20,x15 + mul x15,x12,x10 + adcs x21,x21,x16 + mul x16,x13,x10 + adc x22,x22,x17 + + umulh 
x17,x11,x10 // hi(a[5..7]*a[4]) + adds x20,x20,x14 + umulh x14,x12,x10 + adcs x21,x21,x15 + umulh x15,x13,x10 + adcs x22,x22,x16 + mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi) + adc x23,xzr,xzr // t[12] + adds x21,x21,x17 + mul x17,x13,x11 + adcs x22,x22,x14 + umulh x14,x12,x11 // hi(a[6..7]*a[5]) + adc x23,x23,x15 + + umulh x15,x13,x11 + adds x22,x22,x16 + mul x16,x13,x12 // lo(a[7]*a[6]) (vii) + adcs x23,x23,x17 + umulh x17,x13,x12 // hi(a[7]*a[6]) + adc x24,xzr,xzr // t[13] + adds x23,x23,x14 + sub x27,x3,x1 // done yet? + adc x24,x24,x15 + + adds x24,x24,x16 + sub x14,x3,x5 // rewinded ap + adc x25,xzr,xzr // t[14] + add x25,x25,x17 + + cbz x27,Lsqr8x_outer_break + + mov x4,x6 + ldp x6,x7,[x2,#8*0] + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + adds x19,x19,x6 + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x0,x1 + adcs x26,xzr,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved below + mov x27,#-8*8 + + // a[8]a[0] + // a[9]a[0] + // a[a]a[0] + // a[b]a[0] + // a[c]a[0] + // a[d]a[0] + // a[e]a[0] + // a[f]a[0] + // a[8]a[1] + // a[f]a[1]........................ + // a[8]a[2] + // a[f]a[2]........................ + // a[8]a[3] + // a[f]a[3]........................ + // a[8]a[4] + // a[f]a[4]........................ + // a[8]a[5] + // a[f]a[5]........................ + // a[8]a[6] + // a[f]a[6]........................ + // a[8]a[7] + // a[f]a[7]........................ +Lsqr8x_mul: + mul x14,x6,x4 + adc x28,xzr,xzr // carry bit, modulo-scheduled + mul x15,x7,x4 + add x27,x27,#8 + mul x16,x8,x4 + mul x17,x9,x4 + adds x19,x19,x14 + mul x14,x10,x4 + adcs x20,x20,x15 + mul x15,x11,x4 + adcs x21,x21,x16 + mul x16,x12,x4 + adcs x22,x22,x17 + mul x17,x13,x4 + adcs x23,x23,x14 + umulh x14,x6,x4 + adcs x24,x24,x15 + umulh x15,x7,x4 + adcs x25,x25,x16 + umulh x16,x8,x4 + adcs x26,x26,x17 + umulh x17,x9,x4 + adc x28,x28,xzr + str x19,[x2],#8 + adds x19,x20,x14 + umulh x14,x10,x4 + adcs x20,x21,x15 + umulh x15,x11,x4 + adcs x21,x22,x16 + umulh x16,x12,x4 + adcs x22,x23,x17 + umulh x17,x13,x4 + ldr x4,[x0,x27] + adcs x23,x24,x14 + adcs x24,x25,x15 + adcs x25,x26,x16 + adcs x26,x28,x17 + //adc x28,xzr,xzr // moved above + cbnz x27,Lsqr8x_mul + // note that carry flag is guaranteed + // to be zero at this point + cmp x1,x3 // done yet? + b.eq Lsqr8x_break + + ldp x6,x7,[x2,#8*0] + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + adds x19,x19,x6 + ldr x4,[x0,#-8*8] + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x27,#-8*8 + adcs x26,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved above + b Lsqr8x_mul + +.align 4 +Lsqr8x_break: + ldp x6,x7,[x0,#8*0] + add x1,x0,#8*8 + ldp x8,x9,[x0,#8*2] + sub x14,x3,x1 // is it last iteration? 
+ ldp x10,x11,[x0,#8*4] + sub x15,x2,x14 + ldp x12,x13,[x0,#8*6] + cbz x14,Lsqr8x_outer_loop + + stp x19,x20,[x2,#8*0] + ldp x19,x20,[x15,#8*0] + stp x21,x22,[x2,#8*2] + ldp x21,x22,[x15,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[x15,#8*4] + stp x25,x26,[x2,#8*6] + mov x2,x15 + ldp x25,x26,[x15,#8*6] + b Lsqr8x_outer_loop + +.align 4 +Lsqr8x_outer_break: + // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] + ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0] + ldp x15,x16,[sp,#8*1] + ldp x11,x13,[x14,#8*2] + add x1,x14,#8*4 + ldp x17,x14,[sp,#8*3] + + stp x19,x20,[x2,#8*0] + mul x19,x7,x7 + stp x21,x22,[x2,#8*2] + umulh x7,x7,x7 + stp x23,x24,[x2,#8*4] + mul x8,x9,x9 + stp x25,x26,[x2,#8*6] + mov x2,sp + umulh x9,x9,x9 + adds x20,x7,x15,lsl#1 + extr x15,x16,x15,#63 + sub x27,x5,#8*4 + +Lsqr4x_shift_n_add: + adcs x21,x8,x15 + extr x16,x17,x16,#63 + sub x27,x27,#8*4 + adcs x22,x9,x16 + ldp x15,x16,[x2,#8*5] + mul x10,x11,x11 + ldp x7,x9,[x1],#8*2 + umulh x11,x11,x11 + mul x12,x13,x13 + umulh x13,x13,x13 + extr x17,x14,x17,#63 + stp x19,x20,[x2,#8*0] + adcs x23,x10,x17 + extr x14,x15,x14,#63 + stp x21,x22,[x2,#8*2] + adcs x24,x11,x14 + ldp x17,x14,[x2,#8*7] + extr x15,x16,x15,#63 + adcs x25,x12,x15 + extr x16,x17,x16,#63 + adcs x26,x13,x16 + ldp x15,x16,[x2,#8*9] + mul x6,x7,x7 + ldp x11,x13,[x1],#8*2 + umulh x7,x7,x7 + mul x8,x9,x9 + umulh x9,x9,x9 + stp x23,x24,[x2,#8*4] + extr x17,x14,x17,#63 + stp x25,x26,[x2,#8*6] + add x2,x2,#8*8 + adcs x19,x6,x17 + extr x14,x15,x14,#63 + adcs x20,x7,x14 + ldp x17,x14,[x2,#8*3] + extr x15,x16,x15,#63 + cbnz x27,Lsqr4x_shift_n_add + ldp x1,x4,[x29,#104] // pull np and n0 + + adcs x21,x8,x15 + extr x16,x17,x16,#63 + adcs x22,x9,x16 + ldp x15,x16,[x2,#8*5] + mul x10,x11,x11 + umulh x11,x11,x11 + stp x19,x20,[x2,#8*0] + mul x12,x13,x13 + umulh x13,x13,x13 + stp x21,x22,[x2,#8*2] + extr x17,x14,x17,#63 + adcs x23,x10,x17 + extr x14,x15,x14,#63 + ldp x19,x20,[sp,#8*0] + adcs x24,x11,x14 + extr x15,x16,x15,#63 + ldp x6,x7,[x1,#8*0] + adcs x25,x12,x15 + extr x16,xzr,x16,#63 + ldp x8,x9,[x1,#8*2] + adc x26,x13,x16 + ldp x10,x11,[x1,#8*4] + + // Reduce by 512 bits per iteration + mul x28,x4,x19 // t[0]*n0 + ldp x12,x13,[x1,#8*6] + add x3,x1,x5 + ldp x21,x22,[sp,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[sp,#8*4] + stp x25,x26,[x2,#8*6] + ldp x25,x26,[sp,#8*6] + add x1,x1,#8*8 + mov x30,xzr // initial top-most carry + mov x2,sp + mov x27,#8 + +Lsqr8x_reduction: + // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0) + mul x15,x7,x28 + sub x27,x27,#1 + mul x16,x8,x28 + str x28,[x2],#8 // put aside t[0]*n0 for tail processing + mul x17,x9,x28 + // (*) adds xzr,x19,x14 + subs xzr,x19,#1 // (*) + mul x14,x10,x28 + adcs x19,x20,x15 + mul x15,x11,x28 + adcs x20,x21,x16 + mul x16,x12,x28 + adcs x21,x22,x17 + mul x17,x13,x28 + adcs x22,x23,x14 + umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0) + adcs x23,x24,x15 + umulh x15,x7,x28 + adcs x24,x25,x16 + umulh x16,x8,x28 + adcs x25,x26,x17 + umulh x17,x9,x28 + adc x26,xzr,xzr + adds x19,x19,x14 + umulh x14,x10,x28 + adcs x20,x20,x15 + umulh x15,x11,x28 + adcs x21,x21,x16 + umulh x16,x12,x28 + adcs x22,x22,x17 + umulh x17,x13,x28 + mul x28,x4,x19 // next t[0]*n0 + adcs x23,x23,x14 + adcs x24,x24,x15 + adcs x25,x25,x16 + adc x26,x26,x17 + cbnz x27,Lsqr8x_reduction + + ldp x14,x15,[x2,#8*0] + ldp x16,x17,[x2,#8*2] + mov x0,x2 + sub x27,x3,x1 // done yet? 
+ adds x19,x19,x14 + adcs x20,x20,x15 + ldp x14,x15,[x2,#8*4] + adcs x21,x21,x16 + adcs x22,x22,x17 + ldp x16,x17,[x2,#8*6] + adcs x23,x23,x14 + adcs x24,x24,x15 + adcs x25,x25,x16 + adcs x26,x26,x17 + //adc x28,xzr,xzr // moved below + cbz x27,Lsqr8x8_post_condition + + ldr x4,[x2,#-8*8] + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + ldp x10,x11,[x1,#8*4] + mov x27,#-8*8 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + +Lsqr8x_tail: + mul x14,x6,x4 + adc x28,xzr,xzr // carry bit, modulo-scheduled + mul x15,x7,x4 + add x27,x27,#8 + mul x16,x8,x4 + mul x17,x9,x4 + adds x19,x19,x14 + mul x14,x10,x4 + adcs x20,x20,x15 + mul x15,x11,x4 + adcs x21,x21,x16 + mul x16,x12,x4 + adcs x22,x22,x17 + mul x17,x13,x4 + adcs x23,x23,x14 + umulh x14,x6,x4 + adcs x24,x24,x15 + umulh x15,x7,x4 + adcs x25,x25,x16 + umulh x16,x8,x4 + adcs x26,x26,x17 + umulh x17,x9,x4 + adc x28,x28,xzr + str x19,[x2],#8 + adds x19,x20,x14 + umulh x14,x10,x4 + adcs x20,x21,x15 + umulh x15,x11,x4 + adcs x21,x22,x16 + umulh x16,x12,x4 + adcs x22,x23,x17 + umulh x17,x13,x4 + ldr x4,[x0,x27] + adcs x23,x24,x14 + adcs x24,x25,x15 + adcs x25,x26,x16 + adcs x26,x28,x17 + //adc x28,xzr,xzr // moved above + cbnz x27,Lsqr8x_tail + // note that carry flag is guaranteed + // to be zero at this point + ldp x6,x7,[x2,#8*0] + sub x27,x3,x1 // done yet? + sub x16,x3,x5 // rewinded np + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + cbz x27,Lsqr8x_tail_break + + ldr x4,[x0,#-8*8] + adds x19,x19,x6 + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x27,#-8*8 + adcs x26,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved above + b Lsqr8x_tail + +.align 4 +Lsqr8x_tail_break: + ldr x4,[x29,#112] // pull n0 + add x27,x2,#8*8 // end of current t[num] window + + subs xzr,x30,#1 // "move" top-most carry to carry bit + adcs x14,x19,x6 + adcs x15,x20,x7 + ldp x19,x20,[x0,#8*0] + adcs x21,x21,x8 + ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0] + adcs x22,x22,x9 + ldp x8,x9,[x16,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x16,#8*4] + adcs x25,x25,x12 + adcs x26,x26,x13 + ldp x12,x13,[x16,#8*6] + add x1,x16,#8*8 + adc x30,xzr,xzr // top-most carry + mul x28,x4,x19 + stp x14,x15,[x2,#8*0] + stp x21,x22,[x2,#8*2] + ldp x21,x22,[x0,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[x0,#8*4] + cmp x27,x29 // did we hit the bottom? + stp x25,x26,[x2,#8*6] + mov x2,x0 // slide the window + ldp x25,x26,[x0,#8*6] + mov x27,#8 + b.ne Lsqr8x_reduction + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. 
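(Editorial note, not part of the generated file.) Since this "subtract, check the borrow, conditionally copy" pattern closes every Montgomery routine in this file, here is a hedged, branch-free C model of it; it is an illustration only and omits the top-most carry limb that the assembly also folds into the borrow decision:

    /* Sketch of the final conditional subtraction: compute t - n, record
     * whether it borrowed, then select the result with a mask rather than
     * a data-dependent branch.  rp must not alias tp. */
    #include <stddef.h>
    #include <stdint.h>

    static void mont_final_sub(uint64_t *rp, const uint64_t *tp,
                               const uint64_t *np, size_t num) {
      uint64_t borrow = 0;
      for (size_t i = 0; i < num; i++) {
        uint64_t d = tp[i] - np[i] - borrow;
        borrow = (tp[i] < np[i]) | ((tp[i] - np[i]) < borrow);
        rp[i] = d;                        /* rp = t - n, limb by limb */
      }
      uint64_t keep_t = 0 - borrow;       /* all-ones iff t < n */
      for (size_t i = 0; i < num; i++) {  /* conditional copy, no branch */
        rp[i] = (tp[i] & keep_t) | (rp[i] & ~keep_t);
      }
    }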
+ ldr x0,[x29,#96] // pull rp + add x2,x2,#8*8 + subs x14,x19,x6 + sbcs x15,x20,x7 + sub x27,x5,#8*8 + mov x3,x0 // x0 copy + +Lsqr8x_sub: + sbcs x16,x21,x8 + ldp x6,x7,[x1,#8*0] + sbcs x17,x22,x9 + stp x14,x15,[x0,#8*0] + sbcs x14,x23,x10 + ldp x8,x9,[x1,#8*2] + sbcs x15,x24,x11 + stp x16,x17,[x0,#8*2] + sbcs x16,x25,x12 + ldp x10,x11,[x1,#8*4] + sbcs x17,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + ldp x19,x20,[x2,#8*0] + sub x27,x27,#8*8 + ldp x21,x22,[x2,#8*2] + ldp x23,x24,[x2,#8*4] + ldp x25,x26,[x2,#8*6] + add x2,x2,#8*8 + stp x14,x15,[x0,#8*4] + sbcs x14,x19,x6 + stp x16,x17,[x0,#8*6] + add x0,x0,#8*8 + sbcs x15,x20,x7 + cbnz x27,Lsqr8x_sub + + sbcs x16,x21,x8 + mov x2,sp + add x1,sp,x5 + ldp x6,x7,[x3,#8*0] + sbcs x17,x22,x9 + stp x14,x15,[x0,#8*0] + sbcs x14,x23,x10 + ldp x8,x9,[x3,#8*2] + sbcs x15,x24,x11 + stp x16,x17,[x0,#8*2] + sbcs x16,x25,x12 + ldp x19,x20,[x1,#8*0] + sbcs x17,x26,x13 + ldp x21,x22,[x1,#8*2] + sbcs xzr,x30,xzr // did it borrow? + ldr x30,[x29,#8] // pull return address + stp x14,x15,[x0,#8*4] + stp x16,x17,[x0,#8*6] + + sub x27,x5,#8*4 +Lsqr4x_cond_copy: + sub x27,x27,#8*4 + csel x14,x19,x6,lo + stp xzr,xzr,[x2,#8*0] + csel x15,x20,x7,lo + ldp x6,x7,[x3,#8*4] + ldp x19,x20,[x1,#8*4] + csel x16,x21,x8,lo + stp xzr,xzr,[x2,#8*2] + add x2,x2,#8*4 + csel x17,x22,x9,lo + ldp x8,x9,[x3,#8*6] + ldp x21,x22,[x1,#8*6] + add x1,x1,#8*4 + stp x14,x15,[x3,#8*0] + stp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + stp xzr,xzr,[x1,#8*0] + stp xzr,xzr,[x1,#8*2] + cbnz x27,Lsqr4x_cond_copy + + csel x14,x19,x6,lo + stp xzr,xzr,[x2,#8*0] + csel x15,x20,x7,lo + stp xzr,xzr,[x2,#8*2] + csel x16,x21,x8,lo + csel x17,x22,x9,lo + stp x14,x15,[x3,#8*0] + stp x16,x17,[x3,#8*2] + + b Lsqr8x_done + +.align 4 +Lsqr8x8_post_condition: + adc x28,xzr,xzr + ldr x30,[x29,#8] // pull return address + // x19-7,x28 hold result, x6-7 hold modulus + subs x6,x19,x6 + ldr x1,[x29,#96] // pull rp + sbcs x7,x20,x7 + stp xzr,xzr,[sp,#8*0] + sbcs x8,x21,x8 + stp xzr,xzr,[sp,#8*2] + sbcs x9,x22,x9 + stp xzr,xzr,[sp,#8*4] + sbcs x10,x23,x10 + stp xzr,xzr,[sp,#8*6] + sbcs x11,x24,x11 + stp xzr,xzr,[sp,#8*8] + sbcs x12,x25,x12 + stp xzr,xzr,[sp,#8*10] + sbcs x13,x26,x13 + stp xzr,xzr,[sp,#8*12] + sbcs x28,x28,xzr // did it borrow? + stp xzr,xzr,[sp,#8*14] + + // x6-7 hold result-modulus + csel x6,x19,x6,lo + csel x7,x20,x7,lo + csel x8,x21,x8,lo + csel x9,x22,x9,lo + stp x6,x7,[x1,#8*0] + csel x10,x23,x10,lo + csel x11,x24,x11,lo + stp x8,x9,[x1,#8*2] + csel x12,x25,x12,lo + csel x13,x26,x13,lo + stp x10,x11,[x1,#8*4] + stp x12,x13,[x1,#8*6] + +Lsqr8x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + ret + + +.align 5 +__bn_mul4x_mont: + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + sub x26,sp,x5,lsl#3 + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + sub sp,x26,#8*4 // alloca + + add x10,x2,x5 + add x27,x1,x5 + stp x0,x10,[x29,#96] // offload rp and &b[num] + + ldr x24,[x2,#8*0] // b[0] + ldp x6,x7,[x1,#8*0] // a[0..3] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + mov x19,xzr + mov x20,xzr + mov x21,xzr + mov x22,xzr + ldp x14,x15,[x3,#8*0] // n[0..3] + ldp x16,x17,[x3,#8*2] + adds x3,x3,#8*4 // clear carry bit + mov x0,xzr + mov x28,#0 + mov x26,sp + +Loop_mul4x_1st_reduction: + mul x10,x6,x24 // lo(a[0..3]*b[0]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[0..3]*b[0]) + adcs x20,x20,x11 + mul x25,x19,x4 // t[0]*n0 + adcs x21,x21,x12 + umulh x11,x7,x24 + adcs x22,x22,x13 + umulh x12,x8,x24 + adc x23,xzr,xzr + umulh x13,x9,x24 + ldr x24,[x2,x28] // next b[i] (or b[0]) + adds x20,x20,x10 + // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0) + str x25,[x26],#8 // put aside t[0]*n0 for tail processing + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + // (*) adds xzr,x19,x10 + subs xzr,x19,#1 // (*) + umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0) + adcs x19,x20,x11 + umulh x11,x15,x25 + adcs x20,x21,x12 + umulh x12,x16,x25 + adcs x21,x22,x13 + umulh x13,x17,x25 + adcs x22,x23,x0 + adc x0,xzr,xzr + adds x19,x19,x10 + sub x10,x27,x1 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_1st_reduction + + cbz x10,Lmul4x4_post_condition + + ldp x6,x7,[x1,#8*0] // a[4..7] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + ldr x25,[sp] // a[0]*n0 + ldp x14,x15,[x3,#8*0] // n[4..7] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + +Loop_mul4x_1st_tail: + mul x10,x6,x24 // lo(a[4..7]*b[i]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[4..7]*b[i]) + adcs x20,x20,x11 + umulh x11,x7,x24 + adcs x21,x21,x12 + umulh x12,x8,x24 + adcs x22,x22,x13 + umulh x13,x9,x24 + adc x23,xzr,xzr + ldr x24,[x2,x28] // next b[i] (or b[0]) + adds x20,x20,x10 + mul x10,x14,x25 // lo(n[4..7]*a[0]*n0) + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + adds x19,x19,x10 + umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0) + adcs x20,x20,x11 + umulh x11,x15,x25 + adcs x21,x21,x12 + umulh x12,x16,x25 + adcs x22,x22,x13 + adcs x23,x23,x0 + umulh x13,x17,x25 + adc x0,xzr,xzr + ldr x25,[sp,x28] // next t[0]*n0 + str x19,[x26],#8 // result!!! + adds x19,x20,x10 + sub x10,x27,x1 // done yet? + adcs x20,x21,x11 + adcs x21,x22,x12 + adcs x22,x23,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_1st_tail + + sub x11,x27,x5 // rewinded x1 + cbz x10,Lmul4x_proceed + + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + ldp x14,x15,[x3,#8*0] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + b Loop_mul4x_1st_tail + +.align 5 +Lmul4x_proceed: + ldr x24,[x2,#8*4]! // *++b + adc x30,x0,xzr + ldp x6,x7,[x11,#8*0] // a[0..3] + sub x3,x3,x5 // rewind np + ldp x8,x9,[x11,#8*2] + add x1,x11,#8*4 + + stp x19,x20,[x26,#8*0] // result!!! + ldp x19,x20,[sp,#8*4] // t[0..3] + stp x21,x22,[x26,#8*2] // result!!! 
+ ldp x21,x22,[sp,#8*6] + + ldp x14,x15,[x3,#8*0] // n[0..3] + mov x26,sp + ldp x16,x17,[x3,#8*2] + adds x3,x3,#8*4 // clear carry bit + mov x0,xzr + +.align 4 +Loop_mul4x_reduction: + mul x10,x6,x24 // lo(a[0..3]*b[4]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[0..3]*b[4]) + adcs x20,x20,x11 + mul x25,x19,x4 // t[0]*n0 + adcs x21,x21,x12 + umulh x11,x7,x24 + adcs x22,x22,x13 + umulh x12,x8,x24 + adc x23,xzr,xzr + umulh x13,x9,x24 + ldr x24,[x2,x28] // next b[i] + adds x20,x20,x10 + // (*) mul x10,x14,x25 + str x25,[x26],#8 // put aside t[0]*n0 for tail processing + adcs x21,x21,x11 + mul x11,x15,x25 // lo(n[0..3]*t[0]*n0 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + // (*) adds xzr,x19,x10 + subs xzr,x19,#1 // (*) + umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0 + adcs x19,x20,x11 + umulh x11,x15,x25 + adcs x20,x21,x12 + umulh x12,x16,x25 + adcs x21,x22,x13 + umulh x13,x17,x25 + adcs x22,x23,x0 + adc x0,xzr,xzr + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_reduction + + adc x0,x0,xzr + ldp x10,x11,[x26,#8*4] // t[4..7] + ldp x12,x13,[x26,#8*6] + ldp x6,x7,[x1,#8*0] // a[4..7] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + + ldr x25,[sp] // t[0]*n0 + ldp x14,x15,[x3,#8*0] // n[4..7] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + +.align 4 +Loop_mul4x_tail: + mul x10,x6,x24 // lo(a[4..7]*b[4]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[4..7]*b[4]) + adcs x20,x20,x11 + umulh x11,x7,x24 + adcs x21,x21,x12 + umulh x12,x8,x24 + adcs x22,x22,x13 + umulh x13,x9,x24 + adc x23,xzr,xzr + ldr x24,[x2,x28] // next b[i] + adds x20,x20,x10 + mul x10,x14,x25 // lo(n[4..7]*t[0]*n0) + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + adds x19,x19,x10 + umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0) + adcs x20,x20,x11 + umulh x11,x15,x25 + adcs x21,x21,x12 + umulh x12,x16,x25 + adcs x22,x22,x13 + umulh x13,x17,x25 + adcs x23,x23,x0 + ldr x25,[sp,x28] // next a[0]*n0 + adc x0,xzr,xzr + str x19,[x26],#8 // result!!! + adds x19,x20,x10 + sub x10,x27,x1 // done yet? + adcs x20,x21,x11 + adcs x21,x22,x12 + adcs x22,x23,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_tail + + sub x11,x3,x5 // rewinded np? + adc x0,x0,xzr + cbz x10,Loop_mul4x_break + + ldp x10,x11,[x26,#8*4] + ldp x12,x13,[x26,#8*6] + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + ldp x14,x15,[x3,#8*0] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + b Loop_mul4x_tail + +.align 4 +Loop_mul4x_break: + ldp x12,x13,[x29,#96] // pull rp and &b[num] + adds x19,x19,x30 + add x2,x2,#8*4 // bp++ + adcs x20,x20,xzr + sub x1,x1,x5 // rewind ap + adcs x21,x21,xzr + stp x19,x20,[x26,#8*0] // result!!! + adcs x22,x22,xzr + ldp x19,x20,[sp,#8*4] // t[0..3] + adc x30,x0,xzr + stp x21,x22,[x26,#8*2] // result!!! + cmp x2,x13 // done yet? 
+ ldp x21,x22,[sp,#8*6] + ldp x14,x15,[x11,#8*0] // n[0..3] + ldp x16,x17,[x11,#8*2] + add x3,x11,#8*4 + b.eq Lmul4x_post + + ldr x24,[x2] + ldp x6,x7,[x1,#8*0] // a[0..3] + ldp x8,x9,[x1,#8*2] + adds x1,x1,#8*4 // clear carry bit + mov x0,xzr + mov x26,sp + b Loop_mul4x_reduction + +.align 4 +Lmul4x_post: + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + mov x0,x12 + mov x27,x12 // x0 copy + subs x10,x19,x14 + add x26,sp,#8*8 + sbcs x11,x20,x15 + sub x28,x5,#8*4 + +Lmul4x_sub: + sbcs x12,x21,x16 + ldp x14,x15,[x3,#8*0] + sub x28,x28,#8*4 + ldp x19,x20,[x26,#8*0] + sbcs x13,x22,x17 + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + ldp x21,x22,[x26,#8*2] + add x26,x26,#8*4 + stp x10,x11,[x0,#8*0] + sbcs x10,x19,x14 + stp x12,x13,[x0,#8*2] + add x0,x0,#8*4 + sbcs x11,x20,x15 + cbnz x28,Lmul4x_sub + + sbcs x12,x21,x16 + mov x26,sp + add x1,sp,#8*4 + ldp x6,x7,[x27,#8*0] + sbcs x13,x22,x17 + stp x10,x11,[x0,#8*0] + ldp x8,x9,[x27,#8*2] + stp x12,x13,[x0,#8*2] + ldp x19,x20,[x1,#8*0] + ldp x21,x22,[x1,#8*2] + sbcs xzr,x30,xzr // did it borrow? + ldr x30,[x29,#8] // pull return address + + sub x28,x5,#8*4 +Lmul4x_cond_copy: + sub x28,x28,#8*4 + csel x10,x19,x6,lo + stp xzr,xzr,[x26,#8*0] + csel x11,x20,x7,lo + ldp x6,x7,[x27,#8*4] + ldp x19,x20,[x1,#8*4] + csel x12,x21,x8,lo + stp xzr,xzr,[x26,#8*2] + add x26,x26,#8*4 + csel x13,x22,x9,lo + ldp x8,x9,[x27,#8*6] + ldp x21,x22,[x1,#8*6] + add x1,x1,#8*4 + stp x10,x11,[x27,#8*0] + stp x12,x13,[x27,#8*2] + add x27,x27,#8*4 + cbnz x28,Lmul4x_cond_copy + + csel x10,x19,x6,lo + stp xzr,xzr,[x26,#8*0] + csel x11,x20,x7,lo + stp xzr,xzr,[x26,#8*2] + csel x12,x21,x8,lo + stp xzr,xzr,[x26,#8*3] + csel x13,x22,x9,lo + stp xzr,xzr,[x26,#8*4] + stp x10,x11,[x27,#8*0] + stp x12,x13,[x27,#8*2] + + b Lmul4x_done + +.align 4 +Lmul4x4_post_condition: + adc x0,x0,xzr + ldr x1,[x29,#96] // pull rp + // x19-3,x0 hold result, x14-7 hold modulus + subs x6,x19,x14 + ldr x30,[x29,#8] // pull return address + sbcs x7,x20,x15 + stp xzr,xzr,[sp,#8*0] + sbcs x8,x21,x16 + stp xzr,xzr,[sp,#8*2] + sbcs x9,x22,x17 + stp xzr,xzr,[sp,#8*4] + sbcs xzr,x0,xzr // did it borrow? + stp xzr,xzr,[sp,#8*6] + + // x6-3 hold result-modulus + csel x6,x19,x6,lo + csel x7,x20,x7,lo + csel x8,x21,x8,lo + csel x9,x22,x9,lo + stp x6,x7,[x1,#8*0] + stp x8,x9,[x1,#8*2] + +Lmul4x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + ret + +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 4 +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/ghash-neon-armv8.S b/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/ghash-neon-armv8.S new file mode 100644 index 0000000000..60bff31018 --- /dev/null +++ b/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/ghash-neon-armv8.S @@ -0,0 +1,338 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text + +.globl _gcm_init_neon +.private_extern _gcm_init_neon + +.align 4 +_gcm_init_neon: + // This function is adapted from gcm_init_v8. xC2 is t3. + ld1 {v17.2d}, [x1] // load H + movi v19.16b, #0xe1 + shl v19.2d, v19.2d, #57 // 0xc2.0 + ext v3.16b, v17.16b, v17.16b, #8 + ushr v18.2d, v19.2d, #63 + dup v17.4s, v17.s[1] + ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01 + ushr v18.2d, v3.2d, #63 + sshr v17.4s, v17.4s, #31 // broadcast carry bit + and v18.16b, v18.16b, v16.16b + shl v3.2d, v3.2d, #1 + ext v18.16b, v18.16b, v18.16b, #8 + and v16.16b, v16.16b, v17.16b + orr v3.16b, v3.16b, v18.16b // H<<<=1 + eor v5.16b, v3.16b, v16.16b // twisted H + st1 {v5.2d}, [x0] // store Htable[0] + ret + + +.globl _gcm_gmult_neon +.private_extern _gcm_gmult_neon + +.align 4 +_gcm_gmult_neon: + ld1 {v3.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, Lmasks@PAGE // load constants + add x9, x9, Lmasks@PAGEOFF + ld1 {v24.2d, v25.2d}, [x9] + rev64 v3.16b, v3.16b // byteswap Xi + ext v3.16b, v3.16b, v3.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + + mov x3, #16 + b Lgmult_neon + + +.globl _gcm_ghash_neon +.private_extern _gcm_ghash_neon + +.align 4 +_gcm_ghash_neon: + ld1 {v0.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, Lmasks@PAGE // load constants + add x9, x9, Lmasks@PAGEOFF + ld1 {v24.2d, v25.2d}, [x9] + rev64 v0.16b, v0.16b // byteswap Xi + ext v0.16b, v0.16b, v0.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + +Loop_neon: + ld1 {v3.16b}, [x2], #16 // load inp + rev64 v3.16b, v3.16b // byteswap inp + ext v3.16b, v3.16b, v3.16b, #8 + eor v3.16b, v3.16b, v0.16b // inp ^= Xi + +Lgmult_neon: + // Split the input into v3 and v4. (The upper halves are unused, + // so it is okay to leave them alone.) + ins v4.d[0], v3.d[1] + ext v16.8b, v5.8b, v5.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v0.8b, v3.8b, v3.8b, #1 // B1 + pmull v0.8h, v5.8b, v0.8b // E = A*B1 + ext v17.8b, v5.8b, v5.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v5.8b, v19.8b // G = A*B2 + ext v18.8b, v5.8b, v5.8b, #3 // A3 + eor v16.16b, v16.16b, v0.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v0.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v0.8h, v5.8b, v0.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. 
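(Editorial note, not part of the generated file.) The quantity that the surrounding ext/pmull/mask sequence assembles from 8-bit partial products is an ordinary carry-less 64x64 to 128-bit product over GF(2)[x]; a naive reference model in C, with no claim of constant-time behaviour:

    /* Reference model of the 64x64 carry-less multiply that the NEON code
     * above builds out of 8-bit pmull partial products; illustration only. */
    #include <stdint.h>

    static void clmul64(uint64_t a, uint64_t b, uint64_t out[2]) {
      uint64_t lo = 0, hi = 0;
      for (int i = 0; i < 64; i++) {
        if ((b >> i) & 1) {
          lo ^= a << i;                         /* low 64 bits of a*x^i  */
          hi ^= (i == 0) ? 0 : a >> (64 - i);   /* bits that spill above */
        }
      }
      out[0] = lo;
      out[1] = hi;
    }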
+ + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v0.16b // N = I + J + pmull v19.8h, v5.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. + zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v0.8h, v5.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v0.16b, v0.16b, v16.16b + eor v0.16b, v0.16b, v18.16b + eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing + ext v16.8b, v7.8b, v7.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v1.8b, v3.8b, v3.8b, #1 // B1 + pmull v1.8h, v7.8b, v1.8b // E = A*B1 + ext v17.8b, v7.8b, v7.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v7.8b, v19.8b // G = A*B2 + ext v18.8b, v7.8b, v7.8b, #3 // A3 + eor v16.16b, v16.16b, v1.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v1.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v1.8h, v7.8b, v1.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v1.16b // N = I + J + pmull v19.8h, v7.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. 
+ zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v1.8h, v7.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v1.16b, v1.16b, v16.16b + eor v1.16b, v1.16b, v18.16b + ext v16.8b, v6.8b, v6.8b, #1 // A1 + pmull v16.8h, v16.8b, v4.8b // F = A1*B + ext v2.8b, v4.8b, v4.8b, #1 // B1 + pmull v2.8h, v6.8b, v2.8b // E = A*B1 + ext v17.8b, v6.8b, v6.8b, #2 // A2 + pmull v17.8h, v17.8b, v4.8b // H = A2*B + ext v19.8b, v4.8b, v4.8b, #2 // B2 + pmull v19.8h, v6.8b, v19.8b // G = A*B2 + ext v18.8b, v6.8b, v6.8b, #3 // A3 + eor v16.16b, v16.16b, v2.16b // L = E + F + pmull v18.8h, v18.8b, v4.8b // J = A3*B + ext v2.8b, v4.8b, v4.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v2.8h, v6.8b, v2.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v4.8b, v4.8b, #4 // B4 + eor v18.16b, v18.16b, v2.16b // N = I + J + pmull v19.8h, v6.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. + zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v2.8h, v6.8b, v4.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v2.16b, v2.16b, v16.16b + eor v2.16b, v2.16b, v18.16b + ext v16.16b, v0.16b, v2.16b, #8 + eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing + eor v1.16b, v1.16b, v2.16b + eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi + ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result + // This is a no-op due to the ins instruction below. 
+ // ins v2.d[0], v1.d[1] + + // equivalent of reduction_avx from ghash-x86_64.pl + shl v17.2d, v0.2d, #57 // 1st phase + shl v18.2d, v0.2d, #62 + eor v18.16b, v18.16b, v17.16b // + shl v17.2d, v0.2d, #63 + eor v18.16b, v18.16b, v17.16b // + // Note Xm contains {Xl.d[1], Xh.d[0]}. + eor v18.16b, v18.16b, v1.16b + ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0] + ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1] + + ushr v18.2d, v0.2d, #1 // 2nd phase + eor v2.16b, v2.16b,v0.16b + eor v0.16b, v0.16b,v18.16b // + ushr v18.2d, v18.2d, #6 + ushr v0.2d, v0.2d, #1 // + eor v0.16b, v0.16b, v2.16b // + eor v0.16b, v0.16b, v18.16b // + + subs x3, x3, #16 + bne Loop_neon + + rev64 v0.16b, v0.16b // byteswap Xi and write + ext v0.16b, v0.16b, v0.16b, #8 + st1 {v0.16b}, [x0] + + ret + + +.section __TEXT,__const +.align 4 +Lmasks: +.quad 0x0000ffffffffffff // k48 +.quad 0x00000000ffffffff // k32 +.quad 0x000000000000ffff // k16 +.quad 0x0000000000000000 // k0 +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/ghashv8-armx64.S b/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/ghashv8-armx64.S new file mode 100644 index 0000000000..be0e283c36 --- /dev/null +++ b/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/ghashv8-armx64.S @@ -0,0 +1,246 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include + +.text + +.globl _gcm_init_v8 +.private_extern _gcm_init_v8 + +.align 4 +_gcm_init_v8: + ld1 {v17.2d},[x1] //load input H + movi v19.16b,#0xe1 + shl v19.2d,v19.2d,#57 //0xc2.0 + ext v3.16b,v17.16b,v17.16b,#8 + ushr v18.2d,v19.2d,#63 + dup v17.4s,v17.s[1] + ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01 + ushr v18.2d,v3.2d,#63 + sshr v17.4s,v17.4s,#31 //broadcast carry bit + and v18.16b,v18.16b,v16.16b + shl v3.2d,v3.2d,#1 + ext v18.16b,v18.16b,v18.16b,#8 + and v16.16b,v16.16b,v17.16b + orr v3.16b,v3.16b,v18.16b //H<<<=1 + eor v20.16b,v3.16b,v16.16b //twisted H + st1 {v20.2d},[x0],#16 //store Htable[0] + + //calculate H^2 + ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing + pmull v0.1q,v20.1d,v20.1d + eor v16.16b,v16.16b,v20.16b + pmull2 v2.1q,v20.2d,v20.2d + pmull v1.1q,v16.1d,v16.1d + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v22.16b,v0.16b,v18.16b + + ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing + eor v17.16b,v17.16b,v22.16b + ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v21.2d,v22.2d},[x0] //store Htable[1..2] + + ret + +.globl _gcm_gmult_v8 +.private_extern _gcm_gmult_v8 + +.align 4 +_gcm_gmult_v8: + ld1 {v17.2d},[x0] //load Xi + movi v19.16b,#0xe1 + ld1 {v20.2d,v21.2d},[x1] //load twisted H, ... 
+ shl v19.2d,v19.2d,#57 +#ifndef __ARMEB__ + rev64 v17.16b,v17.16b +#endif + ext v3.16b,v17.16b,v17.16b,#8 + + pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo + eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi + pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + +#ifndef __ARMEB__ + rev64 v0.16b,v0.16b +#endif + ext v0.16b,v0.16b,v0.16b,#8 + st1 {v0.2d},[x0] //write out Xi + + ret + +.globl _gcm_ghash_v8 +.private_extern _gcm_ghash_v8 + +.align 4 +_gcm_ghash_v8: + ld1 {v0.2d},[x0] //load [rotated] Xi + //"[rotated]" means that + //loaded value would have + //to be rotated in order to + //make it appear as in + //algorithm specification + subs x3,x3,#32 //see if x3 is 32 or larger + mov x12,#16 //x12 is used as post- + //increment for input pointer; + //as loop is modulo-scheduled + //x12 is zeroed just in time + //to preclude overstepping + //inp[len], which means that + //last block[s] are actually + //loaded twice, but last + //copy is not processed + ld1 {v20.2d,v21.2d},[x1],#32 //load twisted H, ..., H^2 + movi v19.16b,#0xe1 + ld1 {v22.2d},[x1] + csel x12,xzr,x12,eq //is it time to zero x12? + ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi + ld1 {v16.2d},[x2],#16 //load [rotated] I[0] + shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant +#ifndef __ARMEB__ + rev64 v16.16b,v16.16b + rev64 v0.16b,v0.16b +#endif + ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0] + b.lo Lodd_tail_v8 //x3 was less than 32 + ld1 {v17.2d},[x2],x12 //load [rotated] I[1] +#ifndef __ARMEB__ + rev64 v17.16b,v17.16b +#endif + ext v7.16b,v17.16b,v17.16b,#8 + eor v3.16b,v3.16b,v0.16b //I[i]^=Xi + pmull v4.1q,v20.1d,v7.1d //H·Ii+1 + eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing + pmull2 v6.1q,v20.2d,v7.2d + b Loop_mod2x_v8 + +.align 4 +Loop_mod2x_v8: + ext v18.16b,v3.16b,v3.16b,#8 + subs x3,x3,#32 //is there more data? + pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo + csel x12,xzr,x12,lo //is it time to zero x12? + + pmull v5.1q,v21.1d,v17.1d + eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi + eor v0.16b,v0.16b,v4.16b //accumulate + pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) + ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2] + + eor v2.16b,v2.16b,v6.16b + csel x12,xzr,x12,eq //is it time to zero x12? 
+ eor v1.16b,v1.16b,v5.16b + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3] +#ifndef __ARMEB__ + rev64 v16.16b,v16.16b +#endif + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + +#ifndef __ARMEB__ + rev64 v17.16b,v17.16b +#endif + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + ext v7.16b,v17.16b,v17.16b,#8 + ext v3.16b,v16.16b,v16.16b,#8 + eor v0.16b,v1.16b,v18.16b + pmull v4.1q,v20.1d,v7.1d //H·Ii+1 + eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v3.16b,v3.16b,v18.16b + eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing + eor v3.16b,v3.16b,v0.16b + pmull2 v6.1q,v20.2d,v7.2d + b.hs Loop_mod2x_v8 //there was at least 32 more bytes + + eor v2.16b,v2.16b,v18.16b + ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b + adds x3,x3,#32 //re-construct x3 + eor v0.16b,v0.16b,v2.16b //re-construct v0.16b + b.eq Ldone_v8 //is x3 zero? +Lodd_tail_v8: + ext v18.16b,v0.16b,v0.16b,#8 + eor v3.16b,v3.16b,v0.16b //inp^=Xi + eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi + + pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo + eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi + pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + +Ldone_v8: +#ifndef __ARMEB__ + rev64 v0.16b,v0.16b +#endif + ext v0.16b,v0.16b,v0.16b,#8 + st1 {v0.2d},[x0] //write out Xi + + ret + +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/sha1-armv8.S b/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/sha1-armv8.S new file mode 100644 index 0000000000..379107efbf --- /dev/null +++ b/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/sha1-armv8.S @@ -0,0 +1,1232 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include + +.text + + +.globl _sha1_block_data_order +.private_extern _sha1_block_data_order + +.align 6 +_sha1_block_data_order: +#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10 + adrp x16,:pg_hi21_nc:_OPENSSL_armcap_P +#else + adrp x16,_OPENSSL_armcap_P@PAGE +#endif + ldr w16,[x16,_OPENSSL_armcap_P@PAGEOFF] + tst w16,#ARMV8_SHA1 + b.ne Lv8_entry + + stp x29,x30,[sp,#-96]! 
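(Editorial note, not part of the generated file.) The scalar rounds that follow are annotated with "e+=rot(a,5)", "e+=F(b,c,d)" and so on; as an aid to reading them, one round of the first (Ch) stage can be modelled in C as below, with the constant matching the movz/movk pair that builds 0x5a827999:

    /* Sketch of one SHA-1 round in the 0..19 stage, whose F is the Ch
     * function computed by the bic/and/orr sequences below.  a..e is the
     * working state, xi the message word for this round. */
    #include <stdint.h>

    static uint32_t rol32(uint32_t x, unsigned n) {
      return (x << n) | (x >> (32 - n));
    }

    static void sha1_round_ch(uint32_t a, uint32_t *b, uint32_t c,
                              uint32_t d, uint32_t *e, uint32_t xi) {
      uint32_t f = (*b & c) | (~*b & d);          /* bic/and/orr == Ch      */
      *e += rol32(a, 5) + f + 0x5a827999u + xi;   /* e += rot(a,5)+F+K+X[i] */
      *b = rol32(*b, 30);                         /* ror wb,wb,#2           */
    }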
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp w20,w21,[x0] + ldp w22,w23,[x0,#8] + ldr w24,[x0,#16] + +Loop: + ldr x3,[x1],#64 + movz w28,#0x7999 + sub x2,x2,#1 + movk w28,#0x5a82,lsl#16 +#ifdef __ARMEB__ + ror x3,x3,#32 +#else + rev32 x3,x3 +#endif + add w24,w24,w28 // warm it up + add w24,w24,w3 + lsr x4,x3,#32 + ldr x5,[x1,#-56] + bic w25,w23,w21 + and w26,w22,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + orr w25,w25,w26 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + add w23,w23,w4 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) +#ifdef __ARMEB__ + ror x5,x5,#32 +#else + rev32 x5,x5 +#endif + bic w25,w22,w20 + and w26,w21,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + orr w25,w25,w26 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + add w22,w22,w5 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + lsr x6,x5,#32 + ldr x7,[x1,#-48] + bic w25,w21,w24 + and w26,w20,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + orr w25,w25,w26 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + add w21,w21,w6 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) +#ifdef __ARMEB__ + ror x7,x7,#32 +#else + rev32 x7,x7 +#endif + bic w25,w20,w23 + and w26,w24,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + orr w25,w25,w26 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + add w20,w20,w7 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + lsr x8,x7,#32 + ldr x9,[x1,#-40] + bic w25,w24,w22 + and w26,w23,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + orr w25,w25,w26 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + add w24,w24,w8 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) +#ifdef __ARMEB__ + ror x9,x9,#32 +#else + rev32 x9,x9 +#endif + bic w25,w23,w21 + and w26,w22,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + orr w25,w25,w26 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + add w23,w23,w9 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + lsr x10,x9,#32 + ldr x11,[x1,#-32] + bic w25,w22,w20 + and w26,w21,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + orr w25,w25,w26 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + add w22,w22,w10 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) +#ifdef __ARMEB__ + ror x11,x11,#32 +#else + rev32 x11,x11 +#endif + bic w25,w21,w24 + and w26,w20,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + orr w25,w25,w26 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + add w21,w21,w11 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + lsr x12,x11,#32 + ldr x13,[x1,#-24] + bic w25,w20,w23 + and w26,w24,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + orr w25,w25,w26 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + add w20,w20,w12 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) +#ifdef __ARMEB__ + ror x13,x13,#32 +#else + rev32 x13,x13 +#endif + bic w25,w24,w22 + and w26,w23,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + orr w25,w25,w26 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + add w24,w24,w13 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + lsr x14,x13,#32 + ldr x15,[x1,#-16] + bic w25,w23,w21 + and w26,w22,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + orr w25,w25,w26 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + add w23,w23,w14 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) +#ifdef __ARMEB__ + ror x15,x15,#32 +#else + rev32 x15,x15 +#endif + bic w25,w22,w20 + and w26,w21,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + orr 
w25,w25,w26 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + add w22,w22,w15 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + lsr x16,x15,#32 + ldr x17,[x1,#-8] + bic w25,w21,w24 + and w26,w20,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + orr w25,w25,w26 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + add w21,w21,w16 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) +#ifdef __ARMEB__ + ror x17,x17,#32 +#else + rev32 x17,x17 +#endif + bic w25,w20,w23 + and w26,w24,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + orr w25,w25,w26 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + add w20,w20,w17 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + lsr x19,x17,#32 + eor w3,w3,w5 + bic w25,w24,w22 + and w26,w23,w22 + ror w27,w21,#27 + eor w3,w3,w11 + add w24,w24,w28 // future e+=K + orr w25,w25,w26 + add w20,w20,w27 // e+=rot(a,5) + eor w3,w3,w16 + ror w22,w22,#2 + add w24,w24,w19 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w3,w3,#31 + eor w4,w4,w6 + bic w25,w23,w21 + and w26,w22,w21 + ror w27,w20,#27 + eor w4,w4,w12 + add w23,w23,w28 // future e+=K + orr w25,w25,w26 + add w24,w24,w27 // e+=rot(a,5) + eor w4,w4,w17 + ror w21,w21,#2 + add w23,w23,w3 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w4,w4,#31 + eor w5,w5,w7 + bic w25,w22,w20 + and w26,w21,w20 + ror w27,w24,#27 + eor w5,w5,w13 + add w22,w22,w28 // future e+=K + orr w25,w25,w26 + add w23,w23,w27 // e+=rot(a,5) + eor w5,w5,w19 + ror w20,w20,#2 + add w22,w22,w4 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w5,w5,#31 + eor w6,w6,w8 + bic w25,w21,w24 + and w26,w20,w24 + ror w27,w23,#27 + eor w6,w6,w14 + add w21,w21,w28 // future e+=K + orr w25,w25,w26 + add w22,w22,w27 // e+=rot(a,5) + eor w6,w6,w3 + ror w24,w24,#2 + add w21,w21,w5 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w6,w6,#31 + eor w7,w7,w9 + bic w25,w20,w23 + and w26,w24,w23 + ror w27,w22,#27 + eor w7,w7,w15 + add w20,w20,w28 // future e+=K + orr w25,w25,w26 + add w21,w21,w27 // e+=rot(a,5) + eor w7,w7,w4 + ror w23,w23,#2 + add w20,w20,w6 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w7,w7,#31 + movz w28,#0xeba1 + movk w28,#0x6ed9,lsl#16 + eor w8,w8,w10 + bic w25,w24,w22 + and w26,w23,w22 + ror w27,w21,#27 + eor w8,w8,w16 + add w24,w24,w28 // future e+=K + orr w25,w25,w26 + add w20,w20,w27 // e+=rot(a,5) + eor w8,w8,w5 + ror w22,w22,#2 + add w24,w24,w7 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w8,w8,#31 + eor w9,w9,w11 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w9,w9,w17 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w9,w9,w6 + add w23,w23,w8 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w9,w9,#31 + eor w10,w10,w12 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w10,w10,w19 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w10,w10,w7 + add w22,w22,w9 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w10,w10,#31 + eor w11,w11,w13 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w11,w11,w3 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w11,w11,w8 + add w21,w21,w10 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w11,w11,#31 + eor w12,w12,w14 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w12,w12,w4 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w12,w12,w9 + add w20,w20,w11 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror 
w12,w12,#31 + eor w13,w13,w15 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w13,w13,w5 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w13,w13,w10 + add w24,w24,w12 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w13,w13,#31 + eor w14,w14,w16 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w14,w14,w6 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w14,w14,w11 + add w23,w23,w13 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w14,w14,#31 + eor w15,w15,w17 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w15,w15,w7 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w15,w15,w12 + add w22,w22,w14 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w15,w15,#31 + eor w16,w16,w19 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w16,w16,w8 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w16,w16,w13 + add w21,w21,w15 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w16,w16,#31 + eor w17,w17,w3 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w17,w17,w9 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w17,w17,w14 + add w20,w20,w16 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w17,w17,#31 + eor w19,w19,w4 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w19,w19,w10 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w19,w19,w15 + add w24,w24,w17 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w19,w19,#31 + eor w3,w3,w5 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w3,w3,w11 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w3,w3,w16 + add w23,w23,w19 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w3,w3,#31 + eor w4,w4,w6 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w4,w4,w12 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w4,w4,w17 + add w22,w22,w3 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w4,w4,#31 + eor w5,w5,w7 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w5,w5,w13 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w5,w5,w19 + add w21,w21,w4 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w5,w5,#31 + eor w6,w6,w8 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w6,w6,w14 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w6,w6,w3 + add w20,w20,w5 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w6,w6,#31 + eor w7,w7,w9 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w7,w7,w15 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w7,w7,w4 + add w24,w24,w6 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w7,w7,#31 + eor w8,w8,w10 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w8,w8,w16 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w8,w8,w5 + add w23,w23,w7 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w8,w8,#31 + eor w9,w9,w11 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w9,w9,w17 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w9,w9,w6 + add w22,w22,w8 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w9,w9,#31 + eor 
w10,w10,w12 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w10,w10,w19 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w10,w10,w7 + add w21,w21,w9 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w10,w10,#31 + eor w11,w11,w13 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w11,w11,w3 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w11,w11,w8 + add w20,w20,w10 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w11,w11,#31 + movz w28,#0xbcdc + movk w28,#0x8f1b,lsl#16 + eor w12,w12,w14 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w12,w12,w4 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w12,w12,w9 + add w24,w24,w11 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w12,w12,#31 + orr w25,w21,w22 + and w26,w21,w22 + eor w13,w13,w15 + ror w27,w20,#27 + and w25,w25,w23 + add w23,w23,w28 // future e+=K + eor w13,w13,w5 + add w24,w24,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w21,w21,#2 + eor w13,w13,w10 + add w23,w23,w12 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w13,w13,#31 + orr w25,w20,w21 + and w26,w20,w21 + eor w14,w14,w16 + ror w27,w24,#27 + and w25,w25,w22 + add w22,w22,w28 // future e+=K + eor w14,w14,w6 + add w23,w23,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w20,w20,#2 + eor w14,w14,w11 + add w22,w22,w13 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w14,w14,#31 + orr w25,w24,w20 + and w26,w24,w20 + eor w15,w15,w17 + ror w27,w23,#27 + and w25,w25,w21 + add w21,w21,w28 // future e+=K + eor w15,w15,w7 + add w22,w22,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w24,w24,#2 + eor w15,w15,w12 + add w21,w21,w14 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w15,w15,#31 + orr w25,w23,w24 + and w26,w23,w24 + eor w16,w16,w19 + ror w27,w22,#27 + and w25,w25,w20 + add w20,w20,w28 // future e+=K + eor w16,w16,w8 + add w21,w21,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w23,w23,#2 + eor w16,w16,w13 + add w20,w20,w15 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w16,w16,#31 + orr w25,w22,w23 + and w26,w22,w23 + eor w17,w17,w3 + ror w27,w21,#27 + and w25,w25,w24 + add w24,w24,w28 // future e+=K + eor w17,w17,w9 + add w20,w20,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w22,w22,#2 + eor w17,w17,w14 + add w24,w24,w16 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w17,w17,#31 + orr w25,w21,w22 + and w26,w21,w22 + eor w19,w19,w4 + ror w27,w20,#27 + and w25,w25,w23 + add w23,w23,w28 // future e+=K + eor w19,w19,w10 + add w24,w24,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w21,w21,#2 + eor w19,w19,w15 + add w23,w23,w17 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w19,w19,#31 + orr w25,w20,w21 + and w26,w20,w21 + eor w3,w3,w5 + ror w27,w24,#27 + and w25,w25,w22 + add w22,w22,w28 // future e+=K + eor w3,w3,w11 + add w23,w23,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w20,w20,#2 + eor w3,w3,w16 + add w22,w22,w19 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w3,w3,#31 + orr w25,w24,w20 + and w26,w24,w20 + eor w4,w4,w6 + ror w27,w23,#27 + and w25,w25,w21 + add w21,w21,w28 // future e+=K + eor w4,w4,w12 + add w22,w22,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w24,w24,#2 + eor w4,w4,w17 + add w21,w21,w3 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w4,w4,#31 + orr w25,w23,w24 + and w26,w23,w24 + eor w5,w5,w7 + ror w27,w22,#27 + and w25,w25,w20 + add w20,w20,w28 // future e+=K + eor w5,w5,w13 + add w21,w21,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror 
w23,w23,#2 + eor w5,w5,w19 + add w20,w20,w4 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w5,w5,#31 + orr w25,w22,w23 + and w26,w22,w23 + eor w6,w6,w8 + ror w27,w21,#27 + and w25,w25,w24 + add w24,w24,w28 // future e+=K + eor w6,w6,w14 + add w20,w20,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w22,w22,#2 + eor w6,w6,w3 + add w24,w24,w5 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w6,w6,#31 + orr w25,w21,w22 + and w26,w21,w22 + eor w7,w7,w9 + ror w27,w20,#27 + and w25,w25,w23 + add w23,w23,w28 // future e+=K + eor w7,w7,w15 + add w24,w24,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w21,w21,#2 + eor w7,w7,w4 + add w23,w23,w6 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w7,w7,#31 + orr w25,w20,w21 + and w26,w20,w21 + eor w8,w8,w10 + ror w27,w24,#27 + and w25,w25,w22 + add w22,w22,w28 // future e+=K + eor w8,w8,w16 + add w23,w23,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w20,w20,#2 + eor w8,w8,w5 + add w22,w22,w7 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w8,w8,#31 + orr w25,w24,w20 + and w26,w24,w20 + eor w9,w9,w11 + ror w27,w23,#27 + and w25,w25,w21 + add w21,w21,w28 // future e+=K + eor w9,w9,w17 + add w22,w22,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w24,w24,#2 + eor w9,w9,w6 + add w21,w21,w8 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w9,w9,#31 + orr w25,w23,w24 + and w26,w23,w24 + eor w10,w10,w12 + ror w27,w22,#27 + and w25,w25,w20 + add w20,w20,w28 // future e+=K + eor w10,w10,w19 + add w21,w21,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w23,w23,#2 + eor w10,w10,w7 + add w20,w20,w9 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w10,w10,#31 + orr w25,w22,w23 + and w26,w22,w23 + eor w11,w11,w13 + ror w27,w21,#27 + and w25,w25,w24 + add w24,w24,w28 // future e+=K + eor w11,w11,w3 + add w20,w20,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w22,w22,#2 + eor w11,w11,w8 + add w24,w24,w10 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w11,w11,#31 + orr w25,w21,w22 + and w26,w21,w22 + eor w12,w12,w14 + ror w27,w20,#27 + and w25,w25,w23 + add w23,w23,w28 // future e+=K + eor w12,w12,w4 + add w24,w24,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w21,w21,#2 + eor w12,w12,w9 + add w23,w23,w11 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w12,w12,#31 + orr w25,w20,w21 + and w26,w20,w21 + eor w13,w13,w15 + ror w27,w24,#27 + and w25,w25,w22 + add w22,w22,w28 // future e+=K + eor w13,w13,w5 + add w23,w23,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w20,w20,#2 + eor w13,w13,w10 + add w22,w22,w12 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w13,w13,#31 + orr w25,w24,w20 + and w26,w24,w20 + eor w14,w14,w16 + ror w27,w23,#27 + and w25,w25,w21 + add w21,w21,w28 // future e+=K + eor w14,w14,w6 + add w22,w22,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w24,w24,#2 + eor w14,w14,w11 + add w21,w21,w13 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w14,w14,#31 + orr w25,w23,w24 + and w26,w23,w24 + eor w15,w15,w17 + ror w27,w22,#27 + and w25,w25,w20 + add w20,w20,w28 // future e+=K + eor w15,w15,w7 + add w21,w21,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w23,w23,#2 + eor w15,w15,w12 + add w20,w20,w14 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w15,w15,#31 + movz w28,#0xc1d6 + movk w28,#0xca62,lsl#16 + orr w25,w22,w23 + and w26,w22,w23 + eor w16,w16,w19 + ror w27,w21,#27 + and w25,w25,w24 + add w24,w24,w28 // future e+=K + eor w16,w16,w8 + add w20,w20,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w22,w22,#2 + eor w16,w16,w13 + add w24,w24,w15 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror 
w16,w16,#31 + eor w17,w17,w3 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w17,w17,w9 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w17,w17,w14 + add w23,w23,w16 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w17,w17,#31 + eor w19,w19,w4 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w19,w19,w10 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w19,w19,w15 + add w22,w22,w17 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w19,w19,#31 + eor w3,w3,w5 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w3,w3,w11 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w3,w3,w16 + add w21,w21,w19 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w3,w3,#31 + eor w4,w4,w6 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w4,w4,w12 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w4,w4,w17 + add w20,w20,w3 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w4,w4,#31 + eor w5,w5,w7 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w5,w5,w13 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w5,w5,w19 + add w24,w24,w4 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w5,w5,#31 + eor w6,w6,w8 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w6,w6,w14 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w6,w6,w3 + add w23,w23,w5 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w6,w6,#31 + eor w7,w7,w9 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w7,w7,w15 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w7,w7,w4 + add w22,w22,w6 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w7,w7,#31 + eor w8,w8,w10 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w8,w8,w16 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w8,w8,w5 + add w21,w21,w7 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w8,w8,#31 + eor w9,w9,w11 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w9,w9,w17 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w9,w9,w6 + add w20,w20,w8 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w9,w9,#31 + eor w10,w10,w12 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w10,w10,w19 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w10,w10,w7 + add w24,w24,w9 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w10,w10,#31 + eor w11,w11,w13 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w11,w11,w3 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w11,w11,w8 + add w23,w23,w10 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w11,w11,#31 + eor w12,w12,w14 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w12,w12,w4 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w12,w12,w9 + add w22,w22,w11 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w12,w12,#31 + eor w13,w13,w15 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w13,w13,w5 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w13,w13,w10 + add w21,w21,w12 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w13,w13,#31 + eor 
w14,w14,w16 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w14,w14,w6 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w14,w14,w11 + add w20,w20,w13 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w14,w14,#31 + eor w15,w15,w17 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w15,w15,w7 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w15,w15,w12 + add w24,w24,w14 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w15,w15,#31 + eor w16,w16,w19 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w16,w16,w8 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w16,w16,w13 + add w23,w23,w15 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w16,w16,#31 + eor w17,w17,w3 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w17,w17,w9 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w17,w17,w14 + add w22,w22,w16 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w17,w17,#31 + eor w19,w19,w4 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w19,w19,w10 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w19,w19,w15 + add w21,w21,w17 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w19,w19,#31 + ldp w4,w5,[x0] + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + add w20,w20,w19 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ldp w6,w7,[x0,#8] + eor w25,w24,w22 + ror w27,w21,#27 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + ldr w8,[x0,#16] + add w20,w20,w25 // e+=F(b,c,d) + add w21,w21,w5 + add w22,w22,w6 + add w20,w20,w4 + add w23,w23,w7 + add w24,w24,w8 + stp w20,w21,[x0] + stp w22,w23,[x0,#8] + str w24,[x0,#16] + cbnz x2,Loop + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldp x25,x26,[sp,#64] + ldp x27,x28,[sp,#80] + ldr x29,[sp],#96 + ret + + +.align 6 +sha1_block_armv8: +Lv8_entry: + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + adrp x4,Lconst@PAGE + add x4,x4,Lconst@PAGEOFF + eor v1.16b,v1.16b,v1.16b + ld1 {v0.4s},[x0],#16 + ld1 {v1.s}[0],[x0] + sub x0,x0,#16 + ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x4] + +Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + + add v20.4s,v16.4s,v4.4s + rev32 v6.16b,v6.16b + orr v22.16b,v0.16b,v0.16b // offload + + add v21.4s,v16.4s,v5.4s + rev32 v7.16b,v7.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b +.long 0x5e140020 //sha1c v0.16b,v1.16b,v20.4s // 0 + add v20.4s,v16.4s,v6.4s +.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 1 +.long 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s + add v21.4s,v16.4s,v7.4s +.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b +.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 2 +.long 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s + add v20.4s,v16.4s,v4.4s +.long 0x5e281885 //sha1su1 v5.16b,v4.16b +.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 3 +.long 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s + add v21.4s,v17.4s,v5.4s +.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b +.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 4 +.long 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s + add v20.4s,v17.4s,v6.4s +.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b +.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 5 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v17.4s,v7.4s +.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b +.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 6 +.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s + add v20.4s,v17.4s,v4.4s +.long 0x5e281885 //sha1su1 v5.16b,v4.16b +.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 7 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v17.4s,v5.4s +.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b +.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 8 +.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s + add v20.4s,v18.4s,v6.4s +.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b +.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 9 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v18.4s,v7.4s +.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b +.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 10 +.long 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s + add v20.4s,v18.4s,v4.4s +.long 0x5e281885 //sha1su1 v5.16b,v4.16b +.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 11 +.long 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s + add v21.4s,v18.4s,v5.4s +.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b +.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 12 +.long 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s + add v20.4s,v18.4s,v6.4s +.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b +.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 13 +.long 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s + add v21.4s,v19.4s,v7.4s +.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b +.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 14 +.long 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s + add v20.4s,v19.4s,v4.4s +.long 0x5e281885 //sha1su1 v5.16b,v4.16b +.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b +.long 
0x5e280802 //sha1h v2.16b,v0.16b // 15 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v19.4s,v5.4s +.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b +.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 16 +.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s + add v20.4s,v19.4s,v6.4s +.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 17 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v19.4s,v7.4s + +.long 0x5e280803 //sha1h v3.16b,v0.16b // 18 +.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s + +.long 0x5e280802 //sha1h v2.16b,v0.16b // 19 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + + add v1.4s,v1.4s,v2.4s + add v0.4s,v0.4s,v22.4s + + cbnz x2,Loop_hw + + st1 {v0.4s},[x0],#16 + st1 {v1.s}[0],[x0] + + ldr x29,[sp],#16 + ret + +.section __TEXT,__const +.align 6 +Lconst: +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39 +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59 +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79 +.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +.comm _OPENSSL_armcap_P,4,4 +.private_extern _OPENSSL_armcap_P +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/sha256-armv8.S b/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/sha256-armv8.S new file mode 100644 index 0000000000..d6fa5a930d --- /dev/null +++ b/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/sha256-armv8.S @@ -0,0 +1,1210 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the OpenSSL license (the "License"). You may not use +// this file except in compliance with the License. You can obtain a copy +// in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html + +// ==================================================================== +// Written by Andy Polyakov for the OpenSSL +// project. The module is, however, dual licensed under OpenSSL and +// CRYPTOGAMS licenses depending on where you obtain it. For further +// details see http://www.openssl.org/~appro/cryptogams/. +// +// Permission to use under GPLv2 terms is granted. +// ==================================================================== +// +// SHA256/512 for ARMv8. +// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. 
+// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significanty faster +// and the gap is only 40-90%. + +#ifndef __KERNEL__ +# include +#endif + +.text + + +.globl _sha256_block_data_order +.private_extern _sha256_block_data_order + +.align 6 +_sha256_block_data_order: +#ifndef __KERNEL__ +#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10 + adrp x16,:pg_hi21_nc:_OPENSSL_armcap_P +#else + adrp x16,_OPENSSL_armcap_P@PAGE +#endif + ldr w16,[x16,_OPENSSL_armcap_P@PAGEOFF] + tst w16,#ARMV8_SHA256 + b.ne Lv8_entry +#endif + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*4 + + ldp w20,w21,[x0] // load context + ldp w22,w23,[x0,#2*4] + ldp w24,w25,[x0,#4*4] + add x2,x1,x2,lsl#6 // end of input + ldp w26,w27,[x0,#6*4] + adrp x30,LK256@PAGE + add x30,x30,LK256@PAGEOFF + stp x0,x2,[x29,#96] + +Loop: + ldp w3,w4,[x1],#2*4 + ldr w19,[x30],#4 // *K++ + eor w28,w21,w22 // magic seed + str x1,[x29,#112] +#ifndef __ARMEB__ + rev w3,w3 // 0 +#endif + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w6,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w3 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w4,w4 // 1 +#endif + ldp w5,w6,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor w7,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w4 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) + eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w7,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w26,w26,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w5,w5 // 2 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w8,w22,w22,ror#14 + and w17,w23,w22 + bic w19,w24,w22 + add w25,w25,w5 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // Sigma0(a) + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w6,w6 // 3 +#endif + ldp w7,w8,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w9,w21,w21,ror#14 + and w17,w22,w21 + bic 
w28,w23,w21 + add w24,w24,w6 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w7,w7 // 4 +#endif + add w24,w24,w17 // h+=Sigma0(a) + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + eor w10,w20,w20,ror#14 + and w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w7 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w10,ror#11 // Sigma1(e) + ror w10,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w10,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w8,w8 // 5 +#endif + ldp w9,w10,[x1],#2*4 + add w23,w23,w17 // h+=Sigma0(a) + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w11,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w8 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w11,ror#11 // Sigma1(e) + ror w11,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w11,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w9,w9 // 6 +#endif + add w22,w22,w17 // h+=Sigma0(a) + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + eor w12,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w9 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w12,ror#11 // Sigma1(e) + ror w12,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w12,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w10,w10 // 7 +#endif + ldp w11,w12,[x1],#2*4 + add w21,w21,w17 // h+=Sigma0(a) + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + eor w13,w25,w25,ror#14 + and w17,w26,w25 + bic w28,w27,w25 + add w20,w20,w10 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w13,ror#11 // Sigma1(e) + ror w13,w21,#2 + add w20,w20,w17 // h+=Ch(e,f,g) + eor w17,w21,w21,ror#9 + add w20,w20,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w24,w24,w20 // d+=h + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w13,w17,ror#13 // Sigma0(a) + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w20,w20,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w11,w11 // 8 +#endif + add w20,w20,w17 // h+=Sigma0(a) + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w14,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w11 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // 
a^b, b^c in next round + eor w16,w16,w14,ror#11 // Sigma1(e) + ror w14,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w14,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w12,w12 // 9 +#endif + ldp w13,w14,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor w15,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w12 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w15,ror#11 // Sigma1(e) + ror w15,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) + eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w15,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w26,w26,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w13,w13 // 10 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w0,w22,w22,ror#14 + and w17,w23,w22 + bic w19,w24,w22 + add w25,w25,w13 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w0,ror#11 // Sigma1(e) + ror w0,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w0,w17,ror#13 // Sigma0(a) + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w14,w14 // 11 +#endif + ldp w15,w0,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w6,w21,w21,ror#14 + and w17,w22,w21 + bic w28,w23,w21 + add w24,w24,w14 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w15,w15 // 12 +#endif + add w24,w24,w17 // h+=Sigma0(a) + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + eor w7,w20,w20,ror#14 + and w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w15 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w7,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w0,w0 // 13 +#endif + ldp w1,w2,[x1] + add w23,w23,w17 // h+=Sigma0(a) + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w8,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w0 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + 
eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w1,w1 // 14 +#endif + ldr w6,[sp,#12] + add w22,w22,w17 // h+=Sigma0(a) + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + eor w9,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w1 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w2,w2 // 15 +#endif + ldr w7,[sp,#0] + add w21,w21,w17 // h+=Sigma0(a) + str w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 +Loop_16_xx: + ldr w8,[sp,#4] + str w11,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + ror w10,w5,#7 + and w17,w25,w24 + ror w9,w2,#17 + bic w19,w26,w24 + ror w11,w20,#2 + add w27,w27,w3 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w10,w10,w5,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w11,w11,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w9,w9,w2,ror#19 + eor w10,w10,w5,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w11,w20,ror#22 // Sigma0(a) + eor w9,w9,w2,lsr#10 // sigma1(X[i+14]) + add w4,w4,w13 + add w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w4,w4,w10 + add w27,w27,w17 // h+=Sigma0(a) + add w4,w4,w9 + ldr w9,[sp,#8] + str w12,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w11,w6,#7 + and w17,w24,w23 + ror w10,w3,#17 + bic w28,w25,w23 + ror w12,w27,#2 + add w26,w26,w4 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w11,w11,w6,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w12,w12,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w10,w10,w3,ror#19 + eor w11,w11,w6,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w12,w27,ror#22 // Sigma0(a) + eor w10,w10,w3,lsr#10 // sigma1(X[i+14]) + add 
w5,w5,w14 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w5,w5,w11 + add w26,w26,w17 // h+=Sigma0(a) + add w5,w5,w10 + ldr w10,[sp,#12] + str w13,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w12,w7,#7 + and w17,w23,w22 + ror w11,w4,#17 + bic w19,w24,w22 + ror w13,w26,#2 + add w25,w25,w5 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w12,w12,w7,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor w13,w13,w26,ror#13 + add w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w11,w11,w4,ror#19 + eor w12,w12,w7,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w13,w26,ror#22 // Sigma0(a) + eor w11,w11,w4,lsr#10 // sigma1(X[i+14]) + add w6,w6,w15 + add w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w6,w6,w12 + add w25,w25,w17 // h+=Sigma0(a) + add w6,w6,w11 + ldr w11,[sp,#0] + str w14,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + ror w13,w8,#7 + and w17,w22,w21 + ror w12,w5,#17 + bic w28,w23,w21 + ror w14,w25,#2 + add w24,w24,w6 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w13,w13,w8,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w14,w14,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w12,w12,w5,ror#19 + eor w13,w13,w8,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w14,w25,ror#22 // Sigma0(a) + eor w12,w12,w5,lsr#10 // sigma1(X[i+14]) + add w7,w7,w0 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w7,w7,w13 + add w24,w24,w17 // h+=Sigma0(a) + add w7,w7,w12 + ldr w12,[sp,#4] + str w15,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + ror w14,w9,#7 + and w17,w21,w20 + ror w13,w6,#17 + bic w19,w22,w20 + ror w15,w24,#2 + add w23,w23,w7 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w14,w14,w9,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w15,w15,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w13,w13,w6,ror#19 + eor w14,w14,w9,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w15,w24,ror#22 // Sigma0(a) + eor w13,w13,w6,lsr#10 // sigma1(X[i+14]) + add w8,w8,w1 + add w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w8,w8,w14 + add w23,w23,w17 // h+=Sigma0(a) + add w8,w8,w13 + ldr w13,[sp,#8] + str w0,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w15,w10,#7 + and w17,w20,w27 + ror w14,w7,#17 + bic w28,w21,w27 + ror w0,w23,#2 + add w22,w22,w8 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w15,w15,w10,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w0,w0,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w14,w14,w7,ror#19 + eor w15,w15,w10,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w0,w23,ror#22 // Sigma0(a) + eor w14,w14,w7,lsr#10 // sigma1(X[i+14]) + add w9,w9,w2 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w9,w9,w15 + add 
w22,w22,w17 // h+=Sigma0(a) + add w9,w9,w14 + ldr w14,[sp,#12] + str w1,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w0,w11,#7 + and w17,w27,w26 + ror w15,w8,#17 + bic w19,w20,w26 + ror w1,w22,#2 + add w21,w21,w9 // h+=X[i] + eor w16,w16,w26,ror#11 + eor w0,w0,w11,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w1,w1,w22,ror#13 + add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w15,w15,w8,ror#19 + eor w0,w0,w11,lsr#3 // sigma0(X[i+1]) + add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w1,w22,ror#22 // Sigma0(a) + eor w15,w15,w8,lsr#10 // sigma1(X[i+14]) + add w10,w10,w3 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w10,w10,w0 + add w21,w21,w17 // h+=Sigma0(a) + add w10,w10,w15 + ldr w15,[sp,#0] + str w2,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w1,w12,#7 + and w17,w26,w25 + ror w0,w9,#17 + bic w28,w27,w25 + ror w2,w21,#2 + add w20,w20,w10 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w1,w1,w12,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w2,w2,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w0,w0,w9,ror#19 + eor w1,w1,w12,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w2,w21,ror#22 // Sigma0(a) + eor w0,w0,w9,lsr#10 // sigma1(X[i+14]) + add w11,w11,w4 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w11,w11,w1 + add w20,w20,w17 // h+=Sigma0(a) + add w11,w11,w0 + ldr w0,[sp,#4] + str w3,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + ror w2,w13,#7 + and w17,w25,w24 + ror w1,w10,#17 + bic w19,w26,w24 + ror w3,w20,#2 + add w27,w27,w11 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w2,w2,w13,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w3,w3,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w1,w1,w10,ror#19 + eor w2,w2,w13,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w3,w20,ror#22 // Sigma0(a) + eor w1,w1,w10,lsr#10 // sigma1(X[i+14]) + add w12,w12,w5 + add w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w12,w12,w2 + add w27,w27,w17 // h+=Sigma0(a) + add w12,w12,w1 + ldr w1,[sp,#8] + str w4,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w3,w14,#7 + and w17,w24,w23 + ror w2,w11,#17 + bic w28,w25,w23 + ror w4,w27,#2 + add w26,w26,w12 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w3,w3,w14,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w4,w4,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w2,w2,w11,ror#19 + eor w3,w3,w14,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w4,w27,ror#22 // Sigma0(a) + eor w2,w2,w11,lsr#10 // sigma1(X[i+14]) + add w13,w13,w6 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w13,w13,w3 + add w26,w26,w17 // h+=Sigma0(a) + add w13,w13,w2 + ldr w2,[sp,#12] + str w5,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w4,w15,#7 + and w17,w23,w22 + ror 
w3,w12,#17 + bic w19,w24,w22 + ror w5,w26,#2 + add w25,w25,w13 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w4,w4,w15,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor w5,w5,w26,ror#13 + add w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w3,w3,w12,ror#19 + eor w4,w4,w15,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w5,w26,ror#22 // Sigma0(a) + eor w3,w3,w12,lsr#10 // sigma1(X[i+14]) + add w14,w14,w7 + add w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w14,w14,w4 + add w25,w25,w17 // h+=Sigma0(a) + add w14,w14,w3 + ldr w3,[sp,#0] + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + ror w5,w0,#7 + and w17,w22,w21 + ror w4,w13,#17 + bic w28,w23,w21 + ror w6,w25,#2 + add w24,w24,w14 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w5,w5,w0,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w6,w6,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w4,w4,w13,ror#19 + eor w5,w5,w0,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w25,ror#22 // Sigma0(a) + eor w4,w4,w13,lsr#10 // sigma1(X[i+14]) + add w15,w15,w8 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w15,w15,w5 + add w24,w24,w17 // h+=Sigma0(a) + add w15,w15,w4 + ldr w4,[sp,#4] + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + ror w6,w1,#7 + and w17,w21,w20 + ror w5,w14,#17 + bic w19,w22,w20 + ror w7,w24,#2 + add w23,w23,w15 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w6,w6,w1,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w7,w7,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w5,w5,w14,ror#19 + eor w6,w6,w1,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w7,w24,ror#22 // Sigma0(a) + eor w5,w5,w14,lsr#10 // sigma1(X[i+14]) + add w0,w0,w9 + add w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w0,w0,w6 + add w23,w23,w17 // h+=Sigma0(a) + add w0,w0,w5 + ldr w5,[sp,#8] + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w7,w2,#7 + and w17,w20,w27 + ror w6,w15,#17 + bic w28,w21,w27 + ror w8,w23,#2 + add w22,w22,w0 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w7,w7,w2,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w8,w8,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w6,w6,w15,ror#19 + eor w7,w7,w2,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w23,ror#22 // Sigma0(a) + eor w6,w6,w15,lsr#10 // sigma1(X[i+14]) + add w1,w1,w10 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w1,w1,w7 + add w22,w22,w17 // h+=Sigma0(a) + add w1,w1,w6 + ldr w6,[sp,#12] + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w8,w3,#7 + and w17,w27,w26 + ror w7,w0,#17 + bic w19,w20,w26 + ror w9,w22,#2 + add w21,w21,w1 // h+=X[i] + eor w16,w16,w26,ror#11 + eor w8,w8,w3,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next 
round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w9,w9,w22,ror#13 + add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w7,w7,w0,ror#19 + eor w8,w8,w3,lsr#3 // sigma0(X[i+1]) + add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w22,ror#22 // Sigma0(a) + eor w7,w7,w0,lsr#10 // sigma1(X[i+14]) + add w2,w2,w11 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w2,w2,w8 + add w21,w21,w17 // h+=Sigma0(a) + add w2,w2,w7 + ldr w7,[sp,#0] + str w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 + cbnz w19,Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#260 // rewind + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#2*4] + add x1,x1,#14*4 // advance input pointer + ldp w7,w8,[x0,#4*4] + add w20,w20,w3 + ldp w9,w10,[x0,#6*4] + add w21,w21,w4 + add w22,w22,w5 + add w23,w23,w6 + stp w20,w21,[x0] + add w24,w24,w7 + add w25,w25,w8 + stp w22,w23,[x0,#2*4] + add w26,w26,w9 + add w27,w27,w10 + cmp x1,x2 + stp w24,w25,[x0,#4*4] + stp w26,w27,[x0,#6*4] + b.ne Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*4 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + ret + + +.section __TEXT,__const +.align 6 + +LK256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0 //terminator + +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +.text +#ifndef __KERNEL__ + +.align 6 +sha256_block_armv8: +Lv8_entry: + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adrp x3,LK256@PAGE + add x3,x3,LK256@PAGEOFF + +Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 
0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,Loop_hw + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret + +#endif +#ifndef __KERNEL__ +.comm _OPENSSL_armcap_P,4,4 +.private_extern _OPENSSL_armcap_P +#endif +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/sha512-armv8.S b/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/sha512-armv8.S new file mode 100644 index 0000000000..29e122b180 --- /dev/null +++ b/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/sha512-armv8.S @@ -0,0 +1,1082 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the OpenSSL license (the "License"). You may not use +// this file except in compliance with the License. You can obtain a copy +// in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html + +// ==================================================================== +// Written by Andy Polyakov for the OpenSSL +// project. The module is, however, dual licensed under OpenSSL and +// CRYPTOGAMS licenses depending on where you obtain it. For further +// details see http://www.openssl.org/~appro/cryptogams/. +// +// Permission to use under GPLv2 terms is granted. +// ==================================================================== +// +// SHA256/512 for ARMv8. +// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. +// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significanty faster +// and the gap is only 40-90%. 
+ +#ifndef __KERNEL__ +# include +#endif + +.text + + +.globl _sha512_block_data_order +.private_extern _sha512_block_data_order + +.align 6 +_sha512_block_data_order: + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*8 + + ldp x20,x21,[x0] // load context + ldp x22,x23,[x0,#2*8] + ldp x24,x25,[x0,#4*8] + add x2,x1,x2,lsl#7 // end of input + ldp x26,x27,[x0,#6*8] + adrp x30,LK512@PAGE + add x30,x30,LK512@PAGEOFF + stp x0,x2,[x29,#96] + +Loop: + ldp x3,x4,[x1],#2*8 + ldr x19,[x30],#8 // *K++ + eor x28,x21,x22 // magic seed + str x1,[x29,#112] +#ifndef __ARMEB__ + rev x3,x3 // 0 +#endif + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x6,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x3 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x4,x4 // 1 +#endif + ldp x5,x6,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor x7,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x4 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // Sigma1(e) + ror x7,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x5,x5 // 2 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x8,x22,x22,ror#23 + and x17,x23,x22 + bic x19,x24,x22 + add x25,x25,x5 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 + add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x6,x6 // 3 +#endif + ldp x7,x8,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x9,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x6 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x7,x7 // 4 +#endif + add x24,x24,x17 // h+=Sigma0(a) + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + eor x10,x20,x20,ror#23 + 
and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x7 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x10,ror#18 // Sigma1(e) + ror x10,x24,#28 + add x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x10,x17,ror#34 // Sigma0(a) + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x8,x8 // 5 +#endif + ldp x9,x10,[x1],#2*8 + add x23,x23,x17 // h+=Sigma0(a) + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + eor x11,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x8 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x11,ror#18 // Sigma1(e) + ror x11,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + eor x17,x23,x23,ror#5 + add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x11,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x9,x9 // 6 +#endif + add x22,x22,x17 // h+=Sigma0(a) + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + eor x12,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x9 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x12,ror#18 // Sigma1(e) + ror x12,x22,#28 + add x21,x21,x17 // h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x12,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x10,x10 // 7 +#endif + ldp x11,x12,[x1],#2*8 + add x21,x21,x17 // h+=Sigma0(a) + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + eor x13,x25,x25,ror#23 + and x17,x26,x25 + bic x28,x27,x25 + add x20,x20,x10 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x13,ror#18 // Sigma1(e) + ror x13,x21,#28 + add x20,x20,x17 // h+=Ch(e,f,g) + eor x17,x21,x21,ror#5 + add x20,x20,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x24,x24,x20 // d+=h + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x13,x17,ror#34 // Sigma0(a) + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x20,x20,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x11,x11 // 8 +#endif + add x20,x20,x17 // h+=Sigma0(a) + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x14,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x11 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x14,ror#18 // Sigma1(e) + ror x14,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x14,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x12,x12 // 9 +#endif + ldp x13,x14,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor x15,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x12 // 
h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x15,ror#18 // Sigma1(e) + ror x15,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x15,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x13,x13 // 10 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x0,x22,x22,ror#23 + and x17,x23,x22 + bic x19,x24,x22 + add x25,x25,x13 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x0,ror#18 // Sigma1(e) + ror x0,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 + add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x0,x17,ror#34 // Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x14,x14 // 11 +#endif + ldp x15,x0,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + str x6,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x6,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x14 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x15,x15 // 12 +#endif + add x24,x24,x17 // h+=Sigma0(a) + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + eor x7,x20,x20,ror#23 + and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x15 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // Sigma1(e) + ror x7,x24,#28 + add x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x0,x0 // 13 +#endif + ldp x1,x2,[x1] + add x23,x23,x17 // h+=Sigma0(a) + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + eor x8,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x0 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + eor x17,x23,x23,ror#5 + add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x1,x1 // 14 +#endif + ldr x6,[sp,#24] + add x22,x22,x17 // h+=Sigma0(a) + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + eor x9,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x1 // h+=X[i] + 
orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x22,#28 + add x21,x21,x17 // h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x2,x2 // 15 +#endif + ldr x7,[sp,#0] + add x21,x21,x17 // h+=Sigma0(a) + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 +Loop_16_xx: + ldr x8,[sp,#8] + str x11,[sp,#0] + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + ror x10,x5,#1 + and x17,x25,x24 + ror x9,x2,#19 + bic x19,x26,x24 + ror x11,x20,#28 + add x27,x27,x3 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x10,x10,x5,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x11,x11,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x9,x9,x2,ror#61 + eor x10,x10,x5,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x11,x20,ror#39 // Sigma0(a) + eor x9,x9,x2,lsr#6 // sigma1(X[i+14]) + add x4,x4,x13 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x4,x4,x10 + add x27,x27,x17 // h+=Sigma0(a) + add x4,x4,x9 + ldr x9,[sp,#16] + str x12,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x11,x6,#1 + and x17,x24,x23 + ror x10,x3,#19 + bic x28,x25,x23 + ror x12,x27,#28 + add x26,x26,x4 // h+=X[i] + eor x16,x16,x23,ror#18 + eor x11,x11,x6,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor x12,x12,x27,ror#34 + add x26,x26,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x10,x10,x3,ror#61 + eor x11,x11,x6,lsr#7 // sigma0(X[i+1]) + add x26,x26,x16 // h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x12,x27,ror#39 // Sigma0(a) + eor x10,x10,x3,lsr#6 // sigma1(X[i+14]) + add x5,x5,x14 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x5,x5,x11 + add x26,x26,x17 // h+=Sigma0(a) + add x5,x5,x10 + ldr x10,[sp,#24] + str x13,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x12,x7,#1 + and x17,x23,x22 + ror x11,x4,#19 + bic x19,x24,x22 + ror x13,x26,#28 + add x25,x25,x5 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x12,x12,x7,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x22,ror#41 // Sigma1(e) + eor x13,x13,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor 
x11,x11,x4,ror#61 + eor x12,x12,x7,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x13,x26,ror#39 // Sigma0(a) + eor x11,x11,x4,lsr#6 // sigma1(X[i+14]) + add x6,x6,x15 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x6,x6,x12 + add x25,x25,x17 // h+=Sigma0(a) + add x6,x6,x11 + ldr x11,[sp,#0] + str x14,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + ror x13,x8,#1 + and x17,x22,x21 + ror x12,x5,#19 + bic x28,x23,x21 + ror x14,x25,#28 + add x24,x24,x6 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x13,x13,x8,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x14,x14,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x12,x12,x5,ror#61 + eor x13,x13,x8,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x14,x25,ror#39 // Sigma0(a) + eor x12,x12,x5,lsr#6 // sigma1(X[i+14]) + add x7,x7,x0 + add x20,x20,x24 // d+=h + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x7,x7,x13 + add x24,x24,x17 // h+=Sigma0(a) + add x7,x7,x12 + ldr x12,[sp,#8] + str x15,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x14,x9,#1 + and x17,x21,x20 + ror x13,x6,#19 + bic x19,x22,x20 + ror x15,x24,#28 + add x23,x23,x7 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x14,x14,x9,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x15,x15,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x13,x13,x6,ror#61 + eor x14,x14,x9,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x15,x24,ror#39 // Sigma0(a) + eor x13,x13,x6,lsr#6 // sigma1(X[i+14]) + add x8,x8,x1 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x8,x8,x14 + add x23,x23,x17 // h+=Sigma0(a) + add x8,x8,x13 + ldr x13,[sp,#16] + str x0,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x15,x10,#1 + and x17,x20,x27 + ror x14,x7,#19 + bic x28,x21,x27 + ror x0,x23,#28 + add x22,x22,x8 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x15,x15,x10,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor x0,x0,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x14,x14,x7,ror#61 + eor x15,x15,x10,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x0,x23,ror#39 // Sigma0(a) + eor x14,x14,x7,lsr#6 // sigma1(X[i+14]) + add x9,x9,x2 + add x26,x26,x22 // d+=h + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x9,x9,x15 + add x22,x22,x17 // h+=Sigma0(a) + add x9,x9,x14 + ldr x14,[sp,#24] + str x1,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x0,x11,#1 + and x17,x27,x26 + ror x15,x8,#19 + bic x19,x20,x26 + ror x1,x22,#28 + add x21,x21,x9 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x0,x0,x11,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x1,x1,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x15,x15,x8,ror#61 + eor x0,x0,x11,lsr#7 // sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x1,x22,ror#39 // 
Sigma0(a) + eor x15,x15,x8,lsr#6 // sigma1(X[i+14]) + add x10,x10,x3 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x10,x10,x0 + add x21,x21,x17 // h+=Sigma0(a) + add x10,x10,x15 + ldr x15,[sp,#0] + str x2,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x1,x12,#1 + and x17,x26,x25 + ror x0,x9,#19 + bic x28,x27,x25 + ror x2,x21,#28 + add x20,x20,x10 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x1,x1,x12,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x2,x2,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x0,x0,x9,ror#61 + eor x1,x1,x12,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x2,x21,ror#39 // Sigma0(a) + eor x0,x0,x9,lsr#6 // sigma1(X[i+14]) + add x11,x11,x4 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x11,x11,x1 + add x20,x20,x17 // h+=Sigma0(a) + add x11,x11,x0 + ldr x0,[sp,#8] + str x3,[sp,#0] + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + ror x2,x13,#1 + and x17,x25,x24 + ror x1,x10,#19 + bic x19,x26,x24 + ror x3,x20,#28 + add x27,x27,x11 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x2,x2,x13,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x3,x3,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x1,x1,x10,ror#61 + eor x2,x2,x13,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x3,x20,ror#39 // Sigma0(a) + eor x1,x1,x10,lsr#6 // sigma1(X[i+14]) + add x12,x12,x5 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x12,x12,x2 + add x27,x27,x17 // h+=Sigma0(a) + add x12,x12,x1 + ldr x1,[sp,#16] + str x4,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x3,x14,#1 + and x17,x24,x23 + ror x2,x11,#19 + bic x28,x25,x23 + ror x4,x27,#28 + add x26,x26,x12 // h+=X[i] + eor x16,x16,x23,ror#18 + eor x3,x3,x14,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor x4,x4,x27,ror#34 + add x26,x26,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x2,x2,x11,ror#61 + eor x3,x3,x14,lsr#7 // sigma0(X[i+1]) + add x26,x26,x16 // h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x4,x27,ror#39 // Sigma0(a) + eor x2,x2,x11,lsr#6 // sigma1(X[i+14]) + add x13,x13,x6 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x13,x13,x3 + add x26,x26,x17 // h+=Sigma0(a) + add x13,x13,x2 + ldr x2,[sp,#24] + str x5,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x4,x15,#1 + and x17,x23,x22 + ror x3,x12,#19 + bic x19,x24,x22 + ror x5,x26,#28 + add x25,x25,x13 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x4,x4,x15,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x22,ror#41 // Sigma1(e) + eor x5,x5,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x3,x3,x12,ror#61 + eor x4,x4,x15,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x5,x26,ror#39 // Sigma0(a) + eor x3,x3,x12,lsr#6 // sigma1(X[i+14]) + add x14,x14,x7 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in 
next round + add x14,x14,x4 + add x25,x25,x17 // h+=Sigma0(a) + add x14,x14,x3 + ldr x3,[sp,#0] + str x6,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + ror x5,x0,#1 + and x17,x22,x21 + ror x4,x13,#19 + bic x28,x23,x21 + ror x6,x25,#28 + add x24,x24,x14 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x5,x5,x0,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x6,x6,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x4,x4,x13,ror#61 + eor x5,x5,x0,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x25,ror#39 // Sigma0(a) + eor x4,x4,x13,lsr#6 // sigma1(X[i+14]) + add x15,x15,x8 + add x20,x20,x24 // d+=h + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x15,x15,x5 + add x24,x24,x17 // h+=Sigma0(a) + add x15,x15,x4 + ldr x4,[sp,#8] + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x6,x1,#1 + and x17,x21,x20 + ror x5,x14,#19 + bic x19,x22,x20 + ror x7,x24,#28 + add x23,x23,x15 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x6,x6,x1,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x7,x7,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x5,x5,x14,ror#61 + eor x6,x6,x1,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x24,ror#39 // Sigma0(a) + eor x5,x5,x14,lsr#6 // sigma1(X[i+14]) + add x0,x0,x9 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x0,x0,x6 + add x23,x23,x17 // h+=Sigma0(a) + add x0,x0,x5 + ldr x5,[sp,#16] + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x7,x2,#1 + and x17,x20,x27 + ror x6,x15,#19 + bic x28,x21,x27 + ror x8,x23,#28 + add x22,x22,x0 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x7,x7,x2,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor x8,x8,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x6,x6,x15,ror#61 + eor x7,x7,x2,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x8,x23,ror#39 // Sigma0(a) + eor x6,x6,x15,lsr#6 // sigma1(X[i+14]) + add x1,x1,x10 + add x26,x26,x22 // d+=h + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x1,x1,x7 + add x22,x22,x17 // h+=Sigma0(a) + add x1,x1,x6 + ldr x6,[sp,#24] + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x8,x3,#1 + and x17,x27,x26 + ror x7,x0,#19 + bic x19,x20,x26 + ror x9,x22,#28 + add x21,x21,x1 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x8,x8,x3,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x9,x9,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x7,x7,x0,ror#61 + eor x8,x8,x3,lsr#7 // sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x22,ror#39 // Sigma0(a) + eor x7,x7,x0,lsr#6 // sigma1(X[i+14]) + add x2,x2,x11 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x2,x2,x8 + add x21,x21,x17 // h+=Sigma0(a) + add x2,x2,x7 + ldr x7,[sp,#0] + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + 
ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 + cbnz x19,Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#648 // rewind + + ldp x3,x4,[x0] + ldp x5,x6,[x0,#2*8] + add x1,x1,#14*8 // advance input pointer + ldp x7,x8,[x0,#4*8] + add x20,x20,x3 + ldp x9,x10,[x0,#6*8] + add x21,x21,x4 + add x22,x22,x5 + add x23,x23,x6 + stp x20,x21,[x0] + add x24,x24,x7 + add x25,x25,x8 + stp x22,x23,[x0,#2*8] + add x26,x26,x9 + add x27,x27,x10 + cmp x1,x2 + stp x24,x25,[x0,#4*8] + stp x26,x27,[x0,#6*8] + b.ne Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*8 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + ret + + +.section __TEXT,__const +.align 6 + +LK512: +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +.quad 0 // terminator + +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#ifndef __KERNEL__ +.comm _OPENSSL_armcap_P,4,4 
+.private_extern _OPENSSL_armcap_P +#endif +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/vpaes-armv8.S b/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/vpaes-armv8.S new file mode 100644 index 0000000000..0f5cbeadaf --- /dev/null +++ b/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/vpaes-armv8.S @@ -0,0 +1,1213 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.section __TEXT,__const + + +.align 7 // totally strategic alignment +_vpaes_consts: +Lk_mc_forward: // mc_forward +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 +Lk_mc_backward: // mc_backward +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F +Lk_sr: // sr +.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +// +// "Hot" constants +// +Lk_inv: // inv, inva +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 +Lk_ipt: // input transform (lo, hi) +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 +Lk_sbo: // sbou, sbot +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA +Lk_sb1: // sb1u, sb1t +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +Lk_sb2: // sb2u, sb2t +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + +// +// Decryption stuff +// +Lk_dipt: // decryption input transform +.quad 0x0F505B040B545F00, 0x154A411E114E451A +.quad 0x86E383E660056500, 0x12771772F491F194 +Lk_dsbo: // decryption sbox final output +.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D +.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C +Lk_dsb9: // decryption sbox output *9*u, *9*t +.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 +.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 +Lk_dsbd: // decryption sbox output *D*u, *D*t +.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 +.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 +Lk_dsbb: // decryption sbox output *B*u, *B*t +.quad 0xD022649296B44200, 0x602646F6B0F2D404 +.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B +Lk_dsbe: // decryption sbox output *E*u, *E*t +.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 +.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 + +// +// Key schedule constants +// +Lk_dksd: // decryption key schedule: invskew x*D +.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 +.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +Lk_dksb: // decryption key schedule: invskew x*B +.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 +.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +Lk_dkse: // decryption key schedule: invskew x*E + 0x63 +.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 +.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +Lk_dks9: // decryption key schedule: invskew x*9 +.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC +.quad 0x4576516227143300, 
0x8BB89FACE9DAFDCE + +Lk_rcon: // rcon +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +Lk_opt: // output transform +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 +Lk_deskew: // deskew tables: inverts the sbox's "skew" +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.align 2 + +.align 6 + +.text +## +## _aes_preheat +## +## Fills register %r10 -> .aes_consts (so you can -fPIC) +## and %xmm9-%xmm15 as specified below. +## + +.align 4 +_vpaes_encrypt_preheat: + adrp x10, Lk_inv@PAGE + add x10, x10, Lk_inv@PAGEOFF + movi v17.16b, #0x0f + ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv + ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // Lk_ipt, Lk_sbo + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // Lk_sb1, Lk_sb2 + ret + + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. +## +## Inputs: +## %xmm0 = input +## %xmm9-%xmm15 as in _vpaes_preheat +## (%rdx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax +## Preserves %xmm6 - %xmm8 so you get some local vectors +## +## + +.align 4 +_vpaes_encrypt_core: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adrp x11, Lk_mc_forward@PAGE+16 + add x11, x11, Lk_mc_forward@PAGEOFF+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b Lenc_entry + +.align 4 +Lenc_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... 
mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + sub w8, w8, #1 // nr-- + +Lenc_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, Lenc_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + ret + + +.globl _vpaes_encrypt +.private_extern _vpaes_encrypt + +.align 4 +_vpaes_encrypt: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v7.16b}, [x0] + bl _vpaes_encrypt_preheat + bl _vpaes_encrypt_core + st1 {v0.16b}, [x1] + + ldp x29,x30,[sp],#16 + ret + + + +.align 4 +_vpaes_encrypt_2x: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adrp x11, Lk_mc_forward@PAGE+16 + add x11, x11, Lk_mc_forward@PAGEOFF+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + tbl v9.16b, {v20.16b}, v9.16b + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + tbl v10.16b, {v21.16b}, v8.16b + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v8.16b, v9.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b Lenc_2x_entry + +.align 4 +Lenc_2x_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + tbl v12.16b, {v25.16b}, v10.16b + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + tbl v8.16b, {v24.16b}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + tbl v13.16b, {v27.16b}, v10.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + tbl v10.16b, {v26.16b}, 
v11.16b + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + tbl v11.16b, {v8.16b}, v1.16b + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + eor v10.16b, v10.16b, v13.16b + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + tbl v8.16b, {v8.16b}, v4.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + eor v11.16b, v11.16b, v10.16b + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + tbl v12.16b, {v11.16b},v1.16b + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + eor v8.16b, v8.16b, v11.16b + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + eor v8.16b, v8.16b, v12.16b + sub w8, w8, #1 // nr-- + +Lenc_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + tbl v13.16b, {v19.16b},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {v18.16b},v8.16b + tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {v18.16b},v9.16b + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v13.16b + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v13.16b + tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {v18.16b},v11.16b + tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {v18.16b},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, Lenc_2x_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {v22.16b}, v10.16b + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + tbl v8.16b, {v23.16b}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v1.16b + ret + + + +.align 4 +_vpaes_decrypt_preheat: + adrp x10, Lk_inv@PAGE + add x10, x10, Lk_inv@PAGEOFF + movi v17.16b, #0x0f + adrp x11, Lk_dipt@PAGE + add x11, x11, Lk_dipt@PAGEOFF + ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv + ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64 // Lk_dipt, Lk_dsbo + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64 // Lk_dsb9, Lk_dsbd + ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x11] // Lk_dsbb, Lk_dsbe + ret + + +## +## Decryption core +## +## Same API as encryption core. 
+## + +.align 4 +_vpaes_decrypt_core: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + + // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo + lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 + eor x11, x11, #0x30 // xor $0x30, %r11 + adrp x10, Lk_sr@PAGE + add x10, x10, Lk_sr@PAGEOFF + and x11, x11, #0x30 // and $0x30, %r11 + add x11, x11, x10 + adrp x10, Lk_mc_forward@PAGE+48 + add x10, x10, Lk_mc_forward@PAGEOFF+48 + + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + ld1 {v5.2d}, [x10] // vmovdqa Lk_mc_forward+48(%rip), %xmm5 + // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi + tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b Ldec_entry + +.align 4 +Ldec_loop: +// +// Inverse mix columns +// + // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u + tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 + // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt + + tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt + + tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet + + tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5 + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + sub w8, w8, #1 // sub $1,%rax # nr-- + +Ldec_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + 
eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 + cbnz w8, Ldec_loop + + // middle of last round + // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160 + tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0 + ret + + +.globl _vpaes_decrypt +.private_extern _vpaes_decrypt + +.align 4 +_vpaes_decrypt: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v7.16b}, [x0] + bl _vpaes_decrypt_preheat + bl _vpaes_decrypt_core + st1 {v0.16b}, [x1] + + ldp x29,x30,[sp],#16 + ret + + +// v14-v15 input, v0-v1 output + +.align 4 +_vpaes_decrypt_2x: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + + // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo + lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 + eor x11, x11, #0x30 // xor $0x30, %r11 + adrp x10, Lk_sr@PAGE + add x10, x10, Lk_sr@PAGEOFF + and x11, x11, #0x30 // and $0x30, %r11 + add x11, x11, x10 + adrp x10, Lk_mc_forward@PAGE+48 + add x10, x10, Lk_mc_forward@PAGEOFF+48 + + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + tbl v10.16b, {v20.16b},v9.16b + ld1 {v5.2d}, [x10] // vmovdqa Lk_mc_forward+48(%rip), %xmm5 + // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi + tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + tbl v8.16b, {v21.16b},v8.16b + eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 + eor v10.16b, v10.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b Ldec_2x_entry + +.align 4 +Ldec_2x_loop: +// +// Inverse mix columns +// + // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u + tbl v12.16b, {v24.16b}, v10.16b + tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + tbl v9.16b, {v25.16b}, v11.16b + eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 + eor v8.16b, v12.16b, v16.16b + // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt + + tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + tbl v12.16b, {v26.16b}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + tbl v9.16b, {v27.16b}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt + + tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + tbl v12.16b, {v28.16b}, v10.16b + tbl v0.16b, 
{v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + tbl v9.16b, {v29.16b}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet + + tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + tbl v12.16b, {v30.16b}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + tbl v9.16b, {v31.16b}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5 + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + sub w8, w8, #1 // sub $1,%rax # nr-- + +Ldec_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + tbl v10.16b, {v19.16b},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {v18.16b},v8.16b + tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {v18.16b},v9.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v10.16b + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v10.16b + tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {v18.16b},v11.16b + tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {v18.16b},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 + cbnz w8, Ldec_2x_loop + + // middle of last round + // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {v22.16b}, v10.16b + // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + tbl v9.16b, {v23.16b}, v11.16b + ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160 + eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A + eor v8.16b, v9.16b, v12.16b + tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v2.16b + ret + +######################################################## +## ## +## AES key schedule ## +## ## +######################################################## + +.align 4 +_vpaes_key_preheat: + adrp x10, Lk_inv@PAGE + add x10, x10, Lk_inv@PAGEOFF + movi v16.16b, #0x5b // Lk_s63 + adrp x11, Lk_sb1@PAGE + add x11, x11, Lk_sb1@PAGEOFF + movi v17.16b, #0x0f // Lk_s0F + ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // Lk_inv, Lk_ipt + 
adrp x10, Lk_dksd@PAGE + add x10, x10, Lk_dksd@PAGEOFF + ld1 {v22.2d,v23.2d}, [x11] // Lk_sb1 + adrp x11, Lk_mc_forward@PAGE + add x11, x11, Lk_mc_forward@PAGEOFF + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // Lk_dksd, Lk_dksb + ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // Lk_dkse, Lk_dks9 + ld1 {v8.2d}, [x10] // Lk_rcon + ld1 {v9.2d}, [x11] // Lk_mc_forward[0] + ret + + + +.align 4 +_vpaes_schedule_core: + stp x29, x30, [sp,#-16]! + add x29,sp,#0 + + bl _vpaes_key_preheat // load the tables + + ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) + + // input transform + mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3 + bl _vpaes_schedule_transform + mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 + + adrp x10, Lk_sr@PAGE // lea Lk_sr(%rip),%r10 + add x10, x10, Lk_sr@PAGEOFF + + add x8, x8, x10 + cbnz w3, Lschedule_am_decrypting + + // encrypting, output zeroth round key after transform + st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) + b Lschedule_go + +Lschedule_am_decrypting: + // decrypting, output zeroth round key after shiftrows + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) + eor x8, x8, #0x30 // xor $0x30, %r8 + +Lschedule_go: + cmp w1, #192 // cmp $192, %esi + b.hi Lschedule_256 + b.eq Lschedule_192 + // 128: fall though + +## +## .schedule_128 +## +## 128-bit specific part of key schedule. +## +## This schedule is really simple, because all its parts +## are accomplished by the subroutines. +## +Lschedule_128: + mov x0, #10 // mov $10, %esi + +Loop_schedule_128: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_round + cbz x0, Lschedule_mangle_last + bl _vpaes_schedule_mangle // write output + b Loop_schedule_128 + +## +## .aes_schedule_192 +## +## 192-bit specific part of key schedule. +## +## The main body of this schedule is the same as the 128-bit +## schedule, but with more smearing. The long, high side is +## stored in %xmm7 as before, and the short, low side is in +## the high bits of %xmm6. +## +## This schedule is somewhat nastier, however, because each +## round produces 192 bits of key material, or 1.5 round keys. +## Therefore, on each cycle we do 2 rounds and produce 3 round +## keys. +## +.align 4 +Lschedule_192: + sub x0, x0, #8 + ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) + bl _vpaes_schedule_transform // input transform + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 + ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros + mov x0, #4 // mov $4, %esi + +Loop_schedule_192: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_round + ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0 + bl _vpaes_schedule_mangle // save key n + bl _vpaes_schedule_192_smear + bl _vpaes_schedule_mangle // save key n+1 + bl _vpaes_schedule_round + cbz x0, Lschedule_mangle_last + bl _vpaes_schedule_mangle // save key n+2 + bl _vpaes_schedule_192_smear + b Loop_schedule_192 + +## +## .aes_schedule_256 +## +## 256-bit specific part of key schedule. +## +## The structure here is very similar to the 128-bit +## schedule, but with an additional "low side" in +## %xmm6. The low side's rounds are the same as the +## high side's, except no rcon and no rotation. 
+## +.align 4 +Lschedule_256: + ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + bl _vpaes_schedule_transform // input transform + mov x0, #7 // mov $7, %esi + +Loop_schedule_256: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_mangle // output low result + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + // high round + bl _vpaes_schedule_round + cbz x0, Lschedule_mangle_last + bl _vpaes_schedule_mangle + + // low round. swap xmm7 and xmm6 + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + movi v4.16b, #0 + mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 + mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 + bl _vpaes_schedule_low_round + mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 + + b Loop_schedule_256 + +## +## .aes_schedule_mangle_last +## +## Mangler for last round of key schedule +## Mangles %xmm0 +## when encrypting, outputs out(%xmm0) ^ 63 +## when decrypting, outputs unskew(%xmm0) +## +## Always called right before return... jumps to cleanup and exits +## +.align 4 +Lschedule_mangle_last: + // schedule last round key from xmm0 + adrp x11, Lk_deskew@PAGE // lea Lk_deskew(%rip),%r11 # prepare to deskew + add x11, x11, Lk_deskew@PAGEOFF + + cbnz w3, Lschedule_mangle_last_dec + + // encrypting + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 + adrp x11, Lk_opt@PAGE // lea Lk_opt(%rip), %r11 # prepare to output transform + add x11, x11, Lk_opt@PAGEOFF + add x2, x2, #32 // add $32, %rdx + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute + +Lschedule_mangle_last_dec: + ld1 {v20.2d,v21.2d}, [x11] // reload constants + sub x2, x2, #16 // add $-16, %rdx + eor v0.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm0 + bl _vpaes_schedule_transform // output transform + st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key + + // cleanup + eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2 + eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 + eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 + eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 + eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 + ldp x29, x30, [sp],#16 + ret + + +## +## .aes_schedule_192_smear +## +## Smear the short, low side in the 192-bit key schedule. +## +## Inputs: +## %xmm7: high side, b a x y +## %xmm6: low side, d c 0 0 +## %xmm13: 0 +## +## Outputs: +## %xmm6: b+c+d b+c 0 0 +## %xmm0: b+c+d b+c b a +## + +.align 4 +_vpaes_schedule_192_smear: + movi v1.16b, #0 + dup v0.4s, v7.s[3] + ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 + ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a + eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a + mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 + ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros + ret + + +## +## .aes_schedule_round +## +## Runs one main round of the key schedule on %xmm0, %xmm7 +## +## Specifically, runs subbytes on the high dword of %xmm0 +## then rotates it by one byte and xors into the low dword of +## %xmm7. +## +## Adds rcon from low byte of %xmm8, then rotates %xmm8 for +## next rcon. 
+## +## Smears the dwords of %xmm7 by xoring the low into the +## second low, result into third, result into highest. +## +## Returns results in %xmm7 = %xmm0. +## Clobbers %xmm1-%xmm4, %r11. +## + +.align 4 +_vpaes_schedule_round: + // extract rcon from xmm8 + movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 + ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1 + ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + + // rotate + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0 + + // fall through... + + // low round: same as high round, but no rotation and no rcon. +_vpaes_schedule_low_round: + // smear xmm7 + ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4 + + // subbytes + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 + tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v7.16b, v7.16b, v16.16b // vpxor Lk_s63(%rip), %xmm7, %xmm7 + tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak + eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io + eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo + tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou + tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t + eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output + + // add in smeared stuff + eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 + eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 + ret + + +## +## .aes_schedule_transform +## +## Linear-transform %xmm0 according to tables at (%r11) +## +## Requires that %xmm9 = 0x0F0F... as in preheat +## Output in %xmm0 +## Clobbers %xmm1, %xmm2 +## + +.align 4 +_vpaes_schedule_transform: + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + // vmovdqa (%r11), %xmm2 # lo + tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + // vmovdqa 16(%r11), %xmm1 # hi + tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + ret + + +## +## .aes_schedule_mangle +## +## Mangle xmm0 from (basis-transformed) standard version +## to our version. 
+## +## On encrypt, +## xor with 0x63 +## multiply by circulant 0,1,1,1 +## apply shiftrows transform +## +## On decrypt, +## xor with 0x63 +## multiply by "inverse mixcolumns" circulant E,B,D,9 +## deskew +## apply shiftrows transform +## +## +## Writes out to (%rdx), and increments or decrements it +## Keeps track of round number mod 4 in %r8 +## Preserves xmm0 +## Clobbers xmm1-xmm5 +## + +.align 4 +_vpaes_schedule_mangle: + mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later + // vmovdqa .Lk_mc_forward(%rip),%xmm5 + cbnz w3, Lschedule_mangle_dec + + // encrypting + eor v4.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm4 + add x2, x2, #16 // add $16, %rdx + tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 + tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 + tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 + eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 + + b Lschedule_mangle_both +.align 4 +Lschedule_mangle_dec: + // inverse mix columns + // lea .Lk_dksd(%rip),%r11 + ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi + and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo + + // vmovdqa 0x00(%r11), %xmm2 + tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + // vmovdqa 0x10(%r11), %xmm3 + tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + + // vmovdqa 0x20(%r11), %xmm2 + tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + // vmovdqa 0x30(%r11), %xmm3 + tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + + // vmovdqa 0x40(%r11), %xmm2 + tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + // vmovdqa 0x50(%r11), %xmm3 + tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + + // vmovdqa 0x60(%r11), %xmm2 + tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + // vmovdqa 0x70(%r11), %xmm4 + tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4 + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3 + + sub x2, x2, #16 // add $-16, %rdx + +Lschedule_mangle_both: + tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + add x8, x8, #64-16 // add $-16, %r8 + and x8, x8, #~(1<<6) // and $0x30, %r8 + st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) + ret + + +.globl _vpaes_set_encrypt_key +.private_extern _vpaes_set_encrypt_key + +.align 4 +_vpaes_set_encrypt_key: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! 
// ABI spec says so + + lsr w9, w1, #5 // shr $5,%eax + add w9, w9, #5 // $5,%eax + str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov w3, #0 // mov $0,%ecx + mov x8, #0x30 // mov $0x30,%r8d + bl _vpaes_schedule_core + eor x0, x0, x0 + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + ret + + +.globl _vpaes_set_decrypt_key +.private_extern _vpaes_set_decrypt_key + +.align 4 +_vpaes_set_decrypt_key: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + + lsr w9, w1, #5 // shr $5,%eax + add w9, w9, #5 // $5,%eax + str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + lsl w9, w9, #4 // shl $4,%eax + add x2, x2, #16 // lea 16(%rdx,%rax),%rdx + add x2, x2, x9 + + mov w3, #1 // mov $1,%ecx + lsr w8, w1, #1 // shr $1,%r8d + and x8, x8, #32 // and $32,%r8d + eor x8, x8, #32 // xor $32,%r8d # nbits==192?0:32 + bl _vpaes_schedule_core + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + ret + +.globl _vpaes_cbc_encrypt +.private_extern _vpaes_cbc_encrypt + +.align 4 +_vpaes_cbc_encrypt: + cbz x2, Lcbc_abort + cmp w5, #0 // check direction + b.eq vpaes_cbc_decrypt + + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov x17, x2 // reassign + mov x2, x3 // reassign + + ld1 {v0.16b}, [x4] // load ivec + bl _vpaes_encrypt_preheat + b Lcbc_enc_loop + +.align 4 +Lcbc_enc_loop: + ld1 {v7.16b}, [x0],#16 // load input + eor v7.16b, v7.16b, v0.16b // xor with ivec + bl _vpaes_encrypt_core + st1 {v0.16b}, [x1],#16 // save output + subs x17, x17, #16 + b.hi Lcbc_enc_loop + + st1 {v0.16b}, [x4] // write ivec + + ldp x29,x30,[sp],#16 +Lcbc_abort: + ret + + + +.align 4 +vpaes_cbc_decrypt: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! + + mov x17, x2 // reassign + mov x2, x3 // reassign + ld1 {v6.16b}, [x4] // load ivec + bl _vpaes_decrypt_preheat + tst x17, #16 + b.eq Lcbc_dec_loop2x + + ld1 {v7.16b}, [x0], #16 // load input + bl _vpaes_decrypt_core + eor v0.16b, v0.16b, v6.16b // xor with ivec + orr v6.16b, v7.16b, v7.16b // next ivec value + st1 {v0.16b}, [x1], #16 + subs x17, x17, #16 + b.ls Lcbc_dec_done + +.align 4 +Lcbc_dec_loop2x: + ld1 {v14.16b,v15.16b}, [x0], #32 + bl _vpaes_decrypt_2x + eor v0.16b, v0.16b, v6.16b // xor with ivec + eor v1.16b, v1.16b, v14.16b + orr v6.16b, v15.16b, v15.16b + st1 {v0.16b,v1.16b}, [x1], #32 + subs x17, x17, #32 + b.hi Lcbc_dec_loop2x + +Lcbc_dec_done: + st1 {v6.16b}, [x4] + + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + ret + +.globl _vpaes_ctr32_encrypt_blocks +.private_extern _vpaes_ctr32_encrypt_blocks + +.align 4 +_vpaes_ctr32_encrypt_blocks: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! + + cbz x2, Lctr32_done + + // Note, unlike the other functions, x2 here is measured in blocks, + // not bytes. + mov x17, x2 + mov x2, x3 + + // Load the IV and counter portion. + ldr w6, [x4, #12] + ld1 {v7.16b}, [x4] + + bl _vpaes_encrypt_preheat + tst x17, #1 + rev w6, w6 // The counter is big-endian. + b.eq Lctr32_prep_loop + + // Handle one block so the remaining block count is even for + // _vpaes_encrypt_2x. + ld1 {v6.16b}, [x0], #16 // Load input ahead of time + bl _vpaes_encrypt_core + eor v0.16b, v0.16b, v6.16b // XOR input and result + st1 {v0.16b}, [x1], #16 + subs x17, x17, #1 + // Update the counter. 
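The counter handling relies on the layout visible in the loads above: the 32-bit block counter occupies the last four bytes of the 16-byte IV block and is stored big-endian, so the code keeps a host-order copy in w6 (the `rev` after `ldr w6, [x4, #12]`), increments that, and reverses it back into lane 3 of v7, which is what the add/rev/mov sequence just below does. The same bookkeeping in C, as a small sketch with a made-up helper name:

    #include <stdint.h>
    #include <stdio.h>

    // The last 4 bytes of the 16-byte counter block are a big-endian 32-bit
    // counter: convert to host order, increment, convert back.  Wrapping past
    // 2^32 is the caller's problem, exactly as in the assembly.
    static void ctr32_increment(uint8_t ivec[16]) {
      uint32_t ctr = ((uint32_t)ivec[12] << 24) | ((uint32_t)ivec[13] << 16) |
                     ((uint32_t)ivec[14] << 8) | (uint32_t)ivec[15];
      ctr++;  // the assembly does: rev; add #1; rev
      ivec[12] = (uint8_t)(ctr >> 24);
      ivec[13] = (uint8_t)(ctr >> 16);
      ivec[14] = (uint8_t)(ctr >> 8);
      ivec[15] = (uint8_t)ctr;
    }

    int main(void) {
      uint8_t ivec[16] = {0};
      ivec[15] = 0xff;  // counter = 0x000000ff
      ctr32_increment(ivec);
      printf("counter bytes: %02x %02x %02x %02x\n",
             ivec[12], ivec[13], ivec[14], ivec[15]);  // 00 00 01 00
      return 0;
    }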
+ add w6, w6, #1 + rev w7, w6 + mov v7.s[3], w7 + b.ls Lctr32_done + +Lctr32_prep_loop: + // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x + // uses v14 and v15. + mov v15.16b, v7.16b + mov v14.16b, v7.16b + add w6, w6, #1 + rev w7, w6 + mov v15.s[3], w7 + +Lctr32_loop: + ld1 {v6.16b,v7.16b}, [x0], #32 // Load input ahead of time + bl _vpaes_encrypt_2x + eor v0.16b, v0.16b, v6.16b // XOR input and result + eor v1.16b, v1.16b, v7.16b // XOR input and result (#2) + st1 {v0.16b,v1.16b}, [x1], #32 + subs x17, x17, #2 + // Update the counter. + add w7, w6, #1 + add w6, w6, #2 + rev w7, w7 + mov v14.s[3], w7 + rev w7, w6 + mov v15.s[3], w7 + b.hi Lctr32_loop + +Lctr32_done: + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + ret + +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-aarch64/crypto/test/trampoline-armv8.S b/packager/third_party/boringssl/ios-aarch64/crypto/test/trampoline-armv8.S new file mode 100644 index 0000000000..438e9298c0 --- /dev/null +++ b/packager/third_party/boringssl/ios-aarch64/crypto/test/trampoline-armv8.S @@ -0,0 +1,685 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text + +// abi_test_trampoline loads callee-saved registers from |state|, calls |func| +// with |argv|, then saves the callee-saved registers into |state|. It returns +// the result of |func|. The |unwind| argument is unused. +// uint64_t abi_test_trampoline(void (*func)(...), CallerState *state, +// const uint64_t *argv, size_t argc, +// uint64_t unwind); + +.globl _abi_test_trampoline +.private_extern _abi_test_trampoline +.align 4 +_abi_test_trampoline: +Labi_test_trampoline_begin: + // Stack layout (low to high addresses) + // x29,x30 (16 bytes) + // d8-d15 (64 bytes) + // x19-x28 (80 bytes) + // x1 (8 bytes) + // padding (8 bytes) + stp x29, x30, [sp, #-176]! + mov x29, sp + + // Saved callee-saved registers and |state|. + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] + stp x19, x20, [sp, #80] + stp x21, x22, [sp, #96] + stp x23, x24, [sp, #112] + stp x25, x26, [sp, #128] + stp x27, x28, [sp, #144] + str x1, [sp, #160] + + // Load registers from |state|, with the exception of x29. x29 is the + // frame pointer and also callee-saved, but AAPCS64 allows platforms to + // mandate that x29 always point to a frame. iOS64 does so, which means + // we cannot fill x29 with entropy without violating ABI rules + // ourselves. x29 is tested separately below. + ldp d8, d9, [x1], #16 + ldp d10, d11, [x1], #16 + ldp d12, d13, [x1], #16 + ldp d14, d15, [x1], #16 + ldp x19, x20, [x1], #16 + ldp x21, x22, [x1], #16 + ldp x23, x24, [x1], #16 + ldp x25, x26, [x1], #16 + ldp x27, x28, [x1], #16 + + // Move parameters into temporary registers. + mov x9, x0 + mov x10, x2 + mov x11, x3 + + // Load parameters into registers. 
+ cbz x11, Largs_done + ldr x0, [x10], #8 + subs x11, x11, #1 + b.eq Largs_done + ldr x1, [x10], #8 + subs x11, x11, #1 + b.eq Largs_done + ldr x2, [x10], #8 + subs x11, x11, #1 + b.eq Largs_done + ldr x3, [x10], #8 + subs x11, x11, #1 + b.eq Largs_done + ldr x4, [x10], #8 + subs x11, x11, #1 + b.eq Largs_done + ldr x5, [x10], #8 + subs x11, x11, #1 + b.eq Largs_done + ldr x6, [x10], #8 + subs x11, x11, #1 + b.eq Largs_done + ldr x7, [x10], #8 + +Largs_done: + blr x9 + + // Reload |state| and store registers. + ldr x1, [sp, #160] + stp d8, d9, [x1], #16 + stp d10, d11, [x1], #16 + stp d12, d13, [x1], #16 + stp d14, d15, [x1], #16 + stp x19, x20, [x1], #16 + stp x21, x22, [x1], #16 + stp x23, x24, [x1], #16 + stp x25, x26, [x1], #16 + stp x27, x28, [x1], #16 + + // |func| is required to preserve x29, the frame pointer. We cannot load + // random values into x29 (see comment above), so compare it against the + // expected value and zero the field of |state| if corrupted. + mov x9, sp + cmp x29, x9 + b.eq Lx29_ok + str xzr, [x1] + +Lx29_ok: + // Restore callee-saved registers. + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] + ldp x19, x20, [sp, #80] + ldp x21, x22, [sp, #96] + ldp x23, x24, [sp, #112] + ldp x25, x26, [sp, #128] + ldp x27, x28, [sp, #144] + + ldp x29, x30, [sp], #176 + ret + + +.globl _abi_test_clobber_x0 +.private_extern _abi_test_clobber_x0 +.align 4 +_abi_test_clobber_x0: + mov x0, xzr + ret + + +.globl _abi_test_clobber_x1 +.private_extern _abi_test_clobber_x1 +.align 4 +_abi_test_clobber_x1: + mov x1, xzr + ret + + +.globl _abi_test_clobber_x2 +.private_extern _abi_test_clobber_x2 +.align 4 +_abi_test_clobber_x2: + mov x2, xzr + ret + + +.globl _abi_test_clobber_x3 +.private_extern _abi_test_clobber_x3 +.align 4 +_abi_test_clobber_x3: + mov x3, xzr + ret + + +.globl _abi_test_clobber_x4 +.private_extern _abi_test_clobber_x4 +.align 4 +_abi_test_clobber_x4: + mov x4, xzr + ret + + +.globl _abi_test_clobber_x5 +.private_extern _abi_test_clobber_x5 +.align 4 +_abi_test_clobber_x5: + mov x5, xzr + ret + + +.globl _abi_test_clobber_x6 +.private_extern _abi_test_clobber_x6 +.align 4 +_abi_test_clobber_x6: + mov x6, xzr + ret + + +.globl _abi_test_clobber_x7 +.private_extern _abi_test_clobber_x7 +.align 4 +_abi_test_clobber_x7: + mov x7, xzr + ret + + +.globl _abi_test_clobber_x8 +.private_extern _abi_test_clobber_x8 +.align 4 +_abi_test_clobber_x8: + mov x8, xzr + ret + + +.globl _abi_test_clobber_x9 +.private_extern _abi_test_clobber_x9 +.align 4 +_abi_test_clobber_x9: + mov x9, xzr + ret + + +.globl _abi_test_clobber_x10 +.private_extern _abi_test_clobber_x10 +.align 4 +_abi_test_clobber_x10: + mov x10, xzr + ret + + +.globl _abi_test_clobber_x11 +.private_extern _abi_test_clobber_x11 +.align 4 +_abi_test_clobber_x11: + mov x11, xzr + ret + + +.globl _abi_test_clobber_x12 +.private_extern _abi_test_clobber_x12 +.align 4 +_abi_test_clobber_x12: + mov x12, xzr + ret + + +.globl _abi_test_clobber_x13 +.private_extern _abi_test_clobber_x13 +.align 4 +_abi_test_clobber_x13: + mov x13, xzr + ret + + +.globl _abi_test_clobber_x14 +.private_extern _abi_test_clobber_x14 +.align 4 +_abi_test_clobber_x14: + mov x14, xzr + ret + + +.globl _abi_test_clobber_x15 +.private_extern _abi_test_clobber_x15 +.align 4 +_abi_test_clobber_x15: + mov x15, xzr + ret + + +.globl _abi_test_clobber_x16 +.private_extern _abi_test_clobber_x16 +.align 4 +_abi_test_clobber_x16: + mov x16, xzr + ret + + +.globl _abi_test_clobber_x17 +.private_extern 
_abi_test_clobber_x17 +.align 4 +_abi_test_clobber_x17: + mov x17, xzr + ret + + +.globl _abi_test_clobber_x19 +.private_extern _abi_test_clobber_x19 +.align 4 +_abi_test_clobber_x19: + mov x19, xzr + ret + + +.globl _abi_test_clobber_x20 +.private_extern _abi_test_clobber_x20 +.align 4 +_abi_test_clobber_x20: + mov x20, xzr + ret + + +.globl _abi_test_clobber_x21 +.private_extern _abi_test_clobber_x21 +.align 4 +_abi_test_clobber_x21: + mov x21, xzr + ret + + +.globl _abi_test_clobber_x22 +.private_extern _abi_test_clobber_x22 +.align 4 +_abi_test_clobber_x22: + mov x22, xzr + ret + + +.globl _abi_test_clobber_x23 +.private_extern _abi_test_clobber_x23 +.align 4 +_abi_test_clobber_x23: + mov x23, xzr + ret + + +.globl _abi_test_clobber_x24 +.private_extern _abi_test_clobber_x24 +.align 4 +_abi_test_clobber_x24: + mov x24, xzr + ret + + +.globl _abi_test_clobber_x25 +.private_extern _abi_test_clobber_x25 +.align 4 +_abi_test_clobber_x25: + mov x25, xzr + ret + + +.globl _abi_test_clobber_x26 +.private_extern _abi_test_clobber_x26 +.align 4 +_abi_test_clobber_x26: + mov x26, xzr + ret + + +.globl _abi_test_clobber_x27 +.private_extern _abi_test_clobber_x27 +.align 4 +_abi_test_clobber_x27: + mov x27, xzr + ret + + +.globl _abi_test_clobber_x28 +.private_extern _abi_test_clobber_x28 +.align 4 +_abi_test_clobber_x28: + mov x28, xzr + ret + + +.globl _abi_test_clobber_x29 +.private_extern _abi_test_clobber_x29 +.align 4 +_abi_test_clobber_x29: + mov x29, xzr + ret + + +.globl _abi_test_clobber_d0 +.private_extern _abi_test_clobber_d0 +.align 4 +_abi_test_clobber_d0: + fmov d0, xzr + ret + + +.globl _abi_test_clobber_d1 +.private_extern _abi_test_clobber_d1 +.align 4 +_abi_test_clobber_d1: + fmov d1, xzr + ret + + +.globl _abi_test_clobber_d2 +.private_extern _abi_test_clobber_d2 +.align 4 +_abi_test_clobber_d2: + fmov d2, xzr + ret + + +.globl _abi_test_clobber_d3 +.private_extern _abi_test_clobber_d3 +.align 4 +_abi_test_clobber_d3: + fmov d3, xzr + ret + + +.globl _abi_test_clobber_d4 +.private_extern _abi_test_clobber_d4 +.align 4 +_abi_test_clobber_d4: + fmov d4, xzr + ret + + +.globl _abi_test_clobber_d5 +.private_extern _abi_test_clobber_d5 +.align 4 +_abi_test_clobber_d5: + fmov d5, xzr + ret + + +.globl _abi_test_clobber_d6 +.private_extern _abi_test_clobber_d6 +.align 4 +_abi_test_clobber_d6: + fmov d6, xzr + ret + + +.globl _abi_test_clobber_d7 +.private_extern _abi_test_clobber_d7 +.align 4 +_abi_test_clobber_d7: + fmov d7, xzr + ret + + +.globl _abi_test_clobber_d8 +.private_extern _abi_test_clobber_d8 +.align 4 +_abi_test_clobber_d8: + fmov d8, xzr + ret + + +.globl _abi_test_clobber_d9 +.private_extern _abi_test_clobber_d9 +.align 4 +_abi_test_clobber_d9: + fmov d9, xzr + ret + + +.globl _abi_test_clobber_d10 +.private_extern _abi_test_clobber_d10 +.align 4 +_abi_test_clobber_d10: + fmov d10, xzr + ret + + +.globl _abi_test_clobber_d11 +.private_extern _abi_test_clobber_d11 +.align 4 +_abi_test_clobber_d11: + fmov d11, xzr + ret + + +.globl _abi_test_clobber_d12 +.private_extern _abi_test_clobber_d12 +.align 4 +_abi_test_clobber_d12: + fmov d12, xzr + ret + + +.globl _abi_test_clobber_d13 +.private_extern _abi_test_clobber_d13 +.align 4 +_abi_test_clobber_d13: + fmov d13, xzr + ret + + +.globl _abi_test_clobber_d14 +.private_extern _abi_test_clobber_d14 +.align 4 +_abi_test_clobber_d14: + fmov d14, xzr + ret + + +.globl _abi_test_clobber_d15 +.private_extern _abi_test_clobber_d15 +.align 4 +_abi_test_clobber_d15: + fmov d15, xzr + ret + + +.globl _abi_test_clobber_d16 
+.private_extern _abi_test_clobber_d16 +.align 4 +_abi_test_clobber_d16: + fmov d16, xzr + ret + + +.globl _abi_test_clobber_d17 +.private_extern _abi_test_clobber_d17 +.align 4 +_abi_test_clobber_d17: + fmov d17, xzr + ret + + +.globl _abi_test_clobber_d18 +.private_extern _abi_test_clobber_d18 +.align 4 +_abi_test_clobber_d18: + fmov d18, xzr + ret + + +.globl _abi_test_clobber_d19 +.private_extern _abi_test_clobber_d19 +.align 4 +_abi_test_clobber_d19: + fmov d19, xzr + ret + + +.globl _abi_test_clobber_d20 +.private_extern _abi_test_clobber_d20 +.align 4 +_abi_test_clobber_d20: + fmov d20, xzr + ret + + +.globl _abi_test_clobber_d21 +.private_extern _abi_test_clobber_d21 +.align 4 +_abi_test_clobber_d21: + fmov d21, xzr + ret + + +.globl _abi_test_clobber_d22 +.private_extern _abi_test_clobber_d22 +.align 4 +_abi_test_clobber_d22: + fmov d22, xzr + ret + + +.globl _abi_test_clobber_d23 +.private_extern _abi_test_clobber_d23 +.align 4 +_abi_test_clobber_d23: + fmov d23, xzr + ret + + +.globl _abi_test_clobber_d24 +.private_extern _abi_test_clobber_d24 +.align 4 +_abi_test_clobber_d24: + fmov d24, xzr + ret + + +.globl _abi_test_clobber_d25 +.private_extern _abi_test_clobber_d25 +.align 4 +_abi_test_clobber_d25: + fmov d25, xzr + ret + + +.globl _abi_test_clobber_d26 +.private_extern _abi_test_clobber_d26 +.align 4 +_abi_test_clobber_d26: + fmov d26, xzr + ret + + +.globl _abi_test_clobber_d27 +.private_extern _abi_test_clobber_d27 +.align 4 +_abi_test_clobber_d27: + fmov d27, xzr + ret + + +.globl _abi_test_clobber_d28 +.private_extern _abi_test_clobber_d28 +.align 4 +_abi_test_clobber_d28: + fmov d28, xzr + ret + + +.globl _abi_test_clobber_d29 +.private_extern _abi_test_clobber_d29 +.align 4 +_abi_test_clobber_d29: + fmov d29, xzr + ret + + +.globl _abi_test_clobber_d30 +.private_extern _abi_test_clobber_d30 +.align 4 +_abi_test_clobber_d30: + fmov d30, xzr + ret + + +.globl _abi_test_clobber_d31 +.private_extern _abi_test_clobber_d31 +.align 4 +_abi_test_clobber_d31: + fmov d31, xzr + ret + + +.globl _abi_test_clobber_v8_upper +.private_extern _abi_test_clobber_v8_upper +.align 4 +_abi_test_clobber_v8_upper: + fmov v8.d[1], xzr + ret + + +.globl _abi_test_clobber_v9_upper +.private_extern _abi_test_clobber_v9_upper +.align 4 +_abi_test_clobber_v9_upper: + fmov v9.d[1], xzr + ret + + +.globl _abi_test_clobber_v10_upper +.private_extern _abi_test_clobber_v10_upper +.align 4 +_abi_test_clobber_v10_upper: + fmov v10.d[1], xzr + ret + + +.globl _abi_test_clobber_v11_upper +.private_extern _abi_test_clobber_v11_upper +.align 4 +_abi_test_clobber_v11_upper: + fmov v11.d[1], xzr + ret + + +.globl _abi_test_clobber_v12_upper +.private_extern _abi_test_clobber_v12_upper +.align 4 +_abi_test_clobber_v12_upper: + fmov v12.d[1], xzr + ret + + +.globl _abi_test_clobber_v13_upper +.private_extern _abi_test_clobber_v13_upper +.align 4 +_abi_test_clobber_v13_upper: + fmov v13.d[1], xzr + ret + + +.globl _abi_test_clobber_v14_upper +.private_extern _abi_test_clobber_v14_upper +.align 4 +_abi_test_clobber_v14_upper: + fmov v14.d[1], xzr + ret + + +.globl _abi_test_clobber_v15_upper +.private_extern _abi_test_clobber_v15_upper +.align 4 +_abi_test_clobber_v15_upper: + fmov v15.d[1], xzr + ret + +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-aarch64/crypto/third_party/sike/asm/fp-armv8.S b/packager/third_party/boringssl/ios-aarch64/crypto/third_party/sike/asm/fp-armv8.S new file mode 100644 index 0000000000..c48863f65f --- /dev/null +++ 
b/packager/third_party/boringssl/ios-aarch64/crypto/third_party/sike/asm/fp-armv8.S @@ -0,0 +1,996 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.section __TEXT,__const + +# p434 x 2 +Lp434x2: +.quad 0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF +.quad 0xFB82ECF5C5FFFFFF, 0xF78CB8F062B15D47 +.quad 0xD9F8BFAD038A40AC, 0x0004683E4E2EE688 + +# p434 + 1 +Lp434p1: +.quad 0xFDC1767AE3000000, 0x7BC65C783158AEA3 +.quad 0x6CFC5FD681C52056, 0x0002341F27177344 + +.text +.globl _sike_mpmul +.private_extern _sike_mpmul +.align 4 +_sike_mpmul: + stp x29, x30, [sp,#-96]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + + ldp x3, x4, [x0] + ldp x5, x6, [x0,#16] + ldp x7, x8, [x0,#32] + ldr x9, [x0,#48] + ldp x10, x11, [x1,#0] + ldp x12, x13, [x1,#16] + ldp x14, x15, [x1,#32] + ldr x16, [x1,#48] + + // x3-x7 <- AH + AL, x7 <- carry + adds x3, x3, x7 + adcs x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, xzr + adc x7, xzr, xzr + + // x10-x13 <- BH + BL, x8 <- carry + adds x10, x10, x14 + adcs x11, x11, x15 + adcs x12, x12, x16 + adcs x13, x13, xzr + adc x8, xzr, xzr + + // x9 <- combined carry + and x9, x7, x8 + // x7-x8 <- mask + sub x7, xzr, x7 + sub x8, xzr, x8 + + // x15-x19 <- masked (BH + BL) + and x14, x10, x7 + and x15, x11, x7 + and x16, x12, x7 + and x17, x13, x7 + + // x20-x23 <- masked (AH + AL) + and x20, x3, x8 + and x21, x4, x8 + and x22, x5, x8 + and x23, x6, x8 + + // x15-x19, x7 <- masked (AH+AL) + masked (BH+BL), step 1 + adds x14, x14, x20 + adcs x15, x15, x21 + adcs x16, x16, x22 + adcs x17, x17, x23 + adc x7, x9, xzr + + // x8-x9,x19,x20-x24 <- (AH+AL) x (BH+BL), low part + stp x3, x4, [x2,#0] + // A0-A1 <- AH + AL, T0 <- mask + adds x3, x3, x5 + adcs x4, x4, x6 + adc x25, xzr, xzr + + // C6, T1 <- BH + BL, C7 <- mask + adds x23, x10, x12 + adcs x26, x11, x13 + adc x24, xzr, xzr + + // C0-C1 <- masked (BH + BL) + sub x19, xzr, x25 + sub x20, xzr, x24 + and x8, x23, x19 + and x9, x26, x19 + + // C4-C5 <- masked (AH + AL), T0 <- combined carry + and x21, x3, x20 + and x22, x4, x20 + mul x19, x3, x23 + mul x20, x3, x26 + and x25, x25, x24 + + // C0-C1, T0 <- (AH+AL) x (BH+BL), part 1 + adds x8, x21, x8 + umulh x21, x3, x26 + adcs x9, x22, x9 + umulh x22, x3, x23 + adc x25, x25, xzr + + // C2-C5 <- (AH+AL) x (BH+BL), low part + mul x3, x4, x23 + umulh x23, x4, x23 + adds x20, x20, x22 + adc x21, x21, xzr + + mul x24, x4, x26 + umulh x26, x4, x26 + adds x20, x20, x3 + adcs x21, x21, x23 + adc x22, xzr, xzr + + adds x21, x21, x24 + adc x22, x22, x26 + + ldp x3, x4, [x2,#0] + + // C2-C5, T0 <- (AH+AL) x (BH+BL), final part + adds x21, x8, x21 + umulh x24, x3, x10 + umulh x26, x3, x11 + adcs x22, x9, x22 + mul x8, x3, x10 + mul x9, x3, x11 + adc x25, x25, xzr + + // C0-C1, T1, C7 <- AL x BL + mul x3, x4, x10 + umulh x10, x4, x10 + adds x9, x9, x24 + adc x26, x26, xzr + + mul x23, x4, x11 + umulh x11, x4, x11 + adds x9, x9, x3 + adcs x26, x26, x10 + adc x24, xzr, xzr + + adds x26, x26, x23 + adc x24, x24, x11 + + + // C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL + mul x3, x5, x12 + umulh x10, x5, x12 + subs x19, x19, x8 + sbcs x20, x20, x9 + sbcs x21, x21, x26 + mul x4, x5, x13 + umulh x23, x5, x13 + 
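As the running comments indicate, sike_mpmul uses one level of Karatsuba: it forms AH+AL and BH+BL, multiplies the sums, and later subtracts AL x BL and AH x BH to recover the cross term. The identity being exploited is checked below in C on a single 64-bit multiply split into 32-bit halves; this is only a sketch of the arithmetic, and `unsigned __int128` is a GCC/Clang extension used solely for the comparison.

    #include <stdint.h>
    #include <stdio.h>

    // One level of Karatsuba on 32-bit halves:
    //   a*b = al*bl + ((al+ah)*(bl+bh) - al*bl - ah*bh) * 2^32 + ah*bh * 2^64
    int main(void) {
      uint64_t a = 0x0123456789abcdefULL, b = 0xfedcba9876543210ULL;
      uint64_t al = (uint32_t)a, ah = a >> 32;
      uint64_t bl = (uint32_t)b, bh = b >> 32;

      unsigned __int128 lo = (unsigned __int128)al * bl;   // AL x BL
      unsigned __int128 hi = (unsigned __int128)ah * bh;   // AH x BH
      unsigned __int128 mid =                              // cross term
          (unsigned __int128)(al + ah) * (bl + bh) - lo - hi;

      unsigned __int128 karatsuba = lo + (mid << 32) + (hi << 64);
      unsigned __int128 direct = (unsigned __int128)a * b;
      printf("%s\n", karatsuba == direct ? "match" : "mismatch");
      return 0;
    }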
sbcs x22, x22, x24 + sbc x25, x25, xzr + + // A0, A1, C6, B0 <- AH x BH + mul x5, x6, x12 + umulh x12, x6, x12 + adds x4, x4, x10 + adc x23, x23, xzr + + mul x11, x6, x13 + umulh x13, x6, x13 + adds x4, x4, x5 + adcs x23, x23, x12 + adc x10, xzr, xzr + + adds x23, x23, x11 + adc x10, x10, x13 + + + // C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH + subs x19, x19, x3 + sbcs x20, x20, x4 + sbcs x21, x21, x23 + sbcs x22, x22, x10 + sbc x25, x25, xzr + + adds x19, x19, x26 + adcs x20, x20, x24 + adcs x21, x21, x3 + adcs x22, x22, x4 + adcs x23, x25, x23 + adc x24, x10, xzr + + + // x15-x19, x7 <- (AH+AL) x (BH+BL), final step + adds x14, x14, x21 + adcs x15, x15, x22 + adcs x16, x16, x23 + adcs x17, x17, x24 + adc x7, x7, xzr + + // Load AL + ldp x3, x4, [x0] + ldp x5, x6, [x0,#16] + // Load BL + ldp x10, x11, [x1,#0] + ldp x12, x13, [x1,#16] + + // Temporarily store x8 in x2 + stp x8, x9, [x2,#0] + // x21-x28 <- AL x BL + // A0-A1 <- AH + AL, T0 <- mask + adds x3, x3, x5 + adcs x4, x4, x6 + adc x8, xzr, xzr + + // C6, T1 <- BH + BL, C7 <- mask + adds x27, x10, x12 + adcs x9, x11, x13 + adc x28, xzr, xzr + + // C0-C1 <- masked (BH + BL) + sub x23, xzr, x8 + sub x24, xzr, x28 + and x21, x27, x23 + and x22, x9, x23 + + // C4-C5 <- masked (AH + AL), T0 <- combined carry + and x25, x3, x24 + and x26, x4, x24 + mul x23, x3, x27 + mul x24, x3, x9 + and x8, x8, x28 + + // C0-C1, T0 <- (AH+AL) x (BH+BL), part 1 + adds x21, x25, x21 + umulh x25, x3, x9 + adcs x22, x26, x22 + umulh x26, x3, x27 + adc x8, x8, xzr + + // C2-C5 <- (AH+AL) x (BH+BL), low part + mul x3, x4, x27 + umulh x27, x4, x27 + adds x24, x24, x26 + adc x25, x25, xzr + + mul x28, x4, x9 + umulh x9, x4, x9 + adds x24, x24, x3 + adcs x25, x25, x27 + adc x26, xzr, xzr + + adds x25, x25, x28 + adc x26, x26, x9 + + ldp x3, x4, [x0,#0] + + // C2-C5, T0 <- (AH+AL) x (BH+BL), final part + adds x25, x21, x25 + umulh x28, x3, x10 + umulh x9, x3, x11 + adcs x26, x22, x26 + mul x21, x3, x10 + mul x22, x3, x11 + adc x8, x8, xzr + + // C0-C1, T1, C7 <- AL x BL + mul x3, x4, x10 + umulh x10, x4, x10 + adds x22, x22, x28 + adc x9, x9, xzr + + mul x27, x4, x11 + umulh x11, x4, x11 + adds x22, x22, x3 + adcs x9, x9, x10 + adc x28, xzr, xzr + + adds x9, x9, x27 + adc x28, x28, x11 + + + // C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL + mul x3, x5, x12 + umulh x10, x5, x12 + subs x23, x23, x21 + sbcs x24, x24, x22 + sbcs x25, x25, x9 + mul x4, x5, x13 + umulh x27, x5, x13 + sbcs x26, x26, x28 + sbc x8, x8, xzr + + // A0, A1, C6, B0 <- AH x BH + mul x5, x6, x12 + umulh x12, x6, x12 + adds x4, x4, x10 + adc x27, x27, xzr + + mul x11, x6, x13 + umulh x13, x6, x13 + adds x4, x4, x5 + adcs x27, x27, x12 + adc x10, xzr, xzr + + adds x27, x27, x11 + adc x10, x10, x13 + + + // C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH + subs x23, x23, x3 + sbcs x24, x24, x4 + sbcs x25, x25, x27 + sbcs x26, x26, x10 + sbc x8, x8, xzr + + adds x23, x23, x9 + adcs x24, x24, x28 + adcs x25, x25, x3 + adcs x26, x26, x4 + adcs x27, x8, x27 + adc x28, x10, xzr + + // Restore x8 + ldp x8, x9, [x2,#0] + + // x8-x10,x20,x15-x17,x19 <- maskd (AH+AL) x (BH+BL) - ALxBL + subs x8, x8, x21 + sbcs x9, x9, x22 + sbcs x19, x19, x23 + sbcs x20, x20, x24 + sbcs x14, x14, x25 + sbcs x15, x15, x26 + sbcs x16, x16, x27 + sbcs x17, x17, x28 + sbc x7, x7, xzr + + // Store ALxBL, low + stp x21, x22, [x2] + stp x23, x24, [x2,#16] + + // Load AH + ldp x3, x4, [x0,#32] + ldr x5, [x0,#48] + // Load BH + ldp x10, x11, [x1,#32] + ldr x12, [x1,#48] + + adds x8, x8, x25 + adcs x9, x9, x26 + adcs x19, x19, x27 + 
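A second pattern worth noting is how these routines stay branch-free: a 1-bit carry or borrow is turned into a full-width mask with `sub xN, xzr, carry` (or `sbc xN, xzr, xzr`), and the value to be conditionally added is ANDed with that mask, as in the masked (AH+AL) and (BH+BL) terms above and in the 2*p434 correction of the field add/sub routines further down. A small C sketch of the same trick on four 64-bit limbs; the limb count and helper name are chosen only for the example.

    #include <stdint.h>
    #include <stdio.h>

    #define LIMBS 4

    // Branch-free conditional add: if |flag| (0 or 1) is set, add |b| to |a|.
    // The mask 0 - flag is all-ones or all-zero, mirroring the
    // "sub xN, xzr, carry" followed by AND in the assembly.
    static void cond_add(uint64_t a[LIMBS], const uint64_t b[LIMBS],
                         uint64_t flag) {
      uint64_t mask = 0 - flag;  // 0x0 or 0xFFFFFFFFFFFFFFFF
      uint64_t carry = 0;
      for (int i = 0; i < LIMBS; i++) {
        uint64_t add = b[i] & mask;
        uint64_t sum = a[i] + add;
        uint64_t c1 = sum < a[i];  // carry out of this limb
        a[i] = sum + carry;
        carry = c1 | (a[i] < sum);
      }
    }

    int main(void) {
      uint64_t a[LIMBS] = {~0ULL, ~0ULL, 0, 0};
      uint64_t b[LIMBS] = {1, 0, 0, 0};
      cond_add(a, b, 0);  // mask is zero: a is unchanged
      cond_add(a, b, 1);  // mask is all-ones: adds b, carries across two limbs
      printf("%016llx %016llx %016llx %016llx\n",
             (unsigned long long)a[3], (unsigned long long)a[2],
             (unsigned long long)a[1], (unsigned long long)a[0]);
      return 0;
    }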
adcs x20, x20, x28 + adc x1, xzr, xzr + + add x0, x0, #32 + // Temporarily store x8,x9 in x2 + stp x8,x9, [x2,#32] + // x21-x28 <- AH x BH + + // A0 * B0 + mul x21, x3, x10 // C0 + umulh x24, x3, x10 + + // A0 * B1 + mul x22, x3, x11 + umulh x23, x3, x11 + + // A1 * B0 + mul x8, x4, x10 + umulh x9, x4, x10 + adds x22, x22, x24 + adc x23, x23, xzr + + // A0 * B2 + mul x27, x3, x12 + umulh x28, x3, x12 + adds x22, x22, x8 // C1 + adcs x23, x23, x9 + adc x24, xzr, xzr + + // A2 * B0 + mul x8, x5, x10 + umulh x25, x5, x10 + adds x23, x23, x27 + adcs x24, x24, x25 + adc x25, xzr, xzr + + // A1 * B1 + mul x27, x4, x11 + umulh x9, x4, x11 + adds x23, x23, x8 + adcs x24, x24, x28 + adc x25, x25, xzr + + // A1 * B2 + mul x8, x4, x12 + umulh x28, x4, x12 + adds x23, x23, x27 // C2 + adcs x24, x24, x9 + adc x25, x25, xzr + + // A2 * B1 + mul x27, x5, x11 + umulh x9, x5, x11 + adds x24, x24, x8 + adcs x25, x25, x28 + adc x26, xzr, xzr + + // A2 * B2 + mul x8, x5, x12 + umulh x28, x5, x12 + adds x24, x24, x27 // C3 + adcs x25, x25, x9 + adc x26, x26, xzr + + adds x25, x25, x8 // C4 + adc x26, x26, x28 // C5 + + // Restore x8,x9 + ldp x8,x9, [x2,#32] + + neg x1, x1 + + // x8-x9,x19,x20,x14-x17 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH + subs x8, x8, x21 + sbcs x9, x9, x22 + sbcs x19, x19, x23 + sbcs x20, x20, x24 + sbcs x14, x14, x25 + sbcs x15, x15, x26 + sbcs x16, x16, xzr + sbcs x17, x17, xzr + sbc x7, x7, xzr + + // Store (AH+AL) x (BH+BL) - ALxBL - AHxBH, low + stp x8, x9, [x2,#32] + stp x19, x20, [x2,#48] + + adds x1, x1, #1 + adcs x14, x14, x21 + adcs x15, x15, x22 + adcs x16, x16, x23 + adcs x17, x17, x24 + adcs x25, x7, x25 + adc x26, x26, xzr + + stp x14, x15, [x2,#64] + stp x16, x17, [x2,#80] + stp x25, x26, [x2,#96] + + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldp x29, x30, [sp],#96 + ret +.globl _sike_fprdc +.private_extern _sike_fprdc +.align 4 +_sike_fprdc: + stp x29, x30, [sp, #-96]! 
+ add x29, sp, xzr + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + + ldp x2, x3, [x0,#0] // a[0-1] + + // Load the prime constant + adrp x26, Lp434p1@PAGE + add x26, x26, Lp434p1@PAGEOFF + ldp x23, x24, [x26, #0x0] + ldp x25, x26, [x26,#0x10] + + // a[0-1] * p434+1 + mul x4, x2, x23 // C0 + umulh x7, x2, x23 + + mul x5, x2, x24 + umulh x6, x2, x24 + + mul x10, x3, x23 + umulh x11, x3, x23 + adds x5, x5, x7 + adc x6, x6, xzr + + mul x27, x2, x25 + umulh x28, x2, x25 + adds x5, x5, x10 // C1 + adcs x6, x6, x11 + adc x7, xzr, xzr + + mul x10, x3, x24 + umulh x11, x3, x24 + adds x6, x6, x27 + adcs x7, x7, x28 + adc x8, xzr, xzr + + mul x27, x2, x26 + umulh x28, x2, x26 + adds x6, x6, x10 // C2 + adcs x7, x7, x11 + adc x8, x8, xzr + + mul x10, x3, x25 + umulh x11, x3, x25 + adds x7, x7, x27 + adcs x8, x8, x28 + adc x9, xzr, xzr + + mul x27, x3, x26 + umulh x28, x3, x26 + adds x7, x7, x10 // C3 + adcs x8, x8, x11 + adc x9, x9, xzr + adds x8, x8, x27 // C4 + adc x9, x9, x28 // C5 + + + + ldp x10, x11, [x0, #0x18] + ldp x12, x13, [x0, #0x28] + ldp x14, x15, [x0, #0x38] + ldp x16, x17, [x0, #0x48] + ldp x19, x20, [x0, #0x58] + ldr x21, [x0, #0x68] + + adds x10, x10, x4 + adcs x11, x11, x5 + adcs x12, x12, x6 + adcs x13, x13, x7 + adcs x14, x14, x8 + adcs x15, x15, x9 + adcs x22, x16, xzr + adcs x17, x17, xzr + adcs x19, x19, xzr + adcs x20, x20, xzr + adc x21, x21, xzr + + ldr x2, [x0,#0x10] // a[2] + // a[2-3] * p434+1 + mul x4, x2, x23 // C0 + umulh x7, x2, x23 + + mul x5, x2, x24 + umulh x6, x2, x24 + + mul x0, x10, x23 + umulh x3, x10, x23 + adds x5, x5, x7 + adc x6, x6, xzr + + mul x27, x2, x25 + umulh x28, x2, x25 + adds x5, x5, x0 // C1 + adcs x6, x6, x3 + adc x7, xzr, xzr + + mul x0, x10, x24 + umulh x3, x10, x24 + adds x6, x6, x27 + adcs x7, x7, x28 + adc x8, xzr, xzr + + mul x27, x2, x26 + umulh x28, x2, x26 + adds x6, x6, x0 // C2 + adcs x7, x7, x3 + adc x8, x8, xzr + + mul x0, x10, x25 + umulh x3, x10, x25 + adds x7, x7, x27 + adcs x8, x8, x28 + adc x9, xzr, xzr + + mul x27, x10, x26 + umulh x28, x10, x26 + adds x7, x7, x0 // C3 + adcs x8, x8, x3 + adc x9, x9, xzr + adds x8, x8, x27 // C4 + adc x9, x9, x28 // C5 + + + + adds x12, x12, x4 + adcs x13, x13, x5 + adcs x14, x14, x6 + adcs x15, x15, x7 + adcs x16, x22, x8 + adcs x17, x17, x9 + adcs x22, x19, xzr + adcs x20, x20, xzr + adc x21, x21, xzr + + mul x4, x11, x23 // C0 + umulh x7, x11, x23 + + mul x5, x11, x24 + umulh x6, x11, x24 + + mul x10, x12, x23 + umulh x3, x12, x23 + adds x5, x5, x7 + adc x6, x6, xzr + + mul x27, x11, x25 + umulh x28, x11, x25 + adds x5, x5, x10 // C1 + adcs x6, x6, x3 + adc x7, xzr, xzr + + mul x10, x12, x24 + umulh x3, x12, x24 + adds x6, x6, x27 + adcs x7, x7, x28 + adc x8, xzr, xzr + + mul x27, x11, x26 + umulh x28, x11, x26 + adds x6, x6, x10 // C2 + adcs x7, x7, x3 + adc x8, x8, xzr + + mul x10, x12, x25 + umulh x3, x12, x25 + adds x7, x7, x27 + adcs x8, x8, x28 + adc x9, xzr, xzr + + mul x27, x12, x26 + umulh x28, x12, x26 + adds x7, x7, x10 // C3 + adcs x8, x8, x3 + adc x9, x9, xzr + adds x8, x8, x27 // C4 + adc x9, x9, x28 // C5 + + + adds x14, x14, x4 + adcs x15, x15, x5 + adcs x16, x16, x6 + adcs x17, x17, x7 + adcs x19, x22, x8 + adcs x20, x20, x9 + adc x22, x21, xzr + + stp x14, x15, [x1, #0x0] // C0, C1 + + mul x4, x13, x23 // C0 + umulh x10, x13, x23 + + mul x5, x13, x24 + umulh x27, x13, x24 + adds x5, x5, x10 // C1 + adc x10, xzr, xzr + + mul x6, x13, x25 + umulh x28, x13, x25 + adds x27, x10, x27 + adcs x6, x6, 
x27 // C2 + adc x10, xzr, xzr + + mul x7, x13, x26 + umulh x8, x13, x26 + adds x28, x10, x28 + adcs x7, x7, x28 // C3 + adc x8, x8, xzr // C4 + + adds x16, x16, x4 + adcs x17, x17, x5 + adcs x19, x19, x6 + adcs x20, x20, x7 + adc x21, x22, x8 + + str x16, [x1, #0x10] + stp x17, x19, [x1, #0x18] + stp x20, x21, [x1, #0x28] + + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldp x29, x30, [sp],#96 + ret +.globl _sike_fpadd +.private_extern _sike_fpadd +.align 4 +_sike_fpadd: + stp x29,x30, [sp,#-16]! + add x29, sp, #0 + + ldp x3, x4, [x0,#0] + ldp x5, x6, [x0,#16] + ldp x7, x8, [x0,#32] + ldr x9, [x0,#48] + ldp x11, x12, [x1,#0] + ldp x13, x14, [x1,#16] + ldp x15, x16, [x1,#32] + ldr x17, [x1,#48] + + // Add a + b + adds x3, x3, x11 + adcs x4, x4, x12 + adcs x5, x5, x13 + adcs x6, x6, x14 + adcs x7, x7, x15 + adcs x8, x8, x16 + adc x9, x9, x17 + + // Subtract 2xp434 + adrp x17, Lp434x2@PAGE + add x17, x17, Lp434x2@PAGEOFF + ldp x11, x12, [x17, #0] + ldp x13, x14, [x17, #16] + ldp x15, x16, [x17, #32] + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x12 + sbcs x6, x6, x13 + sbcs x7, x7, x14 + sbcs x8, x8, x15 + sbcs x9, x9, x16 + sbc x0, xzr, xzr // x0 can be reused now + + // Add 2xp434 anded with the mask in x0 + and x11, x11, x0 + and x12, x12, x0 + and x13, x13, x0 + and x14, x14, x0 + and x15, x15, x0 + and x16, x16, x0 + + adds x3, x3, x11 + adcs x4, x4, x12 + adcs x5, x5, x12 + adcs x6, x6, x13 + adcs x7, x7, x14 + adcs x8, x8, x15 + adc x9, x9, x16 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + str x9, [x2,#48] + + ldp x29, x30, [sp],#16 + ret +.globl _sike_fpsub +.private_extern _sike_fpsub +.align 4 +_sike_fpsub: + stp x29, x30, [sp,#-16]! + add x29, sp, #0 + + ldp x3, x4, [x0,#0] + ldp x5, x6, [x0,#16] + ldp x7, x8, [x0,#32] + ldr x9, [x0,#48] + ldp x11, x12, [x1,#0] + ldp x13, x14, [x1,#16] + ldp x15, x16, [x1,#32] + ldr x17, [x1,#48] + + // Subtract a - b + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + sbcs x7, x7, x15 + sbcs x8, x8, x16 + sbcs x9, x9, x17 + sbc x0, xzr, xzr + + // Add 2xp434 anded with the mask in x0 + adrp x17, Lp434x2@PAGE + add x17, x17, Lp434x2@PAGEOFF + + // First half + ldp x11, x12, [x17, #0] + ldp x13, x14, [x17, #16] + ldp x15, x16, [x17, #32] + + // Add 2xp434 anded with the mask in x0 + and x11, x11, x0 + and x12, x12, x0 + and x13, x13, x0 + and x14, x14, x0 + and x15, x15, x0 + and x16, x16, x0 + + adds x3, x3, x11 + adcs x4, x4, x12 + adcs x5, x5, x12 + adcs x6, x6, x13 + adcs x7, x7, x14 + adcs x8, x8, x15 + adc x9, x9, x16 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + str x9, [x2,#48] + + ldp x29, x30, [sp],#16 + ret +.globl _sike_mpadd_asm +.private_extern _sike_mpadd_asm +.align 4 +_sike_mpadd_asm: + stp x29, x30, [sp,#-16]! + add x29, sp, #0 + + ldp x3, x4, [x0,#0] + ldp x5, x6, [x0,#16] + ldp x7, x8, [x0,#32] + ldr x9, [x0,#48] + ldp x11, x12, [x1,#0] + ldp x13, x14, [x1,#16] + ldp x15, x16, [x1,#32] + ldr x17, [x1,#48] + + adds x3, x3, x11 + adcs x4, x4, x12 + adcs x5, x5, x13 + adcs x6, x6, x14 + adcs x7, x7, x15 + adcs x8, x8, x16 + adc x9, x9, x17 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + str x9, [x2,#48] + + ldp x29, x30, [sp],#16 + ret +.globl _sike_mpsubx2_asm +.private_extern _sike_mpsubx2_asm +.align 4 +_sike_mpsubx2_asm: + stp x29, x30, [sp,#-16]! 
+ add x29, sp, #0 + + ldp x3, x4, [x0,#0] + ldp x5, x6, [x0,#16] + ldp x11, x12, [x1,#0] + ldp x13, x14, [x1,#16] + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + ldp x7, x8, [x0,#32] + ldp x9, x10, [x0,#48] + ldp x11, x12, [x1,#32] + ldp x13, x14, [x1,#48] + sbcs x7, x7, x11 + sbcs x8, x8, x12 + sbcs x9, x9, x13 + sbcs x10, x10, x14 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + stp x9, x10, [x2,#48] + + ldp x3, x4, [x0,#64] + ldp x5, x6, [x0,#80] + ldp x11, x12, [x1,#64] + ldp x13, x14, [x1,#80] + sbcs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + ldp x7, x8, [x0,#96] + ldp x11, x12, [x1,#96] + sbcs x7, x7, x11 + sbcs x8, x8, x12 + sbc x0, xzr, xzr + + stp x3, x4, [x2,#64] + stp x5, x6, [x2,#80] + stp x7, x8, [x2,#96] + + ldp x29, x30, [sp],#16 + ret +.globl _sike_mpdblsubx2_asm +.private_extern _sike_mpdblsubx2_asm +.align 4 +_sike_mpdblsubx2_asm: + stp x29, x30, [sp, #-16]! + add x29, sp, #0 + + ldp x3, x4, [x2, #0] + ldp x5, x6, [x2,#16] + ldp x7, x8, [x2,#32] + + ldp x11, x12, [x0, #0] + ldp x13, x14, [x0,#16] + ldp x15, x16, [x0,#32] + + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + sbcs x7, x7, x15 + sbcs x8, x8, x16 + + // x9 stores carry + adc x9, xzr, xzr + + ldp x11, x12, [x1, #0] + ldp x13, x14, [x1,#16] + ldp x15, x16, [x1,#32] + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + sbcs x7, x7, x15 + sbcs x8, x8, x16 + adc x9, x9, xzr + + stp x3, x4, [x2, #0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + + ldp x3, x4, [x2,#48] + ldp x5, x6, [x2,#64] + ldp x7, x8, [x2,#80] + + ldp x11, x12, [x0,#48] + ldp x13, x14, [x0,#64] + ldp x15, x16, [x0,#80] + + // x9 = 2 - x9 + neg x9, x9 + add x9, x9, #2 + + subs x3, x3, x9 + sbcs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + sbcs x7, x7, x15 + sbcs x8, x8, x16 + adc x9, xzr, xzr + + ldp x11, x12, [x1,#48] + ldp x13, x14, [x1,#64] + ldp x15, x16, [x1,#80] + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + sbcs x7, x7, x15 + sbcs x8, x8, x16 + adc x9, x9, xzr + + stp x3, x4, [x2,#48] + stp x5, x6, [x2,#64] + stp x7, x8, [x2,#80] + + ldp x3, x4, [x2,#96] + ldp x11, x12, [x0,#96] + ldp x13, x14, [x1,#96] + + // x9 = 2 - x9 + neg x9, x9 + add x9, x9, #2 + + subs x3, x3, x9 + sbcs x3, x3, x11 + sbcs x4, x4, x12 + subs x3, x3, x13 + sbc x4, x4, x14 + stp x3, x4, [x2,#96] + + ldp x29, x30, [sp],#16 + ret +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-arm/crypto/chacha/chacha-armv4.S b/packager/third_party/boringssl/ios-arm/crypto/chacha/chacha-armv4.S new file mode 100644 index 0000000000..cadf2b623b --- /dev/null +++ b/packager/third_party/boringssl/ios-arm/crypto/chacha/chacha-armv4.S @@ -0,0 +1,1498 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include + +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. 
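The scalar Loop that follows implements the ChaCha20 double round, fusing each XOR-and-rotate into a single `eor ..., ror #n` operand and spilling part of the 16-word state to the stack. For reference, here is the underlying quarter-round as specified in RFC 8439, with its section 2.1.1 test vector:

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t rotl32(uint32_t x, int n) {
      return (x << n) | (x >> (32 - n));
    }

    // ChaCha quarter-round (RFC 8439, section 2.1).  The assembly keeps the
    // same data flow but rotates as part of the eor instructions.
    static void quarter_round(uint32_t *a, uint32_t *b, uint32_t *c,
                              uint32_t *d) {
      *a += *b; *d ^= *a; *d = rotl32(*d, 16);
      *c += *d; *b ^= *c; *b = rotl32(*b, 12);
      *a += *b; *d ^= *a; *d = rotl32(*d, 8);
      *c += *d; *b ^= *c; *b = rotl32(*b, 7);
    }

    int main(void) {
      // Test vector from RFC 8439, section 2.1.1.
      uint32_t a = 0x11111111, b = 0x01020304, c = 0x9b8d6f43, d = 0x01234567;
      quarter_round(&a, &b, &c, &d);
      printf("%08x %08x %08x %08x\n", a, b, c, d);
      // RFC 8439 lists ea2a92f4 cb1cf8ce 4581472e 5881c4bb for this input.
      return 0;
    }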
+ + +.text +#if defined(__thumb2__) || defined(__clang__) +.syntax unified +#endif +#if defined(__thumb2__) +.thumb +#else +.code 32 +#endif + +#if defined(__thumb2__) || defined(__clang__) +#define ldrhsb ldrbhs +#endif + +.align 5 +Lsigma: +.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral +Lone: +.long 1,0,0,0 +#if __ARM_MAX_ARCH__>=7 +LOPENSSL_armcap: +.word OPENSSL_armcap_P-LChaCha20_ctr32 +#else +.word -1 +#endif + +.globl _ChaCha20_ctr32 +.private_extern _ChaCha20_ctr32 +#ifdef __thumb2__ +.thumb_func _ChaCha20_ctr32 +#endif +.align 5 +_ChaCha20_ctr32: +LChaCha20_ctr32: + ldr r12,[sp,#0] @ pull pointer to counter and nonce + stmdb sp!,{r0,r1,r2,r4-r11,lr} +#if __ARM_ARCH__<7 && !defined(__thumb2__) + sub r14,pc,#16 @ _ChaCha20_ctr32 +#else + adr r14,LChaCha20_ctr32 +#endif + cmp r2,#0 @ len==0? +#ifdef __thumb2__ + itt eq +#endif + addeq sp,sp,#4*3 + beq Lno_data +#if __ARM_MAX_ARCH__>=7 + cmp r2,#192 @ test len + bls Lshort + ldr r4,[r14,#-32] + ldr r4,[r14,r4] +# ifdef __APPLE__ + ldr r4,[r4] +# endif + tst r4,#ARMV7_NEON + bne LChaCha20_neon +Lshort: +#endif + ldmia r12,{r4,r5,r6,r7} @ load counter and nonce + sub sp,sp,#4*(16) @ off-load area + sub r14,r14,#64 @ Lsigma + stmdb sp!,{r4,r5,r6,r7} @ copy counter and nonce + ldmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} @ load key + ldmia r14,{r0,r1,r2,r3} @ load sigma + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} @ copy key + stmdb sp!,{r0,r1,r2,r3} @ copy sigma + str r10,[sp,#4*(16+10)] @ off-load "rx" + str r11,[sp,#4*(16+11)] @ off-load "rx" + b Loop_outer_enter + +.align 4 +Loop_outer: + ldmia sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ load key material + str r11,[sp,#4*(32+2)] @ save len + str r12, [sp,#4*(32+1)] @ save inp + str r14, [sp,#4*(32+0)] @ save out +Loop_outer_enter: + ldr r11, [sp,#4*(15)] + ldr r12,[sp,#4*(12)] @ modulo-scheduled load + ldr r10, [sp,#4*(13)] + ldr r14,[sp,#4*(14)] + str r11, [sp,#4*(16+15)] + mov r11,#10 + b Loop + +.align 4 +Loop: + subs r11,r11,#1 + add r0,r0,r4 + mov r12,r12,ror#16 + add r1,r1,r5 + mov r10,r10,ror#16 + eor r12,r12,r0,ror#16 + eor r10,r10,r1,ror#16 + add r8,r8,r12 + mov r4,r4,ror#20 + add r9,r9,r10 + mov r5,r5,ror#20 + eor r4,r4,r8,ror#20 + eor r5,r5,r9,ror#20 + add r0,r0,r4 + mov r12,r12,ror#24 + add r1,r1,r5 + mov r10,r10,ror#24 + eor r12,r12,r0,ror#24 + eor r10,r10,r1,ror#24 + add r8,r8,r12 + mov r4,r4,ror#25 + add r9,r9,r10 + mov r5,r5,ror#25 + str r10,[sp,#4*(16+13)] + ldr r10,[sp,#4*(16+15)] + eor r4,r4,r8,ror#25 + eor r5,r5,r9,ror#25 + str r8,[sp,#4*(16+8)] + ldr r8,[sp,#4*(16+10)] + add r2,r2,r6 + mov r14,r14,ror#16 + str r9,[sp,#4*(16+9)] + ldr r9,[sp,#4*(16+11)] + add r3,r3,r7 + mov r10,r10,ror#16 + eor r14,r14,r2,ror#16 + eor r10,r10,r3,ror#16 + add r8,r8,r14 + mov r6,r6,ror#20 + add r9,r9,r10 + mov r7,r7,ror#20 + eor r6,r6,r8,ror#20 + eor r7,r7,r9,ror#20 + add r2,r2,r6 + mov r14,r14,ror#24 + add r3,r3,r7 + mov r10,r10,ror#24 + eor r14,r14,r2,ror#24 + eor r10,r10,r3,ror#24 + add r8,r8,r14 + mov r6,r6,ror#25 + add r9,r9,r10 + mov r7,r7,ror#25 + eor r6,r6,r8,ror#25 + eor r7,r7,r9,ror#25 + add r0,r0,r5 + mov r10,r10,ror#16 + add r1,r1,r6 + mov r12,r12,ror#16 + eor r10,r10,r0,ror#16 + eor r12,r12,r1,ror#16 + add r8,r8,r10 + mov r5,r5,ror#20 + add r9,r9,r12 + mov r6,r6,ror#20 + eor r5,r5,r8,ror#20 + eor r6,r6,r9,ror#20 + add r0,r0,r5 + mov r10,r10,ror#24 + add r1,r1,r6 + mov r12,r12,ror#24 + eor r10,r10,r0,ror#24 + eor r12,r12,r1,ror#24 + add r8,r8,r10 + mov r5,r5,ror#25 + str r10,[sp,#4*(16+15)] + ldr r10,[sp,#4*(16+13)] + add r9,r9,r12 + mov r6,r6,ror#25 + eor 
r5,r5,r8,ror#25 + eor r6,r6,r9,ror#25 + str r8,[sp,#4*(16+10)] + ldr r8,[sp,#4*(16+8)] + add r2,r2,r7 + mov r10,r10,ror#16 + str r9,[sp,#4*(16+11)] + ldr r9,[sp,#4*(16+9)] + add r3,r3,r4 + mov r14,r14,ror#16 + eor r10,r10,r2,ror#16 + eor r14,r14,r3,ror#16 + add r8,r8,r10 + mov r7,r7,ror#20 + add r9,r9,r14 + mov r4,r4,ror#20 + eor r7,r7,r8,ror#20 + eor r4,r4,r9,ror#20 + add r2,r2,r7 + mov r10,r10,ror#24 + add r3,r3,r4 + mov r14,r14,ror#24 + eor r10,r10,r2,ror#24 + eor r14,r14,r3,ror#24 + add r8,r8,r10 + mov r7,r7,ror#25 + add r9,r9,r14 + mov r4,r4,ror#25 + eor r7,r7,r8,ror#25 + eor r4,r4,r9,ror#25 + bne Loop + + ldr r11,[sp,#4*(32+2)] @ load len + + str r8, [sp,#4*(16+8)] @ modulo-scheduled store + str r9, [sp,#4*(16+9)] + str r12,[sp,#4*(16+12)] + str r10, [sp,#4*(16+13)] + str r14,[sp,#4*(16+14)] + + @ at this point we have first half of 512-bit result in + @ rx and second half at sp+4*(16+8) + + cmp r11,#64 @ done yet? +#ifdef __thumb2__ + itete lo +#endif + addlo r12,sp,#4*(0) @ shortcut or ... + ldrhs r12,[sp,#4*(32+1)] @ ... load inp + addlo r14,sp,#4*(0) @ shortcut or ... + ldrhs r14,[sp,#4*(32+0)] @ ... load out + + ldr r8,[sp,#4*(0)] @ load key material + ldr r9,[sp,#4*(1)] + +#if __ARM_ARCH__>=6 || !defined(__ARMEB__) +# if __ARM_ARCH__<7 + orr r10,r12,r14 + tst r10,#3 @ are input and output aligned? + ldr r10,[sp,#4*(2)] + bne Lunaligned + cmp r11,#64 @ restore flags +# else + ldr r10,[sp,#4*(2)] +# endif + ldr r11,[sp,#4*(3)] + + add r0,r0,r8 @ accumulate key material + add r1,r1,r9 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r8,[r12],#16 @ load input + ldrhs r9,[r12,#-12] + + add r2,r2,r10 + add r3,r3,r11 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r10,[r12,#-8] + ldrhs r11,[r12,#-4] +# if __ARM_ARCH__>=6 && defined(__ARMEB__) + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs r0,r0,r8 @ xor with input + eorhs r1,r1,r9 + add r8,sp,#4*(4) + str r0,[r14],#16 @ store output +# ifdef __thumb2__ + itt hs +# endif + eorhs r2,r2,r10 + eorhs r3,r3,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + str r1,[r14,#-12] + str r2,[r14,#-8] + str r3,[r14,#-4] + + add r4,r4,r8 @ accumulate key material + add r5,r5,r9 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r8,[r12],#16 @ load input + ldrhs r9,[r12,#-12] + add r6,r6,r10 + add r7,r7,r11 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r10,[r12,#-8] + ldrhs r11,[r12,#-4] +# if __ARM_ARCH__>=6 && defined(__ARMEB__) + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs r4,r4,r8 + eorhs r5,r5,r9 + add r8,sp,#4*(8) + str r4,[r14],#16 @ store output +# ifdef __thumb2__ + itt hs +# endif + eorhs r6,r6,r10 + eorhs r7,r7,r11 + str r5,[r14,#-12] + ldmia r8,{r8,r9,r10,r11} @ load key material + str r6,[r14,#-8] + add r0,sp,#4*(16+8) + str r7,[r14,#-4] + + ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half + + add r0,r0,r8 @ accumulate key material + add r1,r1,r9 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r8,[r12],#16 @ load input + ldrhs r9,[r12,#-12] +# ifdef __thumb2__ + itt hi +# endif + strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it + strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it + add r2,r2,r10 + add r3,r3,r11 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r10,[r12,#-8] + ldrhs r11,[r12,#-4] +# if __ARM_ARCH__>=6 && defined(__ARMEB__) + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs r0,r0,r8 + eorhs r1,r1,r9 + add r8,sp,#4*(12) + str r0,[r14],#16 @ store output +# ifdef 
__thumb2__ + itt hs +# endif + eorhs r2,r2,r10 + eorhs r3,r3,r11 + str r1,[r14,#-12] + ldmia r8,{r8,r9,r10,r11} @ load key material + str r2,[r14,#-8] + str r3,[r14,#-4] + + add r4,r4,r8 @ accumulate key material + add r5,r5,r9 +# ifdef __thumb2__ + itt hi +# endif + addhi r8,r8,#1 @ next counter value + strhi r8,[sp,#4*(12)] @ save next counter value +# ifdef __thumb2__ + itt hs +# endif + ldrhs r8,[r12],#16 @ load input + ldrhs r9,[r12,#-12] + add r6,r6,r10 + add r7,r7,r11 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r10,[r12,#-8] + ldrhs r11,[r12,#-4] +# if __ARM_ARCH__>=6 && defined(__ARMEB__) + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs r4,r4,r8 + eorhs r5,r5,r9 +# ifdef __thumb2__ + it ne +# endif + ldrne r8,[sp,#4*(32+2)] @ re-load len +# ifdef __thumb2__ + itt hs +# endif + eorhs r6,r6,r10 + eorhs r7,r7,r11 + str r4,[r14],#16 @ store output + str r5,[r14,#-12] +# ifdef __thumb2__ + it hs +# endif + subhs r11,r8,#64 @ len-=64 + str r6,[r14,#-8] + str r7,[r14,#-4] + bhi Loop_outer + + beq Ldone +# if __ARM_ARCH__<7 + b Ltail + +.align 4 +Lunaligned:@ unaligned endian-neutral path + cmp r11,#64 @ restore flags +# endif +#endif +#if __ARM_ARCH__<7 + ldr r11,[sp,#4*(3)] + add r0,r0,r8 @ accumulate key material + add r1,r1,r9 + add r2,r2,r10 +# ifdef __thumb2__ + itete lo +# endif + eorlo r8,r8,r8 @ zero or ... + ldrhsb r8,[r12],#16 @ ... load input + eorlo r9,r9,r9 + ldrhsb r9,[r12,#-12] + + add r3,r3,r11 +# ifdef __thumb2__ + itete lo +# endif + eorlo r10,r10,r10 + ldrhsb r10,[r12,#-8] + eorlo r11,r11,r11 + ldrhsb r11,[r12,#-4] + + eor r0,r8,r0 @ xor with input (or zero) + eor r1,r9,r1 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-15] @ load more input + ldrhsb r9,[r12,#-11] + eor r2,r10,r2 + strb r0,[r14],#16 @ store output + eor r3,r11,r3 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-7] + ldrhsb r11,[r12,#-3] + strb r1,[r14,#-12] + eor r0,r8,r0,lsr#8 + strb r2,[r14,#-8] + eor r1,r9,r1,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-14] @ load more input + ldrhsb r9,[r12,#-10] + strb r3,[r14,#-4] + eor r2,r10,r2,lsr#8 + strb r0,[r14,#-15] + eor r3,r11,r3,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-6] + ldrhsb r11,[r12,#-2] + strb r1,[r14,#-11] + eor r0,r8,r0,lsr#8 + strb r2,[r14,#-7] + eor r1,r9,r1,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-13] @ load more input + ldrhsb r9,[r12,#-9] + strb r3,[r14,#-3] + eor r2,r10,r2,lsr#8 + strb r0,[r14,#-14] + eor r3,r11,r3,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-5] + ldrhsb r11,[r12,#-1] + strb r1,[r14,#-10] + strb r2,[r14,#-6] + eor r0,r8,r0,lsr#8 + strb r3,[r14,#-2] + eor r1,r9,r1,lsr#8 + strb r0,[r14,#-13] + eor r2,r10,r2,lsr#8 + strb r1,[r14,#-9] + eor r3,r11,r3,lsr#8 + strb r2,[r14,#-5] + strb r3,[r14,#-1] + add r8,sp,#4*(4+0) + ldmia r8,{r8,r9,r10,r11} @ load key material + add r0,sp,#4*(16+8) + add r4,r4,r8 @ accumulate key material + add r5,r5,r9 + add r6,r6,r10 +# ifdef __thumb2__ + itete lo +# endif + eorlo r8,r8,r8 @ zero or ... + ldrhsb r8,[r12],#16 @ ... 
load input + eorlo r9,r9,r9 + ldrhsb r9,[r12,#-12] + + add r7,r7,r11 +# ifdef __thumb2__ + itete lo +# endif + eorlo r10,r10,r10 + ldrhsb r10,[r12,#-8] + eorlo r11,r11,r11 + ldrhsb r11,[r12,#-4] + + eor r4,r8,r4 @ xor with input (or zero) + eor r5,r9,r5 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-15] @ load more input + ldrhsb r9,[r12,#-11] + eor r6,r10,r6 + strb r4,[r14],#16 @ store output + eor r7,r11,r7 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-7] + ldrhsb r11,[r12,#-3] + strb r5,[r14,#-12] + eor r4,r8,r4,lsr#8 + strb r6,[r14,#-8] + eor r5,r9,r5,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-14] @ load more input + ldrhsb r9,[r12,#-10] + strb r7,[r14,#-4] + eor r6,r10,r6,lsr#8 + strb r4,[r14,#-15] + eor r7,r11,r7,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-6] + ldrhsb r11,[r12,#-2] + strb r5,[r14,#-11] + eor r4,r8,r4,lsr#8 + strb r6,[r14,#-7] + eor r5,r9,r5,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-13] @ load more input + ldrhsb r9,[r12,#-9] + strb r7,[r14,#-3] + eor r6,r10,r6,lsr#8 + strb r4,[r14,#-14] + eor r7,r11,r7,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-5] + ldrhsb r11,[r12,#-1] + strb r5,[r14,#-10] + strb r6,[r14,#-6] + eor r4,r8,r4,lsr#8 + strb r7,[r14,#-2] + eor r5,r9,r5,lsr#8 + strb r4,[r14,#-13] + eor r6,r10,r6,lsr#8 + strb r5,[r14,#-9] + eor r7,r11,r7,lsr#8 + strb r6,[r14,#-5] + strb r7,[r14,#-1] + add r8,sp,#4*(4+4) + ldmia r8,{r8,r9,r10,r11} @ load key material + ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half +# ifdef __thumb2__ + itt hi +# endif + strhi r10,[sp,#4*(16+10)] @ copy "rx" + strhi r11,[sp,#4*(16+11)] @ copy "rx" + add r0,r0,r8 @ accumulate key material + add r1,r1,r9 + add r2,r2,r10 +# ifdef __thumb2__ + itete lo +# endif + eorlo r8,r8,r8 @ zero or ... + ldrhsb r8,[r12],#16 @ ... 
load input + eorlo r9,r9,r9 + ldrhsb r9,[r12,#-12] + + add r3,r3,r11 +# ifdef __thumb2__ + itete lo +# endif + eorlo r10,r10,r10 + ldrhsb r10,[r12,#-8] + eorlo r11,r11,r11 + ldrhsb r11,[r12,#-4] + + eor r0,r8,r0 @ xor with input (or zero) + eor r1,r9,r1 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-15] @ load more input + ldrhsb r9,[r12,#-11] + eor r2,r10,r2 + strb r0,[r14],#16 @ store output + eor r3,r11,r3 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-7] + ldrhsb r11,[r12,#-3] + strb r1,[r14,#-12] + eor r0,r8,r0,lsr#8 + strb r2,[r14,#-8] + eor r1,r9,r1,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-14] @ load more input + ldrhsb r9,[r12,#-10] + strb r3,[r14,#-4] + eor r2,r10,r2,lsr#8 + strb r0,[r14,#-15] + eor r3,r11,r3,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-6] + ldrhsb r11,[r12,#-2] + strb r1,[r14,#-11] + eor r0,r8,r0,lsr#8 + strb r2,[r14,#-7] + eor r1,r9,r1,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-13] @ load more input + ldrhsb r9,[r12,#-9] + strb r3,[r14,#-3] + eor r2,r10,r2,lsr#8 + strb r0,[r14,#-14] + eor r3,r11,r3,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-5] + ldrhsb r11,[r12,#-1] + strb r1,[r14,#-10] + strb r2,[r14,#-6] + eor r0,r8,r0,lsr#8 + strb r3,[r14,#-2] + eor r1,r9,r1,lsr#8 + strb r0,[r14,#-13] + eor r2,r10,r2,lsr#8 + strb r1,[r14,#-9] + eor r3,r11,r3,lsr#8 + strb r2,[r14,#-5] + strb r3,[r14,#-1] + add r8,sp,#4*(4+8) + ldmia r8,{r8,r9,r10,r11} @ load key material + add r4,r4,r8 @ accumulate key material +# ifdef __thumb2__ + itt hi +# endif + addhi r8,r8,#1 @ next counter value + strhi r8,[sp,#4*(12)] @ save next counter value + add r5,r5,r9 + add r6,r6,r10 +# ifdef __thumb2__ + itete lo +# endif + eorlo r8,r8,r8 @ zero or ... + ldrhsb r8,[r12],#16 @ ... 
load input + eorlo r9,r9,r9 + ldrhsb r9,[r12,#-12] + + add r7,r7,r11 +# ifdef __thumb2__ + itete lo +# endif + eorlo r10,r10,r10 + ldrhsb r10,[r12,#-8] + eorlo r11,r11,r11 + ldrhsb r11,[r12,#-4] + + eor r4,r8,r4 @ xor with input (or zero) + eor r5,r9,r5 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-15] @ load more input + ldrhsb r9,[r12,#-11] + eor r6,r10,r6 + strb r4,[r14],#16 @ store output + eor r7,r11,r7 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-7] + ldrhsb r11,[r12,#-3] + strb r5,[r14,#-12] + eor r4,r8,r4,lsr#8 + strb r6,[r14,#-8] + eor r5,r9,r5,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-14] @ load more input + ldrhsb r9,[r12,#-10] + strb r7,[r14,#-4] + eor r6,r10,r6,lsr#8 + strb r4,[r14,#-15] + eor r7,r11,r7,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-6] + ldrhsb r11,[r12,#-2] + strb r5,[r14,#-11] + eor r4,r8,r4,lsr#8 + strb r6,[r14,#-7] + eor r5,r9,r5,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-13] @ load more input + ldrhsb r9,[r12,#-9] + strb r7,[r14,#-3] + eor r6,r10,r6,lsr#8 + strb r4,[r14,#-14] + eor r7,r11,r7,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-5] + ldrhsb r11,[r12,#-1] + strb r5,[r14,#-10] + strb r6,[r14,#-6] + eor r4,r8,r4,lsr#8 + strb r7,[r14,#-2] + eor r5,r9,r5,lsr#8 + strb r4,[r14,#-13] + eor r6,r10,r6,lsr#8 + strb r5,[r14,#-9] + eor r7,r11,r7,lsr#8 + strb r6,[r14,#-5] + strb r7,[r14,#-1] +# ifdef __thumb2__ + it ne +# endif + ldrne r8,[sp,#4*(32+2)] @ re-load len +# ifdef __thumb2__ + it hs +# endif + subhs r11,r8,#64 @ len-=64 + bhi Loop_outer + + beq Ldone +#endif + +Ltail: + ldr r12,[sp,#4*(32+1)] @ load inp + add r9,sp,#4*(0) + ldr r14,[sp,#4*(32+0)] @ load out + +Loop_tail: + ldrb r10,[r9],#1 @ read buffer on stack + ldrb r11,[r12],#1 @ read input + subs r8,r8,#1 + eor r11,r11,r10 + strb r11,[r14],#1 @ store output + bne Loop_tail + +Ldone: + add sp,sp,#4*(32+3) +Lno_data: + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} + +#if __ARM_MAX_ARCH__>=7 + + + +#ifdef __thumb2__ +.thumb_func ChaCha20_neon +#endif +.align 5 +ChaCha20_neon: + ldr r12,[sp,#0] @ pull pointer to counter and nonce + stmdb sp!,{r0,r1,r2,r4-r11,lr} +LChaCha20_neon: + adr r14,Lsigma + vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI spec says so + stmdb sp!,{r0,r1,r2,r3} + + vld1.32 {q1,q2},[r3] @ load key + ldmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} @ load key + + sub sp,sp,#4*(16+16) + vld1.32 {q3},[r12] @ load counter and nonce + add r12,sp,#4*8 + ldmia r14,{r0,r1,r2,r3} @ load sigma + vld1.32 {q0},[r14]! 
@ load sigma + vld1.32 {q12},[r14] @ one + vst1.32 {q2,q3},[r12] @ copy 1/2key|counter|nonce + vst1.32 {q0,q1},[sp] @ copy sigma|1/2key + + str r10,[sp,#4*(16+10)] @ off-load "rx" + str r11,[sp,#4*(16+11)] @ off-load "rx" + vshl.i32 d26,d24,#1 @ two + vstr d24,[sp,#4*(16+0)] + vshl.i32 d28,d24,#2 @ four + vstr d26,[sp,#4*(16+2)] + vmov q4,q0 + vstr d28,[sp,#4*(16+4)] + vmov q8,q0 + vmov q5,q1 + vmov q9,q1 + b Loop_neon_enter + +.align 4 +Loop_neon_outer: + ldmia sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ load key material + cmp r11,#64*2 @ if len<=64*2 + bls Lbreak_neon @ switch to integer-only + vmov q4,q0 + str r11,[sp,#4*(32+2)] @ save len + vmov q8,q0 + str r12, [sp,#4*(32+1)] @ save inp + vmov q5,q1 + str r14, [sp,#4*(32+0)] @ save out + vmov q9,q1 +Loop_neon_enter: + ldr r11, [sp,#4*(15)] + vadd.i32 q7,q3,q12 @ counter+1 + ldr r12,[sp,#4*(12)] @ modulo-scheduled load + vmov q6,q2 + ldr r10, [sp,#4*(13)] + vmov q10,q2 + ldr r14,[sp,#4*(14)] + vadd.i32 q11,q7,q12 @ counter+2 + str r11, [sp,#4*(16+15)] + mov r11,#10 + add r12,r12,#3 @ counter+3 + b Loop_neon + +.align 4 +Loop_neon: + subs r11,r11,#1 + vadd.i32 q0,q0,q1 + add r0,r0,r4 + vadd.i32 q4,q4,q5 + mov r12,r12,ror#16 + vadd.i32 q8,q8,q9 + add r1,r1,r5 + veor q3,q3,q0 + mov r10,r10,ror#16 + veor q7,q7,q4 + eor r12,r12,r0,ror#16 + veor q11,q11,q8 + eor r10,r10,r1,ror#16 + vrev32.16 q3,q3 + add r8,r8,r12 + vrev32.16 q7,q7 + mov r4,r4,ror#20 + vrev32.16 q11,q11 + add r9,r9,r10 + vadd.i32 q2,q2,q3 + mov r5,r5,ror#20 + vadd.i32 q6,q6,q7 + eor r4,r4,r8,ror#20 + vadd.i32 q10,q10,q11 + eor r5,r5,r9,ror#20 + veor q12,q1,q2 + add r0,r0,r4 + veor q13,q5,q6 + mov r12,r12,ror#24 + veor q14,q9,q10 + add r1,r1,r5 + vshr.u32 q1,q12,#20 + mov r10,r10,ror#24 + vshr.u32 q5,q13,#20 + eor r12,r12,r0,ror#24 + vshr.u32 q9,q14,#20 + eor r10,r10,r1,ror#24 + vsli.32 q1,q12,#12 + add r8,r8,r12 + vsli.32 q5,q13,#12 + mov r4,r4,ror#25 + vsli.32 q9,q14,#12 + add r9,r9,r10 + vadd.i32 q0,q0,q1 + mov r5,r5,ror#25 + vadd.i32 q4,q4,q5 + str r10,[sp,#4*(16+13)] + vadd.i32 q8,q8,q9 + ldr r10,[sp,#4*(16+15)] + veor q12,q3,q0 + eor r4,r4,r8,ror#25 + veor q13,q7,q4 + eor r5,r5,r9,ror#25 + veor q14,q11,q8 + str r8,[sp,#4*(16+8)] + vshr.u32 q3,q12,#24 + ldr r8,[sp,#4*(16+10)] + vshr.u32 q7,q13,#24 + add r2,r2,r6 + vshr.u32 q11,q14,#24 + mov r14,r14,ror#16 + vsli.32 q3,q12,#8 + str r9,[sp,#4*(16+9)] + vsli.32 q7,q13,#8 + ldr r9,[sp,#4*(16+11)] + vsli.32 q11,q14,#8 + add r3,r3,r7 + vadd.i32 q2,q2,q3 + mov r10,r10,ror#16 + vadd.i32 q6,q6,q7 + eor r14,r14,r2,ror#16 + vadd.i32 q10,q10,q11 + eor r10,r10,r3,ror#16 + veor q12,q1,q2 + add r8,r8,r14 + veor q13,q5,q6 + mov r6,r6,ror#20 + veor q14,q9,q10 + add r9,r9,r10 + vshr.u32 q1,q12,#25 + mov r7,r7,ror#20 + vshr.u32 q5,q13,#25 + eor r6,r6,r8,ror#20 + vshr.u32 q9,q14,#25 + eor r7,r7,r9,ror#20 + vsli.32 q1,q12,#7 + add r2,r2,r6 + vsli.32 q5,q13,#7 + mov r14,r14,ror#24 + vsli.32 q9,q14,#7 + add r3,r3,r7 + vext.8 q2,q2,q2,#8 + mov r10,r10,ror#24 + vext.8 q6,q6,q6,#8 + eor r14,r14,r2,ror#24 + vext.8 q10,q10,q10,#8 + eor r10,r10,r3,ror#24 + vext.8 q1,q1,q1,#4 + add r8,r8,r14 + vext.8 q5,q5,q5,#4 + mov r6,r6,ror#25 + vext.8 q9,q9,q9,#4 + add r9,r9,r10 + vext.8 q3,q3,q3,#12 + mov r7,r7,ror#25 + vext.8 q7,q7,q7,#12 + eor r6,r6,r8,ror#25 + vext.8 q11,q11,q11,#12 + eor r7,r7,r9,ror#25 + vadd.i32 q0,q0,q1 + add r0,r0,r5 + vadd.i32 q4,q4,q5 + mov r10,r10,ror#16 + vadd.i32 q8,q8,q9 + add r1,r1,r6 + veor q3,q3,q0 + mov r12,r12,ror#16 + veor q7,q7,q4 + eor r10,r10,r0,ror#16 + veor q11,q11,q8 + eor r12,r12,r1,ror#16 + vrev32.16 q3,q3 + add r8,r8,r10 + 
vrev32.16 q7,q7 + mov r5,r5,ror#20 + vrev32.16 q11,q11 + add r9,r9,r12 + vadd.i32 q2,q2,q3 + mov r6,r6,ror#20 + vadd.i32 q6,q6,q7 + eor r5,r5,r8,ror#20 + vadd.i32 q10,q10,q11 + eor r6,r6,r9,ror#20 + veor q12,q1,q2 + add r0,r0,r5 + veor q13,q5,q6 + mov r10,r10,ror#24 + veor q14,q9,q10 + add r1,r1,r6 + vshr.u32 q1,q12,#20 + mov r12,r12,ror#24 + vshr.u32 q5,q13,#20 + eor r10,r10,r0,ror#24 + vshr.u32 q9,q14,#20 + eor r12,r12,r1,ror#24 + vsli.32 q1,q12,#12 + add r8,r8,r10 + vsli.32 q5,q13,#12 + mov r5,r5,ror#25 + vsli.32 q9,q14,#12 + str r10,[sp,#4*(16+15)] + vadd.i32 q0,q0,q1 + ldr r10,[sp,#4*(16+13)] + vadd.i32 q4,q4,q5 + add r9,r9,r12 + vadd.i32 q8,q8,q9 + mov r6,r6,ror#25 + veor q12,q3,q0 + eor r5,r5,r8,ror#25 + veor q13,q7,q4 + eor r6,r6,r9,ror#25 + veor q14,q11,q8 + str r8,[sp,#4*(16+10)] + vshr.u32 q3,q12,#24 + ldr r8,[sp,#4*(16+8)] + vshr.u32 q7,q13,#24 + add r2,r2,r7 + vshr.u32 q11,q14,#24 + mov r10,r10,ror#16 + vsli.32 q3,q12,#8 + str r9,[sp,#4*(16+11)] + vsli.32 q7,q13,#8 + ldr r9,[sp,#4*(16+9)] + vsli.32 q11,q14,#8 + add r3,r3,r4 + vadd.i32 q2,q2,q3 + mov r14,r14,ror#16 + vadd.i32 q6,q6,q7 + eor r10,r10,r2,ror#16 + vadd.i32 q10,q10,q11 + eor r14,r14,r3,ror#16 + veor q12,q1,q2 + add r8,r8,r10 + veor q13,q5,q6 + mov r7,r7,ror#20 + veor q14,q9,q10 + add r9,r9,r14 + vshr.u32 q1,q12,#25 + mov r4,r4,ror#20 + vshr.u32 q5,q13,#25 + eor r7,r7,r8,ror#20 + vshr.u32 q9,q14,#25 + eor r4,r4,r9,ror#20 + vsli.32 q1,q12,#7 + add r2,r2,r7 + vsli.32 q5,q13,#7 + mov r10,r10,ror#24 + vsli.32 q9,q14,#7 + add r3,r3,r4 + vext.8 q2,q2,q2,#8 + mov r14,r14,ror#24 + vext.8 q6,q6,q6,#8 + eor r10,r10,r2,ror#24 + vext.8 q10,q10,q10,#8 + eor r14,r14,r3,ror#24 + vext.8 q1,q1,q1,#12 + add r8,r8,r10 + vext.8 q5,q5,q5,#12 + mov r7,r7,ror#25 + vext.8 q9,q9,q9,#12 + add r9,r9,r14 + vext.8 q3,q3,q3,#4 + mov r4,r4,ror#25 + vext.8 q7,q7,q7,#4 + eor r7,r7,r8,ror#25 + vext.8 q11,q11,q11,#4 + eor r4,r4,r9,ror#25 + bne Loop_neon + + add r11,sp,#32 + vld1.32 {q12,q13},[sp] @ load key material + vld1.32 {q14,q15},[r11] + + ldr r11,[sp,#4*(32+2)] @ load len + + str r8, [sp,#4*(16+8)] @ modulo-scheduled store + str r9, [sp,#4*(16+9)] + str r12,[sp,#4*(16+12)] + str r10, [sp,#4*(16+13)] + str r14,[sp,#4*(16+14)] + + @ at this point we have first half of 512-bit result in + @ rx and second half at sp+4*(16+8) + + ldr r12,[sp,#4*(32+1)] @ load inp + ldr r14,[sp,#4*(32+0)] @ load out + + vadd.i32 q0,q0,q12 @ accumulate key material + vadd.i32 q4,q4,q12 + vadd.i32 q8,q8,q12 + vldr d24,[sp,#4*(16+0)] @ one + + vadd.i32 q1,q1,q13 + vadd.i32 q5,q5,q13 + vadd.i32 q9,q9,q13 + vldr d26,[sp,#4*(16+2)] @ two + + vadd.i32 q2,q2,q14 + vadd.i32 q6,q6,q14 + vadd.i32 q10,q10,q14 + vadd.i32 d14,d14,d24 @ counter+1 + vadd.i32 d22,d22,d26 @ counter+2 + + vadd.i32 q3,q3,q15 + vadd.i32 q7,q7,q15 + vadd.i32 q11,q11,q15 + + cmp r11,#64*4 + blo Ltail_neon + + vld1.8 {q12,q13},[r12]! @ load input + mov r11,sp + vld1.8 {q14,q15},[r12]! + veor q0,q0,q12 @ xor with input + veor q1,q1,q13 + vld1.8 {q12,q13},[r12]! + veor q2,q2,q14 + veor q3,q3,q15 + vld1.8 {q14,q15},[r12]! + + veor q4,q4,q12 + vst1.8 {q0,q1},[r14]! @ store output + veor q5,q5,q13 + vld1.8 {q12,q13},[r12]! + veor q6,q6,q14 + vst1.8 {q2,q3},[r14]! + veor q7,q7,q15 + vld1.8 {q14,q15},[r12]! + + veor q8,q8,q12 + vld1.32 {q0,q1},[r11]! @ load for next iteration + veor d25,d25,d25 + vldr d24,[sp,#4*(16+4)] @ four + veor q9,q9,q13 + vld1.32 {q2,q3},[r11] + veor q10,q10,q14 + vst1.8 {q4,q5},[r14]! + veor q11,q11,q15 + vst1.8 {q6,q7},[r14]! 
+ + vadd.i32 d6,d6,d24 @ next counter value + vldr d24,[sp,#4*(16+0)] @ one + + ldmia sp,{r8,r9,r10,r11} @ load key material + add r0,r0,r8 @ accumulate key material + ldr r8,[r12],#16 @ load input + vst1.8 {q8,q9},[r14]! + add r1,r1,r9 + ldr r9,[r12,#-12] + vst1.8 {q10,q11},[r14]! + add r2,r2,r10 + ldr r10,[r12,#-8] + add r3,r3,r11 + ldr r11,[r12,#-4] +# ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +# endif + eor r0,r0,r8 @ xor with input + add r8,sp,#4*(4) + eor r1,r1,r9 + str r0,[r14],#16 @ store output + eor r2,r2,r10 + str r1,[r14,#-12] + eor r3,r3,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + str r2,[r14,#-8] + str r3,[r14,#-4] + + add r4,r4,r8 @ accumulate key material + ldr r8,[r12],#16 @ load input + add r5,r5,r9 + ldr r9,[r12,#-12] + add r6,r6,r10 + ldr r10,[r12,#-8] + add r7,r7,r11 + ldr r11,[r12,#-4] +# ifdef __ARMEB__ + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif + eor r4,r4,r8 + add r8,sp,#4*(8) + eor r5,r5,r9 + str r4,[r14],#16 @ store output + eor r6,r6,r10 + str r5,[r14,#-12] + eor r7,r7,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + str r6,[r14,#-8] + add r0,sp,#4*(16+8) + str r7,[r14,#-4] + + ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half + + add r0,r0,r8 @ accumulate key material + ldr r8,[r12],#16 @ load input + add r1,r1,r9 + ldr r9,[r12,#-12] +# ifdef __thumb2__ + it hi +# endif + strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it + add r2,r2,r10 + ldr r10,[r12,#-8] +# ifdef __thumb2__ + it hi +# endif + strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it + add r3,r3,r11 + ldr r11,[r12,#-4] +# ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +# endif + eor r0,r0,r8 + add r8,sp,#4*(12) + eor r1,r1,r9 + str r0,[r14],#16 @ store output + eor r2,r2,r10 + str r1,[r14,#-12] + eor r3,r3,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + str r2,[r14,#-8] + str r3,[r14,#-4] + + add r4,r4,r8 @ accumulate key material + add r8,r8,#4 @ next counter value + add r5,r5,r9 + str r8,[sp,#4*(12)] @ save next counter value + ldr r8,[r12],#16 @ load input + add r6,r6,r10 + add r4,r4,#3 @ counter+3 + ldr r9,[r12,#-12] + add r7,r7,r11 + ldr r10,[r12,#-8] + ldr r11,[r12,#-4] +# ifdef __ARMEB__ + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif + eor r4,r4,r8 +# ifdef __thumb2__ + it hi +# endif + ldrhi r8,[sp,#4*(32+2)] @ re-load len + eor r5,r5,r9 + eor r6,r6,r10 + str r4,[r14],#16 @ store output + eor r7,r7,r11 + str r5,[r14,#-12] + sub r11,r8,#64*4 @ len-=64*4 + str r6,[r14,#-8] + str r7,[r14,#-4] + bhi Loop_neon_outer + + b Ldone_neon + +.align 4 +Lbreak_neon: + @ harmonize NEON and integer-only stack frames: load data + @ from NEON frame, but save to integer-only one; distance + @ between the two is 4*(32+4+16-32)=4*(20). + + str r11, [sp,#4*(20+32+2)] @ save len + add r11,sp,#4*(32+4) + str r12, [sp,#4*(20+32+1)] @ save inp + str r14, [sp,#4*(20+32+0)] @ save out + + ldr r12,[sp,#4*(16+10)] + ldr r14,[sp,#4*(16+11)] + vldmia r11,{d8,d9,d10,d11,d12,d13,d14,d15} @ fulfill ABI requirement + str r12,[sp,#4*(20+16+10)] @ copy "rx" + str r14,[sp,#4*(20+16+11)] @ copy "rx" + + ldr r11, [sp,#4*(15)] + ldr r12,[sp,#4*(12)] @ modulo-scheduled load + ldr r10, [sp,#4*(13)] + ldr r14,[sp,#4*(14)] + str r11, [sp,#4*(20+16+15)] + add r11,sp,#4*(20) + vst1.32 {q0,q1},[r11]! 
@ copy key + add sp,sp,#4*(20) @ switch frame + vst1.32 {q2,q3},[r11] + mov r11,#10 + b Loop @ go integer-only + +.align 4 +Ltail_neon: + cmp r11,#64*3 + bhs L192_or_more_neon + cmp r11,#64*2 + bhs L128_or_more_neon + cmp r11,#64*1 + bhs L64_or_more_neon + + add r8,sp,#4*(8) + vst1.8 {q0,q1},[sp] + add r10,sp,#4*(0) + vst1.8 {q2,q3},[r8] + b Loop_tail_neon + +.align 4 +L64_or_more_neon: + vld1.8 {q12,q13},[r12]! + vld1.8 {q14,q15},[r12]! + veor q0,q0,q12 + veor q1,q1,q13 + veor q2,q2,q14 + veor q3,q3,q15 + vst1.8 {q0,q1},[r14]! + vst1.8 {q2,q3},[r14]! + + beq Ldone_neon + + add r8,sp,#4*(8) + vst1.8 {q4,q5},[sp] + add r10,sp,#4*(0) + vst1.8 {q6,q7},[r8] + sub r11,r11,#64*1 @ len-=64*1 + b Loop_tail_neon + +.align 4 +L128_or_more_neon: + vld1.8 {q12,q13},[r12]! + vld1.8 {q14,q15},[r12]! + veor q0,q0,q12 + veor q1,q1,q13 + vld1.8 {q12,q13},[r12]! + veor q2,q2,q14 + veor q3,q3,q15 + vld1.8 {q14,q15},[r12]! + + veor q4,q4,q12 + veor q5,q5,q13 + vst1.8 {q0,q1},[r14]! + veor q6,q6,q14 + vst1.8 {q2,q3},[r14]! + veor q7,q7,q15 + vst1.8 {q4,q5},[r14]! + vst1.8 {q6,q7},[r14]! + + beq Ldone_neon + + add r8,sp,#4*(8) + vst1.8 {q8,q9},[sp] + add r10,sp,#4*(0) + vst1.8 {q10,q11},[r8] + sub r11,r11,#64*2 @ len-=64*2 + b Loop_tail_neon + +.align 4 +L192_or_more_neon: + vld1.8 {q12,q13},[r12]! + vld1.8 {q14,q15},[r12]! + veor q0,q0,q12 + veor q1,q1,q13 + vld1.8 {q12,q13},[r12]! + veor q2,q2,q14 + veor q3,q3,q15 + vld1.8 {q14,q15},[r12]! + + veor q4,q4,q12 + veor q5,q5,q13 + vld1.8 {q12,q13},[r12]! + veor q6,q6,q14 + vst1.8 {q0,q1},[r14]! + veor q7,q7,q15 + vld1.8 {q14,q15},[r12]! + + veor q8,q8,q12 + vst1.8 {q2,q3},[r14]! + veor q9,q9,q13 + vst1.8 {q4,q5},[r14]! + veor q10,q10,q14 + vst1.8 {q6,q7},[r14]! + veor q11,q11,q15 + vst1.8 {q8,q9},[r14]! + vst1.8 {q10,q11},[r14]! 
+ + beq Ldone_neon + + ldmia sp,{r8,r9,r10,r11} @ load key material + add r0,r0,r8 @ accumulate key material + add r8,sp,#4*(4) + add r1,r1,r9 + add r2,r2,r10 + add r3,r3,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + + add r4,r4,r8 @ accumulate key material + add r8,sp,#4*(8) + add r5,r5,r9 + add r6,r6,r10 + add r7,r7,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material +# ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif + stmia sp,{r0,r1,r2,r3,r4,r5,r6,r7} + add r0,sp,#4*(16+8) + + ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half + + add r0,r0,r8 @ accumulate key material + add r8,sp,#4*(12) + add r1,r1,r9 + add r2,r2,r10 + add r3,r3,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + + add r4,r4,r8 @ accumulate key material + add r8,sp,#4*(8) + add r5,r5,r9 + add r4,r4,#3 @ counter+3 + add r6,r6,r10 + add r7,r7,r11 + ldr r11,[sp,#4*(32+2)] @ re-load len +# ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif + stmia r8,{r0,r1,r2,r3,r4,r5,r6,r7} + add r10,sp,#4*(0) + sub r11,r11,#64*3 @ len-=64*3 + +Loop_tail_neon: + ldrb r8,[r10],#1 @ read buffer on stack + ldrb r9,[r12],#1 @ read input + subs r11,r11,#1 + eor r8,r8,r9 + strb r8,[r14],#1 @ store output + bne Loop_tail_neon + +Ldone_neon: + add sp,sp,#4*(32+4) + vldmia sp,{d8,d9,d10,d11,d12,d13,d14,d15} + add sp,sp,#4*(16+3) + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} + +.comm _OPENSSL_armcap_P,4 +.non_lazy_symbol_pointer +OPENSSL_armcap_P: +.indirect_symbol _OPENSSL_armcap_P +.long 0 +#endif +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/aes-armv4.S b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/aes-armv4.S new file mode 100644 index 0000000000..63e2ec7163 --- /dev/null +++ b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/aes-armv4.S @@ -0,0 +1,1233 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +@ +@ Licensed under the OpenSSL license (the "License"). You may not use +@ this file except in compliance with the License. You can obtain a copy +@ in the file LICENSE in the source distribution or at +@ https://www.openssl.org/source/license.html + + +@ ==================================================================== +@ Written by Andy Polyakov for the OpenSSL +@ project. The module is, however, dual licensed under OpenSSL and +@ CRYPTOGAMS licenses depending on where you obtain it. For further +@ details see http://www.openssl.org/~appro/cryptogams/. +@ ==================================================================== + +@ AES for ARMv4 + +@ January 2007. +@ +@ Code uses single 1K S-box and is >2 times faster than code generated +@ by gcc-3.4.1. This is thanks to unique feature of ARMv4 ISA, which +@ allows to merge logical or arithmetic operation with shift or rotate +@ in one instruction and emit combined result every cycle. The module +@ is endian-neutral. The performance is ~42 cycles/byte for 128-bit +@ key [on single-issue Xscale PXA250 core]. + +@ May 2007. +@ +@ AES_set_[en|de]crypt_key is added. + +@ July 2010. 
+@ +@ Rescheduling for dual-issue pipeline resulted in 12% improvement on +@ Cortex A8 core and ~25 cycles per byte processed with 128-bit key. + +@ February 2011. +@ +@ Profiler-assisted and platform-specific optimization resulted in 16% +@ improvement on Cortex A8 core and ~21.5 cycles per byte. + +#ifndef __KERNEL__ +# include +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +#endif + +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 AES +@ instructions are in aesv8-armx.pl.) + + +.text +#if defined(__thumb2__) && !defined(__APPLE__) +.syntax unified +.thumb +#else +.code 32 +#undef __thumb2__ +#endif + + +.align 5 +AES_Te: +.word 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d +.word 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554 +.word 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d +.word 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a +.word 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87 +.word 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b +.word 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea +.word 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b +.word 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a +.word 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f +.word 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108 +.word 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f +.word 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e +.word 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5 +.word 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d +.word 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f +.word 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e +.word 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb +.word 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce +.word 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497 +.word 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c +.word 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed +.word 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b +.word 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a +.word 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16 +.word 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594 +.word 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81 +.word 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3 +.word 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a +.word 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504 +.word 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163 +.word 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d +.word 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f +.word 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739 +.word 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47 +.word 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395 +.word 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f +.word 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883 +.word 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c +.word 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76 +.word 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e +.word 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4 +.word 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6 +.word 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b +.word 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7 +.word 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0 +.word 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25 +.word 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818 +.word 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72 +.word 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651 +.word 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21 +.word 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85 +.word 
0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa +.word 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12 +.word 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0 +.word 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9 +.word 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133 +.word 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7 +.word 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920 +.word 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a +.word 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17 +.word 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8 +.word 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11 +.word 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a +@ Te4[256] +.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 +.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 +.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 +.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 +.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc +.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 +.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a +.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 +.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 +.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 +.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b +.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf +.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 +.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 +.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 +.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 +.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 +.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 +.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 +.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb +.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c +.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 +.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 +.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 +.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 +.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a +.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e +.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e +.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 +.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf +.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 +.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 +@ rcon[] +.word 0x01000000, 0x02000000, 0x04000000, 0x08000000 +.word 0x10000000, 0x20000000, 0x40000000, 0x80000000 +.word 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0 + + +@ void aes_nohw_encrypt(const unsigned char *in, unsigned char *out, +@ const AES_KEY *key) { +.globl _aes_nohw_encrypt +.private_extern _aes_nohw_encrypt +#ifdef __thumb2__ +.thumb_func _aes_nohw_encrypt +#endif +.align 5 +_aes_nohw_encrypt: +#ifndef __thumb2__ + sub r3,pc,#8 @ _aes_nohw_encrypt +#else + adr r3,. +#endif + stmdb sp!,{r1,r4-r12,lr} +#if defined(__thumb2__) || defined(__APPLE__) + adr r10,AES_Te +#else + sub r10,r3,#_aes_nohw_encrypt-AES_Te @ Te +#endif + mov r12,r0 @ inp + mov r11,r2 +#if __ARM_ARCH__<7 + ldrb r0,[r12,#3] @ load input data in endian-neutral + ldrb r4,[r12,#2] @ manner... 
+ ldrb r5,[r12,#1] + ldrb r6,[r12,#0] + orr r0,r0,r4,lsl#8 + ldrb r1,[r12,#7] + orr r0,r0,r5,lsl#16 + ldrb r4,[r12,#6] + orr r0,r0,r6,lsl#24 + ldrb r5,[r12,#5] + ldrb r6,[r12,#4] + orr r1,r1,r4,lsl#8 + ldrb r2,[r12,#11] + orr r1,r1,r5,lsl#16 + ldrb r4,[r12,#10] + orr r1,r1,r6,lsl#24 + ldrb r5,[r12,#9] + ldrb r6,[r12,#8] + orr r2,r2,r4,lsl#8 + ldrb r3,[r12,#15] + orr r2,r2,r5,lsl#16 + ldrb r4,[r12,#14] + orr r2,r2,r6,lsl#24 + ldrb r5,[r12,#13] + ldrb r6,[r12,#12] + orr r3,r3,r4,lsl#8 + orr r3,r3,r5,lsl#16 + orr r3,r3,r6,lsl#24 +#else + ldr r0,[r12,#0] + ldr r1,[r12,#4] + ldr r2,[r12,#8] + ldr r3,[r12,#12] +#ifdef __ARMEL__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +#endif +#endif + bl _armv4_AES_encrypt + + ldr r12,[sp],#4 @ pop out +#if __ARM_ARCH__>=7 +#ifdef __ARMEL__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +#endif + str r0,[r12,#0] + str r1,[r12,#4] + str r2,[r12,#8] + str r3,[r12,#12] +#else + mov r4,r0,lsr#24 @ write output in endian-neutral + mov r5,r0,lsr#16 @ manner... + mov r6,r0,lsr#8 + strb r4,[r12,#0] + strb r5,[r12,#1] + mov r4,r1,lsr#24 + strb r6,[r12,#2] + mov r5,r1,lsr#16 + strb r0,[r12,#3] + mov r6,r1,lsr#8 + strb r4,[r12,#4] + strb r5,[r12,#5] + mov r4,r2,lsr#24 + strb r6,[r12,#6] + mov r5,r2,lsr#16 + strb r1,[r12,#7] + mov r6,r2,lsr#8 + strb r4,[r12,#8] + strb r5,[r12,#9] + mov r4,r3,lsr#24 + strb r6,[r12,#10] + mov r5,r3,lsr#16 + strb r2,[r12,#11] + mov r6,r3,lsr#8 + strb r4,[r12,#12] + strb r5,[r12,#13] + strb r6,[r12,#14] + strb r3,[r12,#15] +#endif +#if __ARM_ARCH__>=5 + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} +#else + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif + + +#ifdef __thumb2__ +.thumb_func _armv4_AES_encrypt +#endif +.align 2 +_armv4_AES_encrypt: + str lr,[sp,#-4]! 
@ push lr + ldmia r11!,{r4,r5,r6,r7} + eor r0,r0,r4 + ldr r12,[r11,#240-16] + eor r1,r1,r5 + eor r2,r2,r6 + eor r3,r3,r7 + sub r12,r12,#1 + mov lr,#255 + + and r7,lr,r0 + and r8,lr,r0,lsr#8 + and r9,lr,r0,lsr#16 + mov r0,r0,lsr#24 +Lenc_loop: + ldr r4,[r10,r7,lsl#2] @ Te3[s0>>0] + and r7,lr,r1,lsr#16 @ i0 + ldr r5,[r10,r8,lsl#2] @ Te2[s0>>8] + and r8,lr,r1 + ldr r6,[r10,r9,lsl#2] @ Te1[s0>>16] + and r9,lr,r1,lsr#8 + ldr r0,[r10,r0,lsl#2] @ Te0[s0>>24] + mov r1,r1,lsr#24 + + ldr r7,[r10,r7,lsl#2] @ Te1[s1>>16] + ldr r8,[r10,r8,lsl#2] @ Te3[s1>>0] + ldr r9,[r10,r9,lsl#2] @ Te2[s1>>8] + eor r0,r0,r7,ror#8 + ldr r1,[r10,r1,lsl#2] @ Te0[s1>>24] + and r7,lr,r2,lsr#8 @ i0 + eor r5,r5,r8,ror#8 + and r8,lr,r2,lsr#16 @ i1 + eor r6,r6,r9,ror#8 + and r9,lr,r2 + ldr r7,[r10,r7,lsl#2] @ Te2[s2>>8] + eor r1,r1,r4,ror#24 + ldr r8,[r10,r8,lsl#2] @ Te1[s2>>16] + mov r2,r2,lsr#24 + + ldr r9,[r10,r9,lsl#2] @ Te3[s2>>0] + eor r0,r0,r7,ror#16 + ldr r2,[r10,r2,lsl#2] @ Te0[s2>>24] + and r7,lr,r3 @ i0 + eor r1,r1,r8,ror#8 + and r8,lr,r3,lsr#8 @ i1 + eor r6,r6,r9,ror#16 + and r9,lr,r3,lsr#16 @ i2 + ldr r7,[r10,r7,lsl#2] @ Te3[s3>>0] + eor r2,r2,r5,ror#16 + ldr r8,[r10,r8,lsl#2] @ Te2[s3>>8] + mov r3,r3,lsr#24 + + ldr r9,[r10,r9,lsl#2] @ Te1[s3>>16] + eor r0,r0,r7,ror#24 + ldr r7,[r11],#16 + eor r1,r1,r8,ror#16 + ldr r3,[r10,r3,lsl#2] @ Te0[s3>>24] + eor r2,r2,r9,ror#8 + ldr r4,[r11,#-12] + eor r3,r3,r6,ror#8 + + ldr r5,[r11,#-8] + eor r0,r0,r7 + ldr r6,[r11,#-4] + and r7,lr,r0 + eor r1,r1,r4 + and r8,lr,r0,lsr#8 + eor r2,r2,r5 + and r9,lr,r0,lsr#16 + eor r3,r3,r6 + mov r0,r0,lsr#24 + + subs r12,r12,#1 + bne Lenc_loop + + add r10,r10,#2 + + ldrb r4,[r10,r7,lsl#2] @ Te4[s0>>0] + and r7,lr,r1,lsr#16 @ i0 + ldrb r5,[r10,r8,lsl#2] @ Te4[s0>>8] + and r8,lr,r1 + ldrb r6,[r10,r9,lsl#2] @ Te4[s0>>16] + and r9,lr,r1,lsr#8 + ldrb r0,[r10,r0,lsl#2] @ Te4[s0>>24] + mov r1,r1,lsr#24 + + ldrb r7,[r10,r7,lsl#2] @ Te4[s1>>16] + ldrb r8,[r10,r8,lsl#2] @ Te4[s1>>0] + ldrb r9,[r10,r9,lsl#2] @ Te4[s1>>8] + eor r0,r7,r0,lsl#8 + ldrb r1,[r10,r1,lsl#2] @ Te4[s1>>24] + and r7,lr,r2,lsr#8 @ i0 + eor r5,r8,r5,lsl#8 + and r8,lr,r2,lsr#16 @ i1 + eor r6,r9,r6,lsl#8 + and r9,lr,r2 + ldrb r7,[r10,r7,lsl#2] @ Te4[s2>>8] + eor r1,r4,r1,lsl#24 + ldrb r8,[r10,r8,lsl#2] @ Te4[s2>>16] + mov r2,r2,lsr#24 + + ldrb r9,[r10,r9,lsl#2] @ Te4[s2>>0] + eor r0,r7,r0,lsl#8 + ldrb r2,[r10,r2,lsl#2] @ Te4[s2>>24] + and r7,lr,r3 @ i0 + eor r1,r1,r8,lsl#16 + and r8,lr,r3,lsr#8 @ i1 + eor r6,r9,r6,lsl#8 + and r9,lr,r3,lsr#16 @ i2 + ldrb r7,[r10,r7,lsl#2] @ Te4[s3>>0] + eor r2,r5,r2,lsl#24 + ldrb r8,[r10,r8,lsl#2] @ Te4[s3>>8] + mov r3,r3,lsr#24 + + ldrb r9,[r10,r9,lsl#2] @ Te4[s3>>16] + eor r0,r7,r0,lsl#8 + ldr r7,[r11,#0] + ldrb r3,[r10,r3,lsl#2] @ Te4[s3>>24] + eor r1,r1,r8,lsl#8 + ldr r4,[r11,#4] + eor r2,r2,r9,lsl#16 + ldr r5,[r11,#8] + eor r3,r6,r3,lsl#24 + ldr r6,[r11,#12] + + eor r0,r0,r7 + eor r1,r1,r4 + eor r2,r2,r5 + eor r3,r3,r6 + + sub r10,r10,#2 + ldr pc,[sp],#4 @ pop and return + + +.globl _aes_nohw_set_encrypt_key +.private_extern _aes_nohw_set_encrypt_key +#ifdef __thumb2__ +.thumb_func _aes_nohw_set_encrypt_key +#endif +.align 5 +_aes_nohw_set_encrypt_key: +_armv4_AES_set_encrypt_key: +#ifndef __thumb2__ + sub r3,pc,#8 @ _aes_nohw_set_encrypt_key +#else + adr r3,. 
+#endif + teq r0,#0 +#ifdef __thumb2__ + itt eq @ Thumb2 thing, sanity check in ARM +#endif + moveq r0,#-1 + beq Labrt + teq r2,#0 +#ifdef __thumb2__ + itt eq @ Thumb2 thing, sanity check in ARM +#endif + moveq r0,#-1 + beq Labrt + + teq r1,#128 + beq Lok + teq r1,#192 + beq Lok + teq r1,#256 +#ifdef __thumb2__ + itt ne @ Thumb2 thing, sanity check in ARM +#endif + movne r0,#-1 + bne Labrt + +Lok: stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + mov r12,r0 @ inp + mov lr,r1 @ bits + mov r11,r2 @ key + +#if defined(__thumb2__) || defined(__APPLE__) + adr r10,AES_Te+1024 @ Te4 +#else + sub r10,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4 +#endif + +#if __ARM_ARCH__<7 + ldrb r0,[r12,#3] @ load input data in endian-neutral + ldrb r4,[r12,#2] @ manner... + ldrb r5,[r12,#1] + ldrb r6,[r12,#0] + orr r0,r0,r4,lsl#8 + ldrb r1,[r12,#7] + orr r0,r0,r5,lsl#16 + ldrb r4,[r12,#6] + orr r0,r0,r6,lsl#24 + ldrb r5,[r12,#5] + ldrb r6,[r12,#4] + orr r1,r1,r4,lsl#8 + ldrb r2,[r12,#11] + orr r1,r1,r5,lsl#16 + ldrb r4,[r12,#10] + orr r1,r1,r6,lsl#24 + ldrb r5,[r12,#9] + ldrb r6,[r12,#8] + orr r2,r2,r4,lsl#8 + ldrb r3,[r12,#15] + orr r2,r2,r5,lsl#16 + ldrb r4,[r12,#14] + orr r2,r2,r6,lsl#24 + ldrb r5,[r12,#13] + ldrb r6,[r12,#12] + orr r3,r3,r4,lsl#8 + str r0,[r11],#16 + orr r3,r3,r5,lsl#16 + str r1,[r11,#-12] + orr r3,r3,r6,lsl#24 + str r2,[r11,#-8] + str r3,[r11,#-4] +#else + ldr r0,[r12,#0] + ldr r1,[r12,#4] + ldr r2,[r12,#8] + ldr r3,[r12,#12] +#ifdef __ARMEL__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +#endif + str r0,[r11],#16 + str r1,[r11,#-12] + str r2,[r11,#-8] + str r3,[r11,#-4] +#endif + + teq lr,#128 + bne Lnot128 + mov r12,#10 + str r12,[r11,#240-16] + add r6,r10,#256 @ rcon + mov lr,#255 + +L128_loop: + and r5,lr,r3,lsr#24 + and r7,lr,r3,lsr#16 + ldrb r5,[r10,r5] + and r8,lr,r3,lsr#8 + ldrb r7,[r10,r7] + and r9,lr,r3 + ldrb r8,[r10,r8] + orr r5,r5,r7,lsl#24 + ldrb r9,[r10,r9] + orr r5,r5,r8,lsl#16 + ldr r4,[r6],#4 @ rcon[i++] + orr r5,r5,r9,lsl#8 + eor r5,r5,r4 + eor r0,r0,r5 @ rk[4]=rk[0]^... + eor r1,r1,r0 @ rk[5]=rk[1]^rk[4] + str r0,[r11],#16 + eor r2,r2,r1 @ rk[6]=rk[2]^rk[5] + str r1,[r11,#-12] + eor r3,r3,r2 @ rk[7]=rk[3]^rk[6] + str r2,[r11,#-8] + subs r12,r12,#1 + str r3,[r11,#-4] + bne L128_loop + sub r2,r11,#176 + b Ldone + +Lnot128: +#if __ARM_ARCH__<7 + ldrb r8,[r12,#19] + ldrb r4,[r12,#18] + ldrb r5,[r12,#17] + ldrb r6,[r12,#16] + orr r8,r8,r4,lsl#8 + ldrb r9,[r12,#23] + orr r8,r8,r5,lsl#16 + ldrb r4,[r12,#22] + orr r8,r8,r6,lsl#24 + ldrb r5,[r12,#21] + ldrb r6,[r12,#20] + orr r9,r9,r4,lsl#8 + orr r9,r9,r5,lsl#16 + str r8,[r11],#8 + orr r9,r9,r6,lsl#24 + str r9,[r11,#-4] +#else + ldr r8,[r12,#16] + ldr r9,[r12,#20] +#ifdef __ARMEL__ + rev r8,r8 + rev r9,r9 +#endif + str r8,[r11],#8 + str r9,[r11,#-4] +#endif + + teq lr,#192 + bne Lnot192 + mov r12,#12 + str r12,[r11,#240-24] + add r6,r10,#256 @ rcon + mov lr,#255 + mov r12,#8 + +L192_loop: + and r5,lr,r9,lsr#24 + and r7,lr,r9,lsr#16 + ldrb r5,[r10,r5] + and r8,lr,r9,lsr#8 + ldrb r7,[r10,r7] + and r9,lr,r9 + ldrb r8,[r10,r8] + orr r5,r5,r7,lsl#24 + ldrb r9,[r10,r9] + orr r5,r5,r8,lsl#16 + ldr r4,[r6],#4 @ rcon[i++] + orr r5,r5,r9,lsl#8 + eor r9,r5,r4 + eor r0,r0,r9 @ rk[6]=rk[0]^... 
+ eor r1,r1,r0 @ rk[7]=rk[1]^rk[6] + str r0,[r11],#24 + eor r2,r2,r1 @ rk[8]=rk[2]^rk[7] + str r1,[r11,#-20] + eor r3,r3,r2 @ rk[9]=rk[3]^rk[8] + str r2,[r11,#-16] + subs r12,r12,#1 + str r3,[r11,#-12] +#ifdef __thumb2__ + itt eq @ Thumb2 thing, sanity check in ARM +#endif + subeq r2,r11,#216 + beq Ldone + + ldr r7,[r11,#-32] + ldr r8,[r11,#-28] + eor r7,r7,r3 @ rk[10]=rk[4]^rk[9] + eor r9,r8,r7 @ rk[11]=rk[5]^rk[10] + str r7,[r11,#-8] + str r9,[r11,#-4] + b L192_loop + +Lnot192: +#if __ARM_ARCH__<7 + ldrb r8,[r12,#27] + ldrb r4,[r12,#26] + ldrb r5,[r12,#25] + ldrb r6,[r12,#24] + orr r8,r8,r4,lsl#8 + ldrb r9,[r12,#31] + orr r8,r8,r5,lsl#16 + ldrb r4,[r12,#30] + orr r8,r8,r6,lsl#24 + ldrb r5,[r12,#29] + ldrb r6,[r12,#28] + orr r9,r9,r4,lsl#8 + orr r9,r9,r5,lsl#16 + str r8,[r11],#8 + orr r9,r9,r6,lsl#24 + str r9,[r11,#-4] +#else + ldr r8,[r12,#24] + ldr r9,[r12,#28] +#ifdef __ARMEL__ + rev r8,r8 + rev r9,r9 +#endif + str r8,[r11],#8 + str r9,[r11,#-4] +#endif + + mov r12,#14 + str r12,[r11,#240-32] + add r6,r10,#256 @ rcon + mov lr,#255 + mov r12,#7 + +L256_loop: + and r5,lr,r9,lsr#24 + and r7,lr,r9,lsr#16 + ldrb r5,[r10,r5] + and r8,lr,r9,lsr#8 + ldrb r7,[r10,r7] + and r9,lr,r9 + ldrb r8,[r10,r8] + orr r5,r5,r7,lsl#24 + ldrb r9,[r10,r9] + orr r5,r5,r8,lsl#16 + ldr r4,[r6],#4 @ rcon[i++] + orr r5,r5,r9,lsl#8 + eor r9,r5,r4 + eor r0,r0,r9 @ rk[8]=rk[0]^... + eor r1,r1,r0 @ rk[9]=rk[1]^rk[8] + str r0,[r11],#32 + eor r2,r2,r1 @ rk[10]=rk[2]^rk[9] + str r1,[r11,#-28] + eor r3,r3,r2 @ rk[11]=rk[3]^rk[10] + str r2,[r11,#-24] + subs r12,r12,#1 + str r3,[r11,#-20] +#ifdef __thumb2__ + itt eq @ Thumb2 thing, sanity check in ARM +#endif + subeq r2,r11,#256 + beq Ldone + + and r5,lr,r3 + and r7,lr,r3,lsr#8 + ldrb r5,[r10,r5] + and r8,lr,r3,lsr#16 + ldrb r7,[r10,r7] + and r9,lr,r3,lsr#24 + ldrb r8,[r10,r8] + orr r5,r5,r7,lsl#8 + ldrb r9,[r10,r9] + orr r5,r5,r8,lsl#16 + ldr r4,[r11,#-48] + orr r5,r5,r9,lsl#24 + + ldr r7,[r11,#-44] + ldr r8,[r11,#-40] + eor r4,r4,r5 @ rk[12]=rk[4]^... + ldr r9,[r11,#-36] + eor r7,r7,r4 @ rk[13]=rk[5]^rk[12] + str r4,[r11,#-16] + eor r8,r8,r7 @ rk[14]=rk[6]^rk[13] + str r7,[r11,#-12] + eor r9,r9,r8 @ rk[15]=rk[7]^rk[14] + str r8,[r11,#-8] + str r9,[r11,#-4] + b L256_loop + +.align 2 +Ldone: mov r0,#0 + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} +Labrt: +#if __ARM_ARCH__>=5 + bx lr @ .word 0xe12fff1e +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif + + +.globl _aes_nohw_set_decrypt_key +.private_extern _aes_nohw_set_decrypt_key +#ifdef __thumb2__ +.thumb_func _aes_nohw_set_decrypt_key +#endif +.align 5 +_aes_nohw_set_decrypt_key: + str lr,[sp,#-4]! 
@ push lr + bl _armv4_AES_set_encrypt_key + teq r0,#0 + ldr lr,[sp],#4 @ pop lr + bne Labrt + + mov r0,r2 @ _aes_nohw_set_encrypt_key preserves r2, + mov r1,r2 @ which is AES_KEY *key + b _armv4_AES_set_enc2dec_key + + +@ void AES_set_enc2dec_key(const AES_KEY *inp,AES_KEY *out) +.globl _AES_set_enc2dec_key +.private_extern _AES_set_enc2dec_key +#ifdef __thumb2__ +.thumb_func _AES_set_enc2dec_key +#endif +.align 5 +_AES_set_enc2dec_key: +_armv4_AES_set_enc2dec_key: + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + + ldr r12,[r0,#240] + mov r7,r0 @ input + add r8,r0,r12,lsl#4 + mov r11,r1 @ output + add r10,r1,r12,lsl#4 + str r12,[r1,#240] + +Linv: ldr r0,[r7],#16 + ldr r1,[r7,#-12] + ldr r2,[r7,#-8] + ldr r3,[r7,#-4] + ldr r4,[r8],#-16 + ldr r5,[r8,#16+4] + ldr r6,[r8,#16+8] + ldr r9,[r8,#16+12] + str r0,[r10],#-16 + str r1,[r10,#16+4] + str r2,[r10,#16+8] + str r3,[r10,#16+12] + str r4,[r11],#16 + str r5,[r11,#-12] + str r6,[r11,#-8] + str r9,[r11,#-4] + teq r7,r8 + bne Linv + + ldr r0,[r7] + ldr r1,[r7,#4] + ldr r2,[r7,#8] + ldr r3,[r7,#12] + str r0,[r11] + str r1,[r11,#4] + str r2,[r11,#8] + str r3,[r11,#12] + sub r11,r11,r12,lsl#3 + ldr r0,[r11,#16]! @ prefetch tp1 + mov r7,#0x80 + mov r8,#0x1b + orr r7,r7,#0x8000 + orr r8,r8,#0x1b00 + orr r7,r7,r7,lsl#16 + orr r8,r8,r8,lsl#16 + sub r12,r12,#1 + mvn r9,r7 + mov r12,r12,lsl#2 @ (rounds-1)*4 + +Lmix: and r4,r0,r7 + and r1,r0,r9 + sub r4,r4,r4,lsr#7 + and r4,r4,r8 + eor r1,r4,r1,lsl#1 @ tp2 + + and r4,r1,r7 + and r2,r1,r9 + sub r4,r4,r4,lsr#7 + and r4,r4,r8 + eor r2,r4,r2,lsl#1 @ tp4 + + and r4,r2,r7 + and r3,r2,r9 + sub r4,r4,r4,lsr#7 + and r4,r4,r8 + eor r3,r4,r3,lsl#1 @ tp8 + + eor r4,r1,r2 + eor r5,r0,r3 @ tp9 + eor r4,r4,r3 @ tpe + eor r4,r4,r1,ror#24 + eor r4,r4,r5,ror#24 @ ^= ROTATE(tpb=tp9^tp2,8) + eor r4,r4,r2,ror#16 + eor r4,r4,r5,ror#16 @ ^= ROTATE(tpd=tp9^tp4,16) + eor r4,r4,r5,ror#8 @ ^= ROTATE(tp9,24) + + ldr r0,[r11,#4] @ prefetch tp1 + str r4,[r11],#4 + subs r12,r12,#1 + bne Lmix + + mov r0,#0 +#if __ARM_ARCH__>=5 + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} +#else + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif + + + +.align 5 +AES_Td: +.word 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96 +.word 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393 +.word 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25 +.word 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f +.word 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1 +.word 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6 +.word 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da +.word 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844 +.word 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd +.word 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4 +.word 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45 +.word 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94 +.word 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7 +.word 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a +.word 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5 +.word 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c +.word 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1 +.word 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a +.word 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75 +.word 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051 +.word 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46 +.word 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff +.word 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77 +.word 0xb0e842bd, 0x07898b88, 0xe7195b38, 
0x79c8eedb +.word 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000 +.word 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e +.word 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927 +.word 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a +.word 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e +.word 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16 +.word 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d +.word 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8 +.word 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd +.word 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34 +.word 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163 +.word 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120 +.word 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d +.word 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0 +.word 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422 +.word 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef +.word 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36 +.word 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4 +.word 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662 +.word 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5 +.word 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3 +.word 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b +.word 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8 +.word 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6 +.word 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6 +.word 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0 +.word 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815 +.word 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f +.word 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df +.word 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f +.word 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e +.word 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713 +.word 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89 +.word 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c +.word 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf +.word 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86 +.word 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f +.word 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541 +.word 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190 +.word 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742 +@ Td4[256] +.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 +.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb +.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 +.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb +.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d +.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e +.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 +.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 +.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 +.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 +.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda +.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 +.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a +.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 +.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 +.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b +.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea +.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 +.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 +.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e +.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 +.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b +.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 +.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 +.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 +.byte 0xb1, 0x12, 
0x10, 0x59, 0x27, 0x80, 0xec, 0x5f +.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d +.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef +.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 +.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 +.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 +.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d + + +@ void aes_nohw_decrypt(const unsigned char *in, unsigned char *out, +@ const AES_KEY *key) { +.globl _aes_nohw_decrypt +.private_extern _aes_nohw_decrypt +#ifdef __thumb2__ +.thumb_func _aes_nohw_decrypt +#endif +.align 5 +_aes_nohw_decrypt: +#ifndef __thumb2__ + sub r3,pc,#8 @ _aes_nohw_decrypt +#else + adr r3,. +#endif + stmdb sp!,{r1,r4-r12,lr} +#if defined(__thumb2__) || defined(__APPLE__) + adr r10,AES_Td +#else + sub r10,r3,#_aes_nohw_decrypt-AES_Td @ Td +#endif + mov r12,r0 @ inp + mov r11,r2 +#if __ARM_ARCH__<7 + ldrb r0,[r12,#3] @ load input data in endian-neutral + ldrb r4,[r12,#2] @ manner... + ldrb r5,[r12,#1] + ldrb r6,[r12,#0] + orr r0,r0,r4,lsl#8 + ldrb r1,[r12,#7] + orr r0,r0,r5,lsl#16 + ldrb r4,[r12,#6] + orr r0,r0,r6,lsl#24 + ldrb r5,[r12,#5] + ldrb r6,[r12,#4] + orr r1,r1,r4,lsl#8 + ldrb r2,[r12,#11] + orr r1,r1,r5,lsl#16 + ldrb r4,[r12,#10] + orr r1,r1,r6,lsl#24 + ldrb r5,[r12,#9] + ldrb r6,[r12,#8] + orr r2,r2,r4,lsl#8 + ldrb r3,[r12,#15] + orr r2,r2,r5,lsl#16 + ldrb r4,[r12,#14] + orr r2,r2,r6,lsl#24 + ldrb r5,[r12,#13] + ldrb r6,[r12,#12] + orr r3,r3,r4,lsl#8 + orr r3,r3,r5,lsl#16 + orr r3,r3,r6,lsl#24 +#else + ldr r0,[r12,#0] + ldr r1,[r12,#4] + ldr r2,[r12,#8] + ldr r3,[r12,#12] +#ifdef __ARMEL__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +#endif +#endif + bl _armv4_AES_decrypt + + ldr r12,[sp],#4 @ pop out +#if __ARM_ARCH__>=7 +#ifdef __ARMEL__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +#endif + str r0,[r12,#0] + str r1,[r12,#4] + str r2,[r12,#8] + str r3,[r12,#12] +#else + mov r4,r0,lsr#24 @ write output in endian-neutral + mov r5,r0,lsr#16 @ manner... + mov r6,r0,lsr#8 + strb r4,[r12,#0] + strb r5,[r12,#1] + mov r4,r1,lsr#24 + strb r6,[r12,#2] + mov r5,r1,lsr#16 + strb r0,[r12,#3] + mov r6,r1,lsr#8 + strb r4,[r12,#4] + strb r5,[r12,#5] + mov r4,r2,lsr#24 + strb r6,[r12,#6] + mov r5,r2,lsr#16 + strb r1,[r12,#7] + mov r6,r2,lsr#8 + strb r4,[r12,#8] + strb r5,[r12,#9] + mov r4,r3,lsr#24 + strb r6,[r12,#10] + mov r5,r3,lsr#16 + strb r2,[r12,#11] + mov r6,r3,lsr#8 + strb r4,[r12,#12] + strb r5,[r12,#13] + strb r6,[r12,#14] + strb r3,[r12,#15] +#endif +#if __ARM_ARCH__>=5 + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} +#else + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif + + +#ifdef __thumb2__ +.thumb_func _armv4_AES_decrypt +#endif +.align 2 +_armv4_AES_decrypt: + str lr,[sp,#-4]! 
@ push lr + ldmia r11!,{r4,r5,r6,r7} + eor r0,r0,r4 + ldr r12,[r11,#240-16] + eor r1,r1,r5 + eor r2,r2,r6 + eor r3,r3,r7 + sub r12,r12,#1 + mov lr,#255 + + and r7,lr,r0,lsr#16 + and r8,lr,r0,lsr#8 + and r9,lr,r0 + mov r0,r0,lsr#24 +Ldec_loop: + ldr r4,[r10,r7,lsl#2] @ Td1[s0>>16] + and r7,lr,r1 @ i0 + ldr r5,[r10,r8,lsl#2] @ Td2[s0>>8] + and r8,lr,r1,lsr#16 + ldr r6,[r10,r9,lsl#2] @ Td3[s0>>0] + and r9,lr,r1,lsr#8 + ldr r0,[r10,r0,lsl#2] @ Td0[s0>>24] + mov r1,r1,lsr#24 + + ldr r7,[r10,r7,lsl#2] @ Td3[s1>>0] + ldr r8,[r10,r8,lsl#2] @ Td1[s1>>16] + ldr r9,[r10,r9,lsl#2] @ Td2[s1>>8] + eor r0,r0,r7,ror#24 + ldr r1,[r10,r1,lsl#2] @ Td0[s1>>24] + and r7,lr,r2,lsr#8 @ i0 + eor r5,r8,r5,ror#8 + and r8,lr,r2 @ i1 + eor r6,r9,r6,ror#8 + and r9,lr,r2,lsr#16 + ldr r7,[r10,r7,lsl#2] @ Td2[s2>>8] + eor r1,r1,r4,ror#8 + ldr r8,[r10,r8,lsl#2] @ Td3[s2>>0] + mov r2,r2,lsr#24 + + ldr r9,[r10,r9,lsl#2] @ Td1[s2>>16] + eor r0,r0,r7,ror#16 + ldr r2,[r10,r2,lsl#2] @ Td0[s2>>24] + and r7,lr,r3,lsr#16 @ i0 + eor r1,r1,r8,ror#24 + and r8,lr,r3,lsr#8 @ i1 + eor r6,r9,r6,ror#8 + and r9,lr,r3 @ i2 + ldr r7,[r10,r7,lsl#2] @ Td1[s3>>16] + eor r2,r2,r5,ror#8 + ldr r8,[r10,r8,lsl#2] @ Td2[s3>>8] + mov r3,r3,lsr#24 + + ldr r9,[r10,r9,lsl#2] @ Td3[s3>>0] + eor r0,r0,r7,ror#8 + ldr r7,[r11],#16 + eor r1,r1,r8,ror#16 + ldr r3,[r10,r3,lsl#2] @ Td0[s3>>24] + eor r2,r2,r9,ror#24 + + ldr r4,[r11,#-12] + eor r0,r0,r7 + ldr r5,[r11,#-8] + eor r3,r3,r6,ror#8 + ldr r6,[r11,#-4] + and r7,lr,r0,lsr#16 + eor r1,r1,r4 + and r8,lr,r0,lsr#8 + eor r2,r2,r5 + and r9,lr,r0 + eor r3,r3,r6 + mov r0,r0,lsr#24 + + subs r12,r12,#1 + bne Ldec_loop + + add r10,r10,#1024 + + ldr r5,[r10,#0] @ prefetch Td4 + ldr r6,[r10,#32] + ldr r4,[r10,#64] + ldr r5,[r10,#96] + ldr r6,[r10,#128] + ldr r4,[r10,#160] + ldr r5,[r10,#192] + ldr r6,[r10,#224] + + ldrb r0,[r10,r0] @ Td4[s0>>24] + ldrb r4,[r10,r7] @ Td4[s0>>16] + and r7,lr,r1 @ i0 + ldrb r5,[r10,r8] @ Td4[s0>>8] + and r8,lr,r1,lsr#16 + ldrb r6,[r10,r9] @ Td4[s0>>0] + and r9,lr,r1,lsr#8 + + add r1,r10,r1,lsr#24 + ldrb r7,[r10,r7] @ Td4[s1>>0] + ldrb r1,[r1] @ Td4[s1>>24] + ldrb r8,[r10,r8] @ Td4[s1>>16] + eor r0,r7,r0,lsl#24 + ldrb r9,[r10,r9] @ Td4[s1>>8] + eor r1,r4,r1,lsl#8 + and r7,lr,r2,lsr#8 @ i0 + eor r5,r5,r8,lsl#8 + and r8,lr,r2 @ i1 + ldrb r7,[r10,r7] @ Td4[s2>>8] + eor r6,r6,r9,lsl#8 + ldrb r8,[r10,r8] @ Td4[s2>>0] + and r9,lr,r2,lsr#16 + + add r2,r10,r2,lsr#24 + ldrb r2,[r2] @ Td4[s2>>24] + eor r0,r0,r7,lsl#8 + ldrb r9,[r10,r9] @ Td4[s2>>16] + eor r1,r8,r1,lsl#16 + and r7,lr,r3,lsr#16 @ i0 + eor r2,r5,r2,lsl#16 + and r8,lr,r3,lsr#8 @ i1 + ldrb r7,[r10,r7] @ Td4[s3>>16] + eor r6,r6,r9,lsl#16 + ldrb r8,[r10,r8] @ Td4[s3>>8] + and r9,lr,r3 @ i2 + + add r3,r10,r3,lsr#24 + ldrb r9,[r10,r9] @ Td4[s3>>0] + ldrb r3,[r3] @ Td4[s3>>24] + eor r0,r0,r7,lsl#16 + ldr r7,[r11,#0] + eor r1,r1,r8,lsl#8 + ldr r4,[r11,#4] + eor r2,r9,r2,lsl#8 + ldr r5,[r11,#8] + eor r3,r6,r3,lsl#24 + ldr r6,[r11,#12] + + eor r0,r0,r7 + eor r1,r1,r4 + eor r2,r2,r5 + eor r3,r3,r6 + + sub r10,r10,#1024 + ldr pc,[sp],#4 @ pop and return + +.byte 65,69,83,32,102,111,114,32,65,82,77,118,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/aesv8-armx32.S b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/aesv8-armx32.S new file mode 100644 index 0000000000..7392231df2 --- /dev/null +++ 
b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/aesv8-armx32.S @@ -0,0 +1,790 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include + +#if __ARM_MAX_ARCH__>=7 +.text + + +.code 32 +#undef __thumb2__ +.align 5 +Lrcon: +.long 0x01,0x01,0x01,0x01 +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d @ rotate-n-splat +.long 0x1b,0x1b,0x1b,0x1b + +.text + +.globl _aes_hw_set_encrypt_key +.private_extern _aes_hw_set_encrypt_key +#ifdef __thumb2__ +.thumb_func _aes_hw_set_encrypt_key +#endif +.align 5 +_aes_hw_set_encrypt_key: +Lenc_key: + mov r3,#-1 + cmp r0,#0 + beq Lenc_key_abort + cmp r2,#0 + beq Lenc_key_abort + mov r3,#-2 + cmp r1,#128 + blt Lenc_key_abort + cmp r1,#256 + bgt Lenc_key_abort + tst r1,#0x3f + bne Lenc_key_abort + + adr r3,Lrcon + cmp r1,#192 + + veor q0,q0,q0 + vld1.8 {q3},[r0]! + mov r1,#8 @ reuse r1 + vld1.32 {q1,q2},[r3]! + + blt Loop128 + beq L192 + b L256 + +.align 4 +Loop128: + vtbl.8 d20,{q3},d4 + vtbl.8 d21,{q3},d5 + vext.8 q9,q0,q3,#12 + vst1.32 {q3},[r2]! +.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + subs r1,r1,#1 + + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q10,q10,q1 + veor q3,q3,q9 + vshl.u8 q1,q1,#1 + veor q3,q3,q10 + bne Loop128 + + vld1.32 {q1},[r3] + + vtbl.8 d20,{q3},d4 + vtbl.8 d21,{q3},d5 + vext.8 q9,q0,q3,#12 + vst1.32 {q3},[r2]! +.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q10,q10,q1 + veor q3,q3,q9 + vshl.u8 q1,q1,#1 + veor q3,q3,q10 + + vtbl.8 d20,{q3},d4 + vtbl.8 d21,{q3},d5 + vext.8 q9,q0,q3,#12 + vst1.32 {q3},[r2]! +.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q10,q10,q1 + veor q3,q3,q9 + veor q3,q3,q10 + vst1.32 {q3},[r2] + add r2,r2,#0x50 + + mov r12,#10 + b Ldone + +.align 4 +L192: + vld1.8 {d16},[r0]! + vmov.i8 q10,#8 @ borrow q10 + vst1.32 {q3},[r2]! + vsub.i8 q2,q2,q10 @ adjust the mask + +Loop192: + vtbl.8 d20,{q8},d4 + vtbl.8 d21,{q8},d5 + vext.8 q9,q0,q3,#12 + vst1.32 {d16},[r2]! +.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + subs r1,r1,#1 + + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q3,q3,q9 + + vdup.32 q9,d7[1] + veor q9,q9,q8 + veor q10,q10,q1 + vext.8 q8,q0,q8,#12 + vshl.u8 q1,q1,#1 + veor q8,q8,q9 + veor q3,q3,q10 + veor q8,q8,q10 + vst1.32 {q3},[r2]! + bne Loop192 + + mov r12,#12 + add r2,r2,#0x20 + b Ldone + +.align 4 +L256: + vld1.8 {q8},[r0] + mov r1,#7 + mov r12,#14 + vst1.32 {q3},[r2]! + +Loop256: + vtbl.8 d20,{q8},d4 + vtbl.8 d21,{q8},d5 + vext.8 q9,q0,q3,#12 + vst1.32 {q8},[r2]! +.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + subs r1,r1,#1 + + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q10,q10,q1 + veor q3,q3,q9 + vshl.u8 q1,q1,#1 + veor q3,q3,q10 + vst1.32 {q3},[r2]! 
+ beq Ldone + + vdup.32 q10,d7[1] + vext.8 q9,q0,q8,#12 +.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + + veor q8,q8,q9 + vext.8 q9,q0,q9,#12 + veor q8,q8,q9 + vext.8 q9,q0,q9,#12 + veor q8,q8,q9 + + veor q8,q8,q10 + b Loop256 + +Ldone: + str r12,[r2] + mov r3,#0 + +Lenc_key_abort: + mov r0,r3 @ return value + + bx lr + + +.globl _aes_hw_set_decrypt_key +.private_extern _aes_hw_set_decrypt_key +#ifdef __thumb2__ +.thumb_func _aes_hw_set_decrypt_key +#endif +.align 5 +_aes_hw_set_decrypt_key: + stmdb sp!,{r4,lr} + bl Lenc_key + + cmp r0,#0 + bne Ldec_key_abort + + sub r2,r2,#240 @ restore original r2 + mov r4,#-16 + add r0,r2,r12,lsl#4 @ end of key schedule + + vld1.32 {q0},[r2] + vld1.32 {q1},[r0] + vst1.32 {q0},[r0],r4 + vst1.32 {q1},[r2]! + +Loop_imc: + vld1.32 {q0},[r2] + vld1.32 {q1},[r0] +.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + vst1.32 {q0},[r0],r4 + vst1.32 {q1},[r2]! + cmp r0,r2 + bhi Loop_imc + + vld1.32 {q0},[r2] +.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + vst1.32 {q0},[r0] + + eor r0,r0,r0 @ return value +Ldec_key_abort: + ldmia sp!,{r4,pc} + +.globl _aes_hw_encrypt +.private_extern _aes_hw_encrypt +#ifdef __thumb2__ +.thumb_func _aes_hw_encrypt +#endif +.align 5 +_aes_hw_encrypt: + ldr r3,[r2,#240] + vld1.32 {q0},[r2]! + vld1.8 {q2},[r0] + sub r3,r3,#2 + vld1.32 {q1},[r2]! + +Loop_enc: +.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0 +.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 + vld1.32 {q0},[r2]! + subs r3,r3,#2 +.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1 +.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 + vld1.32 {q1},[r2]! + bgt Loop_enc + +.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0 +.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 + vld1.32 {q0},[r2] +.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1 + veor q2,q2,q0 + + vst1.8 {q2},[r1] + bx lr + +.globl _aes_hw_decrypt +.private_extern _aes_hw_decrypt +#ifdef __thumb2__ +.thumb_func _aes_hw_decrypt +#endif +.align 5 +_aes_hw_decrypt: + ldr r3,[r2,#240] + vld1.32 {q0},[r2]! + vld1.8 {q2},[r0] + sub r3,r3,#2 + vld1.32 {q1},[r2]! + +Loop_dec: +.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0 +.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 + vld1.32 {q0},[r2]! + subs r3,r3,#2 +.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1 +.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 + vld1.32 {q1},[r2]! + bgt Loop_dec + +.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0 +.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 + vld1.32 {q0},[r2] +.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1 + veor q2,q2,q0 + + vst1.8 {q2},[r1] + bx lr + +.globl _aes_hw_cbc_encrypt +.private_extern _aes_hw_cbc_encrypt +#ifdef __thumb2__ +.thumb_func _aes_hw_cbc_encrypt +#endif +.align 5 +_aes_hw_cbc_encrypt: + mov ip,sp + stmdb sp!,{r4,r5,r6,r7,r8,lr} + vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so + ldmia ip,{r4,r5} @ load remaining args + subs r2,r2,#16 + mov r8,#16 + blo Lcbc_abort + moveq r8,#0 + + cmp r5,#0 @ en- or decrypting? + ldr r5,[r3,#240] + and r2,r2,#-16 + vld1.8 {q6},[r4] + vld1.8 {q0},[r0],r8 + + vld1.32 {q8,q9},[r3] @ load key schedule... + sub r5,r5,#6 + add r7,r3,r5,lsl#4 @ pointer to last 7 round keys + sub r5,r5,#2 + vld1.32 {q10,q11},[r7]! + vld1.32 {q12,q13},[r7]! + vld1.32 {q14,q15},[r7]! 
+ vld1.32 {q7},[r7] + + add r7,r3,#32 + mov r6,r5 + beq Lcbc_dec + + cmp r5,#2 + veor q0,q0,q6 + veor q5,q8,q7 + beq Lcbc_enc128 + + vld1.32 {q2,q3},[r7] + add r7,r3,#16 + add r6,r3,#16*4 + add r12,r3,#16*5 +.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + add r14,r3,#16*6 + add r3,r3,#16*7 + b Lenter_cbc_enc + +.align 4 +Loop_cbc_enc: +.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vst1.8 {q6},[r1]! +Lenter_cbc_enc: +.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.32 {q8},[r6] + cmp r5,#4 +.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.32 {q9},[r12] + beq Lcbc_enc192 + +.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.32 {q8},[r14] +.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.32 {q9},[r3] + nop + +Lcbc_enc192: +.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + subs r2,r2,#16 +.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + moveq r8,#0 +.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.8 {q8},[r0],r8 +.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + veor q8,q8,q5 +.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.32 {q9},[r7] @ re-pre-load rndkey[1] +.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 + veor q6,q0,q7 + bhs Loop_cbc_enc + + vst1.8 {q6},[r1]! + b Lcbc_done + +.align 5 +Lcbc_enc128: + vld1.32 {q2,q3},[r7] +.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + b Lenter_cbc_enc128 +Loop_cbc_enc128: +.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vst1.8 {q6},[r1]! +Lenter_cbc_enc128: +.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + subs r2,r2,#16 +.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + moveq r8,#0 +.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.8 {q8},[r0],r8 +.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + veor q8,q8,q5 +.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 + veor q6,q0,q7 + bhs Loop_cbc_enc128 + + vst1.8 {q6},[r1]! + b Lcbc_done +.align 5 +Lcbc_dec: + vld1.8 {q10},[r0]! + subs r2,r2,#32 @ bias + add r6,r5,#2 + vorr q3,q0,q0 + vorr q1,q0,q0 + vorr q11,q10,q10 + blo Lcbc_dec_tail + + vorr q1,q10,q10 + vld1.8 {q10},[r0]! + vorr q2,q0,q0 + vorr q3,q1,q1 + vorr q11,q10,q10 + +Loop3x_cbc_dec: +.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 +.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 +.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + vld1.32 {q8},[r7]! 
+ subs r6,r6,#2 +.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 +.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 +.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + vld1.32 {q9},[r7]! + bgt Loop3x_cbc_dec + +.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 +.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 +.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + veor q4,q6,q7 + subs r2,r2,#0x30 + veor q5,q2,q7 + movlo r6,r2 @ r6, r6, is zero at this point +.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 +.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 +.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + veor q9,q3,q7 + add r0,r0,r6 @ r0 is adjusted in such way that + @ at exit from the loop q1-q10 + @ are loaded with last "words" + vorr q6,q11,q11 + mov r7,r3 +.byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12 +.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 +.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + vld1.8 {q2},[r0]! +.byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13 +.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 +.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + vld1.8 {q3},[r0]! +.byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14 +.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 +.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + vld1.8 {q11},[r0]! +.byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15 +.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15 +.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15 + vld1.32 {q8},[r7]! @ re-pre-load rndkey[0] + add r6,r5,#2 + veor q4,q4,q0 + veor q5,q5,q1 + veor q10,q10,q9 + vld1.32 {q9},[r7]! @ re-pre-load rndkey[1] + vst1.8 {q4},[r1]! + vorr q0,q2,q2 + vst1.8 {q5},[r1]! + vorr q1,q3,q3 + vst1.8 {q10},[r1]! + vorr q10,q11,q11 + bhs Loop3x_cbc_dec + + cmn r2,#0x30 + beq Lcbc_done + nop + +Lcbc_dec_tail: +.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + vld1.32 {q8},[r7]! + subs r6,r6,#2 +.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + vld1.32 {q9},[r7]! 
+ bgt Lcbc_dec_tail + +.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 +.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 +.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + cmn r2,#0x20 +.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + veor q5,q6,q7 +.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + veor q9,q3,q7 +.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15 +.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15 + beq Lcbc_dec_one + veor q5,q5,q1 + veor q9,q9,q10 + vorr q6,q11,q11 + vst1.8 {q5},[r1]! + vst1.8 {q9},[r1]! + b Lcbc_done + +Lcbc_dec_one: + veor q5,q5,q10 + vorr q6,q11,q11 + vst1.8 {q5},[r1]! + +Lcbc_done: + vst1.8 {q6},[r4] +Lcbc_abort: + vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!,{r4,r5,r6,r7,r8,pc} + +.globl _aes_hw_ctr32_encrypt_blocks +.private_extern _aes_hw_ctr32_encrypt_blocks +#ifdef __thumb2__ +.thumb_func _aes_hw_ctr32_encrypt_blocks +#endif +.align 5 +_aes_hw_ctr32_encrypt_blocks: + mov ip,sp + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,lr} + vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so + ldr r4, [ip] @ load remaining arg + ldr r5,[r3,#240] + + ldr r8, [r4, #12] + vld1.32 {q0},[r4] + + vld1.32 {q8,q9},[r3] @ load key schedule... + sub r5,r5,#4 + mov r12,#16 + cmp r2,#2 + add r7,r3,r5,lsl#4 @ pointer to last 5 round keys + sub r5,r5,#2 + vld1.32 {q12,q13},[r7]! + vld1.32 {q14,q15},[r7]! + vld1.32 {q7},[r7] + add r7,r3,#32 + mov r6,r5 + movlo r12,#0 +#ifndef __ARMEB__ + rev r8, r8 +#endif + vorr q1,q0,q0 + add r10, r8, #1 + vorr q10,q0,q0 + add r8, r8, #2 + vorr q6,q0,q0 + rev r10, r10 + vmov.32 d3[1],r10 + bls Lctr32_tail + rev r12, r8 + sub r2,r2,#3 @ bias + vmov.32 d21[1],r12 + b Loop3x_ctr32 + +.align 4 +Loop3x_ctr32: +.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 +.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 +.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8 +.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 + vld1.32 {q8},[r7]! + subs r6,r6,#2 +.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 +.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 +.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9 +.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 + vld1.32 {q9},[r7]! + bgt Loop3x_ctr32 + +.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 +.byte 0x80,0x83,0xb0,0xf3 @ aesmc q4,q0 +.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 +.byte 0x82,0xa3,0xb0,0xf3 @ aesmc q5,q1 + vld1.8 {q2},[r0]! + vorr q0,q6,q6 +.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8 +.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 + vld1.8 {q3},[r0]! + vorr q1,q6,q6 +.byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9 +.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 +.byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9 +.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + vld1.8 {q11},[r0]! 
+ mov r7,r3 +.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9 +.byte 0xa4,0x23,0xf0,0xf3 @ aesmc q9,q10 + vorr q10,q6,q6 + add r9,r8,#1 +.byte 0x28,0x83,0xb0,0xf3 @ aese q4,q12 +.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 +.byte 0x28,0xa3,0xb0,0xf3 @ aese q5,q12 +.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + veor q2,q2,q7 + add r10,r8,#2 +.byte 0x28,0x23,0xf0,0xf3 @ aese q9,q12 +.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 + veor q3,q3,q7 + add r8,r8,#3 +.byte 0x2a,0x83,0xb0,0xf3 @ aese q4,q13 +.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 +.byte 0x2a,0xa3,0xb0,0xf3 @ aese q5,q13 +.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + veor q11,q11,q7 + rev r9,r9 +.byte 0x2a,0x23,0xf0,0xf3 @ aese q9,q13 +.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 + vmov.32 d1[1], r9 + rev r10,r10 +.byte 0x2c,0x83,0xb0,0xf3 @ aese q4,q14 +.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 +.byte 0x2c,0xa3,0xb0,0xf3 @ aese q5,q14 +.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + vmov.32 d3[1], r10 + rev r12,r8 +.byte 0x2c,0x23,0xf0,0xf3 @ aese q9,q14 +.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 + vmov.32 d21[1], r12 + subs r2,r2,#3 +.byte 0x2e,0x83,0xb0,0xf3 @ aese q4,q15 +.byte 0x2e,0xa3,0xb0,0xf3 @ aese q5,q15 +.byte 0x2e,0x23,0xf0,0xf3 @ aese q9,q15 + + veor q2,q2,q4 + vld1.32 {q8},[r7]! @ re-pre-load rndkey[0] + vst1.8 {q2},[r1]! + veor q3,q3,q5 + mov r6,r5 + vst1.8 {q3},[r1]! + veor q11,q11,q9 + vld1.32 {q9},[r7]! @ re-pre-load rndkey[1] + vst1.8 {q11},[r1]! + bhs Loop3x_ctr32 + + adds r2,r2,#3 + beq Lctr32_done + cmp r2,#1 + mov r12,#16 + moveq r12,#0 + +Lctr32_tail: +.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 +.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + vld1.32 {q8},[r7]! + subs r6,r6,#2 +.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 +.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + vld1.32 {q9},[r7]! + bgt Lctr32_tail + +.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 +.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 +.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 +.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + vld1.8 {q2},[r0],r12 +.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12 +.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + vld1.8 {q3},[r0] +.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13 +.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + veor q2,q2,q7 +.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14 +.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + veor q3,q3,q7 +.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 +.byte 0x2e,0x23,0xb0,0xf3 @ aese q1,q15 + + cmp r2,#1 + veor q2,q2,q0 + veor q3,q3,q1 + vst1.8 {q2},[r1]! + beq Lctr32_done + vst1.8 {q3},[r1] + +Lctr32_done: + vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,pc} + +#endif +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/armv4-mont.S b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/armv4-mont.S new file mode 100644 index 0000000000..e549d1f163 --- /dev/null +++ b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/armv4-mont.S @@ -0,0 +1,982 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. 
Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include + +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. + + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +#if __ARM_MAX_ARCH__>=7 +.align 5 +LOPENSSL_armcap: +.word OPENSSL_armcap_P-Lbn_mul_mont +#endif + +.globl _bn_mul_mont +.private_extern _bn_mul_mont +#ifdef __thumb2__ +.thumb_func _bn_mul_mont +#endif + +.align 5 +_bn_mul_mont: +Lbn_mul_mont: + ldr ip,[sp,#4] @ load num + stmdb sp!,{r0,r2} @ sp points at argument block +#if __ARM_MAX_ARCH__>=7 + tst ip,#7 + bne Lialu + adr r0,Lbn_mul_mont + ldr r2,LOPENSSL_armcap + ldr r0,[r0,r2] +#ifdef __APPLE__ + ldr r0,[r0] +#endif + tst r0,#ARMV7_NEON @ NEON available? + ldmia sp, {r0,r2} + beq Lialu + add sp,sp,#8 + b bn_mul8x_mont_neon +.align 4 +Lialu: +#endif + cmp ip,#2 + mov r0,ip @ load num +#ifdef __thumb2__ + ittt lt +#endif + movlt r0,#0 + addlt sp,sp,#2*4 + blt Labrt + + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ save 10 registers + + mov r0,r0,lsl#2 @ rescale r0 for byte count + sub sp,sp,r0 @ alloca(4*num) + sub sp,sp,#4 @ +extra dword + sub r0,r0,#4 @ "num=num-1" + add r4,r2,r0 @ &bp[num-1] + + add r0,sp,r0 @ r0 to point at &tp[num-1] + ldr r8,[r0,#14*4] @ &n0 + ldr r2,[r2] @ bp[0] + ldr r5,[r1],#4 @ ap[0],ap++ + ldr r6,[r3],#4 @ np[0],np++ + ldr r8,[r8] @ *n0 + str r4,[r0,#15*4] @ save &bp[num] + + umull r10,r11,r5,r2 @ ap[0]*bp[0] + str r8,[r0,#14*4] @ save n0 value + mul r8,r10,r8 @ "tp[0]"*n0 + mov r12,#0 + umlal r10,r12,r6,r8 @ np[0]*n0+"t[0]" + mov r4,sp + +L1st: + ldr r5,[r1],#4 @ ap[j],ap++ + mov r10,r11 + ldr r6,[r3],#4 @ np[j],np++ + mov r11,#0 + umlal r10,r11,r5,r2 @ ap[j]*bp[0] + mov r14,#0 + umlal r12,r14,r6,r8 @ np[j]*n0 + adds r12,r12,r10 + str r12,[r4],#4 @ tp[j-1]=,tp++ + adc r12,r14,#0 + cmp r4,r0 + bne L1st + + adds r12,r12,r11 + ldr r4,[r0,#13*4] @ restore bp + mov r14,#0 + ldr r8,[r0,#14*4] @ restore n0 + adc r14,r14,#0 + str r12,[r0] @ tp[num-1]= + mov r7,sp + str r14,[r0,#4] @ tp[num]= + +Louter: + sub r7,r0,r7 @ "original" r0-1 value + sub r1,r1,r7 @ "rewind" ap to &ap[1] + ldr r2,[r4,#4]! 
@ *(++bp) + sub r3,r3,r7 @ "rewind" np to &np[1] + ldr r5,[r1,#-4] @ ap[0] + ldr r10,[sp] @ tp[0] + ldr r6,[r3,#-4] @ np[0] + ldr r7,[sp,#4] @ tp[1] + + mov r11,#0 + umlal r10,r11,r5,r2 @ ap[0]*bp[i]+tp[0] + str r4,[r0,#13*4] @ save bp + mul r8,r10,r8 + mov r12,#0 + umlal r10,r12,r6,r8 @ np[0]*n0+"tp[0]" + mov r4,sp + +Linner: + ldr r5,[r1],#4 @ ap[j],ap++ + adds r10,r11,r7 @ +=tp[j] + ldr r6,[r3],#4 @ np[j],np++ + mov r11,#0 + umlal r10,r11,r5,r2 @ ap[j]*bp[i] + mov r14,#0 + umlal r12,r14,r6,r8 @ np[j]*n0 + adc r11,r11,#0 + ldr r7,[r4,#8] @ tp[j+1] + adds r12,r12,r10 + str r12,[r4],#4 @ tp[j-1]=,tp++ + adc r12,r14,#0 + cmp r4,r0 + bne Linner + + adds r12,r12,r11 + mov r14,#0 + ldr r4,[r0,#13*4] @ restore bp + adc r14,r14,#0 + ldr r8,[r0,#14*4] @ restore n0 + adds r12,r12,r7 + ldr r7,[r0,#15*4] @ restore &bp[num] + adc r14,r14,#0 + str r12,[r0] @ tp[num-1]= + str r14,[r0,#4] @ tp[num]= + + cmp r4,r7 +#ifdef __thumb2__ + itt ne +#endif + movne r7,sp + bne Louter + + ldr r2,[r0,#12*4] @ pull rp + mov r5,sp + add r0,r0,#4 @ r0 to point at &tp[num] + sub r5,r0,r5 @ "original" num value + mov r4,sp @ "rewind" r4 + mov r1,r4 @ "borrow" r1 + sub r3,r3,r5 @ "rewind" r3 to &np[0] + + subs r7,r7,r7 @ "clear" carry flag +Lsub: ldr r7,[r4],#4 + ldr r6,[r3],#4 + sbcs r7,r7,r6 @ tp[j]-np[j] + str r7,[r2],#4 @ rp[j]= + teq r4,r0 @ preserve carry + bne Lsub + sbcs r14,r14,#0 @ upmost carry + mov r4,sp @ "rewind" r4 + sub r2,r2,r5 @ "rewind" r2 + +Lcopy: ldr r7,[r4] @ conditional copy + ldr r5,[r2] + str sp,[r4],#4 @ zap tp +#ifdef __thumb2__ + it cc +#endif + movcc r5,r7 + str r5,[r2],#4 + teq r4,r0 @ preserve carry + bne Lcopy + + mov sp,r0 + add sp,sp,#4 @ skip over tp[num+1] + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ restore registers + add sp,sp,#2*4 @ skip over {r0,r2} + mov r0,#1 +Labrt: +#if __ARM_ARCH__>=5 + bx lr @ bx lr +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif + +#if __ARM_MAX_ARCH__>=7 + + + +#ifdef __thumb2__ +.thumb_func bn_mul8x_mont_neon +#endif +.align 5 +bn_mul8x_mont_neon: + mov ip,sp + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} + vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so + ldmia ip,{r4,r5} @ load rest of parameter block + mov ip,sp + + cmp r5,#8 + bhi LNEON_8n + + @ special case for r5==8, everything is in register bank... + + vld1.32 {d28[0]}, [r2,:32]! + veor d8,d8,d8 + sub r7,sp,r5,lsl#4 + vld1.32 {d0,d1,d2,d3}, [r1]! @ can't specify :32 :-( + and r7,r7,#-64 + vld1.32 {d30[0]}, [r4,:32] + mov sp,r7 @ alloca + vzip.16 d28,d8 + + vmull.u32 q6,d28,d0[0] + vmull.u32 q7,d28,d0[1] + vmull.u32 q8,d28,d1[0] + vshl.i64 d29,d13,#16 + vmull.u32 q9,d28,d1[1] + + vadd.u64 d29,d29,d12 + veor d8,d8,d8 + vmul.u32 d29,d29,d30 + + vmull.u32 q10,d28,d2[0] + vld1.32 {d4,d5,d6,d7}, [r3]! + vmull.u32 q11,d28,d2[1] + vmull.u32 q12,d28,d3[0] + vzip.16 d29,d8 + vmull.u32 q13,d28,d3[1] + + vmlal.u32 q6,d29,d4[0] + sub r9,r5,#1 + vmlal.u32 q7,d29,d4[1] + vmlal.u32 q8,d29,d5[0] + vmlal.u32 q9,d29,d5[1] + + vmlal.u32 q10,d29,d6[0] + vmov q5,q6 + vmlal.u32 q11,d29,d6[1] + vmov q6,q7 + vmlal.u32 q12,d29,d7[0] + vmov q7,q8 + vmlal.u32 q13,d29,d7[1] + vmov q8,q9 + vmov q9,q10 + vshr.u64 d10,d10,#16 + vmov q10,q11 + vmov q11,q12 + vadd.u64 d10,d10,d11 + vmov q12,q13 + veor q13,q13 + vshr.u64 d10,d10,#16 + + b LNEON_outer8 + +.align 4 +LNEON_outer8: + vld1.32 {d28[0]}, [r2,:32]! 
+ veor d8,d8,d8 + vzip.16 d28,d8 + vadd.u64 d12,d12,d10 + + vmlal.u32 q6,d28,d0[0] + vmlal.u32 q7,d28,d0[1] + vmlal.u32 q8,d28,d1[0] + vshl.i64 d29,d13,#16 + vmlal.u32 q9,d28,d1[1] + + vadd.u64 d29,d29,d12 + veor d8,d8,d8 + subs r9,r9,#1 + vmul.u32 d29,d29,d30 + + vmlal.u32 q10,d28,d2[0] + vmlal.u32 q11,d28,d2[1] + vmlal.u32 q12,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q13,d28,d3[1] + + vmlal.u32 q6,d29,d4[0] + vmlal.u32 q7,d29,d4[1] + vmlal.u32 q8,d29,d5[0] + vmlal.u32 q9,d29,d5[1] + + vmlal.u32 q10,d29,d6[0] + vmov q5,q6 + vmlal.u32 q11,d29,d6[1] + vmov q6,q7 + vmlal.u32 q12,d29,d7[0] + vmov q7,q8 + vmlal.u32 q13,d29,d7[1] + vmov q8,q9 + vmov q9,q10 + vshr.u64 d10,d10,#16 + vmov q10,q11 + vmov q11,q12 + vadd.u64 d10,d10,d11 + vmov q12,q13 + veor q13,q13 + vshr.u64 d10,d10,#16 + + bne LNEON_outer8 + + vadd.u64 d12,d12,d10 + mov r7,sp + vshr.u64 d10,d12,#16 + mov r8,r5 + vadd.u64 d13,d13,d10 + add r6,sp,#96 + vshr.u64 d10,d13,#16 + vzip.16 d12,d13 + + b LNEON_tail_entry + +.align 4 +LNEON_8n: + veor q6,q6,q6 + sub r7,sp,#128 + veor q7,q7,q7 + sub r7,r7,r5,lsl#4 + veor q8,q8,q8 + and r7,r7,#-64 + veor q9,q9,q9 + mov sp,r7 @ alloca + veor q10,q10,q10 + add r7,r7,#256 + veor q11,q11,q11 + sub r8,r5,#8 + veor q12,q12,q12 + veor q13,q13,q13 + +LNEON_8n_init: + vst1.64 {q6,q7},[r7,:256]! + subs r8,r8,#8 + vst1.64 {q8,q9},[r7,:256]! + vst1.64 {q10,q11},[r7,:256]! + vst1.64 {q12,q13},[r7,:256]! + bne LNEON_8n_init + + add r6,sp,#256 + vld1.32 {d0,d1,d2,d3},[r1]! + add r10,sp,#8 + vld1.32 {d30[0]},[r4,:32] + mov r9,r5 + b LNEON_8n_outer + +.align 4 +LNEON_8n_outer: + vld1.32 {d28[0]},[r2,:32]! @ *b++ + veor d8,d8,d8 + vzip.16 d28,d8 + add r7,sp,#128 + vld1.32 {d4,d5,d6,d7},[r3]! + + vmlal.u32 q6,d28,d0[0] + vmlal.u32 q7,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q8,d28,d1[0] + vshl.i64 d29,d13,#16 + vmlal.u32 q9,d28,d1[1] + vadd.u64 d29,d29,d12 + vmlal.u32 q10,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q11,d28,d2[1] + vst1.32 {d28},[sp,:64] @ put aside smashed b[8*i+0] + vmlal.u32 q12,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q13,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q6,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q7,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q8,d29,d5[0] + vshr.u64 d12,d12,#16 + vmlal.u32 q9,d29,d5[1] + vmlal.u32 q10,d29,d6[0] + vadd.u64 d12,d12,d13 + vmlal.u32 q11,d29,d6[1] + vshr.u64 d12,d12,#16 + vmlal.u32 q12,d29,d7[0] + vmlal.u32 q13,d29,d7[1] + vadd.u64 d14,d14,d12 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+0] + vmlal.u32 q7,d28,d0[0] + vld1.64 {q6},[r6,:128]! + vmlal.u32 q8,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q9,d28,d1[0] + vshl.i64 d29,d15,#16 + vmlal.u32 q10,d28,d1[1] + vadd.u64 d29,d29,d14 + vmlal.u32 q11,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q12,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+1] + vmlal.u32 q13,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q6,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q7,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q8,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q9,d29,d5[0] + vshr.u64 d14,d14,#16 + vmlal.u32 q10,d29,d5[1] + vmlal.u32 q11,d29,d6[0] + vadd.u64 d14,d14,d15 + vmlal.u32 q12,d29,d6[1] + vshr.u64 d14,d14,#16 + vmlal.u32 q13,d29,d7[0] + vmlal.u32 q6,d29,d7[1] + vadd.u64 d16,d16,d14 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+1] + vmlal.u32 q8,d28,d0[0] + vld1.64 {q7},[r6,:128]! 
+ vmlal.u32 q9,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q10,d28,d1[0] + vshl.i64 d29,d17,#16 + vmlal.u32 q11,d28,d1[1] + vadd.u64 d29,d29,d16 + vmlal.u32 q12,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q13,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+2] + vmlal.u32 q6,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q7,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q8,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q9,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q10,d29,d5[0] + vshr.u64 d16,d16,#16 + vmlal.u32 q11,d29,d5[1] + vmlal.u32 q12,d29,d6[0] + vadd.u64 d16,d16,d17 + vmlal.u32 q13,d29,d6[1] + vshr.u64 d16,d16,#16 + vmlal.u32 q6,d29,d7[0] + vmlal.u32 q7,d29,d7[1] + vadd.u64 d18,d18,d16 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+2] + vmlal.u32 q9,d28,d0[0] + vld1.64 {q8},[r6,:128]! + vmlal.u32 q10,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q11,d28,d1[0] + vshl.i64 d29,d19,#16 + vmlal.u32 q12,d28,d1[1] + vadd.u64 d29,d29,d18 + vmlal.u32 q13,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q6,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+3] + vmlal.u32 q7,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q8,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q9,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q10,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q11,d29,d5[0] + vshr.u64 d18,d18,#16 + vmlal.u32 q12,d29,d5[1] + vmlal.u32 q13,d29,d6[0] + vadd.u64 d18,d18,d19 + vmlal.u32 q6,d29,d6[1] + vshr.u64 d18,d18,#16 + vmlal.u32 q7,d29,d7[0] + vmlal.u32 q8,d29,d7[1] + vadd.u64 d20,d20,d18 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+3] + vmlal.u32 q10,d28,d0[0] + vld1.64 {q9},[r6,:128]! + vmlal.u32 q11,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q12,d28,d1[0] + vshl.i64 d29,d21,#16 + vmlal.u32 q13,d28,d1[1] + vadd.u64 d29,d29,d20 + vmlal.u32 q6,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q7,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+4] + vmlal.u32 q8,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q9,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q10,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q11,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q12,d29,d5[0] + vshr.u64 d20,d20,#16 + vmlal.u32 q13,d29,d5[1] + vmlal.u32 q6,d29,d6[0] + vadd.u64 d20,d20,d21 + vmlal.u32 q7,d29,d6[1] + vshr.u64 d20,d20,#16 + vmlal.u32 q8,d29,d7[0] + vmlal.u32 q9,d29,d7[1] + vadd.u64 d22,d22,d20 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+4] + vmlal.u32 q11,d28,d0[0] + vld1.64 {q10},[r6,:128]! + vmlal.u32 q12,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q13,d28,d1[0] + vshl.i64 d29,d23,#16 + vmlal.u32 q6,d28,d1[1] + vadd.u64 d29,d29,d22 + vmlal.u32 q7,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q8,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+5] + vmlal.u32 q9,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q10,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q11,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q12,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q13,d29,d5[0] + vshr.u64 d22,d22,#16 + vmlal.u32 q6,d29,d5[1] + vmlal.u32 q7,d29,d6[0] + vadd.u64 d22,d22,d23 + vmlal.u32 q8,d29,d6[1] + vshr.u64 d22,d22,#16 + vmlal.u32 q9,d29,d7[0] + vmlal.u32 q10,d29,d7[1] + vadd.u64 d24,d24,d22 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+5] + vmlal.u32 q12,d28,d0[0] + vld1.64 {q11},[r6,:128]! + vmlal.u32 q13,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q6,d28,d1[0] + vshl.i64 d29,d25,#16 + vmlal.u32 q7,d28,d1[1] + vadd.u64 d29,d29,d24 + vmlal.u32 q8,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q9,d28,d2[1] + vst1.32 {d28},[r10,:64]! 
@ put aside smashed b[8*i+6] + vmlal.u32 q10,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q11,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q12,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q13,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q6,d29,d5[0] + vshr.u64 d24,d24,#16 + vmlal.u32 q7,d29,d5[1] + vmlal.u32 q8,d29,d6[0] + vadd.u64 d24,d24,d25 + vmlal.u32 q9,d29,d6[1] + vshr.u64 d24,d24,#16 + vmlal.u32 q10,d29,d7[0] + vmlal.u32 q11,d29,d7[1] + vadd.u64 d26,d26,d24 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+6] + vmlal.u32 q13,d28,d0[0] + vld1.64 {q12},[r6,:128]! + vmlal.u32 q6,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q7,d28,d1[0] + vshl.i64 d29,d27,#16 + vmlal.u32 q8,d28,d1[1] + vadd.u64 d29,d29,d26 + vmlal.u32 q9,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q10,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+7] + vmlal.u32 q11,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q12,d28,d3[1] + vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0] + vmlal.u32 q13,d29,d4[0] + vld1.32 {d0,d1,d2,d3},[r1]! + vmlal.u32 q6,d29,d4[1] + vmlal.u32 q7,d29,d5[0] + vshr.u64 d26,d26,#16 + vmlal.u32 q8,d29,d5[1] + vmlal.u32 q9,d29,d6[0] + vadd.u64 d26,d26,d27 + vmlal.u32 q10,d29,d6[1] + vshr.u64 d26,d26,#16 + vmlal.u32 q11,d29,d7[0] + vmlal.u32 q12,d29,d7[1] + vadd.u64 d12,d12,d26 + vst1.32 {d29},[r10,:64] @ put aside smashed m[8*i+7] + add r10,sp,#8 @ rewind + sub r8,r5,#8 + b LNEON_8n_inner + +.align 4 +LNEON_8n_inner: + subs r8,r8,#8 + vmlal.u32 q6,d28,d0[0] + vld1.64 {q13},[r6,:128] + vmlal.u32 q7,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+0] + vmlal.u32 q8,d28,d1[0] + vld1.32 {d4,d5,d6,d7},[r3]! + vmlal.u32 q9,d28,d1[1] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q10,d28,d2[0] + vmlal.u32 q11,d28,d2[1] + vmlal.u32 q12,d28,d3[0] + vmlal.u32 q13,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+1] + vmlal.u32 q6,d29,d4[0] + vmlal.u32 q7,d29,d4[1] + vmlal.u32 q8,d29,d5[0] + vmlal.u32 q9,d29,d5[1] + vmlal.u32 q10,d29,d6[0] + vmlal.u32 q11,d29,d6[1] + vmlal.u32 q12,d29,d7[0] + vmlal.u32 q13,d29,d7[1] + vst1.64 {q6},[r7,:128]! + vmlal.u32 q7,d28,d0[0] + vld1.64 {q6},[r6,:128] + vmlal.u32 q8,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+1] + vmlal.u32 q9,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q10,d28,d1[1] + vmlal.u32 q11,d28,d2[0] + vmlal.u32 q12,d28,d2[1] + vmlal.u32 q13,d28,d3[0] + vmlal.u32 q6,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+2] + vmlal.u32 q7,d29,d4[0] + vmlal.u32 q8,d29,d4[1] + vmlal.u32 q9,d29,d5[0] + vmlal.u32 q10,d29,d5[1] + vmlal.u32 q11,d29,d6[0] + vmlal.u32 q12,d29,d6[1] + vmlal.u32 q13,d29,d7[0] + vmlal.u32 q6,d29,d7[1] + vst1.64 {q7},[r7,:128]! + vmlal.u32 q8,d28,d0[0] + vld1.64 {q7},[r6,:128] + vmlal.u32 q9,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+2] + vmlal.u32 q10,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q11,d28,d1[1] + vmlal.u32 q12,d28,d2[0] + vmlal.u32 q13,d28,d2[1] + vmlal.u32 q6,d28,d3[0] + vmlal.u32 q7,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+3] + vmlal.u32 q8,d29,d4[0] + vmlal.u32 q9,d29,d4[1] + vmlal.u32 q10,d29,d5[0] + vmlal.u32 q11,d29,d5[1] + vmlal.u32 q12,d29,d6[0] + vmlal.u32 q13,d29,d6[1] + vmlal.u32 q6,d29,d7[0] + vmlal.u32 q7,d29,d7[1] + vst1.64 {q8},[r7,:128]! + vmlal.u32 q9,d28,d0[0] + vld1.64 {q8},[r6,:128] + vmlal.u32 q10,d28,d0[1] + vld1.32 {d29},[r10,:64]! 
@ pull smashed m[8*i+3] + vmlal.u32 q11,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q12,d28,d1[1] + vmlal.u32 q13,d28,d2[0] + vmlal.u32 q6,d28,d2[1] + vmlal.u32 q7,d28,d3[0] + vmlal.u32 q8,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+4] + vmlal.u32 q9,d29,d4[0] + vmlal.u32 q10,d29,d4[1] + vmlal.u32 q11,d29,d5[0] + vmlal.u32 q12,d29,d5[1] + vmlal.u32 q13,d29,d6[0] + vmlal.u32 q6,d29,d6[1] + vmlal.u32 q7,d29,d7[0] + vmlal.u32 q8,d29,d7[1] + vst1.64 {q9},[r7,:128]! + vmlal.u32 q10,d28,d0[0] + vld1.64 {q9},[r6,:128] + vmlal.u32 q11,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+4] + vmlal.u32 q12,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q13,d28,d1[1] + vmlal.u32 q6,d28,d2[0] + vmlal.u32 q7,d28,d2[1] + vmlal.u32 q8,d28,d3[0] + vmlal.u32 q9,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+5] + vmlal.u32 q10,d29,d4[0] + vmlal.u32 q11,d29,d4[1] + vmlal.u32 q12,d29,d5[0] + vmlal.u32 q13,d29,d5[1] + vmlal.u32 q6,d29,d6[0] + vmlal.u32 q7,d29,d6[1] + vmlal.u32 q8,d29,d7[0] + vmlal.u32 q9,d29,d7[1] + vst1.64 {q10},[r7,:128]! + vmlal.u32 q11,d28,d0[0] + vld1.64 {q10},[r6,:128] + vmlal.u32 q12,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+5] + vmlal.u32 q13,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q6,d28,d1[1] + vmlal.u32 q7,d28,d2[0] + vmlal.u32 q8,d28,d2[1] + vmlal.u32 q9,d28,d3[0] + vmlal.u32 q10,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+6] + vmlal.u32 q11,d29,d4[0] + vmlal.u32 q12,d29,d4[1] + vmlal.u32 q13,d29,d5[0] + vmlal.u32 q6,d29,d5[1] + vmlal.u32 q7,d29,d6[0] + vmlal.u32 q8,d29,d6[1] + vmlal.u32 q9,d29,d7[0] + vmlal.u32 q10,d29,d7[1] + vst1.64 {q11},[r7,:128]! + vmlal.u32 q12,d28,d0[0] + vld1.64 {q11},[r6,:128] + vmlal.u32 q13,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+6] + vmlal.u32 q6,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q7,d28,d1[1] + vmlal.u32 q8,d28,d2[0] + vmlal.u32 q9,d28,d2[1] + vmlal.u32 q10,d28,d3[0] + vmlal.u32 q11,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+7] + vmlal.u32 q12,d29,d4[0] + vmlal.u32 q13,d29,d4[1] + vmlal.u32 q6,d29,d5[0] + vmlal.u32 q7,d29,d5[1] + vmlal.u32 q8,d29,d6[0] + vmlal.u32 q9,d29,d6[1] + vmlal.u32 q10,d29,d7[0] + vmlal.u32 q11,d29,d7[1] + vst1.64 {q12},[r7,:128]! + vmlal.u32 q13,d28,d0[0] + vld1.64 {q12},[r6,:128] + vmlal.u32 q6,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+7] + vmlal.u32 q7,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q8,d28,d1[1] + vmlal.u32 q9,d28,d2[0] + vmlal.u32 q10,d28,d2[1] + vmlal.u32 q11,d28,d3[0] + vmlal.u32 q12,d28,d3[1] + it eq + subeq r1,r1,r5,lsl#2 @ rewind + vmlal.u32 q13,d29,d4[0] + vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0] + vmlal.u32 q6,d29,d4[1] + vld1.32 {d0,d1,d2,d3},[r1]! + vmlal.u32 q7,d29,d5[0] + add r10,sp,#8 @ rewind + vmlal.u32 q8,d29,d5[1] + vmlal.u32 q9,d29,d6[0] + vmlal.u32 q10,d29,d6[1] + vmlal.u32 q11,d29,d7[0] + vst1.64 {q13},[r7,:128]! + vmlal.u32 q12,d29,d7[1] + + bne LNEON_8n_inner + add r6,sp,#128 + vst1.64 {q6,q7},[r7,:256]! + veor q2,q2,q2 @ d4-d5 + vst1.64 {q8,q9},[r7,:256]! + veor q3,q3,q3 @ d6-d7 + vst1.64 {q10,q11},[r7,:256]! + vst1.64 {q12},[r7,:128] + + subs r9,r9,#8 + vld1.64 {q6,q7},[r6,:256]! + vld1.64 {q8,q9},[r6,:256]! + vld1.64 {q10,q11},[r6,:256]! + vld1.64 {q12,q13},[r6,:256]! 
+ + itt ne + subne r3,r3,r5,lsl#2 @ rewind + bne LNEON_8n_outer + + add r7,sp,#128 + vst1.64 {q2,q3}, [sp,:256]! @ start wiping stack frame + vshr.u64 d10,d12,#16 + vst1.64 {q2,q3},[sp,:256]! + vadd.u64 d13,d13,d10 + vst1.64 {q2,q3}, [sp,:256]! + vshr.u64 d10,d13,#16 + vst1.64 {q2,q3}, [sp,:256]! + vzip.16 d12,d13 + + mov r8,r5 + b LNEON_tail_entry + +.align 4 +LNEON_tail: + vadd.u64 d12,d12,d10 + vshr.u64 d10,d12,#16 + vld1.64 {q8,q9}, [r6, :256]! + vadd.u64 d13,d13,d10 + vld1.64 {q10,q11}, [r6, :256]! + vshr.u64 d10,d13,#16 + vld1.64 {q12,q13}, [r6, :256]! + vzip.16 d12,d13 + +LNEON_tail_entry: + vadd.u64 d14,d14,d10 + vst1.32 {d12[0]}, [r7, :32]! + vshr.u64 d10,d14,#16 + vadd.u64 d15,d15,d10 + vshr.u64 d10,d15,#16 + vzip.16 d14,d15 + vadd.u64 d16,d16,d10 + vst1.32 {d14[0]}, [r7, :32]! + vshr.u64 d10,d16,#16 + vadd.u64 d17,d17,d10 + vshr.u64 d10,d17,#16 + vzip.16 d16,d17 + vadd.u64 d18,d18,d10 + vst1.32 {d16[0]}, [r7, :32]! + vshr.u64 d10,d18,#16 + vadd.u64 d19,d19,d10 + vshr.u64 d10,d19,#16 + vzip.16 d18,d19 + vadd.u64 d20,d20,d10 + vst1.32 {d18[0]}, [r7, :32]! + vshr.u64 d10,d20,#16 + vadd.u64 d21,d21,d10 + vshr.u64 d10,d21,#16 + vzip.16 d20,d21 + vadd.u64 d22,d22,d10 + vst1.32 {d20[0]}, [r7, :32]! + vshr.u64 d10,d22,#16 + vadd.u64 d23,d23,d10 + vshr.u64 d10,d23,#16 + vzip.16 d22,d23 + vadd.u64 d24,d24,d10 + vst1.32 {d22[0]}, [r7, :32]! + vshr.u64 d10,d24,#16 + vadd.u64 d25,d25,d10 + vshr.u64 d10,d25,#16 + vzip.16 d24,d25 + vadd.u64 d26,d26,d10 + vst1.32 {d24[0]}, [r7, :32]! + vshr.u64 d10,d26,#16 + vadd.u64 d27,d27,d10 + vshr.u64 d10,d27,#16 + vzip.16 d26,d27 + vld1.64 {q6,q7}, [r6, :256]! + subs r8,r8,#8 + vst1.32 {d26[0]}, [r7, :32]! + bne LNEON_tail + + vst1.32 {d10[0]}, [r7, :32] @ top-most bit + sub r3,r3,r5,lsl#2 @ rewind r3 + subs r1,sp,#0 @ clear carry flag + add r2,sp,r5,lsl#2 + +LNEON_sub: + ldmia r1!, {r4,r5,r6,r7} + ldmia r3!, {r8,r9,r10,r11} + sbcs r8, r4,r8 + sbcs r9, r5,r9 + sbcs r10,r6,r10 + sbcs r11,r7,r11 + teq r1,r2 @ preserves carry + stmia r0!, {r8,r9,r10,r11} + bne LNEON_sub + + ldr r10, [r1] @ load top-most bit + mov r11,sp + veor q0,q0,q0 + sub r11,r2,r11 @ this is num*4 + veor q1,q1,q1 + mov r1,sp + sub r0,r0,r11 @ rewind r0 + mov r3,r2 @ second 3/4th of frame + sbcs r10,r10,#0 @ result is carry flag + +LNEON_copy_n_zap: + ldmia r1!, {r4,r5,r6,r7} + ldmia r0, {r8,r9,r10,r11} + it cc + movcc r8, r4 + vst1.64 {q0,q1}, [r3,:256]! @ wipe + itt cc + movcc r9, r5 + movcc r10,r6 + vst1.64 {q0,q1}, [r3,:256]! @ wipe + it cc + movcc r11,r7 + ldmia r1, {r4,r5,r6,r7} + stmia r0!, {r8,r9,r10,r11} + sub r1,r1,#16 + ldmia r0, {r8,r9,r10,r11} + it cc + movcc r8, r4 + vst1.64 {q0,q1}, [r1,:256]! @ wipe + itt cc + movcc r9, r5 + movcc r10,r6 + vst1.64 {q0,q1}, [r3,:256]! 
@ wipe + it cc + movcc r11,r7 + teq r1,r2 @ preserves carry + stmia r0!, {r8,r9,r10,r11} + bne LNEON_copy_n_zap + + mov sp,ip + vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11} + bx lr @ bx lr + +#endif +.byte 77,111,110,116,103,111,109,101,114,121,32,109,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#if __ARM_MAX_ARCH__>=7 +.comm _OPENSSL_armcap_P,4 +.non_lazy_symbol_pointer +OPENSSL_armcap_P: +.indirect_symbol _OPENSSL_armcap_P +.long 0 +.private_extern _OPENSSL_armcap_P +#endif +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/bsaes-armv7.S b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/bsaes-armv7.S new file mode 100644 index 0000000000..8329a8c202 --- /dev/null +++ b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/bsaes-armv7.S @@ -0,0 +1,1536 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +@ Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. +@ +@ Licensed under the OpenSSL license (the "License"). You may not use +@ this file except in compliance with the License. You can obtain a copy +@ in the file LICENSE in the source distribution or at +@ https://www.openssl.org/source/license.html + + +@ ==================================================================== +@ Written by Andy Polyakov for the OpenSSL +@ project. The module is, however, dual licensed under OpenSSL and +@ CRYPTOGAMS licenses depending on where you obtain it. For further +@ details see http://www.openssl.org/~appro/cryptogams/. +@ +@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel +@ of Linaro. Permission to use under GPL terms is granted. +@ ==================================================================== + +@ Bit-sliced AES for ARM NEON +@ +@ February 2012. +@ +@ This implementation is direct adaptation of bsaes-x86_64 module for +@ ARM NEON. Except that this module is endian-neutral [in sense that +@ it can be compiled for either endianness] by courtesy of vld1.8's +@ neutrality. Initial version doesn't implement interface to OpenSSL, +@ only low-level primitives and unsupported entry points, just enough +@ to collect performance results, which for Cortex-A8 core are: +@ +@ encrypt 19.5 cycles per byte processed with 128-bit key +@ decrypt 22.1 cycles per byte processed with 128-bit key +@ key conv. 440 cycles per 128-bit key/0.18 of 8x block +@ +@ Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7, +@ which is [much] worse than anticipated (for further details see +@ http://www.openssl.org/~appro/Snapdragon-S4.html). +@ +@ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code +@ manages in 20.0 cycles]. +@ +@ When comparing to x86_64 results keep in mind that NEON unit is +@ [mostly] single-issue and thus can't [fully] benefit from +@ instruction-level parallelism. And when comparing to aes-armv4 +@ results keep in mind key schedule conversion overhead (see +@ bsaes-x86_64.pl for further details)... 
+@ +@ + +@ April-August 2013 +@ Add CBC, CTR and XTS subroutines and adapt for kernel use; courtesy of Ard. + +#ifndef __KERNEL__ +# include + +# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} +# define VFP_ABI_POP vldmia sp!,{d8-d15} +# define VFP_ABI_FRAME 0x40 +#else +# define VFP_ABI_PUSH +# define VFP_ABI_POP +# define VFP_ABI_FRAME 0 +# define BSAES_ASM_EXTENDED_KEY +# define XTS_CHAIN_TWEAK +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 +#endif + +#ifdef __thumb__ +# define adrl adr +#endif + +#if __ARM_MAX_ARCH__>=7 + + + +.text +.syntax unified @ ARMv7-capable assembler is expected to handle this +#if defined(__thumb2__) && !defined(__APPLE__) +.thumb +#else +.code 32 +# undef __thumb2__ +#endif + +#ifdef __thumb2__ +.thumb_func _bsaes_decrypt8 +#endif +.align 4 +_bsaes_decrypt8: + adr r6,. + vldmia r4!, {q9} @ round 0 key +#if defined(__thumb2__) || defined(__APPLE__) + adr r6,LM0ISR +#else + add r6,r6,#LM0ISR-_bsaes_decrypt8 +#endif + + vldmia r6!, {q8} @ LM0ISR + veor q10, q0, q9 @ xor with round0 key + veor q11, q1, q9 + vtbl.8 d0, {q10}, d16 + vtbl.8 d1, {q10}, d17 + veor q12, q2, q9 + vtbl.8 d2, {q11}, d16 + vtbl.8 d3, {q11}, d17 + veor q13, q3, q9 + vtbl.8 d4, {q12}, d16 + vtbl.8 d5, {q12}, d17 + veor q14, q4, q9 + vtbl.8 d6, {q13}, d16 + vtbl.8 d7, {q13}, d17 + veor q15, q5, q9 + vtbl.8 d8, {q14}, d16 + vtbl.8 d9, {q14}, d17 + veor q10, q6, q9 + vtbl.8 d10, {q15}, d16 + vtbl.8 d11, {q15}, d17 + veor q11, q7, q9 + vtbl.8 d12, {q10}, d16 + vtbl.8 d13, {q10}, d17 + vtbl.8 d14, {q11}, d16 + vtbl.8 d15, {q11}, d17 + vmov.i8 q8,#0x55 @ compose LBS0 + vmov.i8 q9,#0x33 @ compose LBS1 + vshr.u64 q10, q6, #1 + vshr.u64 q11, q4, #1 + veor q10, q10, q7 + veor q11, q11, q5 + vand q10, q10, q8 + vand q11, q11, q8 + veor q7, q7, q10 + vshl.u64 q10, q10, #1 + veor q5, q5, q11 + vshl.u64 q11, q11, #1 + veor q6, q6, q10 + veor q4, q4, q11 + vshr.u64 q10, q2, #1 + vshr.u64 q11, q0, #1 + veor q10, q10, q3 + veor q11, q11, q1 + vand q10, q10, q8 + vand q11, q11, q8 + veor q3, q3, q10 + vshl.u64 q10, q10, #1 + veor q1, q1, q11 + vshl.u64 q11, q11, #1 + veor q2, q2, q10 + veor q0, q0, q11 + vmov.i8 q8,#0x0f @ compose LBS2 + vshr.u64 q10, q5, #2 + vshr.u64 q11, q4, #2 + veor q10, q10, q7 + veor q11, q11, q6 + vand q10, q10, q9 + vand q11, q11, q9 + veor q7, q7, q10 + vshl.u64 q10, q10, #2 + veor q6, q6, q11 + vshl.u64 q11, q11, #2 + veor q5, q5, q10 + veor q4, q4, q11 + vshr.u64 q10, q1, #2 + vshr.u64 q11, q0, #2 + veor q10, q10, q3 + veor q11, q11, q2 + vand q10, q10, q9 + vand q11, q11, q9 + veor q3, q3, q10 + vshl.u64 q10, q10, #2 + veor q2, q2, q11 + vshl.u64 q11, q11, #2 + veor q1, q1, q10 + veor q0, q0, q11 + vshr.u64 q10, q3, #4 + vshr.u64 q11, q2, #4 + veor q10, q10, q7 + veor q11, q11, q6 + vand q10, q10, q8 + vand q11, q11, q8 + veor q7, q7, q10 + vshl.u64 q10, q10, #4 + veor q6, q6, q11 + vshl.u64 q11, q11, #4 + veor q3, q3, q10 + veor q2, q2, q11 + vshr.u64 q10, q1, #4 + vshr.u64 q11, q0, #4 + veor q10, q10, q5 + veor q11, q11, q4 + vand q10, q10, q8 + vand q11, q11, q8 + veor q5, q5, q10 + vshl.u64 q10, q10, #4 + veor q4, q4, q11 + vshl.u64 q11, q11, #4 + veor q1, q1, q10 + veor q0, q0, q11 + sub r5,r5,#1 + b Ldec_sbox +.align 4 +Ldec_loop: + vldmia r4!, {q8,q9,q10,q11} + veor q8, q8, q0 + veor q9, q9, q1 + vtbl.8 d0, {q8}, d24 + vtbl.8 d1, {q8}, d25 + vldmia r4!, {q8} + veor q10, q10, q2 + vtbl.8 d2, {q9}, d24 + vtbl.8 d3, {q9}, d25 + vldmia r4!, {q9} + veor q11, q11, q3 + vtbl.8 d4, {q10}, d24 + vtbl.8 d5, {q10}, d25 + vldmia r4!, {q10} + vtbl.8 d6, {q11}, d24 + 
vtbl.8 d7, {q11}, d25 + vldmia r4!, {q11} + veor q8, q8, q4 + veor q9, q9, q5 + vtbl.8 d8, {q8}, d24 + vtbl.8 d9, {q8}, d25 + veor q10, q10, q6 + vtbl.8 d10, {q9}, d24 + vtbl.8 d11, {q9}, d25 + veor q11, q11, q7 + vtbl.8 d12, {q10}, d24 + vtbl.8 d13, {q10}, d25 + vtbl.8 d14, {q11}, d24 + vtbl.8 d15, {q11}, d25 +Ldec_sbox: + veor q1, q1, q4 + veor q3, q3, q4 + + veor q4, q4, q7 + veor q1, q1, q6 + veor q2, q2, q7 + veor q6, q6, q4 + + veor q0, q0, q1 + veor q2, q2, q5 + veor q7, q7, q6 + veor q3, q3, q0 + veor q5, q5, q0 + veor q1, q1, q3 + veor q11, q3, q0 + veor q10, q7, q4 + veor q9, q1, q6 + veor q13, q4, q0 + vmov q8, q10 + veor q12, q5, q2 + + vorr q10, q10, q9 + veor q15, q11, q8 + vand q14, q11, q12 + vorr q11, q11, q12 + veor q12, q12, q9 + vand q8, q8, q9 + veor q9, q6, q2 + vand q15, q15, q12 + vand q13, q13, q9 + veor q9, q3, q7 + veor q12, q1, q5 + veor q11, q11, q13 + veor q10, q10, q13 + vand q13, q9, q12 + vorr q9, q9, q12 + veor q11, q11, q15 + veor q8, q8, q13 + veor q10, q10, q14 + veor q9, q9, q15 + veor q8, q8, q14 + vand q12, q4, q6 + veor q9, q9, q14 + vand q13, q0, q2 + vand q14, q7, q1 + vorr q15, q3, q5 + veor q11, q11, q12 + veor q9, q9, q14 + veor q8, q8, q15 + veor q10, q10, q13 + + @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3 + + @ new smaller inversion + + vand q14, q11, q9 + vmov q12, q8 + + veor q13, q10, q14 + veor q15, q8, q14 + veor q14, q8, q14 @ q14=q15 + + vbsl q13, q9, q8 + vbsl q15, q11, q10 + veor q11, q11, q10 + + vbsl q12, q13, q14 + vbsl q8, q14, q13 + + vand q14, q12, q15 + veor q9, q9, q8 + + veor q14, q14, q11 + veor q12, q5, q2 + veor q8, q1, q6 + veor q10, q15, q14 + vand q10, q10, q5 + veor q5, q5, q1 + vand q11, q1, q15 + vand q5, q5, q14 + veor q1, q11, q10 + veor q5, q5, q11 + veor q15, q15, q13 + veor q14, q14, q9 + veor q11, q15, q14 + veor q10, q13, q9 + vand q11, q11, q12 + vand q10, q10, q2 + veor q12, q12, q8 + veor q2, q2, q6 + vand q8, q8, q15 + vand q6, q6, q13 + vand q12, q12, q14 + vand q2, q2, q9 + veor q8, q8, q12 + veor q2, q2, q6 + veor q12, q12, q11 + veor q6, q6, q10 + veor q5, q5, q12 + veor q2, q2, q12 + veor q1, q1, q8 + veor q6, q6, q8 + + veor q12, q3, q0 + veor q8, q7, q4 + veor q11, q15, q14 + veor q10, q13, q9 + vand q11, q11, q12 + vand q10, q10, q0 + veor q12, q12, q8 + veor q0, q0, q4 + vand q8, q8, q15 + vand q4, q4, q13 + vand q12, q12, q14 + vand q0, q0, q9 + veor q8, q8, q12 + veor q0, q0, q4 + veor q12, q12, q11 + veor q4, q4, q10 + veor q15, q15, q13 + veor q14, q14, q9 + veor q10, q15, q14 + vand q10, q10, q3 + veor q3, q3, q7 + vand q11, q7, q15 + vand q3, q3, q14 + veor q7, q11, q10 + veor q3, q3, q11 + veor q3, q3, q12 + veor q0, q0, q12 + veor q7, q7, q8 + veor q4, q4, q8 + veor q1, q1, q7 + veor q6, q6, q5 + + veor q4, q4, q1 + veor q2, q2, q7 + veor q5, q5, q7 + veor q4, q4, q2 + veor q7, q7, q0 + veor q4, q4, q5 + veor q3, q3, q6 + veor q6, q6, q1 + veor q3, q3, q4 + + veor q4, q4, q0 + veor q7, q7, q3 + subs r5,r5,#1 + bcc Ldec_done + @ multiplication by 0x05-0x00-0x04-0x00 + vext.8 q8, q0, q0, #8 + vext.8 q14, q3, q3, #8 + vext.8 q15, q5, q5, #8 + veor q8, q8, q0 + vext.8 q9, q1, q1, #8 + veor q14, q14, q3 + vext.8 q10, q6, q6, #8 + veor q15, q15, q5 + vext.8 q11, q4, q4, #8 + veor q9, q9, q1 + vext.8 q12, q2, q2, #8 + veor q10, q10, q6 + vext.8 q13, q7, q7, #8 + veor q11, q11, q4 + veor q12, q12, q2 + veor q13, q13, q7 + + veor q0, q0, q14 + veor q1, q1, q14 + veor q6, q6, q8 + veor q2, q2, q10 + veor q4, q4, q9 + veor q1, q1, q15 + veor q6, q6, q15 + veor q2, q2, q14 + veor q7, q7, q11 + veor q4, 
q4, q14 + veor q3, q3, q12 + veor q2, q2, q15 + veor q7, q7, q15 + veor q5, q5, q13 + vext.8 q8, q0, q0, #12 @ x0 <<< 32 + vext.8 q9, q1, q1, #12 + veor q0, q0, q8 @ x0 ^ (x0 <<< 32) + vext.8 q10, q6, q6, #12 + veor q1, q1, q9 + vext.8 q11, q4, q4, #12 + veor q6, q6, q10 + vext.8 q12, q2, q2, #12 + veor q4, q4, q11 + vext.8 q13, q7, q7, #12 + veor q2, q2, q12 + vext.8 q14, q3, q3, #12 + veor q7, q7, q13 + vext.8 q15, q5, q5, #12 + veor q3, q3, q14 + + veor q9, q9, q0 + veor q5, q5, q15 + vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64) + veor q10, q10, q1 + veor q8, q8, q5 + veor q9, q9, q5 + vext.8 q1, q1, q1, #8 + veor q13, q13, q2 + veor q0, q0, q8 + veor q14, q14, q7 + veor q1, q1, q9 + vext.8 q8, q2, q2, #8 + veor q12, q12, q4 + vext.8 q9, q7, q7, #8 + veor q15, q15, q3 + vext.8 q2, q4, q4, #8 + veor q11, q11, q6 + vext.8 q7, q5, q5, #8 + veor q12, q12, q5 + vext.8 q4, q3, q3, #8 + veor q11, q11, q5 + vext.8 q3, q6, q6, #8 + veor q5, q9, q13 + veor q11, q11, q2 + veor q7, q7, q15 + veor q6, q4, q14 + veor q4, q8, q12 + veor q2, q3, q10 + vmov q3, q11 + @ vmov q5, q9 + vldmia r6, {q12} @ LISR + ite eq @ Thumb2 thing, sanity check in ARM + addeq r6,r6,#0x10 + bne Ldec_loop + vldmia r6, {q12} @ LISRM0 + b Ldec_loop +.align 4 +Ldec_done: + vmov.i8 q8,#0x55 @ compose LBS0 + vmov.i8 q9,#0x33 @ compose LBS1 + vshr.u64 q10, q3, #1 + vshr.u64 q11, q2, #1 + veor q10, q10, q5 + veor q11, q11, q7 + vand q10, q10, q8 + vand q11, q11, q8 + veor q5, q5, q10 + vshl.u64 q10, q10, #1 + veor q7, q7, q11 + vshl.u64 q11, q11, #1 + veor q3, q3, q10 + veor q2, q2, q11 + vshr.u64 q10, q6, #1 + vshr.u64 q11, q0, #1 + veor q10, q10, q4 + veor q11, q11, q1 + vand q10, q10, q8 + vand q11, q11, q8 + veor q4, q4, q10 + vshl.u64 q10, q10, #1 + veor q1, q1, q11 + vshl.u64 q11, q11, #1 + veor q6, q6, q10 + veor q0, q0, q11 + vmov.i8 q8,#0x0f @ compose LBS2 + vshr.u64 q10, q7, #2 + vshr.u64 q11, q2, #2 + veor q10, q10, q5 + veor q11, q11, q3 + vand q10, q10, q9 + vand q11, q11, q9 + veor q5, q5, q10 + vshl.u64 q10, q10, #2 + veor q3, q3, q11 + vshl.u64 q11, q11, #2 + veor q7, q7, q10 + veor q2, q2, q11 + vshr.u64 q10, q1, #2 + vshr.u64 q11, q0, #2 + veor q10, q10, q4 + veor q11, q11, q6 + vand q10, q10, q9 + vand q11, q11, q9 + veor q4, q4, q10 + vshl.u64 q10, q10, #2 + veor q6, q6, q11 + vshl.u64 q11, q11, #2 + veor q1, q1, q10 + veor q0, q0, q11 + vshr.u64 q10, q4, #4 + vshr.u64 q11, q6, #4 + veor q10, q10, q5 + veor q11, q11, q3 + vand q10, q10, q8 + vand q11, q11, q8 + veor q5, q5, q10 + vshl.u64 q10, q10, #4 + veor q3, q3, q11 + vshl.u64 q11, q11, #4 + veor q4, q4, q10 + veor q6, q6, q11 + vshr.u64 q10, q1, #4 + vshr.u64 q11, q0, #4 + veor q10, q10, q7 + veor q11, q11, q2 + vand q10, q10, q8 + vand q11, q11, q8 + veor q7, q7, q10 + vshl.u64 q10, q10, #4 + veor q2, q2, q11 + vshl.u64 q11, q11, #4 + veor q1, q1, q10 + veor q0, q0, q11 + vldmia r4, {q8} @ last round key + veor q6, q6, q8 + veor q4, q4, q8 + veor q2, q2, q8 + veor q7, q7, q8 + veor q3, q3, q8 + veor q5, q5, q8 + veor q0, q0, q8 + veor q1, q1, q8 + bx lr + + + +.align 6 +_bsaes_const: +LM0ISR:@ InvShiftRows constants +.quad 0x0a0e0206070b0f03, 0x0004080c0d010509 +LISR: +.quad 0x0504070602010003, 0x0f0e0d0c080b0a09 +LISRM0: +.quad 0x01040b0e0205080f, 0x0306090c00070a0d +LM0SR:@ ShiftRows constants +.quad 0x0a0e02060f03070b, 0x0004080c05090d01 +LSR: +.quad 0x0504070600030201, 0x0f0e0d0c0a09080b +LSRM0: +.quad 0x0304090e00050a0f, 0x01060b0c0207080d +LM0: +.quad 0x02060a0e03070b0f, 0x0004080c0105090d +LREVM0SR: +.quad 0x090d01050c000408, 
0x03070b0f060a0e02 +.byte 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 6 + + +#ifdef __thumb2__ +.thumb_func _bsaes_encrypt8 +#endif +.align 4 +_bsaes_encrypt8: + adr r6,. + vldmia r4!, {q9} @ round 0 key +#if defined(__thumb2__) || defined(__APPLE__) + adr r6,LM0SR +#else + sub r6,r6,#_bsaes_encrypt8-LM0SR +#endif + + vldmia r6!, {q8} @ LM0SR +_bsaes_encrypt8_alt: + veor q10, q0, q9 @ xor with round0 key + veor q11, q1, q9 + vtbl.8 d0, {q10}, d16 + vtbl.8 d1, {q10}, d17 + veor q12, q2, q9 + vtbl.8 d2, {q11}, d16 + vtbl.8 d3, {q11}, d17 + veor q13, q3, q9 + vtbl.8 d4, {q12}, d16 + vtbl.8 d5, {q12}, d17 + veor q14, q4, q9 + vtbl.8 d6, {q13}, d16 + vtbl.8 d7, {q13}, d17 + veor q15, q5, q9 + vtbl.8 d8, {q14}, d16 + vtbl.8 d9, {q14}, d17 + veor q10, q6, q9 + vtbl.8 d10, {q15}, d16 + vtbl.8 d11, {q15}, d17 + veor q11, q7, q9 + vtbl.8 d12, {q10}, d16 + vtbl.8 d13, {q10}, d17 + vtbl.8 d14, {q11}, d16 + vtbl.8 d15, {q11}, d17 +_bsaes_encrypt8_bitslice: + vmov.i8 q8,#0x55 @ compose LBS0 + vmov.i8 q9,#0x33 @ compose LBS1 + vshr.u64 q10, q6, #1 + vshr.u64 q11, q4, #1 + veor q10, q10, q7 + veor q11, q11, q5 + vand q10, q10, q8 + vand q11, q11, q8 + veor q7, q7, q10 + vshl.u64 q10, q10, #1 + veor q5, q5, q11 + vshl.u64 q11, q11, #1 + veor q6, q6, q10 + veor q4, q4, q11 + vshr.u64 q10, q2, #1 + vshr.u64 q11, q0, #1 + veor q10, q10, q3 + veor q11, q11, q1 + vand q10, q10, q8 + vand q11, q11, q8 + veor q3, q3, q10 + vshl.u64 q10, q10, #1 + veor q1, q1, q11 + vshl.u64 q11, q11, #1 + veor q2, q2, q10 + veor q0, q0, q11 + vmov.i8 q8,#0x0f @ compose LBS2 + vshr.u64 q10, q5, #2 + vshr.u64 q11, q4, #2 + veor q10, q10, q7 + veor q11, q11, q6 + vand q10, q10, q9 + vand q11, q11, q9 + veor q7, q7, q10 + vshl.u64 q10, q10, #2 + veor q6, q6, q11 + vshl.u64 q11, q11, #2 + veor q5, q5, q10 + veor q4, q4, q11 + vshr.u64 q10, q1, #2 + vshr.u64 q11, q0, #2 + veor q10, q10, q3 + veor q11, q11, q2 + vand q10, q10, q9 + vand q11, q11, q9 + veor q3, q3, q10 + vshl.u64 q10, q10, #2 + veor q2, q2, q11 + vshl.u64 q11, q11, #2 + veor q1, q1, q10 + veor q0, q0, q11 + vshr.u64 q10, q3, #4 + vshr.u64 q11, q2, #4 + veor q10, q10, q7 + veor q11, q11, q6 + vand q10, q10, q8 + vand q11, q11, q8 + veor q7, q7, q10 + vshl.u64 q10, q10, #4 + veor q6, q6, q11 + vshl.u64 q11, q11, #4 + veor q3, q3, q10 + veor q2, q2, q11 + vshr.u64 q10, q1, #4 + vshr.u64 q11, q0, #4 + veor q10, q10, q5 + veor q11, q11, q4 + vand q10, q10, q8 + vand q11, q11, q8 + veor q5, q5, q10 + vshl.u64 q10, q10, #4 + veor q4, q4, q11 + vshl.u64 q11, q11, #4 + veor q1, q1, q10 + veor q0, q0, q11 + sub r5,r5,#1 + b Lenc_sbox +.align 4 +Lenc_loop: + vldmia r4!, {q8,q9,q10,q11} + veor q8, q8, q0 + veor q9, q9, q1 + vtbl.8 d0, {q8}, d24 + vtbl.8 d1, {q8}, d25 + vldmia r4!, {q8} + veor q10, q10, q2 + vtbl.8 d2, {q9}, d24 + vtbl.8 d3, {q9}, d25 + vldmia r4!, {q9} + veor q11, q11, q3 + vtbl.8 d4, {q10}, d24 + vtbl.8 d5, {q10}, d25 + vldmia r4!, {q10} + vtbl.8 d6, {q11}, d24 + vtbl.8 d7, {q11}, d25 + vldmia r4!, {q11} + veor q8, q8, q4 + veor q9, q9, q5 + vtbl.8 d8, {q8}, d24 + vtbl.8 d9, {q8}, d25 + veor q10, q10, q6 + vtbl.8 d10, {q9}, d24 + vtbl.8 d11, {q9}, d25 + veor q11, q11, q7 + vtbl.8 d12, {q10}, d24 + vtbl.8 d13, {q10}, d25 + vtbl.8 d14, {q11}, d24 + vtbl.8 d15, {q11}, d25 +Lenc_sbox: + veor q2, q2, q1 + veor q5, q5, q6 + veor q3, q3, q0 + veor q6, q6, q2 + veor q5, q5, q0 + + veor q6, q6, q3 + 
veor q3, q3, q7 + veor q7, q7, q5 + veor q3, q3, q4 + veor q4, q4, q5 + + veor q2, q2, q7 + veor q3, q3, q1 + veor q1, q1, q5 + veor q11, q7, q4 + veor q10, q1, q2 + veor q9, q5, q3 + veor q13, q2, q4 + vmov q8, q10 + veor q12, q6, q0 + + vorr q10, q10, q9 + veor q15, q11, q8 + vand q14, q11, q12 + vorr q11, q11, q12 + veor q12, q12, q9 + vand q8, q8, q9 + veor q9, q3, q0 + vand q15, q15, q12 + vand q13, q13, q9 + veor q9, q7, q1 + veor q12, q5, q6 + veor q11, q11, q13 + veor q10, q10, q13 + vand q13, q9, q12 + vorr q9, q9, q12 + veor q11, q11, q15 + veor q8, q8, q13 + veor q10, q10, q14 + veor q9, q9, q15 + veor q8, q8, q14 + vand q12, q2, q3 + veor q9, q9, q14 + vand q13, q4, q0 + vand q14, q1, q5 + vorr q15, q7, q6 + veor q11, q11, q12 + veor q9, q9, q14 + veor q8, q8, q15 + veor q10, q10, q13 + + @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3 + + @ new smaller inversion + + vand q14, q11, q9 + vmov q12, q8 + + veor q13, q10, q14 + veor q15, q8, q14 + veor q14, q8, q14 @ q14=q15 + + vbsl q13, q9, q8 + vbsl q15, q11, q10 + veor q11, q11, q10 + + vbsl q12, q13, q14 + vbsl q8, q14, q13 + + vand q14, q12, q15 + veor q9, q9, q8 + + veor q14, q14, q11 + veor q12, q6, q0 + veor q8, q5, q3 + veor q10, q15, q14 + vand q10, q10, q6 + veor q6, q6, q5 + vand q11, q5, q15 + vand q6, q6, q14 + veor q5, q11, q10 + veor q6, q6, q11 + veor q15, q15, q13 + veor q14, q14, q9 + veor q11, q15, q14 + veor q10, q13, q9 + vand q11, q11, q12 + vand q10, q10, q0 + veor q12, q12, q8 + veor q0, q0, q3 + vand q8, q8, q15 + vand q3, q3, q13 + vand q12, q12, q14 + vand q0, q0, q9 + veor q8, q8, q12 + veor q0, q0, q3 + veor q12, q12, q11 + veor q3, q3, q10 + veor q6, q6, q12 + veor q0, q0, q12 + veor q5, q5, q8 + veor q3, q3, q8 + + veor q12, q7, q4 + veor q8, q1, q2 + veor q11, q15, q14 + veor q10, q13, q9 + vand q11, q11, q12 + vand q10, q10, q4 + veor q12, q12, q8 + veor q4, q4, q2 + vand q8, q8, q15 + vand q2, q2, q13 + vand q12, q12, q14 + vand q4, q4, q9 + veor q8, q8, q12 + veor q4, q4, q2 + veor q12, q12, q11 + veor q2, q2, q10 + veor q15, q15, q13 + veor q14, q14, q9 + veor q10, q15, q14 + vand q10, q10, q7 + veor q7, q7, q1 + vand q11, q1, q15 + vand q7, q7, q14 + veor q1, q11, q10 + veor q7, q7, q11 + veor q7, q7, q12 + veor q4, q4, q12 + veor q1, q1, q8 + veor q2, q2, q8 + veor q7, q7, q0 + veor q1, q1, q6 + veor q6, q6, q0 + veor q4, q4, q7 + veor q0, q0, q1 + + veor q1, q1, q5 + veor q5, q5, q2 + veor q2, q2, q3 + veor q3, q3, q5 + veor q4, q4, q5 + + veor q6, q6, q3 + subs r5,r5,#1 + bcc Lenc_done + vext.8 q8, q0, q0, #12 @ x0 <<< 32 + vext.8 q9, q1, q1, #12 + veor q0, q0, q8 @ x0 ^ (x0 <<< 32) + vext.8 q10, q4, q4, #12 + veor q1, q1, q9 + vext.8 q11, q6, q6, #12 + veor q4, q4, q10 + vext.8 q12, q3, q3, #12 + veor q6, q6, q11 + vext.8 q13, q7, q7, #12 + veor q3, q3, q12 + vext.8 q14, q2, q2, #12 + veor q7, q7, q13 + vext.8 q15, q5, q5, #12 + veor q2, q2, q14 + + veor q9, q9, q0 + veor q5, q5, q15 + vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64) + veor q10, q10, q1 + veor q8, q8, q5 + veor q9, q9, q5 + vext.8 q1, q1, q1, #8 + veor q13, q13, q3 + veor q0, q0, q8 + veor q14, q14, q7 + veor q1, q1, q9 + vext.8 q8, q3, q3, #8 + veor q12, q12, q6 + vext.8 q9, q7, q7, #8 + veor q15, q15, q2 + vext.8 q3, q6, q6, #8 + veor q11, q11, q4 + vext.8 q7, q5, q5, #8 + veor q12, q12, q5 + vext.8 q6, q2, q2, #8 + veor q11, q11, q5 + vext.8 q2, q4, q4, #8 + veor q5, q9, q13 + veor q4, q8, q12 + veor q3, q3, q11 + veor q7, q7, q15 + veor q6, q6, q14 + @ vmov q4, q8 + veor q2, q2, q10 + @ vmov q5, q9 + vldmia r6, {q12} @ LSR + 
ite eq @ Thumb2 thing, samity check in ARM + addeq r6,r6,#0x10 + bne Lenc_loop + vldmia r6, {q12} @ LSRM0 + b Lenc_loop +.align 4 +Lenc_done: + vmov.i8 q8,#0x55 @ compose LBS0 + vmov.i8 q9,#0x33 @ compose LBS1 + vshr.u64 q10, q2, #1 + vshr.u64 q11, q3, #1 + veor q10, q10, q5 + veor q11, q11, q7 + vand q10, q10, q8 + vand q11, q11, q8 + veor q5, q5, q10 + vshl.u64 q10, q10, #1 + veor q7, q7, q11 + vshl.u64 q11, q11, #1 + veor q2, q2, q10 + veor q3, q3, q11 + vshr.u64 q10, q4, #1 + vshr.u64 q11, q0, #1 + veor q10, q10, q6 + veor q11, q11, q1 + vand q10, q10, q8 + vand q11, q11, q8 + veor q6, q6, q10 + vshl.u64 q10, q10, #1 + veor q1, q1, q11 + vshl.u64 q11, q11, #1 + veor q4, q4, q10 + veor q0, q0, q11 + vmov.i8 q8,#0x0f @ compose LBS2 + vshr.u64 q10, q7, #2 + vshr.u64 q11, q3, #2 + veor q10, q10, q5 + veor q11, q11, q2 + vand q10, q10, q9 + vand q11, q11, q9 + veor q5, q5, q10 + vshl.u64 q10, q10, #2 + veor q2, q2, q11 + vshl.u64 q11, q11, #2 + veor q7, q7, q10 + veor q3, q3, q11 + vshr.u64 q10, q1, #2 + vshr.u64 q11, q0, #2 + veor q10, q10, q6 + veor q11, q11, q4 + vand q10, q10, q9 + vand q11, q11, q9 + veor q6, q6, q10 + vshl.u64 q10, q10, #2 + veor q4, q4, q11 + vshl.u64 q11, q11, #2 + veor q1, q1, q10 + veor q0, q0, q11 + vshr.u64 q10, q6, #4 + vshr.u64 q11, q4, #4 + veor q10, q10, q5 + veor q11, q11, q2 + vand q10, q10, q8 + vand q11, q11, q8 + veor q5, q5, q10 + vshl.u64 q10, q10, #4 + veor q2, q2, q11 + vshl.u64 q11, q11, #4 + veor q6, q6, q10 + veor q4, q4, q11 + vshr.u64 q10, q1, #4 + vshr.u64 q11, q0, #4 + veor q10, q10, q7 + veor q11, q11, q3 + vand q10, q10, q8 + vand q11, q11, q8 + veor q7, q7, q10 + vshl.u64 q10, q10, #4 + veor q3, q3, q11 + vshl.u64 q11, q11, #4 + veor q1, q1, q10 + veor q0, q0, q11 + vldmia r4, {q8} @ last round key + veor q4, q4, q8 + veor q6, q6, q8 + veor q3, q3, q8 + veor q7, q7, q8 + veor q2, q2, q8 + veor q5, q5, q8 + veor q0, q0, q8 + veor q1, q1, q8 + bx lr + +#ifdef __thumb2__ +.thumb_func _bsaes_key_convert +#endif +.align 4 +_bsaes_key_convert: + adr r6,. + vld1.8 {q7}, [r4]! @ load round 0 key +#if defined(__thumb2__) || defined(__APPLE__) + adr r6,LM0 +#else + sub r6,r6,#_bsaes_key_convert-LM0 +#endif + vld1.8 {q15}, [r4]! @ load round 1 key + + vmov.i8 q8, #0x01 @ bit masks + vmov.i8 q9, #0x02 + vmov.i8 q10, #0x04 + vmov.i8 q11, #0x08 + vmov.i8 q12, #0x10 + vmov.i8 q13, #0x20 + vldmia r6, {q14} @ LM0 + +#ifdef __ARMEL__ + vrev32.8 q7, q7 + vrev32.8 q15, q15 +#endif + sub r5,r5,#1 + vstmia r12!, {q7} @ save round 0 key + b Lkey_loop + +.align 4 +Lkey_loop: + vtbl.8 d14,{q15},d28 + vtbl.8 d15,{q15},d29 + vmov.i8 q6, #0x40 + vmov.i8 q15, #0x80 + + vtst.8 q0, q7, q8 + vtst.8 q1, q7, q9 + vtst.8 q2, q7, q10 + vtst.8 q3, q7, q11 + vtst.8 q4, q7, q12 + vtst.8 q5, q7, q13 + vtst.8 q6, q7, q6 + vtst.8 q7, q7, q15 + vld1.8 {q15}, [r4]! @ load next round key + vmvn q0, q0 @ "pnot" + vmvn q1, q1 + vmvn q5, q5 + vmvn q6, q6 +#ifdef __ARMEL__ + vrev32.8 q15, q15 +#endif + subs r5,r5,#1 + vstmia r12!,{q0,q1,q2,q3,q4,q5,q6,q7} @ write bit-sliced round key + bne Lkey_loop + + vmov.i8 q7,#0x63 @ compose L63 + @ don't save last round key + bx lr + +.globl _bsaes_cbc_encrypt +.private_extern _bsaes_cbc_encrypt +#ifdef __thumb2__ +.thumb_func _bsaes_cbc_encrypt +#endif +.align 5 +_bsaes_cbc_encrypt: + @ In OpenSSL, this function had a fallback to aes_nohw_cbc_encrypt for + @ short inputs. We patch this out, using bsaes for all input sizes. 
+ + @ it is up to the caller to make sure we are called with enc == 0 + + mov ip, sp + stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} + VFP_ABI_PUSH + ldr r8, [ip] @ IV is 1st arg on the stack + mov r2, r2, lsr#4 @ len in 16 byte blocks + sub sp, #0x10 @ scratch space to carry over the IV + mov r9, sp @ save sp + + ldr r10, [r3, #240] @ get # of rounds +#ifndef BSAES_ASM_EXTENDED_KEY + @ allocate the key schedule on the stack + sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key + add r12, #96 @ sifze of bit-slices key schedule + + @ populate the key schedule + mov r4, r3 @ pass key + mov r5, r10 @ pass # of rounds + mov sp, r12 @ sp is sp + bl _bsaes_key_convert + vldmia sp, {q6} + vstmia r12, {q15} @ save last round key + veor q7, q7, q6 @ fix up round 0 key + vstmia sp, {q7} +#else + ldr r12, [r3, #244] + eors r12, #1 + beq 0f + + @ populate the key schedule + str r12, [r3, #244] + mov r4, r3 @ pass key + mov r5, r10 @ pass # of rounds + add r12, r3, #248 @ pass key schedule + bl _bsaes_key_convert + add r4, r3, #248 + vldmia r4, {q6} + vstmia r12, {q15} @ save last round key + veor q7, q7, q6 @ fix up round 0 key + vstmia r4, {q7} + +.align 2 + +#endif + + vld1.8 {q15}, [r8] @ load IV + b Lcbc_dec_loop + +.align 4 +Lcbc_dec_loop: + subs r2, r2, #0x8 + bmi Lcbc_dec_loop_finish + + vld1.8 {q0,q1}, [r0]! @ load input + vld1.8 {q2,q3}, [r0]! +#ifndef BSAES_ASM_EXTENDED_KEY + mov r4, sp @ pass the key +#else + add r4, r3, #248 +#endif + vld1.8 {q4,q5}, [r0]! + mov r5, r10 + vld1.8 {q6,q7}, [r0] + sub r0, r0, #0x60 + vstmia r9, {q15} @ put aside IV + + bl _bsaes_decrypt8 + + vldmia r9, {q14} @ reload IV + vld1.8 {q8,q9}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vld1.8 {q10,q11}, [r0]! + veor q1, q1, q8 + veor q6, q6, q9 + vld1.8 {q12,q13}, [r0]! + veor q4, q4, q10 + veor q2, q2, q11 + vld1.8 {q14,q15}, [r0]! + veor q7, q7, q12 + vst1.8 {q0,q1}, [r1]! @ write output + veor q3, q3, q13 + vst1.8 {q6}, [r1]! + veor q5, q5, q14 + vst1.8 {q4}, [r1]! + vst1.8 {q2}, [r1]! + vst1.8 {q7}, [r1]! + vst1.8 {q3}, [r1]! + vst1.8 {q5}, [r1]! + + b Lcbc_dec_loop + +Lcbc_dec_loop_finish: + adds r2, r2, #8 + beq Lcbc_dec_done + + @ Set up most parameters for the _bsaes_decrypt8 call. +#ifndef BSAES_ASM_EXTENDED_KEY + mov r4, sp @ pass the key +#else + add r4, r3, #248 +#endif + mov r5, r10 + vstmia r9, {q15} @ put aside IV + + vld1.8 {q0}, [r0]! @ load input + cmp r2, #2 + blo Lcbc_dec_one + vld1.8 {q1}, [r0]! + beq Lcbc_dec_two + vld1.8 {q2}, [r0]! + cmp r2, #4 + blo Lcbc_dec_three + vld1.8 {q3}, [r0]! + beq Lcbc_dec_four + vld1.8 {q4}, [r0]! + cmp r2, #6 + blo Lcbc_dec_five + vld1.8 {q5}, [r0]! + beq Lcbc_dec_six + vld1.8 {q6}, [r0]! + sub r0, r0, #0x70 + + bl _bsaes_decrypt8 + + vldmia r9, {q14} @ reload IV + vld1.8 {q8,q9}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vld1.8 {q10,q11}, [r0]! + veor q1, q1, q8 + veor q6, q6, q9 + vld1.8 {q12,q13}, [r0]! + veor q4, q4, q10 + veor q2, q2, q11 + vld1.8 {q15}, [r0]! + veor q7, q7, q12 + vst1.8 {q0,q1}, [r1]! @ write output + veor q3, q3, q13 + vst1.8 {q6}, [r1]! + vst1.8 {q4}, [r1]! + vst1.8 {q2}, [r1]! + vst1.8 {q7}, [r1]! + vst1.8 {q3}, [r1]! + b Lcbc_dec_done +.align 4 +Lcbc_dec_six: + sub r0, r0, #0x60 + bl _bsaes_decrypt8 + vldmia r9,{q14} @ reload IV + vld1.8 {q8,q9}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vld1.8 {q10,q11}, [r0]! + veor q1, q1, q8 + veor q6, q6, q9 + vld1.8 {q12}, [r0]! + veor q4, q4, q10 + veor q2, q2, q11 + vld1.8 {q15}, [r0]! + veor q7, q7, q12 + vst1.8 {q0,q1}, [r1]! @ write output + vst1.8 {q6}, [r1]! 
+ vst1.8 {q4}, [r1]! + vst1.8 {q2}, [r1]! + vst1.8 {q7}, [r1]! + b Lcbc_dec_done +.align 4 +Lcbc_dec_five: + sub r0, r0, #0x50 + bl _bsaes_decrypt8 + vldmia r9, {q14} @ reload IV + vld1.8 {q8,q9}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vld1.8 {q10,q11}, [r0]! + veor q1, q1, q8 + veor q6, q6, q9 + vld1.8 {q15}, [r0]! + veor q4, q4, q10 + vst1.8 {q0,q1}, [r1]! @ write output + veor q2, q2, q11 + vst1.8 {q6}, [r1]! + vst1.8 {q4}, [r1]! + vst1.8 {q2}, [r1]! + b Lcbc_dec_done +.align 4 +Lcbc_dec_four: + sub r0, r0, #0x40 + bl _bsaes_decrypt8 + vldmia r9, {q14} @ reload IV + vld1.8 {q8,q9}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vld1.8 {q10}, [r0]! + veor q1, q1, q8 + veor q6, q6, q9 + vld1.8 {q15}, [r0]! + veor q4, q4, q10 + vst1.8 {q0,q1}, [r1]! @ write output + vst1.8 {q6}, [r1]! + vst1.8 {q4}, [r1]! + b Lcbc_dec_done +.align 4 +Lcbc_dec_three: + sub r0, r0, #0x30 + bl _bsaes_decrypt8 + vldmia r9, {q14} @ reload IV + vld1.8 {q8,q9}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vld1.8 {q15}, [r0]! + veor q1, q1, q8 + veor q6, q6, q9 + vst1.8 {q0,q1}, [r1]! @ write output + vst1.8 {q6}, [r1]! + b Lcbc_dec_done +.align 4 +Lcbc_dec_two: + sub r0, r0, #0x20 + bl _bsaes_decrypt8 + vldmia r9, {q14} @ reload IV + vld1.8 {q8}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vld1.8 {q15}, [r0]! @ reload input + veor q1, q1, q8 + vst1.8 {q0,q1}, [r1]! @ write output + b Lcbc_dec_done +.align 4 +Lcbc_dec_one: + sub r0, r0, #0x10 + bl _bsaes_decrypt8 + vldmia r9, {q14} @ reload IV + vld1.8 {q15}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vst1.8 {q0}, [r1]! @ write output + +Lcbc_dec_done: +#ifndef BSAES_ASM_EXTENDED_KEY + vmov.i32 q0, #0 + vmov.i32 q1, #0 +Lcbc_dec_bzero:@ wipe key schedule [if any] + vstmia sp!, {q0,q1} + cmp sp, r9 + bne Lcbc_dec_bzero +#endif + + mov sp, r9 + add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb + vst1.8 {q15}, [r8] @ return IV + VFP_ABI_POP + ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} + +.globl _bsaes_ctr32_encrypt_blocks +.private_extern _bsaes_ctr32_encrypt_blocks +#ifdef __thumb2__ +.thumb_func _bsaes_ctr32_encrypt_blocks +#endif +.align 5 +_bsaes_ctr32_encrypt_blocks: + @ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this + @ out to retain a constant-time implementation. 
+ mov ip, sp + stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} + VFP_ABI_PUSH + ldr r8, [ip] @ ctr is 1st arg on the stack + sub sp, sp, #0x10 @ scratch space to carry over the ctr + mov r9, sp @ save sp + + ldr r10, [r3, #240] @ get # of rounds +#ifndef BSAES_ASM_EXTENDED_KEY + @ allocate the key schedule on the stack + sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key + add r12, #96 @ size of bit-sliced key schedule + + @ populate the key schedule + mov r4, r3 @ pass key + mov r5, r10 @ pass # of rounds + mov sp, r12 @ sp is sp + bl _bsaes_key_convert + veor q7,q7,q15 @ fix up last round key + vstmia r12, {q7} @ save last round key + + vld1.8 {q0}, [r8] @ load counter +#ifdef __APPLE__ + mov r8, #:lower16:(LREVM0SR-LM0) + add r8, r6, r8 +#else + add r8, r6, #LREVM0SR-LM0 @ borrow r8 +#endif + vldmia sp, {q4} @ load round0 key +#else + ldr r12, [r3, #244] + eors r12, #1 + beq 0f + + @ populate the key schedule + str r12, [r3, #244] + mov r4, r3 @ pass key + mov r5, r10 @ pass # of rounds + add r12, r3, #248 @ pass key schedule + bl _bsaes_key_convert + veor q7,q7,q15 @ fix up last round key + vstmia r12, {q7} @ save last round key + +.align 2 + add r12, r3, #248 + vld1.8 {q0}, [r8] @ load counter + adrl r8, LREVM0SR @ borrow r8 + vldmia r12, {q4} @ load round0 key + sub sp, #0x10 @ place for adjusted round0 key +#endif + + vmov.i32 q8,#1 @ compose 1<<96 + veor q9,q9,q9 + vrev32.8 q0,q0 + vext.8 q8,q9,q8,#4 + vrev32.8 q4,q4 + vadd.u32 q9,q8,q8 @ compose 2<<96 + vstmia sp, {q4} @ save adjusted round0 key + b Lctr_enc_loop + +.align 4 +Lctr_enc_loop: + vadd.u32 q10, q8, q9 @ compose 3<<96 + vadd.u32 q1, q0, q8 @ +1 + vadd.u32 q2, q0, q9 @ +2 + vadd.u32 q3, q0, q10 @ +3 + vadd.u32 q4, q1, q10 + vadd.u32 q5, q2, q10 + vadd.u32 q6, q3, q10 + vadd.u32 q7, q4, q10 + vadd.u32 q10, q5, q10 @ next counter + + @ Borrow prologue from _bsaes_encrypt8 to use the opportunity + @ to flip byte order in 32-bit counter + + vldmia sp, {q9} @ load round0 key +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x10 @ pass next round key +#else + add r4, r3, #264 +#endif + vldmia r8, {q8} @ LREVM0SR + mov r5, r10 @ pass rounds + vstmia r9, {q10} @ save next counter +#ifdef __APPLE__ + mov r6, #:lower16:(LREVM0SR-LSR) + sub r6, r8, r6 +#else + sub r6, r8, #LREVM0SR-LSR @ pass constants +#endif + + bl _bsaes_encrypt8_alt + + subs r2, r2, #8 + blo Lctr_enc_loop_done + + vld1.8 {q8,q9}, [r0]! @ load input + vld1.8 {q10,q11}, [r0]! + veor q0, q8 + veor q1, q9 + vld1.8 {q12,q13}, [r0]! + veor q4, q10 + veor q6, q11 + vld1.8 {q14,q15}, [r0]! + veor q3, q12 + vst1.8 {q0,q1}, [r1]! @ write output + veor q7, q13 + veor q2, q14 + vst1.8 {q4}, [r1]! + veor q5, q15 + vst1.8 {q6}, [r1]! + vmov.i32 q8, #1 @ compose 1<<96 + vst1.8 {q3}, [r1]! + veor q9, q9, q9 + vst1.8 {q7}, [r1]! + vext.8 q8, q9, q8, #4 + vst1.8 {q2}, [r1]! + vadd.u32 q9,q8,q8 @ compose 2<<96 + vst1.8 {q5}, [r1]! + vldmia r9, {q0} @ load counter + + bne Lctr_enc_loop + b Lctr_enc_done + +.align 4 +Lctr_enc_loop_done: + add r2, r2, #8 + vld1.8 {q8}, [r0]! @ load input + veor q0, q8 + vst1.8 {q0}, [r1]! @ write output + cmp r2, #2 + blo Lctr_enc_done + vld1.8 {q9}, [r0]! + veor q1, q9 + vst1.8 {q1}, [r1]! + beq Lctr_enc_done + vld1.8 {q10}, [r0]! + veor q4, q10 + vst1.8 {q4}, [r1]! + cmp r2, #4 + blo Lctr_enc_done + vld1.8 {q11}, [r0]! + veor q6, q11 + vst1.8 {q6}, [r1]! + beq Lctr_enc_done + vld1.8 {q12}, [r0]! + veor q3, q12 + vst1.8 {q3}, [r1]! + cmp r2, #6 + blo Lctr_enc_done + vld1.8 {q13}, [r0]! + veor q7, q13 + vst1.8 {q7}, [r1]! 
+ beq Lctr_enc_done + vld1.8 {q14}, [r0] + veor q2, q14 + vst1.8 {q2}, [r1]! + +Lctr_enc_done: + vmov.i32 q0, #0 + vmov.i32 q1, #0 +#ifndef BSAES_ASM_EXTENDED_KEY +Lctr_enc_bzero:@ wipe key schedule [if any] + vstmia sp!, {q0,q1} + cmp sp, r9 + bne Lctr_enc_bzero +#else + vstmia sp, {q0,q1} +#endif + + mov sp, r9 + add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb + VFP_ABI_POP + ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return + + @ OpenSSL contains aes_nohw_* fallback code here. We patch this + @ out to retain a constant-time implementation. + +#endif +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/ghash-armv4.S b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/ghash-armv4.S new file mode 100644 index 0000000000..fccd57d30e --- /dev/null +++ b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/ghash-armv4.S @@ -0,0 +1,600 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include + +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL +@ instructions are in aesv8-armx.pl.) + + +.text +#if defined(__thumb2__) || defined(__clang__) +.syntax unified +#define ldrplb ldrbpl +#define ldrneb ldrbne +#endif +#if defined(__thumb2__) +.thumb +#else +.code 32 +#endif + + +.align 5 +rem_4bit: +.short 0x0000,0x1C20,0x3840,0x2460 +.short 0x7080,0x6CA0,0x48C0,0x54E0 +.short 0xE100,0xFD20,0xD940,0xC560 +.short 0x9180,0x8DA0,0xA9C0,0xB5E0 + + +#ifdef __thumb2__ +.thumb_func rem_4bit_get +#endif +rem_4bit_get: +#if defined(__thumb2__) + adr r2,rem_4bit +#else + sub r2,pc,#8+32 @ &rem_4bit +#endif + b Lrem_4bit_got + nop + nop + + +.globl _gcm_ghash_4bit +.private_extern _gcm_ghash_4bit +#ifdef __thumb2__ +.thumb_func _gcm_ghash_4bit +#endif +.align 4 +_gcm_ghash_4bit: +#if defined(__thumb2__) + adr r12,rem_4bit +#else + sub r12,pc,#8+48 @ &rem_4bit +#endif + add r3,r2,r3 @ r3 to point at the end + stmdb sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr} @ save r3/end too + + ldmia r12,{r4,r5,r6,r7,r8,r9,r10,r11} @ copy rem_4bit ... + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} @ ... 
to stack + + ldrb r12,[r2,#15] + ldrb r14,[r0,#15] +Louter: + eor r12,r12,r14 + and r14,r12,#0xf0 + and r12,r12,#0x0f + mov r3,#14 + + add r7,r1,r12,lsl#4 + ldmia r7,{r4,r5,r6,r7} @ load Htbl[nlo] + add r11,r1,r14 + ldrb r12,[r2,#14] + + and r14,r4,#0xf @ rem + ldmia r11,{r8,r9,r10,r11} @ load Htbl[nhi] + add r14,r14,r14 + eor r4,r8,r4,lsr#4 + ldrh r8,[sp,r14] @ rem_4bit[rem] + eor r4,r4,r5,lsl#28 + ldrb r14,[r0,#14] + eor r5,r9,r5,lsr#4 + eor r5,r5,r6,lsl#28 + eor r6,r10,r6,lsr#4 + eor r6,r6,r7,lsl#28 + eor r7,r11,r7,lsr#4 + eor r12,r12,r14 + and r14,r12,#0xf0 + and r12,r12,#0x0f + eor r7,r7,r8,lsl#16 + +Linner: + add r11,r1,r12,lsl#4 + and r12,r4,#0xf @ rem + subs r3,r3,#1 + add r12,r12,r12 + ldmia r11,{r8,r9,r10,r11} @ load Htbl[nlo] + eor r4,r8,r4,lsr#4 + eor r4,r4,r5,lsl#28 + eor r5,r9,r5,lsr#4 + eor r5,r5,r6,lsl#28 + ldrh r8,[sp,r12] @ rem_4bit[rem] + eor r6,r10,r6,lsr#4 +#ifdef __thumb2__ + it pl +#endif + ldrplb r12,[r2,r3] + eor r6,r6,r7,lsl#28 + eor r7,r11,r7,lsr#4 + + add r11,r1,r14 + and r14,r4,#0xf @ rem + eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem] + add r14,r14,r14 + ldmia r11,{r8,r9,r10,r11} @ load Htbl[nhi] + eor r4,r8,r4,lsr#4 +#ifdef __thumb2__ + it pl +#endif + ldrplb r8,[r0,r3] + eor r4,r4,r5,lsl#28 + eor r5,r9,r5,lsr#4 + ldrh r9,[sp,r14] + eor r5,r5,r6,lsl#28 + eor r6,r10,r6,lsr#4 + eor r6,r6,r7,lsl#28 +#ifdef __thumb2__ + it pl +#endif + eorpl r12,r12,r8 + eor r7,r11,r7,lsr#4 +#ifdef __thumb2__ + itt pl +#endif + andpl r14,r12,#0xf0 + andpl r12,r12,#0x0f + eor r7,r7,r9,lsl#16 @ ^= rem_4bit[rem] + bpl Linner + + ldr r3,[sp,#32] @ re-load r3/end + add r2,r2,#16 + mov r14,r4 +#if __ARM_ARCH__>=7 && defined(__ARMEL__) + rev r4,r4 + str r4,[r0,#12] +#elif defined(__ARMEB__) + str r4,[r0,#12] +#else + mov r9,r4,lsr#8 + strb r4,[r0,#12+3] + mov r10,r4,lsr#16 + strb r9,[r0,#12+2] + mov r11,r4,lsr#24 + strb r10,[r0,#12+1] + strb r11,[r0,#12] +#endif + cmp r2,r3 +#if __ARM_ARCH__>=7 && defined(__ARMEL__) + rev r5,r5 + str r5,[r0,#8] +#elif defined(__ARMEB__) + str r5,[r0,#8] +#else + mov r9,r5,lsr#8 + strb r5,[r0,#8+3] + mov r10,r5,lsr#16 + strb r9,[r0,#8+2] + mov r11,r5,lsr#24 + strb r10,[r0,#8+1] + strb r11,[r0,#8] +#endif + +#ifdef __thumb2__ + it ne +#endif + ldrneb r12,[r2,#15] +#if __ARM_ARCH__>=7 && defined(__ARMEL__) + rev r6,r6 + str r6,[r0,#4] +#elif defined(__ARMEB__) + str r6,[r0,#4] +#else + mov r9,r6,lsr#8 + strb r6,[r0,#4+3] + mov r10,r6,lsr#16 + strb r9,[r0,#4+2] + mov r11,r6,lsr#24 + strb r10,[r0,#4+1] + strb r11,[r0,#4] +#endif + +#if __ARM_ARCH__>=7 && defined(__ARMEL__) + rev r7,r7 + str r7,[r0,#0] +#elif defined(__ARMEB__) + str r7,[r0,#0] +#else + mov r9,r7,lsr#8 + strb r7,[r0,#0+3] + mov r10,r7,lsr#16 + strb r9,[r0,#0+2] + mov r11,r7,lsr#24 + strb r10,[r0,#0+1] + strb r11,[r0,#0] +#endif + + bne Louter + + add sp,sp,#36 +#if __ARM_ARCH__>=5 + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} +#else + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif + + +.globl _gcm_gmult_4bit +.private_extern _gcm_gmult_4bit +#ifdef __thumb2__ +.thumb_func _gcm_gmult_4bit +#endif +_gcm_gmult_4bit: + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr} + ldrb r12,[r0,#15] + b rem_4bit_get +Lrem_4bit_got: + and r14,r12,#0xf0 + and r12,r12,#0x0f + mov r3,#14 + + add r7,r1,r12,lsl#4 + ldmia r7,{r4,r5,r6,r7} @ load Htbl[nlo] + ldrb r12,[r0,#14] + + add r11,r1,r14 + and r14,r4,#0xf @ rem + ldmia r11,{r8,r9,r10,r11} @ load Htbl[nhi] + add r14,r14,r14 + eor r4,r8,r4,lsr#4 + ldrh r8,[r2,r14] @ 
rem_4bit[rem] + eor r4,r4,r5,lsl#28 + eor r5,r9,r5,lsr#4 + eor r5,r5,r6,lsl#28 + eor r6,r10,r6,lsr#4 + eor r6,r6,r7,lsl#28 + eor r7,r11,r7,lsr#4 + and r14,r12,#0xf0 + eor r7,r7,r8,lsl#16 + and r12,r12,#0x0f + +Loop: + add r11,r1,r12,lsl#4 + and r12,r4,#0xf @ rem + subs r3,r3,#1 + add r12,r12,r12 + ldmia r11,{r8,r9,r10,r11} @ load Htbl[nlo] + eor r4,r8,r4,lsr#4 + eor r4,r4,r5,lsl#28 + eor r5,r9,r5,lsr#4 + eor r5,r5,r6,lsl#28 + ldrh r8,[r2,r12] @ rem_4bit[rem] + eor r6,r10,r6,lsr#4 +#ifdef __thumb2__ + it pl +#endif + ldrplb r12,[r0,r3] + eor r6,r6,r7,lsl#28 + eor r7,r11,r7,lsr#4 + + add r11,r1,r14 + and r14,r4,#0xf @ rem + eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem] + add r14,r14,r14 + ldmia r11,{r8,r9,r10,r11} @ load Htbl[nhi] + eor r4,r8,r4,lsr#4 + eor r4,r4,r5,lsl#28 + eor r5,r9,r5,lsr#4 + ldrh r8,[r2,r14] @ rem_4bit[rem] + eor r5,r5,r6,lsl#28 + eor r6,r10,r6,lsr#4 + eor r6,r6,r7,lsl#28 + eor r7,r11,r7,lsr#4 +#ifdef __thumb2__ + itt pl +#endif + andpl r14,r12,#0xf0 + andpl r12,r12,#0x0f + eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem] + bpl Loop +#if __ARM_ARCH__>=7 && defined(__ARMEL__) + rev r4,r4 + str r4,[r0,#12] +#elif defined(__ARMEB__) + str r4,[r0,#12] +#else + mov r9,r4,lsr#8 + strb r4,[r0,#12+3] + mov r10,r4,lsr#16 + strb r9,[r0,#12+2] + mov r11,r4,lsr#24 + strb r10,[r0,#12+1] + strb r11,[r0,#12] +#endif + +#if __ARM_ARCH__>=7 && defined(__ARMEL__) + rev r5,r5 + str r5,[r0,#8] +#elif defined(__ARMEB__) + str r5,[r0,#8] +#else + mov r9,r5,lsr#8 + strb r5,[r0,#8+3] + mov r10,r5,lsr#16 + strb r9,[r0,#8+2] + mov r11,r5,lsr#24 + strb r10,[r0,#8+1] + strb r11,[r0,#8] +#endif + +#if __ARM_ARCH__>=7 && defined(__ARMEL__) + rev r6,r6 + str r6,[r0,#4] +#elif defined(__ARMEB__) + str r6,[r0,#4] +#else + mov r9,r6,lsr#8 + strb r6,[r0,#4+3] + mov r10,r6,lsr#16 + strb r9,[r0,#4+2] + mov r11,r6,lsr#24 + strb r10,[r0,#4+1] + strb r11,[r0,#4] +#endif + +#if __ARM_ARCH__>=7 && defined(__ARMEL__) + rev r7,r7 + str r7,[r0,#0] +#elif defined(__ARMEB__) + str r7,[r0,#0] +#else + mov r9,r7,lsr#8 + strb r7,[r0,#0+3] + mov r10,r7,lsr#16 + strb r9,[r0,#0+2] + mov r11,r7,lsr#24 + strb r10,[r0,#0+1] + strb r11,[r0,#0] +#endif + +#if __ARM_ARCH__>=5 + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} +#else + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif + +#if __ARM_MAX_ARCH__>=7 + + + +.globl _gcm_init_neon +.private_extern _gcm_init_neon +#ifdef __thumb2__ +.thumb_func _gcm_init_neon +#endif +.align 4 +_gcm_init_neon: + vld1.64 d7,[r1]! @ load H + vmov.i8 q8,#0xe1 + vld1.64 d6,[r1] + vshl.i64 d17,#57 + vshr.u64 d16,#63 @ t0=0xc2....01 + vdup.8 q9,d7[7] + vshr.u64 d26,d6,#63 + vshr.s8 q9,#7 @ broadcast carry bit + vshl.i64 q3,q3,#1 + vand q8,q8,q9 + vorr d7,d26 @ H<<<=1 + veor q3,q3,q8 @ twisted H + vstmia r0,{q3} + + bx lr @ bx lr + + +.globl _gcm_gmult_neon +.private_extern _gcm_gmult_neon +#ifdef __thumb2__ +.thumb_func _gcm_gmult_neon +#endif +.align 4 +_gcm_gmult_neon: + vld1.64 d7,[r0]! @ load Xi + vld1.64 d6,[r0]! + vmov.i64 d29,#0x0000ffffffffffff + vldmia r1,{d26,d27} @ load twisted H + vmov.i64 d30,#0x00000000ffffffff +#ifdef __ARMEL__ + vrev64.8 q3,q3 +#endif + vmov.i64 d31,#0x000000000000ffff + veor d28,d26,d27 @ Karatsuba pre-processing + mov r3,#16 + b Lgmult_neon + + +.globl _gcm_ghash_neon +.private_extern _gcm_ghash_neon +#ifdef __thumb2__ +.thumb_func _gcm_ghash_neon +#endif +.align 4 +_gcm_ghash_neon: + vld1.64 d1,[r0]! @ load Xi + vld1.64 d0,[r0]! 
+ vmov.i64 d29,#0x0000ffffffffffff + vldmia r1,{d26,d27} @ load twisted H + vmov.i64 d30,#0x00000000ffffffff +#ifdef __ARMEL__ + vrev64.8 q0,q0 +#endif + vmov.i64 d31,#0x000000000000ffff + veor d28,d26,d27 @ Karatsuba pre-processing + +Loop_neon: + vld1.64 d7,[r2]! @ load inp + vld1.64 d6,[r2]! +#ifdef __ARMEL__ + vrev64.8 q3,q3 +#endif + veor q3,q0 @ inp^=Xi +Lgmult_neon: + vext.8 d16, d26, d26, #1 @ A1 + vmull.p8 q8, d16, d6 @ F = A1*B + vext.8 d0, d6, d6, #1 @ B1 + vmull.p8 q0, d26, d0 @ E = A*B1 + vext.8 d18, d26, d26, #2 @ A2 + vmull.p8 q9, d18, d6 @ H = A2*B + vext.8 d22, d6, d6, #2 @ B2 + vmull.p8 q11, d26, d22 @ G = A*B2 + vext.8 d20, d26, d26, #3 @ A3 + veor q8, q8, q0 @ L = E + F + vmull.p8 q10, d20, d6 @ J = A3*B + vext.8 d0, d6, d6, #3 @ B3 + veor q9, q9, q11 @ M = G + H + vmull.p8 q0, d26, d0 @ I = A*B3 + veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 + vand d17, d17, d29 + vext.8 d22, d6, d6, #4 @ B4 + veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 + vand d19, d19, d30 + vmull.p8 q11, d26, d22 @ K = A*B4 + veor q10, q10, q0 @ N = I + J + veor d16, d16, d17 + veor d18, d18, d19 + veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 + vand d21, d21, d31 + vext.8 q8, q8, q8, #15 + veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 + vmov.i64 d23, #0 + vext.8 q9, q9, q9, #14 + veor d20, d20, d21 + vmull.p8 q0, d26, d6 @ D = A*B + vext.8 q11, q11, q11, #12 + vext.8 q10, q10, q10, #13 + veor q8, q8, q9 + veor q10, q10, q11 + veor q0, q0, q8 + veor q0, q0, q10 + veor d6,d6,d7 @ Karatsuba pre-processing + vext.8 d16, d28, d28, #1 @ A1 + vmull.p8 q8, d16, d6 @ F = A1*B + vext.8 d2, d6, d6, #1 @ B1 + vmull.p8 q1, d28, d2 @ E = A*B1 + vext.8 d18, d28, d28, #2 @ A2 + vmull.p8 q9, d18, d6 @ H = A2*B + vext.8 d22, d6, d6, #2 @ B2 + vmull.p8 q11, d28, d22 @ G = A*B2 + vext.8 d20, d28, d28, #3 @ A3 + veor q8, q8, q1 @ L = E + F + vmull.p8 q10, d20, d6 @ J = A3*B + vext.8 d2, d6, d6, #3 @ B3 + veor q9, q9, q11 @ M = G + H + vmull.p8 q1, d28, d2 @ I = A*B3 + veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 + vand d17, d17, d29 + vext.8 d22, d6, d6, #4 @ B4 + veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 + vand d19, d19, d30 + vmull.p8 q11, d28, d22 @ K = A*B4 + veor q10, q10, q1 @ N = I + J + veor d16, d16, d17 + veor d18, d18, d19 + veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 + vand d21, d21, d31 + vext.8 q8, q8, q8, #15 + veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 + vmov.i64 d23, #0 + vext.8 q9, q9, q9, #14 + veor d20, d20, d21 + vmull.p8 q1, d28, d6 @ D = A*B + vext.8 q11, q11, q11, #12 + vext.8 q10, q10, q10, #13 + veor q8, q8, q9 + veor q10, q10, q11 + veor q1, q1, q8 + veor q1, q1, q10 + vext.8 d16, d27, d27, #1 @ A1 + vmull.p8 q8, d16, d7 @ F = A1*B + vext.8 d4, d7, d7, #1 @ B1 + vmull.p8 q2, d27, d4 @ E = A*B1 + vext.8 d18, d27, d27, #2 @ A2 + vmull.p8 q9, d18, d7 @ H = A2*B + vext.8 d22, d7, d7, #2 @ B2 + vmull.p8 q11, d27, d22 @ G = A*B2 + vext.8 d20, d27, d27, #3 @ A3 + veor q8, q8, q2 @ L = E + F + vmull.p8 q10, d20, d7 @ J = A3*B + vext.8 d4, d7, d7, #3 @ B3 + veor q9, q9, q11 @ M = G + H + vmull.p8 q2, d27, d4 @ I = A*B3 + veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 + vand d17, d17, d29 + vext.8 d22, d7, d7, #4 @ B4 + veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 + vand d19, d19, d30 + vmull.p8 q11, d27, d22 @ K = A*B4 + veor q10, q10, q2 @ N = I + J + veor d16, d16, d17 + veor d18, d18, d19 + veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 + vand d21, d21, d31 + vext.8 q8, q8, q8, #15 + veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 + vmov.i64 d23, #0 + vext.8 q9, q9, q9, #14 + veor d20, d20, 
d21 + vmull.p8 q2, d27, d7 @ D = A*B + vext.8 q11, q11, q11, #12 + vext.8 q10, q10, q10, #13 + veor q8, q8, q9 + veor q10, q10, q11 + veor q2, q2, q8 + veor q2, q2, q10 + veor q1,q1,q0 @ Karatsuba post-processing + veor q1,q1,q2 + veor d1,d1,d2 + veor d4,d4,d3 @ Xh|Xl - 256-bit result + + @ equivalent of reduction_avx from ghash-x86_64.pl + vshl.i64 q9,q0,#57 @ 1st phase + vshl.i64 q10,q0,#62 + veor q10,q10,q9 @ + vshl.i64 q9,q0,#63 + veor q10, q10, q9 @ + veor d1,d1,d20 @ + veor d4,d4,d21 + + vshr.u64 q10,q0,#1 @ 2nd phase + veor q2,q2,q0 + veor q0,q0,q10 @ + vshr.u64 q10,q10,#6 + vshr.u64 q0,q0,#1 @ + veor q0,q0,q2 @ + veor q0,q0,q10 @ + + subs r3,#16 + bne Loop_neon + +#ifdef __ARMEL__ + vrev64.8 q0,q0 +#endif + sub r0,#16 + vst1.64 d1,[r0]! @ write out Xi + vst1.64 d0,[r0] + + bx lr @ bx lr + +#endif +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/ghashv8-armx32.S b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/ghashv8-armx32.S new file mode 100644 index 0000000000..f5de67f037 --- /dev/null +++ b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/ghashv8-armx32.S @@ -0,0 +1,256 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include + +.text + +.code 32 +#undef __thumb2__ +.globl _gcm_init_v8 +.private_extern _gcm_init_v8 +#ifdef __thumb2__ +.thumb_func _gcm_init_v8 +#endif +.align 4 +_gcm_init_v8: + vld1.64 {q9},[r1] @ load input H + vmov.i8 q11,#0xe1 + vshl.i64 q11,q11,#57 @ 0xc2.0 + vext.8 q3,q9,q9,#8 + vshr.u64 q10,q11,#63 + vdup.32 q9,d18[1] + vext.8 q8,q10,q11,#8 @ t0=0xc2....01 + vshr.u64 q10,q3,#63 + vshr.s32 q9,q9,#31 @ broadcast carry bit + vand q10,q10,q8 + vshl.i64 q3,q3,#1 + vext.8 q10,q10,q10,#8 + vand q8,q8,q9 + vorr q3,q3,q10 @ H<<<=1 + veor q12,q3,q8 @ twisted H + vst1.64 {q12},[r0]! @ store Htable[0] + + @ calculate H^2 + vext.8 q8,q12,q12,#8 @ Karatsuba pre-processing +.byte 0xa8,0x0e,0xa8,0xf2 @ pmull q0,q12,q12 + veor q8,q8,q12 +.byte 0xa9,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q12 +.byte 0xa0,0x2e,0xa0,0xf2 @ pmull q1,q8,q8 + + vext.8 q9,q0,q2,#8 @ Karatsuba post-processing + veor q10,q0,q2 + veor q1,q1,q9 + veor q1,q1,q10 +.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase + + vmov d4,d3 @ Xh|Xm - 256-bit result + vmov d3,d0 @ Xm is rotated Xl + veor q0,q1,q10 + + vext.8 q10,q0,q0,#8 @ 2nd phase +.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 + veor q10,q10,q2 + veor q14,q0,q10 + + vext.8 q9,q14,q14,#8 @ Karatsuba pre-processing + veor q9,q9,q14 + vext.8 q13,q8,q9,#8 @ pack Karatsuba pre-processed + vst1.64 {q13,q14},[r0] @ store Htable[1..2] + + bx lr + +.globl _gcm_gmult_v8 +.private_extern _gcm_gmult_v8 +#ifdef __thumb2__ +.thumb_func _gcm_gmult_v8 +#endif +.align 4 +_gcm_gmult_v8: + vld1.64 {q9},[r0] @ load Xi + vmov.i8 q11,#0xe1 + vld1.64 {q12,q13},[r1] @ load twisted H, ... 
+ vshl.u64 q11,q11,#57 +#ifndef __ARMEB__ + vrev64.8 q9,q9 +#endif + vext.8 q3,q9,q9,#8 + +.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo + veor q9,q9,q3 @ Karatsuba pre-processing +.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi +.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) + + vext.8 q9,q0,q2,#8 @ Karatsuba post-processing + veor q10,q0,q2 + veor q1,q1,q9 + veor q1,q1,q10 +.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction + + vmov d4,d3 @ Xh|Xm - 256-bit result + vmov d3,d0 @ Xm is rotated Xl + veor q0,q1,q10 + + vext.8 q10,q0,q0,#8 @ 2nd phase of reduction +.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 + veor q10,q10,q2 + veor q0,q0,q10 + +#ifndef __ARMEB__ + vrev64.8 q0,q0 +#endif + vext.8 q0,q0,q0,#8 + vst1.64 {q0},[r0] @ write out Xi + + bx lr + +.globl _gcm_ghash_v8 +.private_extern _gcm_ghash_v8 +#ifdef __thumb2__ +.thumb_func _gcm_ghash_v8 +#endif +.align 4 +_gcm_ghash_v8: + vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ 32-bit ABI says so + vld1.64 {q0},[r0] @ load [rotated] Xi + @ "[rotated]" means that + @ loaded value would have + @ to be rotated in order to + @ make it appear as in + @ algorithm specification + subs r3,r3,#32 @ see if r3 is 32 or larger + mov r12,#16 @ r12 is used as post- + @ increment for input pointer; + @ as loop is modulo-scheduled + @ r12 is zeroed just in time + @ to preclude overstepping + @ inp[len], which means that + @ last block[s] are actually + @ loaded twice, but last + @ copy is not processed + vld1.64 {q12,q13},[r1]! @ load twisted H, ..., H^2 + vmov.i8 q11,#0xe1 + vld1.64 {q14},[r1] + moveq r12,#0 @ is it time to zero r12? + vext.8 q0,q0,q0,#8 @ rotate Xi + vld1.64 {q8},[r2]! @ load [rotated] I[0] + vshl.u64 q11,q11,#57 @ compose 0xc2.0 constant +#ifndef __ARMEB__ + vrev64.8 q8,q8 + vrev64.8 q0,q0 +#endif + vext.8 q3,q8,q8,#8 @ rotate I[0] + blo Lodd_tail_v8 @ r3 was less than 32 + vld1.64 {q9},[r2],r12 @ load [rotated] I[1] +#ifndef __ARMEB__ + vrev64.8 q9,q9 +#endif + vext.8 q7,q9,q9,#8 + veor q3,q3,q0 @ I[i]^=Xi +.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 + veor q9,q9,q7 @ Karatsuba pre-processing +.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7 + b Loop_mod2x_v8 + +.align 4 +Loop_mod2x_v8: + vext.8 q10,q3,q3,#8 + subs r3,r3,#32 @ is there more data? +.byte 0x86,0x0e,0xac,0xf2 @ pmull q0,q14,q3 @ H^2.lo·Xi.lo + movlo r12,#0 @ is it time to zero r12? + +.byte 0xa2,0xae,0xaa,0xf2 @ pmull q5,q13,q9 + veor q10,q10,q3 @ Karatsuba pre-processing +.byte 0x87,0x4e,0xad,0xf2 @ pmull2 q2,q14,q3 @ H^2.hi·Xi.hi + veor q0,q0,q4 @ accumulate +.byte 0xa5,0x2e,0xab,0xf2 @ pmull2 q1,q13,q10 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) + vld1.64 {q8},[r2],r12 @ load [rotated] I[i+2] + + veor q2,q2,q6 + moveq r12,#0 @ is it time to zero r12? 
+ veor q1,q1,q5 + + vext.8 q9,q0,q2,#8 @ Karatsuba post-processing + veor q10,q0,q2 + veor q1,q1,q9 + vld1.64 {q9},[r2],r12 @ load [rotated] I[i+3] +#ifndef __ARMEB__ + vrev64.8 q8,q8 +#endif + veor q1,q1,q10 +.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction + +#ifndef __ARMEB__ + vrev64.8 q9,q9 +#endif + vmov d4,d3 @ Xh|Xm - 256-bit result + vmov d3,d0 @ Xm is rotated Xl + vext.8 q7,q9,q9,#8 + vext.8 q3,q8,q8,#8 + veor q0,q1,q10 +.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 + veor q3,q3,q2 @ accumulate q3 early + + vext.8 q10,q0,q0,#8 @ 2nd phase of reduction +.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 + veor q3,q3,q10 + veor q9,q9,q7 @ Karatsuba pre-processing + veor q3,q3,q0 +.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7 + bhs Loop_mod2x_v8 @ there was at least 32 more bytes + + veor q2,q2,q10 + vext.8 q3,q8,q8,#8 @ re-construct q3 + adds r3,r3,#32 @ re-construct r3 + veor q0,q0,q2 @ re-construct q0 + beq Ldone_v8 @ is r3 zero? +Lodd_tail_v8: + vext.8 q10,q0,q0,#8 + veor q3,q3,q0 @ inp^=Xi + veor q9,q8,q10 @ q9 is rotated inp^Xi + +.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo + veor q9,q9,q3 @ Karatsuba pre-processing +.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi +.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) + + vext.8 q9,q0,q2,#8 @ Karatsuba post-processing + veor q10,q0,q2 + veor q1,q1,q9 + veor q1,q1,q10 +.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction + + vmov d4,d3 @ Xh|Xm - 256-bit result + vmov d3,d0 @ Xm is rotated Xl + veor q0,q1,q10 + + vext.8 q10,q0,q0,#8 @ 2nd phase of reduction +.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 + veor q10,q10,q2 + veor q0,q0,q10 + +Ldone_v8: +#ifndef __ARMEB__ + vrev64.8 q0,q0 +#endif + vext.8 q0,q0,q0,#8 + vst1.64 {q0},[r0] @ write out Xi + + vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ 32-bit ABI says so + bx lr + +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/sha1-armv4-large.S b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/sha1-armv4-large.S new file mode 100644 index 0000000000..82ac8df4fc --- /dev/null +++ b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/sha1-armv4-large.S @@ -0,0 +1,1518 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +.globl _sha1_block_data_order +.private_extern _sha1_block_data_order +#ifdef __thumb2__ +.thumb_func _sha1_block_data_order +#endif + +.align 5 +_sha1_block_data_order: +#if __ARM_MAX_ARCH__>=7 +Lsha1_block: + adr r3,Lsha1_block + ldr r12,LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P +#ifdef __APPLE__ + ldr r12,[r12] +#endif + tst r12,#ARMV8_SHA1 + bne LARMv8 + tst r12,#ARMV7_NEON + bne LNEON +#endif + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 + ldmia r0,{r3,r4,r5,r6,r7} +Lloop: + ldr r8,LK_00_19 + mov r14,sp + sub sp,sp,#15*4 + mov r5,r5,ror#30 + mov r6,r6,ror#30 + mov r7,r7,ror#30 @ [6] +L_00_15: +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r7,r8,r7,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r5,r6 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r7,r8,r7,ror#2 @ E+=K_00_19 + eor r10,r5,r6 @ F_xx_xx + add r7,r7,r3,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r4,r10,ror#2 + add r7,r7,r9 @ E+=X[i] + eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r7,r7,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r6,r8,r6,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r4,r5 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r6,r8,r6,ror#2 @ E+=K_00_19 + eor r10,r4,r5 @ F_xx_xx + add r6,r6,r7,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r3,r10,ror#2 + add r6,r6,r9 @ E+=X[i] + eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r6,r6,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r5,r8,r5,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r3,r4 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r5,r8,r5,ror#2 @ E+=K_00_19 + eor r10,r3,r4 @ F_xx_xx + add r5,r5,r6,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r7,r10,ror#2 + add r5,r5,r9 @ E+=X[i] + eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r5,r5,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r4,r8,r4,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r7,r3 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r4,r8,r4,ror#2 @ E+=K_00_19 + eor r10,r7,r3 @ F_xx_xx + add r4,r4,r5,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r6,r10,ror#2 + add r4,r4,r9 @ E+=X[i] + eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! 
+ add r4,r4,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r3,r8,r3,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r6,r7 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r3,r8,r3,ror#2 @ E+=K_00_19 + eor r10,r6,r7 @ F_xx_xx + add r3,r3,r4,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r5,r10,ror#2 + add r3,r3,r9 @ E+=X[i] + eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r3,r3,r10 @ E+=F_00_19(B,C,D) +#if defined(__thumb2__) + mov r12,sp + teq r14,r12 +#else + teq r14,sp +#endif + bne L_00_15 @ [((11+4)*5+2)*3] + sub sp,sp,#25*4 +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r7,r8,r7,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r5,r6 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r7,r8,r7,ror#2 @ E+=K_00_19 + eor r10,r5,r6 @ F_xx_xx + add r7,r7,r3,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r4,r10,ror#2 + add r7,r7,r9 @ E+=X[i] + eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r7,r7,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r6,r8,r6,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r4,r5 @ F_xx_xx + mov r9,r9,ror#31 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r3,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r6,r6,r9 @ E+=X[i] + eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) + add r6,r6,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r5,r8,r5,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r3,r4 @ F_xx_xx + mov r9,r9,ror#31 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r7,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r5,r5,r9 @ E+=X[i] + eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) + add r5,r5,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r4,r8,r4,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r7,r3 @ F_xx_xx + mov r9,r9,ror#31 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r6,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r4,r4,r9 @ E+=X[i] + eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) + add r4,r4,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r3,r8,r3,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r6,r7 @ F_xx_xx + mov r9,r9,ror#31 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r5,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r3,r3,r9 @ E+=X[i] + eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) + add r3,r3,r10 @ E+=F_00_19(B,C,D) + + ldr r8,LK_20_39 @ [+15+16*4] + cmn sp,#0 @ [+3], clear carry to denote 20_39 +L_20_39_or_60_79: + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r7,r8,r7,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r5,r6 @ F_xx_xx + mov r9,r9,ror#31 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! 
+ eor r10,r4,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r7,r7,r9 @ E+=X[i] + add r7,r7,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r6,r8,r6,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r4,r5 @ F_xx_xx + mov r9,r9,ror#31 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r3,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r6,r6,r9 @ E+=X[i] + add r6,r6,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r5,r8,r5,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r3,r4 @ F_xx_xx + mov r9,r9,ror#31 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r7,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r5,r5,r9 @ E+=X[i] + add r5,r5,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r4,r8,r4,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r7,r3 @ F_xx_xx + mov r9,r9,ror#31 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r6,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r4,r4,r9 @ E+=X[i] + add r4,r4,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r3,r8,r3,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r6,r7 @ F_xx_xx + mov r9,r9,ror#31 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r5,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r3,r3,r9 @ E+=X[i] + add r3,r3,r10 @ E+=F_20_39(B,C,D) +#if defined(__thumb2__) + mov r12,sp + teq r14,r12 +#else + teq r14,sp @ preserve carry +#endif + bne L_20_39_or_60_79 @ [+((12+3)*5+2)*4] + bcs L_done @ [+((12+3)*5+2)*4], spare 300 bytes + + ldr r8,LK_40_59 + sub sp,sp,#20*4 @ [+2] +L_40_59: + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r7,r8,r7,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r5,r6 @ F_xx_xx + mov r9,r9,ror#31 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r4,r10,ror#2 @ F_xx_xx + and r11,r5,r6 @ F_xx_xx + add r7,r7,r9 @ E+=X[i] + add r7,r7,r10 @ E+=F_40_59(B,C,D) + add r7,r7,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r6,r8,r6,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r4,r5 @ F_xx_xx + mov r9,r9,ror#31 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r3,r10,ror#2 @ F_xx_xx + and r11,r4,r5 @ F_xx_xx + add r6,r6,r9 @ E+=X[i] + add r6,r6,r10 @ E+=F_40_59(B,C,D) + add r6,r6,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r5,r8,r5,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r3,r4 @ F_xx_xx + mov r9,r9,ror#31 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r7,r10,ror#2 @ F_xx_xx + and r11,r3,r4 @ F_xx_xx + add r5,r5,r9 @ E+=X[i] + add r5,r5,r10 @ E+=F_40_59(B,C,D) + add r5,r5,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r4,r8,r4,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r7,r3 @ F_xx_xx + mov r9,r9,ror#31 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! 
+ and r10,r6,r10,ror#2 @ F_xx_xx + and r11,r7,r3 @ F_xx_xx + add r4,r4,r9 @ E+=X[i] + add r4,r4,r10 @ E+=F_40_59(B,C,D) + add r4,r4,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r3,r8,r3,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r6,r7 @ F_xx_xx + mov r9,r9,ror#31 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r5,r10,ror#2 @ F_xx_xx + and r11,r6,r7 @ F_xx_xx + add r3,r3,r9 @ E+=X[i] + add r3,r3,r10 @ E+=F_40_59(B,C,D) + add r3,r3,r11,ror#2 +#if defined(__thumb2__) + mov r12,sp + teq r14,r12 +#else + teq r14,sp +#endif + bne L_40_59 @ [+((12+5)*5+2)*4] + + ldr r8,LK_60_79 + sub sp,sp,#20*4 + cmp sp,#0 @ set carry to denote 60_79 + b L_20_39_or_60_79 @ [+4], spare 300 bytes +L_done: + add sp,sp,#80*4 @ "deallocate" stack frame + ldmia r0,{r8,r9,r10,r11,r12} + add r3,r8,r3 + add r4,r9,r4 + add r5,r10,r5,ror#2 + add r6,r11,r6,ror#2 + add r7,r12,r7,ror#2 + stmia r0,{r3,r4,r5,r6,r7} + teq r1,r2 + bne Lloop @ [+18], total 1307 + +#if __ARM_ARCH__>=5 + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} +#else + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif + + +.align 5 +LK_00_19:.word 0x5a827999 +LK_20_39:.word 0x6ed9eba1 +LK_40_59:.word 0x8f1bbcdc +LK_60_79:.word 0xca62c1d6 +#if __ARM_MAX_ARCH__>=7 +LOPENSSL_armcap: +.word OPENSSL_armcap_P-Lsha1_block +#endif +.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 5 +#if __ARM_MAX_ARCH__>=7 + + + +#ifdef __thumb2__ +.thumb_func sha1_block_data_order_neon +#endif +.align 4 +sha1_block_data_order_neon: +LNEON: + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 + @ dmb @ errata #451034 on early Cortex A8 + @ vstmdb sp!,{d8-d15} @ ABI specification says so + mov r14,sp + sub r12,sp,#64 + adr r8,LK_00_19 + bic r12,r12,#15 @ align for 128-bit stores + + ldmia r0,{r3,r4,r5,r6,r7} @ load context + mov sp,r12 @ alloca + + vld1.8 {q0,q1},[r1]! @ handles unaligned + veor q15,q15,q15 + vld1.8 {q2,q3},[r1]! + vld1.32 {d28[],d29[]},[r8,:32]! @ load K_00_19 + vrev32.8 q0,q0 @ yes, even on + vrev32.8 q1,q1 @ big-endian... + vrev32.8 q2,q2 + vadd.i32 q8,q0,q14 + vrev32.8 q3,q3 + vadd.i32 q9,q1,q14 + vst1.32 {q8},[r12,:128]! + vadd.i32 q10,q2,q14 + vst1.32 {q9},[r12,:128]! + vst1.32 {q10},[r12,:128]! + ldr r9,[sp] @ big RAW stall + +Loop_neon: + vext.8 q8,q0,q1,#8 + bic r10,r6,r4 + add r7,r7,r9 + and r11,r5,r4 + vadd.i32 q13,q3,q14 + ldr r9,[sp,#4] + add r7,r7,r3,ror#27 + vext.8 q12,q3,q15,#4 + eor r11,r11,r10 + mov r4,r4,ror#2 + add r7,r7,r11 + veor q8,q8,q0 + bic r10,r5,r3 + add r6,r6,r9 + veor q12,q12,q2 + and r11,r4,r3 + ldr r9,[sp,#8] + veor q12,q12,q8 + add r6,r6,r7,ror#27 + eor r11,r11,r10 + vst1.32 {q13},[r12,:128]! 
+ sub r12,r12,#64 + mov r3,r3,ror#2 + add r6,r6,r11 + vext.8 q13,q15,q12,#4 + bic r10,r4,r7 + add r5,r5,r9 + vadd.i32 q8,q12,q12 + and r11,r3,r7 + ldr r9,[sp,#12] + vsri.32 q8,q12,#31 + add r5,r5,r6,ror#27 + eor r11,r11,r10 + mov r7,r7,ror#2 + vshr.u32 q12,q13,#30 + add r5,r5,r11 + bic r10,r3,r6 + vshl.u32 q13,q13,#2 + add r4,r4,r9 + and r11,r7,r6 + veor q8,q8,q12 + ldr r9,[sp,#16] + add r4,r4,r5,ror#27 + veor q8,q8,q13 + eor r11,r11,r10 + mov r6,r6,ror#2 + add r4,r4,r11 + vext.8 q9,q1,q2,#8 + bic r10,r7,r5 + add r3,r3,r9 + and r11,r6,r5 + vadd.i32 q13,q8,q14 + ldr r9,[sp,#20] + vld1.32 {d28[],d29[]},[r8,:32]! + add r3,r3,r4,ror#27 + vext.8 q12,q8,q15,#4 + eor r11,r11,r10 + mov r5,r5,ror#2 + add r3,r3,r11 + veor q9,q9,q1 + bic r10,r6,r4 + add r7,r7,r9 + veor q12,q12,q3 + and r11,r5,r4 + ldr r9,[sp,#24] + veor q12,q12,q9 + add r7,r7,r3,ror#27 + eor r11,r11,r10 + vst1.32 {q13},[r12,:128]! + mov r4,r4,ror#2 + add r7,r7,r11 + vext.8 q13,q15,q12,#4 + bic r10,r5,r3 + add r6,r6,r9 + vadd.i32 q9,q12,q12 + and r11,r4,r3 + ldr r9,[sp,#28] + vsri.32 q9,q12,#31 + add r6,r6,r7,ror#27 + eor r11,r11,r10 + mov r3,r3,ror#2 + vshr.u32 q12,q13,#30 + add r6,r6,r11 + bic r10,r4,r7 + vshl.u32 q13,q13,#2 + add r5,r5,r9 + and r11,r3,r7 + veor q9,q9,q12 + ldr r9,[sp,#32] + add r5,r5,r6,ror#27 + veor q9,q9,q13 + eor r11,r11,r10 + mov r7,r7,ror#2 + add r5,r5,r11 + vext.8 q10,q2,q3,#8 + bic r10,r3,r6 + add r4,r4,r9 + and r11,r7,r6 + vadd.i32 q13,q9,q14 + ldr r9,[sp,#36] + add r4,r4,r5,ror#27 + vext.8 q12,q9,q15,#4 + eor r11,r11,r10 + mov r6,r6,ror#2 + add r4,r4,r11 + veor q10,q10,q2 + bic r10,r7,r5 + add r3,r3,r9 + veor q12,q12,q8 + and r11,r6,r5 + ldr r9,[sp,#40] + veor q12,q12,q10 + add r3,r3,r4,ror#27 + eor r11,r11,r10 + vst1.32 {q13},[r12,:128]! + mov r5,r5,ror#2 + add r3,r3,r11 + vext.8 q13,q15,q12,#4 + bic r10,r6,r4 + add r7,r7,r9 + vadd.i32 q10,q12,q12 + and r11,r5,r4 + ldr r9,[sp,#44] + vsri.32 q10,q12,#31 + add r7,r7,r3,ror#27 + eor r11,r11,r10 + mov r4,r4,ror#2 + vshr.u32 q12,q13,#30 + add r7,r7,r11 + bic r10,r5,r3 + vshl.u32 q13,q13,#2 + add r6,r6,r9 + and r11,r4,r3 + veor q10,q10,q12 + ldr r9,[sp,#48] + add r6,r6,r7,ror#27 + veor q10,q10,q13 + eor r11,r11,r10 + mov r3,r3,ror#2 + add r6,r6,r11 + vext.8 q11,q3,q8,#8 + bic r10,r4,r7 + add r5,r5,r9 + and r11,r3,r7 + vadd.i32 q13,q10,q14 + ldr r9,[sp,#52] + add r5,r5,r6,ror#27 + vext.8 q12,q10,q15,#4 + eor r11,r11,r10 + mov r7,r7,ror#2 + add r5,r5,r11 + veor q11,q11,q3 + bic r10,r3,r6 + add r4,r4,r9 + veor q12,q12,q9 + and r11,r7,r6 + ldr r9,[sp,#56] + veor q12,q12,q11 + add r4,r4,r5,ror#27 + eor r11,r11,r10 + vst1.32 {q13},[r12,:128]! + mov r6,r6,ror#2 + add r4,r4,r11 + vext.8 q13,q15,q12,#4 + bic r10,r7,r5 + add r3,r3,r9 + vadd.i32 q11,q12,q12 + and r11,r6,r5 + ldr r9,[sp,#60] + vsri.32 q11,q12,#31 + add r3,r3,r4,ror#27 + eor r11,r11,r10 + mov r5,r5,ror#2 + vshr.u32 q12,q13,#30 + add r3,r3,r11 + bic r10,r6,r4 + vshl.u32 q13,q13,#2 + add r7,r7,r9 + and r11,r5,r4 + veor q11,q11,q12 + ldr r9,[sp,#0] + add r7,r7,r3,ror#27 + veor q11,q11,q13 + eor r11,r11,r10 + mov r4,r4,ror#2 + add r7,r7,r11 + vext.8 q12,q10,q11,#8 + bic r10,r5,r3 + add r6,r6,r9 + and r11,r4,r3 + veor q0,q0,q8 + ldr r9,[sp,#4] + add r6,r6,r7,ror#27 + veor q0,q0,q1 + eor r11,r11,r10 + mov r3,r3,ror#2 + vadd.i32 q13,q11,q14 + add r6,r6,r11 + bic r10,r4,r7 + veor q12,q12,q0 + add r5,r5,r9 + and r11,r3,r7 + vshr.u32 q0,q12,#30 + ldr r9,[sp,#8] + add r5,r5,r6,ror#27 + vst1.32 {q13},[r12,:128]! 
+ sub r12,r12,#64 + eor r11,r11,r10 + mov r7,r7,ror#2 + vsli.32 q0,q12,#2 + add r5,r5,r11 + bic r10,r3,r6 + add r4,r4,r9 + and r11,r7,r6 + ldr r9,[sp,#12] + add r4,r4,r5,ror#27 + eor r11,r11,r10 + mov r6,r6,ror#2 + add r4,r4,r11 + bic r10,r7,r5 + add r3,r3,r9 + and r11,r6,r5 + ldr r9,[sp,#16] + add r3,r3,r4,ror#27 + eor r11,r11,r10 + mov r5,r5,ror#2 + add r3,r3,r11 + vext.8 q12,q11,q0,#8 + eor r10,r4,r6 + add r7,r7,r9 + ldr r9,[sp,#20] + veor q1,q1,q9 + eor r11,r10,r5 + add r7,r7,r3,ror#27 + veor q1,q1,q2 + mov r4,r4,ror#2 + add r7,r7,r11 + vadd.i32 q13,q0,q14 + eor r10,r3,r5 + add r6,r6,r9 + veor q12,q12,q1 + ldr r9,[sp,#24] + eor r11,r10,r4 + vshr.u32 q1,q12,#30 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + vst1.32 {q13},[r12,:128]! + add r6,r6,r11 + eor r10,r7,r4 + vsli.32 q1,q12,#2 + add r5,r5,r9 + ldr r9,[sp,#28] + eor r11,r10,r3 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + eor r10,r6,r3 + add r4,r4,r9 + ldr r9,[sp,#32] + eor r11,r10,r7 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + vext.8 q12,q0,q1,#8 + eor r10,r5,r7 + add r3,r3,r9 + ldr r9,[sp,#36] + veor q2,q2,q10 + eor r11,r10,r6 + add r3,r3,r4,ror#27 + veor q2,q2,q3 + mov r5,r5,ror#2 + add r3,r3,r11 + vadd.i32 q13,q1,q14 + eor r10,r4,r6 + vld1.32 {d28[],d29[]},[r8,:32]! + add r7,r7,r9 + veor q12,q12,q2 + ldr r9,[sp,#40] + eor r11,r10,r5 + vshr.u32 q2,q12,#30 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + vst1.32 {q13},[r12,:128]! + add r7,r7,r11 + eor r10,r3,r5 + vsli.32 q2,q12,#2 + add r6,r6,r9 + ldr r9,[sp,#44] + eor r11,r10,r4 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + add r6,r6,r11 + eor r10,r7,r4 + add r5,r5,r9 + ldr r9,[sp,#48] + eor r11,r10,r3 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + vext.8 q12,q1,q2,#8 + eor r10,r6,r3 + add r4,r4,r9 + ldr r9,[sp,#52] + veor q3,q3,q11 + eor r11,r10,r7 + add r4,r4,r5,ror#27 + veor q3,q3,q8 + mov r6,r6,ror#2 + add r4,r4,r11 + vadd.i32 q13,q2,q14 + eor r10,r5,r7 + add r3,r3,r9 + veor q12,q12,q3 + ldr r9,[sp,#56] + eor r11,r10,r6 + vshr.u32 q3,q12,#30 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + vst1.32 {q13},[r12,:128]! + add r3,r3,r11 + eor r10,r4,r6 + vsli.32 q3,q12,#2 + add r7,r7,r9 + ldr r9,[sp,#60] + eor r11,r10,r5 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + add r7,r7,r11 + eor r10,r3,r5 + add r6,r6,r9 + ldr r9,[sp,#0] + eor r11,r10,r4 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + add r6,r6,r11 + vext.8 q12,q2,q3,#8 + eor r10,r7,r4 + add r5,r5,r9 + ldr r9,[sp,#4] + veor q8,q8,q0 + eor r11,r10,r3 + add r5,r5,r6,ror#27 + veor q8,q8,q9 + mov r7,r7,ror#2 + add r5,r5,r11 + vadd.i32 q13,q3,q14 + eor r10,r6,r3 + add r4,r4,r9 + veor q12,q12,q8 + ldr r9,[sp,#8] + eor r11,r10,r7 + vshr.u32 q8,q12,#30 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + vst1.32 {q13},[r12,:128]! + sub r12,r12,#64 + add r4,r4,r11 + eor r10,r5,r7 + vsli.32 q8,q12,#2 + add r3,r3,r9 + ldr r9,[sp,#12] + eor r11,r10,r6 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + add r3,r3,r11 + eor r10,r4,r6 + add r7,r7,r9 + ldr r9,[sp,#16] + eor r11,r10,r5 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + add r7,r7,r11 + vext.8 q12,q3,q8,#8 + eor r10,r3,r5 + add r6,r6,r9 + ldr r9,[sp,#20] + veor q9,q9,q1 + eor r11,r10,r4 + add r6,r6,r7,ror#27 + veor q9,q9,q10 + mov r3,r3,ror#2 + add r6,r6,r11 + vadd.i32 q13,q8,q14 + eor r10,r7,r4 + add r5,r5,r9 + veor q12,q12,q9 + ldr r9,[sp,#24] + eor r11,r10,r3 + vshr.u32 q9,q12,#30 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + vst1.32 {q13},[r12,:128]! 
+ add r5,r5,r11 + eor r10,r6,r3 + vsli.32 q9,q12,#2 + add r4,r4,r9 + ldr r9,[sp,#28] + eor r11,r10,r7 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + eor r10,r5,r7 + add r3,r3,r9 + ldr r9,[sp,#32] + eor r11,r10,r6 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + add r3,r3,r11 + vext.8 q12,q8,q9,#8 + add r7,r7,r9 + and r10,r5,r6 + ldr r9,[sp,#36] + veor q10,q10,q2 + add r7,r7,r3,ror#27 + eor r11,r5,r6 + veor q10,q10,q11 + add r7,r7,r10 + and r11,r11,r4 + vadd.i32 q13,q9,q14 + mov r4,r4,ror#2 + add r7,r7,r11 + veor q12,q12,q10 + add r6,r6,r9 + and r10,r4,r5 + vshr.u32 q10,q12,#30 + ldr r9,[sp,#40] + add r6,r6,r7,ror#27 + vst1.32 {q13},[r12,:128]! + eor r11,r4,r5 + add r6,r6,r10 + vsli.32 q10,q12,#2 + and r11,r11,r3 + mov r3,r3,ror#2 + add r6,r6,r11 + add r5,r5,r9 + and r10,r3,r4 + ldr r9,[sp,#44] + add r5,r5,r6,ror#27 + eor r11,r3,r4 + add r5,r5,r10 + and r11,r11,r7 + mov r7,r7,ror#2 + add r5,r5,r11 + add r4,r4,r9 + and r10,r7,r3 + ldr r9,[sp,#48] + add r4,r4,r5,ror#27 + eor r11,r7,r3 + add r4,r4,r10 + and r11,r11,r6 + mov r6,r6,ror#2 + add r4,r4,r11 + vext.8 q12,q9,q10,#8 + add r3,r3,r9 + and r10,r6,r7 + ldr r9,[sp,#52] + veor q11,q11,q3 + add r3,r3,r4,ror#27 + eor r11,r6,r7 + veor q11,q11,q0 + add r3,r3,r10 + and r11,r11,r5 + vadd.i32 q13,q10,q14 + mov r5,r5,ror#2 + vld1.32 {d28[],d29[]},[r8,:32]! + add r3,r3,r11 + veor q12,q12,q11 + add r7,r7,r9 + and r10,r5,r6 + vshr.u32 q11,q12,#30 + ldr r9,[sp,#56] + add r7,r7,r3,ror#27 + vst1.32 {q13},[r12,:128]! + eor r11,r5,r6 + add r7,r7,r10 + vsli.32 q11,q12,#2 + and r11,r11,r4 + mov r4,r4,ror#2 + add r7,r7,r11 + add r6,r6,r9 + and r10,r4,r5 + ldr r9,[sp,#60] + add r6,r6,r7,ror#27 + eor r11,r4,r5 + add r6,r6,r10 + and r11,r11,r3 + mov r3,r3,ror#2 + add r6,r6,r11 + add r5,r5,r9 + and r10,r3,r4 + ldr r9,[sp,#0] + add r5,r5,r6,ror#27 + eor r11,r3,r4 + add r5,r5,r10 + and r11,r11,r7 + mov r7,r7,ror#2 + add r5,r5,r11 + vext.8 q12,q10,q11,#8 + add r4,r4,r9 + and r10,r7,r3 + ldr r9,[sp,#4] + veor q0,q0,q8 + add r4,r4,r5,ror#27 + eor r11,r7,r3 + veor q0,q0,q1 + add r4,r4,r10 + and r11,r11,r6 + vadd.i32 q13,q11,q14 + mov r6,r6,ror#2 + add r4,r4,r11 + veor q12,q12,q0 + add r3,r3,r9 + and r10,r6,r7 + vshr.u32 q0,q12,#30 + ldr r9,[sp,#8] + add r3,r3,r4,ror#27 + vst1.32 {q13},[r12,:128]! + sub r12,r12,#64 + eor r11,r6,r7 + add r3,r3,r10 + vsli.32 q0,q12,#2 + and r11,r11,r5 + mov r5,r5,ror#2 + add r3,r3,r11 + add r7,r7,r9 + and r10,r5,r6 + ldr r9,[sp,#12] + add r7,r7,r3,ror#27 + eor r11,r5,r6 + add r7,r7,r10 + and r11,r11,r4 + mov r4,r4,ror#2 + add r7,r7,r11 + add r6,r6,r9 + and r10,r4,r5 + ldr r9,[sp,#16] + add r6,r6,r7,ror#27 + eor r11,r4,r5 + add r6,r6,r10 + and r11,r11,r3 + mov r3,r3,ror#2 + add r6,r6,r11 + vext.8 q12,q11,q0,#8 + add r5,r5,r9 + and r10,r3,r4 + ldr r9,[sp,#20] + veor q1,q1,q9 + add r5,r5,r6,ror#27 + eor r11,r3,r4 + veor q1,q1,q2 + add r5,r5,r10 + and r11,r11,r7 + vadd.i32 q13,q0,q14 + mov r7,r7,ror#2 + add r5,r5,r11 + veor q12,q12,q1 + add r4,r4,r9 + and r10,r7,r3 + vshr.u32 q1,q12,#30 + ldr r9,[sp,#24] + add r4,r4,r5,ror#27 + vst1.32 {q13},[r12,:128]! 
+ eor r11,r7,r3 + add r4,r4,r10 + vsli.32 q1,q12,#2 + and r11,r11,r6 + mov r6,r6,ror#2 + add r4,r4,r11 + add r3,r3,r9 + and r10,r6,r7 + ldr r9,[sp,#28] + add r3,r3,r4,ror#27 + eor r11,r6,r7 + add r3,r3,r10 + and r11,r11,r5 + mov r5,r5,ror#2 + add r3,r3,r11 + add r7,r7,r9 + and r10,r5,r6 + ldr r9,[sp,#32] + add r7,r7,r3,ror#27 + eor r11,r5,r6 + add r7,r7,r10 + and r11,r11,r4 + mov r4,r4,ror#2 + add r7,r7,r11 + vext.8 q12,q0,q1,#8 + add r6,r6,r9 + and r10,r4,r5 + ldr r9,[sp,#36] + veor q2,q2,q10 + add r6,r6,r7,ror#27 + eor r11,r4,r5 + veor q2,q2,q3 + add r6,r6,r10 + and r11,r11,r3 + vadd.i32 q13,q1,q14 + mov r3,r3,ror#2 + add r6,r6,r11 + veor q12,q12,q2 + add r5,r5,r9 + and r10,r3,r4 + vshr.u32 q2,q12,#30 + ldr r9,[sp,#40] + add r5,r5,r6,ror#27 + vst1.32 {q13},[r12,:128]! + eor r11,r3,r4 + add r5,r5,r10 + vsli.32 q2,q12,#2 + and r11,r11,r7 + mov r7,r7,ror#2 + add r5,r5,r11 + add r4,r4,r9 + and r10,r7,r3 + ldr r9,[sp,#44] + add r4,r4,r5,ror#27 + eor r11,r7,r3 + add r4,r4,r10 + and r11,r11,r6 + mov r6,r6,ror#2 + add r4,r4,r11 + add r3,r3,r9 + and r10,r6,r7 + ldr r9,[sp,#48] + add r3,r3,r4,ror#27 + eor r11,r6,r7 + add r3,r3,r10 + and r11,r11,r5 + mov r5,r5,ror#2 + add r3,r3,r11 + vext.8 q12,q1,q2,#8 + eor r10,r4,r6 + add r7,r7,r9 + ldr r9,[sp,#52] + veor q3,q3,q11 + eor r11,r10,r5 + add r7,r7,r3,ror#27 + veor q3,q3,q8 + mov r4,r4,ror#2 + add r7,r7,r11 + vadd.i32 q13,q2,q14 + eor r10,r3,r5 + add r6,r6,r9 + veor q12,q12,q3 + ldr r9,[sp,#56] + eor r11,r10,r4 + vshr.u32 q3,q12,#30 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + vst1.32 {q13},[r12,:128]! + add r6,r6,r11 + eor r10,r7,r4 + vsli.32 q3,q12,#2 + add r5,r5,r9 + ldr r9,[sp,#60] + eor r11,r10,r3 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + eor r10,r6,r3 + add r4,r4,r9 + ldr r9,[sp,#0] + eor r11,r10,r7 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + vadd.i32 q13,q3,q14 + eor r10,r5,r7 + add r3,r3,r9 + vst1.32 {q13},[r12,:128]! + sub r12,r12,#64 + teq r1,r2 + sub r8,r8,#16 + it eq + subeq r1,r1,#64 + vld1.8 {q0,q1},[r1]! + ldr r9,[sp,#4] + eor r11,r10,r6 + vld1.8 {q2,q3},[r1]! + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + vld1.32 {d28[],d29[]},[r8,:32]! + add r3,r3,r11 + eor r10,r4,r6 + vrev32.8 q0,q0 + add r7,r7,r9 + ldr r9,[sp,#8] + eor r11,r10,r5 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + add r7,r7,r11 + eor r10,r3,r5 + add r6,r6,r9 + ldr r9,[sp,#12] + eor r11,r10,r4 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + add r6,r6,r11 + eor r10,r7,r4 + add r5,r5,r9 + ldr r9,[sp,#16] + eor r11,r10,r3 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + vrev32.8 q1,q1 + eor r10,r6,r3 + add r4,r4,r9 + vadd.i32 q8,q0,q14 + ldr r9,[sp,#20] + eor r11,r10,r7 + vst1.32 {q8},[r12,:128]! + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + eor r10,r5,r7 + add r3,r3,r9 + ldr r9,[sp,#24] + eor r11,r10,r6 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + add r3,r3,r11 + eor r10,r4,r6 + add r7,r7,r9 + ldr r9,[sp,#28] + eor r11,r10,r5 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + add r7,r7,r11 + eor r10,r3,r5 + add r6,r6,r9 + ldr r9,[sp,#32] + eor r11,r10,r4 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + add r6,r6,r11 + vrev32.8 q2,q2 + eor r10,r7,r4 + add r5,r5,r9 + vadd.i32 q9,q1,q14 + ldr r9,[sp,#36] + eor r11,r10,r3 + vst1.32 {q9},[r12,:128]! 
+ add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + eor r10,r6,r3 + add r4,r4,r9 + ldr r9,[sp,#40] + eor r11,r10,r7 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + eor r10,r5,r7 + add r3,r3,r9 + ldr r9,[sp,#44] + eor r11,r10,r6 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + add r3,r3,r11 + eor r10,r4,r6 + add r7,r7,r9 + ldr r9,[sp,#48] + eor r11,r10,r5 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + add r7,r7,r11 + vrev32.8 q3,q3 + eor r10,r3,r5 + add r6,r6,r9 + vadd.i32 q10,q2,q14 + ldr r9,[sp,#52] + eor r11,r10,r4 + vst1.32 {q10},[r12,:128]! + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + add r6,r6,r11 + eor r10,r7,r4 + add r5,r5,r9 + ldr r9,[sp,#56] + eor r11,r10,r3 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + eor r10,r6,r3 + add r4,r4,r9 + ldr r9,[sp,#60] + eor r11,r10,r7 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + eor r10,r5,r7 + add r3,r3,r9 + eor r11,r10,r6 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + add r3,r3,r11 + ldmia r0,{r9,r10,r11,r12} @ accumulate context + add r3,r3,r9 + ldr r9,[r0,#16] + add r4,r4,r10 + add r5,r5,r11 + add r6,r6,r12 + it eq + moveq sp,r14 + add r7,r7,r9 + it ne + ldrne r9,[sp] + stmia r0,{r3,r4,r5,r6,r7} + itt ne + addne r12,sp,#3*16 + bne Loop_neon + + @ vldmia sp!,{d8-d15} + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} + +#endif +#if __ARM_MAX_ARCH__>=7 + +# if defined(__thumb2__) +# define INST(a,b,c,d) .byte c,d|0xf,a,b +# else +# define INST(a,b,c,d) .byte a,b,c,d|0x10 +# endif + +#ifdef __thumb2__ +.thumb_func sha1_block_data_order_armv8 +#endif +.align 5 +sha1_block_data_order_armv8: +LARMv8: + vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so + + veor q1,q1,q1 + adr r3,LK_00_19 + vld1.32 {q0},[r0]! + vld1.32 {d2[0]},[r0] + sub r0,r0,#16 + vld1.32 {d16[],d17[]},[r3,:32]! + vld1.32 {d18[],d19[]},[r3,:32]! + vld1.32 {d20[],d21[]},[r3,:32]! + vld1.32 {d22[],d23[]},[r3,:32] + +Loop_v8: + vld1.8 {q4,q5},[r1]! + vld1.8 {q6,q7},[r1]! 
+ vrev32.8 q4,q4 + vrev32.8 q5,q5 + + vadd.i32 q12,q8,q4 + vrev32.8 q6,q6 + vmov q14,q0 @ offload + subs r2,r2,#1 + + vadd.i32 q13,q8,q5 + vrev32.8 q7,q7 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 0 + INST(0x68,0x0c,0x02,0xe2) @ sha1c q0,q1,q12 + vadd.i32 q12,q8,q6 + INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 1 + INST(0x6a,0x0c,0x06,0xe2) @ sha1c q0,q3,q13 + vadd.i32 q13,q8,q7 + INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 + INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 2 + INST(0x68,0x0c,0x04,0xe2) @ sha1c q0,q2,q12 + vadd.i32 q12,q8,q4 + INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 + INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 3 + INST(0x6a,0x0c,0x06,0xe2) @ sha1c q0,q3,q13 + vadd.i32 q13,q9,q5 + INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 + INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 4 + INST(0x68,0x0c,0x04,0xe2) @ sha1c q0,q2,q12 + vadd.i32 q12,q9,q6 + INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 + INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 5 + INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 + vadd.i32 q13,q9,q7 + INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 + INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 6 + INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 + vadd.i32 q12,q9,q4 + INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 + INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 7 + INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 + vadd.i32 q13,q9,q5 + INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 + INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 8 + INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 + vadd.i32 q12,q10,q6 + INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 + INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 9 + INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 + vadd.i32 q13,q10,q7 + INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 + INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 10 + INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12 + vadd.i32 q12,q10,q4 + INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 + INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 11 + INST(0x6a,0x0c,0x26,0xe2) @ sha1m q0,q3,q13 + vadd.i32 q13,q10,q5 + INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 + INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 12 + INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12 + vadd.i32 q12,q10,q6 + INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 + INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 13 + INST(0x6a,0x0c,0x26,0xe2) @ sha1m q0,q3,q13 + vadd.i32 q13,q11,q7 + INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 + INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 14 + INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12 + vadd.i32 q12,q11,q4 + INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 + INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 15 + INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 + vadd.i32 q13,q11,q5 + INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 + INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 16 + INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 + vadd.i32 q12,q11,q6 + INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 + 
INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 17 + INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 + vadd.i32 q13,q11,q7 + + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 18 + INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 + + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 19 + INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 + + vadd.i32 q1,q1,q2 + vadd.i32 q0,q0,q14 + bne Loop_v8 + + vst1.32 {q0},[r0]! + vst1.32 {d2[0]},[r0] + + vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} + bx lr @ bx lr + +#endif +#if __ARM_MAX_ARCH__>=7 +.comm _OPENSSL_armcap_P,4 +.non_lazy_symbol_pointer +OPENSSL_armcap_P: +.indirect_symbol _OPENSSL_armcap_P +.long 0 +.private_extern _OPENSSL_armcap_P +#endif +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/sha256-armv4.S b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/sha256-armv4.S new file mode 100644 index 0000000000..0cf36482d4 --- /dev/null +++ b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/sha256-armv4.S @@ -0,0 +1,2846 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include <boringssl_prefix_symbols_asm.h> +#endif +@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +@ +@ Licensed under the OpenSSL license (the "License"). You may not use +@ this file except in compliance with the License. You can obtain a copy +@ in the file LICENSE in the source distribution or at +@ https://www.openssl.org/source/license.html + + +@ ==================================================================== +@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +@ project. The module is, however, dual licensed under OpenSSL and +@ CRYPTOGAMS licenses depending on where you obtain it. For further +@ details see http://www.openssl.org/~appro/cryptogams/. +@ +@ Permission to use under GPL terms is granted. +@ ==================================================================== + +@ SHA256 block procedure for ARMv4. May 2007. + +@ Performance is ~2x better than gcc 3.4 generated code and in "abso- +@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per +@ byte [on single-issue Xscale PXA250 core]. + +@ July 2010. +@ +@ Rescheduling for dual-issue pipeline resulted in 22% improvement on +@ Cortex A8 core and ~20 cycles per processed byte. + +@ February 2011. +@ +@ Profiler-assisted and platform-specific optimization resulted in 16% +@ improvement on Cortex A8 core and ~15.4 cycles per processed byte. + +@ September 2013. +@ +@ Add NEON implementation. On Cortex A8 it was measured to process one +@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon +@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only +@ code (meaning that latter performs sub-optimally, nothing was done +@ about it). + +@ May 2014. +@ +@ Add ARMv8 code path performing at 2.0 cpb on Apple A7. + +#ifndef __KERNEL__ +# include <openssl/arm_arch.h> +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 +#endif + +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those +@ instructions are manually-encoded. (See unsha256.)
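The entry point that follows, _sha256_block_data_order, probes OPENSSL_armcap_P at run time and branches to the ARMv8 SHA-extension path (LARMv8), the NEON path (LNEON), or the integer-only ARMv4 rounds. A minimal C sketch of that dispatch is given below as a reading aid; it is not part of the generated file, and the capability-bit values and the three helper functions are placeholders assumed for illustration rather than BoringSSL's actual definitions.

#include <stddef.h>
#include <stdint.h>

/* Capability bits: values are assumed for this sketch; the real ones come from
 * the arm_arch header used by the generated assembly. */
#define ARMV7_NEON   (1u << 0)
#define ARMV8_SHA256 (1u << 4)

uint32_t OPENSSL_armcap_P;  /* filled in once at startup by CPU feature probing */

/* Stubs standing in for the three code paths emitted in this file. */
static void sha256_blocks_armv4(uint32_t s[8], const uint8_t *in, size_t n) { (void)s; (void)in; (void)n; }
static void sha256_blocks_neon(uint32_t s[8], const uint8_t *in, size_t n)  { (void)s; (void)in; (void)n; }
static void sha256_blocks_armv8(uint32_t s[8], const uint8_t *in, size_t n) { (void)s; (void)in; (void)n; }

void sha256_block_data_order(uint32_t state[8], const uint8_t *in, size_t num_blocks) {
  if (OPENSSL_armcap_P & ARMV8_SHA256) {
    sha256_blocks_armv8(state, in, num_blocks);   /* hardware SHA-256 instructions (LARMv8) */
  } else if (OPENSSL_armcap_P & ARMV7_NEON) {
    sha256_blocks_neon(state, in, num_blocks);    /* NEON-assisted message schedule (LNEON) */
  } else {
    sha256_blocks_armv4(state, in, num_blocks);   /* generic integer-only rounds (Loop) */
  }
}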
+ + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + + +.align 5 +K256: +.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.word 0 @ terminator +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +LOPENSSL_armcap: +.word OPENSSL_armcap_P-Lsha256_block_data_order +#endif +.align 5 + +.globl _sha256_block_data_order +.private_extern _sha256_block_data_order +#ifdef __thumb2__ +.thumb_func _sha256_block_data_order +#endif +_sha256_block_data_order: +Lsha256_block_data_order: +#if __ARM_ARCH__<7 && !defined(__thumb2__) + sub r3,pc,#8 @ _sha256_block_data_order +#else + adr r3,Lsha256_block_data_order +#endif +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + ldr r12,LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P +#ifdef __APPLE__ + ldr r12,[r12] +#endif + tst r12,#ARMV8_SHA256 + bne LARMv8 + tst r12,#ARMV7_NEON + bne LNEON +#endif + add r2,r1,r2,lsl#6 @ len to point at the end of inp + stmdb sp!,{r0,r1,r2,r4-r11,lr} + ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11} + sub r14,r3,#256+32 @ K256 + sub sp,sp,#16*4 @ alloca(X[16]) +Loop: +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ magic + eor r12,r12,r12 +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 0 +# if 0==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r8,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 0 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 0==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r8,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#0*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 0==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 0<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#2*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#15*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 1 +# if 1==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r7,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 1 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 1==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r7,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#1*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 1==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 1<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#3*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#0*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 2 +# if 2==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r6,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 2 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 2==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r6,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#2*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 2==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 2<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#4*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#1*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 3 +# if 3==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r5,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 3 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 3==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r5,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#3*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 3==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 3<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#5*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#2*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 4 +# if 4==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r4,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 4 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 4==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r4,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#4*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 4==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 4<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#6*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#3*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 5 +# if 5==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r11,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 5 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 5==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r11,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#5*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 5==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 5<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#7*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#4*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 6 +# if 6==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r10,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 6 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 6==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r10,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#6*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 6==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 6<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#8*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#5*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 7 +# if 7==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r9,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 7 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 7==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r9,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#7*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 7==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 7<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#9*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#6*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 8 +# if 8==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r8,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 8 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 8==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r8,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#8*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 8==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 8<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#10*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#7*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 9 +# if 9==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r7,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 9 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 9==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r7,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#9*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 9==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 9<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#11*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#8*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 10 +# if 10==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r6,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 10 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 10==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r6,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#10*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 10==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 10<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#12*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#9*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 11 +# if 11==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r5,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 11 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 11==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r5,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#11*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 11==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 11<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#13*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#10*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 12 +# if 12==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r4,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 12 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 12==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r4,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#12*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 12==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 12<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#14*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#11*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 13 +# if 13==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r11,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 13 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 13==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r11,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#13*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 13==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 13<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#15*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#12*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 14 +# if 14==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r10,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 14 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 14==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r10,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#14*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 14==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 14<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#0*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#13*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 15 +# if 15==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r9,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 15 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 15==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r9,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#15*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 15==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 15<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#1*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#14*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) +Lrounds_16_xx: + @ ldr r2,[sp,#1*4] @ 16 + @ ldr r1,[sp,#14*4] + mov r0,r2,ror#7 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#0*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#9*4] + + add r12,r12,r0 + eor r0,r8,r8,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r8,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#0*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 16==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 16<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#2*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#15*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#2*4] @ 17 + @ ldr r1,[sp,#15*4] + mov r0,r2,ror#7 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#1*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#10*4] + + add r3,r3,r0 + eor r0,r7,r7,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r7,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#1*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 17==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 17<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#3*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#0*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#3*4] @ 18 + @ ldr r1,[sp,#0*4] + mov r0,r2,ror#7 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#2*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#11*4] + + add r12,r12,r0 + eor r0,r6,r6,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r6,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#2*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 18==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 18<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#4*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#1*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#4*4] @ 19 + @ ldr r1,[sp,#1*4] + mov r0,r2,ror#7 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#3*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#12*4] + + add r3,r3,r0 + eor r0,r5,r5,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r5,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#3*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 19==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 19<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#5*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#2*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#5*4] @ 20 + @ ldr r1,[sp,#2*4] + mov r0,r2,ror#7 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#4*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#13*4] + + add r12,r12,r0 + eor r0,r4,r4,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r4,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#4*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 20==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 20<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#6*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#3*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#6*4] @ 21 + @ ldr r1,[sp,#3*4] + mov r0,r2,ror#7 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#5*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#14*4] + + add r3,r3,r0 + eor r0,r11,r11,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r11,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#5*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 21==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 21<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#7*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#4*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#7*4] @ 22 + @ ldr r1,[sp,#4*4] + mov r0,r2,ror#7 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#6*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#15*4] + + add r12,r12,r0 + eor r0,r10,r10,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r10,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#6*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 22==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 22<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#8*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#5*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#8*4] @ 23 + @ ldr r1,[sp,#5*4] + mov r0,r2,ror#7 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#7*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#0*4] + + add r3,r3,r0 + eor r0,r9,r9,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r9,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#7*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 23==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 23<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#9*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#6*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#9*4] @ 24 + @ ldr r1,[sp,#6*4] + mov r0,r2,ror#7 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#8*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#1*4] + + add r12,r12,r0 + eor r0,r8,r8,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r8,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#8*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 24==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 24<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#10*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#7*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#10*4] @ 25 + @ ldr r1,[sp,#7*4] + mov r0,r2,ror#7 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#9*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#2*4] + + add r3,r3,r0 + eor r0,r7,r7,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r7,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#9*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 25==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 25<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#11*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#8*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#11*4] @ 26 + @ ldr r1,[sp,#8*4] + mov r0,r2,ror#7 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#10*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#3*4] + + add r12,r12,r0 + eor r0,r6,r6,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r6,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#10*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 26==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 26<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#12*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#9*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#12*4] @ 27 + @ ldr r1,[sp,#9*4] + mov r0,r2,ror#7 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#11*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#4*4] + + add r3,r3,r0 + eor r0,r5,r5,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r5,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#11*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 27==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 27<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#13*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#10*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#13*4] @ 28 + @ ldr r1,[sp,#10*4] + mov r0,r2,ror#7 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#12*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#5*4] + + add r12,r12,r0 + eor r0,r4,r4,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r4,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#12*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 28==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 28<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#14*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#11*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#14*4] @ 29 + @ ldr r1,[sp,#11*4] + mov r0,r2,ror#7 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#13*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#6*4] + + add r3,r3,r0 + eor r0,r11,r11,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r11,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#13*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 29==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 29<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#15*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#12*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#15*4] @ 30 + @ ldr r1,[sp,#12*4] + mov r0,r2,ror#7 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#14*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#7*4] + + add r12,r12,r0 + eor r0,r10,r10,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r10,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#14*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 30==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 30<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#0*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#13*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#0*4] @ 31 + @ ldr r1,[sp,#13*4] + mov r0,r2,ror#7 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#15*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#8*4] + + add r3,r3,r0 + eor r0,r9,r9,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r9,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#15*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 31==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 31<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#1*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#14*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + ite eq @ Thumb2 thing, sanity check in ARM +#endif + ldreq r3,[sp,#16*4] @ pull ctx + bne Lrounds_16_xx + + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldr r0,[r3,#0] + ldr r2,[r3,#4] + ldr r12,[r3,#8] + add r4,r4,r0 + ldr r0,[r3,#12] + add r5,r5,r2 + ldr r2,[r3,#16] + add r6,r6,r12 + ldr r12,[r3,#20] + add r7,r7,r0 + ldr r0,[r3,#24] + add r8,r8,r2 + ldr r2,[r3,#28] + add r9,r9,r12 + ldr r1,[sp,#17*4] @ pull inp + ldr r12,[sp,#18*4] @ pull inp+len + add r10,r10,r0 + add r11,r11,r2 + stmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} + cmp r1,r12 + sub r14,r14,#256 @ rewind Ktbl + bne Loop + + add sp,sp,#19*4 @ destroy frame +#if __ARM_ARCH__>=5 + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} +#else + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif + +#if __ARM_MAX_ARCH__>=7 + + + +.globl _sha256_block_data_order_neon +.private_extern _sha256_block_data_order_neon +#ifdef __thumb2__ +.thumb_func _sha256_block_data_order_neon +#endif +.align 5 +.skip 16 +_sha256_block_data_order_neon: +LNEON: + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + + sub r11,sp,#16*4+16 + adr r14,K256 + bic r11,r11,#15 @ align for 128-bit stores + mov r12,sp + mov sp,r11 @ alloca + add r2,r1,r2,lsl#6 @ len to point at the end of inp + + vld1.8 {q0},[r1]! + vld1.8 {q1},[r1]! + vld1.8 {q2},[r1]! + vld1.8 {q3},[r1]! + vld1.32 {q8},[r14,:128]! + vld1.32 {q9},[r14,:128]! + vld1.32 {q10},[r14,:128]! + vld1.32 {q11},[r14,:128]! + vrev32.8 q0,q0 @ yes, even on + str r0,[sp,#64] + vrev32.8 q1,q1 @ big-endian + str r1,[sp,#68] + mov r1,sp + vrev32.8 q2,q2 + str r2,[sp,#72] + vrev32.8 q3,q3 + str r12,[sp,#76] @ save original sp + vadd.i32 q8,q8,q0 + vadd.i32 q9,q9,q1 + vst1.32 {q8},[r1,:128]! + vadd.i32 q10,q10,q2 + vst1.32 {q9},[r1,:128]! 
+ vadd.i32 q11,q11,q3 + vst1.32 {q10},[r1,:128]! + vst1.32 {q11},[r1,:128]! + + ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11} + sub r1,r1,#64 + ldr r2,[sp,#0] + eor r12,r12,r12 + eor r3,r5,r6 + b L_00_48 + +.align 4 +L_00_48: + vext.8 q8,q0,q1,#4 + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + vext.8 q9,q2,q3,#4 + add r4,r4,r12 + and r2,r2,r8 + eor r12,r0,r8,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vadd.i32 q0,q0,q9 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + vshr.u32 q9,q8,#3 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#4] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + veor q9,q9,q10 + add r10,r10,r2 + vsli.32 q11,q8,#14 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + vshr.u32 d24,d7,#17 + add r11,r11,r3 + and r2,r2,r7 + veor q9,q9,q11 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + vsli.32 d24,d7,#15 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + vshr.u32 d25,d7,#10 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + vadd.i32 q0,q0,q9 + add r10,r10,r2 + ldr r2,[sp,#8] + veor d25,d25,d24 + and r12,r12,r3 + add r6,r6,r10 + vshr.u32 d24,d7,#19 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + vsli.32 d24,d7,#13 + add r9,r9,r2 + eor r2,r7,r8 + veor d25,d25,d24 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + vadd.i32 d0,d0,d25 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + vshr.u32 d24,d0,#17 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + vsli.32 d24,d0,#15 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + vshr.u32 d25,d0,#10 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + veor d25,d25,d24 + ldr r2,[sp,#12] + and r3,r3,r12 + vshr.u32 d24,d0,#19 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + vld1.32 {q8},[r14,:128]! + add r8,r8,r2 + vsli.32 d24,d0,#13 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + veor d25,d25,d24 + add r9,r9,r3 + and r2,r2,r5 + vadd.i32 d1,d1,d25 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + vadd.i32 q8,q8,q0 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#16] + and r12,r12,r3 + add r4,r4,r8 + vst1.32 {q8},[r1,:128]! + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vext.8 q8,q1,q2,#4 + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + vext.8 q9,q3,q0,#4 + add r8,r8,r12 + and r2,r2,r4 + eor r12,r0,r4,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vadd.i32 q1,q1,q9 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + vshr.u32 q9,q8,#3 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#20] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + veor q9,q9,q10 + add r6,r6,r2 + vsli.32 q11,q8,#14 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + vshr.u32 d24,d1,#17 + add r7,r7,r3 + and r2,r2,r11 + veor q9,q9,q11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + vsli.32 d24,d1,#15 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + vshr.u32 d25,d1,#10 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + vadd.i32 q1,q1,q9 + add r6,r6,r2 + ldr r2,[sp,#24] + veor d25,d25,d24 + and r12,r12,r3 + add r10,r10,r6 + vshr.u32 d24,d1,#19 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + vsli.32 d24,d1,#13 + add r5,r5,r2 + eor r2,r11,r4 + veor d25,d25,d24 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + vadd.i32 d2,d2,d25 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + vshr.u32 d24,d2,#17 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + vsli.32 d24,d2,#15 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + vshr.u32 d25,d2,#10 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + veor d25,d25,d24 + ldr r2,[sp,#28] + and r3,r3,r12 + vshr.u32 d24,d2,#19 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + vld1.32 {q8},[r14,:128]! 
+ add r4,r4,r2 + vsli.32 d24,d2,#13 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + veor d25,d25,d24 + add r5,r5,r3 + and r2,r2,r9 + vadd.i32 d3,d3,d25 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + vadd.i32 q8,q8,q1 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[sp,#32] + and r12,r12,r3 + add r8,r8,r4 + vst1.32 {q8},[r1,:128]! + add r4,r4,r0,ror#2 + eor r12,r12,r6 + vext.8 q8,q2,q3,#4 + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + vext.8 q9,q0,q1,#4 + add r4,r4,r12 + and r2,r2,r8 + eor r12,r0,r8,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vadd.i32 q2,q2,q9 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + vshr.u32 q9,q8,#3 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#36] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + veor q9,q9,q10 + add r10,r10,r2 + vsli.32 q11,q8,#14 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + vshr.u32 d24,d3,#17 + add r11,r11,r3 + and r2,r2,r7 + veor q9,q9,q11 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + vsli.32 d24,d3,#15 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + vshr.u32 d25,d3,#10 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + vadd.i32 q2,q2,q9 + add r10,r10,r2 + ldr r2,[sp,#40] + veor d25,d25,d24 + and r12,r12,r3 + add r6,r6,r10 + vshr.u32 d24,d3,#19 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + vsli.32 d24,d3,#13 + add r9,r9,r2 + eor r2,r7,r8 + veor d25,d25,d24 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + vadd.i32 d4,d4,d25 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + vshr.u32 d24,d4,#17 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + vsli.32 d24,d4,#15 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + vshr.u32 d25,d4,#10 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + veor d25,d25,d24 + ldr r2,[sp,#44] + and r3,r3,r12 + vshr.u32 d24,d4,#19 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + vld1.32 {q8},[r14,:128]! + add r8,r8,r2 + vsli.32 d24,d4,#13 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + veor d25,d25,d24 + add r9,r9,r3 + and r2,r2,r5 + vadd.i32 d5,d5,d25 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + vadd.i32 q8,q8,q2 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#48] + and r12,r12,r3 + add r4,r4,r8 + vst1.32 {q8},[r1,:128]! 
+ add r8,r8,r0,ror#2 + eor r12,r12,r10 + vext.8 q8,q3,q0,#4 + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + vext.8 q9,q1,q2,#4 + add r8,r8,r12 + and r2,r2,r4 + eor r12,r0,r4,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vadd.i32 q3,q3,q9 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + vshr.u32 q9,q8,#3 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#52] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + veor q9,q9,q10 + add r6,r6,r2 + vsli.32 q11,q8,#14 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + vshr.u32 d24,d5,#17 + add r7,r7,r3 + and r2,r2,r11 + veor q9,q9,q11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + vsli.32 d24,d5,#15 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + vshr.u32 d25,d5,#10 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + vadd.i32 q3,q3,q9 + add r6,r6,r2 + ldr r2,[sp,#56] + veor d25,d25,d24 + and r12,r12,r3 + add r10,r10,r6 + vshr.u32 d24,d5,#19 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + vsli.32 d24,d5,#13 + add r5,r5,r2 + eor r2,r11,r4 + veor d25,d25,d24 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + vadd.i32 d6,d6,d25 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + vshr.u32 d24,d6,#17 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + vsli.32 d24,d6,#15 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + vshr.u32 d25,d6,#10 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + veor d25,d25,d24 + ldr r2,[sp,#60] + and r3,r3,r12 + vshr.u32 d24,d6,#19 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + vld1.32 {q8},[r14,:128]! + add r4,r4,r2 + vsli.32 d24,d6,#13 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + veor d25,d25,d24 + add r5,r5,r3 + and r2,r2,r9 + vadd.i32 d7,d7,d25 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + vadd.i32 q8,q8,q3 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[r14] + and r12,r12,r3 + add r8,r8,r4 + vst1.32 {q8},[r1,:128]! + add r4,r4,r0,ror#2 + eor r12,r12,r6 + teq r2,#0 @ check for K256 terminator + ldr r2,[sp,#0] + sub r1,r1,#64 + bne L_00_48 + + ldr r1,[sp,#68] + ldr r0,[sp,#72] + sub r14,r14,#256 @ rewind r14 + teq r1,r0 + it eq + subeq r1,r1,#64 @ avoid SEGV + vld1.8 {q0},[r1]! @ load next input block + vld1.8 {q1},[r1]! + vld1.8 {q2},[r1]! + vld1.8 {q3},[r1]! + it ne + strne r1,[sp,#68] + mov r1,sp + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + add r4,r4,r12 + vld1.32 {q8},[r14,:128]! 
+ and r2,r2,r8 + eor r12,r0,r8,ror#19 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vrev32.8 q0,q0 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vadd.i32 q8,q8,q0 + ldr r2,[sp,#4] + and r3,r3,r12 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + add r10,r10,r2 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + add r11,r11,r3 + and r2,r2,r7 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + add r10,r10,r2 + ldr r2,[sp,#8] + and r12,r12,r3 + add r6,r6,r10 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + add r9,r9,r2 + eor r2,r7,r8 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + ldr r2,[sp,#12] + and r3,r3,r12 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + add r8,r8,r2 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + add r9,r9,r3 + and r2,r2,r5 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#16] + and r12,r12,r3 + add r4,r4,r8 + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vst1.32 {q8},[r1,:128]! + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + add r8,r8,r12 + vld1.32 {q8},[r14,:128]! + and r2,r2,r4 + eor r12,r0,r4,ror#19 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vrev32.8 q1,q1 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vadd.i32 q8,q8,q1 + ldr r2,[sp,#20] + and r3,r3,r12 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + add r6,r6,r2 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + add r7,r7,r3 + and r2,r2,r11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + add r6,r6,r2 + ldr r2,[sp,#24] + and r12,r12,r3 + add r10,r10,r6 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + add r5,r5,r2 + eor r2,r11,r4 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + ldr r2,[sp,#28] + and r3,r3,r12 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + add r4,r4,r2 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + add r5,r5,r3 + and r2,r2,r9 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[sp,#32] + and r12,r12,r3 + add r8,r8,r4 + add r4,r4,r0,ror#2 + eor r12,r12,r6 + vst1.32 {q8},[r1,:128]! + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + add r4,r4,r12 + vld1.32 {q8},[r14,:128]! 
+ and r2,r2,r8 + eor r12,r0,r8,ror#19 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vrev32.8 q2,q2 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vadd.i32 q8,q8,q2 + ldr r2,[sp,#36] + and r3,r3,r12 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + add r10,r10,r2 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + add r11,r11,r3 + and r2,r2,r7 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + add r10,r10,r2 + ldr r2,[sp,#40] + and r12,r12,r3 + add r6,r6,r10 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + add r9,r9,r2 + eor r2,r7,r8 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + ldr r2,[sp,#44] + and r3,r3,r12 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + add r8,r8,r2 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + add r9,r9,r3 + and r2,r2,r5 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#48] + and r12,r12,r3 + add r4,r4,r8 + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vst1.32 {q8},[r1,:128]! + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + add r8,r8,r12 + vld1.32 {q8},[r14,:128]! + and r2,r2,r4 + eor r12,r0,r4,ror#19 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vrev32.8 q3,q3 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vadd.i32 q8,q8,q3 + ldr r2,[sp,#52] + and r3,r3,r12 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + add r6,r6,r2 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + add r7,r7,r3 + and r2,r2,r11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + add r6,r6,r2 + ldr r2,[sp,#56] + and r12,r12,r3 + add r10,r10,r6 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + add r5,r5,r2 + eor r2,r11,r4 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + ldr r2,[sp,#60] + and r3,r3,r12 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + add r4,r4,r2 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + add r5,r5,r3 + and r2,r2,r9 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[sp,#64] + and r12,r12,r3 + add r8,r8,r4 + add r4,r4,r0,ror#2 + eor r12,r12,r6 + vst1.32 {q8},[r1,:128]! 
+ ldr r0,[r2,#0] + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldr r12,[r2,#4] + ldr r3,[r2,#8] + ldr r1,[r2,#12] + add r4,r4,r0 @ accumulate + ldr r0,[r2,#16] + add r5,r5,r12 + ldr r12,[r2,#20] + add r6,r6,r3 + ldr r3,[r2,#24] + add r7,r7,r1 + ldr r1,[r2,#28] + add r8,r8,r0 + str r4,[r2],#4 + add r9,r9,r12 + str r5,[r2],#4 + add r10,r10,r3 + str r6,[r2],#4 + add r11,r11,r1 + str r7,[r2],#4 + stmia r2,{r8,r9,r10,r11} + + ittte ne + movne r1,sp + ldrne r2,[sp,#0] + eorne r12,r12,r12 + ldreq sp,[sp,#76] @ restore original sp + itt ne + eorne r3,r5,r6 + bne L_00_48 + + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} + +#endif +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + +# if defined(__thumb2__) +# define INST(a,b,c,d) .byte c,d|0xc,a,b +# else +# define INST(a,b,c,d) .byte a,b,c,d +# endif + +#ifdef __thumb2__ +.thumb_func sha256_block_data_order_armv8 +#endif +.align 5 +sha256_block_data_order_armv8: +LARMv8: + vld1.32 {q0,q1},[r0] + sub r3,r3,#256+32 + add r2,r1,r2,lsl#6 @ len to point at the end of inp + b Loop_v8 + +.align 4 +Loop_v8: + vld1.8 {q8,q9},[r1]! + vld1.8 {q10,q11},[r1]! + vld1.32 {q12},[r3]! + vrev32.8 q8,q8 + vrev32.8 q9,q9 + vrev32.8 q10,q10 + vrev32.8 q11,q11 + vmov q14,q0 @ offload + vmov q15,q1 + teq r1,r2 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q8 + INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q9 + INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q10 + INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q11 + INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q8 + INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q9 + INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q10 + INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q11 + INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q8 + INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 + vld1.32 {q12},[r3]! 
+ vadd.i32 q13,q13,q9 + INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q10 + INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q11 + INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q8 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q9 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + + vld1.32 {q13},[r3] + vadd.i32 q12,q12,q10 + sub r3,r3,#256-16 @ rewind + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + + vadd.i32 q13,q13,q11 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + + vadd.i32 q0,q0,q14 + vadd.i32 q1,q1,q15 + it ne + bne Loop_v8 + + vst1.32 {q0,q1},[r0] + + bx lr @ bx lr + +#endif +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.comm _OPENSSL_armcap_P,4 +.non_lazy_symbol_pointer +OPENSSL_armcap_P: +.indirect_symbol _OPENSSL_armcap_P +.long 0 +.private_extern _OPENSSL_armcap_P +#endif +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/sha512-armv4.S b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/sha512-armv4.S new file mode 100644 index 0000000000..21913cb2ba --- /dev/null +++ b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/sha512-armv4.S @@ -0,0 +1,1899 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +@ +@ Licensed under the OpenSSL license (the "License"). You may not use +@ this file except in compliance with the License. You can obtain a copy +@ in the file LICENSE in the source distribution or at +@ https://www.openssl.org/source/license.html + + +@ ==================================================================== +@ Written by Andy Polyakov for the OpenSSL +@ project. The module is, however, dual licensed under OpenSSL and +@ CRYPTOGAMS licenses depending on where you obtain it. For further +@ details see http://www.openssl.org/~appro/cryptogams/. +@ +@ Permission to use under GPL terms is granted. +@ ==================================================================== + +@ SHA512 block procedure for ARMv4. September 2007. 
+
+@ This code is ~4.5 (four and a half) times faster than code generated
+@ by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
+@ Xscale PXA250 core].
+@
+@ July 2010.
+@
+@ Rescheduling for dual-issue pipeline resulted in 6% improvement on
+@ Cortex A8 core and ~40 cycles per processed byte.
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in 7%
+@ improvement on Cortex A8 core and ~38 cycles per byte.
+
+@ March 2011.
+@
+@ Add NEON implementation. On Cortex A8 it was measured to process
+@ one byte in 23.3 cycles or ~60% faster than integer-only code.
+
+@ August 2012.
+@
+@ Improve NEON performance by 12% on Snapdragon S4. In absolute
+@ terms it's 22.6 cycles per byte, which is a disappointing result.
+@ Technical writers asserted that the 3-way S4 pipeline can sustain
+@ multiple NEON instructions per cycle, but dual NEON issue could
+@ not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
+@ for further details. On a side note, Cortex-A15 processes one byte in
+@ 16 cycles.
+
+@ Byte order [in]dependence. =========================================
+@
+@ Originally the caller was expected to maintain a specific *dword* order
+@ in h[0-7], namely with the most significant dword at the *lower* address,
+@ which was reflected in the two parameters below as 0 and 4. Now the
+@ caller is expected to maintain native byte order for whole 64-bit values.
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
+# define VFP_ABI_POP vldmia sp!,{d8-d15}
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+#endif
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
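For reference, the LO/HI offsets and the WORD64() macro defined just below only
determine which 32-bit half of each 64-bit quantity sits at the lower address
(low word first on a little-endian build, high word first on a big-endian one).
The following minimal, standalone C sketch illustrates that layout; the value
and the names are illustrative only and are not taken from this patch.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    uint64_t h = 0x0123456789abcdefULL;  /* one 64-bit SHA-512 state word */
    uint32_t w[2];
    memcpy(w, &h, sizeof(h));            /* view it as two 32-bit words */
    /* Little-endian build (LO=0, HI=4): w[0]==0x89abcdef, w[1]==0x01234567.
     * Big-endian build (HI=0, LO=4): the halves land in the opposite order,
     * which is also why WORD64() emits the two halves of each K512 constant
     * in a different order for the two cases. */
    printf("offset 0: %08x, offset 4: %08x\n", (unsigned)w[0], (unsigned)w[1]);
    return 0;
}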
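Similarly, the scalar code in this file evaluates the standard SHA-512 round
functions with paired 32-bit registers, and the rotation amounts are quoted in
the inline comments further down (Sigma1: 14, 18, 41; Sigma0: 28, 34, 39;
sigma0: 1, 8 and >>7; sigma1: 19, 61 and >>6). A compact C reference for those
identities, written in the textbook form, is sketched below; the assembly
reaches the same results through equivalent register tricks, and its
"h+=Maj from the past" comments refer to deferring the Maj addition by one
round for better scheduling.

#include <stdint.h>

static inline uint64_t rotr64(uint64_t x, unsigned n) {
    return (x >> n) | (x << (64 - n));   /* n is always in 1..63 here */
}

static inline uint64_t Sigma0(uint64_t x) { return rotr64(x, 28) ^ rotr64(x, 34) ^ rotr64(x, 39); }
static inline uint64_t Sigma1(uint64_t x) { return rotr64(x, 14) ^ rotr64(x, 18) ^ rotr64(x, 41); }
static inline uint64_t sigma0(uint64_t x) { return rotr64(x, 1)  ^ rotr64(x, 8)  ^ (x >> 7); }
static inline uint64_t sigma1(uint64_t x) { return rotr64(x, 19) ^ rotr64(x, 61) ^ (x >> 6); }
static inline uint64_t Ch(uint64_t e, uint64_t f, uint64_t g)  { return (e & f) ^ (~e & g); }
static inline uint64_t Maj(uint64_t a, uint64_t b, uint64_t c) { return (a & b) ^ (a & c) ^ (b & c); }

Each round then adds h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i] into d and forms
the new h from Sigma0(a) + Maj(a,b,c), which is essentially the sequence of
adds/adc pairs annotated "T += ..." in the code that follows.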
+ + +#ifdef __ARMEL__ +# define LO 0 +# define HI 4 +# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1 +#else +# define HI 0 +# define LO 4 +# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 +#endif + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +# define adrl adr +#else +.code 32 +#endif + + +.align 5 +K512: + WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd) + WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc) + WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019) + WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118) + WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe) + WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2) + WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1) + WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694) + WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3) + WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65) + WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483) + WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5) + WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210) + WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4) + WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725) + WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70) + WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926) + WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df) + WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8) + WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b) + WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001) + WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30) + WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910) + WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8) + WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) + WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) + WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) + WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) + WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) + WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec) + WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) + WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b) + WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) + WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) + WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) + WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) + WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) + WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) + WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) + WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) + +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +LOPENSSL_armcap: +.word OPENSSL_armcap_P-Lsha512_block_data_order +.skip 32-4 +#else +.skip 32 +#endif + +.globl _sha512_block_data_order +.private_extern _sha512_block_data_order +#ifdef __thumb2__ +.thumb_func _sha512_block_data_order +#endif +_sha512_block_data_order: +Lsha512_block_data_order: +#if __ARM_ARCH__<7 && !defined(__thumb2__) + sub r3,pc,#8 @ _sha512_block_data_order +#else + adr r3,Lsha512_block_data_order +#endif +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + ldr r12,LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P +#ifdef __APPLE__ + ldr r12,[r12] +#endif + tst r12,#ARMV7_NEON + bne LNEON +#endif + add r2,r1,r2,lsl#7 @ len to point at the end of inp + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + sub r14,r3,#672 @ K512 + sub sp,sp,#9*8 + + ldr r7,[r0,#32+LO] + ldr r8,[r0,#32+HI] + ldr r9, [r0,#48+LO] + ldr r10, [r0,#48+HI] + ldr r11, [r0,#56+LO] + ldr r12, [r0,#56+HI] +Loop: + str r9, [sp,#48+0] + str r10, [sp,#48+4] + str 
r11, [sp,#56+0] + str r12, [sp,#56+4] + ldr r5,[r0,#0+LO] + ldr r6,[r0,#0+HI] + ldr r3,[r0,#8+LO] + ldr r4,[r0,#8+HI] + ldr r9, [r0,#16+LO] + ldr r10, [r0,#16+HI] + ldr r11, [r0,#24+LO] + ldr r12, [r0,#24+HI] + str r3,[sp,#8+0] + str r4,[sp,#8+4] + str r9, [sp,#16+0] + str r10, [sp,#16+4] + str r11, [sp,#24+0] + str r12, [sp,#24+4] + ldr r3,[r0,#40+LO] + ldr r4,[r0,#40+HI] + str r3,[sp,#40+0] + str r4,[sp,#40+4] + +L00_15: +#if __ARM_ARCH__<7 + ldrb r3,[r1,#7] + ldrb r9, [r1,#6] + ldrb r10, [r1,#5] + ldrb r11, [r1,#4] + ldrb r4,[r1,#3] + ldrb r12, [r1,#2] + orr r3,r3,r9,lsl#8 + ldrb r9, [r1,#1] + orr r3,r3,r10,lsl#16 + ldrb r10, [r1],#8 + orr r3,r3,r11,lsl#24 + orr r4,r4,r12,lsl#8 + orr r4,r4,r9,lsl#16 + orr r4,r4,r10,lsl#24 +#else + ldr r3,[r1,#4] + ldr r4,[r1],#8 +#ifdef __ARMEL__ + rev r3,r3 + rev r4,r4 +#endif +#endif + @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) + @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 + @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 + mov r9,r7,lsr#14 + str r3,[sp,#64+0] + mov r10,r8,lsr#14 + str r4,[sp,#64+4] + eor r9,r9,r8,lsl#18 + ldr r11,[sp,#56+0] @ h.lo + eor r10,r10,r7,lsl#18 + ldr r12,[sp,#56+4] @ h.hi + eor r9,r9,r7,lsr#18 + eor r10,r10,r8,lsr#18 + eor r9,r9,r8,lsl#14 + eor r10,r10,r7,lsl#14 + eor r9,r9,r8,lsr#9 + eor r10,r10,r7,lsr#9 + eor r9,r9,r7,lsl#23 + eor r10,r10,r8,lsl#23 @ Sigma1(e) + adds r3,r3,r9 + ldr r9,[sp,#40+0] @ f.lo + adc r4,r4,r10 @ T += Sigma1(e) + ldr r10,[sp,#40+4] @ f.hi + adds r3,r3,r11 + ldr r11,[sp,#48+0] @ g.lo + adc r4,r4,r12 @ T += h + ldr r12,[sp,#48+4] @ g.hi + + eor r9,r9,r11 + str r7,[sp,#32+0] + eor r10,r10,r12 + str r8,[sp,#32+4] + and r9,r9,r7 + str r5,[sp,#0+0] + and r10,r10,r8 + str r6,[sp,#0+4] + eor r9,r9,r11 + ldr r11,[r14,#LO] @ K[i].lo + eor r10,r10,r12 @ Ch(e,f,g) + ldr r12,[r14,#HI] @ K[i].hi + + adds r3,r3,r9 + ldr r7,[sp,#24+0] @ d.lo + adc r4,r4,r10 @ T += Ch(e,f,g) + ldr r8,[sp,#24+4] @ d.hi + adds r3,r3,r11 + and r9,r11,#0xff + adc r4,r4,r12 @ T += K[i] + adds r7,r7,r3 + ldr r11,[sp,#8+0] @ b.lo + adc r8,r8,r4 @ d += T + teq r9,#148 + + ldr r12,[sp,#16+0] @ c.lo +#if __ARM_ARCH__>=7 + it eq @ Thumb2 thing, sanity check in ARM +#endif + orreq r14,r14,#1 + @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) + @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 + @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 + mov r9,r5,lsr#28 + mov r10,r6,lsr#28 + eor r9,r9,r6,lsl#4 + eor r10,r10,r5,lsl#4 + eor r9,r9,r6,lsr#2 + eor r10,r10,r5,lsr#2 + eor r9,r9,r5,lsl#30 + eor r10,r10,r6,lsl#30 + eor r9,r9,r6,lsr#7 + eor r10,r10,r5,lsr#7 + eor r9,r9,r5,lsl#25 + eor r10,r10,r6,lsl#25 @ Sigma0(a) + adds r3,r3,r9 + and r9,r5,r11 + adc r4,r4,r10 @ T += Sigma0(a) + + ldr r10,[sp,#8+4] @ b.hi + orr r5,r5,r11 + ldr r11,[sp,#16+4] @ c.hi + and r5,r5,r12 + and r12,r6,r10 + orr r6,r6,r10 + orr r5,r5,r9 @ Maj(a,b,c).lo + and r6,r6,r11 + adds r5,r5,r3 + orr r6,r6,r12 @ Maj(a,b,c).hi + sub sp,sp,#8 + adc r6,r6,r4 @ h += T + tst r14,#1 + add r14,r14,#8 + tst r14,#1 + beq L00_15 + ldr r9,[sp,#184+0] + ldr r10,[sp,#184+4] + bic r14,r14,#1 +L16_79: + @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) + @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 + @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 + mov r3,r9,lsr#1 + ldr r11,[sp,#80+0] + mov r4,r10,lsr#1 + ldr r12,[sp,#80+4] + eor r3,r3,r10,lsl#31 + eor r4,r4,r9,lsl#31 + eor r3,r3,r9,lsr#8 + eor r4,r4,r10,lsr#8 + eor r3,r3,r10,lsl#24 + eor r4,r4,r9,lsl#24 + eor r3,r3,r9,lsr#7 + eor r4,r4,r10,lsr#7 + eor r3,r3,r10,lsl#25 + + @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ 
((x)>>6)) + @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26 + @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6 + mov r9,r11,lsr#19 + mov r10,r12,lsr#19 + eor r9,r9,r12,lsl#13 + eor r10,r10,r11,lsl#13 + eor r9,r9,r12,lsr#29 + eor r10,r10,r11,lsr#29 + eor r9,r9,r11,lsl#3 + eor r10,r10,r12,lsl#3 + eor r9,r9,r11,lsr#6 + eor r10,r10,r12,lsr#6 + ldr r11,[sp,#120+0] + eor r9,r9,r12,lsl#26 + + ldr r12,[sp,#120+4] + adds r3,r3,r9 + ldr r9,[sp,#192+0] + adc r4,r4,r10 + + ldr r10,[sp,#192+4] + adds r3,r3,r11 + adc r4,r4,r12 + adds r3,r3,r9 + adc r4,r4,r10 + @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) + @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 + @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 + mov r9,r7,lsr#14 + str r3,[sp,#64+0] + mov r10,r8,lsr#14 + str r4,[sp,#64+4] + eor r9,r9,r8,lsl#18 + ldr r11,[sp,#56+0] @ h.lo + eor r10,r10,r7,lsl#18 + ldr r12,[sp,#56+4] @ h.hi + eor r9,r9,r7,lsr#18 + eor r10,r10,r8,lsr#18 + eor r9,r9,r8,lsl#14 + eor r10,r10,r7,lsl#14 + eor r9,r9,r8,lsr#9 + eor r10,r10,r7,lsr#9 + eor r9,r9,r7,lsl#23 + eor r10,r10,r8,lsl#23 @ Sigma1(e) + adds r3,r3,r9 + ldr r9,[sp,#40+0] @ f.lo + adc r4,r4,r10 @ T += Sigma1(e) + ldr r10,[sp,#40+4] @ f.hi + adds r3,r3,r11 + ldr r11,[sp,#48+0] @ g.lo + adc r4,r4,r12 @ T += h + ldr r12,[sp,#48+4] @ g.hi + + eor r9,r9,r11 + str r7,[sp,#32+0] + eor r10,r10,r12 + str r8,[sp,#32+4] + and r9,r9,r7 + str r5,[sp,#0+0] + and r10,r10,r8 + str r6,[sp,#0+4] + eor r9,r9,r11 + ldr r11,[r14,#LO] @ K[i].lo + eor r10,r10,r12 @ Ch(e,f,g) + ldr r12,[r14,#HI] @ K[i].hi + + adds r3,r3,r9 + ldr r7,[sp,#24+0] @ d.lo + adc r4,r4,r10 @ T += Ch(e,f,g) + ldr r8,[sp,#24+4] @ d.hi + adds r3,r3,r11 + and r9,r11,#0xff + adc r4,r4,r12 @ T += K[i] + adds r7,r7,r3 + ldr r11,[sp,#8+0] @ b.lo + adc r8,r8,r4 @ d += T + teq r9,#23 + + ldr r12,[sp,#16+0] @ c.lo +#if __ARM_ARCH__>=7 + it eq @ Thumb2 thing, sanity check in ARM +#endif + orreq r14,r14,#1 + @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) + @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 + @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 + mov r9,r5,lsr#28 + mov r10,r6,lsr#28 + eor r9,r9,r6,lsl#4 + eor r10,r10,r5,lsl#4 + eor r9,r9,r6,lsr#2 + eor r10,r10,r5,lsr#2 + eor r9,r9,r5,lsl#30 + eor r10,r10,r6,lsl#30 + eor r9,r9,r6,lsr#7 + eor r10,r10,r5,lsr#7 + eor r9,r9,r5,lsl#25 + eor r10,r10,r6,lsl#25 @ Sigma0(a) + adds r3,r3,r9 + and r9,r5,r11 + adc r4,r4,r10 @ T += Sigma0(a) + + ldr r10,[sp,#8+4] @ b.hi + orr r5,r5,r11 + ldr r11,[sp,#16+4] @ c.hi + and r5,r5,r12 + and r12,r6,r10 + orr r6,r6,r10 + orr r5,r5,r9 @ Maj(a,b,c).lo + and r6,r6,r11 + adds r5,r5,r3 + orr r6,r6,r12 @ Maj(a,b,c).hi + sub sp,sp,#8 + adc r6,r6,r4 @ h += T + tst r14,#1 + add r14,r14,#8 +#if __ARM_ARCH__>=7 + ittt eq @ Thumb2 thing, sanity check in ARM +#endif + ldreq r9,[sp,#184+0] + ldreq r10,[sp,#184+4] + beq L16_79 + bic r14,r14,#1 + + ldr r3,[sp,#8+0] + ldr r4,[sp,#8+4] + ldr r9, [r0,#0+LO] + ldr r10, [r0,#0+HI] + ldr r11, [r0,#8+LO] + ldr r12, [r0,#8+HI] + adds r9,r5,r9 + str r9, [r0,#0+LO] + adc r10,r6,r10 + str r10, [r0,#0+HI] + adds r11,r3,r11 + str r11, [r0,#8+LO] + adc r12,r4,r12 + str r12, [r0,#8+HI] + + ldr r5,[sp,#16+0] + ldr r6,[sp,#16+4] + ldr r3,[sp,#24+0] + ldr r4,[sp,#24+4] + ldr r9, [r0,#16+LO] + ldr r10, [r0,#16+HI] + ldr r11, [r0,#24+LO] + ldr r12, [r0,#24+HI] + adds r9,r5,r9 + str r9, [r0,#16+LO] + adc r10,r6,r10 + str r10, [r0,#16+HI] + adds r11,r3,r11 + str r11, [r0,#24+LO] + adc r12,r4,r12 + str r12, [r0,#24+HI] + + ldr r3,[sp,#40+0] + ldr r4,[sp,#40+4] + ldr r9, [r0,#32+LO] + ldr r10, [r0,#32+HI] + ldr 
r11, [r0,#40+LO] + ldr r12, [r0,#40+HI] + adds r7,r7,r9 + str r7,[r0,#32+LO] + adc r8,r8,r10 + str r8,[r0,#32+HI] + adds r11,r3,r11 + str r11, [r0,#40+LO] + adc r12,r4,r12 + str r12, [r0,#40+HI] + + ldr r5,[sp,#48+0] + ldr r6,[sp,#48+4] + ldr r3,[sp,#56+0] + ldr r4,[sp,#56+4] + ldr r9, [r0,#48+LO] + ldr r10, [r0,#48+HI] + ldr r11, [r0,#56+LO] + ldr r12, [r0,#56+HI] + adds r9,r5,r9 + str r9, [r0,#48+LO] + adc r10,r6,r10 + str r10, [r0,#48+HI] + adds r11,r3,r11 + str r11, [r0,#56+LO] + adc r12,r4,r12 + str r12, [r0,#56+HI] + + add sp,sp,#640 + sub r14,r14,#640 + + teq r1,r2 + bne Loop + + add sp,sp,#8*9 @ destroy frame +#if __ARM_ARCH__>=5 + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} +#else + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif + +#if __ARM_MAX_ARCH__>=7 + + + +.globl _sha512_block_data_order_neon +.private_extern _sha512_block_data_order_neon +#ifdef __thumb2__ +.thumb_func _sha512_block_data_order_neon +#endif +.align 4 +_sha512_block_data_order_neon: +LNEON: + dmb @ errata #451034 on early Cortex A8 + add r2,r1,r2,lsl#7 @ len to point at the end of inp + adr r3,K512 + VFP_ABI_PUSH + vldmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ load context +Loop_neon: + vshr.u64 d24,d20,#14 @ 0 +#if 0<16 + vld1.64 {d0},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d20,#18 +#if 0>0 + vadd.i64 d16,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d20,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vmov d29,d20 + vsli.64 d26,d20,#23 +#if 0<16 && defined(__ARMEL__) + vrev64.8 d0,d0 +#endif + veor d25,d24 + vbsl d29,d21,d22 @ Ch(e,f,g) + vshr.u64 d24,d16,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d23 + vshr.u64 d25,d16,#34 + vsli.64 d24,d16,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d16,#39 + vadd.i64 d28,d0 + vsli.64 d25,d16,#30 + veor d30,d16,d17 + vsli.64 d26,d16,#25 + veor d23,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d18,d17 @ Maj(a,b,c) + veor d23,d26 @ Sigma0(a) + vadd.i64 d19,d27 + vadd.i64 d30,d27 + @ vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 1 +#if 1<16 + vld1.64 {d1},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d19,#18 +#if 1>0 + vadd.i64 d23,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vmov d29,d19 + vsli.64 d26,d19,#23 +#if 1<16 && defined(__ARMEL__) + vrev64.8 d1,d1 +#endif + veor d25,d24 + vbsl d29,d20,d21 @ Ch(e,f,g) + vshr.u64 d24,d23,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d22 + vshr.u64 d25,d23,#34 + vsli.64 d24,d23,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d23,#39 + vadd.i64 d28,d1 + vsli.64 d25,d23,#30 + veor d30,d23,d16 + vsli.64 d26,d23,#25 + veor d22,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d17,d16 @ Maj(a,b,c) + veor d22,d26 @ Sigma0(a) + vadd.i64 d18,d27 + vadd.i64 d30,d27 + @ vadd.i64 d22,d30 + vshr.u64 d24,d18,#14 @ 2 +#if 2<16 + vld1.64 {d2},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d18,#18 +#if 2>0 + vadd.i64 d22,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d18,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vmov d29,d18 + vsli.64 d26,d18,#23 +#if 2<16 && defined(__ARMEL__) + vrev64.8 d2,d2 +#endif + veor d25,d24 + vbsl d29,d19,d20 @ Ch(e,f,g) + vshr.u64 d24,d22,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d21 + vshr.u64 d25,d22,#34 + vsli.64 d24,d22,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d22,#39 + vadd.i64 d28,d2 + vsli.64 d25,d22,#30 + veor d30,d22,d23 + vsli.64 d26,d22,#25 + veor d21,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d16,d23 @ Maj(a,b,c) + veor d21,d26 @ Sigma0(a) + vadd.i64 d17,d27 + vadd.i64 d30,d27 + @ vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 3 +#if 3<16 + vld1.64 {d3},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 +#if 3>0 + vadd.i64 d21,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vmov d29,d17 + vsli.64 d26,d17,#23 +#if 3<16 && defined(__ARMEL__) + vrev64.8 d3,d3 +#endif + veor d25,d24 + vbsl d29,d18,d19 @ Ch(e,f,g) + vshr.u64 d24,d21,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d20 + vshr.u64 d25,d21,#34 + vsli.64 d24,d21,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d21,#39 + vadd.i64 d28,d3 + vsli.64 d25,d21,#30 + veor d30,d21,d22 + vsli.64 d26,d21,#25 + veor d20,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d23,d22 @ Maj(a,b,c) + veor d20,d26 @ Sigma0(a) + vadd.i64 d16,d27 + vadd.i64 d30,d27 + @ vadd.i64 d20,d30 + vshr.u64 d24,d16,#14 @ 4 +#if 4<16 + vld1.64 {d4},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d16,#18 +#if 4>0 + vadd.i64 d20,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d16,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vmov d29,d16 + vsli.64 d26,d16,#23 +#if 4<16 && defined(__ARMEL__) + vrev64.8 d4,d4 +#endif + veor d25,d24 + vbsl d29,d17,d18 @ Ch(e,f,g) + vshr.u64 d24,d20,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d19 + vshr.u64 d25,d20,#34 + vsli.64 d24,d20,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d20,#39 + vadd.i64 d28,d4 + vsli.64 d25,d20,#30 + veor d30,d20,d21 + vsli.64 d26,d20,#25 + veor d19,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d22,d21 @ Maj(a,b,c) + veor d19,d26 @ Sigma0(a) + vadd.i64 d23,d27 + vadd.i64 d30,d27 + @ vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 5 +#if 5<16 + vld1.64 {d5},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 +#if 5>0 + vadd.i64 d19,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vmov d29,d23 + vsli.64 d26,d23,#23 +#if 5<16 && defined(__ARMEL__) + vrev64.8 d5,d5 +#endif + veor d25,d24 + vbsl d29,d16,d17 @ Ch(e,f,g) + vshr.u64 d24,d19,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d18 + vshr.u64 d25,d19,#34 + vsli.64 d24,d19,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d19,#39 + vadd.i64 d28,d5 + vsli.64 d25,d19,#30 + veor d30,d19,d20 + vsli.64 d26,d19,#25 + veor d18,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d21,d20 @ Maj(a,b,c) + veor d18,d26 @ Sigma0(a) + vadd.i64 d22,d27 + vadd.i64 d30,d27 + @ vadd.i64 d18,d30 + vshr.u64 d24,d22,#14 @ 6 +#if 6<16 + vld1.64 {d6},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d22,#18 +#if 6>0 + vadd.i64 d18,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d22,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vmov d29,d22 + vsli.64 d26,d22,#23 +#if 6<16 && defined(__ARMEL__) + vrev64.8 d6,d6 +#endif + veor d25,d24 + vbsl d29,d23,d16 @ Ch(e,f,g) + vshr.u64 d24,d18,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d17 + vshr.u64 d25,d18,#34 + vsli.64 d24,d18,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d18,#39 + vadd.i64 d28,d6 + vsli.64 d25,d18,#30 + veor d30,d18,d19 + vsli.64 d26,d18,#25 + veor d17,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d20,d19 @ Maj(a,b,c) + veor d17,d26 @ Sigma0(a) + vadd.i64 d21,d27 + vadd.i64 d30,d27 + @ vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 7 +#if 7<16 + vld1.64 {d7},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 +#if 7>0 + vadd.i64 d17,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vmov d29,d21 + vsli.64 d26,d21,#23 +#if 7<16 && defined(__ARMEL__) + vrev64.8 d7,d7 +#endif + veor d25,d24 + vbsl d29,d22,d23 @ Ch(e,f,g) + vshr.u64 d24,d17,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d16 + vshr.u64 d25,d17,#34 + vsli.64 d24,d17,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d17,#39 + vadd.i64 d28,d7 + vsli.64 d25,d17,#30 + veor d30,d17,d18 + vsli.64 d26,d17,#25 + veor d16,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d19,d18 @ Maj(a,b,c) + veor d16,d26 @ Sigma0(a) + vadd.i64 d20,d27 + vadd.i64 d30,d27 + @ vadd.i64 d16,d30 + vshr.u64 d24,d20,#14 @ 8 +#if 8<16 + vld1.64 {d8},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d20,#18 +#if 8>0 + vadd.i64 d16,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d20,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vmov d29,d20 + vsli.64 d26,d20,#23 +#if 8<16 && defined(__ARMEL__) + vrev64.8 d8,d8 +#endif + veor d25,d24 + vbsl d29,d21,d22 @ Ch(e,f,g) + vshr.u64 d24,d16,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d23 + vshr.u64 d25,d16,#34 + vsli.64 d24,d16,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d16,#39 + vadd.i64 d28,d8 + vsli.64 d25,d16,#30 + veor d30,d16,d17 + vsli.64 d26,d16,#25 + veor d23,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d18,d17 @ Maj(a,b,c) + veor d23,d26 @ Sigma0(a) + vadd.i64 d19,d27 + vadd.i64 d30,d27 + @ vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 9 +#if 9<16 + vld1.64 {d9},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d19,#18 +#if 9>0 + vadd.i64 d23,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vmov d29,d19 + vsli.64 d26,d19,#23 +#if 9<16 && defined(__ARMEL__) + vrev64.8 d9,d9 +#endif + veor d25,d24 + vbsl d29,d20,d21 @ Ch(e,f,g) + vshr.u64 d24,d23,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d22 + vshr.u64 d25,d23,#34 + vsli.64 d24,d23,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d23,#39 + vadd.i64 d28,d9 + vsli.64 d25,d23,#30 + veor d30,d23,d16 + vsli.64 d26,d23,#25 + veor d22,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d17,d16 @ Maj(a,b,c) + veor d22,d26 @ Sigma0(a) + vadd.i64 d18,d27 + vadd.i64 d30,d27 + @ vadd.i64 d22,d30 + vshr.u64 d24,d18,#14 @ 10 +#if 10<16 + vld1.64 {d10},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d18,#18 +#if 10>0 + vadd.i64 d22,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d18,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vmov d29,d18 + vsli.64 d26,d18,#23 +#if 10<16 && defined(__ARMEL__) + vrev64.8 d10,d10 +#endif + veor d25,d24 + vbsl d29,d19,d20 @ Ch(e,f,g) + vshr.u64 d24,d22,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d21 + vshr.u64 d25,d22,#34 + vsli.64 d24,d22,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d22,#39 + vadd.i64 d28,d10 + vsli.64 d25,d22,#30 + veor d30,d22,d23 + vsli.64 d26,d22,#25 + veor d21,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d16,d23 @ Maj(a,b,c) + veor d21,d26 @ Sigma0(a) + vadd.i64 d17,d27 + vadd.i64 d30,d27 + @ vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 11 +#if 11<16 + vld1.64 {d11},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 +#if 11>0 + vadd.i64 d21,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vmov d29,d17 + vsli.64 d26,d17,#23 +#if 11<16 && defined(__ARMEL__) + vrev64.8 d11,d11 +#endif + veor d25,d24 + vbsl d29,d18,d19 @ Ch(e,f,g) + vshr.u64 d24,d21,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d20 + vshr.u64 d25,d21,#34 + vsli.64 d24,d21,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d21,#39 + vadd.i64 d28,d11 + vsli.64 d25,d21,#30 + veor d30,d21,d22 + vsli.64 d26,d21,#25 + veor d20,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d23,d22 @ Maj(a,b,c) + veor d20,d26 @ Sigma0(a) + vadd.i64 d16,d27 + vadd.i64 d30,d27 + @ vadd.i64 d20,d30 + vshr.u64 d24,d16,#14 @ 12 +#if 12<16 + vld1.64 {d12},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d16,#18 +#if 12>0 + vadd.i64 d20,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d16,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vmov d29,d16 + vsli.64 d26,d16,#23 +#if 12<16 && defined(__ARMEL__) + vrev64.8 d12,d12 +#endif + veor d25,d24 + vbsl d29,d17,d18 @ Ch(e,f,g) + vshr.u64 d24,d20,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d19 + vshr.u64 d25,d20,#34 + vsli.64 d24,d20,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d20,#39 + vadd.i64 d28,d12 + vsli.64 d25,d20,#30 + veor d30,d20,d21 + vsli.64 d26,d20,#25 + veor d19,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d22,d21 @ Maj(a,b,c) + veor d19,d26 @ Sigma0(a) + vadd.i64 d23,d27 + vadd.i64 d30,d27 + @ vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 13 +#if 13<16 + vld1.64 {d13},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 +#if 13>0 + vadd.i64 d19,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vmov d29,d23 + vsli.64 d26,d23,#23 +#if 13<16 && defined(__ARMEL__) + vrev64.8 d13,d13 +#endif + veor d25,d24 + vbsl d29,d16,d17 @ Ch(e,f,g) + vshr.u64 d24,d19,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d18 + vshr.u64 d25,d19,#34 + vsli.64 d24,d19,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d19,#39 + vadd.i64 d28,d13 + vsli.64 d25,d19,#30 + veor d30,d19,d20 + vsli.64 d26,d19,#25 + veor d18,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d21,d20 @ Maj(a,b,c) + veor d18,d26 @ Sigma0(a) + vadd.i64 d22,d27 + vadd.i64 d30,d27 + @ vadd.i64 d18,d30 + vshr.u64 d24,d22,#14 @ 14 +#if 14<16 + vld1.64 {d14},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d22,#18 +#if 14>0 + vadd.i64 d18,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d22,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vmov d29,d22 + vsli.64 d26,d22,#23 +#if 14<16 && defined(__ARMEL__) + vrev64.8 d14,d14 +#endif + veor d25,d24 + vbsl d29,d23,d16 @ Ch(e,f,g) + vshr.u64 d24,d18,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d17 + vshr.u64 d25,d18,#34 + vsli.64 d24,d18,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d18,#39 + vadd.i64 d28,d14 + vsli.64 d25,d18,#30 + veor d30,d18,d19 + vsli.64 d26,d18,#25 + veor d17,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d20,d19 @ Maj(a,b,c) + veor d17,d26 @ Sigma0(a) + vadd.i64 d21,d27 + vadd.i64 d30,d27 + @ vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 15 +#if 15<16 + vld1.64 {d15},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 +#if 15>0 + vadd.i64 d17,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vmov d29,d21 + vsli.64 d26,d21,#23 +#if 15<16 && defined(__ARMEL__) + vrev64.8 d15,d15 +#endif + veor d25,d24 + vbsl d29,d22,d23 @ Ch(e,f,g) + vshr.u64 d24,d17,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d16 + vshr.u64 d25,d17,#34 + vsli.64 d24,d17,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d17,#39 + vadd.i64 d28,d15 + vsli.64 d25,d17,#30 + veor d30,d17,d18 + vsli.64 d26,d17,#25 + veor d16,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d19,d18 @ Maj(a,b,c) + veor d16,d26 @ Sigma0(a) + vadd.i64 d20,d27 + vadd.i64 d30,d27 + @ vadd.i64 d16,d30 + mov r12,#4 +L16_79_neon: + subs r12,#1 + vshr.u64 q12,q7,#19 + vshr.u64 q13,q7,#61 + vadd.i64 d16,d30 @ h+=Maj from the past + vshr.u64 q15,q7,#6 + vsli.64 q12,q7,#45 + vext.8 q14,q0,q1,#8 @ X[i+1] + vsli.64 q13,q7,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q0,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q4,q5,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d20,#14 @ from NEON_00_15 + vadd.i64 q0,q14 + vshr.u64 d25,d20,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d20,#41 @ from NEON_00_15 + vadd.i64 q0,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vmov d29,d20 + vsli.64 d26,d20,#23 +#if 16<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d21,d22 @ Ch(e,f,g) + vshr.u64 d24,d16,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d23 + vshr.u64 d25,d16,#34 + vsli.64 d24,d16,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d16,#39 + vadd.i64 d28,d0 + vsli.64 d25,d16,#30 + veor d30,d16,d17 + vsli.64 d26,d16,#25 + veor d23,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d18,d17 @ Maj(a,b,c) + veor d23,d26 @ Sigma0(a) + vadd.i64 d19,d27 + vadd.i64 d30,d27 + @ vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 17 +#if 17<16 + vld1.64 {d1},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d19,#18 +#if 17>0 + vadd.i64 d23,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vmov d29,d19 + vsli.64 d26,d19,#23 +#if 17<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d20,d21 @ Ch(e,f,g) + vshr.u64 d24,d23,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d22 + vshr.u64 d25,d23,#34 + vsli.64 d24,d23,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d23,#39 + vadd.i64 d28,d1 + vsli.64 d25,d23,#30 + veor d30,d23,d16 + vsli.64 d26,d23,#25 + veor d22,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d17,d16 @ Maj(a,b,c) + veor d22,d26 @ Sigma0(a) + vadd.i64 d18,d27 + vadd.i64 d30,d27 + @ vadd.i64 d22,d30 + vshr.u64 q12,q0,#19 + vshr.u64 q13,q0,#61 + vadd.i64 d22,d30 @ h+=Maj from the past + vshr.u64 q15,q0,#6 + vsli.64 q12,q0,#45 + vext.8 q14,q1,q2,#8 @ X[i+1] + vsli.64 q13,q0,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q1,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q5,q6,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d18,#14 @ from NEON_00_15 + vadd.i64 q1,q14 + vshr.u64 d25,d18,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d18,#41 @ from NEON_00_15 + vadd.i64 q1,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vmov d29,d18 + vsli.64 d26,d18,#23 +#if 18<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d19,d20 @ Ch(e,f,g) + vshr.u64 d24,d22,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d21 + vshr.u64 d25,d22,#34 + vsli.64 d24,d22,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d22,#39 + vadd.i64 d28,d2 + vsli.64 d25,d22,#30 + veor d30,d22,d23 + vsli.64 d26,d22,#25 + veor d21,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d16,d23 @ Maj(a,b,c) + veor d21,d26 @ Sigma0(a) + vadd.i64 d17,d27 + vadd.i64 d30,d27 + @ vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 19 +#if 19<16 + vld1.64 {d3},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 +#if 19>0 + vadd.i64 d21,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vmov d29,d17 + vsli.64 d26,d17,#23 +#if 19<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d18,d19 @ Ch(e,f,g) + vshr.u64 d24,d21,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d20 + vshr.u64 d25,d21,#34 + vsli.64 d24,d21,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d21,#39 + vadd.i64 d28,d3 + vsli.64 d25,d21,#30 + veor d30,d21,d22 + vsli.64 d26,d21,#25 + veor d20,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d23,d22 @ Maj(a,b,c) + veor d20,d26 @ Sigma0(a) + vadd.i64 d16,d27 + vadd.i64 d30,d27 + @ vadd.i64 d20,d30 + vshr.u64 q12,q1,#19 + vshr.u64 q13,q1,#61 + vadd.i64 d20,d30 @ h+=Maj from the past + vshr.u64 q15,q1,#6 + vsli.64 q12,q1,#45 + vext.8 q14,q2,q3,#8 @ X[i+1] + vsli.64 q13,q1,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q2,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q6,q7,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d16,#14 @ from NEON_00_15 + vadd.i64 q2,q14 + vshr.u64 d25,d16,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d16,#41 @ from NEON_00_15 + vadd.i64 q2,q15 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vmov d29,d16 + vsli.64 d26,d16,#23 +#if 20<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d17,d18 @ Ch(e,f,g) + vshr.u64 d24,d20,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d19 + vshr.u64 d25,d20,#34 + vsli.64 d24,d20,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d20,#39 + vadd.i64 d28,d4 + vsli.64 d25,d20,#30 + veor d30,d20,d21 + vsli.64 d26,d20,#25 + veor d19,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d22,d21 @ Maj(a,b,c) + veor d19,d26 @ Sigma0(a) + vadd.i64 d23,d27 + vadd.i64 d30,d27 + @ vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 21 +#if 21<16 + vld1.64 {d5},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 +#if 21>0 + vadd.i64 d19,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vmov d29,d23 + vsli.64 d26,d23,#23 +#if 21<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d16,d17 @ Ch(e,f,g) + vshr.u64 d24,d19,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d18 + vshr.u64 d25,d19,#34 + vsli.64 d24,d19,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d19,#39 + vadd.i64 d28,d5 + vsli.64 d25,d19,#30 + veor d30,d19,d20 + vsli.64 d26,d19,#25 + veor d18,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d21,d20 @ Maj(a,b,c) + veor d18,d26 @ Sigma0(a) + vadd.i64 d22,d27 + vadd.i64 d30,d27 + @ vadd.i64 d18,d30 + vshr.u64 q12,q2,#19 + vshr.u64 q13,q2,#61 + vadd.i64 d18,d30 @ h+=Maj from the past + vshr.u64 q15,q2,#6 + vsli.64 q12,q2,#45 + vext.8 q14,q3,q4,#8 @ X[i+1] + vsli.64 q13,q2,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q3,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q7,q0,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d22,#14 @ from NEON_00_15 + vadd.i64 q3,q14 + vshr.u64 d25,d22,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d22,#41 @ from NEON_00_15 + vadd.i64 q3,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vmov d29,d22 + vsli.64 d26,d22,#23 +#if 22<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d23,d16 @ Ch(e,f,g) + vshr.u64 d24,d18,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d17 + vshr.u64 d25,d18,#34 + vsli.64 d24,d18,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d18,#39 + vadd.i64 d28,d6 + vsli.64 d25,d18,#30 + veor d30,d18,d19 + vsli.64 d26,d18,#25 + veor d17,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d20,d19 @ Maj(a,b,c) + veor d17,d26 @ Sigma0(a) + vadd.i64 d21,d27 + vadd.i64 d30,d27 + @ vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 23 +#if 23<16 + vld1.64 {d7},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 +#if 23>0 + vadd.i64 d17,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vmov d29,d21 + vsli.64 d26,d21,#23 +#if 23<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d22,d23 @ Ch(e,f,g) + vshr.u64 d24,d17,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d16 + vshr.u64 d25,d17,#34 + vsli.64 d24,d17,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d17,#39 + vadd.i64 d28,d7 + vsli.64 d25,d17,#30 + veor d30,d17,d18 + vsli.64 d26,d17,#25 + veor d16,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d19,d18 @ Maj(a,b,c) + veor d16,d26 @ Sigma0(a) + vadd.i64 d20,d27 + vadd.i64 d30,d27 + @ vadd.i64 d16,d30 + vshr.u64 q12,q3,#19 + vshr.u64 q13,q3,#61 + vadd.i64 d16,d30 @ h+=Maj from the past + vshr.u64 q15,q3,#6 + vsli.64 q12,q3,#45 + vext.8 q14,q4,q5,#8 @ X[i+1] + vsli.64 q13,q3,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q4,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q0,q1,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d20,#14 @ from NEON_00_15 + vadd.i64 q4,q14 + vshr.u64 d25,d20,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d20,#41 @ from NEON_00_15 + vadd.i64 q4,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vmov d29,d20 + vsli.64 d26,d20,#23 +#if 24<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d21,d22 @ Ch(e,f,g) + vshr.u64 d24,d16,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d23 + vshr.u64 d25,d16,#34 + vsli.64 d24,d16,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d16,#39 + vadd.i64 d28,d8 + vsli.64 d25,d16,#30 + veor d30,d16,d17 + vsli.64 d26,d16,#25 + veor d23,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d18,d17 @ Maj(a,b,c) + veor d23,d26 @ Sigma0(a) + vadd.i64 d19,d27 + vadd.i64 d30,d27 + @ vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 25 +#if 25<16 + vld1.64 {d9},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d19,#18 +#if 25>0 + vadd.i64 d23,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vmov d29,d19 + vsli.64 d26,d19,#23 +#if 25<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d20,d21 @ Ch(e,f,g) + vshr.u64 d24,d23,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d22 + vshr.u64 d25,d23,#34 + vsli.64 d24,d23,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d23,#39 + vadd.i64 d28,d9 + vsli.64 d25,d23,#30 + veor d30,d23,d16 + vsli.64 d26,d23,#25 + veor d22,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d17,d16 @ Maj(a,b,c) + veor d22,d26 @ Sigma0(a) + vadd.i64 d18,d27 + vadd.i64 d30,d27 + @ vadd.i64 d22,d30 + vshr.u64 q12,q4,#19 + vshr.u64 q13,q4,#61 + vadd.i64 d22,d30 @ h+=Maj from the past + vshr.u64 q15,q4,#6 + vsli.64 q12,q4,#45 + vext.8 q14,q5,q6,#8 @ X[i+1] + vsli.64 q13,q4,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q5,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q1,q2,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d18,#14 @ from NEON_00_15 + vadd.i64 q5,q14 + vshr.u64 d25,d18,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d18,#41 @ from NEON_00_15 + vadd.i64 q5,q15 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vmov d29,d18 + vsli.64 d26,d18,#23 +#if 26<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d19,d20 @ Ch(e,f,g) + vshr.u64 d24,d22,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d21 + vshr.u64 d25,d22,#34 + vsli.64 d24,d22,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d22,#39 + vadd.i64 d28,d10 + vsli.64 d25,d22,#30 + veor d30,d22,d23 + vsli.64 d26,d22,#25 + veor d21,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d16,d23 @ Maj(a,b,c) + veor d21,d26 @ Sigma0(a) + vadd.i64 d17,d27 + vadd.i64 d30,d27 + @ vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 27 +#if 27<16 + vld1.64 {d11},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 +#if 27>0 + vadd.i64 d21,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vmov d29,d17 + vsli.64 d26,d17,#23 +#if 27<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d18,d19 @ Ch(e,f,g) + vshr.u64 d24,d21,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d20 + vshr.u64 d25,d21,#34 + vsli.64 d24,d21,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d21,#39 + vadd.i64 d28,d11 + vsli.64 d25,d21,#30 + veor d30,d21,d22 + vsli.64 d26,d21,#25 + veor d20,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d23,d22 @ Maj(a,b,c) + veor d20,d26 @ Sigma0(a) + vadd.i64 d16,d27 + vadd.i64 d30,d27 + @ vadd.i64 d20,d30 + vshr.u64 q12,q5,#19 + vshr.u64 q13,q5,#61 + vadd.i64 d20,d30 @ h+=Maj from the past + vshr.u64 q15,q5,#6 + vsli.64 q12,q5,#45 + vext.8 q14,q6,q7,#8 @ X[i+1] + vsli.64 q13,q5,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q6,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q2,q3,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d16,#14 @ from NEON_00_15 + vadd.i64 q6,q14 + vshr.u64 d25,d16,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d16,#41 @ from NEON_00_15 + vadd.i64 q6,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vmov d29,d16 + vsli.64 d26,d16,#23 +#if 28<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d17,d18 @ Ch(e,f,g) + vshr.u64 d24,d20,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d19 + vshr.u64 d25,d20,#34 + vsli.64 d24,d20,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d20,#39 + vadd.i64 d28,d12 + vsli.64 d25,d20,#30 + veor d30,d20,d21 + vsli.64 d26,d20,#25 + veor d19,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d22,d21 @ Maj(a,b,c) + veor d19,d26 @ Sigma0(a) + vadd.i64 d23,d27 + vadd.i64 d30,d27 + @ vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 29 +#if 29<16 + vld1.64 {d13},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 +#if 29>0 + vadd.i64 d19,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vmov d29,d23 + vsli.64 d26,d23,#23 +#if 29<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d16,d17 @ Ch(e,f,g) + vshr.u64 d24,d19,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d18 + vshr.u64 d25,d19,#34 + vsli.64 d24,d19,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d19,#39 + vadd.i64 d28,d13 + vsli.64 d25,d19,#30 + veor d30,d19,d20 + vsli.64 d26,d19,#25 + veor d18,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d21,d20 @ Maj(a,b,c) + veor d18,d26 @ Sigma0(a) + vadd.i64 d22,d27 + vadd.i64 d30,d27 + @ vadd.i64 d18,d30 + vshr.u64 q12,q6,#19 + vshr.u64 q13,q6,#61 + vadd.i64 d18,d30 @ h+=Maj from the past + vshr.u64 q15,q6,#6 + vsli.64 q12,q6,#45 + vext.8 q14,q7,q0,#8 @ X[i+1] + vsli.64 q13,q6,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q7,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q3,q4,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d22,#14 @ from NEON_00_15 + vadd.i64 q7,q14 + vshr.u64 d25,d22,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d22,#41 @ from NEON_00_15 + vadd.i64 q7,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vmov d29,d22 + vsli.64 d26,d22,#23 +#if 30<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d23,d16 @ Ch(e,f,g) + vshr.u64 d24,d18,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d17 + vshr.u64 d25,d18,#34 + vsli.64 d24,d18,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d18,#39 + vadd.i64 d28,d14 + vsli.64 d25,d18,#30 + veor d30,d18,d19 + vsli.64 d26,d18,#25 + veor d17,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d20,d19 @ Maj(a,b,c) + veor d17,d26 @ Sigma0(a) + vadd.i64 d21,d27 + vadd.i64 d30,d27 + @ vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 31 +#if 31<16 + vld1.64 {d15},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 +#if 31>0 + vadd.i64 d17,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vmov d29,d21 + vsli.64 d26,d21,#23 +#if 31<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d22,d23 @ Ch(e,f,g) + vshr.u64 d24,d17,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d16 + vshr.u64 d25,d17,#34 + vsli.64 d24,d17,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d17,#39 + vadd.i64 d28,d15 + vsli.64 d25,d17,#30 + veor d30,d17,d18 + vsli.64 d26,d17,#25 + veor d16,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d19,d18 @ Maj(a,b,c) + veor d16,d26 @ Sigma0(a) + vadd.i64 d20,d27 + vadd.i64 d30,d27 + @ vadd.i64 d16,d30 + bne L16_79_neon + + vadd.i64 d16,d30 @ h+=Maj from the past + vldmia r0,{d24,d25,d26,d27,d28,d29,d30,d31} @ load context to temp + vadd.i64 q8,q12 @ vectorized accumulate + vadd.i64 q9,q13 + vadd.i64 q10,q14 + vadd.i64 q11,q15 + vstmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ save context + teq r1,r2 + sub r3,#640 @ rewind K512 + bne Loop_neon + + VFP_ABI_POP + bx lr @ .word 0xe12fff1e + +#endif +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.comm _OPENSSL_armcap_P,4 +.non_lazy_symbol_pointer +OPENSSL_armcap_P: +.indirect_symbol _OPENSSL_armcap_P +.long 0 +.private_extern _OPENSSL_armcap_P +#endif +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/vpaes-armv7.S b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/vpaes-armv7.S new file mode 100644 index 0000000000..6aead7cac2 --- /dev/null +++ b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/vpaes-armv7.S @@ -0,0 +1,1265 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
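+
+// Informal overview (sketch, not authoritative): this file implements Mike
+// Hamburg's "Vector Permutation AES" (see the .byte credit string below).
+// The S-box and MixColumns steps are evaluated with register-resident
+// vtbl.8 lookups keyed by 4-bit nibbles instead of secret-dependent memory
+// loads, which avoids secret-dependent table addresses.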
+ +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.syntax unified + + + + +#if defined(__thumb2__) +.thumb +#else +.code 32 +#endif + +.text + + +.align 7 @ totally strategic alignment +_vpaes_consts: +Lk_mc_forward:@ mc_forward +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 +Lk_mc_backward:@ mc_backward +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F +Lk_sr:@ sr +.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +@ +@ "Hot" constants +@ +Lk_inv:@ inv, inva +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 +Lk_ipt:@ input transform (lo, hi) +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 +Lk_sbo:@ sbou, sbot +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA +Lk_sb1:@ sb1u, sb1t +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +Lk_sb2:@ sb2u, sb2t +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,55,32,78,69,79,78,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.align 2 + +.align 6 +@@ +@@ _aes_preheat +@@ +@@ Fills q9-q15 as specified below. +@@ +#ifdef __thumb2__ +.thumb_func _vpaes_preheat +#endif +.align 4 +_vpaes_preheat: + adr r10, Lk_inv + vmov.i8 q9, #0x0f @ Lk_s0F + vld1.64 {q10,q11}, [r10]! @ Lk_inv + add r10, r10, #64 @ Skip Lk_ipt, Lk_sbo + vld1.64 {q12,q13}, [r10]! @ Lk_sb1 + vld1.64 {q14,q15}, [r10] @ Lk_sb2 + bx lr + +@@ +@@ _aes_encrypt_core +@@ +@@ AES-encrypt q0. +@@ +@@ Inputs: +@@ q0 = input +@@ q9-q15 as in _vpaes_preheat +@@ [r2] = scheduled keys +@@ +@@ Output in q0 +@@ Clobbers q1-q5, r8-r11 +@@ Preserves q6-q8 so you get some local vectors +@@ +@@ +#ifdef __thumb2__ +.thumb_func _vpaes_encrypt_core +#endif +.align 4 +_vpaes_encrypt_core: + mov r9, r2 + ldr r8, [r2,#240] @ pull rounds + adr r11, Lk_ipt + @ vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + @ vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + vld1.64 {q2, q3}, [r11] + adr r11, Lk_mc_forward+16 + vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 # round0 key + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 + vtbl.8 d2, {q2}, d2 @ vpshufb %xmm1, %xmm2, %xmm1 + vtbl.8 d3, {q2}, d3 + vtbl.8 d4, {q3}, d0 @ vpshufb %xmm0, %xmm3, %xmm2 + vtbl.8 d5, {q3}, d1 + veor q0, q1, q5 @ vpxor %xmm5, %xmm1, %xmm0 + veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 + + @ .Lenc_entry ends with a bnz instruction which is normally paired with + @ subs in .Lenc_loop. + tst r8, r8 + b Lenc_entry + +.align 4 +Lenc_loop: + @ middle of middle round + add r10, r11, #0x40 + vtbl.8 d8, {q13}, d4 @ vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + vtbl.8 d9, {q13}, d5 + vld1.64 {q1}, [r11]! 
@ vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[] + vtbl.8 d0, {q12}, d6 @ vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + vtbl.8 d1, {q12}, d7 + veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + vtbl.8 d10, {q15}, d4 @ vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + vtbl.8 d11, {q15}, d5 + veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A + vtbl.8 d4, {q14}, d6 @ vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + vtbl.8 d5, {q14}, d7 + vld1.64 {q4}, [r10] @ vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[] + vtbl.8 d6, {q0}, d2 @ vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + vtbl.8 d7, {q0}, d3 + veor q2, q2, q5 @ vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + @ Write to q5 instead of q0, so the table and destination registers do + @ not overlap. + vtbl.8 d10, {q0}, d8 @ vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + vtbl.8 d11, {q0}, d9 + veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + vtbl.8 d8, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + vtbl.8 d9, {q3}, d3 + @ Here we restore the original q0/q5 usage. + veor q0, q5, q3 @ vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + and r11, r11, #~(1<<6) @ and $0x30, %r11 # ... mod 4 + veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + subs r8, r8, #1 @ nr-- + +Lenc_entry: + @ top of round + vand q1, q0, q9 @ vpand %xmm0, %xmm9, %xmm1 # 0 = k + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i + vtbl.8 d10, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + vtbl.8 d11, {q11}, d3 + veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j + vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + vtbl.8 d7, {q10}, d1 + vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + vtbl.8 d9, {q10}, d3 + veor q3, q3, q5 @ vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + vtbl.8 d5, {q10}, d7 + vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + vtbl.8 d7, {q10}, d9 + veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io + veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 + bne Lenc_loop + + @ middle of last round + add r10, r11, #0x80 + + adr r11, Lk_sbo + @ Read to q1 instead of q4, so the vtbl.8 instruction below does not + @ overlap table and destination registers. + vld1.64 {q1}, [r11]! @ vmovdqa -0x60(%r10), %xmm4 # 3 : sbou + vld1.64 {q0}, [r11] @ vmovdqa -0x50(%r10), %xmm0 # 0 : sbot Lk_sbo+16 + vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + vtbl.8 d9, {q1}, d5 + vld1.64 {q1}, [r10] @ vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[] + @ Write to q2 instead of q0 below, to avoid overlapping table and + @ destination registers. + vtbl.8 d4, {q0}, d6 @ vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + vtbl.8 d5, {q0}, d7 + veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + veor q2, q2, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A + @ Here we restore the original q0/q2 usage. + vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0 + vtbl.8 d1, {q2}, d3 + bx lr + + +.globl _vpaes_encrypt +.private_extern _vpaes_encrypt +#ifdef __thumb2__ +.thumb_func _vpaes_encrypt +#endif +.align 4 +_vpaes_encrypt: + @ _vpaes_encrypt_core uses r8-r11. Round up to r7-r11 to maintain stack + @ alignment. + stmdb sp!, {r7,r8,r9,r10,r11,lr} + @ _vpaes_encrypt_core uses q4-q5 (d8-d11), which are callee-saved. 
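+ @ Rough Python model (illustrative only) of the nibble-lookup pattern the
+ @ core above uses repeatedly (vand #0x0f / vshr.u8 #4 / two vtbl.8 / veor):
+ @ each byte indexes one 16-entry table with its low nibble and another with
+ @ its high nibble, and the two results are XORed. The round-key XOR and the
+ @ recombination of successive lookups are omitted here.
+ @
+ @ def lo_hi_lookup(tbl_lo, tbl_hi, block):
+ @     # tbl_lo, tbl_hi: 16-entry byte tables; block: 16 input bytes.
+ @     return bytes(tbl_lo[b & 0x0F] ^ tbl_hi[b >> 4] for b in block)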
+ vstmdb sp!, {d8,d9,d10,d11} + + vld1.64 {q0}, [r0] + bl _vpaes_preheat + bl _vpaes_encrypt_core + vst1.64 {q0}, [r1] + + vldmia sp!, {d8,d9,d10,d11} + ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return + + +@ +@ Decryption stuff +@ + +.align 4 +_vpaes_decrypt_consts: +Lk_dipt:@ decryption input transform +.quad 0x0F505B040B545F00, 0x154A411E114E451A +.quad 0x86E383E660056500, 0x12771772F491F194 +Lk_dsbo:@ decryption sbox final output +.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D +.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C +Lk_dsb9:@ decryption sbox output *9*u, *9*t +.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 +.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 +Lk_dsbd:@ decryption sbox output *D*u, *D*t +.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 +.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 +Lk_dsbb:@ decryption sbox output *B*u, *B*t +.quad 0xD022649296B44200, 0x602646F6B0F2D404 +.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B +Lk_dsbe:@ decryption sbox output *E*u, *E*t +.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 +.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 + + +@@ +@@ Decryption core +@@ +@@ Same API as encryption core, except it clobbers q12-q15 rather than using +@@ the values from _vpaes_preheat. q9-q11 must still be set from +@@ _vpaes_preheat. +@@ +#ifdef __thumb2__ +.thumb_func _vpaes_decrypt_core +#endif +.align 4 +_vpaes_decrypt_core: + mov r9, r2 + ldr r8, [r2,#240] @ pull rounds + + @ This function performs shuffles with various constants. The x86_64 + @ version loads them on-demand into %xmm0-%xmm5. This does not work well + @ for ARMv7 because those registers are shuffle destinations. The ARMv8 + @ version preloads those constants into registers, but ARMv7 has half + @ the registers to work with. Instead, we load them on-demand into + @ q12-q15, registers normally use for preloaded constants. This is fine + @ because decryption doesn't use those constants. The values are + @ constant, so this does not interfere with potential 2x optimizations. + adr r7, Lk_dipt + + vld1.64 {q12,q13}, [r7] @ vmovdqa Lk_dipt(%rip), %xmm2 # iptlo + lsl r11, r8, #4 @ mov %rax, %r11; shl $4, %r11 + eor r11, r11, #0x30 @ xor $0x30, %r11 + adr r10, Lk_sr + and r11, r11, #0x30 @ and $0x30, %r11 + add r11, r11, r10 + adr r10, Lk_mc_forward+48 + + vld1.64 {q4}, [r9]! @ vmovdqu (%r9), %xmm4 # round0 key + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 + vtbl.8 d4, {q12}, d2 @ vpshufb %xmm1, %xmm2, %xmm2 + vtbl.8 d5, {q12}, d3 + vld1.64 {q5}, [r10] @ vmovdqa Lk_mc_forward+48(%rip), %xmm5 + @ vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi + vtbl.8 d0, {q13}, d0 @ vpshufb %xmm0, %xmm1, %xmm0 + vtbl.8 d1, {q13}, d1 + veor q2, q2, q4 @ vpxor %xmm4, %xmm2, %xmm2 + veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 + + @ .Ldec_entry ends with a bnz instruction which is normally paired with + @ subs in .Ldec_loop. + tst r8, r8 + b Ldec_entry + +.align 4 +Ldec_loop: +@ +@ Inverse mix columns +@ + + @ We load .Lk_dsb* into q12-q15 on-demand. See the comment at the top of + @ the function. + adr r10, Lk_dsb9 + vld1.64 {q12,q13}, [r10]! @ vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + @ vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + @ Load sbd* ahead of time. + vld1.64 {q14,q15}, [r10]! 
@ vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + @ vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt + vtbl.8 d8, {q12}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u + vtbl.8 d9, {q12}, d5 + vtbl.8 d2, {q13}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + vtbl.8 d3, {q13}, d7 + veor q0, q4, q0 @ vpxor %xmm4, %xmm0, %xmm0 + + veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + + @ Load sbb* ahead of time. + vld1.64 {q12,q13}, [r10]! @ vmovdqa 0x20(%r10),%xmm4 # 4 : sbbu + @ vmovdqa 0x30(%r10),%xmm1 # 0 : sbbt + + vtbl.8 d8, {q14}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + vtbl.8 d9, {q14}, d5 + @ Write to q1 instead of q0, so the table and destination registers do + @ not overlap. + vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch + vtbl.8 d3, {q0}, d11 + @ Here we restore the original q0/q1 usage. This instruction is + @ reordered from the ARMv8 version so we do not clobber the vtbl.8 + @ below. + veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + vtbl.8 d2, {q15}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + vtbl.8 d3, {q15}, d7 + @ vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + @ vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt + + @ Load sbd* ahead of time. + vld1.64 {q14,q15}, [r10]! @ vmovdqa 0x40(%r10),%xmm4 # 4 : sbeu + @ vmovdqa 0x50(%r10),%xmm1 # 0 : sbet + + vtbl.8 d8, {q12}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + vtbl.8 d9, {q12}, d5 + @ Write to q1 instead of q0, so the table and destination registers do + @ not overlap. + vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch + vtbl.8 d3, {q0}, d11 + @ Here we restore the original q0/q1 usage. This instruction is + @ reordered from the ARMv8 version so we do not clobber the vtbl.8 + @ below. + veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + vtbl.8 d2, {q13}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + vtbl.8 d3, {q13}, d7 + veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + + vtbl.8 d8, {q14}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + vtbl.8 d9, {q14}, d5 + @ Write to q1 instead of q0, so the table and destination registers do + @ not overlap. + vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch + vtbl.8 d3, {q0}, d11 + @ Here we restore the original q0/q1 usage. This instruction is + @ reordered from the ARMv8 version so we do not clobber the vtbl.8 + @ below. + veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + vtbl.8 d2, {q15}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + vtbl.8 d3, {q15}, d7 + vext.8 q5, q5, q5, #12 @ vpalignr $12, %xmm5, %xmm5, %xmm5 + veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + subs r8, r8, #1 @ sub $1,%rax # nr-- + +Ldec_entry: + @ top of round + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i + vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + vtbl.8 d5, {q11}, d3 + veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j + vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + vtbl.8 d7, {q10}, d1 + vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + vtbl.8 d9, {q10}, d3 + veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + vtbl.8 d5, {q10}, d7 + vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + vtbl.8 d7, {q10}, d9 + veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io + veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + vld1.64 {q0}, [r9]! 
@ vmovdqu (%r9), %xmm0 + bne Ldec_loop + + @ middle of last round + + adr r10, Lk_dsbo + + @ Write to q1 rather than q4 to avoid overlapping table and destination. + vld1.64 {q1}, [r10]! @ vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + vtbl.8 d9, {q1}, d5 + @ Write to q2 rather than q1 to avoid overlapping table and destination. + vld1.64 {q2}, [r10] @ vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + vtbl.8 d2, {q2}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + vtbl.8 d3, {q2}, d7 + vld1.64 {q2}, [r11] @ vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160 + veor q4, q4, q0 @ vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + @ Write to q1 rather than q0 so the table and destination registers + @ below do not overlap. + veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm0 # 0 = A + vtbl.8 d0, {q1}, d4 @ vpshufb %xmm2, %xmm0, %xmm0 + vtbl.8 d1, {q1}, d5 + bx lr + + +.globl _vpaes_decrypt +.private_extern _vpaes_decrypt +#ifdef __thumb2__ +.thumb_func _vpaes_decrypt +#endif +.align 4 +_vpaes_decrypt: + @ _vpaes_decrypt_core uses r7-r11. + stmdb sp!, {r7,r8,r9,r10,r11,lr} + @ _vpaes_decrypt_core uses q4-q5 (d8-d11), which are callee-saved. + vstmdb sp!, {d8,d9,d10,d11} + + vld1.64 {q0}, [r0] + bl _vpaes_preheat + bl _vpaes_decrypt_core + vst1.64 {q0}, [r1] + + vldmia sp!, {d8,d9,d10,d11} + ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return + +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +@@ @@ +@@ AES key schedule @@ +@@ @@ +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + +@ This function diverges from both x86_64 and armv7 in which constants are +@ pinned. x86_64 has a common preheat function for all operations. aarch64 +@ separates them because it has enough registers to pin nearly all constants. +@ armv7 does not have enough registers, but needing explicit loads and stores +@ also complicates using x86_64's register allocation directly. +@ +@ We pin some constants for convenience and leave q14 and q15 free to load +@ others on demand. + +@ +@ Key schedule constants +@ + +.align 4 +_vpaes_key_consts: +Lk_dksd:@ decryption key schedule: invskew x*D +.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 +.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +Lk_dksb:@ decryption key schedule: invskew x*B +.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 +.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +Lk_dkse:@ decryption key schedule: invskew x*E + 0x63 +.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 +.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +Lk_dks9:@ decryption key schedule: invskew x*9 +.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC +.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + +Lk_rcon:@ rcon +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +Lk_opt:@ output transform +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 +Lk_deskew:@ deskew tables: inverts the sbox's "skew" +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + + +#ifdef __thumb2__ +.thumb_func _vpaes_key_preheat +#endif +.align 4 +_vpaes_key_preheat: + adr r11, Lk_rcon + vmov.i8 q12, #0x5b @ Lk_s63 + adr r10, Lk_inv @ Must be aligned to 8 mod 16. + vmov.i8 q9, #0x0f @ Lk_s0F + vld1.64 {q10,q11}, [r10] @ Lk_inv + vld1.64 {q8}, [r11] @ Lk_rcon + bx lr + + +#ifdef __thumb2__ +.thumb_func _vpaes_schedule_core +#endif +.align 4 +_vpaes_schedule_core: + @ We only need to save lr, but ARM requires an 8-byte stack alignment, + @ so save an extra register. 
+ stmdb sp!, {r3,lr} + + bl _vpaes_key_preheat @ load the tables + + adr r11, Lk_ipt @ Must be aligned to 8 mod 16. + vld1.64 {q0}, [r0]! @ vmovdqu (%rdi), %xmm0 # load key (unaligned) + + @ input transform + @ Use q4 here rather than q3 so .Lschedule_am_decrypting does not + @ overlap table and destination. + vmov q4, q0 @ vmovdqa %xmm0, %xmm3 + bl _vpaes_schedule_transform + adr r10, Lk_sr @ Must be aligned to 8 mod 16. + vmov q7, q0 @ vmovdqa %xmm0, %xmm7 + + add r8, r8, r10 + tst r3, r3 + bne Lschedule_am_decrypting + + @ encrypting, output zeroth round key after transform + vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx) + b Lschedule_go + +Lschedule_am_decrypting: + @ decrypting, output zeroth round key after shiftrows + vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 + vtbl.8 d6, {q4}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 + vtbl.8 d7, {q4}, d3 + vst1.64 {q3}, [r2] @ vmovdqu %xmm3, (%rdx) + eor r8, r8, #0x30 @ xor $0x30, %r8 + +Lschedule_go: + cmp r1, #192 @ cmp $192, %esi + bhi Lschedule_256 + beq Lschedule_192 + @ 128: fall though + +@@ +@@ .schedule_128 +@@ +@@ 128-bit specific part of key schedule. +@@ +@@ This schedule is really simple, because all its parts +@@ are accomplished by the subroutines. +@@ +Lschedule_128: + mov r0, #10 @ mov $10, %esi + +Loop_schedule_128: + bl _vpaes_schedule_round + subs r0, r0, #1 @ dec %esi + beq Lschedule_mangle_last + bl _vpaes_schedule_mangle @ write output + b Loop_schedule_128 + +@@ +@@ .aes_schedule_192 +@@ +@@ 192-bit specific part of key schedule. +@@ +@@ The main body of this schedule is the same as the 128-bit +@@ schedule, but with more smearing. The long, high side is +@@ stored in q7 as before, and the short, low side is in +@@ the high bits of q6. +@@ +@@ This schedule is somewhat nastier, however, because each +@@ round produces 192 bits of key material, or 1.5 round keys. +@@ Therefore, on each cycle we do 2 rounds and produce 3 round +@@ keys. +@@ +.align 4 +Lschedule_192: + sub r0, r0, #8 + vld1.64 {q0}, [r0] @ vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) + bl _vpaes_schedule_transform @ input transform + vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save short part + vmov.i8 d12, #0 @ vpxor %xmm4, %xmm4, %xmm4 # clear 4 + @ vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros + mov r0, #4 @ mov $4, %esi + +Loop_schedule_192: + bl _vpaes_schedule_round + vext.8 q0, q6, q0, #8 @ vpalignr $8,%xmm6,%xmm0,%xmm0 + bl _vpaes_schedule_mangle @ save key n + bl _vpaes_schedule_192_smear + bl _vpaes_schedule_mangle @ save key n+1 + bl _vpaes_schedule_round + subs r0, r0, #1 @ dec %esi + beq Lschedule_mangle_last + bl _vpaes_schedule_mangle @ save key n+2 + bl _vpaes_schedule_192_smear + b Loop_schedule_192 + +@@ +@@ .aes_schedule_256 +@@ +@@ 256-bit specific part of key schedule. +@@ +@@ The structure here is very similar to the 128-bit +@@ schedule, but with an additional "low side" in +@@ q6. The low side's rounds are the same as the +@@ high side's, except no rcon and no rotation. +@@ +.align 4 +Lschedule_256: + vld1.64 {q0}, [r0] @ vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + bl _vpaes_schedule_transform @ input transform + mov r0, #7 @ mov $7, %esi + +Loop_schedule_256: + bl _vpaes_schedule_mangle @ output low result + vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + @ high round + bl _vpaes_schedule_round + subs r0, r0, #1 @ dec %esi + beq Lschedule_mangle_last + bl _vpaes_schedule_mangle + + @ low round. 
swap xmm7 and xmm6 + vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0 + vmov.i8 q4, #0 + vmov q5, q7 @ vmovdqa %xmm7, %xmm5 + vmov q7, q6 @ vmovdqa %xmm6, %xmm7 + bl _vpaes_schedule_low_round + vmov q7, q5 @ vmovdqa %xmm5, %xmm7 + + b Loop_schedule_256 + +@@ +@@ .aes_schedule_mangle_last +@@ +@@ Mangler for last round of key schedule +@@ Mangles q0 +@@ when encrypting, outputs out(q0) ^ 63 +@@ when decrypting, outputs unskew(q0) +@@ +@@ Always called right before return... jumps to cleanup and exits +@@ +.align 4 +Lschedule_mangle_last: + @ schedule last round key from xmm0 + adr r11, Lk_deskew @ lea Lk_deskew(%rip),%r11 # prepare to deskew + tst r3, r3 + bne Lschedule_mangle_last_dec + + @ encrypting + vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10),%xmm1 + adr r11, Lk_opt @ lea Lk_opt(%rip), %r11 # prepare to output transform + add r2, r2, #32 @ add $32, %rdx + vmov q2, q0 + vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0 # output permute + vtbl.8 d1, {q2}, d3 + +Lschedule_mangle_last_dec: + sub r2, r2, #16 @ add $-16, %rdx + veor q0, q0, q12 @ vpxor Lk_s63(%rip), %xmm0, %xmm0 + bl _vpaes_schedule_transform @ output transform + vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx) # save last key + + @ cleanup + veor q0, q0, q0 @ vpxor %xmm0, %xmm0, %xmm0 + veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1 + veor q2, q2, q2 @ vpxor %xmm2, %xmm2, %xmm2 + veor q3, q3, q3 @ vpxor %xmm3, %xmm3, %xmm3 + veor q4, q4, q4 @ vpxor %xmm4, %xmm4, %xmm4 + veor q5, q5, q5 @ vpxor %xmm5, %xmm5, %xmm5 + veor q6, q6, q6 @ vpxor %xmm6, %xmm6, %xmm6 + veor q7, q7, q7 @ vpxor %xmm7, %xmm7, %xmm7 + ldmia sp!, {r3,pc} @ return + + +@@ +@@ .aes_schedule_192_smear +@@ +@@ Smear the short, low side in the 192-bit key schedule. +@@ +@@ Inputs: +@@ q7: high side, b a x y +@@ q6: low side, d c 0 0 +@@ +@@ Outputs: +@@ q6: b+c+d b+c 0 0 +@@ q0: b+c+d b+c b a +@@ +#ifdef __thumb2__ +.thumb_func _vpaes_schedule_192_smear +#endif +.align 4 +_vpaes_schedule_192_smear: + vmov.i8 q1, #0 + vdup.32 q0, d15[1] + vshl.i64 q1, q6, #32 @ vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 + vmov d0, d15 @ vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a + veor q6, q6, q1 @ vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 + veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1 + veor q6, q6, q0 @ vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a + vmov q0, q6 @ vmovdqa %xmm6, %xmm0 + vmov d12, d2 @ vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros + bx lr + + +@@ +@@ .aes_schedule_round +@@ +@@ Runs one main round of the key schedule on q0, q7 +@@ +@@ Specifically, runs subbytes on the high dword of q0 +@@ then rotates it by one byte and xors into the low dword of +@@ q7. +@@ +@@ Adds rcon from low byte of q8, then rotates q8 for +@@ next rcon. +@@ +@@ Smears the dwords of q7 by xoring the low into the +@@ second low, result into third, result into highest. +@@ +@@ Returns results in q7 = q0. +@@ Clobbers q1-q4, r11. +@@ +#ifdef __thumb2__ +.thumb_func _vpaes_schedule_round +#endif +.align 4 +_vpaes_schedule_round: + @ extract rcon from xmm8 + vmov.i8 q4, #0 @ vpxor %xmm4, %xmm4, %xmm4 + vext.8 q1, q8, q4, #15 @ vpalignr $15, %xmm8, %xmm4, %xmm1 + vext.8 q8, q8, q8, #15 @ vpalignr $15, %xmm8, %xmm8, %xmm8 + veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 + + @ rotate + vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0 + vext.8 q0, q0, q0, #1 @ vpalignr $1, %xmm0, %xmm0, %xmm0 + + @ fall through... + + @ low round: same as high round, but no rotation and no rcon. +_vpaes_schedule_low_round: + @ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12. 
+ @ We pin other values in _vpaes_key_preheat, so load them now. + adr r11, Lk_sb1 + vld1.64 {q14,q15}, [r11] + + @ smear xmm7 + vext.8 q1, q4, q7, #12 @ vpslldq $4, %xmm7, %xmm1 + veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 + vext.8 q4, q4, q7, #8 @ vpslldq $8, %xmm7, %xmm4 + + @ subbytes + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i + veor q7, q7, q4 @ vpxor %xmm4, %xmm7, %xmm7 + vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + vtbl.8 d5, {q11}, d3 + veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j + vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + vtbl.8 d7, {q10}, d1 + veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + vtbl.8 d9, {q10}, d3 + veor q7, q7, q12 @ vpxor Lk_s63(%rip), %xmm7, %xmm7 + vtbl.8 d6, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak + vtbl.8 d7, {q10}, d7 + veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + vtbl.8 d4, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak + vtbl.8 d5, {q10}, d9 + veor q3, q3, q1 @ vpxor %xmm1, %xmm3, %xmm3 # 2 = io + veor q2, q2, q0 @ vpxor %xmm0, %xmm2, %xmm2 # 3 = jo + vtbl.8 d8, {q15}, d6 @ vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou + vtbl.8 d9, {q15}, d7 + vtbl.8 d2, {q14}, d4 @ vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t + vtbl.8 d3, {q14}, d5 + veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output + + @ add in smeared stuff + veor q0, q1, q7 @ vpxor %xmm7, %xmm1, %xmm0 + veor q7, q1, q7 @ vmovdqa %xmm0, %xmm7 + bx lr + + +@@ +@@ .aes_schedule_transform +@@ +@@ Linear-transform q0 according to tables at [r11] +@@ +@@ Requires that q9 = 0x0F0F... as in preheat +@@ Output in q0 +@@ Clobbers q1, q2, q14, q15 +@@ +#ifdef __thumb2__ +.thumb_func _vpaes_schedule_transform +#endif +.align 4 +_vpaes_schedule_transform: + vld1.64 {q14,q15}, [r11] @ vmovdqa (%r11), %xmm2 # lo + @ vmovdqa 16(%r11), %xmm1 # hi + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 + vtbl.8 d4, {q14}, d2 @ vpshufb %xmm1, %xmm2, %xmm2 + vtbl.8 d5, {q14}, d3 + vtbl.8 d0, {q15}, d0 @ vpshufb %xmm0, %xmm1, %xmm0 + vtbl.8 d1, {q15}, d1 + veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 + bx lr + + +@@ +@@ .aes_schedule_mangle +@@ +@@ Mangles q0 from (basis-transformed) standard version +@@ to our version. +@@ +@@ On encrypt, +@@ xor with 0x63 +@@ multiply by circulant 0,1,1,1 +@@ apply shiftrows transform +@@ +@@ On decrypt, +@@ xor with 0x63 +@@ multiply by "inverse mixcolumns" circulant E,B,D,9 +@@ deskew +@@ apply shiftrows transform +@@ +@@ +@@ Writes out to [r2], and increments or decrements it +@@ Keeps track of round number mod 4 in r8 +@@ Preserves q0 +@@ Clobbers q1-q5 +@@ +#ifdef __thumb2__ +.thumb_func _vpaes_schedule_mangle +#endif +.align 4 +_vpaes_schedule_mangle: + tst r3, r3 + vmov q4, q0 @ vmovdqa %xmm0, %xmm4 # save xmm0 for later + adr r11, Lk_mc_forward @ Must be aligned to 8 mod 16. + vld1.64 {q5}, [r11] @ vmovdqa Lk_mc_forward(%rip),%xmm5 + bne Lschedule_mangle_dec + + @ encrypting + @ Write to q2 so we do not overlap table and destination below. 
+ veor q2, q0, q12 @ vpxor Lk_s63(%rip), %xmm0, %xmm4 + add r2, r2, #16 @ add $16, %rdx + vtbl.8 d8, {q2}, d10 @ vpshufb %xmm5, %xmm4, %xmm4 + vtbl.8 d9, {q2}, d11 + vtbl.8 d2, {q4}, d10 @ vpshufb %xmm5, %xmm4, %xmm1 + vtbl.8 d3, {q4}, d11 + vtbl.8 d6, {q1}, d10 @ vpshufb %xmm5, %xmm1, %xmm3 + vtbl.8 d7, {q1}, d11 + veor q4, q4, q1 @ vpxor %xmm1, %xmm4, %xmm4 + vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 + veor q3, q3, q4 @ vpxor %xmm4, %xmm3, %xmm3 + + b Lschedule_mangle_both +.align 4 +Lschedule_mangle_dec: + @ inverse mix columns + adr r11, Lk_dksd @ lea Lk_dksd(%rip),%r11 + vshr.u8 q1, q4, #4 @ vpsrlb $4, %xmm4, %xmm1 # 1 = hi + vand q4, q4, q9 @ vpand %xmm9, %xmm4, %xmm4 # 4 = lo + + vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x00(%r11), %xmm2 + @ vmovdqa 0x10(%r11), %xmm3 + vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 + vtbl.8 d5, {q14}, d9 + vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 + vtbl.8 d7, {q15}, d3 + @ Load .Lk_dksb ahead of time. + vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x20(%r11), %xmm2 + @ vmovdqa 0x30(%r11), %xmm3 + @ Write to q13 so we do not overlap table and destination. + veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 + vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3 + vtbl.8 d7, {q13}, d11 + + vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 + vtbl.8 d5, {q14}, d9 + veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2 + vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 + vtbl.8 d7, {q15}, d3 + @ Load .Lk_dkse ahead of time. + vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x40(%r11), %xmm2 + @ vmovdqa 0x50(%r11), %xmm3 + @ Write to q13 so we do not overlap table and destination. + veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 + vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3 + vtbl.8 d7, {q13}, d11 + + vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 + vtbl.8 d5, {q14}, d9 + veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2 + vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 + vtbl.8 d7, {q15}, d3 + @ Load .Lk_dkse ahead of time. + vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x60(%r11), %xmm2 + @ vmovdqa 0x70(%r11), %xmm4 + @ Write to q13 so we do not overlap table and destination. + veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 + + vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 + vtbl.8 d5, {q14}, d9 + vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3 + vtbl.8 d7, {q13}, d11 + vtbl.8 d8, {q15}, d2 @ vpshufb %xmm1, %xmm4, %xmm4 + vtbl.8 d9, {q15}, d3 + vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 + veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2 + veor q3, q4, q2 @ vpxor %xmm2, %xmm4, %xmm3 + + sub r2, r2, #16 @ add $-16, %rdx + +Lschedule_mangle_both: + @ Write to q2 so table and destination do not overlap. 
+ vtbl.8 d4, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 + vtbl.8 d5, {q3}, d3 + add r8, r8, #64-16 @ add $-16, %r8 + and r8, r8, #~(1<<6) @ and $0x30, %r8 + vst1.64 {q2}, [r2] @ vmovdqu %xmm3, (%rdx) + bx lr + + +.globl _vpaes_set_encrypt_key +.private_extern _vpaes_set_encrypt_key +#ifdef __thumb2__ +.thumb_func _vpaes_set_encrypt_key +#endif +.align 4 +_vpaes_set_encrypt_key: + stmdb sp!, {r7,r8,r9,r10,r11, lr} + vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + + lsr r9, r1, #5 @ shr $5,%eax + add r9, r9, #5 @ $5,%eax + str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov r3, #0 @ mov $0,%ecx + mov r8, #0x30 @ mov $0x30,%r8d + bl _vpaes_schedule_core + eor r0, r0, r0 + + vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return + + +.globl _vpaes_set_decrypt_key +.private_extern _vpaes_set_decrypt_key +#ifdef __thumb2__ +.thumb_func _vpaes_set_decrypt_key +#endif +.align 4 +_vpaes_set_decrypt_key: + stmdb sp!, {r7,r8,r9,r10,r11, lr} + vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + + lsr r9, r1, #5 @ shr $5,%eax + add r9, r9, #5 @ $5,%eax + str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + lsl r9, r9, #4 @ shl $4,%eax + add r2, r2, #16 @ lea 16(%rdx,%rax),%rdx + add r2, r2, r9 + + mov r3, #1 @ mov $1,%ecx + lsr r8, r1, #1 @ shr $1,%r8d + and r8, r8, #32 @ and $32,%r8d + eor r8, r8, #32 @ xor $32,%r8d # nbits==192?0:32 + bl _vpaes_schedule_core + + vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return + + +@ Additional constants for converting to bsaes. + +.align 4 +_vpaes_convert_consts: +@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear +@ transform in the AES S-box. 0x63 is incorporated into the low half of the +@ table. This was computed with the following script: +@ +@ def u64s_to_u128(x, y): +@ return x | (y << 64) +@ def u128_to_u64s(w): +@ return w & ((1<<64)-1), w >> 64 +@ def get_byte(w, i): +@ return (w >> (i*8)) & 0xff +@ def apply_table(table, b): +@ lo = b & 0xf +@ hi = b >> 4 +@ return get_byte(table[0], lo) ^ get_byte(table[1], hi) +@ def opt(b): +@ table = [ +@ u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808), +@ u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0), +@ ] +@ return apply_table(table, b) +@ def rot_byte(b, n): +@ return 0xff & ((b << n) | (b >> (8-n))) +@ def skew(x): +@ return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^ +@ rot_byte(x, 4)) +@ table = [0, 0] +@ for i in range(16): +@ table[0] |= (skew(opt(i)) ^ 0x63) << (i*8) +@ table[1] |= skew(opt(i<<4)) << (i*8) +@ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[0])) +@ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[1])) +Lk_opt_then_skew: +.quad 0x9cb8436798bc4763, 0x6440bb9f6044bf9b +.quad 0x1f30062936192f00, 0xb49bad829db284ab + +@ .Lk_decrypt_transform is a permutation which performs an 8-bit left-rotation +@ followed by a byte-swap on each 32-bit word of a vector. E.g., 0x11223344 +@ becomes 0x22334411 and then 0x11443322. +Lk_decrypt_transform: +.quad 0x0704050603000102, 0x0f0c0d0e0b08090a + + +@ void vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes); +.globl _vpaes_encrypt_key_to_bsaes +.private_extern _vpaes_encrypt_key_to_bsaes +#ifdef __thumb2__ +.thumb_func _vpaes_encrypt_key_to_bsaes +#endif +.align 4 +_vpaes_encrypt_key_to_bsaes: + stmdb sp!, {r11, lr} + + @ See _vpaes_schedule_core for the key schedule logic. 
In particular, + @ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper), + @ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last + @ contain the transformations not in the bsaes representation. This + @ function inverts those transforms. + @ + @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key + @ representation, which does not match the other aes_nohw_* + @ implementations. The ARM aes_nohw_* stores each 32-bit word + @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the + @ cost of extra REV and VREV32 operations in little-endian ARM. + + vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform + adr r2, Lk_mc_forward @ Must be aligned to 8 mod 16. + add r3, r2, 0x90 @ Lk_sr+0x10-Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression) + + vld1.64 {q12}, [r2] + vmov.i8 q10, #0x5b @ Lk_s63 from vpaes-x86_64 + adr r11, Lk_opt @ Must be aligned to 8 mod 16. + vmov.i8 q11, #0x63 @ LK_s63 without Lk_ipt applied + + @ vpaes stores one fewer round count than bsaes, but the number of keys + @ is the same. + ldr r2, [r1,#240] + add r2, r2, #1 + str r2, [r0,#240] + + @ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt). + @ Invert this with .Lk_opt. + vld1.64 {q0}, [r1]! + bl _vpaes_schedule_transform + vrev32.8 q0, q0 + vst1.64 {q0}, [r0]! + + @ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied, + @ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63, + @ multiplies by the circulant 0,1,1,1, then applies ShiftRows. +Loop_enc_key_to_bsaes: + vld1.64 {q0}, [r1]! + + @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle + @ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30. + @ We use r3 rather than r8 to avoid a callee-saved register. + vld1.64 {q1}, [r3] + vtbl.8 d4, {q0}, d2 + vtbl.8 d5, {q0}, d3 + add r3, r3, #16 + and r3, r3, #~(1<<6) + vmov q0, q2 + + @ Handle the last key differently. + subs r2, r2, #1 + beq Loop_enc_key_to_bsaes_last + + @ Multiply by the circulant. This is its own inverse. + vtbl.8 d2, {q0}, d24 + vtbl.8 d3, {q0}, d25 + vmov q0, q1 + vtbl.8 d4, {q1}, d24 + vtbl.8 d5, {q1}, d25 + veor q0, q0, q2 + vtbl.8 d2, {q2}, d24 + vtbl.8 d3, {q2}, d25 + veor q0, q0, q1 + + @ XOR and finish. + veor q0, q0, q10 + bl _vpaes_schedule_transform + vrev32.8 q0, q0 + vst1.64 {q0}, [r0]! + b Loop_enc_key_to_bsaes + +Loop_enc_key_to_bsaes_last: + @ The final key does not have a basis transform (note + @ .Lschedule_mangle_last inverts the original transform). It only XORs + @ 0x63 and applies ShiftRows. The latter was already inverted in the + @ loop. Note that, because we act on the original representation, we use + @ q11, not q10. + veor q0, q0, q11 + vrev32.8 q0, q0 + vst1.64 {q0}, [r0] + + @ Wipe registers which contained key material. + veor q0, q0, q0 + veor q1, q1, q1 + veor q2, q2, q2 + + ldmia sp!, {r11, pc} @ return + + +@ void vpaes_decrypt_key_to_bsaes(AES_KEY *vpaes, const AES_KEY *bsaes); +.globl _vpaes_decrypt_key_to_bsaes +.private_extern _vpaes_decrypt_key_to_bsaes +#ifdef __thumb2__ +.thumb_func _vpaes_decrypt_key_to_bsaes +#endif +.align 4 +_vpaes_decrypt_key_to_bsaes: + stmdb sp!, {r11, lr} + + @ See _vpaes_schedule_core for the key schedule logic. Note vpaes + @ computes the decryption key schedule in reverse. Additionally, + @ aes-x86_64.pl shares some transformations, so we must only partially + @ invert vpaes's transformations. 
In general, vpaes computes in a + @ different basis (.Lk_ipt and .Lk_opt) and applies the inverses of + @ MixColumns, ShiftRows, and the affine part of the AES S-box (which is + @ split into a linear skew and XOR of 0x63). We undo all but MixColumns. + @ + @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key + @ representation, which does not match the other aes_nohw_* + @ implementations. The ARM aes_nohw_* stores each 32-bit word + @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the + @ cost of extra REV and VREV32 operations in little-endian ARM. + + adr r2, Lk_decrypt_transform + adr r3, Lk_sr+0x30 + adr r11, Lk_opt_then_skew @ Input to _vpaes_schedule_transform. + vld1.64 {q12}, [r2] @ Reuse q12 from encryption. + vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform + + @ vpaes stores one fewer round count than bsaes, but the number of keys + @ is the same. + ldr r2, [r1,#240] + add r2, r2, #1 + str r2, [r0,#240] + + @ Undo the basis change and reapply the S-box affine transform. See + @ .Lschedule_mangle_last. + vld1.64 {q0}, [r1]! + bl _vpaes_schedule_transform + vrev32.8 q0, q0 + vst1.64 {q0}, [r0]! + + @ See _vpaes_schedule_mangle for the transform on the middle keys. Note + @ it simultaneously inverts MixColumns and the S-box affine transform. + @ See .Lk_dksd through .Lk_dks9. +Loop_dec_key_to_bsaes: + vld1.64 {q0}, [r1]! + + @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note going + @ forwards cancels inverting for which direction we cycle r3. We use r3 + @ rather than r8 to avoid a callee-saved register. + vld1.64 {q1}, [r3] + vtbl.8 d4, {q0}, d2 + vtbl.8 d5, {q0}, d3 + add r3, r3, #64-16 + and r3, r3, #~(1<<6) + vmov q0, q2 + + @ Handle the last key differently. + subs r2, r2, #1 + beq Loop_dec_key_to_bsaes_last + + @ Undo the basis change and reapply the S-box affine transform. + bl _vpaes_schedule_transform + + @ Rotate each word by 8 bytes (cycle the rows) and then byte-swap. We + @ combine the two operations in .Lk_decrypt_transform. + @ + @ TODO(davidben): Where does the rotation come from? + vtbl.8 d2, {q0}, d24 + vtbl.8 d3, {q0}, d25 + + vst1.64 {q1}, [r0]! + b Loop_dec_key_to_bsaes + +Loop_dec_key_to_bsaes_last: + @ The final key only inverts ShiftRows (already done in the loop). See + @ .Lschedule_am_decrypting. Its basis is not transformed. + vrev32.8 q0, q0 + vst1.64 {q0}, [r0]! + + @ Wipe registers which contained key material. + veor q0, q0, q0 + veor q1, q1, q1 + veor q2, q2, q2 + + ldmia sp!, {r11, pc} @ return + +.globl _vpaes_ctr32_encrypt_blocks +.private_extern _vpaes_ctr32_encrypt_blocks +#ifdef __thumb2__ +.thumb_func _vpaes_ctr32_encrypt_blocks +#endif +.align 4 +_vpaes_ctr32_encrypt_blocks: + mov ip, sp + stmdb sp!, {r7,r8,r9,r10,r11, lr} + @ This function uses q4-q7 (d8-d15), which are callee-saved. + vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + + cmp r2, #0 + @ r8 is passed on the stack. + ldr r8, [ip] + beq Lctr32_done + + @ _vpaes_encrypt_core expects the key in r2, so swap r2 and r3. + mov r9, r3 + mov r3, r2 + mov r2, r9 + + @ Load the IV and counter portion. + ldr r7, [r8, #12] + vld1.8 {q7}, [r8] + + bl _vpaes_preheat + rev r7, r7 @ The counter is big-endian. + +Lctr32_loop: + vmov q0, q7 + vld1.8 {q6}, [r0]! @ Load input ahead of time + bl _vpaes_encrypt_core + veor q0, q0, q6 @ XOR input and result + vst1.8 {q0}, [r1]! + subs r3, r3, #1 + @ Update the counter. 
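+ @ The trailing 4 bytes of the counter block are a 32-bit big-endian counter;
+ @ r7 holds it in native order (hence the rev after the load and the rev
+ @ before writing it back into lane d15[1]). Roughly, per block (illustrative
+ @ Python, not part of the generated code):
+ @
+ @ def next_counter_block(block16):
+ @     # Only the trailing big-endian 32-bit word changes between blocks.
+ @     ctr = (int.from_bytes(block16[12:], "big") + 1) & 0xFFFFFFFF
+ @     return block16[:12] + ctr.to_bytes(4, "big")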
+ add r7, r7, #1 + rev r9, r7 + vmov.32 d15[1], r9 + bne Lctr32_loop + +Lctr32_done: + vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return + +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-arm/crypto/test/trampoline-armv4.S b/packager/third_party/boringssl/ios-arm/crypto/test/trampoline-armv4.S new file mode 100644 index 0000000000..51ac249ef5 --- /dev/null +++ b/packager/third_party/boringssl/ios-arm/crypto/test/trampoline-armv4.S @@ -0,0 +1,377 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.syntax unified + + + + +.text + +@ abi_test_trampoline loads callee-saved registers from |state|, calls |func| +@ with |argv|, then saves the callee-saved registers into |state|. It returns +@ the result of |func|. The |unwind| argument is unused. +@ uint32_t abi_test_trampoline(void (*func)(...), CallerState *state, +@ const uint32_t *argv, size_t argc, +@ int unwind); + +.globl _abi_test_trampoline +.private_extern _abi_test_trampoline +.align 4 +_abi_test_trampoline: +Labi_test_trampoline_begin: + @ Save parameters and all callee-saved registers. For convenience, we + @ save r9 on iOS even though it's volatile. + vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + stmdb sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,lr} + + @ Reserve stack space for six (10-4) stack parameters, plus an extra 4 + @ bytes to keep it 8-byte-aligned (see AAPCS, section 5.3). + sub sp, sp, #28 + + @ Every register in AAPCS is either non-volatile or a parameter (except + @ r9 on iOS), so this code, by the actual call, loses all its scratch + @ registers. First fill in stack parameters while there are registers + @ to spare. + cmp r3, #4 + bls Lstack_args_done + mov r4, sp @ r4 is the output pointer. + add r5, r2, r3, lsl #2 @ Set r5 to the end of argv. + add r2, r2, #16 @ Skip four arguments. +Lstack_args_loop: + ldr r6, [r2], #4 + cmp r2, r5 + str r6, [r4], #4 + bne Lstack_args_loop + +Lstack_args_done: + @ Load registers from |r1|. + vldmia r1!, {d8,d9,d10,d11,d12,d13,d14,d15} +#if defined(__APPLE__) + @ r9 is not volatile on iOS. + ldmia r1!, {r4,r5,r6,r7,r8,r10-r11} +#else + ldmia r1!, {r4,r5,r6,r7,r8,r9,r10,r11} +#endif + + @ Load register parameters. This uses up our remaining registers, so we + @ repurpose lr as scratch space. + ldr r3, [sp, #40] @ Reload argc. + ldr lr, [sp, #36] @ Load argv into lr. + cmp r3, #3 + bhi Larg_r3 + beq Larg_r2 + cmp r3, #1 + bhi Larg_r1 + beq Larg_r0 + b Largs_done + +Larg_r3: + ldr r3, [lr, #12] @ argv[3] +Larg_r2: + ldr r2, [lr, #8] @ argv[2] +Larg_r1: + ldr r1, [lr, #4] @ argv[1] +Larg_r0: + ldr r0, [lr] @ argv[0] +Largs_done: + + @ With every other register in use, load the function pointer into lr + @ and call the function. + ldr lr, [sp, #28] + blx lr + + @ r1-r3 are free for use again. The trampoline only supports + @ single-return functions. Pass r4-r11 to the caller. + ldr r1, [sp, #32] + vstmia r1!, {d8,d9,d10,d11,d12,d13,d14,d15} +#if defined(__APPLE__) + @ r9 is not volatile on iOS. + stmia r1!, {r4,r5,r6,r7,r8,r10-r11} +#else + stmia r1!, {r4,r5,r6,r7,r8,r9,r10,r11} +#endif + + @ Unwind the stack and restore registers. 
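+ @ (The add below discards the 28-byte argument area together with the 16
+ @ bytes of saved r0-r3; r4-r11 and lr are then reloaded, followed by d8-d15.)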
+ add sp, sp, #44 @ 44 = 28+16 + ldmia sp!, {r4,r5,r6,r7,r8,r9,r10,r11,lr} @ Skip r0-r3 (see +16 above). + vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + + bx lr + + +.globl _abi_test_clobber_r0 +.private_extern _abi_test_clobber_r0 +.align 4 +_abi_test_clobber_r0: + mov r0, #0 + bx lr + + +.globl _abi_test_clobber_r1 +.private_extern _abi_test_clobber_r1 +.align 4 +_abi_test_clobber_r1: + mov r1, #0 + bx lr + + +.globl _abi_test_clobber_r2 +.private_extern _abi_test_clobber_r2 +.align 4 +_abi_test_clobber_r2: + mov r2, #0 + bx lr + + +.globl _abi_test_clobber_r3 +.private_extern _abi_test_clobber_r3 +.align 4 +_abi_test_clobber_r3: + mov r3, #0 + bx lr + + +.globl _abi_test_clobber_r4 +.private_extern _abi_test_clobber_r4 +.align 4 +_abi_test_clobber_r4: + mov r4, #0 + bx lr + + +.globl _abi_test_clobber_r5 +.private_extern _abi_test_clobber_r5 +.align 4 +_abi_test_clobber_r5: + mov r5, #0 + bx lr + + +.globl _abi_test_clobber_r6 +.private_extern _abi_test_clobber_r6 +.align 4 +_abi_test_clobber_r6: + mov r6, #0 + bx lr + + +.globl _abi_test_clobber_r7 +.private_extern _abi_test_clobber_r7 +.align 4 +_abi_test_clobber_r7: + mov r7, #0 + bx lr + + +.globl _abi_test_clobber_r8 +.private_extern _abi_test_clobber_r8 +.align 4 +_abi_test_clobber_r8: + mov r8, #0 + bx lr + + +.globl _abi_test_clobber_r9 +.private_extern _abi_test_clobber_r9 +.align 4 +_abi_test_clobber_r9: + mov r9, #0 + bx lr + + +.globl _abi_test_clobber_r10 +.private_extern _abi_test_clobber_r10 +.align 4 +_abi_test_clobber_r10: + mov r10, #0 + bx lr + + +.globl _abi_test_clobber_r11 +.private_extern _abi_test_clobber_r11 +.align 4 +_abi_test_clobber_r11: + mov r11, #0 + bx lr + + +.globl _abi_test_clobber_r12 +.private_extern _abi_test_clobber_r12 +.align 4 +_abi_test_clobber_r12: + mov r12, #0 + bx lr + + +.globl _abi_test_clobber_d0 +.private_extern _abi_test_clobber_d0 +.align 4 +_abi_test_clobber_d0: + mov r0, #0 + vmov s0, r0 + vmov s1, r0 + bx lr + + +.globl _abi_test_clobber_d1 +.private_extern _abi_test_clobber_d1 +.align 4 +_abi_test_clobber_d1: + mov r0, #0 + vmov s2, r0 + vmov s3, r0 + bx lr + + +.globl _abi_test_clobber_d2 +.private_extern _abi_test_clobber_d2 +.align 4 +_abi_test_clobber_d2: + mov r0, #0 + vmov s4, r0 + vmov s5, r0 + bx lr + + +.globl _abi_test_clobber_d3 +.private_extern _abi_test_clobber_d3 +.align 4 +_abi_test_clobber_d3: + mov r0, #0 + vmov s6, r0 + vmov s7, r0 + bx lr + + +.globl _abi_test_clobber_d4 +.private_extern _abi_test_clobber_d4 +.align 4 +_abi_test_clobber_d4: + mov r0, #0 + vmov s8, r0 + vmov s9, r0 + bx lr + + +.globl _abi_test_clobber_d5 +.private_extern _abi_test_clobber_d5 +.align 4 +_abi_test_clobber_d5: + mov r0, #0 + vmov s10, r0 + vmov s11, r0 + bx lr + + +.globl _abi_test_clobber_d6 +.private_extern _abi_test_clobber_d6 +.align 4 +_abi_test_clobber_d6: + mov r0, #0 + vmov s12, r0 + vmov s13, r0 + bx lr + + +.globl _abi_test_clobber_d7 +.private_extern _abi_test_clobber_d7 +.align 4 +_abi_test_clobber_d7: + mov r0, #0 + vmov s14, r0 + vmov s15, r0 + bx lr + + +.globl _abi_test_clobber_d8 +.private_extern _abi_test_clobber_d8 +.align 4 +_abi_test_clobber_d8: + mov r0, #0 + vmov s16, r0 + vmov s17, r0 + bx lr + + +.globl _abi_test_clobber_d9 +.private_extern _abi_test_clobber_d9 +.align 4 +_abi_test_clobber_d9: + mov r0, #0 + vmov s18, r0 + vmov s19, r0 + bx lr + + +.globl _abi_test_clobber_d10 +.private_extern _abi_test_clobber_d10 +.align 4 +_abi_test_clobber_d10: + mov r0, #0 + vmov s20, r0 + vmov s21, r0 + bx lr + + +.globl _abi_test_clobber_d11 +.private_extern 
_abi_test_clobber_d11 +.align 4 +_abi_test_clobber_d11: + mov r0, #0 + vmov s22, r0 + vmov s23, r0 + bx lr + + +.globl _abi_test_clobber_d12 +.private_extern _abi_test_clobber_d12 +.align 4 +_abi_test_clobber_d12: + mov r0, #0 + vmov s24, r0 + vmov s25, r0 + bx lr + + +.globl _abi_test_clobber_d13 +.private_extern _abi_test_clobber_d13 +.align 4 +_abi_test_clobber_d13: + mov r0, #0 + vmov s26, r0 + vmov s27, r0 + bx lr + + +.globl _abi_test_clobber_d14 +.private_extern _abi_test_clobber_d14 +.align 4 +_abi_test_clobber_d14: + mov r0, #0 + vmov s28, r0 + vmov s29, r0 + bx lr + + +.globl _abi_test_clobber_d15 +.private_extern _abi_test_clobber_d15 +.align 4 +_abi_test_clobber_d15: + mov r0, #0 + vmov s30, r0 + vmov s31, r0 + bx lr + +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/chacha/chacha-armv8.S b/packager/third_party/boringssl/linux-aarch64/crypto/chacha/chacha-armv8.S index 6ff6bffb66..49449bf532 100644 --- a/packager/third_party/boringssl/linux-aarch64/crypto/chacha/chacha-armv8.S +++ b/packager/third_party/boringssl/linux-aarch64/crypto/chacha/chacha-armv8.S @@ -1,39 +1,48 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__aarch64__) +#if defined(BORINGSSL_PREFIX) +#include +#endif #include -.text +.section .rodata .align 5 .Lsigma: .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral .Lone: .long 1,0,0,0 -.LOPENSSL_armcap_P: -#ifdef __ILP32__ -.long OPENSSL_armcap_P-. -#else -.quad OPENSSL_armcap_P-. -#endif .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 +.text + .globl ChaCha20_ctr32 .hidden ChaCha20_ctr32 .type ChaCha20_ctr32,%function .align 5 ChaCha20_ctr32: cbz x2,.Labort - adr x5,.LOPENSSL_armcap_P +#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10 + adrp x5,:pg_hi21_nc:OPENSSL_armcap_P +#else + adrp x5,OPENSSL_armcap_P +#endif cmp x2,#192 b.lo .Lshort -#ifdef __ILP32__ - ldrsw x6,[x5] -#else - ldr x6,[x5] -#endif - ldr w17,[x6,x5] + ldr w17,[x5,:lo12:OPENSSL_armcap_P] tst w17,#ARMV7_NEON b.ne ChaCha20_neon @@ -41,7 +50,8 @@ ChaCha20_ctr32: stp x29,x30,[sp,#-96]! add x29,sp,#0 - adr x5,.Lsigma + adrp x5,.Lsigma + add x5,x5,:lo12:.Lsigma stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] @@ -314,7 +324,8 @@ ChaCha20_neon: stp x29,x30,[sp,#-96]! add x29,sp,#0 - adr x5,.Lsigma + adrp x5,.Lsigma + add x5,x5,:lo12:.Lsigma stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] @@ -807,7 +818,8 @@ ChaCha20_512_neon: stp x29,x30,[sp,#-96]! 
add x29,sp,#0 - adr x5,.Lsigma + adrp x5,.Lsigma + add x5,x5,:lo12:.Lsigma stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] @@ -1969,3 +1981,5 @@ ChaCha20_512_neon: ret .size ChaCha20_512_neon,.-ChaCha20_512_neon #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/aesv8-armx64.S b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/aesv8-armx64.S index 51e2464487..60c70a24fd 100644 --- a/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/aesv8-armx64.S +++ b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/aesv8-armx64.S @@ -1,17 +1,32 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__aarch64__) +#if defined(BORINGSSL_PREFIX) +#include +#endif #include #if __ARM_MAX_ARCH__>=7 .text -#if !defined(__clang__) || defined(BORINGSSL_CLANG_SUPPORTS_DOT_ARCH) .arch armv8-a+crypto -#endif +.section .rodata .align 5 .Lrcon: .long 0x01,0x01,0x01,0x01 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat .long 0x1b,0x1b,0x1b,0x1b +.text + .globl aes_hw_set_encrypt_key .hidden aes_hw_set_encrypt_key .type aes_hw_set_encrypt_key,%function @@ -33,7 +48,8 @@ aes_hw_set_encrypt_key: tst w1,#0x3f b.ne .Lenc_key_abort - adr x3,.Lrcon + adrp x3,.Lrcon + add x3,x3,:lo12:.Lrcon cmp w1,#192 eor v0.16b,v0.16b,v0.16b @@ -755,3 +771,5 @@ aes_hw_ctr32_encrypt_blocks: .size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks #endif #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/armv8-mont.S b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/armv8-mont.S index 74702db6aa..360bf4c7fe 100644 --- a/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/armv8-mont.S +++ b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/armv8-mont.S @@ -1,4 +1,18 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__aarch64__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl bn_mul_mont @@ -1405,3 +1419,5 @@ __bn_mul4x_mont: .align 2 .align 4 #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S new file mode 100644 index 0000000000..f876db3f89 --- /dev/null +++ b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S @@ -0,0 +1,341 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
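+
+// Informal reference (sketch only) for the GHASH routines below. The NEON
+// code works with a "twisted" H and Karatsuba splitting, but the value it
+// computes is the bit-reflected GF(2^128) product from the GCM spec:
+//
+// def gcm_mult(x, y):
+//     R = 0xE1 << 120                   # GCM reduction constant
+//     z, v = 0, y
+//     for i in range(127, -1, -1):      # bits of x, most significant first
+//         if (x >> i) & 1:
+//             z ^= v
+//         v = (v >> 1) ^ (R if v & 1 else 0)
+//     return z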
+ +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(__aarch64__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text + +.globl gcm_init_neon +.hidden gcm_init_neon +.type gcm_init_neon,%function +.align 4 +gcm_init_neon: + // This function is adapted from gcm_init_v8. xC2 is t3. + ld1 {v17.2d}, [x1] // load H + movi v19.16b, #0xe1 + shl v19.2d, v19.2d, #57 // 0xc2.0 + ext v3.16b, v17.16b, v17.16b, #8 + ushr v18.2d, v19.2d, #63 + dup v17.4s, v17.s[1] + ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01 + ushr v18.2d, v3.2d, #63 + sshr v17.4s, v17.4s, #31 // broadcast carry bit + and v18.16b, v18.16b, v16.16b + shl v3.2d, v3.2d, #1 + ext v18.16b, v18.16b, v18.16b, #8 + and v16.16b, v16.16b, v17.16b + orr v3.16b, v3.16b, v18.16b // H<<<=1 + eor v5.16b, v3.16b, v16.16b // twisted H + st1 {v5.2d}, [x0] // store Htable[0] + ret +.size gcm_init_neon,.-gcm_init_neon + +.globl gcm_gmult_neon +.hidden gcm_gmult_neon +.type gcm_gmult_neon,%function +.align 4 +gcm_gmult_neon: + ld1 {v3.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, .Lmasks // load constants + add x9, x9, :lo12:.Lmasks + ld1 {v24.2d, v25.2d}, [x9] + rev64 v3.16b, v3.16b // byteswap Xi + ext v3.16b, v3.16b, v3.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + + mov x3, #16 + b .Lgmult_neon +.size gcm_gmult_neon,.-gcm_gmult_neon + +.globl gcm_ghash_neon +.hidden gcm_ghash_neon +.type gcm_ghash_neon,%function +.align 4 +gcm_ghash_neon: + ld1 {v0.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, .Lmasks // load constants + add x9, x9, :lo12:.Lmasks + ld1 {v24.2d, v25.2d}, [x9] + rev64 v0.16b, v0.16b // byteswap Xi + ext v0.16b, v0.16b, v0.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + +.Loop_neon: + ld1 {v3.16b}, [x2], #16 // load inp + rev64 v3.16b, v3.16b // byteswap inp + ext v3.16b, v3.16b, v3.16b, #8 + eor v3.16b, v3.16b, v0.16b // inp ^= Xi + +.Lgmult_neon: + // Split the input into v3 and v4. (The upper halves are unused, + // so it is okay to leave them alone.) + ins v4.d[0], v3.d[1] + ext v16.8b, v5.8b, v5.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v0.8b, v3.8b, v3.8b, #1 // B1 + pmull v0.8h, v5.8b, v0.8b // E = A*B1 + ext v17.8b, v5.8b, v5.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v5.8b, v19.8b // G = A*B2 + ext v18.8b, v5.8b, v5.8b, #3 // A3 + eor v16.16b, v16.16b, v0.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v0.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v0.8h, v5.8b, v0.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. 
To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v0.16b // N = I + J + pmull v19.8h, v5.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. + zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v0.8h, v5.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v0.16b, v0.16b, v16.16b + eor v0.16b, v0.16b, v18.16b + eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing + ext v16.8b, v7.8b, v7.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v1.8b, v3.8b, v3.8b, #1 // B1 + pmull v1.8h, v7.8b, v1.8b // E = A*B1 + ext v17.8b, v7.8b, v7.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v7.8b, v19.8b // G = A*B2 + ext v18.8b, v7.8b, v7.8b, #3 // A3 + eor v16.16b, v16.16b, v1.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v1.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v1.8h, v7.8b, v1.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v1.16b // N = I + J + pmull v19.8h, v7.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. 
+ zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v1.8h, v7.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v1.16b, v1.16b, v16.16b + eor v1.16b, v1.16b, v18.16b + ext v16.8b, v6.8b, v6.8b, #1 // A1 + pmull v16.8h, v16.8b, v4.8b // F = A1*B + ext v2.8b, v4.8b, v4.8b, #1 // B1 + pmull v2.8h, v6.8b, v2.8b // E = A*B1 + ext v17.8b, v6.8b, v6.8b, #2 // A2 + pmull v17.8h, v17.8b, v4.8b // H = A2*B + ext v19.8b, v4.8b, v4.8b, #2 // B2 + pmull v19.8h, v6.8b, v19.8b // G = A*B2 + ext v18.8b, v6.8b, v6.8b, #3 // A3 + eor v16.16b, v16.16b, v2.16b // L = E + F + pmull v18.8h, v18.8b, v4.8b // J = A3*B + ext v2.8b, v4.8b, v4.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v2.8h, v6.8b, v2.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v4.8b, v4.8b, #4 // B4 + eor v18.16b, v18.16b, v2.16b // N = I + J + pmull v19.8h, v6.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. + zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v2.8h, v6.8b, v4.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v2.16b, v2.16b, v16.16b + eor v2.16b, v2.16b, v18.16b + ext v16.16b, v0.16b, v2.16b, #8 + eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing + eor v1.16b, v1.16b, v2.16b + eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi + ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result + // This is a no-op due to the ins instruction below. 
+ // ins v2.d[0], v1.d[1] + + // equivalent of reduction_avx from ghash-x86_64.pl + shl v17.2d, v0.2d, #57 // 1st phase + shl v18.2d, v0.2d, #62 + eor v18.16b, v18.16b, v17.16b // + shl v17.2d, v0.2d, #63 + eor v18.16b, v18.16b, v17.16b // + // Note Xm contains {Xl.d[1], Xh.d[0]}. + eor v18.16b, v18.16b, v1.16b + ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0] + ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1] + + ushr v18.2d, v0.2d, #1 // 2nd phase + eor v2.16b, v2.16b,v0.16b + eor v0.16b, v0.16b,v18.16b // + ushr v18.2d, v18.2d, #6 + ushr v0.2d, v0.2d, #1 // + eor v0.16b, v0.16b, v2.16b // + eor v0.16b, v0.16b, v18.16b // + + subs x3, x3, #16 + bne .Loop_neon + + rev64 v0.16b, v0.16b // byteswap Xi and write + ext v0.16b, v0.16b, v0.16b, #8 + st1 {v0.16b}, [x0] + + ret +.size gcm_ghash_neon,.-gcm_ghash_neon + +.section .rodata +.align 4 +.Lmasks: +.quad 0x0000ffffffffffff // k48 +.quad 0x00000000ffffffff // k32 +.quad 0x000000000000ffff // k16 +.quad 0x0000000000000000 // k0 +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S index 89d780ff69..37d97317aa 100644 --- a/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S +++ b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S @@ -1,10 +1,22 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__aarch64__) +#if defined(BORINGSSL_PREFIX) +#include +#endif #include .text -#if !defined(__clang__) || defined(BORINGSSL_CLANG_SUPPORTS_DOT_ARCH) .arch armv8-a+crypto -#endif .globl gcm_init_v8 .hidden gcm_init_v8 .type gcm_init_v8,%function @@ -108,13 +120,13 @@ gcm_ghash_v8: //loaded value would have //to be rotated in order to //make it appear as in - //alorithm specification + //algorithm specification subs x3,x3,#32 //see if x3 is 32 or larger mov x12,#16 //x12 is used as post- //increment for input pointer; //as loop is modulo-scheduled //x12 is zeroed just in time - //to preclude oversteping + //to preclude overstepping //inp[len], which means that //last block[s] are actually //loaded twice, but last @@ -233,3 +245,5 @@ gcm_ghash_v8: .align 2 .align 2 #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha1-armv8.S b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha1-armv8.S index ff361f454a..f681b9983f 100644 --- a/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha1-armv8.S +++ b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha1-armv8.S @@ -1,4 +1,18 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__aarch64__) +#if defined(BORINGSSL_PREFIX) +#include +#endif #include .text @@ -9,14 +23,12 @@ .type sha1_block_data_order,%function .align 6 sha1_block_data_order: -#ifdef __ILP32__ - ldrsw x16,.LOPENSSL_armcap_P +#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10 + adrp x16,:pg_hi21_nc:OPENSSL_armcap_P #else - ldr x16,.LOPENSSL_armcap_P + adrp x16,OPENSSL_armcap_P #endif - adr x17,.LOPENSSL_armcap_P - add x16,x16,x17 - ldr w16,[x16] + ldr w16,[x16,:lo12:OPENSSL_armcap_P] tst w16,#ARMV8_SHA1 b.ne .Lv8_entry @@ -1082,7 +1094,8 @@ sha1_block_armv8: stp x29,x30,[sp,#-16]! add x29,sp,#0 - adr x4,.Lconst + adrp x4,.Lconst + add x4,x4,:lo12:.Lconst eor v1.16b,v1.16b,v1.16b ld1 {v0.4s},[x0],#16 ld1 {v1.s}[0],[x0] @@ -1205,20 +1218,18 @@ sha1_block_armv8: ldr x29,[sp],#16 ret .size sha1_block_armv8,.-sha1_block_armv8 +.section .rodata .align 6 .Lconst: .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19 .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39 .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79 -.LOPENSSL_armcap_P: -#ifdef __ILP32__ -.long OPENSSL_armcap_P-. -#else -.quad OPENSSL_armcap_P-. -#endif .byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 .comm OPENSSL_armcap_P,4,4 +.hidden OPENSSL_armcap_P #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha256-armv8.S b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha256-armv8.S index 19db33937e..6e09f69a94 100644 --- a/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha256-armv8.S +++ b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha256-armv8.S @@ -1,4 +1,18 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__aarch64__) +#if defined(BORINGSSL_PREFIX) +#include +#endif // Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. // // Licensed under the OpenSSL license (the "License"). 
You may not use @@ -51,14 +65,12 @@ .align 6 sha256_block_data_order: #ifndef __KERNEL__ -# ifdef __ILP32__ - ldrsw x16,.LOPENSSL_armcap_P -# else - ldr x16,.LOPENSSL_armcap_P -# endif - adr x17,.LOPENSSL_armcap_P - add x16,x16,x17 - ldr w16,[x16] +#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10 + adrp x16,:pg_hi21_nc:OPENSSL_armcap_P +#else + adrp x16,OPENSSL_armcap_P +#endif + ldr w16,[x16,:lo12:OPENSSL_armcap_P] tst w16,#ARMV8_SHA256 b.ne .Lv8_entry #endif @@ -77,7 +89,8 @@ sha256_block_data_order: ldp w24,w25,[x0,#4*4] add x2,x1,x2,lsl#6 // end of input ldp w26,w27,[x0,#6*4] - adr x30,.LK256 + adrp x30,.LK256 + add x30,x30,:lo12:.LK256 stp x0,x2,[x29,#96] .Loop: @@ -1024,6 +1037,7 @@ sha256_block_data_order: ret .size sha256_block_data_order,.-sha256_block_data_order +.section .rodata .align 6 .type .LK256,%object .LK256: @@ -1045,18 +1059,10 @@ sha256_block_data_order: .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0 //terminator .size .LK256,.-.LK256 -#ifndef __KERNEL__ -.align 3 -.LOPENSSL_armcap_P: -# ifdef __ILP32__ -.long OPENSSL_armcap_P-. -# else -.quad OPENSSL_armcap_P-. -# endif -#endif .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 +.text #ifndef __KERNEL__ .type sha256_block_armv8,%function .align 6 @@ -1066,7 +1072,8 @@ sha256_block_armv8: add x29,sp,#0 ld1 {v0.4s,v1.4s},[x0] - adr x3,.LK256 + adrp x3,.LK256 + add x3,x3,:lo12:.LK256 .Loop_hw: ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 @@ -1199,5 +1206,8 @@ sha256_block_armv8: #endif #ifndef __KERNEL__ .comm OPENSSL_armcap_P,4,4 +.hidden OPENSSL_armcap_P #endif #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha512-armv8.S b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha512-armv8.S index bb052b7551..7b9b22a02a 100644 --- a/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha512-armv8.S +++ b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha512-armv8.S @@ -1,4 +1,18 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__aarch64__) +#if defined(BORINGSSL_PREFIX) +#include +#endif // Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. // // Licensed under the OpenSSL license (the "License"). You may not use @@ -65,7 +79,8 @@ sha512_block_data_order: ldp x24,x25,[x0,#4*8] add x2,x1,x2,lsl#7 // end of input ldp x26,x27,[x0,#6*8] - adr x30,.LK512 + adrp x30,.LK512 + add x30,x30,:lo12:.LK512 stp x0,x2,[x29,#96] .Loop: @@ -1012,6 +1027,7 @@ sha512_block_data_order: ret .size sha512_block_data_order,.-sha512_block_data_order +.section .rodata .align 6 .type .LK512,%object .LK512: @@ -1057,19 +1073,13 @@ sha512_block_data_order: .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .quad 0 // terminator .size .LK512,.-.LK512 -#ifndef __KERNEL__ -.align 3 -.LOPENSSL_armcap_P: -# ifdef __ILP32__ -.long OPENSSL_armcap_P-. -# else -.quad OPENSSL_armcap_P-. 
-# endif -#endif .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 #ifndef __KERNEL__ .comm OPENSSL_armcap_P,4,4 +.hidden OPENSSL_armcap_P #endif #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/vpaes-armv8.S b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/vpaes-armv8.S new file mode 100644 index 0000000000..f57b7b5174 --- /dev/null +++ b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/vpaes-armv8.S @@ -0,0 +1,1216 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(__aarch64__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.section .rodata + +.type _vpaes_consts,%object +.align 7 // totally strategic alignment +_vpaes_consts: +.Lk_mc_forward: // mc_forward +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 +.Lk_mc_backward: // mc_backward +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F +.Lk_sr: // sr +.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +// +// "Hot" constants +// +.Lk_inv: // inv, inva +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 +.Lk_ipt: // input transform (lo, hi) +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 +.Lk_sbo: // sbou, sbot +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA +.Lk_sb1: // sb1u, sb1t +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.Lk_sb2: // sb2u, sb2t +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + +// +// Decryption stuff +// +.Lk_dipt: // decryption input transform +.quad 0x0F505B040B545F00, 0x154A411E114E451A +.quad 0x86E383E660056500, 0x12771772F491F194 +.Lk_dsbo: // decryption sbox final output +.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D +.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C +.Lk_dsb9: // decryption sbox output *9*u, *9*t +.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 +.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 +.Lk_dsbd: // decryption sbox output *D*u, *D*t +.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 +.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 +.Lk_dsbb: // decryption sbox output *B*u, *B*t +.quad 0xD022649296B44200, 0x602646F6B0F2D404 +.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B +.Lk_dsbe: // decryption sbox output *E*u, *E*t +.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 +.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 + +// +// Key schedule constants +// +.Lk_dksd: // decryption key schedule: invskew x*D +.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 +.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E 
+.Lk_dksb: // decryption key schedule: invskew x*B +.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 +.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +.Lk_dkse: // decryption key schedule: invskew x*E + 0x63 +.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 +.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +.Lk_dks9: // decryption key schedule: invskew x*9 +.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC +.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + +.Lk_rcon: // rcon +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_opt: // output transform +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 +.Lk_deskew: // deskew tables: inverts the sbox's "skew" +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.align 2 +.size _vpaes_consts,.-_vpaes_consts +.align 6 + +.text +## +## _aes_preheat +## +## Fills register %r10 -> .aes_consts (so you can -fPIC) +## and %xmm9-%xmm15 as specified below. +## +.type _vpaes_encrypt_preheat,%function +.align 4 +_vpaes_encrypt_preheat: + adrp x10, .Lk_inv + add x10, x10, :lo12:.Lk_inv + movi v17.16b, #0x0f + ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv + ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // .Lk_sb1, .Lk_sb2 + ret +.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. +## +## Inputs: +## %xmm0 = input +## %xmm9-%xmm15 as in _vpaes_preheat +## (%rdx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax +## Preserves %xmm6 - %xmm8 so you get some local vectors +## +## +.type _vpaes_encrypt_core,%function +.align 4 +_vpaes_encrypt_core: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adrp x11, .Lk_mc_forward+16 + add x11, x11, :lo12:.Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b .Lenc_entry + +.align 4 +.Lenc_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + tbl 
v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + sub w8, w8, #1 // nr-- + +.Lenc_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, .Lenc_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + ret +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core + +.globl vpaes_encrypt +.hidden vpaes_encrypt +.type vpaes_encrypt,%function +.align 4 +vpaes_encrypt: + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ld1 {v7.16b}, [x0] + bl _vpaes_encrypt_preheat + bl _vpaes_encrypt_core + st1 {v0.16b}, [x1] + + ldp x29,x30,[sp],#16 + ret +.size vpaes_encrypt,.-vpaes_encrypt + +.type _vpaes_encrypt_2x,%function +.align 4 +_vpaes_encrypt_2x: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adrp x11, .Lk_mc_forward+16 + add x11, x11, :lo12:.Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + tbl v9.16b, {v20.16b}, v9.16b + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + tbl v10.16b, {v21.16b}, v8.16b + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v8.16b, v9.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b .Lenc_2x_entry + +.align 4 +.Lenc_2x_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + tbl v12.16b, {v25.16b}, v10.16b + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + tbl v8.16b, {v24.16b}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + tbl v13.16b, {v27.16b}, v10.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + tbl v10.16b, {v26.16b}, v11.16b + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + tbl v11.16b, {v8.16b}, v1.16b + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + eor v10.16b, v10.16b, v13.16b + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + tbl v8.16b, {v8.16b}, v4.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + eor v11.16b, v11.16b, v10.16b + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + tbl v12.16b, {v11.16b},v1.16b + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + eor v8.16b, v8.16b, v11.16b + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... 
mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + eor v8.16b, v8.16b, v12.16b + sub w8, w8, #1 // nr-- + +.Lenc_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + tbl v13.16b, {v19.16b},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {v18.16b},v8.16b + tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {v18.16b},v9.16b + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v13.16b + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v13.16b + tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {v18.16b},v11.16b + tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {v18.16b},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, .Lenc_2x_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {v22.16b}, v10.16b + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + tbl v8.16b, {v23.16b}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v1.16b + ret +.size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x + +.type _vpaes_decrypt_preheat,%function +.align 4 +_vpaes_decrypt_preheat: + adrp x10, .Lk_inv + add x10, x10, :lo12:.Lk_inv + movi v17.16b, #0x0f + adrp x11, .Lk_dipt + add x11, x11, :lo12:.Lk_dipt + ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv + ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64 // .Lk_dipt, .Lk_dsbo + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64 // .Lk_dsb9, .Lk_dsbd + ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x11] // .Lk_dsbb, .Lk_dsbe + ret +.size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat + +## +## Decryption core +## +## Same API as encryption core. 
+## +.type _vpaes_decrypt_core,%function +.align 4 +_vpaes_decrypt_core: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + + // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo + lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 + eor x11, x11, #0x30 // xor $0x30, %r11 + adrp x10, .Lk_sr + add x10, x10, :lo12:.Lk_sr + and x11, x11, #0x30 // and $0x30, %r11 + add x11, x11, x10 + adrp x10, .Lk_mc_forward+48 + add x10, x10, :lo12:.Lk_mc_forward+48 + + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 + // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi + tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b .Ldec_entry + +.align 4 +.Ldec_loop: +// +// Inverse mix columns +// + // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u + tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 + // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt + + tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt + + tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet + + tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5 + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + sub w8, w8, #1 // sub $1,%rax # nr-- + +.Ldec_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, 
%xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 + cbnz w8, .Ldec_loop + + // middle of last round + // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 + tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0 + ret +.size _vpaes_decrypt_core,.-_vpaes_decrypt_core + +.globl vpaes_decrypt +.hidden vpaes_decrypt +.type vpaes_decrypt,%function +.align 4 +vpaes_decrypt: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v7.16b}, [x0] + bl _vpaes_decrypt_preheat + bl _vpaes_decrypt_core + st1 {v0.16b}, [x1] + + ldp x29,x30,[sp],#16 + ret +.size vpaes_decrypt,.-vpaes_decrypt + +// v14-v15 input, v0-v1 output +.type _vpaes_decrypt_2x,%function +.align 4 +_vpaes_decrypt_2x: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + + // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo + lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 + eor x11, x11, #0x30 // xor $0x30, %r11 + adrp x10, .Lk_sr + add x10, x10, :lo12:.Lk_sr + and x11, x11, #0x30 // and $0x30, %r11 + add x11, x11, x10 + adrp x10, .Lk_mc_forward+48 + add x10, x10, :lo12:.Lk_mc_forward+48 + + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + tbl v10.16b, {v20.16b},v9.16b + ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 + // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi + tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + tbl v8.16b, {v21.16b},v8.16b + eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 + eor v10.16b, v10.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b .Ldec_2x_entry + +.align 4 +.Ldec_2x_loop: +// +// Inverse mix columns +// + // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u + tbl v12.16b, {v24.16b}, v10.16b + tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + tbl v9.16b, {v25.16b}, v11.16b + eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 + eor v8.16b, v12.16b, v16.16b + // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt + + tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + tbl v12.16b, {v26.16b}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + tbl v9.16b, {v27.16b}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + // 
vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt + + tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + tbl v12.16b, {v28.16b}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + tbl v9.16b, {v29.16b}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet + + tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + tbl v12.16b, {v30.16b}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + tbl v9.16b, {v31.16b}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5 + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + sub w8, w8, #1 // sub $1,%rax # nr-- + +.Ldec_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + tbl v10.16b, {v19.16b},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {v18.16b},v8.16b + tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {v18.16b},v9.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v10.16b + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v10.16b + tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {v18.16b},v11.16b + tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {v18.16b},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 + cbnz w8, .Ldec_2x_loop + + // middle of last round + // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {v22.16b}, v10.16b + // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + tbl v9.16b, {v23.16b}, v11.16b + ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 + eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A + eor v8.16b, v9.16b, v12.16b + tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v2.16b + ret +.size _vpaes_decrypt_2x,.-_vpaes_decrypt_2x +######################################################## +## ## +## AES key schedule ## +## ## +######################################################## +.type _vpaes_key_preheat,%function +.align 4 
+_vpaes_key_preheat: + adrp x10, .Lk_inv + add x10, x10, :lo12:.Lk_inv + movi v16.16b, #0x5b // .Lk_s63 + adrp x11, .Lk_sb1 + add x11, x11, :lo12:.Lk_sb1 + movi v17.16b, #0x0f // .Lk_s0F + ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // .Lk_inv, .Lk_ipt + adrp x10, .Lk_dksd + add x10, x10, :lo12:.Lk_dksd + ld1 {v22.2d,v23.2d}, [x11] // .Lk_sb1 + adrp x11, .Lk_mc_forward + add x11, x11, :lo12:.Lk_mc_forward + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb + ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9 + ld1 {v8.2d}, [x10] // .Lk_rcon + ld1 {v9.2d}, [x11] // .Lk_mc_forward[0] + ret +.size _vpaes_key_preheat,.-_vpaes_key_preheat + +.type _vpaes_schedule_core,%function +.align 4 +_vpaes_schedule_core: + stp x29, x30, [sp,#-16]! + add x29,sp,#0 + + bl _vpaes_key_preheat // load the tables + + ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) + + // input transform + mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3 + bl _vpaes_schedule_transform + mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 + + adrp x10, .Lk_sr // lea .Lk_sr(%rip),%r10 + add x10, x10, :lo12:.Lk_sr + + add x8, x8, x10 + cbnz w3, .Lschedule_am_decrypting + + // encrypting, output zeroth round key after transform + st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) + b .Lschedule_go + +.Lschedule_am_decrypting: + // decrypting, output zeroth round key after shiftrows + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) + eor x8, x8, #0x30 // xor $0x30, %r8 + +.Lschedule_go: + cmp w1, #192 // cmp $192, %esi + b.hi .Lschedule_256 + b.eq .Lschedule_192 + // 128: fall though + +## +## .schedule_128 +## +## 128-bit specific part of key schedule. +## +## This schedule is really simple, because all its parts +## are accomplished by the subroutines. +## +.Lschedule_128: + mov x0, #10 // mov $10, %esi + +.Loop_schedule_128: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_round + cbz x0, .Lschedule_mangle_last + bl _vpaes_schedule_mangle // write output + b .Loop_schedule_128 + +## +## .aes_schedule_192 +## +## 192-bit specific part of key schedule. +## +## The main body of this schedule is the same as the 128-bit +## schedule, but with more smearing. The long, high side is +## stored in %xmm7 as before, and the short, low side is in +## the high bits of %xmm6. +## +## This schedule is somewhat nastier, however, because each +## round produces 192 bits of key material, or 1.5 round keys. +## Therefore, on each cycle we do 2 rounds and produce 3 round +## keys. +## +.align 4 +.Lschedule_192: + sub x0, x0, #8 + ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) + bl _vpaes_schedule_transform // input transform + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 + ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros + mov x0, #4 // mov $4, %esi + +.Loop_schedule_192: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_round + ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0 + bl _vpaes_schedule_mangle // save key n + bl _vpaes_schedule_192_smear + bl _vpaes_schedule_mangle // save key n+1 + bl _vpaes_schedule_round + cbz x0, .Lschedule_mangle_last + bl _vpaes_schedule_mangle // save key n+2 + bl _vpaes_schedule_192_smear + b .Loop_schedule_192 + +## +## .aes_schedule_256 +## +## 256-bit specific part of key schedule. 
+## +## The structure here is very similar to the 128-bit +## schedule, but with an additional "low side" in +## %xmm6. The low side's rounds are the same as the +## high side's, except no rcon and no rotation. +## +.align 4 +.Lschedule_256: + ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + bl _vpaes_schedule_transform // input transform + mov x0, #7 // mov $7, %esi + +.Loop_schedule_256: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_mangle // output low result + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + // high round + bl _vpaes_schedule_round + cbz x0, .Lschedule_mangle_last + bl _vpaes_schedule_mangle + + // low round. swap xmm7 and xmm6 + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + movi v4.16b, #0 + mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 + mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 + bl _vpaes_schedule_low_round + mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 + + b .Loop_schedule_256 + +## +## .aes_schedule_mangle_last +## +## Mangler for last round of key schedule +## Mangles %xmm0 +## when encrypting, outputs out(%xmm0) ^ 63 +## when decrypting, outputs unskew(%xmm0) +## +## Always called right before return... jumps to cleanup and exits +## +.align 4 +.Lschedule_mangle_last: + // schedule last round key from xmm0 + adrp x11, .Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew + add x11, x11, :lo12:.Lk_deskew + + cbnz w3, .Lschedule_mangle_last_dec + + // encrypting + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 + adrp x11, .Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform + add x11, x11, :lo12:.Lk_opt + add x2, x2, #32 // add $32, %rdx + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute + +.Lschedule_mangle_last_dec: + ld1 {v20.2d,v21.2d}, [x11] // reload constants + sub x2, x2, #16 // add $-16, %rdx + eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0 + bl _vpaes_schedule_transform // output transform + st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key + + // cleanup + eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2 + eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 + eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 + eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 + eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 + ldp x29, x30, [sp],#16 + ret +.size _vpaes_schedule_core,.-_vpaes_schedule_core + +## +## .aes_schedule_192_smear +## +## Smear the short, low side in the 192-bit key schedule. 
+## +## Inputs: +## %xmm7: high side, b a x y +## %xmm6: low side, d c 0 0 +## %xmm13: 0 +## +## Outputs: +## %xmm6: b+c+d b+c 0 0 +## %xmm0: b+c+d b+c b a +## +.type _vpaes_schedule_192_smear,%function +.align 4 +_vpaes_schedule_192_smear: + movi v1.16b, #0 + dup v0.4s, v7.s[3] + ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 + ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a + eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a + mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 + ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros + ret +.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear + +## +## .aes_schedule_round +## +## Runs one main round of the key schedule on %xmm0, %xmm7 +## +## Specifically, runs subbytes on the high dword of %xmm0 +## then rotates it by one byte and xors into the low dword of +## %xmm7. +## +## Adds rcon from low byte of %xmm8, then rotates %xmm8 for +## next rcon. +## +## Smears the dwords of %xmm7 by xoring the low into the +## second low, result into third, result into highest. +## +## Returns results in %xmm7 = %xmm0. +## Clobbers %xmm1-%xmm4, %r11. +## +.type _vpaes_schedule_round,%function +.align 4 +_vpaes_schedule_round: + // extract rcon from xmm8 + movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 + ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1 + ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + + // rotate + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0 + + // fall through... + + // low round: same as high round, but no rotation and no rcon. 
+_vpaes_schedule_low_round: + // smear xmm7 + ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4 + + // subbytes + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 + tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7 + tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak + eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io + eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo + tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou + tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t + eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output + + // add in smeared stuff + eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 + eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 + ret +.size _vpaes_schedule_round,.-_vpaes_schedule_round + +## +## .aes_schedule_transform +## +## Linear-transform %xmm0 according to tables at (%r11) +## +## Requires that %xmm9 = 0x0F0F... as in preheat +## Output in %xmm0 +## Clobbers %xmm1, %xmm2 +## +.type _vpaes_schedule_transform,%function +.align 4 +_vpaes_schedule_transform: + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + // vmovdqa (%r11), %xmm2 # lo + tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + // vmovdqa 16(%r11), %xmm1 # hi + tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + ret +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform + +## +## .aes_schedule_mangle +## +## Mangle xmm0 from (basis-transformed) standard version +## to our version. 
+## +## On encrypt, +## xor with 0x63 +## multiply by circulant 0,1,1,1 +## apply shiftrows transform +## +## On decrypt, +## xor with 0x63 +## multiply by "inverse mixcolumns" circulant E,B,D,9 +## deskew +## apply shiftrows transform +## +## +## Writes out to (%rdx), and increments or decrements it +## Keeps track of round number mod 4 in %r8 +## Preserves xmm0 +## Clobbers xmm1-xmm5 +## +.type _vpaes_schedule_mangle,%function +.align 4 +_vpaes_schedule_mangle: + mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later + // vmovdqa .Lk_mc_forward(%rip),%xmm5 + cbnz w3, .Lschedule_mangle_dec + + // encrypting + eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4 + add x2, x2, #16 // add $16, %rdx + tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 + tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 + tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 + eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 + + b .Lschedule_mangle_both +.align 4 +.Lschedule_mangle_dec: + // inverse mix columns + // lea .Lk_dksd(%rip),%r11 + ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi + and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo + + // vmovdqa 0x00(%r11), %xmm2 + tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + // vmovdqa 0x10(%r11), %xmm3 + tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + + // vmovdqa 0x20(%r11), %xmm2 + tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + // vmovdqa 0x30(%r11), %xmm3 + tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + + // vmovdqa 0x40(%r11), %xmm2 + tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + // vmovdqa 0x50(%r11), %xmm3 + tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + + // vmovdqa 0x60(%r11), %xmm2 + tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + // vmovdqa 0x70(%r11), %xmm4 + tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4 + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3 + + sub x2, x2, #16 // add $-16, %rdx + +.Lschedule_mangle_both: + tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + add x8, x8, #64-16 // add $-16, %r8 + and x8, x8, #~(1<<6) // and $0x30, %r8 + st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) + ret +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle + +.globl vpaes_set_encrypt_key +.hidden vpaes_set_encrypt_key +.type vpaes_set_encrypt_key,%function +.align 4 +vpaes_set_encrypt_key: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! 
// ABI spec says so + + lsr w9, w1, #5 // shr $5,%eax + add w9, w9, #5 // $5,%eax + str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov w3, #0 // mov $0,%ecx + mov x8, #0x30 // mov $0x30,%r8d + bl _vpaes_schedule_core + eor x0, x0, x0 + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + ret +.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key + +.globl vpaes_set_decrypt_key +.hidden vpaes_set_decrypt_key +.type vpaes_set_decrypt_key,%function +.align 4 +vpaes_set_decrypt_key: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + + lsr w9, w1, #5 // shr $5,%eax + add w9, w9, #5 // $5,%eax + str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + lsl w9, w9, #4 // shl $4,%eax + add x2, x2, #16 // lea 16(%rdx,%rax),%rdx + add x2, x2, x9 + + mov w3, #1 // mov $1,%ecx + lsr w8, w1, #1 // shr $1,%r8d + and x8, x8, #32 // and $32,%r8d + eor x8, x8, #32 // xor $32,%r8d # nbits==192?0:32 + bl _vpaes_schedule_core + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + ret +.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key +.globl vpaes_cbc_encrypt +.hidden vpaes_cbc_encrypt +.type vpaes_cbc_encrypt,%function +.align 4 +vpaes_cbc_encrypt: + cbz x2, .Lcbc_abort + cmp w5, #0 // check direction + b.eq vpaes_cbc_decrypt + + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov x17, x2 // reassign + mov x2, x3 // reassign + + ld1 {v0.16b}, [x4] // load ivec + bl _vpaes_encrypt_preheat + b .Lcbc_enc_loop + +.align 4 +.Lcbc_enc_loop: + ld1 {v7.16b}, [x0],#16 // load input + eor v7.16b, v7.16b, v0.16b // xor with ivec + bl _vpaes_encrypt_core + st1 {v0.16b}, [x1],#16 // save output + subs x17, x17, #16 + b.hi .Lcbc_enc_loop + + st1 {v0.16b}, [x4] // write ivec + + ldp x29,x30,[sp],#16 +.Lcbc_abort: + ret +.size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt + +.type vpaes_cbc_decrypt,%function +.align 4 +vpaes_cbc_decrypt: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! + + mov x17, x2 // reassign + mov x2, x3 // reassign + ld1 {v6.16b}, [x4] // load ivec + bl _vpaes_decrypt_preheat + tst x17, #16 + b.eq .Lcbc_dec_loop2x + + ld1 {v7.16b}, [x0], #16 // load input + bl _vpaes_decrypt_core + eor v0.16b, v0.16b, v6.16b // xor with ivec + orr v6.16b, v7.16b, v7.16b // next ivec value + st1 {v0.16b}, [x1], #16 + subs x17, x17, #16 + b.ls .Lcbc_dec_done + +.align 4 +.Lcbc_dec_loop2x: + ld1 {v14.16b,v15.16b}, [x0], #32 + bl _vpaes_decrypt_2x + eor v0.16b, v0.16b, v6.16b // xor with ivec + eor v1.16b, v1.16b, v14.16b + orr v6.16b, v15.16b, v15.16b + st1 {v0.16b,v1.16b}, [x1], #32 + subs x17, x17, #32 + b.hi .Lcbc_dec_loop2x + +.Lcbc_dec_done: + st1 {v6.16b}, [x4] + + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + ret +.size vpaes_cbc_decrypt,.-vpaes_cbc_decrypt +.globl vpaes_ctr32_encrypt_blocks +.hidden vpaes_ctr32_encrypt_blocks +.type vpaes_ctr32_encrypt_blocks,%function +.align 4 +vpaes_ctr32_encrypt_blocks: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! + + cbz x2, .Lctr32_done + + // Note, unlike the other functions, x2 here is measured in blocks, + // not bytes. + mov x17, x2 + mov x2, x3 + + // Load the IV and counter portion. + ldr w6, [x4, #12] + ld1 {v7.16b}, [x4] + + bl _vpaes_encrypt_preheat + tst x17, #1 + rev w6, w6 // The counter is big-endian. 
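The code above reads the low 32 bits of the counter block out of the IV and byte-swaps them because the counter is big-endian; the paths that follow increment that value once per block and re-insert it into lane 3 of the counter vector with rev + mov. A minimal C sketch of that per-block update, assuming a 16-byte counter block whose last four bytes hold the big-endian block counter (an illustrative helper, not BoringSSL code):

#include <stdint.h>

/* Advance the 32-bit big-endian counter in the last word of a CTR block.
 * The upper 96 bits of the IV stay untouched; the counter wraps modulo
 * 2^32, matching the add + rev sequence in the assembly below. */
static void ctr32_increment(uint8_t block[16]) {
  uint32_t ctr = ((uint32_t)block[12] << 24) | ((uint32_t)block[13] << 16) |
                 ((uint32_t)block[14] << 8) | (uint32_t)block[15];
  ctr++;
  block[12] = (uint8_t)(ctr >> 24);
  block[13] = (uint8_t)(ctr >> 16);
  block[14] = (uint8_t)(ctr >> 8);
  block[15] = (uint8_t)ctr;
}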
+ b.eq .Lctr32_prep_loop + + // Handle one block so the remaining block count is even for + // _vpaes_encrypt_2x. + ld1 {v6.16b}, [x0], #16 // .Load input ahead of time + bl _vpaes_encrypt_core + eor v0.16b, v0.16b, v6.16b // XOR input and result + st1 {v0.16b}, [x1], #16 + subs x17, x17, #1 + // Update the counter. + add w6, w6, #1 + rev w7, w6 + mov v7.s[3], w7 + b.ls .Lctr32_done + +.Lctr32_prep_loop: + // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x + // uses v14 and v15. + mov v15.16b, v7.16b + mov v14.16b, v7.16b + add w6, w6, #1 + rev w7, w6 + mov v15.s[3], w7 + +.Lctr32_loop: + ld1 {v6.16b,v7.16b}, [x0], #32 // .Load input ahead of time + bl _vpaes_encrypt_2x + eor v0.16b, v0.16b, v6.16b // XOR input and result + eor v1.16b, v1.16b, v7.16b // XOR input and result (#2) + st1 {v0.16b,v1.16b}, [x1], #32 + subs x17, x17, #2 + // Update the counter. + add w7, w6, #1 + add w6, w6, #2 + rev w7, w7 + mov v14.s[3], w7 + rev w7, w6 + mov v15.s[3], w7 + b.hi .Lctr32_loop + +.Lctr32_done: + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + ret +.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks +#endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/test/trampoline-armv8.S b/packager/third_party/boringssl/linux-aarch64/crypto/test/trampoline-armv8.S new file mode 100644 index 0000000000..9a21cc2c6e --- /dev/null +++ b/packager/third_party/boringssl/linux-aarch64/crypto/test/trampoline-armv8.S @@ -0,0 +1,688 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(__aarch64__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text + +// abi_test_trampoline loads callee-saved registers from |state|, calls |func| +// with |argv|, then saves the callee-saved registers into |state|. It returns +// the result of |func|. The |unwind| argument is unused. +// uint64_t abi_test_trampoline(void (*func)(...), CallerState *state, +// const uint64_t *argv, size_t argc, +// uint64_t unwind); +.type abi_test_trampoline, %function +.globl abi_test_trampoline +.hidden abi_test_trampoline +.align 4 +abi_test_trampoline: +.Labi_test_trampoline_begin: + // Stack layout (low to high addresses) + // x29,x30 (16 bytes) + // d8-d15 (64 bytes) + // x19-x28 (80 bytes) + // x1 (8 bytes) + // padding (8 bytes) + stp x29, x30, [sp, #-176]! + mov x29, sp + + // Saved callee-saved registers and |state|. + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] + stp x19, x20, [sp, #80] + stp x21, x22, [sp, #96] + stp x23, x24, [sp, #112] + stp x25, x26, [sp, #128] + stp x27, x28, [sp, #144] + str x1, [sp, #160] + + // Load registers from |state|, with the exception of x29. x29 is the + // frame pointer and also callee-saved, but AAPCS64 allows platforms to + // mandate that x29 always point to a frame. iOS64 does so, which means + // we cannot fill x29 with entropy without violating ABI rules + // ourselves. x29 is tested separately below. 
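The loads that follow spell out the layout the trampoline expects for |state|: eight callee-saved FP registers (d8-d15) followed by ten callee-saved GPRs (x19-x28), 144 bytes in all, with x29 deliberately excluded for the reason given above. A rough C equivalent, with field names chosen here purely for illustration (the real CallerState is presumably defined in BoringSSL's C test harness):

#include <stddef.h>
#include <stdint.h>

/* Layout inferred from the ldp/stp order below; names are illustrative. */
typedef struct {
  uint64_t d[8];  /* d8..d15, loaded first, 16 bytes per ldp pair        */
  uint64_t x[10]; /* x19..x28, loaded next; x29 is intentionally absent,
                   * since it is verified rather than filled with values */
} CallerState;

/* Prototype as given in the comment above. |func| is simplified to a
 * no-argument pointer here; the harness casts the function under test.
 * |unwind| is ignored on this target. */
uint64_t abi_test_trampoline(void (*func)(void), CallerState *state,
                             const uint64_t *argv, size_t argc,
                             uint64_t unwind);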
+ ldp d8, d9, [x1], #16 + ldp d10, d11, [x1], #16 + ldp d12, d13, [x1], #16 + ldp d14, d15, [x1], #16 + ldp x19, x20, [x1], #16 + ldp x21, x22, [x1], #16 + ldp x23, x24, [x1], #16 + ldp x25, x26, [x1], #16 + ldp x27, x28, [x1], #16 + + // Move parameters into temporary registers. + mov x9, x0 + mov x10, x2 + mov x11, x3 + + // Load parameters into registers. + cbz x11, .Largs_done + ldr x0, [x10], #8 + subs x11, x11, #1 + b.eq .Largs_done + ldr x1, [x10], #8 + subs x11, x11, #1 + b.eq .Largs_done + ldr x2, [x10], #8 + subs x11, x11, #1 + b.eq .Largs_done + ldr x3, [x10], #8 + subs x11, x11, #1 + b.eq .Largs_done + ldr x4, [x10], #8 + subs x11, x11, #1 + b.eq .Largs_done + ldr x5, [x10], #8 + subs x11, x11, #1 + b.eq .Largs_done + ldr x6, [x10], #8 + subs x11, x11, #1 + b.eq .Largs_done + ldr x7, [x10], #8 + +.Largs_done: + blr x9 + + // Reload |state| and store registers. + ldr x1, [sp, #160] + stp d8, d9, [x1], #16 + stp d10, d11, [x1], #16 + stp d12, d13, [x1], #16 + stp d14, d15, [x1], #16 + stp x19, x20, [x1], #16 + stp x21, x22, [x1], #16 + stp x23, x24, [x1], #16 + stp x25, x26, [x1], #16 + stp x27, x28, [x1], #16 + + // |func| is required to preserve x29, the frame pointer. We cannot load + // random values into x29 (see comment above), so compare it against the + // expected value and zero the field of |state| if corrupted. + mov x9, sp + cmp x29, x9 + b.eq .Lx29_ok + str xzr, [x1] + +.Lx29_ok: + // Restore callee-saved registers. + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] + ldp x19, x20, [sp, #80] + ldp x21, x22, [sp, #96] + ldp x23, x24, [sp, #112] + ldp x25, x26, [sp, #128] + ldp x27, x28, [sp, #144] + + ldp x29, x30, [sp], #176 + ret +.size abi_test_trampoline,.-abi_test_trampoline +.type abi_test_clobber_x0, %function +.globl abi_test_clobber_x0 +.hidden abi_test_clobber_x0 +.align 4 +abi_test_clobber_x0: + mov x0, xzr + ret +.size abi_test_clobber_x0,.-abi_test_clobber_x0 +.type abi_test_clobber_x1, %function +.globl abi_test_clobber_x1 +.hidden abi_test_clobber_x1 +.align 4 +abi_test_clobber_x1: + mov x1, xzr + ret +.size abi_test_clobber_x1,.-abi_test_clobber_x1 +.type abi_test_clobber_x2, %function +.globl abi_test_clobber_x2 +.hidden abi_test_clobber_x2 +.align 4 +abi_test_clobber_x2: + mov x2, xzr + ret +.size abi_test_clobber_x2,.-abi_test_clobber_x2 +.type abi_test_clobber_x3, %function +.globl abi_test_clobber_x3 +.hidden abi_test_clobber_x3 +.align 4 +abi_test_clobber_x3: + mov x3, xzr + ret +.size abi_test_clobber_x3,.-abi_test_clobber_x3 +.type abi_test_clobber_x4, %function +.globl abi_test_clobber_x4 +.hidden abi_test_clobber_x4 +.align 4 +abi_test_clobber_x4: + mov x4, xzr + ret +.size abi_test_clobber_x4,.-abi_test_clobber_x4 +.type abi_test_clobber_x5, %function +.globl abi_test_clobber_x5 +.hidden abi_test_clobber_x5 +.align 4 +abi_test_clobber_x5: + mov x5, xzr + ret +.size abi_test_clobber_x5,.-abi_test_clobber_x5 +.type abi_test_clobber_x6, %function +.globl abi_test_clobber_x6 +.hidden abi_test_clobber_x6 +.align 4 +abi_test_clobber_x6: + mov x6, xzr + ret +.size abi_test_clobber_x6,.-abi_test_clobber_x6 +.type abi_test_clobber_x7, %function +.globl abi_test_clobber_x7 +.hidden abi_test_clobber_x7 +.align 4 +abi_test_clobber_x7: + mov x7, xzr + ret +.size abi_test_clobber_x7,.-abi_test_clobber_x7 +.type abi_test_clobber_x8, %function +.globl abi_test_clobber_x8 +.hidden abi_test_clobber_x8 +.align 4 +abi_test_clobber_x8: + mov x8, xzr + ret +.size abi_test_clobber_x8,.-abi_test_clobber_x8 +.type 
abi_test_clobber_x9, %function +.globl abi_test_clobber_x9 +.hidden abi_test_clobber_x9 +.align 4 +abi_test_clobber_x9: + mov x9, xzr + ret +.size abi_test_clobber_x9,.-abi_test_clobber_x9 +.type abi_test_clobber_x10, %function +.globl abi_test_clobber_x10 +.hidden abi_test_clobber_x10 +.align 4 +abi_test_clobber_x10: + mov x10, xzr + ret +.size abi_test_clobber_x10,.-abi_test_clobber_x10 +.type abi_test_clobber_x11, %function +.globl abi_test_clobber_x11 +.hidden abi_test_clobber_x11 +.align 4 +abi_test_clobber_x11: + mov x11, xzr + ret +.size abi_test_clobber_x11,.-abi_test_clobber_x11 +.type abi_test_clobber_x12, %function +.globl abi_test_clobber_x12 +.hidden abi_test_clobber_x12 +.align 4 +abi_test_clobber_x12: + mov x12, xzr + ret +.size abi_test_clobber_x12,.-abi_test_clobber_x12 +.type abi_test_clobber_x13, %function +.globl abi_test_clobber_x13 +.hidden abi_test_clobber_x13 +.align 4 +abi_test_clobber_x13: + mov x13, xzr + ret +.size abi_test_clobber_x13,.-abi_test_clobber_x13 +.type abi_test_clobber_x14, %function +.globl abi_test_clobber_x14 +.hidden abi_test_clobber_x14 +.align 4 +abi_test_clobber_x14: + mov x14, xzr + ret +.size abi_test_clobber_x14,.-abi_test_clobber_x14 +.type abi_test_clobber_x15, %function +.globl abi_test_clobber_x15 +.hidden abi_test_clobber_x15 +.align 4 +abi_test_clobber_x15: + mov x15, xzr + ret +.size abi_test_clobber_x15,.-abi_test_clobber_x15 +.type abi_test_clobber_x16, %function +.globl abi_test_clobber_x16 +.hidden abi_test_clobber_x16 +.align 4 +abi_test_clobber_x16: + mov x16, xzr + ret +.size abi_test_clobber_x16,.-abi_test_clobber_x16 +.type abi_test_clobber_x17, %function +.globl abi_test_clobber_x17 +.hidden abi_test_clobber_x17 +.align 4 +abi_test_clobber_x17: + mov x17, xzr + ret +.size abi_test_clobber_x17,.-abi_test_clobber_x17 +.type abi_test_clobber_x19, %function +.globl abi_test_clobber_x19 +.hidden abi_test_clobber_x19 +.align 4 +abi_test_clobber_x19: + mov x19, xzr + ret +.size abi_test_clobber_x19,.-abi_test_clobber_x19 +.type abi_test_clobber_x20, %function +.globl abi_test_clobber_x20 +.hidden abi_test_clobber_x20 +.align 4 +abi_test_clobber_x20: + mov x20, xzr + ret +.size abi_test_clobber_x20,.-abi_test_clobber_x20 +.type abi_test_clobber_x21, %function +.globl abi_test_clobber_x21 +.hidden abi_test_clobber_x21 +.align 4 +abi_test_clobber_x21: + mov x21, xzr + ret +.size abi_test_clobber_x21,.-abi_test_clobber_x21 +.type abi_test_clobber_x22, %function +.globl abi_test_clobber_x22 +.hidden abi_test_clobber_x22 +.align 4 +abi_test_clobber_x22: + mov x22, xzr + ret +.size abi_test_clobber_x22,.-abi_test_clobber_x22 +.type abi_test_clobber_x23, %function +.globl abi_test_clobber_x23 +.hidden abi_test_clobber_x23 +.align 4 +abi_test_clobber_x23: + mov x23, xzr + ret +.size abi_test_clobber_x23,.-abi_test_clobber_x23 +.type abi_test_clobber_x24, %function +.globl abi_test_clobber_x24 +.hidden abi_test_clobber_x24 +.align 4 +abi_test_clobber_x24: + mov x24, xzr + ret +.size abi_test_clobber_x24,.-abi_test_clobber_x24 +.type abi_test_clobber_x25, %function +.globl abi_test_clobber_x25 +.hidden abi_test_clobber_x25 +.align 4 +abi_test_clobber_x25: + mov x25, xzr + ret +.size abi_test_clobber_x25,.-abi_test_clobber_x25 +.type abi_test_clobber_x26, %function +.globl abi_test_clobber_x26 +.hidden abi_test_clobber_x26 +.align 4 +abi_test_clobber_x26: + mov x26, xzr + ret +.size abi_test_clobber_x26,.-abi_test_clobber_x26 +.type abi_test_clobber_x27, %function +.globl abi_test_clobber_x27 +.hidden abi_test_clobber_x27 +.align 4 
+abi_test_clobber_x27: + mov x27, xzr + ret +.size abi_test_clobber_x27,.-abi_test_clobber_x27 +.type abi_test_clobber_x28, %function +.globl abi_test_clobber_x28 +.hidden abi_test_clobber_x28 +.align 4 +abi_test_clobber_x28: + mov x28, xzr + ret +.size abi_test_clobber_x28,.-abi_test_clobber_x28 +.type abi_test_clobber_x29, %function +.globl abi_test_clobber_x29 +.hidden abi_test_clobber_x29 +.align 4 +abi_test_clobber_x29: + mov x29, xzr + ret +.size abi_test_clobber_x29,.-abi_test_clobber_x29 +.type abi_test_clobber_d0, %function +.globl abi_test_clobber_d0 +.hidden abi_test_clobber_d0 +.align 4 +abi_test_clobber_d0: + fmov d0, xzr + ret +.size abi_test_clobber_d0,.-abi_test_clobber_d0 +.type abi_test_clobber_d1, %function +.globl abi_test_clobber_d1 +.hidden abi_test_clobber_d1 +.align 4 +abi_test_clobber_d1: + fmov d1, xzr + ret +.size abi_test_clobber_d1,.-abi_test_clobber_d1 +.type abi_test_clobber_d2, %function +.globl abi_test_clobber_d2 +.hidden abi_test_clobber_d2 +.align 4 +abi_test_clobber_d2: + fmov d2, xzr + ret +.size abi_test_clobber_d2,.-abi_test_clobber_d2 +.type abi_test_clobber_d3, %function +.globl abi_test_clobber_d3 +.hidden abi_test_clobber_d3 +.align 4 +abi_test_clobber_d3: + fmov d3, xzr + ret +.size abi_test_clobber_d3,.-abi_test_clobber_d3 +.type abi_test_clobber_d4, %function +.globl abi_test_clobber_d4 +.hidden abi_test_clobber_d4 +.align 4 +abi_test_clobber_d4: + fmov d4, xzr + ret +.size abi_test_clobber_d4,.-abi_test_clobber_d4 +.type abi_test_clobber_d5, %function +.globl abi_test_clobber_d5 +.hidden abi_test_clobber_d5 +.align 4 +abi_test_clobber_d5: + fmov d5, xzr + ret +.size abi_test_clobber_d5,.-abi_test_clobber_d5 +.type abi_test_clobber_d6, %function +.globl abi_test_clobber_d6 +.hidden abi_test_clobber_d6 +.align 4 +abi_test_clobber_d6: + fmov d6, xzr + ret +.size abi_test_clobber_d6,.-abi_test_clobber_d6 +.type abi_test_clobber_d7, %function +.globl abi_test_clobber_d7 +.hidden abi_test_clobber_d7 +.align 4 +abi_test_clobber_d7: + fmov d7, xzr + ret +.size abi_test_clobber_d7,.-abi_test_clobber_d7 +.type abi_test_clobber_d8, %function +.globl abi_test_clobber_d8 +.hidden abi_test_clobber_d8 +.align 4 +abi_test_clobber_d8: + fmov d8, xzr + ret +.size abi_test_clobber_d8,.-abi_test_clobber_d8 +.type abi_test_clobber_d9, %function +.globl abi_test_clobber_d9 +.hidden abi_test_clobber_d9 +.align 4 +abi_test_clobber_d9: + fmov d9, xzr + ret +.size abi_test_clobber_d9,.-abi_test_clobber_d9 +.type abi_test_clobber_d10, %function +.globl abi_test_clobber_d10 +.hidden abi_test_clobber_d10 +.align 4 +abi_test_clobber_d10: + fmov d10, xzr + ret +.size abi_test_clobber_d10,.-abi_test_clobber_d10 +.type abi_test_clobber_d11, %function +.globl abi_test_clobber_d11 +.hidden abi_test_clobber_d11 +.align 4 +abi_test_clobber_d11: + fmov d11, xzr + ret +.size abi_test_clobber_d11,.-abi_test_clobber_d11 +.type abi_test_clobber_d12, %function +.globl abi_test_clobber_d12 +.hidden abi_test_clobber_d12 +.align 4 +abi_test_clobber_d12: + fmov d12, xzr + ret +.size abi_test_clobber_d12,.-abi_test_clobber_d12 +.type abi_test_clobber_d13, %function +.globl abi_test_clobber_d13 +.hidden abi_test_clobber_d13 +.align 4 +abi_test_clobber_d13: + fmov d13, xzr + ret +.size abi_test_clobber_d13,.-abi_test_clobber_d13 +.type abi_test_clobber_d14, %function +.globl abi_test_clobber_d14 +.hidden abi_test_clobber_d14 +.align 4 +abi_test_clobber_d14: + fmov d14, xzr + ret +.size abi_test_clobber_d14,.-abi_test_clobber_d14 +.type abi_test_clobber_d15, %function +.globl 
abi_test_clobber_d15 +.hidden abi_test_clobber_d15 +.align 4 +abi_test_clobber_d15: + fmov d15, xzr + ret +.size abi_test_clobber_d15,.-abi_test_clobber_d15 +.type abi_test_clobber_d16, %function +.globl abi_test_clobber_d16 +.hidden abi_test_clobber_d16 +.align 4 +abi_test_clobber_d16: + fmov d16, xzr + ret +.size abi_test_clobber_d16,.-abi_test_clobber_d16 +.type abi_test_clobber_d17, %function +.globl abi_test_clobber_d17 +.hidden abi_test_clobber_d17 +.align 4 +abi_test_clobber_d17: + fmov d17, xzr + ret +.size abi_test_clobber_d17,.-abi_test_clobber_d17 +.type abi_test_clobber_d18, %function +.globl abi_test_clobber_d18 +.hidden abi_test_clobber_d18 +.align 4 +abi_test_clobber_d18: + fmov d18, xzr + ret +.size abi_test_clobber_d18,.-abi_test_clobber_d18 +.type abi_test_clobber_d19, %function +.globl abi_test_clobber_d19 +.hidden abi_test_clobber_d19 +.align 4 +abi_test_clobber_d19: + fmov d19, xzr + ret +.size abi_test_clobber_d19,.-abi_test_clobber_d19 +.type abi_test_clobber_d20, %function +.globl abi_test_clobber_d20 +.hidden abi_test_clobber_d20 +.align 4 +abi_test_clobber_d20: + fmov d20, xzr + ret +.size abi_test_clobber_d20,.-abi_test_clobber_d20 +.type abi_test_clobber_d21, %function +.globl abi_test_clobber_d21 +.hidden abi_test_clobber_d21 +.align 4 +abi_test_clobber_d21: + fmov d21, xzr + ret +.size abi_test_clobber_d21,.-abi_test_clobber_d21 +.type abi_test_clobber_d22, %function +.globl abi_test_clobber_d22 +.hidden abi_test_clobber_d22 +.align 4 +abi_test_clobber_d22: + fmov d22, xzr + ret +.size abi_test_clobber_d22,.-abi_test_clobber_d22 +.type abi_test_clobber_d23, %function +.globl abi_test_clobber_d23 +.hidden abi_test_clobber_d23 +.align 4 +abi_test_clobber_d23: + fmov d23, xzr + ret +.size abi_test_clobber_d23,.-abi_test_clobber_d23 +.type abi_test_clobber_d24, %function +.globl abi_test_clobber_d24 +.hidden abi_test_clobber_d24 +.align 4 +abi_test_clobber_d24: + fmov d24, xzr + ret +.size abi_test_clobber_d24,.-abi_test_clobber_d24 +.type abi_test_clobber_d25, %function +.globl abi_test_clobber_d25 +.hidden abi_test_clobber_d25 +.align 4 +abi_test_clobber_d25: + fmov d25, xzr + ret +.size abi_test_clobber_d25,.-abi_test_clobber_d25 +.type abi_test_clobber_d26, %function +.globl abi_test_clobber_d26 +.hidden abi_test_clobber_d26 +.align 4 +abi_test_clobber_d26: + fmov d26, xzr + ret +.size abi_test_clobber_d26,.-abi_test_clobber_d26 +.type abi_test_clobber_d27, %function +.globl abi_test_clobber_d27 +.hidden abi_test_clobber_d27 +.align 4 +abi_test_clobber_d27: + fmov d27, xzr + ret +.size abi_test_clobber_d27,.-abi_test_clobber_d27 +.type abi_test_clobber_d28, %function +.globl abi_test_clobber_d28 +.hidden abi_test_clobber_d28 +.align 4 +abi_test_clobber_d28: + fmov d28, xzr + ret +.size abi_test_clobber_d28,.-abi_test_clobber_d28 +.type abi_test_clobber_d29, %function +.globl abi_test_clobber_d29 +.hidden abi_test_clobber_d29 +.align 4 +abi_test_clobber_d29: + fmov d29, xzr + ret +.size abi_test_clobber_d29,.-abi_test_clobber_d29 +.type abi_test_clobber_d30, %function +.globl abi_test_clobber_d30 +.hidden abi_test_clobber_d30 +.align 4 +abi_test_clobber_d30: + fmov d30, xzr + ret +.size abi_test_clobber_d30,.-abi_test_clobber_d30 +.type abi_test_clobber_d31, %function +.globl abi_test_clobber_d31 +.hidden abi_test_clobber_d31 +.align 4 +abi_test_clobber_d31: + fmov d31, xzr + ret +.size abi_test_clobber_d31,.-abi_test_clobber_d31 +.type abi_test_clobber_v8_upper, %function +.globl abi_test_clobber_v8_upper +.hidden abi_test_clobber_v8_upper +.align 4 
+abi_test_clobber_v8_upper: + fmov v8.d[1], xzr + ret +.size abi_test_clobber_v8_upper,.-abi_test_clobber_v8_upper +.type abi_test_clobber_v9_upper, %function +.globl abi_test_clobber_v9_upper +.hidden abi_test_clobber_v9_upper +.align 4 +abi_test_clobber_v9_upper: + fmov v9.d[1], xzr + ret +.size abi_test_clobber_v9_upper,.-abi_test_clobber_v9_upper +.type abi_test_clobber_v10_upper, %function +.globl abi_test_clobber_v10_upper +.hidden abi_test_clobber_v10_upper +.align 4 +abi_test_clobber_v10_upper: + fmov v10.d[1], xzr + ret +.size abi_test_clobber_v10_upper,.-abi_test_clobber_v10_upper +.type abi_test_clobber_v11_upper, %function +.globl abi_test_clobber_v11_upper +.hidden abi_test_clobber_v11_upper +.align 4 +abi_test_clobber_v11_upper: + fmov v11.d[1], xzr + ret +.size abi_test_clobber_v11_upper,.-abi_test_clobber_v11_upper +.type abi_test_clobber_v12_upper, %function +.globl abi_test_clobber_v12_upper +.hidden abi_test_clobber_v12_upper +.align 4 +abi_test_clobber_v12_upper: + fmov v12.d[1], xzr + ret +.size abi_test_clobber_v12_upper,.-abi_test_clobber_v12_upper +.type abi_test_clobber_v13_upper, %function +.globl abi_test_clobber_v13_upper +.hidden abi_test_clobber_v13_upper +.align 4 +abi_test_clobber_v13_upper: + fmov v13.d[1], xzr + ret +.size abi_test_clobber_v13_upper,.-abi_test_clobber_v13_upper +.type abi_test_clobber_v14_upper, %function +.globl abi_test_clobber_v14_upper +.hidden abi_test_clobber_v14_upper +.align 4 +abi_test_clobber_v14_upper: + fmov v14.d[1], xzr + ret +.size abi_test_clobber_v14_upper,.-abi_test_clobber_v14_upper +.type abi_test_clobber_v15_upper, %function +.globl abi_test_clobber_v15_upper +.hidden abi_test_clobber_v15_upper +.align 4 +abi_test_clobber_v15_upper: + fmov v15.d[1], xzr + ret +.size abi_test_clobber_v15_upper,.-abi_test_clobber_v15_upper +#endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/third_party/sike/asm/fp-armv8.S b/packager/third_party/boringssl/linux-aarch64/crypto/third_party/sike/asm/fp-armv8.S new file mode 100644 index 0000000000..63e8d1c381 --- /dev/null +++ b/packager/third_party/boringssl/linux-aarch64/crypto/third_party/sike/asm/fp-armv8.S @@ -0,0 +1,999 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(__aarch64__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.section .rodata + +# p434 x 2 +.Lp434x2: +.quad 0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF +.quad 0xFB82ECF5C5FFFFFF, 0xF78CB8F062B15D47 +.quad 0xD9F8BFAD038A40AC, 0x0004683E4E2EE688 + +# p434 + 1 +.Lp434p1: +.quad 0xFDC1767AE3000000, 0x7BC65C783158AEA3 +.quad 0x6CFC5FD681C52056, 0x0002341F27177344 + +.text +.globl sike_mpmul +.hidden sike_mpmul +.align 4 +sike_mpmul: + stp x29, x30, [sp,#-96]! 
+ add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + + ldp x3, x4, [x0] + ldp x5, x6, [x0,#16] + ldp x7, x8, [x0,#32] + ldr x9, [x0,#48] + ldp x10, x11, [x1,#0] + ldp x12, x13, [x1,#16] + ldp x14, x15, [x1,#32] + ldr x16, [x1,#48] + + // x3-x7 <- AH + AL, x7 <- carry + adds x3, x3, x7 + adcs x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, xzr + adc x7, xzr, xzr + + // x10-x13 <- BH + BL, x8 <- carry + adds x10, x10, x14 + adcs x11, x11, x15 + adcs x12, x12, x16 + adcs x13, x13, xzr + adc x8, xzr, xzr + + // x9 <- combined carry + and x9, x7, x8 + // x7-x8 <- mask + sub x7, xzr, x7 + sub x8, xzr, x8 + + // x15-x19 <- masked (BH + BL) + and x14, x10, x7 + and x15, x11, x7 + and x16, x12, x7 + and x17, x13, x7 + + // x20-x23 <- masked (AH + AL) + and x20, x3, x8 + and x21, x4, x8 + and x22, x5, x8 + and x23, x6, x8 + + // x15-x19, x7 <- masked (AH+AL) + masked (BH+BL), step 1 + adds x14, x14, x20 + adcs x15, x15, x21 + adcs x16, x16, x22 + adcs x17, x17, x23 + adc x7, x9, xzr + + // x8-x9,x19,x20-x24 <- (AH+AL) x (BH+BL), low part + stp x3, x4, [x2,#0] + // A0-A1 <- AH + AL, T0 <- mask + adds x3, x3, x5 + adcs x4, x4, x6 + adc x25, xzr, xzr + + // C6, T1 <- BH + BL, C7 <- mask + adds x23, x10, x12 + adcs x26, x11, x13 + adc x24, xzr, xzr + + // C0-C1 <- masked (BH + BL) + sub x19, xzr, x25 + sub x20, xzr, x24 + and x8, x23, x19 + and x9, x26, x19 + + // C4-C5 <- masked (AH + AL), T0 <- combined carry + and x21, x3, x20 + and x22, x4, x20 + mul x19, x3, x23 + mul x20, x3, x26 + and x25, x25, x24 + + // C0-C1, T0 <- (AH+AL) x (BH+BL), part 1 + adds x8, x21, x8 + umulh x21, x3, x26 + adcs x9, x22, x9 + umulh x22, x3, x23 + adc x25, x25, xzr + + // C2-C5 <- (AH+AL) x (BH+BL), low part + mul x3, x4, x23 + umulh x23, x4, x23 + adds x20, x20, x22 + adc x21, x21, xzr + + mul x24, x4, x26 + umulh x26, x4, x26 + adds x20, x20, x3 + adcs x21, x21, x23 + adc x22, xzr, xzr + + adds x21, x21, x24 + adc x22, x22, x26 + + ldp x3, x4, [x2,#0] + + // C2-C5, T0 <- (AH+AL) x (BH+BL), final part + adds x21, x8, x21 + umulh x24, x3, x10 + umulh x26, x3, x11 + adcs x22, x9, x22 + mul x8, x3, x10 + mul x9, x3, x11 + adc x25, x25, xzr + + // C0-C1, T1, C7 <- AL x BL + mul x3, x4, x10 + umulh x10, x4, x10 + adds x9, x9, x24 + adc x26, x26, xzr + + mul x23, x4, x11 + umulh x11, x4, x11 + adds x9, x9, x3 + adcs x26, x26, x10 + adc x24, xzr, xzr + + adds x26, x26, x23 + adc x24, x24, x11 + + + // C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL + mul x3, x5, x12 + umulh x10, x5, x12 + subs x19, x19, x8 + sbcs x20, x20, x9 + sbcs x21, x21, x26 + mul x4, x5, x13 + umulh x23, x5, x13 + sbcs x22, x22, x24 + sbc x25, x25, xzr + + // A0, A1, C6, B0 <- AH x BH + mul x5, x6, x12 + umulh x12, x6, x12 + adds x4, x4, x10 + adc x23, x23, xzr + + mul x11, x6, x13 + umulh x13, x6, x13 + adds x4, x4, x5 + adcs x23, x23, x12 + adc x10, xzr, xzr + + adds x23, x23, x11 + adc x10, x10, x13 + + + // C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH + subs x19, x19, x3 + sbcs x20, x20, x4 + sbcs x21, x21, x23 + sbcs x22, x22, x10 + sbc x25, x25, xzr + + adds x19, x19, x26 + adcs x20, x20, x24 + adcs x21, x21, x3 + adcs x22, x22, x4 + adcs x23, x25, x23 + adc x24, x10, xzr + + + // x15-x19, x7 <- (AH+AL) x (BH+BL), final step + adds x14, x14, x21 + adcs x15, x15, x22 + adcs x16, x16, x23 + adcs x17, x17, x24 + adc x7, x7, xzr + + // Load AL + ldp x3, x4, [x0] + ldp x5, x6, [x0,#16] + // Load BL + ldp x10, x11, [x1,#0] + ldp x12, x13, [x1,#16] + + // Temporarily store 
x8 in x2 + stp x8, x9, [x2,#0] + // x21-x28 <- AL x BL + // A0-A1 <- AH + AL, T0 <- mask + adds x3, x3, x5 + adcs x4, x4, x6 + adc x8, xzr, xzr + + // C6, T1 <- BH + BL, C7 <- mask + adds x27, x10, x12 + adcs x9, x11, x13 + adc x28, xzr, xzr + + // C0-C1 <- masked (BH + BL) + sub x23, xzr, x8 + sub x24, xzr, x28 + and x21, x27, x23 + and x22, x9, x23 + + // C4-C5 <- masked (AH + AL), T0 <- combined carry + and x25, x3, x24 + and x26, x4, x24 + mul x23, x3, x27 + mul x24, x3, x9 + and x8, x8, x28 + + // C0-C1, T0 <- (AH+AL) x (BH+BL), part 1 + adds x21, x25, x21 + umulh x25, x3, x9 + adcs x22, x26, x22 + umulh x26, x3, x27 + adc x8, x8, xzr + + // C2-C5 <- (AH+AL) x (BH+BL), low part + mul x3, x4, x27 + umulh x27, x4, x27 + adds x24, x24, x26 + adc x25, x25, xzr + + mul x28, x4, x9 + umulh x9, x4, x9 + adds x24, x24, x3 + adcs x25, x25, x27 + adc x26, xzr, xzr + + adds x25, x25, x28 + adc x26, x26, x9 + + ldp x3, x4, [x0,#0] + + // C2-C5, T0 <- (AH+AL) x (BH+BL), final part + adds x25, x21, x25 + umulh x28, x3, x10 + umulh x9, x3, x11 + adcs x26, x22, x26 + mul x21, x3, x10 + mul x22, x3, x11 + adc x8, x8, xzr + + // C0-C1, T1, C7 <- AL x BL + mul x3, x4, x10 + umulh x10, x4, x10 + adds x22, x22, x28 + adc x9, x9, xzr + + mul x27, x4, x11 + umulh x11, x4, x11 + adds x22, x22, x3 + adcs x9, x9, x10 + adc x28, xzr, xzr + + adds x9, x9, x27 + adc x28, x28, x11 + + + // C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL + mul x3, x5, x12 + umulh x10, x5, x12 + subs x23, x23, x21 + sbcs x24, x24, x22 + sbcs x25, x25, x9 + mul x4, x5, x13 + umulh x27, x5, x13 + sbcs x26, x26, x28 + sbc x8, x8, xzr + + // A0, A1, C6, B0 <- AH x BH + mul x5, x6, x12 + umulh x12, x6, x12 + adds x4, x4, x10 + adc x27, x27, xzr + + mul x11, x6, x13 + umulh x13, x6, x13 + adds x4, x4, x5 + adcs x27, x27, x12 + adc x10, xzr, xzr + + adds x27, x27, x11 + adc x10, x10, x13 + + + // C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH + subs x23, x23, x3 + sbcs x24, x24, x4 + sbcs x25, x25, x27 + sbcs x26, x26, x10 + sbc x8, x8, xzr + + adds x23, x23, x9 + adcs x24, x24, x28 + adcs x25, x25, x3 + adcs x26, x26, x4 + adcs x27, x8, x27 + adc x28, x10, xzr + + // Restore x8 + ldp x8, x9, [x2,#0] + + // x8-x10,x20,x15-x17,x19 <- maskd (AH+AL) x (BH+BL) - ALxBL + subs x8, x8, x21 + sbcs x9, x9, x22 + sbcs x19, x19, x23 + sbcs x20, x20, x24 + sbcs x14, x14, x25 + sbcs x15, x15, x26 + sbcs x16, x16, x27 + sbcs x17, x17, x28 + sbc x7, x7, xzr + + // Store ALxBL, low + stp x21, x22, [x2] + stp x23, x24, [x2,#16] + + // Load AH + ldp x3, x4, [x0,#32] + ldr x5, [x0,#48] + // Load BH + ldp x10, x11, [x1,#32] + ldr x12, [x1,#48] + + adds x8, x8, x25 + adcs x9, x9, x26 + adcs x19, x19, x27 + adcs x20, x20, x28 + adc x1, xzr, xzr + + add x0, x0, #32 + // Temporarily store x8,x9 in x2 + stp x8,x9, [x2,#32] + // x21-x28 <- AH x BH + + // A0 * B0 + mul x21, x3, x10 // C0 + umulh x24, x3, x10 + + // A0 * B1 + mul x22, x3, x11 + umulh x23, x3, x11 + + // A1 * B0 + mul x8, x4, x10 + umulh x9, x4, x10 + adds x22, x22, x24 + adc x23, x23, xzr + + // A0 * B2 + mul x27, x3, x12 + umulh x28, x3, x12 + adds x22, x22, x8 // C1 + adcs x23, x23, x9 + adc x24, xzr, xzr + + // A2 * B0 + mul x8, x5, x10 + umulh x25, x5, x10 + adds x23, x23, x27 + adcs x24, x24, x25 + adc x25, xzr, xzr + + // A1 * B1 + mul x27, x4, x11 + umulh x9, x4, x11 + adds x23, x23, x8 + adcs x24, x24, x28 + adc x25, x25, xzr + + // A1 * B2 + mul x8, x4, x12 + umulh x28, x4, x12 + adds x23, x23, x27 // C2 + adcs x24, x24, x9 + adc x25, x25, xzr + + // A2 * B1 + mul x27, x5, x11 + umulh x9, x5, x11 + 
adds x24, x24, x8 + adcs x25, x25, x28 + adc x26, xzr, xzr + + // A2 * B2 + mul x8, x5, x12 + umulh x28, x5, x12 + adds x24, x24, x27 // C3 + adcs x25, x25, x9 + adc x26, x26, xzr + + adds x25, x25, x8 // C4 + adc x26, x26, x28 // C5 + + // Restore x8,x9 + ldp x8,x9, [x2,#32] + + neg x1, x1 + + // x8-x9,x19,x20,x14-x17 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH + subs x8, x8, x21 + sbcs x9, x9, x22 + sbcs x19, x19, x23 + sbcs x20, x20, x24 + sbcs x14, x14, x25 + sbcs x15, x15, x26 + sbcs x16, x16, xzr + sbcs x17, x17, xzr + sbc x7, x7, xzr + + // Store (AH+AL) x (BH+BL) - ALxBL - AHxBH, low + stp x8, x9, [x2,#32] + stp x19, x20, [x2,#48] + + adds x1, x1, #1 + adcs x14, x14, x21 + adcs x15, x15, x22 + adcs x16, x16, x23 + adcs x17, x17, x24 + adcs x25, x7, x25 + adc x26, x26, xzr + + stp x14, x15, [x2,#64] + stp x16, x17, [x2,#80] + stp x25, x26, [x2,#96] + + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldp x29, x30, [sp],#96 + ret +.globl sike_fprdc +.hidden sike_fprdc +.align 4 +sike_fprdc: + stp x29, x30, [sp, #-96]! + add x29, sp, xzr + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + + ldp x2, x3, [x0,#0] // a[0-1] + + // Load the prime constant + adrp x26, .Lp434p1 + add x26, x26, :lo12:.Lp434p1 + ldp x23, x24, [x26, #0x0] + ldp x25, x26, [x26,#0x10] + + // a[0-1] * p434+1 + mul x4, x2, x23 // C0 + umulh x7, x2, x23 + + mul x5, x2, x24 + umulh x6, x2, x24 + + mul x10, x3, x23 + umulh x11, x3, x23 + adds x5, x5, x7 + adc x6, x6, xzr + + mul x27, x2, x25 + umulh x28, x2, x25 + adds x5, x5, x10 // C1 + adcs x6, x6, x11 + adc x7, xzr, xzr + + mul x10, x3, x24 + umulh x11, x3, x24 + adds x6, x6, x27 + adcs x7, x7, x28 + adc x8, xzr, xzr + + mul x27, x2, x26 + umulh x28, x2, x26 + adds x6, x6, x10 // C2 + adcs x7, x7, x11 + adc x8, x8, xzr + + mul x10, x3, x25 + umulh x11, x3, x25 + adds x7, x7, x27 + adcs x8, x8, x28 + adc x9, xzr, xzr + + mul x27, x3, x26 + umulh x28, x3, x26 + adds x7, x7, x10 // C3 + adcs x8, x8, x11 + adc x9, x9, xzr + adds x8, x8, x27 // C4 + adc x9, x9, x28 // C5 + + + + ldp x10, x11, [x0, #0x18] + ldp x12, x13, [x0, #0x28] + ldp x14, x15, [x0, #0x38] + ldp x16, x17, [x0, #0x48] + ldp x19, x20, [x0, #0x58] + ldr x21, [x0, #0x68] + + adds x10, x10, x4 + adcs x11, x11, x5 + adcs x12, x12, x6 + adcs x13, x13, x7 + adcs x14, x14, x8 + adcs x15, x15, x9 + adcs x22, x16, xzr + adcs x17, x17, xzr + adcs x19, x19, xzr + adcs x20, x20, xzr + adc x21, x21, xzr + + ldr x2, [x0,#0x10] // a[2] + // a[2-3] * p434+1 + mul x4, x2, x23 // C0 + umulh x7, x2, x23 + + mul x5, x2, x24 + umulh x6, x2, x24 + + mul x0, x10, x23 + umulh x3, x10, x23 + adds x5, x5, x7 + adc x6, x6, xzr + + mul x27, x2, x25 + umulh x28, x2, x25 + adds x5, x5, x0 // C1 + adcs x6, x6, x3 + adc x7, xzr, xzr + + mul x0, x10, x24 + umulh x3, x10, x24 + adds x6, x6, x27 + adcs x7, x7, x28 + adc x8, xzr, xzr + + mul x27, x2, x26 + umulh x28, x2, x26 + adds x6, x6, x0 // C2 + adcs x7, x7, x3 + adc x8, x8, xzr + + mul x0, x10, x25 + umulh x3, x10, x25 + adds x7, x7, x27 + adcs x8, x8, x28 + adc x9, xzr, xzr + + mul x27, x10, x26 + umulh x28, x10, x26 + adds x7, x7, x0 // C3 + adcs x8, x8, x3 + adc x9, x9, xzr + adds x8, x8, x27 // C4 + adc x9, x9, x28 // C5 + + + + adds x12, x12, x4 + adcs x13, x13, x5 + adcs x14, x14, x6 + adcs x15, x15, x7 + adcs x16, x22, x8 + adcs x17, x17, x9 + adcs x22, x19, xzr + adcs x20, x20, xzr + adc x21, x21, xzr + + mul x4, x11, x23 // 
C0 + umulh x7, x11, x23 + + mul x5, x11, x24 + umulh x6, x11, x24 + + mul x10, x12, x23 + umulh x3, x12, x23 + adds x5, x5, x7 + adc x6, x6, xzr + + mul x27, x11, x25 + umulh x28, x11, x25 + adds x5, x5, x10 // C1 + adcs x6, x6, x3 + adc x7, xzr, xzr + + mul x10, x12, x24 + umulh x3, x12, x24 + adds x6, x6, x27 + adcs x7, x7, x28 + adc x8, xzr, xzr + + mul x27, x11, x26 + umulh x28, x11, x26 + adds x6, x6, x10 // C2 + adcs x7, x7, x3 + adc x8, x8, xzr + + mul x10, x12, x25 + umulh x3, x12, x25 + adds x7, x7, x27 + adcs x8, x8, x28 + adc x9, xzr, xzr + + mul x27, x12, x26 + umulh x28, x12, x26 + adds x7, x7, x10 // C3 + adcs x8, x8, x3 + adc x9, x9, xzr + adds x8, x8, x27 // C4 + adc x9, x9, x28 // C5 + + + adds x14, x14, x4 + adcs x15, x15, x5 + adcs x16, x16, x6 + adcs x17, x17, x7 + adcs x19, x22, x8 + adcs x20, x20, x9 + adc x22, x21, xzr + + stp x14, x15, [x1, #0x0] // C0, C1 + + mul x4, x13, x23 // C0 + umulh x10, x13, x23 + + mul x5, x13, x24 + umulh x27, x13, x24 + adds x5, x5, x10 // C1 + adc x10, xzr, xzr + + mul x6, x13, x25 + umulh x28, x13, x25 + adds x27, x10, x27 + adcs x6, x6, x27 // C2 + adc x10, xzr, xzr + + mul x7, x13, x26 + umulh x8, x13, x26 + adds x28, x10, x28 + adcs x7, x7, x28 // C3 + adc x8, x8, xzr // C4 + + adds x16, x16, x4 + adcs x17, x17, x5 + adcs x19, x19, x6 + adcs x20, x20, x7 + adc x21, x22, x8 + + str x16, [x1, #0x10] + stp x17, x19, [x1, #0x18] + stp x20, x21, [x1, #0x28] + + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldp x29, x30, [sp],#96 + ret +.globl sike_fpadd +.hidden sike_fpadd +.align 4 +sike_fpadd: + stp x29,x30, [sp,#-16]! + add x29, sp, #0 + + ldp x3, x4, [x0,#0] + ldp x5, x6, [x0,#16] + ldp x7, x8, [x0,#32] + ldr x9, [x0,#48] + ldp x11, x12, [x1,#0] + ldp x13, x14, [x1,#16] + ldp x15, x16, [x1,#32] + ldr x17, [x1,#48] + + // Add a + b + adds x3, x3, x11 + adcs x4, x4, x12 + adcs x5, x5, x13 + adcs x6, x6, x14 + adcs x7, x7, x15 + adcs x8, x8, x16 + adc x9, x9, x17 + + // Subtract 2xp434 + adrp x17, .Lp434x2 + add x17, x17, :lo12:.Lp434x2 + ldp x11, x12, [x17, #0] + ldp x13, x14, [x17, #16] + ldp x15, x16, [x17, #32] + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x12 + sbcs x6, x6, x13 + sbcs x7, x7, x14 + sbcs x8, x8, x15 + sbcs x9, x9, x16 + sbc x0, xzr, xzr // x0 can be reused now + + // Add 2xp434 anded with the mask in x0 + and x11, x11, x0 + and x12, x12, x0 + and x13, x13, x0 + and x14, x14, x0 + and x15, x15, x0 + and x16, x16, x0 + + adds x3, x3, x11 + adcs x4, x4, x12 + adcs x5, x5, x12 + adcs x6, x6, x13 + adcs x7, x7, x14 + adcs x8, x8, x15 + adc x9, x9, x16 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + str x9, [x2,#48] + + ldp x29, x30, [sp],#16 + ret +.globl sike_fpsub +.hidden sike_fpsub +.align 4 +sike_fpsub: + stp x29, x30, [sp,#-16]! 
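sike_fpadd above and sike_fpsub below avoid any data-dependent branch: sike_fpadd subtracts 2*p434 after the raw addition, sike_fpsub computes a - b directly, and in both cases the final borrow is turned into an all-ones/all-zero mask with sbc x0, xzr, xzr so that 2*p434 ANDed with that mask can be added back unconditionally. A sketch of the same constant-time pattern over a generic limb array, following the sike_fpadd flow (NLIMBS and the helper name are placeholders, not the packager/BoringSSL API; the carry chains use the common unsigned __int128 compiler extension):

#include <stdint.h>

#define NLIMBS 7 /* p434 occupies seven 64-bit limbs */

/* r = a + b, kept in [0, 2*p) by a masked correction: subtract 2*p, then
 * add 2*p back only if the subtraction borrowed, without branching. */
static void fp_add_ct(uint64_t r[NLIMBS], const uint64_t a[NLIMBS],
                      const uint64_t b[NLIMBS], const uint64_t p2[NLIMBS]) {
  unsigned __int128 acc;
  uint64_t carry = 0, borrow = 0;
  for (int i = 0; i < NLIMBS; i++) {        /* adds/adcs chain */
    acc = (unsigned __int128)a[i] + b[i] + carry;
    r[i] = (uint64_t)acc;
    carry = (uint64_t)(acc >> 64);
  }
  for (int i = 0; i < NLIMBS; i++) {        /* subs/sbcs chain */
    acc = (unsigned __int128)r[i] - p2[i] - borrow;
    r[i] = (uint64_t)acc;
    borrow = (uint64_t)(acc >> 64) & 1;     /* 1 if this limb wrapped */
  }
  uint64_t mask = 0 - borrow;               /* sbc x0, xzr, xzr analogue */
  carry = 0;
  for (int i = 0; i < NLIMBS; i++) {        /* add back 2*p & mask */
    acc = (unsigned __int128)r[i] + (p2[i] & mask) + carry;
    r[i] = (uint64_t)acc;
    carry = (uint64_t)(acc >> 64);
  }
}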
+ add x29, sp, #0 + + ldp x3, x4, [x0,#0] + ldp x5, x6, [x0,#16] + ldp x7, x8, [x0,#32] + ldr x9, [x0,#48] + ldp x11, x12, [x1,#0] + ldp x13, x14, [x1,#16] + ldp x15, x16, [x1,#32] + ldr x17, [x1,#48] + + // Subtract a - b + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + sbcs x7, x7, x15 + sbcs x8, x8, x16 + sbcs x9, x9, x17 + sbc x0, xzr, xzr + + // Add 2xp434 anded with the mask in x0 + adrp x17, .Lp434x2 + add x17, x17, :lo12:.Lp434x2 + + // First half + ldp x11, x12, [x17, #0] + ldp x13, x14, [x17, #16] + ldp x15, x16, [x17, #32] + + // Add 2xp434 anded with the mask in x0 + and x11, x11, x0 + and x12, x12, x0 + and x13, x13, x0 + and x14, x14, x0 + and x15, x15, x0 + and x16, x16, x0 + + adds x3, x3, x11 + adcs x4, x4, x12 + adcs x5, x5, x12 + adcs x6, x6, x13 + adcs x7, x7, x14 + adcs x8, x8, x15 + adc x9, x9, x16 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + str x9, [x2,#48] + + ldp x29, x30, [sp],#16 + ret +.globl sike_mpadd_asm +.hidden sike_mpadd_asm +.align 4 +sike_mpadd_asm: + stp x29, x30, [sp,#-16]! + add x29, sp, #0 + + ldp x3, x4, [x0,#0] + ldp x5, x6, [x0,#16] + ldp x7, x8, [x0,#32] + ldr x9, [x0,#48] + ldp x11, x12, [x1,#0] + ldp x13, x14, [x1,#16] + ldp x15, x16, [x1,#32] + ldr x17, [x1,#48] + + adds x3, x3, x11 + adcs x4, x4, x12 + adcs x5, x5, x13 + adcs x6, x6, x14 + adcs x7, x7, x15 + adcs x8, x8, x16 + adc x9, x9, x17 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + str x9, [x2,#48] + + ldp x29, x30, [sp],#16 + ret +.globl sike_mpsubx2_asm +.hidden sike_mpsubx2_asm +.align 4 +sike_mpsubx2_asm: + stp x29, x30, [sp,#-16]! + add x29, sp, #0 + + ldp x3, x4, [x0,#0] + ldp x5, x6, [x0,#16] + ldp x11, x12, [x1,#0] + ldp x13, x14, [x1,#16] + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + ldp x7, x8, [x0,#32] + ldp x9, x10, [x0,#48] + ldp x11, x12, [x1,#32] + ldp x13, x14, [x1,#48] + sbcs x7, x7, x11 + sbcs x8, x8, x12 + sbcs x9, x9, x13 + sbcs x10, x10, x14 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + stp x9, x10, [x2,#48] + + ldp x3, x4, [x0,#64] + ldp x5, x6, [x0,#80] + ldp x11, x12, [x1,#64] + ldp x13, x14, [x1,#80] + sbcs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + ldp x7, x8, [x0,#96] + ldp x11, x12, [x1,#96] + sbcs x7, x7, x11 + sbcs x8, x8, x12 + sbc x0, xzr, xzr + + stp x3, x4, [x2,#64] + stp x5, x6, [x2,#80] + stp x7, x8, [x2,#96] + + ldp x29, x30, [sp],#16 + ret +.globl sike_mpdblsubx2_asm +.hidden sike_mpdblsubx2_asm +.align 4 +sike_mpdblsubx2_asm: + stp x29, x30, [sp, #-16]! 
+ add x29, sp, #0 + + ldp x3, x4, [x2, #0] + ldp x5, x6, [x2,#16] + ldp x7, x8, [x2,#32] + + ldp x11, x12, [x0, #0] + ldp x13, x14, [x0,#16] + ldp x15, x16, [x0,#32] + + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + sbcs x7, x7, x15 + sbcs x8, x8, x16 + + // x9 stores carry + adc x9, xzr, xzr + + ldp x11, x12, [x1, #0] + ldp x13, x14, [x1,#16] + ldp x15, x16, [x1,#32] + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + sbcs x7, x7, x15 + sbcs x8, x8, x16 + adc x9, x9, xzr + + stp x3, x4, [x2, #0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + + ldp x3, x4, [x2,#48] + ldp x5, x6, [x2,#64] + ldp x7, x8, [x2,#80] + + ldp x11, x12, [x0,#48] + ldp x13, x14, [x0,#64] + ldp x15, x16, [x0,#80] + + // x9 = 2 - x9 + neg x9, x9 + add x9, x9, #2 + + subs x3, x3, x9 + sbcs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + sbcs x7, x7, x15 + sbcs x8, x8, x16 + adc x9, xzr, xzr + + ldp x11, x12, [x1,#48] + ldp x13, x14, [x1,#64] + ldp x15, x16, [x1,#80] + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + sbcs x7, x7, x15 + sbcs x8, x8, x16 + adc x9, x9, xzr + + stp x3, x4, [x2,#48] + stp x5, x6, [x2,#64] + stp x7, x8, [x2,#80] + + ldp x3, x4, [x2,#96] + ldp x11, x12, [x0,#96] + ldp x13, x14, [x1,#96] + + // x9 = 2 - x9 + neg x9, x9 + add x9, x9, #2 + + subs x3, x3, x9 + sbcs x3, x3, x11 + sbcs x4, x4, x12 + subs x3, x3, x13 + sbc x4, x4, x14 + stp x3, x4, [x2,#96] + + ldp x29, x30, [sp],#16 + ret +#endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-arm/crypto/chacha/chacha-armv4.S b/packager/third_party/boringssl/linux-arm/crypto/chacha/chacha-armv4.S index 6c947734fe..363aeee5f5 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/chacha/chacha-armv4.S +++ b/packager/third_party/boringssl/linux-arm/crypto/chacha/chacha-armv4.S @@ -1,6 +1,24 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__arm__) +#if defined(BORINGSSL_PREFIX) +#include +#endif #include +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. +.arch armv7-a + .text #if defined(__thumb2__) || defined(__clang__) .syntax unified @@ -1471,3 +1489,5 @@ ChaCha20_neon: .comm OPENSSL_armcap_P,4,4 #endif #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/aes-armv4.S b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/aes-armv4.S index d401fc78f1..cfe2a36649 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/aes-armv4.S +++ b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/aes-armv4.S @@ -1,4 +1,18 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__arm__) +#if defined(BORINGSSL_PREFIX) +#include +#endif @ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. 
@ @ Licensed under the OpenSSL license (the "License"). You may not use @@ -45,6 +59,11 @@ # define __ARM_ARCH__ __LINUX_ARM_ARCH__ #endif +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 AES +@ instructions are in aesv8-armx.pl.) +.arch armv7-a + .text #if defined(__thumb2__) && !defined(__APPLE__) .syntax unified @@ -160,23 +179,23 @@ AES_Te: .word 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0 .size AES_Te,.-AES_Te -@ void asm_AES_encrypt(const unsigned char *in, unsigned char *out, -@ const AES_KEY *key) { -.globl asm_AES_encrypt -.hidden asm_AES_encrypt -.type asm_AES_encrypt,%function +@ void aes_nohw_encrypt(const unsigned char *in, unsigned char *out, +@ const AES_KEY *key) { +.globl aes_nohw_encrypt +.hidden aes_nohw_encrypt +.type aes_nohw_encrypt,%function .align 5 -asm_AES_encrypt: +aes_nohw_encrypt: #ifndef __thumb2__ - sub r3,pc,#8 @ asm_AES_encrypt + sub r3,pc,#8 @ aes_nohw_encrypt #else adr r3,. #endif stmdb sp!,{r1,r4-r12,lr} -#ifdef __APPLE__ +#if defined(__thumb2__) || defined(__APPLE__) adr r10,AES_Te #else - sub r10,r3,#asm_AES_encrypt-AES_Te @ Te + sub r10,r3,#aes_nohw_encrypt-AES_Te @ Te #endif mov r12,r0 @ inp mov r11,r2 @@ -273,7 +292,7 @@ asm_AES_encrypt: moveq pc,lr @ be binary compatible with V4, yet .word 0xe12fff1e @ interoperable with Thumb ISA:-) #endif -.size asm_AES_encrypt,.-asm_AES_encrypt +.size aes_nohw_encrypt,.-aes_nohw_encrypt .type _armv4_AES_encrypt,%function .align 2 @@ -412,14 +431,14 @@ _armv4_AES_encrypt: ldr pc,[sp],#4 @ pop and return .size _armv4_AES_encrypt,.-_armv4_AES_encrypt -.globl asm_AES_set_encrypt_key -.hidden asm_AES_set_encrypt_key -.type asm_AES_set_encrypt_key,%function +.globl aes_nohw_set_encrypt_key +.hidden aes_nohw_set_encrypt_key +.type aes_nohw_set_encrypt_key,%function .align 5 -asm_AES_set_encrypt_key: +aes_nohw_set_encrypt_key: _armv4_AES_set_encrypt_key: #ifndef __thumb2__ - sub r3,pc,#8 @ asm_AES_set_encrypt_key + sub r3,pc,#8 @ aes_nohw_set_encrypt_key #else adr r3,. #endif @@ -452,7 +471,7 @@ _armv4_AES_set_encrypt_key: mov lr,r1 @ bits mov r11,r2 @ key -#ifdef __APPLE__ +#if defined(__thumb2__) || defined(__APPLE__) adr r10,AES_Te+1024 @ Te4 #else sub r10,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4 @@ -717,23 +736,23 @@ _armv4_AES_set_encrypt_key: moveq pc,lr @ be binary compatible with V4, yet .word 0xe12fff1e @ interoperable with Thumb ISA:-) #endif -.size asm_AES_set_encrypt_key,.-asm_AES_set_encrypt_key +.size aes_nohw_set_encrypt_key,.-aes_nohw_set_encrypt_key -.globl asm_AES_set_decrypt_key -.hidden asm_AES_set_decrypt_key -.type asm_AES_set_decrypt_key,%function +.globl aes_nohw_set_decrypt_key +.hidden aes_nohw_set_decrypt_key +.type aes_nohw_set_decrypt_key,%function .align 5 -asm_AES_set_decrypt_key: +aes_nohw_set_decrypt_key: str lr,[sp,#-4]! 
@ push lr bl _armv4_AES_set_encrypt_key teq r0,#0 ldr lr,[sp],#4 @ pop lr bne .Labrt - mov r0,r2 @ asm_AES_set_encrypt_key preserves r2, + mov r0,r2 @ aes_nohw_set_encrypt_key preserves r2, mov r1,r2 @ which is AES_KEY *key b _armv4_AES_set_enc2dec_key -.size asm_AES_set_decrypt_key,.-asm_AES_set_decrypt_key +.size aes_nohw_set_decrypt_key,.-aes_nohw_set_decrypt_key @ void AES_set_enc2dec_key(const AES_KEY *inp,AES_KEY *out) .globl AES_set_enc2dec_key @@ -935,23 +954,23 @@ AES_Td: .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d .size AES_Td,.-AES_Td -@ void asm_AES_decrypt(const unsigned char *in, unsigned char *out, -@ const AES_KEY *key) { -.globl asm_AES_decrypt -.hidden asm_AES_decrypt -.type asm_AES_decrypt,%function +@ void aes_nohw_decrypt(const unsigned char *in, unsigned char *out, +@ const AES_KEY *key) { +.globl aes_nohw_decrypt +.hidden aes_nohw_decrypt +.type aes_nohw_decrypt,%function .align 5 -asm_AES_decrypt: +aes_nohw_decrypt: #ifndef __thumb2__ - sub r3,pc,#8 @ asm_AES_decrypt + sub r3,pc,#8 @ aes_nohw_decrypt #else adr r3,. #endif stmdb sp!,{r1,r4-r12,lr} -#ifdef __APPLE__ +#if defined(__thumb2__) || defined(__APPLE__) adr r10,AES_Td #else - sub r10,r3,#asm_AES_decrypt-AES_Td @ Td + sub r10,r3,#aes_nohw_decrypt-AES_Td @ Td #endif mov r12,r0 @ inp mov r11,r2 @@ -1048,7 +1067,7 @@ asm_AES_decrypt: moveq pc,lr @ be binary compatible with V4, yet .word 0xe12fff1e @ interoperable with Thumb ISA:-) #endif -.size asm_AES_decrypt,.-asm_AES_decrypt +.size aes_nohw_decrypt,.-aes_nohw_decrypt .type _armv4_AES_decrypt,%function .align 2 @@ -1199,3 +1218,5 @@ _armv4_AES_decrypt: .align 2 .align 2 #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/aesv8-armx32.S b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/aesv8-armx32.S index 7c7ef19c79..5d6e22d029 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/aesv8-armx32.S +++ b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/aesv8-armx32.S @@ -1,4 +1,18 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__arm__) +#if defined(BORINGSSL_PREFIX) +#include +#endif #include #if __ARM_MAX_ARCH__>=7 @@ -13,6 +27,8 @@ .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d @ rotate-n-splat .long 0x1b,0x1b,0x1b,0x1b +.text + .globl aes_hw_set_encrypt_key .hidden aes_hw_set_encrypt_key .type aes_hw_set_encrypt_key,%function @@ -761,3 +777,5 @@ aes_hw_ctr32_encrypt_blocks: .size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks #endif #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/armv4-mont.S b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/armv4-mont.S index e77a9ea613..029689475b 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/armv4-mont.S +++ b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/armv4-mont.S @@ -1,6 +1,24 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__arm__) +#if defined(BORINGSSL_PREFIX) +#include +#endif #include +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. +.arch armv7-a + .text #if defined(__thumb2__) .syntax unified @@ -167,14 +185,15 @@ bn_mul_mont: mov r4,sp @ "rewind" r4 sub r2,r2,r5 @ "rewind" r2 - and r1,r4,r14 - bic r3,r2,r14 - orr r1,r1,r3 @ ap=borrow?tp:rp - -.Lcopy: ldr r7,[r1],#4 @ copy or in-place refresh +.Lcopy: ldr r7,[r4] @ conditional copy + ldr r5,[r2] str sp,[r4],#4 @ zap tp - str r7,[r2],#4 - cmp r4,r0 +#ifdef __thumb2__ + it cc +#endif + movcc r5,r7 + str r5,[r2],#4 + teq r4,r0 @ preserve carry bne .Lcopy mov sp,r0 @@ -954,3 +973,5 @@ bn_mul8x_mont_neon: .hidden OPENSSL_armcap_P #endif #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/bsaes-armv7.S b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/bsaes-armv7.S index f9c6de73ff..69a8fcacd0 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/bsaes-armv7.S +++ b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/bsaes-armv7.S @@ -1,4 +1,18 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__arm__) +#if defined(BORINGSSL_PREFIX) +#include +#endif @ Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. @ @ Licensed under the OpenSSL license (the "License"). You may not use @@ -14,8 +28,7 @@ @ details see http://www.openssl.org/~appro/cryptogams/. @ @ Specific modes and adaptation for Linux kernel by Ard Biesheuvel -@ . Permission to use under GPL terms is -@ granted. +@ of Linaro. Permission to use under GPL terms is granted. @ ==================================================================== @ Bit-sliced AES for ARM NEON @@ -49,10 +62,7 @@ @ @ April-August 2013 -@ -@ Add CBC, CTR and XTS subroutines, adapt for kernel use. -@ -@ +@ Add CBC, CTR and XTS subroutines and adapt for kernel use; courtesy of Ard. #ifndef __KERNEL__ # include @@ -92,7 +102,7 @@ _bsaes_decrypt8: adr r6,. vldmia r4!, {q9} @ round 0 key -#ifdef __APPLE__ +#if defined(__thumb2__) || defined(__APPLE__) adr r6,.LM0ISR #else add r6,r6,#.LM0ISR-_bsaes_decrypt8 @@ -583,7 +593,7 @@ _bsaes_const: _bsaes_encrypt8: adr r6,. vldmia r4!, {q9} @ round 0 key -#ifdef __APPLE__ +#if defined(__thumb2__) || defined(__APPLE__) adr r6,.LM0SR #else sub r6,r6,#_bsaes_encrypt8-.LM0SR @@ -1018,7 +1028,7 @@ _bsaes_encrypt8_bitslice: _bsaes_key_convert: adr r6,. vld1.8 {q7}, [r4]! 
@ load round 0 key -#ifdef __APPLE__ +#if defined(__thumb2__) || defined(__APPLE__) adr r6,.LM0 #else sub r6,r6,#_bsaes_key_convert-.LM0 @@ -1072,24 +1082,13 @@ _bsaes_key_convert: @ don't save last round key bx lr .size _bsaes_key_convert,.-_bsaes_key_convert - - - .globl bsaes_cbc_encrypt .hidden bsaes_cbc_encrypt .type bsaes_cbc_encrypt,%function .align 5 bsaes_cbc_encrypt: -#ifndef __KERNEL__ - cmp r2, #128 -#ifndef __thumb__ - blo AES_cbc_encrypt -#else - bhs 1f - b AES_cbc_encrypt -1: -#endif -#endif + @ In OpenSSL, this function had a fallback to aes_nohw_cbc_encrypt for + @ short inputs. We patch this out, using bsaes for all input sizes. @ it is up to the caller to make sure we are called with enc == 0 @@ -1187,10 +1186,7 @@ bsaes_cbc_encrypt: adds r2, r2, #8 beq .Lcbc_dec_done - vld1.8 {q0}, [r0]! @ load input - cmp r2, #2 - blo .Lcbc_dec_one - vld1.8 {q1}, [r0]! + @ Set up most parameters for the _bsaes_decrypt8 call. #ifndef BSAES_ASM_EXTENDED_KEY mov r4, sp @ pass the key #else @@ -1198,6 +1194,11 @@ bsaes_cbc_encrypt: #endif mov r5, r10 vstmia r9, {q15} @ put aside IV + + vld1.8 {q0}, [r0]! @ load input + cmp r2, #2 + blo .Lcbc_dec_one + vld1.8 {q1}, [r0]! beq .Lcbc_dec_two vld1.8 {q2}, [r0]! cmp r2, #4 @@ -1315,16 +1316,11 @@ bsaes_cbc_encrypt: .align 4 .Lcbc_dec_one: sub r0, r0, #0x10 - mov r10, r1 @ save original out pointer - mov r1, r9 @ use the iv scratch space as out buffer - mov r2, r3 - vmov q4,q15 @ just in case ensure that IV - vmov q5,q0 @ and input are preserved - bl AES_decrypt - vld1.8 {q0}, [r9] @ load result - veor q0, q0, q4 @ ^= IV - vmov q15, q5 @ q5 holds input - vst1.8 {q0}, [r10] @ write output + bl _bsaes_decrypt8 + vldmia r9, {q14} @ reload IV + vld1.8 {q15}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vst1.8 {q0}, [r1]! @ write output .Lcbc_dec_done: #ifndef BSAES_ASM_EXTENDED_KEY @@ -1342,15 +1338,13 @@ bsaes_cbc_encrypt: VFP_ABI_POP ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt - .globl bsaes_ctr32_encrypt_blocks .hidden bsaes_ctr32_encrypt_blocks .type bsaes_ctr32_encrypt_blocks,%function .align 5 bsaes_ctr32_encrypt_blocks: - cmp r2, #8 @ use plain AES for - blo .Lctr_enc_short @ small sizes - + @ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this + @ out to retain a constant-time implementation. mov ip, sp stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} VFP_ABI_PUSH @@ -1526,1042 +1520,10 @@ bsaes_ctr32_encrypt_blocks: VFP_ABI_POP ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return -.align 4 -.Lctr_enc_short: - ldr ip, [sp] @ ctr pointer is passed on stack - stmdb sp!, {r4,r5,r6,r7,r8, lr} - - mov r4, r0 @ copy arguments - mov r5, r1 - mov r6, r2 - mov r7, r3 - ldr r8, [ip, #12] @ load counter .LSW - vld1.8 {q1}, [ip] @ load whole counter value -#ifdef __ARMEL__ - rev r8, r8 -#endif - sub sp, sp, #0x10 - vst1.8 {q1}, [sp] @ copy counter value - sub sp, sp, #0x10 - -.Lctr_enc_short_loop: - add r0, sp, #0x10 @ input counter value - mov r1, sp @ output on the stack - mov r2, r7 @ key - - bl AES_encrypt - - vld1.8 {q0}, [r4]! @ load input - vld1.8 {q1}, [sp] @ load encrypted counter - add r8, r8, #1 -#ifdef __ARMEL__ - rev r0, r8 - str r0, [sp, #0x1c] @ next counter value -#else - str r8, [sp, #0x1c] @ next counter value -#endif - veor q0,q0,q1 - vst1.8 {q0}, [r5]! @ store output - subs r6, r6, #1 - bne .Lctr_enc_short_loop - - vmov.i32 q0, #0 - vmov.i32 q1, #0 - vstmia sp!, {q0,q1} - - ldmia sp!, {r4,r5,r6,r7,r8, pc} + @ OpenSSL contains aes_nohw_* fallback code here. 
We patch this + @ out to retain a constant-time implementation. .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks -.globl bsaes_xts_encrypt -.hidden bsaes_xts_encrypt -.type bsaes_xts_encrypt,%function -.align 4 -bsaes_xts_encrypt: - mov ip, sp - stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} @ 0x20 - VFP_ABI_PUSH - mov r6, sp @ future r3 - - mov r7, r0 - mov r8, r1 - mov r9, r2 - mov r10, r3 - - sub r0, sp, #0x10 @ 0x10 - bic r0, #0xf @ align at 16 bytes - mov sp, r0 - -#ifdef XTS_CHAIN_TWEAK - ldr r0, [ip] @ pointer to input tweak -#else - @ generate initial tweak - ldr r0, [ip, #4] @ iv[] - mov r1, sp - ldr r2, [ip, #0] @ key2 - bl AES_encrypt - mov r0,sp @ pointer to initial tweak -#endif - - ldr r1, [r10, #240] @ get # of rounds - mov r3, r6 -#ifndef BSAES_ASM_EXTENDED_KEY - @ allocate the key schedule on the stack - sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key - @ add r12, #96 @ size of bit-sliced key schedule - sub r12, #48 @ place for tweak[9] - - @ populate the key schedule - mov r4, r10 @ pass key - mov r5, r1 @ pass # of rounds - mov sp, r12 - add r12, #0x90 @ pass key schedule - bl _bsaes_key_convert - veor q7, q7, q15 @ fix up last round key - vstmia r12, {q7} @ save last round key -#else - ldr r12, [r10, #244] - eors r12, #1 - beq 0f - - str r12, [r10, #244] - mov r4, r10 @ pass key - mov r5, r1 @ pass # of rounds - add r12, r10, #248 @ pass key schedule - bl _bsaes_key_convert - veor q7, q7, q15 @ fix up last round key - vstmia r12, {q7} - -.align 2 - sub sp, #0x90 @ place for tweak[9] -#endif - - vld1.8 {q8}, [r0] @ initial tweak - adr r2, .Lxts_magic - - subs r9, #0x80 - blo .Lxts_enc_short - b .Lxts_enc_loop - -.align 4 -.Lxts_enc_loop: - vldmia r2, {q5} @ load XTS magic - vshr.s64 q6, q8, #63 - mov r0, sp - vand q6, q6, q5 - vadd.u64 q9, q8, q8 - vst1.64 {q8}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q9, #63 - veor q9, q9, q6 - vand q7, q7, q5 - vadd.u64 q10, q9, q9 - vst1.64 {q9}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q10, #63 - veor q10, q10, q7 - vand q6, q6, q5 - vld1.8 {q0}, [r7]! - vadd.u64 q11, q10, q10 - vst1.64 {q10}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q11, #63 - veor q11, q11, q6 - vand q7, q7, q5 - vld1.8 {q1}, [r7]! - veor q0, q0, q8 - vadd.u64 q12, q11, q11 - vst1.64 {q11}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q12, #63 - veor q12, q12, q7 - vand q6, q6, q5 - vld1.8 {q2}, [r7]! - veor q1, q1, q9 - vadd.u64 q13, q12, q12 - vst1.64 {q12}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q13, #63 - veor q13, q13, q6 - vand q7, q7, q5 - vld1.8 {q3}, [r7]! - veor q2, q2, q10 - vadd.u64 q14, q13, q13 - vst1.64 {q13}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q14, #63 - veor q14, q14, q7 - vand q6, q6, q5 - vld1.8 {q4}, [r7]! - veor q3, q3, q11 - vadd.u64 q15, q14, q14 - vst1.64 {q14}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q15, #63 - veor q15, q15, q6 - vand q7, q7, q5 - vld1.8 {q5}, [r7]! - veor q4, q4, q12 - vadd.u64 q8, q15, q15 - vst1.64 {q15}, [r0,:128]! - vswp d15,d14 - veor q8, q8, q7 - vst1.64 {q8}, [r0,:128] @ next round tweak - - vld1.8 {q6,q7}, [r7]! - veor q5, q5, q13 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q6, q6, q14 - mov r5, r1 @ pass rounds - veor q7, q7, q15 - mov r0, sp - - bl _bsaes_encrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - vld1.64 {q12,q13}, [r0,:128]! - veor q1, q1, q9 - veor q8, q4, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q6, q11 - vld1.64 {q14,q15}, [r0,:128]! 
- veor q10, q3, q12 - vst1.8 {q8,q9}, [r8]! - veor q11, q7, q13 - veor q12, q2, q14 - vst1.8 {q10,q11}, [r8]! - veor q13, q5, q15 - vst1.8 {q12,q13}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - - subs r9, #0x80 - bpl .Lxts_enc_loop - -.Lxts_enc_short: - adds r9, #0x70 - bmi .Lxts_enc_done - - vldmia r2, {q5} @ load XTS magic - vshr.s64 q7, q8, #63 - mov r0, sp - vand q7, q7, q5 - vadd.u64 q9, q8, q8 - vst1.64 {q8}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q9, #63 - veor q9, q9, q7 - vand q6, q6, q5 - vadd.u64 q10, q9, q9 - vst1.64 {q9}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q10, #63 - veor q10, q10, q6 - vand q7, q7, q5 - vld1.8 {q0}, [r7]! - subs r9, #0x10 - bmi .Lxts_enc_1 - vadd.u64 q11, q10, q10 - vst1.64 {q10}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q11, #63 - veor q11, q11, q7 - vand q6, q6, q5 - vld1.8 {q1}, [r7]! - subs r9, #0x10 - bmi .Lxts_enc_2 - veor q0, q0, q8 - vadd.u64 q12, q11, q11 - vst1.64 {q11}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q12, #63 - veor q12, q12, q6 - vand q7, q7, q5 - vld1.8 {q2}, [r7]! - subs r9, #0x10 - bmi .Lxts_enc_3 - veor q1, q1, q9 - vadd.u64 q13, q12, q12 - vst1.64 {q12}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q13, #63 - veor q13, q13, q7 - vand q6, q6, q5 - vld1.8 {q3}, [r7]! - subs r9, #0x10 - bmi .Lxts_enc_4 - veor q2, q2, q10 - vadd.u64 q14, q13, q13 - vst1.64 {q13}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q14, #63 - veor q14, q14, q6 - vand q7, q7, q5 - vld1.8 {q4}, [r7]! - subs r9, #0x10 - bmi .Lxts_enc_5 - veor q3, q3, q11 - vadd.u64 q15, q14, q14 - vst1.64 {q14}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q15, #63 - veor q15, q15, q7 - vand q6, q6, q5 - vld1.8 {q5}, [r7]! - subs r9, #0x10 - bmi .Lxts_enc_6 - veor q4, q4, q12 - sub r9, #0x10 - vst1.64 {q15}, [r0,:128] @ next round tweak - - vld1.8 {q6}, [r7]! - veor q5, q5, q13 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q6, q6, q14 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_encrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - vld1.64 {q12,q13}, [r0,:128]! - veor q1, q1, q9 - veor q8, q4, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q6, q11 - vld1.64 {q14}, [r0,:128]! - veor q10, q3, q12 - vst1.8 {q8,q9}, [r8]! - veor q11, q7, q13 - veor q12, q2, q14 - vst1.8 {q10,q11}, [r8]! - vst1.8 {q12}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_enc_done -.align 4 -.Lxts_enc_6: - veor q4, q4, q12 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q5, q5, q13 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_encrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - vld1.64 {q12,q13}, [r0,:128]! - veor q1, q1, q9 - veor q8, q4, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q6, q11 - veor q10, q3, q12 - vst1.8 {q8,q9}, [r8]! - veor q11, q7, q13 - vst1.8 {q10,q11}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_enc_done - -@ put this in range for both ARM and Thumb mode adr instructions -.align 5 -.Lxts_magic: -.quad 1, 0x87 - -.align 5 -.Lxts_enc_5: - veor q3, q3, q11 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q4, q4, q12 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_encrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - vld1.64 {q12}, [r0,:128]! 
- veor q1, q1, q9 - veor q8, q4, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q6, q11 - veor q10, q3, q12 - vst1.8 {q8,q9}, [r8]! - vst1.8 {q10}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_enc_done -.align 4 -.Lxts_enc_4: - veor q2, q2, q10 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q3, q3, q11 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_encrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - veor q1, q1, q9 - veor q8, q4, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q6, q11 - vst1.8 {q8,q9}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_enc_done -.align 4 -.Lxts_enc_3: - veor q1, q1, q9 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q2, q2, q10 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_encrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10}, [r0,:128]! - veor q0, q0, q8 - veor q1, q1, q9 - veor q8, q4, q10 - vst1.8 {q0,q1}, [r8]! - vst1.8 {q8}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_enc_done -.align 4 -.Lxts_enc_2: - veor q0, q0, q8 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q1, q1, q9 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_encrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - veor q0, q0, q8 - veor q1, q1, q9 - vst1.8 {q0,q1}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_enc_done -.align 4 -.Lxts_enc_1: - mov r0, sp - veor q0, q0, q8 - mov r1, sp - vst1.8 {q0}, [sp,:128] - mov r2, r10 - mov r4, r3 @ preserve fp - - bl AES_encrypt - - vld1.8 {q0}, [sp,:128] - veor q0, q0, q8 - vst1.8 {q0}, [r8]! 
- mov r3, r4 - - vmov q8, q9 @ next round tweak - -.Lxts_enc_done: -#ifndef XTS_CHAIN_TWEAK - adds r9, #0x10 - beq .Lxts_enc_ret - sub r6, r8, #0x10 - -.Lxts_enc_steal: - ldrb r0, [r7], #1 - ldrb r1, [r8, #-0x10] - strb r0, [r8, #-0x10] - strb r1, [r8], #1 - - subs r9, #1 - bhi .Lxts_enc_steal - - vld1.8 {q0}, [r6] - mov r0, sp - veor q0, q0, q8 - mov r1, sp - vst1.8 {q0}, [sp,:128] - mov r2, r10 - mov r4, r3 @ preserve fp - - bl AES_encrypt - - vld1.8 {q0}, [sp,:128] - veor q0, q0, q8 - vst1.8 {q0}, [r6] - mov r3, r4 -#endif - -.Lxts_enc_ret: - bic r0, r3, #0xf - vmov.i32 q0, #0 - vmov.i32 q1, #0 -#ifdef XTS_CHAIN_TWEAK - ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak -#endif -.Lxts_enc_bzero:@ wipe key schedule [if any] - vstmia sp!, {q0,q1} - cmp sp, r0 - bne .Lxts_enc_bzero - - mov sp, r3 -#ifdef XTS_CHAIN_TWEAK - vst1.8 {q8}, [r1] -#endif - VFP_ABI_POP - ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return - -.size bsaes_xts_encrypt,.-bsaes_xts_encrypt - -.globl bsaes_xts_decrypt -.hidden bsaes_xts_decrypt -.type bsaes_xts_decrypt,%function -.align 4 -bsaes_xts_decrypt: - mov ip, sp - stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} @ 0x20 - VFP_ABI_PUSH - mov r6, sp @ future r3 - - mov r7, r0 - mov r8, r1 - mov r9, r2 - mov r10, r3 - - sub r0, sp, #0x10 @ 0x10 - bic r0, #0xf @ align at 16 bytes - mov sp, r0 - -#ifdef XTS_CHAIN_TWEAK - ldr r0, [ip] @ pointer to input tweak -#else - @ generate initial tweak - ldr r0, [ip, #4] @ iv[] - mov r1, sp - ldr r2, [ip, #0] @ key2 - bl AES_encrypt - mov r0, sp @ pointer to initial tweak -#endif - - ldr r1, [r10, #240] @ get # of rounds - mov r3, r6 -#ifndef BSAES_ASM_EXTENDED_KEY - @ allocate the key schedule on the stack - sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key - @ add r12, #96 @ size of bit-sliced key schedule - sub r12, #48 @ place for tweak[9] - - @ populate the key schedule - mov r4, r10 @ pass key - mov r5, r1 @ pass # of rounds - mov sp, r12 - add r12, #0x90 @ pass key schedule - bl _bsaes_key_convert - add r4, sp, #0x90 - vldmia r4, {q6} - vstmia r12, {q15} @ save last round key - veor q7, q7, q6 @ fix up round 0 key - vstmia r4, {q7} -#else - ldr r12, [r10, #244] - eors r12, #1 - beq 0f - - str r12, [r10, #244] - mov r4, r10 @ pass key - mov r5, r1 @ pass # of rounds - add r12, r10, #248 @ pass key schedule - bl _bsaes_key_convert - add r4, r10, #248 - vldmia r4, {q6} - vstmia r12, {q15} @ save last round key - veor q7, q7, q6 @ fix up round 0 key - vstmia r4, {q7} - -.align 2 - sub sp, #0x90 @ place for tweak[9] -#endif - vld1.8 {q8}, [r0] @ initial tweak - adr r2, .Lxts_magic - -#ifndef XTS_CHAIN_TWEAK - tst r9, #0xf @ if not multiple of 16 - it ne @ Thumb2 thing, sanity check in ARM - subne r9, #0x10 @ subtract another 16 bytes -#endif - subs r9, #0x80 - - blo .Lxts_dec_short - b .Lxts_dec_loop - -.align 4 -.Lxts_dec_loop: - vldmia r2, {q5} @ load XTS magic - vshr.s64 q6, q8, #63 - mov r0, sp - vand q6, q6, q5 - vadd.u64 q9, q8, q8 - vst1.64 {q8}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q9, #63 - veor q9, q9, q6 - vand q7, q7, q5 - vadd.u64 q10, q9, q9 - vst1.64 {q9}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q10, #63 - veor q10, q10, q7 - vand q6, q6, q5 - vld1.8 {q0}, [r7]! - vadd.u64 q11, q10, q10 - vst1.64 {q10}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q11, #63 - veor q11, q11, q6 - vand q7, q7, q5 - vld1.8 {q1}, [r7]! - veor q0, q0, q8 - vadd.u64 q12, q11, q11 - vst1.64 {q11}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q12, #63 - veor q12, q12, q7 - vand q6, q6, q5 - vld1.8 {q2}, [r7]! 
- veor q1, q1, q9 - vadd.u64 q13, q12, q12 - vst1.64 {q12}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q13, #63 - veor q13, q13, q6 - vand q7, q7, q5 - vld1.8 {q3}, [r7]! - veor q2, q2, q10 - vadd.u64 q14, q13, q13 - vst1.64 {q13}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q14, #63 - veor q14, q14, q7 - vand q6, q6, q5 - vld1.8 {q4}, [r7]! - veor q3, q3, q11 - vadd.u64 q15, q14, q14 - vst1.64 {q14}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q15, #63 - veor q15, q15, q6 - vand q7, q7, q5 - vld1.8 {q5}, [r7]! - veor q4, q4, q12 - vadd.u64 q8, q15, q15 - vst1.64 {q15}, [r0,:128]! - vswp d15,d14 - veor q8, q8, q7 - vst1.64 {q8}, [r0,:128] @ next round tweak - - vld1.8 {q6,q7}, [r7]! - veor q5, q5, q13 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q6, q6, q14 - mov r5, r1 @ pass rounds - veor q7, q7, q15 - mov r0, sp - - bl _bsaes_decrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - vld1.64 {q12,q13}, [r0,:128]! - veor q1, q1, q9 - veor q8, q6, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q4, q11 - vld1.64 {q14,q15}, [r0,:128]! - veor q10, q2, q12 - vst1.8 {q8,q9}, [r8]! - veor q11, q7, q13 - veor q12, q3, q14 - vst1.8 {q10,q11}, [r8]! - veor q13, q5, q15 - vst1.8 {q12,q13}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - - subs r9, #0x80 - bpl .Lxts_dec_loop - -.Lxts_dec_short: - adds r9, #0x70 - bmi .Lxts_dec_done - - vldmia r2, {q5} @ load XTS magic - vshr.s64 q7, q8, #63 - mov r0, sp - vand q7, q7, q5 - vadd.u64 q9, q8, q8 - vst1.64 {q8}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q9, #63 - veor q9, q9, q7 - vand q6, q6, q5 - vadd.u64 q10, q9, q9 - vst1.64 {q9}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q10, #63 - veor q10, q10, q6 - vand q7, q7, q5 - vld1.8 {q0}, [r7]! - subs r9, #0x10 - bmi .Lxts_dec_1 - vadd.u64 q11, q10, q10 - vst1.64 {q10}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q11, #63 - veor q11, q11, q7 - vand q6, q6, q5 - vld1.8 {q1}, [r7]! - subs r9, #0x10 - bmi .Lxts_dec_2 - veor q0, q0, q8 - vadd.u64 q12, q11, q11 - vst1.64 {q11}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q12, #63 - veor q12, q12, q6 - vand q7, q7, q5 - vld1.8 {q2}, [r7]! - subs r9, #0x10 - bmi .Lxts_dec_3 - veor q1, q1, q9 - vadd.u64 q13, q12, q12 - vst1.64 {q12}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q13, #63 - veor q13, q13, q7 - vand q6, q6, q5 - vld1.8 {q3}, [r7]! - subs r9, #0x10 - bmi .Lxts_dec_4 - veor q2, q2, q10 - vadd.u64 q14, q13, q13 - vst1.64 {q13}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q14, #63 - veor q14, q14, q6 - vand q7, q7, q5 - vld1.8 {q4}, [r7]! - subs r9, #0x10 - bmi .Lxts_dec_5 - veor q3, q3, q11 - vadd.u64 q15, q14, q14 - vst1.64 {q14}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q15, #63 - veor q15, q15, q7 - vand q6, q6, q5 - vld1.8 {q5}, [r7]! - subs r9, #0x10 - bmi .Lxts_dec_6 - veor q4, q4, q12 - sub r9, #0x10 - vst1.64 {q15}, [r0,:128] @ next round tweak - - vld1.8 {q6}, [r7]! - veor q5, q5, q13 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q6, q6, q14 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_decrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - vld1.64 {q12,q13}, [r0,:128]! - veor q1, q1, q9 - veor q8, q6, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q4, q11 - vld1.64 {q14}, [r0,:128]! - veor q10, q2, q12 - vst1.8 {q8,q9}, [r8]! - veor q11, q7, q13 - veor q12, q3, q14 - vst1.8 {q10,q11}, [r8]! - vst1.8 {q12}, [r8]! 
- - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_dec_done -.align 4 -.Lxts_dec_6: - vst1.64 {q14}, [r0,:128] @ next round tweak - - veor q4, q4, q12 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q5, q5, q13 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_decrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - vld1.64 {q12,q13}, [r0,:128]! - veor q1, q1, q9 - veor q8, q6, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q4, q11 - veor q10, q2, q12 - vst1.8 {q8,q9}, [r8]! - veor q11, q7, q13 - vst1.8 {q10,q11}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_dec_done -.align 4 -.Lxts_dec_5: - veor q3, q3, q11 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q4, q4, q12 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_decrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - vld1.64 {q12}, [r0,:128]! - veor q1, q1, q9 - veor q8, q6, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q4, q11 - veor q10, q2, q12 - vst1.8 {q8,q9}, [r8]! - vst1.8 {q10}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_dec_done -.align 4 -.Lxts_dec_4: - veor q2, q2, q10 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q3, q3, q11 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_decrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - veor q1, q1, q9 - veor q8, q6, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q4, q11 - vst1.8 {q8,q9}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_dec_done -.align 4 -.Lxts_dec_3: - veor q1, q1, q9 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q2, q2, q10 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_decrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10}, [r0,:128]! - veor q0, q0, q8 - veor q1, q1, q9 - veor q8, q6, q10 - vst1.8 {q0,q1}, [r8]! - vst1.8 {q8}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_dec_done -.align 4 -.Lxts_dec_2: - veor q0, q0, q8 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q1, q1, q9 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_decrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - veor q0, q0, q8 - veor q1, q1, q9 - vst1.8 {q0,q1}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_dec_done -.align 4 -.Lxts_dec_1: - mov r0, sp - veor q0, q0, q8 - mov r1, sp - vst1.8 {q0}, [sp,:128] - mov r5, r2 @ preserve magic - mov r2, r10 - mov r4, r3 @ preserve fp - - bl AES_decrypt - - vld1.8 {q0}, [sp,:128] - veor q0, q0, q8 - vst1.8 {q0}, [r8]! - mov r3, r4 - mov r2, r5 - - vmov q8, q9 @ next round tweak - -.Lxts_dec_done: -#ifndef XTS_CHAIN_TWEAK - adds r9, #0x10 - beq .Lxts_dec_ret - - @ calculate one round of extra tweak for the stolen ciphertext - vldmia r2, {q5} - vshr.s64 q6, q8, #63 - vand q6, q6, q5 - vadd.u64 q9, q8, q8 - vswp d13,d12 - veor q9, q9, q6 - - @ perform the final decryption with the last tweak value - vld1.8 {q0}, [r7]! 
- mov r0, sp - veor q0, q0, q9 - mov r1, sp - vst1.8 {q0}, [sp,:128] - mov r2, r10 - mov r4, r3 @ preserve fp - - bl AES_decrypt - - vld1.8 {q0}, [sp,:128] - veor q0, q0, q9 - vst1.8 {q0}, [r8] - - mov r6, r8 -.Lxts_dec_steal: - ldrb r1, [r8] - ldrb r0, [r7], #1 - strb r1, [r8, #0x10] - strb r0, [r8], #1 - - subs r9, #1 - bhi .Lxts_dec_steal - - vld1.8 {q0}, [r6] - mov r0, sp - veor q0, q8 - mov r1, sp - vst1.8 {q0}, [sp,:128] - mov r2, r10 - - bl AES_decrypt - - vld1.8 {q0}, [sp,:128] - veor q0, q0, q8 - vst1.8 {q0}, [r6] - mov r3, r4 -#endif - -.Lxts_dec_ret: - bic r0, r3, #0xf - vmov.i32 q0, #0 - vmov.i32 q1, #0 -#ifdef XTS_CHAIN_TWEAK - ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak -#endif -.Lxts_dec_bzero:@ wipe key schedule [if any] - vstmia sp!, {q0,q1} - cmp sp, r0 - bne .Lxts_dec_bzero - - mov sp, r3 -#ifdef XTS_CHAIN_TWEAK - vst1.8 {q8}, [r1] -#endif - VFP_ABI_POP - ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return - -.size bsaes_xts_decrypt,.-bsaes_xts_decrypt #endif #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/ghash-armv4.S b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/ghash-armv4.S index 5f8b50d551..42cce5831f 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/ghash-armv4.S +++ b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/ghash-armv4.S @@ -1,9 +1,30 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__arm__) +#if defined(BORINGSSL_PREFIX) +#include +#endif #include +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL +@ instructions are in aesv8-armx.pl.) +.arch armv7-a + .text #if defined(__thumb2__) || defined(__clang__) .syntax unified +#define ldrplb ldrbpl +#define ldrneb ldrbne #endif #if defined(__thumb2__) .thumb @@ -11,11 +32,6 @@ .code 32 #endif -#ifdef __clang__ -#define ldrplb ldrbpl -#define ldrneb ldrbne -#endif - .type rem_4bit,%object .align 5 rem_4bit: @@ -571,3 +587,5 @@ gcm_ghash_neon: .align 2 .align 2 #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/ghashv8-armx32.S b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/ghashv8-armx32.S index e83a9c7313..d6842945f0 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/ghashv8-armx32.S +++ b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/ghashv8-armx32.S @@ -1,4 +1,18 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__arm__) +#if defined(BORINGSSL_PREFIX) +#include +#endif #include .text @@ -109,13 +123,13 @@ gcm_ghash_v8: @ loaded value would have @ to be rotated in order to @ make it appear as in - @ alorithm specification + @ algorithm specification subs r3,r3,#32 @ see if r3 is 32 or larger mov r12,#16 @ r12 is used as post- @ increment for input pointer; @ as loop is modulo-scheduled @ r12 is zeroed just in time - @ to preclude oversteping + @ to preclude overstepping @ inp[len], which means that @ last block[s] are actually @ loaded twice, but last @@ -235,3 +249,5 @@ gcm_ghash_v8: .align 2 .align 2 #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/sha1-armv4-large.S b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/sha1-armv4-large.S index a5d88f71e2..61deddf8e7 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/sha1-armv4-large.S +++ b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/sha1-armv4-large.S @@ -1,4 +1,18 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__arm__) +#if defined(BORINGSSL_PREFIX) +#include +#endif #include .text @@ -1493,3 +1507,5 @@ sha1_block_data_order_armv8: .hidden OPENSSL_armcap_P #endif #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/sha256-armv4.S b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/sha256-armv4.S index f37fd7c7cf..aee04785c0 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/sha256-armv4.S +++ b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/sha256-armv4.S @@ -1,4 +1,18 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__arm__) +#if defined(BORINGSSL_PREFIX) +#include +#endif @ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. @ @ Licensed under the OpenSSL license (the "License"). You may not use @@ -51,6 +65,11 @@ # define __ARM_MAX_ARCH__ 7 #endif +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those +@ instructions are manually-encoded. (See unsha256.) 
+.arch armv7-a + .text #if defined(__thumb2__) .syntax unified @@ -2816,3 +2835,5 @@ sha256_block_data_order_armv8: .hidden OPENSSL_armcap_P #endif #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/sha512-armv4.S b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/sha512-armv4.S index bbeddf9220..a06d41fee5 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/sha512-armv4.S +++ b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/sha512-armv4.S @@ -1,4 +1,18 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__arm__) +#if defined(BORINGSSL_PREFIX) +#include +#endif @ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. @ @ Licensed under the OpenSSL license (the "License"). You may not use @@ -64,6 +78,10 @@ # define VFP_ABI_POP #endif +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. +.arch armv7-a + #ifdef __ARMEL__ # define LO 0 # define HI 4 @@ -1872,3 +1890,5 @@ sha512_block_data_order_neon: .hidden OPENSSL_armcap_P #endif #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/vpaes-armv7.S b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/vpaes-armv7.S new file mode 100644 index 0000000000..e5ad6ed99b --- /dev/null +++ b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/vpaes-armv7.S @@ -0,0 +1,1236 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(__arm__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.syntax unified + +.arch armv7-a +.fpu neon + +#if defined(__thumb2__) +.thumb +#else +.code 32 +#endif + +.text + +.type _vpaes_consts,%object +.align 7 @ totally strategic alignment +_vpaes_consts: +.Lk_mc_forward:@ mc_forward +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 +.Lk_mc_backward:@ mc_backward +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F +.Lk_sr:@ sr +.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +@ +@ "Hot" constants +@ +.Lk_inv:@ inv, inva +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 +.Lk_ipt:@ input transform (lo, hi) +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 +.Lk_sbo:@ sbou, sbot +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA +.Lk_sb1:@ sb1u, sb1t +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.Lk_sb2:@ sb2u, sb2t +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,55,32,78,69,79,78,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.align 2 +.size _vpaes_consts,.-_vpaes_consts +.align 6 +@@ +@@ _aes_preheat +@@ +@@ Fills q9-q15 as specified below. +@@ +.type _vpaes_preheat,%function +.align 4 +_vpaes_preheat: + adr r10, .Lk_inv + vmov.i8 q9, #0x0f @ .Lk_s0F + vld1.64 {q10,q11}, [r10]! @ .Lk_inv + add r10, r10, #64 @ Skip .Lk_ipt, .Lk_sbo + vld1.64 {q12,q13}, [r10]! @ .Lk_sb1 + vld1.64 {q14,q15}, [r10] @ .Lk_sb2 + bx lr + +@@ +@@ _aes_encrypt_core +@@ +@@ AES-encrypt q0. +@@ +@@ Inputs: +@@ q0 = input +@@ q9-q15 as in _vpaes_preheat +@@ [r2] = scheduled keys +@@ +@@ Output in q0 +@@ Clobbers q1-q5, r8-r11 +@@ Preserves q6-q8 so you get some local vectors +@@ +@@ +.type _vpaes_encrypt_core,%function +.align 4 +_vpaes_encrypt_core: + mov r9, r2 + ldr r8, [r2,#240] @ pull rounds + adr r11, .Lk_ipt + @ vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + @ vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + vld1.64 {q2, q3}, [r11] + adr r11, .Lk_mc_forward+16 + vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 # round0 key + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 + vtbl.8 d2, {q2}, d2 @ vpshufb %xmm1, %xmm2, %xmm1 + vtbl.8 d3, {q2}, d3 + vtbl.8 d4, {q3}, d0 @ vpshufb %xmm0, %xmm3, %xmm2 + vtbl.8 d5, {q3}, d1 + veor q0, q1, q5 @ vpxor %xmm5, %xmm1, %xmm0 + veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 + + @ .Lenc_entry ends with a bnz instruction which is normally paired with + @ subs in .Lenc_loop. 
+ tst r8, r8 + b .Lenc_entry + +.align 4 +.Lenc_loop: + @ middle of middle round + add r10, r11, #0x40 + vtbl.8 d8, {q13}, d4 @ vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + vtbl.8 d9, {q13}, d5 + vld1.64 {q1}, [r11]! @ vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + vtbl.8 d0, {q12}, d6 @ vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + vtbl.8 d1, {q12}, d7 + veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + vtbl.8 d10, {q15}, d4 @ vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + vtbl.8 d11, {q15}, d5 + veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A + vtbl.8 d4, {q14}, d6 @ vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + vtbl.8 d5, {q14}, d7 + vld1.64 {q4}, [r10] @ vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + vtbl.8 d6, {q0}, d2 @ vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + vtbl.8 d7, {q0}, d3 + veor q2, q2, q5 @ vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + @ Write to q5 instead of q0, so the table and destination registers do + @ not overlap. + vtbl.8 d10, {q0}, d8 @ vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + vtbl.8 d11, {q0}, d9 + veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + vtbl.8 d8, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + vtbl.8 d9, {q3}, d3 + @ Here we restore the original q0/q5 usage. + veor q0, q5, q3 @ vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + and r11, r11, #~(1<<6) @ and $0x30, %r11 # ... mod 4 + veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + subs r8, r8, #1 @ nr-- + +.Lenc_entry: + @ top of round + vand q1, q0, q9 @ vpand %xmm0, %xmm9, %xmm1 # 0 = k + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i + vtbl.8 d10, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + vtbl.8 d11, {q11}, d3 + veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j + vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + vtbl.8 d7, {q10}, d1 + vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + vtbl.8 d9, {q10}, d3 + veor q3, q3, q5 @ vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + vtbl.8 d5, {q10}, d7 + vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + vtbl.8 d7, {q10}, d9 + veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io + veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 + bne .Lenc_loop + + @ middle of last round + add r10, r11, #0x80 + + adr r11, .Lk_sbo + @ Read to q1 instead of q4, so the vtbl.8 instruction below does not + @ overlap table and destination registers. + vld1.64 {q1}, [r11]! @ vmovdqa -0x60(%r10), %xmm4 # 3 : sbou + vld1.64 {q0}, [r11] @ vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + vtbl.8 d9, {q1}, d5 + vld1.64 {q1}, [r10] @ vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + @ Write to q2 instead of q0 below, to avoid overlapping table and + @ destination registers. + vtbl.8 d4, {q0}, d6 @ vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + vtbl.8 d5, {q0}, d7 + veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + veor q2, q2, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A + @ Here we restore the original q0/q2 usage. + vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0 + vtbl.8 d1, {q2}, d3 + bx lr +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core + +.globl vpaes_encrypt +.hidden vpaes_encrypt +.type vpaes_encrypt,%function +.align 4 +vpaes_encrypt: + @ _vpaes_encrypt_core uses r8-r11. Round up to r7-r11 to maintain stack + @ alignment. 
+ stmdb sp!, {r7,r8,r9,r10,r11,lr} + @ _vpaes_encrypt_core uses q4-q5 (d8-d11), which are callee-saved. + vstmdb sp!, {d8,d9,d10,d11} + + vld1.64 {q0}, [r0] + bl _vpaes_preheat + bl _vpaes_encrypt_core + vst1.64 {q0}, [r1] + + vldmia sp!, {d8,d9,d10,d11} + ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return +.size vpaes_encrypt,.-vpaes_encrypt + +@ +@ Decryption stuff +@ +.type _vpaes_decrypt_consts,%object +.align 4 +_vpaes_decrypt_consts: +.Lk_dipt:@ decryption input transform +.quad 0x0F505B040B545F00, 0x154A411E114E451A +.quad 0x86E383E660056500, 0x12771772F491F194 +.Lk_dsbo:@ decryption sbox final output +.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D +.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C +.Lk_dsb9:@ decryption sbox output *9*u, *9*t +.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 +.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 +.Lk_dsbd:@ decryption sbox output *D*u, *D*t +.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 +.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 +.Lk_dsbb:@ decryption sbox output *B*u, *B*t +.quad 0xD022649296B44200, 0x602646F6B0F2D404 +.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B +.Lk_dsbe:@ decryption sbox output *E*u, *E*t +.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 +.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 +.size _vpaes_decrypt_consts,.-_vpaes_decrypt_consts + +@@ +@@ Decryption core +@@ +@@ Same API as encryption core, except it clobbers q12-q15 rather than using +@@ the values from _vpaes_preheat. q9-q11 must still be set from +@@ _vpaes_preheat. +@@ +.type _vpaes_decrypt_core,%function +.align 4 +_vpaes_decrypt_core: + mov r9, r2 + ldr r8, [r2,#240] @ pull rounds + + @ This function performs shuffles with various constants. The x86_64 + @ version loads them on-demand into %xmm0-%xmm5. This does not work well + @ for ARMv7 because those registers are shuffle destinations. The ARMv8 + @ version preloads those constants into registers, but ARMv7 has half + @ the registers to work with. Instead, we load them on-demand into + @ q12-q15, registers normally use for preloaded constants. This is fine + @ because decryption doesn't use those constants. The values are + @ constant, so this does not interfere with potential 2x optimizations. + adr r7, .Lk_dipt + + vld1.64 {q12,q13}, [r7] @ vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo + lsl r11, r8, #4 @ mov %rax, %r11; shl $4, %r11 + eor r11, r11, #0x30 @ xor $0x30, %r11 + adr r10, .Lk_sr + and r11, r11, #0x30 @ and $0x30, %r11 + add r11, r11, r10 + adr r10, .Lk_mc_forward+48 + + vld1.64 {q4}, [r9]! @ vmovdqu (%r9), %xmm4 # round0 key + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 + vtbl.8 d4, {q12}, d2 @ vpshufb %xmm1, %xmm2, %xmm2 + vtbl.8 d5, {q12}, d3 + vld1.64 {q5}, [r10] @ vmovdqa .Lk_mc_forward+48(%rip), %xmm5 + @ vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi + vtbl.8 d0, {q13}, d0 @ vpshufb %xmm0, %xmm1, %xmm0 + vtbl.8 d1, {q13}, d1 + veor q2, q2, q4 @ vpxor %xmm4, %xmm2, %xmm2 + veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 + + @ .Ldec_entry ends with a bnz instruction which is normally paired with + @ subs in .Ldec_loop. + tst r8, r8 + b .Ldec_entry + +.align 4 +.Ldec_loop: +@ +@ Inverse mix columns +@ + + @ We load .Lk_dsb* into q12-q15 on-demand. See the comment at the top of + @ the function. + adr r10, .Lk_dsb9 + vld1.64 {q12,q13}, [r10]! @ vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + @ vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + @ Load sbd* ahead of time. + vld1.64 {q14,q15}, [r10]! 
@ vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + @ vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt + vtbl.8 d8, {q12}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u + vtbl.8 d9, {q12}, d5 + vtbl.8 d2, {q13}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + vtbl.8 d3, {q13}, d7 + veor q0, q4, q0 @ vpxor %xmm4, %xmm0, %xmm0 + + veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + + @ Load sbb* ahead of time. + vld1.64 {q12,q13}, [r10]! @ vmovdqa 0x20(%r10),%xmm4 # 4 : sbbu + @ vmovdqa 0x30(%r10),%xmm1 # 0 : sbbt + + vtbl.8 d8, {q14}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + vtbl.8 d9, {q14}, d5 + @ Write to q1 instead of q0, so the table and destination registers do + @ not overlap. + vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch + vtbl.8 d3, {q0}, d11 + @ Here we restore the original q0/q1 usage. This instruction is + @ reordered from the ARMv8 version so we do not clobber the vtbl.8 + @ below. + veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + vtbl.8 d2, {q15}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + vtbl.8 d3, {q15}, d7 + @ vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + @ vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt + + @ Load sbd* ahead of time. + vld1.64 {q14,q15}, [r10]! @ vmovdqa 0x40(%r10),%xmm4 # 4 : sbeu + @ vmovdqa 0x50(%r10),%xmm1 # 0 : sbet + + vtbl.8 d8, {q12}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + vtbl.8 d9, {q12}, d5 + @ Write to q1 instead of q0, so the table and destination registers do + @ not overlap. + vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch + vtbl.8 d3, {q0}, d11 + @ Here we restore the original q0/q1 usage. This instruction is + @ reordered from the ARMv8 version so we do not clobber the vtbl.8 + @ below. + veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + vtbl.8 d2, {q13}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + vtbl.8 d3, {q13}, d7 + veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + + vtbl.8 d8, {q14}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + vtbl.8 d9, {q14}, d5 + @ Write to q1 instead of q0, so the table and destination registers do + @ not overlap. + vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch + vtbl.8 d3, {q0}, d11 + @ Here we restore the original q0/q1 usage. This instruction is + @ reordered from the ARMv8 version so we do not clobber the vtbl.8 + @ below. + veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + vtbl.8 d2, {q15}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + vtbl.8 d3, {q15}, d7 + vext.8 q5, q5, q5, #12 @ vpalignr $12, %xmm5, %xmm5, %xmm5 + veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + subs r8, r8, #1 @ sub $1,%rax # nr-- + +.Ldec_entry: + @ top of round + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i + vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + vtbl.8 d5, {q11}, d3 + veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j + vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + vtbl.8 d7, {q10}, d1 + vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + vtbl.8 d9, {q10}, d3 + veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + vtbl.8 d5, {q10}, d7 + vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + vtbl.8 d7, {q10}, d9 + veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io + veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + vld1.64 {q0}, [r9]! 
@ vmovdqu (%r9), %xmm0 + bne .Ldec_loop + + @ middle of last round + + adr r10, .Lk_dsbo + + @ Write to q1 rather than q4 to avoid overlapping table and destination. + vld1.64 {q1}, [r10]! @ vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + vtbl.8 d9, {q1}, d5 + @ Write to q2 rather than q1 to avoid overlapping table and destination. + vld1.64 {q2}, [r10] @ vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + vtbl.8 d2, {q2}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + vtbl.8 d3, {q2}, d7 + vld1.64 {q2}, [r11] @ vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 + veor q4, q4, q0 @ vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + @ Write to q1 rather than q0 so the table and destination registers + @ below do not overlap. + veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm0 # 0 = A + vtbl.8 d0, {q1}, d4 @ vpshufb %xmm2, %xmm0, %xmm0 + vtbl.8 d1, {q1}, d5 + bx lr +.size _vpaes_decrypt_core,.-_vpaes_decrypt_core + +.globl vpaes_decrypt +.hidden vpaes_decrypt +.type vpaes_decrypt,%function +.align 4 +vpaes_decrypt: + @ _vpaes_decrypt_core uses r7-r11. + stmdb sp!, {r7,r8,r9,r10,r11,lr} + @ _vpaes_decrypt_core uses q4-q5 (d8-d11), which are callee-saved. + vstmdb sp!, {d8,d9,d10,d11} + + vld1.64 {q0}, [r0] + bl _vpaes_preheat + bl _vpaes_decrypt_core + vst1.64 {q0}, [r1] + + vldmia sp!, {d8,d9,d10,d11} + ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return +.size vpaes_decrypt,.-vpaes_decrypt +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +@@ @@ +@@ AES key schedule @@ +@@ @@ +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + +@ This function diverges from both x86_64 and armv7 in which constants are +@ pinned. x86_64 has a common preheat function for all operations. aarch64 +@ separates them because it has enough registers to pin nearly all constants. +@ armv7 does not have enough registers, but needing explicit loads and stores +@ also complicates using x86_64's register allocation directly. +@ +@ We pin some constants for convenience and leave q14 and q15 free to load +@ others on demand. + +@ +@ Key schedule constants +@ +.type _vpaes_key_consts,%object +.align 4 +_vpaes_key_consts: +.Lk_dksd:@ decryption key schedule: invskew x*D +.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 +.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +.Lk_dksb:@ decryption key schedule: invskew x*B +.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 +.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +.Lk_dkse:@ decryption key schedule: invskew x*E + 0x63 +.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 +.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +.Lk_dks9:@ decryption key schedule: invskew x*9 +.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC +.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + +.Lk_rcon:@ rcon +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_opt:@ output transform +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 +.Lk_deskew:@ deskew tables: inverts the sbox's "skew" +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 +.size _vpaes_key_consts,.-_vpaes_key_consts + +.type _vpaes_key_preheat,%function +.align 4 +_vpaes_key_preheat: + adr r11, .Lk_rcon + vmov.i8 q12, #0x5b @ .Lk_s63 + adr r10, .Lk_inv @ Must be aligned to 8 mod 16. 
+ vmov.i8 q9, #0x0f @ .Lk_s0F + vld1.64 {q10,q11}, [r10] @ .Lk_inv + vld1.64 {q8}, [r11] @ .Lk_rcon + bx lr +.size _vpaes_key_preheat,.-_vpaes_key_preheat + +.type _vpaes_schedule_core,%function +.align 4 +_vpaes_schedule_core: + @ We only need to save lr, but ARM requires an 8-byte stack alignment, + @ so save an extra register. + stmdb sp!, {r3,lr} + + bl _vpaes_key_preheat @ load the tables + + adr r11, .Lk_ipt @ Must be aligned to 8 mod 16. + vld1.64 {q0}, [r0]! @ vmovdqu (%rdi), %xmm0 # load key (unaligned) + + @ input transform + @ Use q4 here rather than q3 so .Lschedule_am_decrypting does not + @ overlap table and destination. + vmov q4, q0 @ vmovdqa %xmm0, %xmm3 + bl _vpaes_schedule_transform + adr r10, .Lk_sr @ Must be aligned to 8 mod 16. + vmov q7, q0 @ vmovdqa %xmm0, %xmm7 + + add r8, r8, r10 + tst r3, r3 + bne .Lschedule_am_decrypting + + @ encrypting, output zeroth round key after transform + vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx) + b .Lschedule_go + +.Lschedule_am_decrypting: + @ decrypting, output zeroth round key after shiftrows + vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 + vtbl.8 d6, {q4}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 + vtbl.8 d7, {q4}, d3 + vst1.64 {q3}, [r2] @ vmovdqu %xmm3, (%rdx) + eor r8, r8, #0x30 @ xor $0x30, %r8 + +.Lschedule_go: + cmp r1, #192 @ cmp $192, %esi + bhi .Lschedule_256 + beq .Lschedule_192 + @ 128: fall though + +@@ +@@ .schedule_128 +@@ +@@ 128-bit specific part of key schedule. +@@ +@@ This schedule is really simple, because all its parts +@@ are accomplished by the subroutines. +@@ +.Lschedule_128: + mov r0, #10 @ mov $10, %esi + +.Loop_schedule_128: + bl _vpaes_schedule_round + subs r0, r0, #1 @ dec %esi + beq .Lschedule_mangle_last + bl _vpaes_schedule_mangle @ write output + b .Loop_schedule_128 + +@@ +@@ .aes_schedule_192 +@@ +@@ 192-bit specific part of key schedule. +@@ +@@ The main body of this schedule is the same as the 128-bit +@@ schedule, but with more smearing. The long, high side is +@@ stored in q7 as before, and the short, low side is in +@@ the high bits of q6. +@@ +@@ This schedule is somewhat nastier, however, because each +@@ round produces 192 bits of key material, or 1.5 round keys. +@@ Therefore, on each cycle we do 2 rounds and produce 3 round +@@ keys. +@@ +.align 4 +.Lschedule_192: + sub r0, r0, #8 + vld1.64 {q0}, [r0] @ vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) + bl _vpaes_schedule_transform @ input transform + vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save short part + vmov.i8 d12, #0 @ vpxor %xmm4, %xmm4, %xmm4 # clear 4 + @ vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros + mov r0, #4 @ mov $4, %esi + +.Loop_schedule_192: + bl _vpaes_schedule_round + vext.8 q0, q6, q0, #8 @ vpalignr $8,%xmm6,%xmm0,%xmm0 + bl _vpaes_schedule_mangle @ save key n + bl _vpaes_schedule_192_smear + bl _vpaes_schedule_mangle @ save key n+1 + bl _vpaes_schedule_round + subs r0, r0, #1 @ dec %esi + beq .Lschedule_mangle_last + bl _vpaes_schedule_mangle @ save key n+2 + bl _vpaes_schedule_192_smear + b .Loop_schedule_192 + +@@ +@@ .aes_schedule_256 +@@ +@@ 256-bit specific part of key schedule. +@@ +@@ The structure here is very similar to the 128-bit +@@ schedule, but with an additional "low side" in +@@ q6. The low side's rounds are the same as the +@@ high side's, except no rcon and no rotation. 
+@@ +.align 4 +.Lschedule_256: + vld1.64 {q0}, [r0] @ vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + bl _vpaes_schedule_transform @ input transform + mov r0, #7 @ mov $7, %esi + +.Loop_schedule_256: + bl _vpaes_schedule_mangle @ output low result + vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + @ high round + bl _vpaes_schedule_round + subs r0, r0, #1 @ dec %esi + beq .Lschedule_mangle_last + bl _vpaes_schedule_mangle + + @ low round. swap xmm7 and xmm6 + vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0 + vmov.i8 q4, #0 + vmov q5, q7 @ vmovdqa %xmm7, %xmm5 + vmov q7, q6 @ vmovdqa %xmm6, %xmm7 + bl _vpaes_schedule_low_round + vmov q7, q5 @ vmovdqa %xmm5, %xmm7 + + b .Loop_schedule_256 + +@@ +@@ .aes_schedule_mangle_last +@@ +@@ Mangler for last round of key schedule +@@ Mangles q0 +@@ when encrypting, outputs out(q0) ^ 63 +@@ when decrypting, outputs unskew(q0) +@@ +@@ Always called right before return... jumps to cleanup and exits +@@ +.align 4 +.Lschedule_mangle_last: + @ schedule last round key from xmm0 + adr r11, .Lk_deskew @ lea .Lk_deskew(%rip),%r11 # prepare to deskew + tst r3, r3 + bne .Lschedule_mangle_last_dec + + @ encrypting + vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10),%xmm1 + adr r11, .Lk_opt @ lea .Lk_opt(%rip), %r11 # prepare to output transform + add r2, r2, #32 @ add $32, %rdx + vmov q2, q0 + vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0 # output permute + vtbl.8 d1, {q2}, d3 + +.Lschedule_mangle_last_dec: + sub r2, r2, #16 @ add $-16, %rdx + veor q0, q0, q12 @ vpxor .Lk_s63(%rip), %xmm0, %xmm0 + bl _vpaes_schedule_transform @ output transform + vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx) # save last key + + @ cleanup + veor q0, q0, q0 @ vpxor %xmm0, %xmm0, %xmm0 + veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1 + veor q2, q2, q2 @ vpxor %xmm2, %xmm2, %xmm2 + veor q3, q3, q3 @ vpxor %xmm3, %xmm3, %xmm3 + veor q4, q4, q4 @ vpxor %xmm4, %xmm4, %xmm4 + veor q5, q5, q5 @ vpxor %xmm5, %xmm5, %xmm5 + veor q6, q6, q6 @ vpxor %xmm6, %xmm6, %xmm6 + veor q7, q7, q7 @ vpxor %xmm7, %xmm7, %xmm7 + ldmia sp!, {r3,pc} @ return +.size _vpaes_schedule_core,.-_vpaes_schedule_core + +@@ +@@ .aes_schedule_192_smear +@@ +@@ Smear the short, low side in the 192-bit key schedule. +@@ +@@ Inputs: +@@ q7: high side, b a x y +@@ q6: low side, d c 0 0 +@@ +@@ Outputs: +@@ q6: b+c+d b+c 0 0 +@@ q0: b+c+d b+c b a +@@ +.type _vpaes_schedule_192_smear,%function +.align 4 +_vpaes_schedule_192_smear: + vmov.i8 q1, #0 + vdup.32 q0, d15[1] + vshl.i64 q1, q6, #32 @ vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 + vmov d0, d15 @ vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a + veor q6, q6, q1 @ vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 + veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1 + veor q6, q6, q0 @ vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a + vmov q0, q6 @ vmovdqa %xmm6, %xmm0 + vmov d12, d2 @ vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros + bx lr +.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear + +@@ +@@ .aes_schedule_round +@@ +@@ Runs one main round of the key schedule on q0, q7 +@@ +@@ Specifically, runs subbytes on the high dword of q0 +@@ then rotates it by one byte and xors into the low dword of +@@ q7. +@@ +@@ Adds rcon from low byte of q8, then rotates q8 for +@@ next rcon. +@@ +@@ Smears the dwords of q7 by xoring the low into the +@@ second low, result into third, result into highest. +@@ +@@ Returns results in q7 = q0. +@@ Clobbers q1-q4, r11. 
+@@ +.type _vpaes_schedule_round,%function +.align 4 +_vpaes_schedule_round: + @ extract rcon from xmm8 + vmov.i8 q4, #0 @ vpxor %xmm4, %xmm4, %xmm4 + vext.8 q1, q8, q4, #15 @ vpalignr $15, %xmm8, %xmm4, %xmm1 + vext.8 q8, q8, q8, #15 @ vpalignr $15, %xmm8, %xmm8, %xmm8 + veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 + + @ rotate + vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0 + vext.8 q0, q0, q0, #1 @ vpalignr $1, %xmm0, %xmm0, %xmm0 + + @ fall through... + + @ low round: same as high round, but no rotation and no rcon. +_vpaes_schedule_low_round: + @ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12. + @ We pin other values in _vpaes_key_preheat, so load them now. + adr r11, .Lk_sb1 + vld1.64 {q14,q15}, [r11] + + @ smear xmm7 + vext.8 q1, q4, q7, #12 @ vpslldq $4, %xmm7, %xmm1 + veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 + vext.8 q4, q4, q7, #8 @ vpslldq $8, %xmm7, %xmm4 + + @ subbytes + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i + veor q7, q7, q4 @ vpxor %xmm4, %xmm7, %xmm7 + vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + vtbl.8 d5, {q11}, d3 + veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j + vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + vtbl.8 d7, {q10}, d1 + veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + vtbl.8 d9, {q10}, d3 + veor q7, q7, q12 @ vpxor .Lk_s63(%rip), %xmm7, %xmm7 + vtbl.8 d6, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak + vtbl.8 d7, {q10}, d7 + veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + vtbl.8 d4, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak + vtbl.8 d5, {q10}, d9 + veor q3, q3, q1 @ vpxor %xmm1, %xmm3, %xmm3 # 2 = io + veor q2, q2, q0 @ vpxor %xmm0, %xmm2, %xmm2 # 3 = jo + vtbl.8 d8, {q15}, d6 @ vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou + vtbl.8 d9, {q15}, d7 + vtbl.8 d2, {q14}, d4 @ vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t + vtbl.8 d3, {q14}, d5 + veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output + + @ add in smeared stuff + veor q0, q1, q7 @ vpxor %xmm7, %xmm1, %xmm0 + veor q7, q1, q7 @ vmovdqa %xmm0, %xmm7 + bx lr +.size _vpaes_schedule_round,.-_vpaes_schedule_round + +@@ +@@ .aes_schedule_transform +@@ +@@ Linear-transform q0 according to tables at [r11] +@@ +@@ Requires that q9 = 0x0F0F... as in preheat +@@ Output in q0 +@@ Clobbers q1, q2, q14, q15 +@@ +.type _vpaes_schedule_transform,%function +.align 4 +_vpaes_schedule_transform: + vld1.64 {q14,q15}, [r11] @ vmovdqa (%r11), %xmm2 # lo + @ vmovdqa 16(%r11), %xmm1 # hi + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 + vtbl.8 d4, {q14}, d2 @ vpshufb %xmm1, %xmm2, %xmm2 + vtbl.8 d5, {q14}, d3 + vtbl.8 d0, {q15}, d0 @ vpshufb %xmm0, %xmm1, %xmm0 + vtbl.8 d1, {q15}, d1 + veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 + bx lr +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform + +@@ +@@ .aes_schedule_mangle +@@ +@@ Mangles q0 from (basis-transformed) standard version +@@ to our version. 
+@@ +@@ On encrypt, +@@ xor with 0x63 +@@ multiply by circulant 0,1,1,1 +@@ apply shiftrows transform +@@ +@@ On decrypt, +@@ xor with 0x63 +@@ multiply by "inverse mixcolumns" circulant E,B,D,9 +@@ deskew +@@ apply shiftrows transform +@@ +@@ +@@ Writes out to [r2], and increments or decrements it +@@ Keeps track of round number mod 4 in r8 +@@ Preserves q0 +@@ Clobbers q1-q5 +@@ +.type _vpaes_schedule_mangle,%function +.align 4 +_vpaes_schedule_mangle: + tst r3, r3 + vmov q4, q0 @ vmovdqa %xmm0, %xmm4 # save xmm0 for later + adr r11, .Lk_mc_forward @ Must be aligned to 8 mod 16. + vld1.64 {q5}, [r11] @ vmovdqa .Lk_mc_forward(%rip),%xmm5 + bne .Lschedule_mangle_dec + + @ encrypting + @ Write to q2 so we do not overlap table and destination below. + veor q2, q0, q12 @ vpxor .Lk_s63(%rip), %xmm0, %xmm4 + add r2, r2, #16 @ add $16, %rdx + vtbl.8 d8, {q2}, d10 @ vpshufb %xmm5, %xmm4, %xmm4 + vtbl.8 d9, {q2}, d11 + vtbl.8 d2, {q4}, d10 @ vpshufb %xmm5, %xmm4, %xmm1 + vtbl.8 d3, {q4}, d11 + vtbl.8 d6, {q1}, d10 @ vpshufb %xmm5, %xmm1, %xmm3 + vtbl.8 d7, {q1}, d11 + veor q4, q4, q1 @ vpxor %xmm1, %xmm4, %xmm4 + vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 + veor q3, q3, q4 @ vpxor %xmm4, %xmm3, %xmm3 + + b .Lschedule_mangle_both +.align 4 +.Lschedule_mangle_dec: + @ inverse mix columns + adr r11, .Lk_dksd @ lea .Lk_dksd(%rip),%r11 + vshr.u8 q1, q4, #4 @ vpsrlb $4, %xmm4, %xmm1 # 1 = hi + vand q4, q4, q9 @ vpand %xmm9, %xmm4, %xmm4 # 4 = lo + + vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x00(%r11), %xmm2 + @ vmovdqa 0x10(%r11), %xmm3 + vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 + vtbl.8 d5, {q14}, d9 + vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 + vtbl.8 d7, {q15}, d3 + @ Load .Lk_dksb ahead of time. + vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x20(%r11), %xmm2 + @ vmovdqa 0x30(%r11), %xmm3 + @ Write to q13 so we do not overlap table and destination. + veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 + vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3 + vtbl.8 d7, {q13}, d11 + + vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 + vtbl.8 d5, {q14}, d9 + veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2 + vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 + vtbl.8 d7, {q15}, d3 + @ Load .Lk_dkse ahead of time. + vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x40(%r11), %xmm2 + @ vmovdqa 0x50(%r11), %xmm3 + @ Write to q13 so we do not overlap table and destination. + veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 + vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3 + vtbl.8 d7, {q13}, d11 + + vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 + vtbl.8 d5, {q14}, d9 + veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2 + vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 + vtbl.8 d7, {q15}, d3 + @ Load .Lk_dkse ahead of time. + vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x60(%r11), %xmm2 + @ vmovdqa 0x70(%r11), %xmm4 + @ Write to q13 so we do not overlap table and destination. + veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 + + vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 + vtbl.8 d5, {q14}, d9 + vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3 + vtbl.8 d7, {q13}, d11 + vtbl.8 d8, {q15}, d2 @ vpshufb %xmm1, %xmm4, %xmm4 + vtbl.8 d9, {q15}, d3 + vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 + veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2 + veor q3, q4, q2 @ vpxor %xmm2, %xmm4, %xmm3 + + sub r2, r2, #16 @ add $-16, %rdx + +.Lschedule_mangle_both: + @ Write to q2 so table and destination do not overlap. 
+ vtbl.8 d4, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 + vtbl.8 d5, {q3}, d3 + add r8, r8, #64-16 @ add $-16, %r8 + and r8, r8, #~(1<<6) @ and $0x30, %r8 + vst1.64 {q2}, [r2] @ vmovdqu %xmm3, (%rdx) + bx lr +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle + +.globl vpaes_set_encrypt_key +.hidden vpaes_set_encrypt_key +.type vpaes_set_encrypt_key,%function +.align 4 +vpaes_set_encrypt_key: + stmdb sp!, {r7,r8,r9,r10,r11, lr} + vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + + lsr r9, r1, #5 @ shr $5,%eax + add r9, r9, #5 @ $5,%eax + str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov r3, #0 @ mov $0,%ecx + mov r8, #0x30 @ mov $0x30,%r8d + bl _vpaes_schedule_core + eor r0, r0, r0 + + vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return +.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key + +.globl vpaes_set_decrypt_key +.hidden vpaes_set_decrypt_key +.type vpaes_set_decrypt_key,%function +.align 4 +vpaes_set_decrypt_key: + stmdb sp!, {r7,r8,r9,r10,r11, lr} + vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + + lsr r9, r1, #5 @ shr $5,%eax + add r9, r9, #5 @ $5,%eax + str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + lsl r9, r9, #4 @ shl $4,%eax + add r2, r2, #16 @ lea 16(%rdx,%rax),%rdx + add r2, r2, r9 + + mov r3, #1 @ mov $1,%ecx + lsr r8, r1, #1 @ shr $1,%r8d + and r8, r8, #32 @ and $32,%r8d + eor r8, r8, #32 @ xor $32,%r8d # nbits==192?0:32 + bl _vpaes_schedule_core + + vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return +.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key + +@ Additional constants for converting to bsaes. +.type _vpaes_convert_consts,%object +.align 4 +_vpaes_convert_consts: +@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear +@ transform in the AES S-box. 0x63 is incorporated into the low half of the +@ table. This was computed with the following script: +@ +@ def u64s_to_u128(x, y): +@ return x | (y << 64) +@ def u128_to_u64s(w): +@ return w & ((1<<64)-1), w >> 64 +@ def get_byte(w, i): +@ return (w >> (i*8)) & 0xff +@ def apply_table(table, b): +@ lo = b & 0xf +@ hi = b >> 4 +@ return get_byte(table[0], lo) ^ get_byte(table[1], hi) +@ def opt(b): +@ table = [ +@ u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808), +@ u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0), +@ ] +@ return apply_table(table, b) +@ def rot_byte(b, n): +@ return 0xff & ((b << n) | (b >> (8-n))) +@ def skew(x): +@ return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^ +@ rot_byte(x, 4)) +@ table = [0, 0] +@ for i in range(16): +@ table[0] |= (skew(opt(i)) ^ 0x63) << (i*8) +@ table[1] |= skew(opt(i<<4)) << (i*8) +@ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[0])) +@ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[1])) +.Lk_opt_then_skew: +.quad 0x9cb8436798bc4763, 0x6440bb9f6044bf9b +.quad 0x1f30062936192f00, 0xb49bad829db284ab + +@ .Lk_decrypt_transform is a permutation which performs an 8-bit left-rotation +@ followed by a byte-swap on each 32-bit word of a vector. E.g., 0x11223344 +@ becomes 0x22334411 and then 0x11443322. 
+.Lk_decrypt_transform: +.quad 0x0704050603000102, 0x0f0c0d0e0b08090a +.size _vpaes_convert_consts,.-_vpaes_convert_consts + +@ void vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes); +.globl vpaes_encrypt_key_to_bsaes +.hidden vpaes_encrypt_key_to_bsaes +.type vpaes_encrypt_key_to_bsaes,%function +.align 4 +vpaes_encrypt_key_to_bsaes: + stmdb sp!, {r11, lr} + + @ See _vpaes_schedule_core for the key schedule logic. In particular, + @ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper), + @ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last + @ contain the transformations not in the bsaes representation. This + @ function inverts those transforms. + @ + @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key + @ representation, which does not match the other aes_nohw_* + @ implementations. The ARM aes_nohw_* stores each 32-bit word + @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the + @ cost of extra REV and VREV32 operations in little-endian ARM. + + vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform + adr r2, .Lk_mc_forward @ Must be aligned to 8 mod 16. + add r3, r2, 0x90 @ .Lk_sr+0x10-.Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression) + + vld1.64 {q12}, [r2] + vmov.i8 q10, #0x5b @ .Lk_s63 from vpaes-x86_64 + adr r11, .Lk_opt @ Must be aligned to 8 mod 16. + vmov.i8 q11, #0x63 @ .LK_s63 without .Lk_ipt applied + + @ vpaes stores one fewer round count than bsaes, but the number of keys + @ is the same. + ldr r2, [r1,#240] + add r2, r2, #1 + str r2, [r0,#240] + + @ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt). + @ Invert this with .Lk_opt. + vld1.64 {q0}, [r1]! + bl _vpaes_schedule_transform + vrev32.8 q0, q0 + vst1.64 {q0}, [r0]! + + @ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied, + @ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63, + @ multiplies by the circulant 0,1,1,1, then applies ShiftRows. +.Loop_enc_key_to_bsaes: + vld1.64 {q0}, [r1]! + + @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle + @ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30. + @ We use r3 rather than r8 to avoid a callee-saved register. + vld1.64 {q1}, [r3] + vtbl.8 d4, {q0}, d2 + vtbl.8 d5, {q0}, d3 + add r3, r3, #16 + and r3, r3, #~(1<<6) + vmov q0, q2 + + @ Handle the last key differently. + subs r2, r2, #1 + beq .Loop_enc_key_to_bsaes_last + + @ Multiply by the circulant. This is its own inverse. + vtbl.8 d2, {q0}, d24 + vtbl.8 d3, {q0}, d25 + vmov q0, q1 + vtbl.8 d4, {q1}, d24 + vtbl.8 d5, {q1}, d25 + veor q0, q0, q2 + vtbl.8 d2, {q2}, d24 + vtbl.8 d3, {q2}, d25 + veor q0, q0, q1 + + @ XOR and finish. + veor q0, q0, q10 + bl _vpaes_schedule_transform + vrev32.8 q0, q0 + vst1.64 {q0}, [r0]! + b .Loop_enc_key_to_bsaes + +.Loop_enc_key_to_bsaes_last: + @ The final key does not have a basis transform (note + @ .Lschedule_mangle_last inverts the original transform). It only XORs + @ 0x63 and applies ShiftRows. The latter was already inverted in the + @ loop. Note that, because we act on the original representation, we use + @ q11, not q10. + veor q0, q0, q11 + vrev32.8 q0, q0 + vst1.64 {q0}, [r0] + + @ Wipe registers which contained key material. 
+ veor q0, q0, q0 + veor q1, q1, q1 + veor q2, q2, q2 + + ldmia sp!, {r11, pc} @ return +.size vpaes_encrypt_key_to_bsaes,.-vpaes_encrypt_key_to_bsaes + +@ void vpaes_decrypt_key_to_bsaes(AES_KEY *vpaes, const AES_KEY *bsaes); +.globl vpaes_decrypt_key_to_bsaes +.hidden vpaes_decrypt_key_to_bsaes +.type vpaes_decrypt_key_to_bsaes,%function +.align 4 +vpaes_decrypt_key_to_bsaes: + stmdb sp!, {r11, lr} + + @ See _vpaes_schedule_core for the key schedule logic. Note vpaes + @ computes the decryption key schedule in reverse. Additionally, + @ aes-x86_64.pl shares some transformations, so we must only partially + @ invert vpaes's transformations. In general, vpaes computes in a + @ different basis (.Lk_ipt and .Lk_opt) and applies the inverses of + @ MixColumns, ShiftRows, and the affine part of the AES S-box (which is + @ split into a linear skew and XOR of 0x63). We undo all but MixColumns. + @ + @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key + @ representation, which does not match the other aes_nohw_* + @ implementations. The ARM aes_nohw_* stores each 32-bit word + @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the + @ cost of extra REV and VREV32 operations in little-endian ARM. + + adr r2, .Lk_decrypt_transform + adr r3, .Lk_sr+0x30 + adr r11, .Lk_opt_then_skew @ Input to _vpaes_schedule_transform. + vld1.64 {q12}, [r2] @ Reuse q12 from encryption. + vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform + + @ vpaes stores one fewer round count than bsaes, but the number of keys + @ is the same. + ldr r2, [r1,#240] + add r2, r2, #1 + str r2, [r0,#240] + + @ Undo the basis change and reapply the S-box affine transform. See + @ .Lschedule_mangle_last. + vld1.64 {q0}, [r1]! + bl _vpaes_schedule_transform + vrev32.8 q0, q0 + vst1.64 {q0}, [r0]! + + @ See _vpaes_schedule_mangle for the transform on the middle keys. Note + @ it simultaneously inverts MixColumns and the S-box affine transform. + @ See .Lk_dksd through .Lk_dks9. +.Loop_dec_key_to_bsaes: + vld1.64 {q0}, [r1]! + + @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note going + @ forwards cancels inverting for which direction we cycle r3. We use r3 + @ rather than r8 to avoid a callee-saved register. + vld1.64 {q1}, [r3] + vtbl.8 d4, {q0}, d2 + vtbl.8 d5, {q0}, d3 + add r3, r3, #64-16 + and r3, r3, #~(1<<6) + vmov q0, q2 + + @ Handle the last key differently. + subs r2, r2, #1 + beq .Loop_dec_key_to_bsaes_last + + @ Undo the basis change and reapply the S-box affine transform. + bl _vpaes_schedule_transform + + @ Rotate each word by 8 bytes (cycle the rows) and then byte-swap. We + @ combine the two operations in .Lk_decrypt_transform. + @ + @ TODO(davidben): Where does the rotation come from? + vtbl.8 d2, {q0}, d24 + vtbl.8 d3, {q0}, d25 + + vst1.64 {q1}, [r0]! + b .Loop_dec_key_to_bsaes + +.Loop_dec_key_to_bsaes_last: + @ The final key only inverts ShiftRows (already done in the loop). See + @ .Lschedule_am_decrypting. Its basis is not transformed. + vrev32.8 q0, q0 + vst1.64 {q0}, [r0]! + + @ Wipe registers which contained key material. + veor q0, q0, q0 + veor q1, q1, q1 + veor q2, q2, q2 + + ldmia sp!, {r11, pc} @ return +.size vpaes_decrypt_key_to_bsaes,.-vpaes_decrypt_key_to_bsaes +.globl vpaes_ctr32_encrypt_blocks +.hidden vpaes_ctr32_encrypt_blocks +.type vpaes_ctr32_encrypt_blocks,%function +.align 4 +vpaes_ctr32_encrypt_blocks: + mov ip, sp + stmdb sp!, {r7,r8,r9,r10,r11, lr} + @ This function uses q4-q7 (d8-d15), which are callee-saved. 
+ vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + + cmp r2, #0 + @ r8 is passed on the stack. + ldr r8, [ip] + beq .Lctr32_done + + @ _vpaes_encrypt_core expects the key in r2, so swap r2 and r3. + mov r9, r3 + mov r3, r2 + mov r2, r9 + + @ Load the IV and counter portion. + ldr r7, [r8, #12] + vld1.8 {q7}, [r8] + + bl _vpaes_preheat + rev r7, r7 @ The counter is big-endian. + +.Lctr32_loop: + vmov q0, q7 + vld1.8 {q6}, [r0]! @ .Load input ahead of time + bl _vpaes_encrypt_core + veor q0, q0, q6 @ XOR input and result + vst1.8 {q0}, [r1]! + subs r3, r3, #1 + @ Update the counter. + add r7, r7, #1 + rev r9, r7 + vmov.32 d15[1], r9 + bne .Lctr32_loop + +.Lctr32_done: + vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return +.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks +#endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-arm/crypto/test/trampoline-armv4.S b/packager/third_party/boringssl/linux-arm/crypto/test/trampoline-armv4.S new file mode 100644 index 0000000000..5c788b3569 --- /dev/null +++ b/packager/third_party/boringssl/linux-arm/crypto/test/trampoline-armv4.S @@ -0,0 +1,380 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(__arm__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.syntax unified + +.arch armv7-a +.fpu vfp + +.text + +@ abi_test_trampoline loads callee-saved registers from |state|, calls |func| +@ with |argv|, then saves the callee-saved registers into |state|. It returns +@ the result of |func|. The |unwind| argument is unused. +@ uint32_t abi_test_trampoline(void (*func)(...), CallerState *state, +@ const uint32_t *argv, size_t argc, +@ int unwind); +.type abi_test_trampoline, %function +.globl abi_test_trampoline +.hidden abi_test_trampoline +.align 4 +abi_test_trampoline: +.Labi_test_trampoline_begin: + @ Save parameters and all callee-saved registers. For convenience, we + @ save r9 on iOS even though it's volatile. + vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + stmdb sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,lr} + + @ Reserve stack space for six (10-4) stack parameters, plus an extra 4 + @ bytes to keep it 8-byte-aligned (see AAPCS, section 5.3). + sub sp, sp, #28 + + @ Every register in AAPCS is either non-volatile or a parameter (except + @ r9 on iOS), so this code, by the actual call, loses all its scratch + @ registers. First fill in stack parameters while there are registers + @ to spare. + cmp r3, #4 + bls .Lstack_args_done + mov r4, sp @ r4 is the output pointer. + add r5, r2, r3, lsl #2 @ Set r5 to the end of argv. + add r2, r2, #16 @ Skip four arguments. +.Lstack_args_loop: + ldr r6, [r2], #4 + cmp r2, r5 + str r6, [r4], #4 + bne .Lstack_args_loop + +.Lstack_args_done: + @ Load registers from |r1|. + vldmia r1!, {d8,d9,d10,d11,d12,d13,d14,d15} +#if defined(__APPLE__) + @ r9 is not volatile on iOS. + ldmia r1!, {r4,r5,r6,r7,r8,r10-r11} +#else + ldmia r1!, {r4,r5,r6,r7,r8,r9,r10,r11} +#endif + + @ Load register parameters. This uses up our remaining registers, so we + @ repurpose lr as scratch space. + ldr r3, [sp, #40] @ Reload argc. + ldr lr, [sp, #36] @ .Load argv into lr. 
+ cmp r3, #3 + bhi .Larg_r3 + beq .Larg_r2 + cmp r3, #1 + bhi .Larg_r1 + beq .Larg_r0 + b .Largs_done + +.Larg_r3: + ldr r3, [lr, #12] @ argv[3] +.Larg_r2: + ldr r2, [lr, #8] @ argv[2] +.Larg_r1: + ldr r1, [lr, #4] @ argv[1] +.Larg_r0: + ldr r0, [lr] @ argv[0] +.Largs_done: + + @ With every other register in use, load the function pointer into lr + @ and call the function. + ldr lr, [sp, #28] + blx lr + + @ r1-r3 are free for use again. The trampoline only supports + @ single-return functions. Pass r4-r11 to the caller. + ldr r1, [sp, #32] + vstmia r1!, {d8,d9,d10,d11,d12,d13,d14,d15} +#if defined(__APPLE__) + @ r9 is not volatile on iOS. + stmia r1!, {r4,r5,r6,r7,r8,r10-r11} +#else + stmia r1!, {r4,r5,r6,r7,r8,r9,r10,r11} +#endif + + @ Unwind the stack and restore registers. + add sp, sp, #44 @ 44 = 28+16 + ldmia sp!, {r4,r5,r6,r7,r8,r9,r10,r11,lr} @ Skip r0-r3 (see +16 above). + vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + + bx lr +.size abi_test_trampoline,.-abi_test_trampoline +.type abi_test_clobber_r0, %function +.globl abi_test_clobber_r0 +.hidden abi_test_clobber_r0 +.align 4 +abi_test_clobber_r0: + mov r0, #0 + bx lr +.size abi_test_clobber_r0,.-abi_test_clobber_r0 +.type abi_test_clobber_r1, %function +.globl abi_test_clobber_r1 +.hidden abi_test_clobber_r1 +.align 4 +abi_test_clobber_r1: + mov r1, #0 + bx lr +.size abi_test_clobber_r1,.-abi_test_clobber_r1 +.type abi_test_clobber_r2, %function +.globl abi_test_clobber_r2 +.hidden abi_test_clobber_r2 +.align 4 +abi_test_clobber_r2: + mov r2, #0 + bx lr +.size abi_test_clobber_r2,.-abi_test_clobber_r2 +.type abi_test_clobber_r3, %function +.globl abi_test_clobber_r3 +.hidden abi_test_clobber_r3 +.align 4 +abi_test_clobber_r3: + mov r3, #0 + bx lr +.size abi_test_clobber_r3,.-abi_test_clobber_r3 +.type abi_test_clobber_r4, %function +.globl abi_test_clobber_r4 +.hidden abi_test_clobber_r4 +.align 4 +abi_test_clobber_r4: + mov r4, #0 + bx lr +.size abi_test_clobber_r4,.-abi_test_clobber_r4 +.type abi_test_clobber_r5, %function +.globl abi_test_clobber_r5 +.hidden abi_test_clobber_r5 +.align 4 +abi_test_clobber_r5: + mov r5, #0 + bx lr +.size abi_test_clobber_r5,.-abi_test_clobber_r5 +.type abi_test_clobber_r6, %function +.globl abi_test_clobber_r6 +.hidden abi_test_clobber_r6 +.align 4 +abi_test_clobber_r6: + mov r6, #0 + bx lr +.size abi_test_clobber_r6,.-abi_test_clobber_r6 +.type abi_test_clobber_r7, %function +.globl abi_test_clobber_r7 +.hidden abi_test_clobber_r7 +.align 4 +abi_test_clobber_r7: + mov r7, #0 + bx lr +.size abi_test_clobber_r7,.-abi_test_clobber_r7 +.type abi_test_clobber_r8, %function +.globl abi_test_clobber_r8 +.hidden abi_test_clobber_r8 +.align 4 +abi_test_clobber_r8: + mov r8, #0 + bx lr +.size abi_test_clobber_r8,.-abi_test_clobber_r8 +.type abi_test_clobber_r9, %function +.globl abi_test_clobber_r9 +.hidden abi_test_clobber_r9 +.align 4 +abi_test_clobber_r9: + mov r9, #0 + bx lr +.size abi_test_clobber_r9,.-abi_test_clobber_r9 +.type abi_test_clobber_r10, %function +.globl abi_test_clobber_r10 +.hidden abi_test_clobber_r10 +.align 4 +abi_test_clobber_r10: + mov r10, #0 + bx lr +.size abi_test_clobber_r10,.-abi_test_clobber_r10 +.type abi_test_clobber_r11, %function +.globl abi_test_clobber_r11 +.hidden abi_test_clobber_r11 +.align 4 +abi_test_clobber_r11: + mov r11, #0 + bx lr +.size abi_test_clobber_r11,.-abi_test_clobber_r11 +.type abi_test_clobber_r12, %function +.globl abi_test_clobber_r12 +.hidden abi_test_clobber_r12 +.align 4 +abi_test_clobber_r12: + mov r12, #0 + bx lr +.size 
abi_test_clobber_r12,.-abi_test_clobber_r12 +.type abi_test_clobber_d0, %function +.globl abi_test_clobber_d0 +.hidden abi_test_clobber_d0 +.align 4 +abi_test_clobber_d0: + mov r0, #0 + vmov s0, r0 + vmov s1, r0 + bx lr +.size abi_test_clobber_d0,.-abi_test_clobber_d0 +.type abi_test_clobber_d1, %function +.globl abi_test_clobber_d1 +.hidden abi_test_clobber_d1 +.align 4 +abi_test_clobber_d1: + mov r0, #0 + vmov s2, r0 + vmov s3, r0 + bx lr +.size abi_test_clobber_d1,.-abi_test_clobber_d1 +.type abi_test_clobber_d2, %function +.globl abi_test_clobber_d2 +.hidden abi_test_clobber_d2 +.align 4 +abi_test_clobber_d2: + mov r0, #0 + vmov s4, r0 + vmov s5, r0 + bx lr +.size abi_test_clobber_d2,.-abi_test_clobber_d2 +.type abi_test_clobber_d3, %function +.globl abi_test_clobber_d3 +.hidden abi_test_clobber_d3 +.align 4 +abi_test_clobber_d3: + mov r0, #0 + vmov s6, r0 + vmov s7, r0 + bx lr +.size abi_test_clobber_d3,.-abi_test_clobber_d3 +.type abi_test_clobber_d4, %function +.globl abi_test_clobber_d4 +.hidden abi_test_clobber_d4 +.align 4 +abi_test_clobber_d4: + mov r0, #0 + vmov s8, r0 + vmov s9, r0 + bx lr +.size abi_test_clobber_d4,.-abi_test_clobber_d4 +.type abi_test_clobber_d5, %function +.globl abi_test_clobber_d5 +.hidden abi_test_clobber_d5 +.align 4 +abi_test_clobber_d5: + mov r0, #0 + vmov s10, r0 + vmov s11, r0 + bx lr +.size abi_test_clobber_d5,.-abi_test_clobber_d5 +.type abi_test_clobber_d6, %function +.globl abi_test_clobber_d6 +.hidden abi_test_clobber_d6 +.align 4 +abi_test_clobber_d6: + mov r0, #0 + vmov s12, r0 + vmov s13, r0 + bx lr +.size abi_test_clobber_d6,.-abi_test_clobber_d6 +.type abi_test_clobber_d7, %function +.globl abi_test_clobber_d7 +.hidden abi_test_clobber_d7 +.align 4 +abi_test_clobber_d7: + mov r0, #0 + vmov s14, r0 + vmov s15, r0 + bx lr +.size abi_test_clobber_d7,.-abi_test_clobber_d7 +.type abi_test_clobber_d8, %function +.globl abi_test_clobber_d8 +.hidden abi_test_clobber_d8 +.align 4 +abi_test_clobber_d8: + mov r0, #0 + vmov s16, r0 + vmov s17, r0 + bx lr +.size abi_test_clobber_d8,.-abi_test_clobber_d8 +.type abi_test_clobber_d9, %function +.globl abi_test_clobber_d9 +.hidden abi_test_clobber_d9 +.align 4 +abi_test_clobber_d9: + mov r0, #0 + vmov s18, r0 + vmov s19, r0 + bx lr +.size abi_test_clobber_d9,.-abi_test_clobber_d9 +.type abi_test_clobber_d10, %function +.globl abi_test_clobber_d10 +.hidden abi_test_clobber_d10 +.align 4 +abi_test_clobber_d10: + mov r0, #0 + vmov s20, r0 + vmov s21, r0 + bx lr +.size abi_test_clobber_d10,.-abi_test_clobber_d10 +.type abi_test_clobber_d11, %function +.globl abi_test_clobber_d11 +.hidden abi_test_clobber_d11 +.align 4 +abi_test_clobber_d11: + mov r0, #0 + vmov s22, r0 + vmov s23, r0 + bx lr +.size abi_test_clobber_d11,.-abi_test_clobber_d11 +.type abi_test_clobber_d12, %function +.globl abi_test_clobber_d12 +.hidden abi_test_clobber_d12 +.align 4 +abi_test_clobber_d12: + mov r0, #0 + vmov s24, r0 + vmov s25, r0 + bx lr +.size abi_test_clobber_d12,.-abi_test_clobber_d12 +.type abi_test_clobber_d13, %function +.globl abi_test_clobber_d13 +.hidden abi_test_clobber_d13 +.align 4 +abi_test_clobber_d13: + mov r0, #0 + vmov s26, r0 + vmov s27, r0 + bx lr +.size abi_test_clobber_d13,.-abi_test_clobber_d13 +.type abi_test_clobber_d14, %function +.globl abi_test_clobber_d14 +.hidden abi_test_clobber_d14 +.align 4 +abi_test_clobber_d14: + mov r0, #0 + vmov s28, r0 + vmov s29, r0 + bx lr +.size abi_test_clobber_d14,.-abi_test_clobber_d14 +.type abi_test_clobber_d15, %function +.globl abi_test_clobber_d15 +.hidden 
abi_test_clobber_d15 +.align 4 +abi_test_clobber_d15: + mov r0, #0 + vmov s30, r0 + vmov s31, r0 + bx lr +.size abi_test_clobber_d15,.-abi_test_clobber_d15 +#endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-ppc64le/crypto/fipsmodule/aesp8-ppc.S b/packager/third_party/boringssl/linux-ppc64le/crypto/fipsmodule/aesp8-ppc.S new file mode 100644 index 0000000000..86b06fc2ef --- /dev/null +++ b/packager/third_party/boringssl/linux-ppc64le/crypto/fipsmodule/aesp8-ppc.S @@ -0,0 +1,3670 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + +#if !defined(OPENSSL_NO_ASM) && defined(__powerpc64__) +.machine "any" + +.abiversion 2 +.text + +.align 7 +.Lrcon: +.byte 0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01 +.byte 0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x1b +.byte 0x0c,0x0f,0x0e,0x0d,0x0c,0x0f,0x0e,0x0d,0x0c,0x0f,0x0e,0x0d,0x0c,0x0f,0x0e,0x0d +.byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.Lconsts: + mflr 0 + bcl 20,31,$+4 + mflr 6 + addi 6,6,-0x48 + mtlr 0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.byte 65,69,83,32,102,111,114,32,80,111,119,101,114,73,83,65,32,50,46,48,55,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 + +.globl aes_hw_set_encrypt_key +.type aes_hw_set_encrypt_key,@function +.align 5 +aes_hw_set_encrypt_key: +.localentry aes_hw_set_encrypt_key,0 + +.Lset_encrypt_key: + mflr 11 + std 11,16(1) + + li 6,-1 + cmpldi 3,0 + beq- .Lenc_key_abort + cmpldi 5,0 + beq- .Lenc_key_abort + li 6,-2 + cmpwi 4,128 + blt- .Lenc_key_abort + cmpwi 4,256 + bgt- .Lenc_key_abort + andi. 
0,4,0x3f + bne- .Lenc_key_abort + + lis 0,0xfff0 + li 12,-1 + or 0,0,0 + + bl .Lconsts + mtlr 11 + + neg 9,3 + lvx 1,0,3 + addi 3,3,15 + lvsr 3,0,9 + li 8,0x20 + cmpwi 4,192 + lvx 2,0,3 + vspltisb 5,0x0f + lvx 4,0,6 + vxor 3,3,5 + lvx 5,8,6 + addi 6,6,0x10 + vperm 1,1,2,3 + li 7,8 + vxor 0,0,0 + mtctr 7 + + lvsl 8,0,5 + vspltisb 9,-1 + lvx 10,0,5 + vperm 9,9,0,8 + + blt .Loop128 + addi 3,3,8 + beq .L192 + addi 3,3,8 + b .L256 + +.align 4 +.Loop128: + vperm 3,1,1,5 + vsldoi 6,0,1,12 + vperm 11,1,1,8 + vsel 7,10,11,9 + vor 10,11,11 + .long 0x10632509 + stvx 7,0,5 + addi 5,5,16 + + vxor 1,1,6 + vsldoi 6,0,6,12 + vxor 1,1,6 + vsldoi 6,0,6,12 + vxor 1,1,6 + vadduwm 4,4,4 + vxor 1,1,3 + bdnz .Loop128 + + lvx 4,0,6 + + vperm 3,1,1,5 + vsldoi 6,0,1,12 + vperm 11,1,1,8 + vsel 7,10,11,9 + vor 10,11,11 + .long 0x10632509 + stvx 7,0,5 + addi 5,5,16 + + vxor 1,1,6 + vsldoi 6,0,6,12 + vxor 1,1,6 + vsldoi 6,0,6,12 + vxor 1,1,6 + vadduwm 4,4,4 + vxor 1,1,3 + + vperm 3,1,1,5 + vsldoi 6,0,1,12 + vperm 11,1,1,8 + vsel 7,10,11,9 + vor 10,11,11 + .long 0x10632509 + stvx 7,0,5 + addi 5,5,16 + + vxor 1,1,6 + vsldoi 6,0,6,12 + vxor 1,1,6 + vsldoi 6,0,6,12 + vxor 1,1,6 + vxor 1,1,3 + vperm 11,1,1,8 + vsel 7,10,11,9 + vor 10,11,11 + stvx 7,0,5 + + addi 3,5,15 + addi 5,5,0x50 + + li 8,10 + b .Ldone + +.align 4 +.L192: + lvx 6,0,3 + li 7,4 + vperm 11,1,1,8 + vsel 7,10,11,9 + vor 10,11,11 + stvx 7,0,5 + addi 5,5,16 + vperm 2,2,6,3 + vspltisb 3,8 + mtctr 7 + vsububm 5,5,3 + +.Loop192: + vperm 3,2,2,5 + vsldoi 6,0,1,12 + .long 0x10632509 + + vxor 1,1,6 + vsldoi 6,0,6,12 + vxor 1,1,6 + vsldoi 6,0,6,12 + vxor 1,1,6 + + vsldoi 7,0,2,8 + vspltw 6,1,3 + vxor 6,6,2 + vsldoi 2,0,2,12 + vadduwm 4,4,4 + vxor 2,2,6 + vxor 1,1,3 + vxor 2,2,3 + vsldoi 7,7,1,8 + + vperm 3,2,2,5 + vsldoi 6,0,1,12 + vperm 11,7,7,8 + vsel 7,10,11,9 + vor 10,11,11 + .long 0x10632509 + stvx 7,0,5 + addi 5,5,16 + + vsldoi 7,1,2,8 + vxor 1,1,6 + vsldoi 6,0,6,12 + vperm 11,7,7,8 + vsel 7,10,11,9 + vor 10,11,11 + vxor 1,1,6 + vsldoi 6,0,6,12 + vxor 1,1,6 + stvx 7,0,5 + addi 5,5,16 + + vspltw 6,1,3 + vxor 6,6,2 + vsldoi 2,0,2,12 + vadduwm 4,4,4 + vxor 2,2,6 + vxor 1,1,3 + vxor 2,2,3 + vperm 11,1,1,8 + vsel 7,10,11,9 + vor 10,11,11 + stvx 7,0,5 + addi 3,5,15 + addi 5,5,16 + bdnz .Loop192 + + li 8,12 + addi 5,5,0x20 + b .Ldone + +.align 4 +.L256: + lvx 6,0,3 + li 7,7 + li 8,14 + vperm 11,1,1,8 + vsel 7,10,11,9 + vor 10,11,11 + stvx 7,0,5 + addi 5,5,16 + vperm 2,2,6,3 + mtctr 7 + +.Loop256: + vperm 3,2,2,5 + vsldoi 6,0,1,12 + vperm 11,2,2,8 + vsel 7,10,11,9 + vor 10,11,11 + .long 0x10632509 + stvx 7,0,5 + addi 5,5,16 + + vxor 1,1,6 + vsldoi 6,0,6,12 + vxor 1,1,6 + vsldoi 6,0,6,12 + vxor 1,1,6 + vadduwm 4,4,4 + vxor 1,1,3 + vperm 11,1,1,8 + vsel 7,10,11,9 + vor 10,11,11 + stvx 7,0,5 + addi 3,5,15 + addi 5,5,16 + bdz .Ldone + + vspltw 3,1,3 + vsldoi 6,0,2,12 + .long 0x106305C8 + + vxor 2,2,6 + vsldoi 6,0,6,12 + vxor 2,2,6 + vsldoi 6,0,6,12 + vxor 2,2,6 + + vxor 2,2,3 + b .Loop256 + +.align 4 +.Ldone: + lvx 2,0,3 + vsel 2,10,2,9 + stvx 2,0,3 + li 6,0 + or 12,12,12 + stw 8,0(5) + +.Lenc_key_abort: + mr 3,6 + blr +.long 0 +.byte 0,12,0x14,1,0,0,3,0 +.long 0 +.size aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key + +.globl aes_hw_set_decrypt_key +.type aes_hw_set_decrypt_key,@function +.align 5 +aes_hw_set_decrypt_key: +.localentry aes_hw_set_decrypt_key,0 + + stdu 1,-64(1) + mflr 10 + std 10,80(1) + bl .Lset_encrypt_key + mtlr 10 + + cmpwi 3,0 + bne- .Ldec_key_abort + + slwi 7,8,4 + subi 3,5,240 + srwi 8,8,1 + add 5,3,7 + mtctr 8 + +.Ldeckey: + lwz 0, 0(3) + lwz 6, 
4(3) + lwz 7, 8(3) + lwz 8, 12(3) + addi 3,3,16 + lwz 9, 0(5) + lwz 10,4(5) + lwz 11,8(5) + lwz 12,12(5) + stw 0, 0(5) + stw 6, 4(5) + stw 7, 8(5) + stw 8, 12(5) + subi 5,5,16 + stw 9, -16(3) + stw 10,-12(3) + stw 11,-8(3) + stw 12,-4(3) + bdnz .Ldeckey + + xor 3,3,3 +.Ldec_key_abort: + addi 1,1,64 + blr +.long 0 +.byte 0,12,4,1,0x80,0,3,0 +.long 0 +.size aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key +.globl aes_hw_encrypt +.type aes_hw_encrypt,@function +.align 5 +aes_hw_encrypt: +.localentry aes_hw_encrypt,0 + + lwz 6,240(5) + lis 0,0xfc00 + li 12,-1 + li 7,15 + or 0,0,0 + + lvx 0,0,3 + neg 11,4 + lvx 1,7,3 + lvsl 2,0,3 + vspltisb 4,0x0f + lvsr 3,0,11 + vxor 2,2,4 + li 7,16 + vperm 0,0,1,2 + lvx 1,0,5 + lvsr 5,0,5 + srwi 6,6,1 + lvx 2,7,5 + addi 7,7,16 + subi 6,6,1 + vperm 1,2,1,5 + + vxor 0,0,1 + lvx 1,7,5 + addi 7,7,16 + mtctr 6 + +.Loop_enc: + vperm 2,1,2,5 + .long 0x10001508 + lvx 2,7,5 + addi 7,7,16 + vperm 1,2,1,5 + .long 0x10000D08 + lvx 1,7,5 + addi 7,7,16 + bdnz .Loop_enc + + vperm 2,1,2,5 + .long 0x10001508 + lvx 2,7,5 + vperm 1,2,1,5 + .long 0x10000D09 + + vspltisb 2,-1 + vxor 1,1,1 + li 7,15 + vperm 2,2,1,3 + vxor 3,3,4 + lvx 1,0,4 + vperm 0,0,0,3 + vsel 1,1,0,2 + lvx 4,7,4 + stvx 1,0,4 + vsel 0,0,4,2 + stvx 0,7,4 + + or 12,12,12 + blr +.long 0 +.byte 0,12,0x14,0,0,0,3,0 +.long 0 +.size aes_hw_encrypt,.-aes_hw_encrypt +.globl aes_hw_decrypt +.type aes_hw_decrypt,@function +.align 5 +aes_hw_decrypt: +.localentry aes_hw_decrypt,0 + + lwz 6,240(5) + lis 0,0xfc00 + li 12,-1 + li 7,15 + or 0,0,0 + + lvx 0,0,3 + neg 11,4 + lvx 1,7,3 + lvsl 2,0,3 + vspltisb 4,0x0f + lvsr 3,0,11 + vxor 2,2,4 + li 7,16 + vperm 0,0,1,2 + lvx 1,0,5 + lvsr 5,0,5 + srwi 6,6,1 + lvx 2,7,5 + addi 7,7,16 + subi 6,6,1 + vperm 1,2,1,5 + + vxor 0,0,1 + lvx 1,7,5 + addi 7,7,16 + mtctr 6 + +.Loop_dec: + vperm 2,1,2,5 + .long 0x10001548 + lvx 2,7,5 + addi 7,7,16 + vperm 1,2,1,5 + .long 0x10000D48 + lvx 1,7,5 + addi 7,7,16 + bdnz .Loop_dec + + vperm 2,1,2,5 + .long 0x10001548 + lvx 2,7,5 + vperm 1,2,1,5 + .long 0x10000D49 + + vspltisb 2,-1 + vxor 1,1,1 + li 7,15 + vperm 2,2,1,3 + vxor 3,3,4 + lvx 1,0,4 + vperm 0,0,0,3 + vsel 1,1,0,2 + lvx 4,7,4 + stvx 1,0,4 + vsel 0,0,4,2 + stvx 0,7,4 + + or 12,12,12 + blr +.long 0 +.byte 0,12,0x14,0,0,0,3,0 +.long 0 +.size aes_hw_decrypt,.-aes_hw_decrypt +.globl aes_hw_cbc_encrypt +.type aes_hw_cbc_encrypt,@function +.align 5 +aes_hw_cbc_encrypt: +.localentry aes_hw_cbc_encrypt,0 + + cmpldi 5,16 + .long 0x4dc00020 + + cmpwi 8,0 + lis 0,0xffe0 + li 12,-1 + or 0,0,0 + + li 10,15 + vxor 0,0,0 + vspltisb 3,0x0f + + lvx 4,0,7 + lvsl 6,0,7 + lvx 5,10,7 + vxor 6,6,3 + vperm 4,4,5,6 + + neg 11,3 + lvsr 10,0,6 + lwz 9,240(6) + + lvsr 6,0,11 + lvx 5,0,3 + addi 3,3,15 + vxor 6,6,3 + + lvsl 8,0,4 + vspltisb 9,-1 + lvx 7,0,4 + vperm 9,9,0,8 + vxor 8,8,3 + + srwi 9,9,1 + li 10,16 + subi 9,9,1 + beq .Lcbc_dec + +.Lcbc_enc: + vor 2,5,5 + lvx 5,0,3 + addi 3,3,16 + mtctr 9 + subi 5,5,16 + + lvx 0,0,6 + vperm 2,2,5,6 + lvx 1,10,6 + addi 10,10,16 + vperm 0,1,0,10 + vxor 2,2,0 + lvx 0,10,6 + addi 10,10,16 + vxor 2,2,4 + +.Loop_cbc_enc: + vperm 1,0,1,10 + .long 0x10420D08 + lvx 1,10,6 + addi 10,10,16 + vperm 0,1,0,10 + .long 0x10420508 + lvx 0,10,6 + addi 10,10,16 + bdnz .Loop_cbc_enc + + vperm 1,0,1,10 + .long 0x10420D08 + lvx 1,10,6 + li 10,16 + vperm 0,1,0,10 + .long 0x10820509 + cmpldi 5,16 + + vperm 3,4,4,8 + vsel 2,7,3,9 + vor 7,3,3 + stvx 2,0,4 + addi 4,4,16 + bge .Lcbc_enc + + b .Lcbc_done + +.align 4 +.Lcbc_dec: + cmpldi 5,128 + bge _aesp8_cbc_decrypt8x + vor 3,5,5 + lvx 5,0,3 + addi 3,3,16 
+ mtctr 9 + subi 5,5,16 + + lvx 0,0,6 + vperm 3,3,5,6 + lvx 1,10,6 + addi 10,10,16 + vperm 0,1,0,10 + vxor 2,3,0 + lvx 0,10,6 + addi 10,10,16 + +.Loop_cbc_dec: + vperm 1,0,1,10 + .long 0x10420D48 + lvx 1,10,6 + addi 10,10,16 + vperm 0,1,0,10 + .long 0x10420548 + lvx 0,10,6 + addi 10,10,16 + bdnz .Loop_cbc_dec + + vperm 1,0,1,10 + .long 0x10420D48 + lvx 1,10,6 + li 10,16 + vperm 0,1,0,10 + .long 0x10420549 + cmpldi 5,16 + + vxor 2,2,4 + vor 4,3,3 + vperm 3,2,2,8 + vsel 2,7,3,9 + vor 7,3,3 + stvx 2,0,4 + addi 4,4,16 + bge .Lcbc_dec + +.Lcbc_done: + addi 4,4,-1 + lvx 2,0,4 + vsel 2,7,2,9 + stvx 2,0,4 + + neg 8,7 + li 10,15 + vxor 0,0,0 + vspltisb 9,-1 + vspltisb 3,0x0f + lvsr 8,0,8 + vperm 9,9,0,8 + vxor 8,8,3 + lvx 7,0,7 + vperm 4,4,4,8 + vsel 2,7,4,9 + lvx 5,10,7 + stvx 2,0,7 + vsel 2,4,5,9 + stvx 2,10,7 + + or 12,12,12 + blr +.long 0 +.byte 0,12,0x14,0,0,0,6,0 +.long 0 +.align 5 +_aesp8_cbc_decrypt8x: + stdu 1,-448(1) + li 10,207 + li 11,223 + stvx 20,10,1 + addi 10,10,32 + stvx 21,11,1 + addi 11,11,32 + stvx 22,10,1 + addi 10,10,32 + stvx 23,11,1 + addi 11,11,32 + stvx 24,10,1 + addi 10,10,32 + stvx 25,11,1 + addi 11,11,32 + stvx 26,10,1 + addi 10,10,32 + stvx 27,11,1 + addi 11,11,32 + stvx 28,10,1 + addi 10,10,32 + stvx 29,11,1 + addi 11,11,32 + stvx 30,10,1 + stvx 31,11,1 + li 0,-1 + stw 12,396(1) + li 8,0x10 + std 26,400(1) + li 26,0x20 + std 27,408(1) + li 27,0x30 + std 28,416(1) + li 28,0x40 + std 29,424(1) + li 29,0x50 + std 30,432(1) + li 30,0x60 + std 31,440(1) + li 31,0x70 + or 0,0,0 + + subi 9,9,3 + subi 5,5,128 + + lvx 23,0,6 + lvx 30,8,6 + addi 6,6,0x20 + lvx 31,0,6 + vperm 23,30,23,10 + addi 11,1,79 + mtctr 9 + +.Load_cbc_dec_key: + vperm 24,31,30,10 + lvx 30,8,6 + addi 6,6,0x20 + stvx 24,0,11 + vperm 25,30,31,10 + lvx 31,0,6 + stvx 25,8,11 + addi 11,11,0x20 + bdnz .Load_cbc_dec_key + + lvx 26,8,6 + vperm 24,31,30,10 + lvx 27,26,6 + stvx 24,0,11 + vperm 25,26,31,10 + lvx 28,27,6 + stvx 25,8,11 + addi 11,1,79 + vperm 26,27,26,10 + lvx 29,28,6 + vperm 27,28,27,10 + lvx 30,29,6 + vperm 28,29,28,10 + lvx 31,30,6 + vperm 29,30,29,10 + lvx 14,31,6 + vperm 30,31,30,10 + lvx 24,0,11 + vperm 31,14,31,10 + lvx 25,8,11 + + + + subi 3,3,15 + + li 10,8 + .long 0x7C001E99 + lvsl 6,0,10 + vspltisb 3,0x0f + .long 0x7C281E99 + vxor 6,6,3 + .long 0x7C5A1E99 + vperm 0,0,0,6 + .long 0x7C7B1E99 + vperm 1,1,1,6 + .long 0x7D5C1E99 + vperm 2,2,2,6 + vxor 14,0,23 + .long 0x7D7D1E99 + vperm 3,3,3,6 + vxor 15,1,23 + .long 0x7D9E1E99 + vperm 10,10,10,6 + vxor 16,2,23 + .long 0x7DBF1E99 + addi 3,3,0x80 + vperm 11,11,11,6 + vxor 17,3,23 + vperm 12,12,12,6 + vxor 18,10,23 + vperm 13,13,13,6 + vxor 19,11,23 + vxor 20,12,23 + vxor 21,13,23 + + mtctr 9 + b .Loop_cbc_dec8x +.align 5 +.Loop_cbc_dec8x: + .long 0x11CEC548 + .long 0x11EFC548 + .long 0x1210C548 + .long 0x1231C548 + .long 0x1252C548 + .long 0x1273C548 + .long 0x1294C548 + .long 0x12B5C548 + lvx 24,26,11 + addi 11,11,0x20 + + .long 0x11CECD48 + .long 0x11EFCD48 + .long 0x1210CD48 + .long 0x1231CD48 + .long 0x1252CD48 + .long 0x1273CD48 + .long 0x1294CD48 + .long 0x12B5CD48 + lvx 25,8,11 + bdnz .Loop_cbc_dec8x + + subic 5,5,128 + .long 0x11CEC548 + .long 0x11EFC548 + .long 0x1210C548 + .long 0x1231C548 + .long 0x1252C548 + .long 0x1273C548 + .long 0x1294C548 + .long 0x12B5C548 + + subfe. 
0,0,0 + .long 0x11CECD48 + .long 0x11EFCD48 + .long 0x1210CD48 + .long 0x1231CD48 + .long 0x1252CD48 + .long 0x1273CD48 + .long 0x1294CD48 + .long 0x12B5CD48 + + and 0,0,5 + .long 0x11CED548 + .long 0x11EFD548 + .long 0x1210D548 + .long 0x1231D548 + .long 0x1252D548 + .long 0x1273D548 + .long 0x1294D548 + .long 0x12B5D548 + + add 3,3,0 + + + + .long 0x11CEDD48 + .long 0x11EFDD48 + .long 0x1210DD48 + .long 0x1231DD48 + .long 0x1252DD48 + .long 0x1273DD48 + .long 0x1294DD48 + .long 0x12B5DD48 + + addi 11,1,79 + .long 0x11CEE548 + .long 0x11EFE548 + .long 0x1210E548 + .long 0x1231E548 + .long 0x1252E548 + .long 0x1273E548 + .long 0x1294E548 + .long 0x12B5E548 + lvx 24,0,11 + + .long 0x11CEED48 + .long 0x11EFED48 + .long 0x1210ED48 + .long 0x1231ED48 + .long 0x1252ED48 + .long 0x1273ED48 + .long 0x1294ED48 + .long 0x12B5ED48 + lvx 25,8,11 + + .long 0x11CEF548 + vxor 4,4,31 + .long 0x11EFF548 + vxor 0,0,31 + .long 0x1210F548 + vxor 1,1,31 + .long 0x1231F548 + vxor 2,2,31 + .long 0x1252F548 + vxor 3,3,31 + .long 0x1273F548 + vxor 10,10,31 + .long 0x1294F548 + vxor 11,11,31 + .long 0x12B5F548 + vxor 12,12,31 + + .long 0x11CE2549 + .long 0x11EF0549 + .long 0x7C001E99 + .long 0x12100D49 + .long 0x7C281E99 + .long 0x12311549 + vperm 0,0,0,6 + .long 0x7C5A1E99 + .long 0x12521D49 + vperm 1,1,1,6 + .long 0x7C7B1E99 + .long 0x12735549 + vperm 2,2,2,6 + .long 0x7D5C1E99 + .long 0x12945D49 + vperm 3,3,3,6 + .long 0x7D7D1E99 + .long 0x12B56549 + vperm 10,10,10,6 + .long 0x7D9E1E99 + vor 4,13,13 + vperm 11,11,11,6 + .long 0x7DBF1E99 + addi 3,3,0x80 + + vperm 14,14,14,6 + vperm 15,15,15,6 + .long 0x7DC02799 + vperm 12,12,12,6 + vxor 14,0,23 + vperm 16,16,16,6 + .long 0x7DE82799 + vperm 13,13,13,6 + vxor 15,1,23 + vperm 17,17,17,6 + .long 0x7E1A2799 + vxor 16,2,23 + vperm 18,18,18,6 + .long 0x7E3B2799 + vxor 17,3,23 + vperm 19,19,19,6 + .long 0x7E5C2799 + vxor 18,10,23 + vperm 20,20,20,6 + .long 0x7E7D2799 + vxor 19,11,23 + vperm 21,21,21,6 + .long 0x7E9E2799 + vxor 20,12,23 + .long 0x7EBF2799 + addi 4,4,0x80 + vxor 21,13,23 + + mtctr 9 + beq .Loop_cbc_dec8x + + addic. 
5,5,128 + beq .Lcbc_dec8x_done + nop + nop + +.Loop_cbc_dec8x_tail: + .long 0x11EFC548 + .long 0x1210C548 + .long 0x1231C548 + .long 0x1252C548 + .long 0x1273C548 + .long 0x1294C548 + .long 0x12B5C548 + lvx 24,26,11 + addi 11,11,0x20 + + .long 0x11EFCD48 + .long 0x1210CD48 + .long 0x1231CD48 + .long 0x1252CD48 + .long 0x1273CD48 + .long 0x1294CD48 + .long 0x12B5CD48 + lvx 25,8,11 + bdnz .Loop_cbc_dec8x_tail + + .long 0x11EFC548 + .long 0x1210C548 + .long 0x1231C548 + .long 0x1252C548 + .long 0x1273C548 + .long 0x1294C548 + .long 0x12B5C548 + + .long 0x11EFCD48 + .long 0x1210CD48 + .long 0x1231CD48 + .long 0x1252CD48 + .long 0x1273CD48 + .long 0x1294CD48 + .long 0x12B5CD48 + + .long 0x11EFD548 + .long 0x1210D548 + .long 0x1231D548 + .long 0x1252D548 + .long 0x1273D548 + .long 0x1294D548 + .long 0x12B5D548 + + .long 0x11EFDD48 + .long 0x1210DD48 + .long 0x1231DD48 + .long 0x1252DD48 + .long 0x1273DD48 + .long 0x1294DD48 + .long 0x12B5DD48 + + .long 0x11EFE548 + .long 0x1210E548 + .long 0x1231E548 + .long 0x1252E548 + .long 0x1273E548 + .long 0x1294E548 + .long 0x12B5E548 + + .long 0x11EFED48 + .long 0x1210ED48 + .long 0x1231ED48 + .long 0x1252ED48 + .long 0x1273ED48 + .long 0x1294ED48 + .long 0x12B5ED48 + + .long 0x11EFF548 + vxor 4,4,31 + .long 0x1210F548 + vxor 1,1,31 + .long 0x1231F548 + vxor 2,2,31 + .long 0x1252F548 + vxor 3,3,31 + .long 0x1273F548 + vxor 10,10,31 + .long 0x1294F548 + vxor 11,11,31 + .long 0x12B5F548 + vxor 12,12,31 + + cmplwi 5,32 + blt .Lcbc_dec8x_one + nop + beq .Lcbc_dec8x_two + cmplwi 5,64 + blt .Lcbc_dec8x_three + nop + beq .Lcbc_dec8x_four + cmplwi 5,96 + blt .Lcbc_dec8x_five + nop + beq .Lcbc_dec8x_six + +.Lcbc_dec8x_seven: + .long 0x11EF2549 + .long 0x12100D49 + .long 0x12311549 + .long 0x12521D49 + .long 0x12735549 + .long 0x12945D49 + .long 0x12B56549 + vor 4,13,13 + + vperm 15,15,15,6 + vperm 16,16,16,6 + .long 0x7DE02799 + vperm 17,17,17,6 + .long 0x7E082799 + vperm 18,18,18,6 + .long 0x7E3A2799 + vperm 19,19,19,6 + .long 0x7E5B2799 + vperm 20,20,20,6 + .long 0x7E7C2799 + vperm 21,21,21,6 + .long 0x7E9D2799 + .long 0x7EBE2799 + addi 4,4,0x70 + b .Lcbc_dec8x_done + +.align 5 +.Lcbc_dec8x_six: + .long 0x12102549 + .long 0x12311549 + .long 0x12521D49 + .long 0x12735549 + .long 0x12945D49 + .long 0x12B56549 + vor 4,13,13 + + vperm 16,16,16,6 + vperm 17,17,17,6 + .long 0x7E002799 + vperm 18,18,18,6 + .long 0x7E282799 + vperm 19,19,19,6 + .long 0x7E5A2799 + vperm 20,20,20,6 + .long 0x7E7B2799 + vperm 21,21,21,6 + .long 0x7E9C2799 + .long 0x7EBD2799 + addi 4,4,0x60 + b .Lcbc_dec8x_done + +.align 5 +.Lcbc_dec8x_five: + .long 0x12312549 + .long 0x12521D49 + .long 0x12735549 + .long 0x12945D49 + .long 0x12B56549 + vor 4,13,13 + + vperm 17,17,17,6 + vperm 18,18,18,6 + .long 0x7E202799 + vperm 19,19,19,6 + .long 0x7E482799 + vperm 20,20,20,6 + .long 0x7E7A2799 + vperm 21,21,21,6 + .long 0x7E9B2799 + .long 0x7EBC2799 + addi 4,4,0x50 + b .Lcbc_dec8x_done + +.align 5 +.Lcbc_dec8x_four: + .long 0x12522549 + .long 0x12735549 + .long 0x12945D49 + .long 0x12B56549 + vor 4,13,13 + + vperm 18,18,18,6 + vperm 19,19,19,6 + .long 0x7E402799 + vperm 20,20,20,6 + .long 0x7E682799 + vperm 21,21,21,6 + .long 0x7E9A2799 + .long 0x7EBB2799 + addi 4,4,0x40 + b .Lcbc_dec8x_done + +.align 5 +.Lcbc_dec8x_three: + .long 0x12732549 + .long 0x12945D49 + .long 0x12B56549 + vor 4,13,13 + + vperm 19,19,19,6 + vperm 20,20,20,6 + .long 0x7E602799 + vperm 21,21,21,6 + .long 0x7E882799 + .long 0x7EBA2799 + addi 4,4,0x30 + b .Lcbc_dec8x_done + +.align 5 +.Lcbc_dec8x_two: + .long 0x12942549 + .long 
0x12B56549 + vor 4,13,13 + + vperm 20,20,20,6 + vperm 21,21,21,6 + .long 0x7E802799 + .long 0x7EA82799 + addi 4,4,0x20 + b .Lcbc_dec8x_done + +.align 5 +.Lcbc_dec8x_one: + .long 0x12B52549 + vor 4,13,13 + + vperm 21,21,21,6 + .long 0x7EA02799 + addi 4,4,0x10 + +.Lcbc_dec8x_done: + vperm 4,4,4,6 + .long 0x7C803F99 + + li 10,79 + li 11,95 + stvx 6,10,1 + addi 10,10,32 + stvx 6,11,1 + addi 11,11,32 + stvx 6,10,1 + addi 10,10,32 + stvx 6,11,1 + addi 11,11,32 + stvx 6,10,1 + addi 10,10,32 + stvx 6,11,1 + addi 11,11,32 + stvx 6,10,1 + addi 10,10,32 + stvx 6,11,1 + addi 11,11,32 + + or 12,12,12 + lvx 20,10,1 + addi 10,10,32 + lvx 21,11,1 + addi 11,11,32 + lvx 22,10,1 + addi 10,10,32 + lvx 23,11,1 + addi 11,11,32 + lvx 24,10,1 + addi 10,10,32 + lvx 25,11,1 + addi 11,11,32 + lvx 26,10,1 + addi 10,10,32 + lvx 27,11,1 + addi 11,11,32 + lvx 28,10,1 + addi 10,10,32 + lvx 29,11,1 + addi 11,11,32 + lvx 30,10,1 + lvx 31,11,1 + ld 26,400(1) + ld 27,408(1) + ld 28,416(1) + ld 29,424(1) + ld 30,432(1) + ld 31,440(1) + addi 1,1,448 + blr +.long 0 +.byte 0,12,0x04,0,0x80,6,6,0 +.long 0 +.size aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt +.globl aes_hw_ctr32_encrypt_blocks +.type aes_hw_ctr32_encrypt_blocks,@function +.align 5 +aes_hw_ctr32_encrypt_blocks: +.localentry aes_hw_ctr32_encrypt_blocks,0 + + cmpldi 5,1 + .long 0x4dc00020 + + lis 0,0xfff0 + li 12,-1 + or 0,0,0 + + li 10,15 + vxor 0,0,0 + vspltisb 3,0x0f + + lvx 4,0,7 + lvsl 6,0,7 + lvx 5,10,7 + vspltisb 11,1 + vxor 6,6,3 + vperm 4,4,5,6 + vsldoi 11,0,11,1 + + neg 11,3 + lvsr 10,0,6 + lwz 9,240(6) + + lvsr 6,0,11 + lvx 5,0,3 + addi 3,3,15 + vxor 6,6,3 + + srwi 9,9,1 + li 10,16 + subi 9,9,1 + + cmpldi 5,8 + bge _aesp8_ctr32_encrypt8x + + lvsl 8,0,4 + vspltisb 9,-1 + lvx 7,0,4 + vperm 9,9,0,8 + vxor 8,8,3 + + lvx 0,0,6 + mtctr 9 + lvx 1,10,6 + addi 10,10,16 + vperm 0,1,0,10 + vxor 2,4,0 + lvx 0,10,6 + addi 10,10,16 + b .Loop_ctr32_enc + +.align 5 +.Loop_ctr32_enc: + vperm 1,0,1,10 + .long 0x10420D08 + lvx 1,10,6 + addi 10,10,16 + vperm 0,1,0,10 + .long 0x10420508 + lvx 0,10,6 + addi 10,10,16 + bdnz .Loop_ctr32_enc + + vadduwm 4,4,11 + vor 3,5,5 + lvx 5,0,3 + addi 3,3,16 + subic. 
5,5,1 + + vperm 1,0,1,10 + .long 0x10420D08 + lvx 1,10,6 + vperm 3,3,5,6 + li 10,16 + vperm 1,1,0,10 + lvx 0,0,6 + vxor 3,3,1 + .long 0x10421D09 + + lvx 1,10,6 + addi 10,10,16 + vperm 2,2,2,8 + vsel 3,7,2,9 + mtctr 9 + vperm 0,1,0,10 + vor 7,2,2 + vxor 2,4,0 + lvx 0,10,6 + addi 10,10,16 + stvx 3,0,4 + addi 4,4,16 + bne .Loop_ctr32_enc + + addi 4,4,-1 + lvx 2,0,4 + vsel 2,7,2,9 + stvx 2,0,4 + + or 12,12,12 + blr +.long 0 +.byte 0,12,0x14,0,0,0,6,0 +.long 0 +.align 5 +_aesp8_ctr32_encrypt8x: + stdu 1,-448(1) + li 10,207 + li 11,223 + stvx 20,10,1 + addi 10,10,32 + stvx 21,11,1 + addi 11,11,32 + stvx 22,10,1 + addi 10,10,32 + stvx 23,11,1 + addi 11,11,32 + stvx 24,10,1 + addi 10,10,32 + stvx 25,11,1 + addi 11,11,32 + stvx 26,10,1 + addi 10,10,32 + stvx 27,11,1 + addi 11,11,32 + stvx 28,10,1 + addi 10,10,32 + stvx 29,11,1 + addi 11,11,32 + stvx 30,10,1 + stvx 31,11,1 + li 0,-1 + stw 12,396(1) + li 8,0x10 + std 26,400(1) + li 26,0x20 + std 27,408(1) + li 27,0x30 + std 28,416(1) + li 28,0x40 + std 29,424(1) + li 29,0x50 + std 30,432(1) + li 30,0x60 + std 31,440(1) + li 31,0x70 + or 0,0,0 + + subi 9,9,3 + + lvx 23,0,6 + lvx 30,8,6 + addi 6,6,0x20 + lvx 31,0,6 + vperm 23,30,23,10 + addi 11,1,79 + mtctr 9 + +.Load_ctr32_enc_key: + vperm 24,31,30,10 + lvx 30,8,6 + addi 6,6,0x20 + stvx 24,0,11 + vperm 25,30,31,10 + lvx 31,0,6 + stvx 25,8,11 + addi 11,11,0x20 + bdnz .Load_ctr32_enc_key + + lvx 26,8,6 + vperm 24,31,30,10 + lvx 27,26,6 + stvx 24,0,11 + vperm 25,26,31,10 + lvx 28,27,6 + stvx 25,8,11 + addi 11,1,79 + vperm 26,27,26,10 + lvx 29,28,6 + vperm 27,28,27,10 + lvx 30,29,6 + vperm 28,29,28,10 + lvx 31,30,6 + vperm 29,30,29,10 + lvx 15,31,6 + vperm 30,31,30,10 + lvx 24,0,11 + vperm 31,15,31,10 + lvx 25,8,11 + + vadduwm 7,11,11 + subi 3,3,15 + sldi 5,5,4 + + vadduwm 16,4,11 + vadduwm 17,4,7 + vxor 15,4,23 + li 10,8 + vadduwm 18,16,7 + vxor 16,16,23 + lvsl 6,0,10 + vadduwm 19,17,7 + vxor 17,17,23 + vspltisb 3,0x0f + vadduwm 20,18,7 + vxor 18,18,23 + vxor 6,6,3 + vadduwm 21,19,7 + vxor 19,19,23 + vadduwm 22,20,7 + vxor 20,20,23 + vadduwm 4,21,7 + vxor 21,21,23 + vxor 22,22,23 + + mtctr 9 + b .Loop_ctr32_enc8x +.align 5 +.Loop_ctr32_enc8x: + .long 0x11EFC508 + .long 0x1210C508 + .long 0x1231C508 + .long 0x1252C508 + .long 0x1273C508 + .long 0x1294C508 + .long 0x12B5C508 + .long 0x12D6C508 +.Loop_ctr32_enc8x_middle: + lvx 24,26,11 + addi 11,11,0x20 + + .long 0x11EFCD08 + .long 0x1210CD08 + .long 0x1231CD08 + .long 0x1252CD08 + .long 0x1273CD08 + .long 0x1294CD08 + .long 0x12B5CD08 + .long 0x12D6CD08 + lvx 25,8,11 + bdnz .Loop_ctr32_enc8x + + subic 11,5,256 + .long 0x11EFC508 + .long 0x1210C508 + .long 0x1231C508 + .long 0x1252C508 + .long 0x1273C508 + .long 0x1294C508 + .long 0x12B5C508 + .long 0x12D6C508 + + subfe 0,0,0 + .long 0x11EFCD08 + .long 0x1210CD08 + .long 0x1231CD08 + .long 0x1252CD08 + .long 0x1273CD08 + .long 0x1294CD08 + .long 0x12B5CD08 + .long 0x12D6CD08 + + and 0,0,11 + addi 11,1,79 + .long 0x11EFD508 + .long 0x1210D508 + .long 0x1231D508 + .long 0x1252D508 + .long 0x1273D508 + .long 0x1294D508 + .long 0x12B5D508 + .long 0x12D6D508 + lvx 24,0,11 + + subic 5,5,129 + .long 0x11EFDD08 + addi 5,5,1 + .long 0x1210DD08 + .long 0x1231DD08 + .long 0x1252DD08 + .long 0x1273DD08 + .long 0x1294DD08 + .long 0x12B5DD08 + .long 0x12D6DD08 + lvx 25,8,11 + + .long 0x11EFE508 + .long 0x7C001E99 + .long 0x1210E508 + .long 0x7C281E99 + .long 0x1231E508 + .long 0x7C5A1E99 + .long 0x1252E508 + .long 0x7C7B1E99 + .long 0x1273E508 + .long 0x7D5C1E99 + .long 0x1294E508 + .long 0x7D9D1E99 + .long 0x12B5E508 + 
.long 0x7DBE1E99 + .long 0x12D6E508 + .long 0x7DDF1E99 + addi 3,3,0x80 + + .long 0x11EFED08 + vperm 0,0,0,6 + .long 0x1210ED08 + vperm 1,1,1,6 + .long 0x1231ED08 + vperm 2,2,2,6 + .long 0x1252ED08 + vperm 3,3,3,6 + .long 0x1273ED08 + vperm 10,10,10,6 + .long 0x1294ED08 + vperm 12,12,12,6 + .long 0x12B5ED08 + vperm 13,13,13,6 + .long 0x12D6ED08 + vperm 14,14,14,6 + + add 3,3,0 + + + + subfe. 0,0,0 + .long 0x11EFF508 + vxor 0,0,31 + .long 0x1210F508 + vxor 1,1,31 + .long 0x1231F508 + vxor 2,2,31 + .long 0x1252F508 + vxor 3,3,31 + .long 0x1273F508 + vxor 10,10,31 + .long 0x1294F508 + vxor 12,12,31 + .long 0x12B5F508 + vxor 13,13,31 + .long 0x12D6F508 + vxor 14,14,31 + + bne .Lctr32_enc8x_break + + .long 0x100F0509 + .long 0x10300D09 + vadduwm 16,4,11 + .long 0x10511509 + vadduwm 17,4,7 + vxor 15,4,23 + .long 0x10721D09 + vadduwm 18,16,7 + vxor 16,16,23 + .long 0x11535509 + vadduwm 19,17,7 + vxor 17,17,23 + .long 0x11946509 + vadduwm 20,18,7 + vxor 18,18,23 + .long 0x11B56D09 + vadduwm 21,19,7 + vxor 19,19,23 + .long 0x11D67509 + vadduwm 22,20,7 + vxor 20,20,23 + vperm 0,0,0,6 + vadduwm 4,21,7 + vxor 21,21,23 + vperm 1,1,1,6 + vxor 22,22,23 + mtctr 9 + + .long 0x11EFC508 + .long 0x7C002799 + vperm 2,2,2,6 + .long 0x1210C508 + .long 0x7C282799 + vperm 3,3,3,6 + .long 0x1231C508 + .long 0x7C5A2799 + vperm 10,10,10,6 + .long 0x1252C508 + .long 0x7C7B2799 + vperm 12,12,12,6 + .long 0x1273C508 + .long 0x7D5C2799 + vperm 13,13,13,6 + .long 0x1294C508 + .long 0x7D9D2799 + vperm 14,14,14,6 + .long 0x12B5C508 + .long 0x7DBE2799 + .long 0x12D6C508 + .long 0x7DDF2799 + addi 4,4,0x80 + + b .Loop_ctr32_enc8x_middle + +.align 5 +.Lctr32_enc8x_break: + cmpwi 5,-0x60 + blt .Lctr32_enc8x_one + nop + beq .Lctr32_enc8x_two + cmpwi 5,-0x40 + blt .Lctr32_enc8x_three + nop + beq .Lctr32_enc8x_four + cmpwi 5,-0x20 + blt .Lctr32_enc8x_five + nop + beq .Lctr32_enc8x_six + cmpwi 5,0x00 + blt .Lctr32_enc8x_seven + +.Lctr32_enc8x_eight: + .long 0x11EF0509 + .long 0x12100D09 + .long 0x12311509 + .long 0x12521D09 + .long 0x12735509 + .long 0x12946509 + .long 0x12B56D09 + .long 0x12D67509 + + vperm 15,15,15,6 + vperm 16,16,16,6 + .long 0x7DE02799 + vperm 17,17,17,6 + .long 0x7E082799 + vperm 18,18,18,6 + .long 0x7E3A2799 + vperm 19,19,19,6 + .long 0x7E5B2799 + vperm 20,20,20,6 + .long 0x7E7C2799 + vperm 21,21,21,6 + .long 0x7E9D2799 + vperm 22,22,22,6 + .long 0x7EBE2799 + .long 0x7EDF2799 + addi 4,4,0x80 + b .Lctr32_enc8x_done + +.align 5 +.Lctr32_enc8x_seven: + .long 0x11EF0D09 + .long 0x12101509 + .long 0x12311D09 + .long 0x12525509 + .long 0x12736509 + .long 0x12946D09 + .long 0x12B57509 + + vperm 15,15,15,6 + vperm 16,16,16,6 + .long 0x7DE02799 + vperm 17,17,17,6 + .long 0x7E082799 + vperm 18,18,18,6 + .long 0x7E3A2799 + vperm 19,19,19,6 + .long 0x7E5B2799 + vperm 20,20,20,6 + .long 0x7E7C2799 + vperm 21,21,21,6 + .long 0x7E9D2799 + .long 0x7EBE2799 + addi 4,4,0x70 + b .Lctr32_enc8x_done + +.align 5 +.Lctr32_enc8x_six: + .long 0x11EF1509 + .long 0x12101D09 + .long 0x12315509 + .long 0x12526509 + .long 0x12736D09 + .long 0x12947509 + + vperm 15,15,15,6 + vperm 16,16,16,6 + .long 0x7DE02799 + vperm 17,17,17,6 + .long 0x7E082799 + vperm 18,18,18,6 + .long 0x7E3A2799 + vperm 19,19,19,6 + .long 0x7E5B2799 + vperm 20,20,20,6 + .long 0x7E7C2799 + .long 0x7E9D2799 + addi 4,4,0x60 + b .Lctr32_enc8x_done + +.align 5 +.Lctr32_enc8x_five: + .long 0x11EF1D09 + .long 0x12105509 + .long 0x12316509 + .long 0x12526D09 + .long 0x12737509 + + vperm 15,15,15,6 + vperm 16,16,16,6 + .long 0x7DE02799 + vperm 17,17,17,6 + .long 0x7E082799 + 
vperm 18,18,18,6 + .long 0x7E3A2799 + vperm 19,19,19,6 + .long 0x7E5B2799 + .long 0x7E7C2799 + addi 4,4,0x50 + b .Lctr32_enc8x_done + +.align 5 +.Lctr32_enc8x_four: + .long 0x11EF5509 + .long 0x12106509 + .long 0x12316D09 + .long 0x12527509 + + vperm 15,15,15,6 + vperm 16,16,16,6 + .long 0x7DE02799 + vperm 17,17,17,6 + .long 0x7E082799 + vperm 18,18,18,6 + .long 0x7E3A2799 + .long 0x7E5B2799 + addi 4,4,0x40 + b .Lctr32_enc8x_done + +.align 5 +.Lctr32_enc8x_three: + .long 0x11EF6509 + .long 0x12106D09 + .long 0x12317509 + + vperm 15,15,15,6 + vperm 16,16,16,6 + .long 0x7DE02799 + vperm 17,17,17,6 + .long 0x7E082799 + .long 0x7E3A2799 + addi 4,4,0x30 + b .Lctr32_enc8x_done + +.align 5 +.Lctr32_enc8x_two: + .long 0x11EF6D09 + .long 0x12107509 + + vperm 15,15,15,6 + vperm 16,16,16,6 + .long 0x7DE02799 + .long 0x7E082799 + addi 4,4,0x20 + b .Lctr32_enc8x_done + +.align 5 +.Lctr32_enc8x_one: + .long 0x11EF7509 + + vperm 15,15,15,6 + .long 0x7DE02799 + addi 4,4,0x10 + +.Lctr32_enc8x_done: + li 10,79 + li 11,95 + stvx 6,10,1 + addi 10,10,32 + stvx 6,11,1 + addi 11,11,32 + stvx 6,10,1 + addi 10,10,32 + stvx 6,11,1 + addi 11,11,32 + stvx 6,10,1 + addi 10,10,32 + stvx 6,11,1 + addi 11,11,32 + stvx 6,10,1 + addi 10,10,32 + stvx 6,11,1 + addi 11,11,32 + + or 12,12,12 + lvx 20,10,1 + addi 10,10,32 + lvx 21,11,1 + addi 11,11,32 + lvx 22,10,1 + addi 10,10,32 + lvx 23,11,1 + addi 11,11,32 + lvx 24,10,1 + addi 10,10,32 + lvx 25,11,1 + addi 11,11,32 + lvx 26,10,1 + addi 10,10,32 + lvx 27,11,1 + addi 11,11,32 + lvx 28,10,1 + addi 10,10,32 + lvx 29,11,1 + addi 11,11,32 + lvx 30,10,1 + lvx 31,11,1 + ld 26,400(1) + ld 27,408(1) + ld 28,416(1) + ld 29,424(1) + ld 30,432(1) + ld 31,440(1) + addi 1,1,448 + blr +.long 0 +.byte 0,12,0x04,0,0x80,6,6,0 +.long 0 +.size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks +.globl aes_hw_xts_encrypt +.type aes_hw_xts_encrypt,@function +.align 5 +aes_hw_xts_encrypt: +.localentry aes_hw_xts_encrypt,0 + + mr 10,3 + li 3,-1 + cmpldi 5,16 + .long 0x4dc00020 + + lis 0,0xfff0 + li 12,-1 + li 11,0 + or 0,0,0 + + vspltisb 9,0x07 + lvsl 6,11,11 + vspltisb 11,0x0f + vxor 6,6,9 + + li 3,15 + lvx 8,0,8 + lvsl 5,0,8 + lvx 4,3,8 + vxor 5,5,11 + vperm 8,8,4,5 + + neg 11,10 + lvsr 5,0,11 + lvx 2,0,10 + addi 10,10,15 + vxor 5,5,11 + + cmpldi 7,0 + beq .Lxts_enc_no_key2 + + lvsr 7,0,7 + lwz 9,240(7) + srwi 9,9,1 + subi 9,9,1 + li 3,16 + + lvx 0,0,7 + lvx 1,3,7 + addi 3,3,16 + vperm 0,1,0,7 + vxor 8,8,0 + lvx 0,3,7 + addi 3,3,16 + mtctr 9 + +.Ltweak_xts_enc: + vperm 1,0,1,7 + .long 0x11080D08 + lvx 1,3,7 + addi 3,3,16 + vperm 0,1,0,7 + .long 0x11080508 + lvx 0,3,7 + addi 3,3,16 + bdnz .Ltweak_xts_enc + + vperm 1,0,1,7 + .long 0x11080D08 + lvx 1,3,7 + vperm 0,1,0,7 + .long 0x11080509 + + li 8,0 + b .Lxts_enc + +.Lxts_enc_no_key2: + li 3,-16 + and 5,5,3 + + +.Lxts_enc: + lvx 4,0,10 + addi 10,10,16 + + lvsr 7,0,6 + lwz 9,240(6) + srwi 9,9,1 + subi 9,9,1 + li 3,16 + + vslb 10,9,9 + vor 10,10,9 + vspltisb 11,1 + vsldoi 10,10,11,15 + + cmpldi 5,96 + bge _aesp8_xts_encrypt6x + + andi. 
7,5,15 + subic 0,5,32 + subi 7,7,16 + subfe 0,0,0 + and 0,0,7 + add 10,10,0 + + lvx 0,0,6 + lvx 1,3,6 + addi 3,3,16 + vperm 2,2,4,5 + vperm 0,1,0,7 + vxor 2,2,8 + vxor 2,2,0 + lvx 0,3,6 + addi 3,3,16 + mtctr 9 + b .Loop_xts_enc + +.align 5 +.Loop_xts_enc: + vperm 1,0,1,7 + .long 0x10420D08 + lvx 1,3,6 + addi 3,3,16 + vperm 0,1,0,7 + .long 0x10420508 + lvx 0,3,6 + addi 3,3,16 + bdnz .Loop_xts_enc + + vperm 1,0,1,7 + .long 0x10420D08 + lvx 1,3,6 + li 3,16 + vperm 0,1,0,7 + vxor 0,0,8 + .long 0x10620509 + + vperm 11,3,3,6 + + .long 0x7D602799 + + addi 4,4,16 + + subic. 5,5,16 + beq .Lxts_enc_done + + vor 2,4,4 + lvx 4,0,10 + addi 10,10,16 + lvx 0,0,6 + lvx 1,3,6 + addi 3,3,16 + + subic 0,5,32 + subfe 0,0,0 + and 0,0,7 + add 10,10,0 + + vsrab 11,8,9 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + vand 11,11,10 + vxor 8,8,11 + + vperm 2,2,4,5 + vperm 0,1,0,7 + vxor 2,2,8 + vxor 3,3,0 + vxor 2,2,0 + lvx 0,3,6 + addi 3,3,16 + + mtctr 9 + cmpldi 5,16 + bge .Loop_xts_enc + + vxor 3,3,8 + lvsr 5,0,5 + vxor 4,4,4 + vspltisb 11,-1 + vperm 4,4,11,5 + vsel 2,2,3,4 + + subi 11,4,17 + subi 4,4,16 + mtctr 5 + li 5,16 +.Loop_xts_enc_steal: + lbzu 0,1(11) + stb 0,16(11) + bdnz .Loop_xts_enc_steal + + mtctr 9 + b .Loop_xts_enc + +.Lxts_enc_done: + cmpldi 8,0 + beq .Lxts_enc_ret + + vsrab 11,8,9 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + vand 11,11,10 + vxor 8,8,11 + + vperm 8,8,8,6 + .long 0x7D004799 + +.Lxts_enc_ret: + or 12,12,12 + li 3,0 + blr +.long 0 +.byte 0,12,0x04,0,0x80,6,6,0 +.long 0 +.size aes_hw_xts_encrypt,.-aes_hw_xts_encrypt + +.globl aes_hw_xts_decrypt +.type aes_hw_xts_decrypt,@function +.align 5 +aes_hw_xts_decrypt: +.localentry aes_hw_xts_decrypt,0 + + mr 10,3 + li 3,-1 + cmpldi 5,16 + .long 0x4dc00020 + + lis 0,0xfff8 + li 12,-1 + li 11,0 + or 0,0,0 + + andi. 0,5,15 + neg 0,0 + andi. 0,0,16 + sub 5,5,0 + + vspltisb 9,0x07 + lvsl 6,11,11 + vspltisb 11,0x0f + vxor 6,6,9 + + li 3,15 + lvx 8,0,8 + lvsl 5,0,8 + lvx 4,3,8 + vxor 5,5,11 + vperm 8,8,4,5 + + neg 11,10 + lvsr 5,0,11 + lvx 2,0,10 + addi 10,10,15 + vxor 5,5,11 + + cmpldi 7,0 + beq .Lxts_dec_no_key2 + + lvsr 7,0,7 + lwz 9,240(7) + srwi 9,9,1 + subi 9,9,1 + li 3,16 + + lvx 0,0,7 + lvx 1,3,7 + addi 3,3,16 + vperm 0,1,0,7 + vxor 8,8,0 + lvx 0,3,7 + addi 3,3,16 + mtctr 9 + +.Ltweak_xts_dec: + vperm 1,0,1,7 + .long 0x11080D08 + lvx 1,3,7 + addi 3,3,16 + vperm 0,1,0,7 + .long 0x11080508 + lvx 0,3,7 + addi 3,3,16 + bdnz .Ltweak_xts_dec + + vperm 1,0,1,7 + .long 0x11080D08 + lvx 1,3,7 + vperm 0,1,0,7 + .long 0x11080509 + + li 8,0 + b .Lxts_dec + +.Lxts_dec_no_key2: + neg 3,5 + andi. 3,3,15 + add 5,5,3 + + +.Lxts_dec: + lvx 4,0,10 + addi 10,10,16 + + lvsr 7,0,6 + lwz 9,240(6) + srwi 9,9,1 + subi 9,9,1 + li 3,16 + + vslb 10,9,9 + vor 10,10,9 + vspltisb 11,1 + vsldoi 10,10,11,15 + + cmpldi 5,96 + bge _aesp8_xts_decrypt6x + + lvx 0,0,6 + lvx 1,3,6 + addi 3,3,16 + vperm 2,2,4,5 + vperm 0,1,0,7 + vxor 2,2,8 + vxor 2,2,0 + lvx 0,3,6 + addi 3,3,16 + mtctr 9 + + cmpldi 5,16 + blt .Ltail_xts_dec + + +.align 5 +.Loop_xts_dec: + vperm 1,0,1,7 + .long 0x10420D48 + lvx 1,3,6 + addi 3,3,16 + vperm 0,1,0,7 + .long 0x10420548 + lvx 0,3,6 + addi 3,3,16 + bdnz .Loop_xts_dec + + vperm 1,0,1,7 + .long 0x10420D48 + lvx 1,3,6 + li 3,16 + vperm 0,1,0,7 + vxor 0,0,8 + .long 0x10620549 + + vperm 11,3,3,6 + + .long 0x7D602799 + + addi 4,4,16 + + subic. 
5,5,16 + beq .Lxts_dec_done + + vor 2,4,4 + lvx 4,0,10 + addi 10,10,16 + lvx 0,0,6 + lvx 1,3,6 + addi 3,3,16 + + vsrab 11,8,9 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + vand 11,11,10 + vxor 8,8,11 + + vperm 2,2,4,5 + vperm 0,1,0,7 + vxor 2,2,8 + vxor 2,2,0 + lvx 0,3,6 + addi 3,3,16 + + mtctr 9 + cmpldi 5,16 + bge .Loop_xts_dec + +.Ltail_xts_dec: + vsrab 11,8,9 + vaddubm 12,8,8 + vsldoi 11,11,11,15 + vand 11,11,10 + vxor 12,12,11 + + subi 10,10,16 + add 10,10,5 + + vxor 2,2,8 + vxor 2,2,12 + +.Loop_xts_dec_short: + vperm 1,0,1,7 + .long 0x10420D48 + lvx 1,3,6 + addi 3,3,16 + vperm 0,1,0,7 + .long 0x10420548 + lvx 0,3,6 + addi 3,3,16 + bdnz .Loop_xts_dec_short + + vperm 1,0,1,7 + .long 0x10420D48 + lvx 1,3,6 + li 3,16 + vperm 0,1,0,7 + vxor 0,0,12 + .long 0x10620549 + + vperm 11,3,3,6 + + .long 0x7D602799 + + + vor 2,4,4 + lvx 4,0,10 + + lvx 0,0,6 + lvx 1,3,6 + addi 3,3,16 + vperm 2,2,4,5 + vperm 0,1,0,7 + + lvsr 5,0,5 + vxor 4,4,4 + vspltisb 11,-1 + vperm 4,4,11,5 + vsel 2,2,3,4 + + vxor 0,0,8 + vxor 2,2,0 + lvx 0,3,6 + addi 3,3,16 + + subi 11,4,1 + mtctr 5 + li 5,16 +.Loop_xts_dec_steal: + lbzu 0,1(11) + stb 0,16(11) + bdnz .Loop_xts_dec_steal + + mtctr 9 + b .Loop_xts_dec + +.Lxts_dec_done: + cmpldi 8,0 + beq .Lxts_dec_ret + + vsrab 11,8,9 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + vand 11,11,10 + vxor 8,8,11 + + vperm 8,8,8,6 + .long 0x7D004799 + +.Lxts_dec_ret: + or 12,12,12 + li 3,0 + blr +.long 0 +.byte 0,12,0x04,0,0x80,6,6,0 +.long 0 +.size aes_hw_xts_decrypt,.-aes_hw_xts_decrypt +.align 5 +_aesp8_xts_encrypt6x: + stdu 1,-448(1) + mflr 11 + li 7,207 + li 3,223 + std 11,464(1) + stvx 20,7,1 + addi 7,7,32 + stvx 21,3,1 + addi 3,3,32 + stvx 22,7,1 + addi 7,7,32 + stvx 23,3,1 + addi 3,3,32 + stvx 24,7,1 + addi 7,7,32 + stvx 25,3,1 + addi 3,3,32 + stvx 26,7,1 + addi 7,7,32 + stvx 27,3,1 + addi 3,3,32 + stvx 28,7,1 + addi 7,7,32 + stvx 29,3,1 + addi 3,3,32 + stvx 30,7,1 + stvx 31,3,1 + li 0,-1 + stw 12,396(1) + li 3,0x10 + std 26,400(1) + li 26,0x20 + std 27,408(1) + li 27,0x30 + std 28,416(1) + li 28,0x40 + std 29,424(1) + li 29,0x50 + std 30,432(1) + li 30,0x60 + std 31,440(1) + li 31,0x70 + or 0,0,0 + + subi 9,9,3 + + lvx 23,0,6 + lvx 30,3,6 + addi 6,6,0x20 + lvx 31,0,6 + vperm 23,30,23,7 + addi 7,1,79 + mtctr 9 + +.Load_xts_enc_key: + vperm 24,31,30,7 + lvx 30,3,6 + addi 6,6,0x20 + stvx 24,0,7 + vperm 25,30,31,7 + lvx 31,0,6 + stvx 25,3,7 + addi 7,7,0x20 + bdnz .Load_xts_enc_key + + lvx 26,3,6 + vperm 24,31,30,7 + lvx 27,26,6 + stvx 24,0,7 + vperm 25,26,31,7 + lvx 28,27,6 + stvx 25,3,7 + addi 7,1,79 + vperm 26,27,26,7 + lvx 29,28,6 + vperm 27,28,27,7 + lvx 30,29,6 + vperm 28,29,28,7 + lvx 31,30,6 + vperm 29,30,29,7 + lvx 22,31,6 + vperm 30,31,30,7 + lvx 24,0,7 + vperm 31,22,31,7 + lvx 25,3,7 + + vperm 0,2,4,5 + subi 10,10,31 + vxor 17,8,23 + vsrab 11,8,9 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + vand 11,11,10 + vxor 7,0,17 + vxor 8,8,11 + + .long 0x7C235699 + vxor 18,8,23 + vsrab 11,8,9 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + vperm 1,1,1,6 + vand 11,11,10 + vxor 12,1,18 + vxor 8,8,11 + + .long 0x7C5A5699 + andi. 
31,5,15 + vxor 19,8,23 + vsrab 11,8,9 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + vperm 2,2,2,6 + vand 11,11,10 + vxor 13,2,19 + vxor 8,8,11 + + .long 0x7C7B5699 + sub 5,5,31 + vxor 20,8,23 + vsrab 11,8,9 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + vperm 3,3,3,6 + vand 11,11,10 + vxor 14,3,20 + vxor 8,8,11 + + .long 0x7C9C5699 + subi 5,5,0x60 + vxor 21,8,23 + vsrab 11,8,9 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + vperm 4,4,4,6 + vand 11,11,10 + vxor 15,4,21 + vxor 8,8,11 + + .long 0x7CBD5699 + addi 10,10,0x60 + vxor 22,8,23 + vsrab 11,8,9 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + vperm 5,5,5,6 + vand 11,11,10 + vxor 16,5,22 + vxor 8,8,11 + + vxor 31,31,23 + mtctr 9 + b .Loop_xts_enc6x + +.align 5 +.Loop_xts_enc6x: + .long 0x10E7C508 + .long 0x118CC508 + .long 0x11ADC508 + .long 0x11CEC508 + .long 0x11EFC508 + .long 0x1210C508 + lvx 24,26,7 + addi 7,7,0x20 + + .long 0x10E7CD08 + .long 0x118CCD08 + .long 0x11ADCD08 + .long 0x11CECD08 + .long 0x11EFCD08 + .long 0x1210CD08 + lvx 25,3,7 + bdnz .Loop_xts_enc6x + + subic 5,5,96 + vxor 0,17,31 + .long 0x10E7C508 + .long 0x118CC508 + vsrab 11,8,9 + vxor 17,8,23 + vaddubm 8,8,8 + .long 0x11ADC508 + .long 0x11CEC508 + vsldoi 11,11,11,15 + .long 0x11EFC508 + .long 0x1210C508 + + subfe. 0,0,0 + vand 11,11,10 + .long 0x10E7CD08 + .long 0x118CCD08 + vxor 8,8,11 + .long 0x11ADCD08 + .long 0x11CECD08 + vxor 1,18,31 + vsrab 11,8,9 + vxor 18,8,23 + .long 0x11EFCD08 + .long 0x1210CD08 + + and 0,0,5 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + .long 0x10E7D508 + .long 0x118CD508 + vand 11,11,10 + .long 0x11ADD508 + .long 0x11CED508 + vxor 8,8,11 + .long 0x11EFD508 + .long 0x1210D508 + + add 10,10,0 + + + + vxor 2,19,31 + vsrab 11,8,9 + vxor 19,8,23 + vaddubm 8,8,8 + .long 0x10E7DD08 + .long 0x118CDD08 + vsldoi 11,11,11,15 + .long 0x11ADDD08 + .long 0x11CEDD08 + vand 11,11,10 + .long 0x11EFDD08 + .long 0x1210DD08 + + addi 7,1,79 + vxor 8,8,11 + .long 0x10E7E508 + .long 0x118CE508 + vxor 3,20,31 + vsrab 11,8,9 + vxor 20,8,23 + .long 0x11ADE508 + .long 0x11CEE508 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + .long 0x11EFE508 + .long 0x1210E508 + lvx 24,0,7 + vand 11,11,10 + + .long 0x10E7ED08 + .long 0x118CED08 + vxor 8,8,11 + .long 0x11ADED08 + .long 0x11CEED08 + vxor 4,21,31 + vsrab 11,8,9 + vxor 21,8,23 + .long 0x11EFED08 + .long 0x1210ED08 + lvx 25,3,7 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + + .long 0x10E7F508 + .long 0x118CF508 + vand 11,11,10 + .long 0x11ADF508 + .long 0x11CEF508 + vxor 8,8,11 + .long 0x11EFF508 + .long 0x1210F508 + vxor 5,22,31 + vsrab 11,8,9 + vxor 22,8,23 + + .long 0x10E70509 + .long 0x7C005699 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + .long 0x118C0D09 + .long 0x7C235699 + .long 0x11AD1509 + vperm 0,0,0,6 + .long 0x7C5A5699 + vand 11,11,10 + .long 0x11CE1D09 + vperm 1,1,1,6 + .long 0x7C7B5699 + .long 0x11EF2509 + vperm 2,2,2,6 + .long 0x7C9C5699 + vxor 8,8,11 + .long 0x11702D09 + + vperm 3,3,3,6 + .long 0x7CBD5699 + addi 10,10,0x60 + vperm 4,4,4,6 + vperm 5,5,5,6 + + vperm 7,7,7,6 + vperm 12,12,12,6 + .long 0x7CE02799 + vxor 7,0,17 + vperm 13,13,13,6 + .long 0x7D832799 + vxor 12,1,18 + vperm 14,14,14,6 + .long 0x7DBA2799 + vxor 13,2,19 + vperm 15,15,15,6 + .long 0x7DDB2799 + vxor 14,3,20 + vperm 16,11,11,6 + .long 0x7DFC2799 + vxor 15,4,21 + .long 0x7E1D2799 + + vxor 16,5,22 + addi 4,4,0x60 + + mtctr 9 + beq .Loop_xts_enc6x + + addic. 
5,5,0x60 + beq .Lxts_enc6x_zero + cmpwi 5,0x20 + blt .Lxts_enc6x_one + nop + beq .Lxts_enc6x_two + cmpwi 5,0x40 + blt .Lxts_enc6x_three + nop + beq .Lxts_enc6x_four + +.Lxts_enc6x_five: + vxor 7,1,17 + vxor 12,2,18 + vxor 13,3,19 + vxor 14,4,20 + vxor 15,5,21 + + bl _aesp8_xts_enc5x + + vperm 7,7,7,6 + vor 17,22,22 + vperm 12,12,12,6 + .long 0x7CE02799 + vperm 13,13,13,6 + .long 0x7D832799 + vperm 14,14,14,6 + .long 0x7DBA2799 + vxor 11,15,22 + vperm 15,15,15,6 + .long 0x7DDB2799 + .long 0x7DFC2799 + addi 4,4,0x50 + bne .Lxts_enc6x_steal + b .Lxts_enc6x_done + +.align 4 +.Lxts_enc6x_four: + vxor 7,2,17 + vxor 12,3,18 + vxor 13,4,19 + vxor 14,5,20 + vxor 15,15,15 + + bl _aesp8_xts_enc5x + + vperm 7,7,7,6 + vor 17,21,21 + vperm 12,12,12,6 + .long 0x7CE02799 + vperm 13,13,13,6 + .long 0x7D832799 + vxor 11,14,21 + vperm 14,14,14,6 + .long 0x7DBA2799 + .long 0x7DDB2799 + addi 4,4,0x40 + bne .Lxts_enc6x_steal + b .Lxts_enc6x_done + +.align 4 +.Lxts_enc6x_three: + vxor 7,3,17 + vxor 12,4,18 + vxor 13,5,19 + vxor 14,14,14 + vxor 15,15,15 + + bl _aesp8_xts_enc5x + + vperm 7,7,7,6 + vor 17,20,20 + vperm 12,12,12,6 + .long 0x7CE02799 + vxor 11,13,20 + vperm 13,13,13,6 + .long 0x7D832799 + .long 0x7DBA2799 + addi 4,4,0x30 + bne .Lxts_enc6x_steal + b .Lxts_enc6x_done + +.align 4 +.Lxts_enc6x_two: + vxor 7,4,17 + vxor 12,5,18 + vxor 13,13,13 + vxor 14,14,14 + vxor 15,15,15 + + bl _aesp8_xts_enc5x + + vperm 7,7,7,6 + vor 17,19,19 + vxor 11,12,19 + vperm 12,12,12,6 + .long 0x7CE02799 + .long 0x7D832799 + addi 4,4,0x20 + bne .Lxts_enc6x_steal + b .Lxts_enc6x_done + +.align 4 +.Lxts_enc6x_one: + vxor 7,5,17 + nop +.Loop_xts_enc1x: + .long 0x10E7C508 + lvx 24,26,7 + addi 7,7,0x20 + + .long 0x10E7CD08 + lvx 25,3,7 + bdnz .Loop_xts_enc1x + + add 10,10,31 + cmpwi 31,0 + .long 0x10E7C508 + + subi 10,10,16 + .long 0x10E7CD08 + + lvsr 5,0,31 + .long 0x10E7D508 + + .long 0x7C005699 + .long 0x10E7DD08 + + addi 7,1,79 + .long 0x10E7E508 + lvx 24,0,7 + + .long 0x10E7ED08 + lvx 25,3,7 + vxor 17,17,31 + + vperm 0,0,0,6 + .long 0x10E7F508 + + vperm 0,0,0,5 + .long 0x10E78D09 + + vor 17,18,18 + vxor 11,7,18 + vperm 7,7,7,6 + .long 0x7CE02799 + addi 4,4,0x10 + bne .Lxts_enc6x_steal + b .Lxts_enc6x_done + +.align 4 +.Lxts_enc6x_zero: + cmpwi 31,0 + beq .Lxts_enc6x_done + + add 10,10,31 + subi 10,10,16 + .long 0x7C005699 + lvsr 5,0,31 + vperm 0,0,0,6 + vperm 0,0,0,5 + vxor 11,11,17 +.Lxts_enc6x_steal: + vxor 0,0,17 + vxor 7,7,7 + vspltisb 12,-1 + vperm 7,7,12,5 + vsel 7,0,11,7 + + subi 30,4,17 + subi 4,4,16 + mtctr 31 +.Loop_xts_enc6x_steal: + lbzu 0,1(30) + stb 0,16(30) + bdnz .Loop_xts_enc6x_steal + + li 31,0 + mtctr 9 + b .Loop_xts_enc1x + +.align 4 +.Lxts_enc6x_done: + cmpldi 8,0 + beq .Lxts_enc6x_ret + + vxor 8,17,23 + vperm 8,8,8,6 + .long 0x7D004799 + +.Lxts_enc6x_ret: + mtlr 11 + li 10,79 + li 11,95 + stvx 9,10,1 + addi 10,10,32 + stvx 9,11,1 + addi 11,11,32 + stvx 9,10,1 + addi 10,10,32 + stvx 9,11,1 + addi 11,11,32 + stvx 9,10,1 + addi 10,10,32 + stvx 9,11,1 + addi 11,11,32 + stvx 9,10,1 + addi 10,10,32 + stvx 9,11,1 + addi 11,11,32 + + or 12,12,12 + lvx 20,10,1 + addi 10,10,32 + lvx 21,11,1 + addi 11,11,32 + lvx 22,10,1 + addi 10,10,32 + lvx 23,11,1 + addi 11,11,32 + lvx 24,10,1 + addi 10,10,32 + lvx 25,11,1 + addi 11,11,32 + lvx 26,10,1 + addi 10,10,32 + lvx 27,11,1 + addi 11,11,32 + lvx 28,10,1 + addi 10,10,32 + lvx 29,11,1 + addi 11,11,32 + lvx 30,10,1 + lvx 31,11,1 + ld 26,400(1) + ld 27,408(1) + ld 28,416(1) + ld 29,424(1) + ld 30,432(1) + ld 31,440(1) + addi 1,1,448 + blr +.long 0 +.byte 
0,12,0x04,1,0x80,6,6,0 +.long 0 + +.align 5 +_aesp8_xts_enc5x: + .long 0x10E7C508 + .long 0x118CC508 + .long 0x11ADC508 + .long 0x11CEC508 + .long 0x11EFC508 + lvx 24,26,7 + addi 7,7,0x20 + + .long 0x10E7CD08 + .long 0x118CCD08 + .long 0x11ADCD08 + .long 0x11CECD08 + .long 0x11EFCD08 + lvx 25,3,7 + bdnz _aesp8_xts_enc5x + + add 10,10,31 + cmpwi 31,0 + .long 0x10E7C508 + .long 0x118CC508 + .long 0x11ADC508 + .long 0x11CEC508 + .long 0x11EFC508 + + subi 10,10,16 + .long 0x10E7CD08 + .long 0x118CCD08 + .long 0x11ADCD08 + .long 0x11CECD08 + .long 0x11EFCD08 + vxor 17,17,31 + + .long 0x10E7D508 + lvsr 5,0,31 + .long 0x118CD508 + .long 0x11ADD508 + .long 0x11CED508 + .long 0x11EFD508 + vxor 1,18,31 + + .long 0x10E7DD08 + .long 0x7C005699 + .long 0x118CDD08 + .long 0x11ADDD08 + .long 0x11CEDD08 + .long 0x11EFDD08 + vxor 2,19,31 + + addi 7,1,79 + .long 0x10E7E508 + .long 0x118CE508 + .long 0x11ADE508 + .long 0x11CEE508 + .long 0x11EFE508 + lvx 24,0,7 + vxor 3,20,31 + + .long 0x10E7ED08 + vperm 0,0,0,6 + .long 0x118CED08 + .long 0x11ADED08 + .long 0x11CEED08 + .long 0x11EFED08 + lvx 25,3,7 + vxor 4,21,31 + + .long 0x10E7F508 + vperm 0,0,0,5 + .long 0x118CF508 + .long 0x11ADF508 + .long 0x11CEF508 + .long 0x11EFF508 + + .long 0x10E78D09 + .long 0x118C0D09 + .long 0x11AD1509 + .long 0x11CE1D09 + .long 0x11EF2509 + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 + +.align 5 +_aesp8_xts_decrypt6x: + stdu 1,-448(1) + mflr 11 + li 7,207 + li 3,223 + std 11,464(1) + stvx 20,7,1 + addi 7,7,32 + stvx 21,3,1 + addi 3,3,32 + stvx 22,7,1 + addi 7,7,32 + stvx 23,3,1 + addi 3,3,32 + stvx 24,7,1 + addi 7,7,32 + stvx 25,3,1 + addi 3,3,32 + stvx 26,7,1 + addi 7,7,32 + stvx 27,3,1 + addi 3,3,32 + stvx 28,7,1 + addi 7,7,32 + stvx 29,3,1 + addi 3,3,32 + stvx 30,7,1 + stvx 31,3,1 + li 0,-1 + stw 12,396(1) + li 3,0x10 + std 26,400(1) + li 26,0x20 + std 27,408(1) + li 27,0x30 + std 28,416(1) + li 28,0x40 + std 29,424(1) + li 29,0x50 + std 30,432(1) + li 30,0x60 + std 31,440(1) + li 31,0x70 + or 0,0,0 + + subi 9,9,3 + + lvx 23,0,6 + lvx 30,3,6 + addi 6,6,0x20 + lvx 31,0,6 + vperm 23,30,23,7 + addi 7,1,79 + mtctr 9 + +.Load_xts_dec_key: + vperm 24,31,30,7 + lvx 30,3,6 + addi 6,6,0x20 + stvx 24,0,7 + vperm 25,30,31,7 + lvx 31,0,6 + stvx 25,3,7 + addi 7,7,0x20 + bdnz .Load_xts_dec_key + + lvx 26,3,6 + vperm 24,31,30,7 + lvx 27,26,6 + stvx 24,0,7 + vperm 25,26,31,7 + lvx 28,27,6 + stvx 25,3,7 + addi 7,1,79 + vperm 26,27,26,7 + lvx 29,28,6 + vperm 27,28,27,7 + lvx 30,29,6 + vperm 28,29,28,7 + lvx 31,30,6 + vperm 29,30,29,7 + lvx 22,31,6 + vperm 30,31,30,7 + lvx 24,0,7 + vperm 31,22,31,7 + lvx 25,3,7 + + vperm 0,2,4,5 + subi 10,10,31 + vxor 17,8,23 + vsrab 11,8,9 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + vand 11,11,10 + vxor 7,0,17 + vxor 8,8,11 + + .long 0x7C235699 + vxor 18,8,23 + vsrab 11,8,9 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + vperm 1,1,1,6 + vand 11,11,10 + vxor 12,1,18 + vxor 8,8,11 + + .long 0x7C5A5699 + andi. 
31,5,15 + vxor 19,8,23 + vsrab 11,8,9 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + vperm 2,2,2,6 + vand 11,11,10 + vxor 13,2,19 + vxor 8,8,11 + + .long 0x7C7B5699 + sub 5,5,31 + vxor 20,8,23 + vsrab 11,8,9 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + vperm 3,3,3,6 + vand 11,11,10 + vxor 14,3,20 + vxor 8,8,11 + + .long 0x7C9C5699 + subi 5,5,0x60 + vxor 21,8,23 + vsrab 11,8,9 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + vperm 4,4,4,6 + vand 11,11,10 + vxor 15,4,21 + vxor 8,8,11 + + .long 0x7CBD5699 + addi 10,10,0x60 + vxor 22,8,23 + vsrab 11,8,9 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + vperm 5,5,5,6 + vand 11,11,10 + vxor 16,5,22 + vxor 8,8,11 + + vxor 31,31,23 + mtctr 9 + b .Loop_xts_dec6x + +.align 5 +.Loop_xts_dec6x: + .long 0x10E7C548 + .long 0x118CC548 + .long 0x11ADC548 + .long 0x11CEC548 + .long 0x11EFC548 + .long 0x1210C548 + lvx 24,26,7 + addi 7,7,0x20 + + .long 0x10E7CD48 + .long 0x118CCD48 + .long 0x11ADCD48 + .long 0x11CECD48 + .long 0x11EFCD48 + .long 0x1210CD48 + lvx 25,3,7 + bdnz .Loop_xts_dec6x + + subic 5,5,96 + vxor 0,17,31 + .long 0x10E7C548 + .long 0x118CC548 + vsrab 11,8,9 + vxor 17,8,23 + vaddubm 8,8,8 + .long 0x11ADC548 + .long 0x11CEC548 + vsldoi 11,11,11,15 + .long 0x11EFC548 + .long 0x1210C548 + + subfe. 0,0,0 + vand 11,11,10 + .long 0x10E7CD48 + .long 0x118CCD48 + vxor 8,8,11 + .long 0x11ADCD48 + .long 0x11CECD48 + vxor 1,18,31 + vsrab 11,8,9 + vxor 18,8,23 + .long 0x11EFCD48 + .long 0x1210CD48 + + and 0,0,5 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + .long 0x10E7D548 + .long 0x118CD548 + vand 11,11,10 + .long 0x11ADD548 + .long 0x11CED548 + vxor 8,8,11 + .long 0x11EFD548 + .long 0x1210D548 + + add 10,10,0 + + + + vxor 2,19,31 + vsrab 11,8,9 + vxor 19,8,23 + vaddubm 8,8,8 + .long 0x10E7DD48 + .long 0x118CDD48 + vsldoi 11,11,11,15 + .long 0x11ADDD48 + .long 0x11CEDD48 + vand 11,11,10 + .long 0x11EFDD48 + .long 0x1210DD48 + + addi 7,1,79 + vxor 8,8,11 + .long 0x10E7E548 + .long 0x118CE548 + vxor 3,20,31 + vsrab 11,8,9 + vxor 20,8,23 + .long 0x11ADE548 + .long 0x11CEE548 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + .long 0x11EFE548 + .long 0x1210E548 + lvx 24,0,7 + vand 11,11,10 + + .long 0x10E7ED48 + .long 0x118CED48 + vxor 8,8,11 + .long 0x11ADED48 + .long 0x11CEED48 + vxor 4,21,31 + vsrab 11,8,9 + vxor 21,8,23 + .long 0x11EFED48 + .long 0x1210ED48 + lvx 25,3,7 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + + .long 0x10E7F548 + .long 0x118CF548 + vand 11,11,10 + .long 0x11ADF548 + .long 0x11CEF548 + vxor 8,8,11 + .long 0x11EFF548 + .long 0x1210F548 + vxor 5,22,31 + vsrab 11,8,9 + vxor 22,8,23 + + .long 0x10E70549 + .long 0x7C005699 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + .long 0x118C0D49 + .long 0x7C235699 + .long 0x11AD1549 + vperm 0,0,0,6 + .long 0x7C5A5699 + vand 11,11,10 + .long 0x11CE1D49 + vperm 1,1,1,6 + .long 0x7C7B5699 + .long 0x11EF2549 + vperm 2,2,2,6 + .long 0x7C9C5699 + vxor 8,8,11 + .long 0x12102D49 + vperm 3,3,3,6 + .long 0x7CBD5699 + addi 10,10,0x60 + vperm 4,4,4,6 + vperm 5,5,5,6 + + vperm 7,7,7,6 + vperm 12,12,12,6 + .long 0x7CE02799 + vxor 7,0,17 + vperm 13,13,13,6 + .long 0x7D832799 + vxor 12,1,18 + vperm 14,14,14,6 + .long 0x7DBA2799 + vxor 13,2,19 + vperm 15,15,15,6 + .long 0x7DDB2799 + vxor 14,3,20 + vperm 16,16,16,6 + .long 0x7DFC2799 + vxor 15,4,21 + .long 0x7E1D2799 + vxor 16,5,22 + addi 4,4,0x60 + + mtctr 9 + beq .Loop_xts_dec6x + + addic. 
5,5,0x60 + beq .Lxts_dec6x_zero + cmpwi 5,0x20 + blt .Lxts_dec6x_one + nop + beq .Lxts_dec6x_two + cmpwi 5,0x40 + blt .Lxts_dec6x_three + nop + beq .Lxts_dec6x_four + +.Lxts_dec6x_five: + vxor 7,1,17 + vxor 12,2,18 + vxor 13,3,19 + vxor 14,4,20 + vxor 15,5,21 + + bl _aesp8_xts_dec5x + + vperm 7,7,7,6 + vor 17,22,22 + vxor 18,8,23 + vperm 12,12,12,6 + .long 0x7CE02799 + vxor 7,0,18 + vperm 13,13,13,6 + .long 0x7D832799 + vperm 14,14,14,6 + .long 0x7DBA2799 + vperm 15,15,15,6 + .long 0x7DDB2799 + .long 0x7DFC2799 + addi 4,4,0x50 + bne .Lxts_dec6x_steal + b .Lxts_dec6x_done + +.align 4 +.Lxts_dec6x_four: + vxor 7,2,17 + vxor 12,3,18 + vxor 13,4,19 + vxor 14,5,20 + vxor 15,15,15 + + bl _aesp8_xts_dec5x + + vperm 7,7,7,6 + vor 17,21,21 + vor 18,22,22 + vperm 12,12,12,6 + .long 0x7CE02799 + vxor 7,0,22 + vperm 13,13,13,6 + .long 0x7D832799 + vperm 14,14,14,6 + .long 0x7DBA2799 + .long 0x7DDB2799 + addi 4,4,0x40 + bne .Lxts_dec6x_steal + b .Lxts_dec6x_done + +.align 4 +.Lxts_dec6x_three: + vxor 7,3,17 + vxor 12,4,18 + vxor 13,5,19 + vxor 14,14,14 + vxor 15,15,15 + + bl _aesp8_xts_dec5x + + vperm 7,7,7,6 + vor 17,20,20 + vor 18,21,21 + vperm 12,12,12,6 + .long 0x7CE02799 + vxor 7,0,21 + vperm 13,13,13,6 + .long 0x7D832799 + .long 0x7DBA2799 + addi 4,4,0x30 + bne .Lxts_dec6x_steal + b .Lxts_dec6x_done + +.align 4 +.Lxts_dec6x_two: + vxor 7,4,17 + vxor 12,5,18 + vxor 13,13,13 + vxor 14,14,14 + vxor 15,15,15 + + bl _aesp8_xts_dec5x + + vperm 7,7,7,6 + vor 17,19,19 + vor 18,20,20 + vperm 12,12,12,6 + .long 0x7CE02799 + vxor 7,0,20 + .long 0x7D832799 + addi 4,4,0x20 + bne .Lxts_dec6x_steal + b .Lxts_dec6x_done + +.align 4 +.Lxts_dec6x_one: + vxor 7,5,17 + nop +.Loop_xts_dec1x: + .long 0x10E7C548 + lvx 24,26,7 + addi 7,7,0x20 + + .long 0x10E7CD48 + lvx 25,3,7 + bdnz .Loop_xts_dec1x + + subi 0,31,1 + .long 0x10E7C548 + + andi. 
0,0,16 + cmpwi 31,0 + .long 0x10E7CD48 + + sub 10,10,0 + .long 0x10E7D548 + + .long 0x7C005699 + .long 0x10E7DD48 + + addi 7,1,79 + .long 0x10E7E548 + lvx 24,0,7 + + .long 0x10E7ED48 + lvx 25,3,7 + vxor 17,17,31 + + vperm 0,0,0,6 + .long 0x10E7F548 + + mtctr 9 + .long 0x10E78D49 + + vor 17,18,18 + vor 18,19,19 + vperm 7,7,7,6 + .long 0x7CE02799 + addi 4,4,0x10 + vxor 7,0,19 + bne .Lxts_dec6x_steal + b .Lxts_dec6x_done + +.align 4 +.Lxts_dec6x_zero: + cmpwi 31,0 + beq .Lxts_dec6x_done + + .long 0x7C005699 + vperm 0,0,0,6 + vxor 7,0,18 +.Lxts_dec6x_steal: + .long 0x10E7C548 + lvx 24,26,7 + addi 7,7,0x20 + + .long 0x10E7CD48 + lvx 25,3,7 + bdnz .Lxts_dec6x_steal + + add 10,10,31 + .long 0x10E7C548 + + cmpwi 31,0 + .long 0x10E7CD48 + + .long 0x7C005699 + .long 0x10E7D548 + + lvsr 5,0,31 + .long 0x10E7DD48 + + addi 7,1,79 + .long 0x10E7E548 + lvx 24,0,7 + + .long 0x10E7ED48 + lvx 25,3,7 + vxor 18,18,31 + + vperm 0,0,0,6 + .long 0x10E7F548 + + vperm 0,0,0,5 + .long 0x11679549 + + vperm 7,11,11,6 + .long 0x7CE02799 + + + vxor 7,7,7 + vspltisb 12,-1 + vperm 7,7,12,5 + vsel 7,0,11,7 + vxor 7,7,17 + + subi 30,4,1 + mtctr 31 +.Loop_xts_dec6x_steal: + lbzu 0,1(30) + stb 0,16(30) + bdnz .Loop_xts_dec6x_steal + + li 31,0 + mtctr 9 + b .Loop_xts_dec1x + +.align 4 +.Lxts_dec6x_done: + cmpldi 8,0 + beq .Lxts_dec6x_ret + + vxor 8,17,23 + vperm 8,8,8,6 + .long 0x7D004799 + +.Lxts_dec6x_ret: + mtlr 11 + li 10,79 + li 11,95 + stvx 9,10,1 + addi 10,10,32 + stvx 9,11,1 + addi 11,11,32 + stvx 9,10,1 + addi 10,10,32 + stvx 9,11,1 + addi 11,11,32 + stvx 9,10,1 + addi 10,10,32 + stvx 9,11,1 + addi 11,11,32 + stvx 9,10,1 + addi 10,10,32 + stvx 9,11,1 + addi 11,11,32 + + or 12,12,12 + lvx 20,10,1 + addi 10,10,32 + lvx 21,11,1 + addi 11,11,32 + lvx 22,10,1 + addi 10,10,32 + lvx 23,11,1 + addi 11,11,32 + lvx 24,10,1 + addi 10,10,32 + lvx 25,11,1 + addi 11,11,32 + lvx 26,10,1 + addi 10,10,32 + lvx 27,11,1 + addi 11,11,32 + lvx 28,10,1 + addi 10,10,32 + lvx 29,11,1 + addi 11,11,32 + lvx 30,10,1 + lvx 31,11,1 + ld 26,400(1) + ld 27,408(1) + ld 28,416(1) + ld 29,424(1) + ld 30,432(1) + ld 31,440(1) + addi 1,1,448 + blr +.long 0 +.byte 0,12,0x04,1,0x80,6,6,0 +.long 0 + +.align 5 +_aesp8_xts_dec5x: + .long 0x10E7C548 + .long 0x118CC548 + .long 0x11ADC548 + .long 0x11CEC548 + .long 0x11EFC548 + lvx 24,26,7 + addi 7,7,0x20 + + .long 0x10E7CD48 + .long 0x118CCD48 + .long 0x11ADCD48 + .long 0x11CECD48 + .long 0x11EFCD48 + lvx 25,3,7 + bdnz _aesp8_xts_dec5x + + subi 0,31,1 + .long 0x10E7C548 + .long 0x118CC548 + .long 0x11ADC548 + .long 0x11CEC548 + .long 0x11EFC548 + + andi. 
0,0,16 + cmpwi 31,0 + .long 0x10E7CD48 + .long 0x118CCD48 + .long 0x11ADCD48 + .long 0x11CECD48 + .long 0x11EFCD48 + vxor 17,17,31 + + sub 10,10,0 + .long 0x10E7D548 + .long 0x118CD548 + .long 0x11ADD548 + .long 0x11CED548 + .long 0x11EFD548 + vxor 1,18,31 + + .long 0x10E7DD48 + .long 0x7C005699 + .long 0x118CDD48 + .long 0x11ADDD48 + .long 0x11CEDD48 + .long 0x11EFDD48 + vxor 2,19,31 + + addi 7,1,79 + .long 0x10E7E548 + .long 0x118CE548 + .long 0x11ADE548 + .long 0x11CEE548 + .long 0x11EFE548 + lvx 24,0,7 + vxor 3,20,31 + + .long 0x10E7ED48 + vperm 0,0,0,6 + .long 0x118CED48 + .long 0x11ADED48 + .long 0x11CEED48 + .long 0x11EFED48 + lvx 25,3,7 + vxor 4,21,31 + + .long 0x10E7F548 + .long 0x118CF548 + .long 0x11ADF548 + .long 0x11CEF548 + .long 0x11EFF548 + + .long 0x10E78D49 + .long 0x118C0D49 + .long 0x11AD1549 + .long 0x11CE1D49 + .long 0x11EF2549 + mtctr 9 + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +#endif // !OPENSSL_NO_ASM && __powerpc64__ +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-ppc64le/crypto/fipsmodule/ghashp8-ppc.S b/packager/third_party/boringssl/linux-ppc64le/crypto/fipsmodule/ghashp8-ppc.S new file mode 100644 index 0000000000..5b909a38d3 --- /dev/null +++ b/packager/third_party/boringssl/linux-ppc64le/crypto/fipsmodule/ghashp8-ppc.S @@ -0,0 +1,587 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + +#if !defined(OPENSSL_NO_ASM) && defined(__powerpc64__) +.machine "any" + +.abiversion 2 +.text + +.globl gcm_init_p8 +.type gcm_init_p8,@function +.align 5 +gcm_init_p8: +.localentry gcm_init_p8,0 + + li 0,-4096 + li 8,0x10 + li 12,-1 + li 9,0x20 + or 0,0,0 + li 10,0x30 + .long 0x7D202699 + + vspltisb 8,-16 + vspltisb 5,1 + vaddubm 8,8,8 + vxor 4,4,4 + vor 8,8,5 + vsldoi 8,8,4,15 + vsldoi 6,4,5,1 + vaddubm 8,8,8 + vspltisb 7,7 + vor 8,8,6 + vspltb 6,9,0 + vsl 9,9,5 + vsrab 6,6,7 + vand 6,6,8 + vxor 3,9,6 + + vsldoi 9,3,3,8 + vsldoi 8,4,8,8 + vsldoi 11,4,9,8 + vsldoi 10,9,4,8 + + .long 0x7D001F99 + .long 0x7D681F99 + li 8,0x40 + .long 0x7D291F99 + li 9,0x50 + .long 0x7D4A1F99 + li 10,0x60 + + .long 0x10035CC8 + .long 0x10234CC8 + .long 0x104354C8 + + .long 0x10E044C8 + + vsldoi 5,1,4,8 + vsldoi 6,4,1,8 + vxor 0,0,5 + vxor 2,2,6 + + vsldoi 0,0,0,8 + vxor 0,0,7 + + vsldoi 6,0,0,8 + .long 0x100044C8 + vxor 6,6,2 + vxor 16,0,6 + + vsldoi 17,16,16,8 + vsldoi 19,4,17,8 + vsldoi 18,17,4,8 + + .long 0x7E681F99 + li 8,0x70 + .long 0x7E291F99 + li 9,0x80 + .long 0x7E4A1F99 + li 10,0x90 + .long 0x10039CC8 + .long 0x11B09CC8 + .long 0x10238CC8 + .long 0x11D08CC8 + .long 0x104394C8 + .long 0x11F094C8 + + .long 0x10E044C8 + .long 0x114D44C8 + + vsldoi 5,1,4,8 + vsldoi 6,4,1,8 + vsldoi 11,14,4,8 + vsldoi 9,4,14,8 + vxor 0,0,5 + vxor 2,2,6 + vxor 13,13,11 + vxor 15,15,9 + + vsldoi 0,0,0,8 + vsldoi 13,13,13,8 + vxor 0,0,7 + vxor 13,13,10 + + vsldoi 6,0,0,8 + vsldoi 9,13,13,8 + .long 0x100044C8 + .long 0x11AD44C8 + vxor 6,6,2 + vxor 9,9,15 + vxor 0,0,6 + vxor 13,13,9 + + vsldoi 9,0,0,8 + vsldoi 17,13,13,8 + vsldoi 11,4,9,8 + vsldoi 10,9,4,8 + vsldoi 19,4,17,8 + vsldoi 18,17,4,8 + + .long 0x7D681F99 + li 8,0xa0 + .long 0x7D291F99 + li 9,0xb0 + .long 0x7D4A1F99 + li 10,0xc0 + .long 0x7E681F99 + .long 0x7E291F99 + .long 0x7E4A1F99 + + or 12,12,12 + blr +.long 0 +.byte 0,12,0x14,0,0,0,2,0 +.long 0 +.size gcm_init_p8,.-gcm_init_p8 +.globl gcm_gmult_p8 +.type 
gcm_gmult_p8,@function +.align 5 +gcm_gmult_p8: +.localentry gcm_gmult_p8,0 + + lis 0,0xfff8 + li 8,0x10 + li 12,-1 + li 9,0x20 + or 0,0,0 + li 10,0x30 + .long 0x7C601E99 + + .long 0x7D682699 + lvsl 12,0,0 + .long 0x7D292699 + vspltisb 5,0x07 + .long 0x7D4A2699 + vxor 12,12,5 + .long 0x7D002699 + vperm 3,3,3,12 + vxor 4,4,4 + + .long 0x10035CC8 + .long 0x10234CC8 + .long 0x104354C8 + + .long 0x10E044C8 + + vsldoi 5,1,4,8 + vsldoi 6,4,1,8 + vxor 0,0,5 + vxor 2,2,6 + + vsldoi 0,0,0,8 + vxor 0,0,7 + + vsldoi 6,0,0,8 + .long 0x100044C8 + vxor 6,6,2 + vxor 0,0,6 + + vperm 0,0,0,12 + .long 0x7C001F99 + + or 12,12,12 + blr +.long 0 +.byte 0,12,0x14,0,0,0,2,0 +.long 0 +.size gcm_gmult_p8,.-gcm_gmult_p8 + +.globl gcm_ghash_p8 +.type gcm_ghash_p8,@function +.align 5 +gcm_ghash_p8: +.localentry gcm_ghash_p8,0 + + li 0,-4096 + li 8,0x10 + li 12,-1 + li 9,0x20 + or 0,0,0 + li 10,0x30 + .long 0x7C001E99 + + .long 0x7D682699 + li 8,0x40 + lvsl 12,0,0 + .long 0x7D292699 + li 9,0x50 + vspltisb 5,0x07 + .long 0x7D4A2699 + li 10,0x60 + vxor 12,12,5 + .long 0x7D002699 + vperm 0,0,0,12 + vxor 4,4,4 + + cmpldi 6,64 + bge .Lgcm_ghash_p8_4x + + .long 0x7C602E99 + addi 5,5,16 + subic. 6,6,16 + vperm 3,3,3,12 + vxor 3,3,0 + beq .Lshort + + .long 0x7E682699 + li 8,16 + .long 0x7E292699 + add 9,5,6 + .long 0x7E4A2699 + + +.align 5 +.Loop_2x: + .long 0x7E002E99 + vperm 16,16,16,12 + + subic 6,6,32 + .long 0x10039CC8 + .long 0x11B05CC8 + subfe 0,0,0 + .long 0x10238CC8 + .long 0x11D04CC8 + and 0,0,6 + .long 0x104394C8 + .long 0x11F054C8 + add 5,5,0 + + vxor 0,0,13 + vxor 1,1,14 + + .long 0x10E044C8 + + vsldoi 5,1,4,8 + vsldoi 6,4,1,8 + vxor 2,2,15 + vxor 0,0,5 + vxor 2,2,6 + + vsldoi 0,0,0,8 + vxor 0,0,7 + .long 0x7C682E99 + addi 5,5,32 + + vsldoi 6,0,0,8 + .long 0x100044C8 + vperm 3,3,3,12 + vxor 6,6,2 + vxor 3,3,6 + vxor 3,3,0 + cmpld 9,5 + bgt .Loop_2x + + cmplwi 6,0 + bne .Leven + +.Lshort: + .long 0x10035CC8 + .long 0x10234CC8 + .long 0x104354C8 + + .long 0x10E044C8 + + vsldoi 5,1,4,8 + vsldoi 6,4,1,8 + vxor 0,0,5 + vxor 2,2,6 + + vsldoi 0,0,0,8 + vxor 0,0,7 + + vsldoi 6,0,0,8 + .long 0x100044C8 + vxor 6,6,2 + +.Leven: + vxor 0,0,6 + vperm 0,0,0,12 + .long 0x7C001F99 + + or 12,12,12 + blr +.long 0 +.byte 0,12,0x14,0,0,0,4,0 +.long 0 +.align 5 +.gcm_ghash_p8_4x: +.Lgcm_ghash_p8_4x: + stdu 1,-256(1) + li 10,63 + li 11,79 + stvx 20,10,1 + addi 10,10,32 + stvx 21,11,1 + addi 11,11,32 + stvx 22,10,1 + addi 10,10,32 + stvx 23,11,1 + addi 11,11,32 + stvx 24,10,1 + addi 10,10,32 + stvx 25,11,1 + addi 11,11,32 + stvx 26,10,1 + addi 10,10,32 + stvx 27,11,1 + addi 11,11,32 + stvx 28,10,1 + addi 10,10,32 + stvx 29,11,1 + addi 11,11,32 + stvx 30,10,1 + li 10,0x60 + stvx 31,11,1 + li 0,-1 + stw 12,252(1) + or 0,0,0 + + lvsl 5,0,8 + + li 8,0x70 + .long 0x7E292699 + li 9,0x80 + vspltisb 6,8 + + li 10,0x90 + .long 0x7EE82699 + li 8,0xa0 + .long 0x7F092699 + li 9,0xb0 + .long 0x7F2A2699 + li 10,0xc0 + .long 0x7FA82699 + li 8,0x10 + .long 0x7FC92699 + li 9,0x20 + .long 0x7FEA2699 + li 10,0x30 + + vsldoi 7,4,6,8 + vaddubm 18,5,7 + vaddubm 19,6,18 + + srdi 6,6,4 + + .long 0x7C602E99 + .long 0x7E082E99 + subic. 
6,6,8 + .long 0x7EC92E99 + .long 0x7F8A2E99 + addi 5,5,0x40 + vperm 3,3,3,12 + vperm 16,16,16,12 + vperm 22,22,22,12 + vperm 28,28,28,12 + + vxor 2,3,0 + + .long 0x11B0BCC8 + .long 0x11D0C4C8 + .long 0x11F0CCC8 + + vperm 11,17,9,18 + vperm 5,22,28,19 + vperm 10,17,9,19 + vperm 6,22,28,18 + .long 0x12B68CC8 + .long 0x12855CC8 + .long 0x137C4CC8 + .long 0x134654C8 + + vxor 21,21,14 + vxor 20,20,13 + vxor 27,27,21 + vxor 26,26,15 + + blt .Ltail_4x + +.Loop_4x: + .long 0x7C602E99 + .long 0x7E082E99 + subic. 6,6,4 + .long 0x7EC92E99 + .long 0x7F8A2E99 + addi 5,5,0x40 + vperm 16,16,16,12 + vperm 22,22,22,12 + vperm 28,28,28,12 + vperm 3,3,3,12 + + .long 0x1002ECC8 + .long 0x1022F4C8 + .long 0x1042FCC8 + .long 0x11B0BCC8 + .long 0x11D0C4C8 + .long 0x11F0CCC8 + + vxor 0,0,20 + vxor 1,1,27 + vxor 2,2,26 + vperm 5,22,28,19 + vperm 6,22,28,18 + + .long 0x10E044C8 + .long 0x12855CC8 + .long 0x134654C8 + + vsldoi 5,1,4,8 + vsldoi 6,4,1,8 + vxor 0,0,5 + vxor 2,2,6 + + vsldoi 0,0,0,8 + vxor 0,0,7 + + vsldoi 6,0,0,8 + .long 0x12B68CC8 + .long 0x137C4CC8 + .long 0x100044C8 + + vxor 20,20,13 + vxor 26,26,15 + vxor 2,2,3 + vxor 21,21,14 + vxor 2,2,6 + vxor 27,27,21 + vxor 2,2,0 + bge .Loop_4x + +.Ltail_4x: + .long 0x1002ECC8 + .long 0x1022F4C8 + .long 0x1042FCC8 + + vxor 0,0,20 + vxor 1,1,27 + + .long 0x10E044C8 + + vsldoi 5,1,4,8 + vsldoi 6,4,1,8 + vxor 2,2,26 + vxor 0,0,5 + vxor 2,2,6 + + vsldoi 0,0,0,8 + vxor 0,0,7 + + vsldoi 6,0,0,8 + .long 0x100044C8 + vxor 6,6,2 + vxor 0,0,6 + + addic. 6,6,4 + beq .Ldone_4x + + .long 0x7C602E99 + cmpldi 6,2 + li 6,-4 + blt .Lone + .long 0x7E082E99 + beq .Ltwo + +.Lthree: + .long 0x7EC92E99 + vperm 3,3,3,12 + vperm 16,16,16,12 + vperm 22,22,22,12 + + vxor 2,3,0 + vor 29,23,23 + vor 30, 24, 24 + vor 31,25,25 + + vperm 5,16,22,19 + vperm 6,16,22,18 + .long 0x12B08CC8 + .long 0x13764CC8 + .long 0x12855CC8 + .long 0x134654C8 + + vxor 27,27,21 + b .Ltail_4x + +.align 4 +.Ltwo: + vperm 3,3,3,12 + vperm 16,16,16,12 + + vxor 2,3,0 + vperm 5,4,16,19 + vperm 6,4,16,18 + + vsldoi 29,4,17,8 + vor 30, 17, 17 + vsldoi 31,17,4,8 + + .long 0x12855CC8 + .long 0x13704CC8 + .long 0x134654C8 + + b .Ltail_4x + +.align 4 +.Lone: + vperm 3,3,3,12 + + vsldoi 29,4,9,8 + vor 30, 9, 9 + vsldoi 31,9,4,8 + + vxor 2,3,0 + vxor 20,20,20 + vxor 27,27,27 + vxor 26,26,26 + + b .Ltail_4x + +.Ldone_4x: + vperm 0,0,0,12 + .long 0x7C001F99 + + li 10,63 + li 11,79 + or 12,12,12 + lvx 20,10,1 + addi 10,10,32 + lvx 21,11,1 + addi 11,11,32 + lvx 22,10,1 + addi 10,10,32 + lvx 23,11,1 + addi 11,11,32 + lvx 24,10,1 + addi 10,10,32 + lvx 25,11,1 + addi 11,11,32 + lvx 26,10,1 + addi 10,10,32 + lvx 27,11,1 + addi 11,11,32 + lvx 28,10,1 + addi 10,10,32 + lvx 29,11,1 + addi 11,11,32 + lvx 30,10,1 + lvx 31,11,1 + addi 1,1,256 + blr +.long 0 +.byte 0,12,0x04,0,0x80,0,4,0 +.long 0 +.size gcm_ghash_p8,.-gcm_ghash_p8 + +.byte 71,72,65,83,72,32,102,111,114,32,80,111,119,101,114,73,83,65,32,50,46,48,55,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM && __powerpc64__ +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86/crypto/chacha/chacha-x86.S b/packager/third_party/boringssl/linux-x86/crypto/chacha/chacha-x86.S index 519081bb98..feceb5d9f8 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/chacha/chacha-x86.S +++ b/packager/third_party/boringssl/linux-x86/crypto/chacha/chacha-x86.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl 
script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl ChaCha20_ctr32 .hidden ChaCha20_ctr32 @@ -966,3 +972,4 @@ ChaCha20_ssse3: .byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 .byte 114,103,62,0 #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/aes-586.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/aes-586.S index 319ed627f5..e43cfea942 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/aes-586.S +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/aes-586.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .hidden _x86_AES_encrypt_compact .type _x86_AES_encrypt_compact,@function @@ -979,12 +985,12 @@ _x86_AES_encrypt: .long 27,54,0,0 .long 0,0,0,0 .size _x86_AES_encrypt,.-_x86_AES_encrypt -.globl asm_AES_encrypt -.hidden asm_AES_encrypt -.type asm_AES_encrypt,@function +.globl aes_nohw_encrypt +.hidden aes_nohw_encrypt +.type aes_nohw_encrypt,@function .align 16 -asm_AES_encrypt: -.L_asm_AES_encrypt_begin: +aes_nohw_encrypt: +.L_aes_nohw_encrypt_begin: pushl %ebp pushl %ebx pushl %esi @@ -1044,7 +1050,7 @@ asm_AES_encrypt: popl %ebx popl %ebp ret -.size asm_AES_encrypt,.-.L_asm_AES_encrypt_begin +.size aes_nohw_encrypt,.-.L_aes_nohw_encrypt_begin .hidden _x86_AES_decrypt_compact .type _x86_AES_decrypt_compact,@function .align 16 @@ -2175,12 +2181,12 @@ _x86_AES_decrypt: .byte 23,43,4,126,186,119,214,38 .byte 225,105,20,99,85,33,12,125 .size _x86_AES_decrypt,.-_x86_AES_decrypt -.globl asm_AES_decrypt -.hidden asm_AES_decrypt -.type asm_AES_decrypt,@function +.globl aes_nohw_decrypt +.hidden aes_nohw_decrypt +.type aes_nohw_decrypt,@function .align 16 -asm_AES_decrypt: -.L_asm_AES_decrypt_begin: +aes_nohw_decrypt: +.L_aes_nohw_decrypt_begin: pushl %ebp pushl %ebx pushl %esi @@ -2240,13 +2246,13 @@ asm_AES_decrypt: popl %ebx popl %ebp ret -.size asm_AES_decrypt,.-.L_asm_AES_decrypt_begin -.globl asm_AES_cbc_encrypt -.hidden asm_AES_cbc_encrypt -.type asm_AES_cbc_encrypt,@function +.size aes_nohw_decrypt,.-.L_aes_nohw_decrypt_begin +.globl aes_nohw_cbc_encrypt +.hidden aes_nohw_cbc_encrypt +.type aes_nohw_cbc_encrypt,@function .align 16 -asm_AES_cbc_encrypt: -.L_asm_AES_cbc_encrypt_begin: +aes_nohw_cbc_encrypt: +.L_aes_nohw_cbc_encrypt_begin: pushl %ebp pushl %ebx pushl %esi @@ -2774,7 +2780,7 @@ asm_AES_cbc_encrypt: popl %ebx popl %ebp ret -.size asm_AES_cbc_encrypt,.-.L_asm_AES_cbc_encrypt_begin +.size aes_nohw_cbc_encrypt,.-.L_aes_nohw_cbc_encrypt_begin .hidden _x86_AES_set_encrypt_key .type _x86_AES_set_encrypt_key,@function .align 16 @@ -3006,21 +3012,21 @@ _x86_AES_set_encrypt_key: popl %ebp ret .size _x86_AES_set_encrypt_key,.-_x86_AES_set_encrypt_key -.globl asm_AES_set_encrypt_key -.hidden asm_AES_set_encrypt_key -.type asm_AES_set_encrypt_key,@function +.globl aes_nohw_set_encrypt_key +.hidden aes_nohw_set_encrypt_key +.type aes_nohw_set_encrypt_key,@function .align 16 -asm_AES_set_encrypt_key: -.L_asm_AES_set_encrypt_key_begin: +aes_nohw_set_encrypt_key: +.L_aes_nohw_set_encrypt_key_begin: call _x86_AES_set_encrypt_key ret -.size asm_AES_set_encrypt_key,.-.L_asm_AES_set_encrypt_key_begin -.globl asm_AES_set_decrypt_key -.hidden asm_AES_set_decrypt_key -.type asm_AES_set_decrypt_key,@function +.size 
aes_nohw_set_encrypt_key,.-.L_aes_nohw_set_encrypt_key_begin +.globl aes_nohw_set_decrypt_key +.hidden aes_nohw_set_decrypt_key +.type aes_nohw_set_decrypt_key,@function .align 16 -asm_AES_set_decrypt_key: -.L_asm_AES_set_decrypt_key_begin: +aes_nohw_set_decrypt_key: +.L_aes_nohw_set_decrypt_key_begin: call _x86_AES_set_encrypt_key cmpl $0,%eax je .L054proceed @@ -3249,8 +3255,9 @@ asm_AES_set_decrypt_key: popl %ebx popl %ebp ret -.size asm_AES_set_decrypt_key,.-.L_asm_AES_set_decrypt_key_begin +.size aes_nohw_set_decrypt_key,.-.L_aes_nohw_set_decrypt_key_begin .byte 65,69,83,32,102,111,114,32,120,56,54,44,32,67,82,89 .byte 80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114 .byte 111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/aesni-x86.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/aesni-x86.S index cc53fa46df..a418869701 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/aesni-x86.S +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/aesni-x86.S @@ -1,11 +1,31 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text -.globl aesni_encrypt -.hidden aesni_encrypt -.type aesni_encrypt,@function +#ifdef BORINGSSL_DISPATCH_TEST +#endif +.globl aes_hw_encrypt +.hidden aes_hw_encrypt +.type aes_hw_encrypt,@function .align 16 -aesni_encrypt: -.L_aesni_encrypt_begin: +aes_hw_encrypt: +.L_aes_hw_encrypt_begin: +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call .L000pic +.L000pic: + popl %ebx + leal BORINGSSL_function_hit+1-.L000pic(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif movl 4(%esp),%eax movl 12(%esp),%edx movups (%eax),%xmm2 @@ -15,25 +35,25 @@ aesni_encrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L000enc1_loop_1: +.L001enc1_loop_1: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L000enc1_loop_1 + jnz .L001enc1_loop_1 .byte 102,15,56,221,209 pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 movups %xmm2,(%eax) pxor %xmm2,%xmm2 ret -.size aesni_encrypt,.-.L_aesni_encrypt_begin -.globl aesni_decrypt -.hidden aesni_decrypt -.type aesni_decrypt,@function +.size aes_hw_encrypt,.-.L_aes_hw_encrypt_begin +.globl aes_hw_decrypt +.hidden aes_hw_decrypt +.type aes_hw_decrypt,@function .align 16 -aesni_decrypt: -.L_aesni_decrypt_begin: +aes_hw_decrypt: +.L_aes_hw_decrypt_begin: movl 4(%esp),%eax movl 12(%esp),%edx movups (%eax),%xmm2 @@ -43,19 +63,19 @@ aesni_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L001dec1_loop_2: +.L002dec1_loop_2: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L001dec1_loop_2 + jnz .L002dec1_loop_2 .byte 102,15,56,223,209 pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 movups %xmm2,(%eax) pxor %xmm2,%xmm2 ret -.size aesni_decrypt,.-.L_aesni_decrypt_begin +.size aes_hw_decrypt,.-.L_aes_hw_decrypt_begin .hidden _aesni_encrypt2 .type _aesni_encrypt2,@function .align 16 @@ -69,7 +89,7 @@ _aesni_encrypt2: leal 32(%edx,%ecx,1),%edx negl %ecx addl $16,%ecx -.L002enc2_loop: +.L003enc2_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 movups (%edx,%ecx,1),%xmm1 @@ -77,7 +97,7 @@ _aesni_encrypt2: .byte 102,15,56,220,208 .byte 102,15,56,220,216 movups -16(%edx,%ecx,1),%xmm0 - jnz .L002enc2_loop + jnz .L003enc2_loop .byte 102,15,56,220,209 .byte 
102,15,56,220,217 .byte 102,15,56,221,208 @@ -97,7 +117,7 @@ _aesni_decrypt2: leal 32(%edx,%ecx,1),%edx negl %ecx addl $16,%ecx -.L003dec2_loop: +.L004dec2_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 movups (%edx,%ecx,1),%xmm1 @@ -105,7 +125,7 @@ _aesni_decrypt2: .byte 102,15,56,222,208 .byte 102,15,56,222,216 movups -16(%edx,%ecx,1),%xmm0 - jnz .L003dec2_loop + jnz .L004dec2_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,223,208 @@ -126,7 +146,7 @@ _aesni_encrypt3: leal 32(%edx,%ecx,1),%edx negl %ecx addl $16,%ecx -.L004enc3_loop: +.L005enc3_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -136,7 +156,7 @@ _aesni_encrypt3: .byte 102,15,56,220,216 .byte 102,15,56,220,224 movups -16(%edx,%ecx,1),%xmm0 - jnz .L004enc3_loop + jnz .L005enc3_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -159,7 +179,7 @@ _aesni_decrypt3: leal 32(%edx,%ecx,1),%edx negl %ecx addl $16,%ecx -.L005dec3_loop: +.L006dec3_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -169,7 +189,7 @@ _aesni_decrypt3: .byte 102,15,56,222,216 .byte 102,15,56,222,224 movups -16(%edx,%ecx,1),%xmm0 - jnz .L005dec3_loop + jnz .L006dec3_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -194,7 +214,7 @@ _aesni_encrypt4: negl %ecx .byte 15,31,64,0 addl $16,%ecx -.L006enc4_loop: +.L007enc4_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -206,7 +226,7 @@ _aesni_encrypt4: .byte 102,15,56,220,224 .byte 102,15,56,220,232 movups -16(%edx,%ecx,1),%xmm0 - jnz .L006enc4_loop + jnz .L007enc4_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -233,7 +253,7 @@ _aesni_decrypt4: negl %ecx .byte 15,31,64,0 addl $16,%ecx -.L007dec4_loop: +.L008dec4_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -245,7 +265,7 @@ _aesni_decrypt4: .byte 102,15,56,222,224 .byte 102,15,56,222,232 movups -16(%edx,%ecx,1),%xmm0 - jnz .L007dec4_loop + jnz .L008dec4_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -276,13 +296,13 @@ _aesni_encrypt6: pxor %xmm0,%xmm7 movups (%edx,%ecx,1),%xmm0 addl $16,%ecx - jmp .L008_aesni_encrypt6_inner + jmp .L009_aesni_encrypt6_inner .align 16 -.L009enc6_loop: +.L010enc6_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 -.L008_aesni_encrypt6_inner: +.L009_aesni_encrypt6_inner: .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 @@ -296,7 +316,7 @@ _aesni_encrypt6: .byte 102,15,56,220,240 .byte 102,15,56,220,248 movups -16(%edx,%ecx,1),%xmm0 - jnz .L009enc6_loop + jnz .L010enc6_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -331,13 +351,13 @@ _aesni_decrypt6: pxor %xmm0,%xmm7 movups (%edx,%ecx,1),%xmm0 addl $16,%ecx - jmp .L010_aesni_decrypt6_inner + jmp .L011_aesni_decrypt6_inner .align 16 -.L011dec6_loop: +.L012dec6_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 -.L010_aesni_decrypt6_inner: +.L011_aesni_decrypt6_inner: .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 @@ -351,7 +371,7 @@ _aesni_decrypt6: .byte 102,15,56,222,240 .byte 102,15,56,222,248 movups -16(%edx,%ecx,1),%xmm0 - jnz .L011dec6_loop + jnz .L012dec6_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -366,12 +386,12 @@ _aesni_decrypt6: .byte 102,15,56,223,248 ret .size _aesni_decrypt6,.-_aesni_decrypt6 -.globl aesni_ecb_encrypt -.hidden 
aesni_ecb_encrypt -.type aesni_ecb_encrypt,@function +.globl aes_hw_ecb_encrypt +.hidden aes_hw_ecb_encrypt +.type aes_hw_ecb_encrypt,@function .align 16 -aesni_ecb_encrypt: -.L_aesni_ecb_encrypt_begin: +aes_hw_ecb_encrypt: +.L_aes_hw_ecb_encrypt_begin: pushl %ebp pushl %ebx pushl %esi @@ -382,14 +402,14 @@ aesni_ecb_encrypt: movl 32(%esp),%edx movl 36(%esp),%ebx andl $-16,%eax - jz .L012ecb_ret + jz .L013ecb_ret movl 240(%edx),%ecx testl %ebx,%ebx - jz .L013ecb_decrypt + jz .L014ecb_decrypt movl %edx,%ebp movl %ecx,%ebx cmpl $96,%eax - jb .L014ecb_enc_tail + jb .L015ecb_enc_tail movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -398,9 +418,9 @@ aesni_ecb_encrypt: movdqu 80(%esi),%xmm7 leal 96(%esi),%esi subl $96,%eax - jmp .L015ecb_enc_loop6_enter + jmp .L016ecb_enc_loop6_enter .align 16 -.L016ecb_enc_loop6: +.L017ecb_enc_loop6: movups %xmm2,(%edi) movdqu (%esi),%xmm2 movups %xmm3,16(%edi) @@ -415,12 +435,12 @@ aesni_ecb_encrypt: leal 96(%edi),%edi movdqu 80(%esi),%xmm7 leal 96(%esi),%esi -.L015ecb_enc_loop6_enter: +.L016ecb_enc_loop6_enter: call _aesni_encrypt6 movl %ebp,%edx movl %ebx,%ecx subl $96,%eax - jnc .L016ecb_enc_loop6 + jnc .L017ecb_enc_loop6 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) @@ -429,18 +449,18 @@ aesni_ecb_encrypt: movups %xmm7,80(%edi) leal 96(%edi),%edi addl $96,%eax - jz .L012ecb_ret -.L014ecb_enc_tail: + jz .L013ecb_ret +.L015ecb_enc_tail: movups (%esi),%xmm2 cmpl $32,%eax - jb .L017ecb_enc_one + jb .L018ecb_enc_one movups 16(%esi),%xmm3 - je .L018ecb_enc_two + je .L019ecb_enc_two movups 32(%esi),%xmm4 cmpl $64,%eax - jb .L019ecb_enc_three + jb .L020ecb_enc_three movups 48(%esi),%xmm5 - je .L020ecb_enc_four + je .L021ecb_enc_four movups 64(%esi),%xmm6 xorps %xmm7,%xmm7 call _aesni_encrypt6 @@ -449,49 +469,49 @@ aesni_ecb_encrypt: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp .L012ecb_ret + jmp .L013ecb_ret .align 16 -.L017ecb_enc_one: +.L018ecb_enc_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L021enc1_loop_3: +.L022enc1_loop_3: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L021enc1_loop_3 + jnz .L022enc1_loop_3 .byte 102,15,56,221,209 movups %xmm2,(%edi) - jmp .L012ecb_ret + jmp .L013ecb_ret .align 16 -.L018ecb_enc_two: +.L019ecb_enc_two: call _aesni_encrypt2 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp .L012ecb_ret + jmp .L013ecb_ret .align 16 -.L019ecb_enc_three: +.L020ecb_enc_three: call _aesni_encrypt3 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp .L012ecb_ret + jmp .L013ecb_ret .align 16 -.L020ecb_enc_four: +.L021ecb_enc_four: call _aesni_encrypt4 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) movups %xmm5,48(%edi) - jmp .L012ecb_ret + jmp .L013ecb_ret .align 16 -.L013ecb_decrypt: +.L014ecb_decrypt: movl %edx,%ebp movl %ecx,%ebx cmpl $96,%eax - jb .L022ecb_dec_tail + jb .L023ecb_dec_tail movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -500,9 +520,9 @@ aesni_ecb_encrypt: movdqu 80(%esi),%xmm7 leal 96(%esi),%esi subl $96,%eax - jmp .L023ecb_dec_loop6_enter + jmp .L024ecb_dec_loop6_enter .align 16 -.L024ecb_dec_loop6: +.L025ecb_dec_loop6: movups %xmm2,(%edi) movdqu (%esi),%xmm2 movups %xmm3,16(%edi) @@ -517,12 +537,12 @@ aesni_ecb_encrypt: leal 96(%edi),%edi movdqu 80(%esi),%xmm7 leal 96(%esi),%esi -.L023ecb_dec_loop6_enter: +.L024ecb_dec_loop6_enter: call _aesni_decrypt6 movl %ebp,%edx movl %ebx,%ecx subl $96,%eax - jnc .L024ecb_dec_loop6 + jnc 
.L025ecb_dec_loop6 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) @@ -531,18 +551,18 @@ aesni_ecb_encrypt: movups %xmm7,80(%edi) leal 96(%edi),%edi addl $96,%eax - jz .L012ecb_ret -.L022ecb_dec_tail: + jz .L013ecb_ret +.L023ecb_dec_tail: movups (%esi),%xmm2 cmpl $32,%eax - jb .L025ecb_dec_one + jb .L026ecb_dec_one movups 16(%esi),%xmm3 - je .L026ecb_dec_two + je .L027ecb_dec_two movups 32(%esi),%xmm4 cmpl $64,%eax - jb .L027ecb_dec_three + jb .L028ecb_dec_three movups 48(%esi),%xmm5 - je .L028ecb_dec_four + je .L029ecb_dec_four movups 64(%esi),%xmm6 xorps %xmm7,%xmm7 call _aesni_decrypt6 @@ -551,43 +571,43 @@ aesni_ecb_encrypt: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp .L012ecb_ret + jmp .L013ecb_ret .align 16 -.L025ecb_dec_one: +.L026ecb_dec_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L029dec1_loop_4: +.L030dec1_loop_4: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L029dec1_loop_4 + jnz .L030dec1_loop_4 .byte 102,15,56,223,209 movups %xmm2,(%edi) - jmp .L012ecb_ret + jmp .L013ecb_ret .align 16 -.L026ecb_dec_two: +.L027ecb_dec_two: call _aesni_decrypt2 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp .L012ecb_ret + jmp .L013ecb_ret .align 16 -.L027ecb_dec_three: +.L028ecb_dec_three: call _aesni_decrypt3 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp .L012ecb_ret + jmp .L013ecb_ret .align 16 -.L028ecb_dec_four: +.L029ecb_dec_four: call _aesni_decrypt4 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) movups %xmm5,48(%edi) -.L012ecb_ret: +.L013ecb_ret: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 @@ -601,13 +621,13 @@ aesni_ecb_encrypt: popl %ebx popl %ebp ret -.size aesni_ecb_encrypt,.-.L_aesni_ecb_encrypt_begin -.globl aesni_ccm64_encrypt_blocks -.hidden aesni_ccm64_encrypt_blocks -.type aesni_ccm64_encrypt_blocks,@function +.size aes_hw_ecb_encrypt,.-.L_aes_hw_ecb_encrypt_begin +.globl aes_hw_ccm64_encrypt_blocks +.hidden aes_hw_ccm64_encrypt_blocks +.type aes_hw_ccm64_encrypt_blocks,@function .align 16 -aesni_ccm64_encrypt_blocks: -.L_aesni_ccm64_encrypt_blocks_begin: +aes_hw_ccm64_encrypt_blocks: +.L_aes_hw_ccm64_encrypt_blocks_begin: pushl %ebp pushl %ebx pushl %esi @@ -643,7 +663,7 @@ aesni_ccm64_encrypt_blocks: leal 32(%edx,%ecx,1),%edx subl %ecx,%ebx .byte 102,15,56,0,253 -.L030ccm64_enc_outer: +.L031ccm64_enc_outer: movups (%ebp),%xmm0 movl %ebx,%ecx movups (%esi),%xmm6 @@ -652,7 +672,7 @@ aesni_ccm64_encrypt_blocks: xorps %xmm6,%xmm0 xorps %xmm0,%xmm3 movups 32(%ebp),%xmm0 -.L031ccm64_enc2_loop: +.L032ccm64_enc2_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 movups (%edx,%ecx,1),%xmm1 @@ -660,7 +680,7 @@ aesni_ccm64_encrypt_blocks: .byte 102,15,56,220,208 .byte 102,15,56,220,216 movups -16(%edx,%ecx,1),%xmm0 - jnz .L031ccm64_enc2_loop + jnz .L032ccm64_enc2_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 paddq 16(%esp),%xmm7 @@ -673,7 +693,7 @@ aesni_ccm64_encrypt_blocks: movups %xmm6,(%edi) .byte 102,15,56,0,213 leal 16(%edi),%edi - jnz .L030ccm64_enc_outer + jnz .L031ccm64_enc_outer movl 48(%esp),%esp movl 40(%esp),%edi movups %xmm3,(%edi) @@ -690,13 +710,13 @@ aesni_ccm64_encrypt_blocks: popl %ebx popl %ebp ret -.size aesni_ccm64_encrypt_blocks,.-.L_aesni_ccm64_encrypt_blocks_begin -.globl aesni_ccm64_decrypt_blocks -.hidden aesni_ccm64_decrypt_blocks -.type aesni_ccm64_decrypt_blocks,@function +.size aes_hw_ccm64_encrypt_blocks,.-.L_aes_hw_ccm64_encrypt_blocks_begin +.globl aes_hw_ccm64_decrypt_blocks 
+.hidden aes_hw_ccm64_decrypt_blocks +.type aes_hw_ccm64_decrypt_blocks,@function .align 16 -aesni_ccm64_decrypt_blocks: -.L_aesni_ccm64_decrypt_blocks_begin: +aes_hw_ccm64_decrypt_blocks: +.L_aes_hw_ccm64_decrypt_blocks_begin: pushl %ebp pushl %ebx pushl %esi @@ -733,12 +753,12 @@ aesni_ccm64_decrypt_blocks: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L032enc1_loop_5: +.L033enc1_loop_5: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L032enc1_loop_5 + jnz .L033enc1_loop_5 .byte 102,15,56,221,209 shll $4,%ebx movl $16,%ecx @@ -748,16 +768,16 @@ aesni_ccm64_decrypt_blocks: subl %ebx,%ecx leal 32(%ebp,%ebx,1),%edx movl %ecx,%ebx - jmp .L033ccm64_dec_outer + jmp .L034ccm64_dec_outer .align 16 -.L033ccm64_dec_outer: +.L034ccm64_dec_outer: xorps %xmm2,%xmm6 movdqa %xmm7,%xmm2 movups %xmm6,(%edi) leal 16(%edi),%edi .byte 102,15,56,0,213 subl $1,%eax - jz .L034ccm64_dec_break + jz .L035ccm64_dec_break movups (%ebp),%xmm0 movl %ebx,%ecx movups 16(%ebp),%xmm1 @@ -765,7 +785,7 @@ aesni_ccm64_decrypt_blocks: xorps %xmm0,%xmm2 xorps %xmm6,%xmm3 movups 32(%ebp),%xmm0 -.L035ccm64_dec2_loop: +.L036ccm64_dec2_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 movups (%edx,%ecx,1),%xmm1 @@ -773,7 +793,7 @@ aesni_ccm64_decrypt_blocks: .byte 102,15,56,220,208 .byte 102,15,56,220,216 movups -16(%edx,%ecx,1),%xmm0 - jnz .L035ccm64_dec2_loop + jnz .L036ccm64_dec2_loop movups (%esi),%xmm6 paddq 16(%esp),%xmm7 .byte 102,15,56,220,209 @@ -781,9 +801,9 @@ aesni_ccm64_decrypt_blocks: .byte 102,15,56,221,208 .byte 102,15,56,221,216 leal 16(%esi),%esi - jmp .L033ccm64_dec_outer + jmp .L034ccm64_dec_outer .align 16 -.L034ccm64_dec_break: +.L035ccm64_dec_break: movl 240(%ebp),%ecx movl %ebp,%edx movups (%edx),%xmm0 @@ -791,12 +811,12 @@ aesni_ccm64_decrypt_blocks: xorps %xmm0,%xmm6 leal 32(%edx),%edx xorps %xmm6,%xmm3 -.L036enc1_loop_6: +.L037enc1_loop_6: .byte 102,15,56,220,217 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L036enc1_loop_6 + jnz .L037enc1_loop_6 .byte 102,15,56,221,217 movl 48(%esp),%esp movl 40(%esp),%edi @@ -814,17 +834,29 @@ aesni_ccm64_decrypt_blocks: popl %ebx popl %ebp ret -.size aesni_ccm64_decrypt_blocks,.-.L_aesni_ccm64_decrypt_blocks_begin -.globl aesni_ctr32_encrypt_blocks -.hidden aesni_ctr32_encrypt_blocks -.type aesni_ctr32_encrypt_blocks,@function +.size aes_hw_ccm64_decrypt_blocks,.-.L_aes_hw_ccm64_decrypt_blocks_begin +.globl aes_hw_ctr32_encrypt_blocks +.hidden aes_hw_ctr32_encrypt_blocks +.type aes_hw_ctr32_encrypt_blocks,@function .align 16 -aesni_ctr32_encrypt_blocks: -.L_aesni_ctr32_encrypt_blocks_begin: +aes_hw_ctr32_encrypt_blocks: +.L_aes_hw_ctr32_encrypt_blocks_begin: pushl %ebp pushl %ebx pushl %esi pushl %edi +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call .L038pic +.L038pic: + popl %ebx + leal BORINGSSL_function_hit+0-.L038pic(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif movl 20(%esp),%esi movl 24(%esp),%edi movl 28(%esp),%eax @@ -835,7 +867,7 @@ aesni_ctr32_encrypt_blocks: andl $-16,%esp movl %ebp,80(%esp) cmpl $1,%eax - je .L037ctr32_one_shortcut + je .L039ctr32_one_shortcut movdqu (%ebx),%xmm7 movl $202182159,(%esp) movl $134810123,4(%esp) @@ -873,7 +905,7 @@ aesni_ctr32_encrypt_blocks: pshufd $192,%xmm0,%xmm2 pshufd $128,%xmm0,%xmm3 cmpl $6,%eax - jb .L038ctr32_tail + jb .L040ctr32_tail pxor %xmm6,%xmm7 shll $4,%ecx movl $16,%ebx @@ -882,9 +914,9 @@ aesni_ctr32_encrypt_blocks: subl %ecx,%ebx leal 32(%edx,%ecx,1),%edx subl $6,%eax - jmp .L039ctr32_loop6 + jmp 
.L041ctr32_loop6 .align 16 -.L039ctr32_loop6: +.L041ctr32_loop6: pshufd $64,%xmm0,%xmm4 movdqa 32(%esp),%xmm0 pshufd $192,%xmm1,%xmm5 @@ -938,27 +970,27 @@ aesni_ctr32_encrypt_blocks: leal 96(%edi),%edi pshufd $128,%xmm0,%xmm3 subl $6,%eax - jnc .L039ctr32_loop6 + jnc .L041ctr32_loop6 addl $6,%eax - jz .L040ctr32_ret + jz .L042ctr32_ret movdqu (%ebp),%xmm7 movl %ebp,%edx pxor 32(%esp),%xmm7 movl 240(%ebp),%ecx -.L038ctr32_tail: +.L040ctr32_tail: por %xmm7,%xmm2 cmpl $2,%eax - jb .L041ctr32_one + jb .L043ctr32_one pshufd $64,%xmm0,%xmm4 por %xmm7,%xmm3 - je .L042ctr32_two + je .L044ctr32_two pshufd $192,%xmm1,%xmm5 por %xmm7,%xmm4 cmpl $4,%eax - jb .L043ctr32_three + jb .L045ctr32_three pshufd $128,%xmm1,%xmm6 por %xmm7,%xmm5 - je .L044ctr32_four + je .L046ctr32_four por %xmm7,%xmm6 call _aesni_encrypt6 movups (%esi),%xmm1 @@ -976,29 +1008,29 @@ aesni_ctr32_encrypt_blocks: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp .L040ctr32_ret + jmp .L042ctr32_ret .align 16 -.L037ctr32_one_shortcut: +.L039ctr32_one_shortcut: movups (%ebx),%xmm2 movl 240(%edx),%ecx -.L041ctr32_one: +.L043ctr32_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L045enc1_loop_7: +.L047enc1_loop_7: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L045enc1_loop_7 + jnz .L047enc1_loop_7 .byte 102,15,56,221,209 movups (%esi),%xmm6 xorps %xmm2,%xmm6 movups %xmm6,(%edi) - jmp .L040ctr32_ret + jmp .L042ctr32_ret .align 16 -.L042ctr32_two: +.L044ctr32_two: call _aesni_encrypt2 movups (%esi),%xmm5 movups 16(%esi),%xmm6 @@ -1006,9 +1038,9 @@ aesni_ctr32_encrypt_blocks: xorps %xmm6,%xmm3 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp .L040ctr32_ret + jmp .L042ctr32_ret .align 16 -.L043ctr32_three: +.L045ctr32_three: call _aesni_encrypt3 movups (%esi),%xmm5 movups 16(%esi),%xmm6 @@ -1019,9 +1051,9 @@ aesni_ctr32_encrypt_blocks: xorps %xmm7,%xmm4 movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp .L040ctr32_ret + jmp .L042ctr32_ret .align 16 -.L044ctr32_four: +.L046ctr32_four: call _aesni_encrypt4 movups (%esi),%xmm6 movups 16(%esi),%xmm7 @@ -1035,7 +1067,7 @@ aesni_ctr32_encrypt_blocks: xorps %xmm0,%xmm5 movups %xmm4,32(%edi) movups %xmm5,48(%edi) -.L040ctr32_ret: +.L042ctr32_ret: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 @@ -1053,13 +1085,13 @@ aesni_ctr32_encrypt_blocks: popl %ebx popl %ebp ret -.size aesni_ctr32_encrypt_blocks,.-.L_aesni_ctr32_encrypt_blocks_begin -.globl aesni_xts_encrypt -.hidden aesni_xts_encrypt -.type aesni_xts_encrypt,@function +.size aes_hw_ctr32_encrypt_blocks,.-.L_aes_hw_ctr32_encrypt_blocks_begin +.globl aes_hw_xts_encrypt +.hidden aes_hw_xts_encrypt +.type aes_hw_xts_encrypt,@function .align 16 -aesni_xts_encrypt: -.L_aesni_xts_encrypt_begin: +aes_hw_xts_encrypt: +.L_aes_hw_xts_encrypt_begin: pushl %ebp pushl %ebx pushl %esi @@ -1072,12 +1104,12 @@ aesni_xts_encrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L046enc1_loop_8: +.L048enc1_loop_8: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L046enc1_loop_8 + jnz .L048enc1_loop_8 .byte 102,15,56,221,209 movl 20(%esp),%esi movl 24(%esp),%edi @@ -1101,14 +1133,14 @@ aesni_xts_encrypt: movl %edx,%ebp movl %ecx,%ebx subl $96,%eax - jc .L047xts_enc_short + jc .L049xts_enc_short shll $4,%ecx movl $16,%ebx subl %ecx,%ebx leal 32(%edx,%ecx,1),%edx - jmp .L048xts_enc_loop6 + jmp .L050xts_enc_loop6 .align 16 -.L048xts_enc_loop6: +.L050xts_enc_loop6: pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,(%esp) @@ 
-1197,23 +1229,23 @@ aesni_xts_encrypt: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 subl $96,%eax - jnc .L048xts_enc_loop6 + jnc .L050xts_enc_loop6 movl 240(%ebp),%ecx movl %ebp,%edx movl %ecx,%ebx -.L047xts_enc_short: +.L049xts_enc_short: addl $96,%eax - jz .L049xts_enc_done6x + jz .L051xts_enc_done6x movdqa %xmm1,%xmm5 cmpl $32,%eax - jb .L050xts_enc_one + jb .L052xts_enc_one pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 - je .L051xts_enc_two + je .L053xts_enc_two pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm6 @@ -1222,7 +1254,7 @@ aesni_xts_encrypt: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 cmpl $64,%eax - jb .L052xts_enc_three + jb .L054xts_enc_three pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm7 @@ -1232,7 +1264,7 @@ aesni_xts_encrypt: pxor %xmm2,%xmm1 movdqa %xmm5,(%esp) movdqa %xmm6,16(%esp) - je .L053xts_enc_four + je .L055xts_enc_four movdqa %xmm7,32(%esp) pshufd $19,%xmm0,%xmm7 movdqa %xmm1,48(%esp) @@ -1264,9 +1296,9 @@ aesni_xts_encrypt: movups %xmm5,48(%edi) movups %xmm6,64(%edi) leal 80(%edi),%edi - jmp .L054xts_enc_done + jmp .L056xts_enc_done .align 16 -.L050xts_enc_one: +.L052xts_enc_one: movups (%esi),%xmm2 leal 16(%esi),%esi xorps %xmm5,%xmm2 @@ -1274,20 +1306,20 @@ aesni_xts_encrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L055enc1_loop_9: +.L057enc1_loop_9: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L055enc1_loop_9 + jnz .L057enc1_loop_9 .byte 102,15,56,221,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) leal 16(%edi),%edi movdqa %xmm5,%xmm1 - jmp .L054xts_enc_done + jmp .L056xts_enc_done .align 16 -.L051xts_enc_two: +.L053xts_enc_two: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1301,9 +1333,9 @@ aesni_xts_encrypt: movups %xmm3,16(%edi) leal 32(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L054xts_enc_done + jmp .L056xts_enc_done .align 16 -.L052xts_enc_three: +.L054xts_enc_three: movaps %xmm1,%xmm7 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1321,9 +1353,9 @@ aesni_xts_encrypt: movups %xmm4,32(%edi) leal 48(%edi),%edi movdqa %xmm7,%xmm1 - jmp .L054xts_enc_done + jmp .L056xts_enc_done .align 16 -.L053xts_enc_four: +.L055xts_enc_four: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1345,28 +1377,28 @@ aesni_xts_encrypt: movups %xmm5,48(%edi) leal 64(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L054xts_enc_done + jmp .L056xts_enc_done .align 16 -.L049xts_enc_done6x: +.L051xts_enc_done6x: movl 112(%esp),%eax andl $15,%eax - jz .L056xts_enc_ret + jz .L058xts_enc_ret movdqa %xmm1,%xmm5 movl %eax,112(%esp) - jmp .L057xts_enc_steal + jmp .L059xts_enc_steal .align 16 -.L054xts_enc_done: +.L056xts_enc_done: movl 112(%esp),%eax pxor %xmm0,%xmm0 andl $15,%eax - jz .L056xts_enc_ret + jz .L058xts_enc_ret pcmpgtd %xmm1,%xmm0 movl %eax,112(%esp) pshufd $19,%xmm0,%xmm5 paddq %xmm1,%xmm1 pand 96(%esp),%xmm5 pxor %xmm1,%xmm5 -.L057xts_enc_steal: +.L059xts_enc_steal: movzbl (%esi),%ecx movzbl -16(%edi),%edx leal 1(%esi),%esi @@ -1374,7 +1406,7 @@ aesni_xts_encrypt: movb %dl,(%edi) leal 1(%edi),%edi subl $1,%eax - jnz .L057xts_enc_steal + jnz .L059xts_enc_steal subl 112(%esp),%edi movl %ebp,%edx movl %ebx,%ecx @@ -1384,16 +1416,16 @@ aesni_xts_encrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L058enc1_loop_10: +.L060enc1_loop_10: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L058enc1_loop_10 + jnz .L060enc1_loop_10 .byte 102,15,56,221,209 xorps %xmm5,%xmm2 movups %xmm2,-16(%edi) 
-.L056xts_enc_ret: +.L058xts_enc_ret: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 @@ -1414,13 +1446,13 @@ aesni_xts_encrypt: popl %ebx popl %ebp ret -.size aesni_xts_encrypt,.-.L_aesni_xts_encrypt_begin -.globl aesni_xts_decrypt -.hidden aesni_xts_decrypt -.type aesni_xts_decrypt,@function +.size aes_hw_xts_encrypt,.-.L_aes_hw_xts_encrypt_begin +.globl aes_hw_xts_decrypt +.hidden aes_hw_xts_decrypt +.type aes_hw_xts_decrypt,@function .align 16 -aesni_xts_decrypt: -.L_aesni_xts_decrypt_begin: +aes_hw_xts_decrypt: +.L_aes_hw_xts_decrypt_begin: pushl %ebp pushl %ebx pushl %esi @@ -1433,12 +1465,12 @@ aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L059enc1_loop_11: +.L061enc1_loop_11: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L059enc1_loop_11 + jnz .L061enc1_loop_11 .byte 102,15,56,221,209 movl 20(%esp),%esi movl 24(%esp),%edi @@ -1467,14 +1499,14 @@ aesni_xts_decrypt: pcmpgtd %xmm1,%xmm0 andl $-16,%eax subl $96,%eax - jc .L060xts_dec_short + jc .L062xts_dec_short shll $4,%ecx movl $16,%ebx subl %ecx,%ebx leal 32(%edx,%ecx,1),%edx - jmp .L061xts_dec_loop6 + jmp .L063xts_dec_loop6 .align 16 -.L061xts_dec_loop6: +.L063xts_dec_loop6: pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,(%esp) @@ -1563,23 +1595,23 @@ aesni_xts_decrypt: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 subl $96,%eax - jnc .L061xts_dec_loop6 + jnc .L063xts_dec_loop6 movl 240(%ebp),%ecx movl %ebp,%edx movl %ecx,%ebx -.L060xts_dec_short: +.L062xts_dec_short: addl $96,%eax - jz .L062xts_dec_done6x + jz .L064xts_dec_done6x movdqa %xmm1,%xmm5 cmpl $32,%eax - jb .L063xts_dec_one + jb .L065xts_dec_one pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 - je .L064xts_dec_two + je .L066xts_dec_two pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm6 @@ -1588,7 +1620,7 @@ aesni_xts_decrypt: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 cmpl $64,%eax - jb .L065xts_dec_three + jb .L067xts_dec_three pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm7 @@ -1598,7 +1630,7 @@ aesni_xts_decrypt: pxor %xmm2,%xmm1 movdqa %xmm5,(%esp) movdqa %xmm6,16(%esp) - je .L066xts_dec_four + je .L068xts_dec_four movdqa %xmm7,32(%esp) pshufd $19,%xmm0,%xmm7 movdqa %xmm1,48(%esp) @@ -1630,9 +1662,9 @@ aesni_xts_decrypt: movups %xmm5,48(%edi) movups %xmm6,64(%edi) leal 80(%edi),%edi - jmp .L067xts_dec_done + jmp .L069xts_dec_done .align 16 -.L063xts_dec_one: +.L065xts_dec_one: movups (%esi),%xmm2 leal 16(%esi),%esi xorps %xmm5,%xmm2 @@ -1640,20 +1672,20 @@ aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L068dec1_loop_12: +.L070dec1_loop_12: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L068dec1_loop_12 + jnz .L070dec1_loop_12 .byte 102,15,56,223,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) leal 16(%edi),%edi movdqa %xmm5,%xmm1 - jmp .L067xts_dec_done + jmp .L069xts_dec_done .align 16 -.L064xts_dec_two: +.L066xts_dec_two: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1667,9 +1699,9 @@ aesni_xts_decrypt: movups %xmm3,16(%edi) leal 32(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L067xts_dec_done + jmp .L069xts_dec_done .align 16 -.L065xts_dec_three: +.L067xts_dec_three: movaps %xmm1,%xmm7 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1687,9 +1719,9 @@ aesni_xts_decrypt: movups %xmm4,32(%edi) leal 48(%edi),%edi movdqa %xmm7,%xmm1 - jmp .L067xts_dec_done + jmp .L069xts_dec_done .align 16 -.L066xts_dec_four: +.L068xts_dec_four: movaps %xmm1,%xmm6 movups 
(%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1711,20 +1743,20 @@ aesni_xts_decrypt: movups %xmm5,48(%edi) leal 64(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L067xts_dec_done + jmp .L069xts_dec_done .align 16 -.L062xts_dec_done6x: +.L064xts_dec_done6x: movl 112(%esp),%eax andl $15,%eax - jz .L069xts_dec_ret + jz .L071xts_dec_ret movl %eax,112(%esp) - jmp .L070xts_dec_only_one_more + jmp .L072xts_dec_only_one_more .align 16 -.L067xts_dec_done: +.L069xts_dec_done: movl 112(%esp),%eax pxor %xmm0,%xmm0 andl $15,%eax - jz .L069xts_dec_ret + jz .L071xts_dec_ret pcmpgtd %xmm1,%xmm0 movl %eax,112(%esp) pshufd $19,%xmm0,%xmm2 @@ -1734,7 +1766,7 @@ aesni_xts_decrypt: pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 -.L070xts_dec_only_one_more: +.L072xts_dec_only_one_more: pshufd $19,%xmm0,%xmm5 movdqa %xmm1,%xmm6 paddq %xmm1,%xmm1 @@ -1748,16 +1780,16 @@ aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L071dec1_loop_13: +.L073dec1_loop_13: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L071dec1_loop_13 + jnz .L073dec1_loop_13 .byte 102,15,56,223,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) -.L072xts_dec_steal: +.L074xts_dec_steal: movzbl 16(%esi),%ecx movzbl (%edi),%edx leal 1(%esi),%esi @@ -1765,7 +1797,7 @@ aesni_xts_decrypt: movb %dl,16(%edi) leal 1(%edi),%edi subl $1,%eax - jnz .L072xts_dec_steal + jnz .L074xts_dec_steal subl 112(%esp),%edi movl %ebp,%edx movl %ebx,%ecx @@ -1775,16 +1807,16 @@ aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L073dec1_loop_14: +.L075dec1_loop_14: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L073dec1_loop_14 + jnz .L075dec1_loop_14 .byte 102,15,56,223,209 xorps %xmm6,%xmm2 movups %xmm2,(%edi) -.L069xts_dec_ret: +.L071xts_dec_ret: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 @@ -1805,13 +1837,13 @@ aesni_xts_decrypt: popl %ebx popl %ebp ret -.size aesni_xts_decrypt,.-.L_aesni_xts_decrypt_begin -.globl aesni_cbc_encrypt -.hidden aesni_cbc_encrypt -.type aesni_cbc_encrypt,@function +.size aes_hw_xts_decrypt,.-.L_aes_hw_xts_decrypt_begin +.globl aes_hw_cbc_encrypt +.hidden aes_hw_cbc_encrypt +.type aes_hw_cbc_encrypt,@function .align 16 -aesni_cbc_encrypt: -.L_aesni_cbc_encrypt_begin: +aes_hw_cbc_encrypt: +.L_aes_hw_cbc_encrypt_begin: pushl %ebp pushl %ebx pushl %esi @@ -1825,7 +1857,7 @@ aesni_cbc_encrypt: movl 32(%esp),%edx movl 36(%esp),%ebp testl %eax,%eax - jz .L074cbc_abort + jz .L076cbc_abort cmpl $0,40(%esp) xchgl %esp,%ebx movups (%ebp),%xmm7 @@ -1833,14 +1865,14 @@ aesni_cbc_encrypt: movl %edx,%ebp movl %ebx,16(%esp) movl %ecx,%ebx - je .L075cbc_decrypt + je .L077cbc_decrypt movaps %xmm7,%xmm2 cmpl $16,%eax - jb .L076cbc_enc_tail + jb .L078cbc_enc_tail subl $16,%eax - jmp .L077cbc_enc_loop + jmp .L079cbc_enc_loop .align 16 -.L077cbc_enc_loop: +.L079cbc_enc_loop: movups (%esi),%xmm7 leal 16(%esi),%esi movups (%edx),%xmm0 @@ -1848,25 +1880,25 @@ aesni_cbc_encrypt: xorps %xmm0,%xmm7 leal 32(%edx),%edx xorps %xmm7,%xmm2 -.L078enc1_loop_15: +.L080enc1_loop_15: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L078enc1_loop_15 + jnz .L080enc1_loop_15 .byte 102,15,56,221,209 movl %ebx,%ecx movl %ebp,%edx movups %xmm2,(%edi) leal 16(%edi),%edi subl $16,%eax - jnc .L077cbc_enc_loop + jnc .L079cbc_enc_loop addl $16,%eax - jnz .L076cbc_enc_tail + jnz .L078cbc_enc_tail movaps %xmm2,%xmm7 pxor %xmm2,%xmm2 - jmp .L079cbc_ret -.L076cbc_enc_tail: + jmp .L081cbc_ret +.L078cbc_enc_tail: movl %eax,%ecx .long 2767451785 movl 
$16,%ecx @@ -1877,20 +1909,20 @@ aesni_cbc_encrypt: movl %ebx,%ecx movl %edi,%esi movl %ebp,%edx - jmp .L077cbc_enc_loop + jmp .L079cbc_enc_loop .align 16 -.L075cbc_decrypt: +.L077cbc_decrypt: cmpl $80,%eax - jbe .L080cbc_dec_tail + jbe .L082cbc_dec_tail movaps %xmm7,(%esp) subl $80,%eax - jmp .L081cbc_dec_loop6_enter + jmp .L083cbc_dec_loop6_enter .align 16 -.L082cbc_dec_loop6: +.L084cbc_dec_loop6: movaps %xmm0,(%esp) movups %xmm7,(%edi) leal 16(%edi),%edi -.L081cbc_dec_loop6_enter: +.L083cbc_dec_loop6_enter: movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -1920,28 +1952,28 @@ aesni_cbc_encrypt: movups %xmm6,64(%edi) leal 80(%edi),%edi subl $96,%eax - ja .L082cbc_dec_loop6 + ja .L084cbc_dec_loop6 movaps %xmm7,%xmm2 movaps %xmm0,%xmm7 addl $80,%eax - jle .L083cbc_dec_clear_tail_collected + jle .L085cbc_dec_clear_tail_collected movups %xmm2,(%edi) leal 16(%edi),%edi -.L080cbc_dec_tail: +.L082cbc_dec_tail: movups (%esi),%xmm2 movaps %xmm2,%xmm6 cmpl $16,%eax - jbe .L084cbc_dec_one + jbe .L086cbc_dec_one movups 16(%esi),%xmm3 movaps %xmm3,%xmm5 cmpl $32,%eax - jbe .L085cbc_dec_two + jbe .L087cbc_dec_two movups 32(%esi),%xmm4 cmpl $48,%eax - jbe .L086cbc_dec_three + jbe .L088cbc_dec_three movups 48(%esi),%xmm5 cmpl $64,%eax - jbe .L087cbc_dec_four + jbe .L089cbc_dec_four movups 64(%esi),%xmm6 movaps %xmm7,(%esp) movups (%esi),%xmm2 @@ -1968,26 +2000,26 @@ aesni_cbc_encrypt: movaps %xmm6,%xmm2 pxor %xmm6,%xmm6 subl $80,%eax - jmp .L088cbc_dec_tail_collected + jmp .L090cbc_dec_tail_collected .align 16 -.L084cbc_dec_one: +.L086cbc_dec_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L089dec1_loop_16: +.L091dec1_loop_16: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L089dec1_loop_16 + jnz .L091dec1_loop_16 .byte 102,15,56,223,209 xorps %xmm7,%xmm2 movaps %xmm6,%xmm7 subl $16,%eax - jmp .L088cbc_dec_tail_collected + jmp .L090cbc_dec_tail_collected .align 16 -.L085cbc_dec_two: +.L087cbc_dec_two: call _aesni_decrypt2 xorps %xmm7,%xmm2 xorps %xmm6,%xmm3 @@ -1997,9 +2029,9 @@ aesni_cbc_encrypt: leal 16(%edi),%edi movaps %xmm5,%xmm7 subl $32,%eax - jmp .L088cbc_dec_tail_collected + jmp .L090cbc_dec_tail_collected .align 16 -.L086cbc_dec_three: +.L088cbc_dec_three: call _aesni_decrypt3 xorps %xmm7,%xmm2 xorps %xmm6,%xmm3 @@ -2012,9 +2044,9 @@ aesni_cbc_encrypt: leal 32(%edi),%edi movups 32(%esi),%xmm7 subl $48,%eax - jmp .L088cbc_dec_tail_collected + jmp .L090cbc_dec_tail_collected .align 16 -.L087cbc_dec_four: +.L089cbc_dec_four: call _aesni_decrypt4 movups 16(%esi),%xmm1 movups 32(%esi),%xmm0 @@ -2032,21 +2064,21 @@ aesni_cbc_encrypt: movaps %xmm5,%xmm2 pxor %xmm5,%xmm5 subl $64,%eax - jmp .L088cbc_dec_tail_collected + jmp .L090cbc_dec_tail_collected .align 16 -.L083cbc_dec_clear_tail_collected: +.L085cbc_dec_clear_tail_collected: pxor %xmm3,%xmm3 pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 pxor %xmm6,%xmm6 -.L088cbc_dec_tail_collected: +.L090cbc_dec_tail_collected: andl $15,%eax - jnz .L090cbc_dec_tail_partial + jnz .L092cbc_dec_tail_partial movups %xmm2,(%edi) pxor %xmm0,%xmm0 - jmp .L079cbc_ret + jmp .L081cbc_ret .align 16 -.L090cbc_dec_tail_partial: +.L092cbc_dec_tail_partial: movaps %xmm2,(%esp) pxor %xmm0,%xmm0 movl $16,%ecx @@ -2054,20 +2086,20 @@ aesni_cbc_encrypt: subl %eax,%ecx .long 2767451785 movdqa %xmm2,(%esp) -.L079cbc_ret: +.L081cbc_ret: movl 16(%esp),%esp movl 36(%esp),%ebp pxor %xmm2,%xmm2 pxor %xmm1,%xmm1 movups %xmm7,(%ebp) pxor %xmm7,%xmm7 -.L074cbc_abort: +.L076cbc_abort: popl %edi popl %esi popl %ebx 
popl %ebp ret -.size aesni_cbc_encrypt,.-.L_aesni_cbc_encrypt_begin +.size aes_hw_cbc_encrypt,.-.L_aes_hw_cbc_encrypt_begin .hidden _aesni_set_encrypt_key .type _aesni_set_encrypt_key,@function .align 16 @@ -2075,13 +2107,13 @@ _aesni_set_encrypt_key: pushl %ebp pushl %ebx testl %eax,%eax - jz .L091bad_pointer + jz .L093bad_pointer testl %edx,%edx - jz .L091bad_pointer - call .L092pic -.L092pic: + jz .L093bad_pointer + call .L094pic +.L094pic: popl %ebx - leal .Lkey_const-.L092pic(%ebx),%ebx + leal .Lkey_const-.L094pic(%ebx),%ebx leal OPENSSL_ia32cap_P-.Lkey_const(%ebx),%ebp movups (%eax),%xmm0 xorps %xmm4,%xmm4 @@ -2089,45 +2121,45 @@ _aesni_set_encrypt_key: leal 16(%edx),%edx andl $268437504,%ebp cmpl $256,%ecx - je .L09314rounds + je .L09514rounds cmpl $192,%ecx - je .L09412rounds + je .L09612rounds cmpl $128,%ecx - jne .L095bad_keybits + jne .L097bad_keybits .align 16 -.L09610rounds: +.L09810rounds: cmpl $268435456,%ebp - je .L09710rounds_alt + je .L09910rounds_alt movl $9,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,200,1 - call .L098key_128_cold + call .L100key_128_cold .byte 102,15,58,223,200,2 - call .L099key_128 + call .L101key_128 .byte 102,15,58,223,200,4 - call .L099key_128 + call .L101key_128 .byte 102,15,58,223,200,8 - call .L099key_128 + call .L101key_128 .byte 102,15,58,223,200,16 - call .L099key_128 + call .L101key_128 .byte 102,15,58,223,200,32 - call .L099key_128 + call .L101key_128 .byte 102,15,58,223,200,64 - call .L099key_128 + call .L101key_128 .byte 102,15,58,223,200,128 - call .L099key_128 + call .L101key_128 .byte 102,15,58,223,200,27 - call .L099key_128 + call .L101key_128 .byte 102,15,58,223,200,54 - call .L099key_128 + call .L101key_128 movups %xmm0,(%edx) movl %ecx,80(%edx) - jmp .L100good_key + jmp .L102good_key .align 16 -.L099key_128: +.L101key_128: movups %xmm0,(%edx) leal 16(%edx),%edx -.L098key_128_cold: +.L100key_128_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -2136,13 +2168,13 @@ _aesni_set_encrypt_key: xorps %xmm1,%xmm0 ret .align 16 -.L09710rounds_alt: +.L09910rounds_alt: movdqa (%ebx),%xmm5 movl $8,%ecx movdqa 32(%ebx),%xmm4 movdqa %xmm0,%xmm2 movdqu %xmm0,-16(%edx) -.L101loop_key128: +.L103loop_key128: .byte 102,15,56,0,197 .byte 102,15,56,221,196 pslld $1,%xmm4 @@ -2158,7 +2190,7 @@ _aesni_set_encrypt_key: movdqu %xmm0,-16(%edx) movdqa %xmm0,%xmm2 decl %ecx - jnz .L101loop_key128 + jnz .L103loop_key128 movdqa 48(%ebx),%xmm4 .byte 102,15,56,0,197 .byte 102,15,56,221,196 @@ -2186,41 +2218,41 @@ _aesni_set_encrypt_key: movdqu %xmm0,16(%edx) movl $9,%ecx movl %ecx,96(%edx) - jmp .L100good_key + jmp .L102good_key .align 16 -.L09412rounds: +.L09612rounds: movq 16(%eax),%xmm2 cmpl $268435456,%ebp - je .L10212rounds_alt + je .L10412rounds_alt movl $11,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,202,1 - call .L103key_192a_cold + call .L105key_192a_cold .byte 102,15,58,223,202,2 - call .L104key_192b + call .L106key_192b .byte 102,15,58,223,202,4 - call .L105key_192a + call .L107key_192a .byte 102,15,58,223,202,8 - call .L104key_192b + call .L106key_192b .byte 102,15,58,223,202,16 - call .L105key_192a + call .L107key_192a .byte 102,15,58,223,202,32 - call .L104key_192b + call .L106key_192b .byte 102,15,58,223,202,64 - call .L105key_192a + call .L107key_192a .byte 102,15,58,223,202,128 - call .L104key_192b + call .L106key_192b movups %xmm0,(%edx) movl %ecx,48(%edx) - jmp .L100good_key + jmp .L102good_key .align 16 -.L105key_192a: +.L107key_192a: movups %xmm0,(%edx) leal 16(%edx),%edx .align 16 -.L103key_192a_cold: 
+.L105key_192a_cold: movaps %xmm2,%xmm5 -.L106key_192b_warm: +.L108key_192b_warm: shufps $16,%xmm0,%xmm4 movdqa %xmm2,%xmm3 xorps %xmm4,%xmm0 @@ -2234,21 +2266,21 @@ _aesni_set_encrypt_key: pxor %xmm3,%xmm2 ret .align 16 -.L104key_192b: +.L106key_192b: movaps %xmm0,%xmm3 shufps $68,%xmm0,%xmm5 movups %xmm5,(%edx) shufps $78,%xmm2,%xmm3 movups %xmm3,16(%edx) leal 32(%edx),%edx - jmp .L106key_192b_warm + jmp .L108key_192b_warm .align 16 -.L10212rounds_alt: +.L10412rounds_alt: movdqa 16(%ebx),%xmm5 movdqa 32(%ebx),%xmm4 movl $8,%ecx movdqu %xmm0,-16(%edx) -.L107loop_key192: +.L109loop_key192: movq %xmm2,(%edx) movdqa %xmm2,%xmm1 .byte 102,15,56,0,213 @@ -2270,54 +2302,54 @@ _aesni_set_encrypt_key: pxor %xmm3,%xmm2 movdqu %xmm0,-16(%edx) decl %ecx - jnz .L107loop_key192 + jnz .L109loop_key192 movl $11,%ecx movl %ecx,32(%edx) - jmp .L100good_key + jmp .L102good_key .align 16 -.L09314rounds: +.L09514rounds: movups 16(%eax),%xmm2 leal 16(%edx),%edx cmpl $268435456,%ebp - je .L10814rounds_alt + je .L11014rounds_alt movl $13,%ecx movups %xmm0,-32(%edx) movups %xmm2,-16(%edx) .byte 102,15,58,223,202,1 - call .L109key_256a_cold + call .L111key_256a_cold .byte 102,15,58,223,200,1 - call .L110key_256b + call .L112key_256b .byte 102,15,58,223,202,2 - call .L111key_256a + call .L113key_256a .byte 102,15,58,223,200,2 - call .L110key_256b + call .L112key_256b .byte 102,15,58,223,202,4 - call .L111key_256a + call .L113key_256a .byte 102,15,58,223,200,4 - call .L110key_256b + call .L112key_256b .byte 102,15,58,223,202,8 - call .L111key_256a + call .L113key_256a .byte 102,15,58,223,200,8 - call .L110key_256b + call .L112key_256b .byte 102,15,58,223,202,16 - call .L111key_256a + call .L113key_256a .byte 102,15,58,223,200,16 - call .L110key_256b + call .L112key_256b .byte 102,15,58,223,202,32 - call .L111key_256a + call .L113key_256a .byte 102,15,58,223,200,32 - call .L110key_256b + call .L112key_256b .byte 102,15,58,223,202,64 - call .L111key_256a + call .L113key_256a movups %xmm0,(%edx) movl %ecx,16(%edx) xorl %eax,%eax - jmp .L100good_key + jmp .L102good_key .align 16 -.L111key_256a: +.L113key_256a: movups %xmm2,(%edx) leal 16(%edx),%edx -.L109key_256a_cold: +.L111key_256a_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -2326,7 +2358,7 @@ _aesni_set_encrypt_key: xorps %xmm1,%xmm0 ret .align 16 -.L110key_256b: +.L112key_256b: movups %xmm0,(%edx) leal 16(%edx),%edx shufps $16,%xmm2,%xmm4 @@ -2337,14 +2369,14 @@ _aesni_set_encrypt_key: xorps %xmm1,%xmm2 ret .align 16 -.L10814rounds_alt: +.L11014rounds_alt: movdqa (%ebx),%xmm5 movdqa 32(%ebx),%xmm4 movl $7,%ecx movdqu %xmm0,-32(%edx) movdqa %xmm2,%xmm1 movdqu %xmm2,-16(%edx) -.L112loop_key256: +.L114loop_key256: .byte 102,15,56,0,213 .byte 102,15,56,221,212 movdqa %xmm0,%xmm3 @@ -2358,7 +2390,7 @@ _aesni_set_encrypt_key: pxor %xmm2,%xmm0 movdqu %xmm0,(%edx) decl %ecx - jz .L113done_key256 + jz .L115done_key256 pshufd $255,%xmm0,%xmm2 pxor %xmm3,%xmm3 .byte 102,15,56,221,211 @@ -2373,11 +2405,11 @@ _aesni_set_encrypt_key: movdqu %xmm2,16(%edx) leal 32(%edx),%edx movdqa %xmm2,%xmm1 - jmp .L112loop_key256 -.L113done_key256: + jmp .L114loop_key256 +.L115done_key256: movl $13,%ecx movl %ecx,16(%edx) -.L100good_key: +.L102good_key: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 @@ -2389,37 +2421,49 @@ _aesni_set_encrypt_key: popl %ebp ret .align 4 -.L091bad_pointer: +.L093bad_pointer: movl $-1,%eax popl %ebx popl %ebp ret .align 4 -.L095bad_keybits: +.L097bad_keybits: pxor %xmm0,%xmm0 movl $-2,%eax popl %ebx popl %ebp ret .size 
_aesni_set_encrypt_key,.-_aesni_set_encrypt_key -.globl aesni_set_encrypt_key -.hidden aesni_set_encrypt_key -.type aesni_set_encrypt_key,@function +.globl aes_hw_set_encrypt_key +.hidden aes_hw_set_encrypt_key +.type aes_hw_set_encrypt_key,@function .align 16 -aesni_set_encrypt_key: -.L_aesni_set_encrypt_key_begin: +aes_hw_set_encrypt_key: +.L_aes_hw_set_encrypt_key_begin: +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call .L116pic +.L116pic: + popl %ebx + leal BORINGSSL_function_hit+3-.L116pic(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif movl 4(%esp),%eax movl 8(%esp),%ecx movl 12(%esp),%edx call _aesni_set_encrypt_key ret -.size aesni_set_encrypt_key,.-.L_aesni_set_encrypt_key_begin -.globl aesni_set_decrypt_key -.hidden aesni_set_decrypt_key -.type aesni_set_decrypt_key,@function +.size aes_hw_set_encrypt_key,.-.L_aes_hw_set_encrypt_key_begin +.globl aes_hw_set_decrypt_key +.hidden aes_hw_set_decrypt_key +.type aes_hw_set_decrypt_key,@function .align 16 -aesni_set_decrypt_key: -.L_aesni_set_decrypt_key_begin: +aes_hw_set_decrypt_key: +.L_aes_hw_set_decrypt_key_begin: movl 4(%esp),%eax movl 8(%esp),%ecx movl 12(%esp),%edx @@ -2427,7 +2471,7 @@ aesni_set_decrypt_key: movl 12(%esp),%edx shll $4,%ecx testl %eax,%eax - jnz .L114dec_key_ret + jnz .L117dec_key_ret leal 16(%edx,%ecx,1),%eax movups (%edx),%xmm0 movups (%eax),%xmm1 @@ -2435,7 +2479,7 @@ aesni_set_decrypt_key: movups %xmm1,(%edx) leal 16(%edx),%edx leal -16(%eax),%eax -.L115dec_key_inverse: +.L118dec_key_inverse: movups (%edx),%xmm0 movups (%eax),%xmm1 .byte 102,15,56,219,192 @@ -2445,16 +2489,16 @@ aesni_set_decrypt_key: movups %xmm0,16(%eax) movups %xmm1,-16(%edx) cmpl %edx,%eax - ja .L115dec_key_inverse + ja .L118dec_key_inverse movups (%edx),%xmm0 .byte 102,15,56,219,192 movups %xmm0,(%edx) pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 xorl %eax,%eax -.L114dec_key_ret: +.L117dec_key_ret: ret -.size aesni_set_decrypt_key,.-.L_aesni_set_decrypt_key_begin +.size aes_hw_set_decrypt_key,.-.L_aes_hw_set_decrypt_key_begin .align 64 .Lkey_const: .long 202313229,202313229,202313229,202313229 @@ -2466,3 +2510,4 @@ aesni_set_decrypt_key: .byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 .byte 115,108,46,111,114,103,62,0 #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/bn-586.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/bn-586.S index cc067f717e..4a6ccfbfac 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/bn-586.S +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/bn-586.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl bn_mul_add_words .hidden bn_mul_add_words @@ -1535,3 +1541,4 @@ bn_sub_part_words: ret .size bn_sub_part_words,.-.L_bn_sub_part_words_begin #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/co-586.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/co-586.S index 56834d0a6f..837b0cb5c7 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/co-586.S +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/co-586.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl bn_mul_comba8 .hidden bn_mul_comba8 @@ -1257,3 +1263,4 @@ bn_sqr_comba4: ret .size bn_sqr_comba4,.-.L_bn_sqr_comba4_begin #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/ghash-ssse3-x86.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/ghash-ssse3-x86.S new file mode 100644 index 0000000000..3e5f2d7e54 --- /dev/null +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/ghash-ssse3-x86.S @@ -0,0 +1,294 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text +.globl gcm_gmult_ssse3 +.hidden gcm_gmult_ssse3 +.type gcm_gmult_ssse3,@function +.align 16 +gcm_gmult_ssse3: +.L_gcm_gmult_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%edi + movl 24(%esp),%esi + movdqu (%edi),%xmm0 + call .L000pic_point +.L000pic_point: + popl %eax + movdqa .Lreverse_bytes-.L000pic_point(%eax),%xmm7 + movdqa .Llow4_mask-.L000pic_point(%eax),%xmm2 +.byte 102,15,56,0,199 + movdqa %xmm2,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm0 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + movl $5,%eax +.L001loop_row_1: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz .L001loop_row_1 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movl $5,%eax +.L002loop_row_2: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz .L002loop_row_2 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movl $6,%eax +.L003loop_row_3: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz .L003loop_row_3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 +.byte 102,15,56,0,215 + movdqu %xmm2,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size gcm_gmult_ssse3,.-.L_gcm_gmult_ssse3_begin +.globl gcm_ghash_ssse3 +.hidden gcm_ghash_ssse3 +.type gcm_ghash_ssse3,@function +.align 16 +gcm_ghash_ssse3: +.L_gcm_ghash_ssse3_begin: + 
pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%edx + movl 32(%esp),%ecx + movdqu (%edi),%xmm0 + call .L004pic_point +.L004pic_point: + popl %ebx + movdqa .Lreverse_bytes-.L004pic_point(%ebx),%xmm7 + andl $-16,%ecx +.byte 102,15,56,0,199 + pxor %xmm3,%xmm3 +.L005loop_ghash: + movdqa .Llow4_mask-.L004pic_point(%ebx),%xmm2 + movdqu (%edx),%xmm1 +.byte 102,15,56,0,207 + pxor %xmm1,%xmm0 + movdqa %xmm2,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm0 + pxor %xmm2,%xmm2 + movl $5,%eax +.L006loop_row_4: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz .L006loop_row_4 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movl $5,%eax +.L007loop_row_5: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz .L007loop_row_5 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movl $6,%eax +.L008loop_row_6: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz .L008loop_row_6 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movdqa %xmm2,%xmm0 + leal -256(%esi),%esi + leal 16(%edx),%edx + subl $16,%ecx + jnz .L005loop_ghash +.byte 102,15,56,0,199 + movdqu %xmm0,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size gcm_ghash_ssse3,.-.L_gcm_ghash_ssse3_begin +.align 16 +.Lreverse_bytes: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.align 16 +.Llow4_mask: +.long 252645135,252645135,252645135,252645135 +#endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/ghash-x86.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/ghash-x86.S index a384d9a039..7016235c0a 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/ghash-x86.S +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/ghash-x86.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl gcm_gmult_4bit_mmx .hidden gcm_gmult_4bit_mmx @@ -1066,3 +1072,4 @@ gcm_ghash_clmul: .byte 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62 .byte 0 #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/md5-586.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/md5-586.S index 7237f95bec..6de8ff886a 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/md5-586.S +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/md5-586.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl md5_block_asm_data_order .hidden md5_block_asm_data_order @@ -679,3 +685,4 @@ md5_block_asm_data_order: ret .size md5_block_asm_data_order,.-.L_md5_block_asm_data_order_begin #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/sha1-586.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/sha1-586.S index 2c022ec4af..4449e38f72 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/sha1-586.S +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/sha1-586.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl sha1_block_data_order .hidden sha1_block_data_order @@ -3799,3 +3805,4 @@ _sha1_block_data_order_avx: .byte 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 .byte 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/sha256-586.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/sha256-586.S index 984758f3b2..f61fa3df72 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/sha256-586.S +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/sha256-586.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl sha256_block_data_order .hidden sha256_block_data_order @@ -5558,3 +5564,4 @@ sha256_block_data_order: ret .size sha256_block_data_order,.-.L_sha256_block_data_order_begin #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/sha512-586.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/sha512-586.S index 3617ce48b4..89fb50b4ca 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/sha512-586.S +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/sha512-586.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl sha512_block_data_order .hidden sha512_block_data_order @@ -2828,3 +2834,4 @@ sha512_block_data_order: .byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 .byte 62,0 #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/vpaes-x86.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/vpaes-x86.S index 0417b7e353..8807116950 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/vpaes-x86.S +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/vpaes-x86.S @@ -1,5 +1,13 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text +#ifdef BORINGSSL_DISPATCH_TEST +#endif .align 64 .L_vpaes_consts: .long 218628480,235210255,168496130,67568393 @@ -477,6 +485,18 @@ vpaes_set_encrypt_key: pushl %ebx pushl %esi pushl %edi +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call .L016pic +.L016pic: + popl %ebx + leal BORINGSSL_function_hit+5-.L016pic(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif movl 20(%esp),%esi leal -56(%esp),%ebx movl 24(%esp),%eax @@ -490,9 +510,9 @@ vpaes_set_encrypt_key: movl %ebx,240(%edx) movl $48,%ecx movl $0,%edi - leal .L_vpaes_consts+0x30-.L016pic_point,%ebp + leal .L_vpaes_consts+0x30-.L017pic_point,%ebp call _vpaes_schedule_core -.L016pic_point: +.L017pic_point: movl 48(%esp),%esp xorl %eax,%eax popl %edi @@ -529,9 +549,9 @@ vpaes_set_decrypt_key: shrl $1,%ecx andl $32,%ecx xorl $32,%ecx - leal .L_vpaes_consts+0x30-.L017pic_point,%ebp + leal .L_vpaes_consts+0x30-.L018pic_point,%ebp call _vpaes_schedule_core -.L017pic_point: +.L018pic_point: movl 48(%esp),%esp xorl %eax,%eax popl %edi @@ -550,9 +570,21 @@ vpaes_encrypt: pushl %ebx pushl %esi pushl %edi - leal .L_vpaes_consts+0x30-.L018pic_point,%ebp +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call .L019pic +.L019pic: + popl %ebx + leal BORINGSSL_function_hit+4-.L019pic(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + leal .L_vpaes_consts+0x30-.L020pic_point,%ebp call _vpaes_preheat -.L018pic_point: +.L020pic_point: movl 20(%esp),%esi leal -56(%esp),%ebx movl 24(%esp),%edi @@ -580,9 +612,9 @@ vpaes_decrypt: pushl %ebx pushl %esi pushl %edi - leal .L_vpaes_consts+0x30-.L019pic_point,%ebp + leal .L_vpaes_consts+0x30-.L021pic_point,%ebp call _vpaes_preheat -.L019pic_point: +.L021pic_point: movl 20(%esp),%esi leal -56(%esp),%ebx movl 24(%esp),%edi @@ -615,7 +647,7 @@ vpaes_cbc_encrypt: movl 28(%esp),%eax movl 32(%esp),%edx subl $16,%eax - jc .L020cbc_abort + jc .L022cbc_abort leal -56(%esp),%ebx movl 36(%esp),%ebp andl $-16,%ebx @@ -628,14 +660,14 @@ vpaes_cbc_encrypt: movl %edx,4(%esp) movl %ebp,8(%esp) movl %eax,%edi - leal .L_vpaes_consts+0x30-.L021pic_point,%ebp + leal .L_vpaes_consts+0x30-.L023pic_point,%ebp call _vpaes_preheat -.L021pic_point: +.L023pic_point: cmpl $0,%ecx - je .L022cbc_dec_loop - jmp .L023cbc_enc_loop + je .L024cbc_dec_loop + jmp .L025cbc_enc_loop .align 16 -.L023cbc_enc_loop: +.L025cbc_enc_loop: movdqu (%esi),%xmm0 pxor %xmm1,%xmm0 call _vpaes_encrypt_core @@ -645,10 +677,10 @@ vpaes_cbc_encrypt: movdqu %xmm0,(%ebx,%esi,1) leal 16(%esi),%esi subl $16,%edi - jnc .L023cbc_enc_loop - jmp .L024cbc_done + jnc .L025cbc_enc_loop + jmp .L026cbc_done .align 16 -.L022cbc_dec_loop: 
+.L024cbc_dec_loop: movdqu (%esi),%xmm0 movdqa %xmm1,16(%esp) movdqa %xmm0,32(%esp) @@ -660,12 +692,12 @@ vpaes_cbc_encrypt: movdqu %xmm0,(%ebx,%esi,1) leal 16(%esi),%esi subl $16,%edi - jnc .L022cbc_dec_loop -.L024cbc_done: + jnc .L024cbc_dec_loop +.L026cbc_done: movl 8(%esp),%ebx movl 48(%esp),%esp movdqu %xmm1,(%ebx) -.L020cbc_abort: +.L022cbc_abort: popl %edi popl %esi popl %ebx @@ -673,3 +705,4 @@ vpaes_cbc_encrypt: ret .size vpaes_cbc_encrypt,.-.L_vpaes_cbc_encrypt_begin #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/x86-mont.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/x86-mont.S index 3fb668826b..f2c6fde7c6 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/x86-mont.S +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/x86-mont.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl bn_mul_mont .hidden bn_mul_mont @@ -446,16 +452,18 @@ bn_mul_mont: leal 1(%edx),%edx jge .L017sub sbbl $0,%eax - andl %eax,%esi - notl %eax - movl %edi,%ebp - andl %eax,%ebp - orl %ebp,%esi + movl $-1,%edx + xorl %eax,%edx + jmp .L018copy .align 16 .L018copy: - movl (%esi,%ebx,4),%eax - movl %eax,(%edi,%ebx,4) + movl 32(%esp,%ebx,4),%esi + movl (%edi,%ebx,4),%ebp movl %ecx,32(%esp,%ebx,4) + andl %eax,%esi + andl %edx,%ebp + orl %esi,%ebp + movl %ebp,(%edi,%ebx,4) decl %ebx jge .L018copy movl 24(%esp),%esp @@ -473,3 +481,4 @@ bn_mul_mont: .byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 .byte 111,114,103,62,0 #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86/crypto/test/trampoline-x86.S b/packager/third_party/boringssl/linux-x86/crypto/test/trampoline-x86.S new file mode 100644 index 0000000000..13eb677c97 --- /dev/null +++ b/packager/third_party/boringssl/linux-x86/crypto/test/trampoline-x86.S @@ -0,0 +1,206 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text +.globl abi_test_trampoline +.hidden abi_test_trampoline +.type abi_test_trampoline,@function +.align 16 +abi_test_trampoline: +.L_abi_test_trampoline_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 24(%esp),%ecx + movl (%ecx),%esi + movl 4(%ecx),%edi + movl 8(%ecx),%ebx + movl 12(%ecx),%ebp + subl $44,%esp + movl 72(%esp),%eax + xorl %ecx,%ecx +.L000loop: + cmpl 76(%esp),%ecx + jae .L001loop_done + movl (%eax,%ecx,4),%edx + movl %edx,(%esp,%ecx,4) + addl $1,%ecx + jmp .L000loop +.L001loop_done: + call *64(%esp) + addl $44,%esp + movl 24(%esp),%ecx + movl %esi,(%ecx) + movl %edi,4(%ecx) + movl %ebx,8(%ecx) + movl %ebp,12(%ecx) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size abi_test_trampoline,.-.L_abi_test_trampoline_begin +.globl abi_test_get_and_clear_direction_flag +.hidden abi_test_get_and_clear_direction_flag +.type abi_test_get_and_clear_direction_flag,@function +.align 16 +abi_test_get_and_clear_direction_flag: +.L_abi_test_get_and_clear_direction_flag_begin: + pushfl + popl %eax + andl $1024,%eax + shrl $10,%eax + cld + ret +.size abi_test_get_and_clear_direction_flag,.-.L_abi_test_get_and_clear_direction_flag_begin +.globl abi_test_set_direction_flag +.hidden abi_test_set_direction_flag +.type abi_test_set_direction_flag,@function +.align 16 +abi_test_set_direction_flag: +.L_abi_test_set_direction_flag_begin: + std + ret +.size abi_test_set_direction_flag,.-.L_abi_test_set_direction_flag_begin +.globl abi_test_clobber_eax +.hidden abi_test_clobber_eax +.type abi_test_clobber_eax,@function +.align 16 +abi_test_clobber_eax: +.L_abi_test_clobber_eax_begin: + xorl %eax,%eax + ret +.size abi_test_clobber_eax,.-.L_abi_test_clobber_eax_begin +.globl abi_test_clobber_ebx +.hidden abi_test_clobber_ebx +.type abi_test_clobber_ebx,@function +.align 16 +abi_test_clobber_ebx: +.L_abi_test_clobber_ebx_begin: + xorl %ebx,%ebx + ret +.size abi_test_clobber_ebx,.-.L_abi_test_clobber_ebx_begin +.globl abi_test_clobber_ecx +.hidden abi_test_clobber_ecx +.type abi_test_clobber_ecx,@function +.align 16 +abi_test_clobber_ecx: +.L_abi_test_clobber_ecx_begin: + xorl %ecx,%ecx + ret +.size abi_test_clobber_ecx,.-.L_abi_test_clobber_ecx_begin +.globl abi_test_clobber_edx +.hidden abi_test_clobber_edx +.type abi_test_clobber_edx,@function +.align 16 +abi_test_clobber_edx: +.L_abi_test_clobber_edx_begin: + xorl %edx,%edx + ret +.size abi_test_clobber_edx,.-.L_abi_test_clobber_edx_begin +.globl abi_test_clobber_edi +.hidden abi_test_clobber_edi +.type abi_test_clobber_edi,@function +.align 16 +abi_test_clobber_edi: +.L_abi_test_clobber_edi_begin: + xorl %edi,%edi + ret +.size abi_test_clobber_edi,.-.L_abi_test_clobber_edi_begin +.globl abi_test_clobber_esi +.hidden abi_test_clobber_esi +.type abi_test_clobber_esi,@function +.align 16 +abi_test_clobber_esi: +.L_abi_test_clobber_esi_begin: + xorl %esi,%esi + ret +.size abi_test_clobber_esi,.-.L_abi_test_clobber_esi_begin +.globl abi_test_clobber_ebp +.hidden abi_test_clobber_ebp +.type abi_test_clobber_ebp,@function +.align 16 +abi_test_clobber_ebp: +.L_abi_test_clobber_ebp_begin: + xorl %ebp,%ebp + ret +.size abi_test_clobber_ebp,.-.L_abi_test_clobber_ebp_begin +.globl abi_test_clobber_xmm0 +.hidden abi_test_clobber_xmm0 +.type abi_test_clobber_xmm0,@function +.align 16 +abi_test_clobber_xmm0: +.L_abi_test_clobber_xmm0_begin: + pxor %xmm0,%xmm0 + ret +.size abi_test_clobber_xmm0,.-.L_abi_test_clobber_xmm0_begin +.globl abi_test_clobber_xmm1 +.hidden 
abi_test_clobber_xmm1 +.type abi_test_clobber_xmm1,@function +.align 16 +abi_test_clobber_xmm1: +.L_abi_test_clobber_xmm1_begin: + pxor %xmm1,%xmm1 + ret +.size abi_test_clobber_xmm1,.-.L_abi_test_clobber_xmm1_begin +.globl abi_test_clobber_xmm2 +.hidden abi_test_clobber_xmm2 +.type abi_test_clobber_xmm2,@function +.align 16 +abi_test_clobber_xmm2: +.L_abi_test_clobber_xmm2_begin: + pxor %xmm2,%xmm2 + ret +.size abi_test_clobber_xmm2,.-.L_abi_test_clobber_xmm2_begin +.globl abi_test_clobber_xmm3 +.hidden abi_test_clobber_xmm3 +.type abi_test_clobber_xmm3,@function +.align 16 +abi_test_clobber_xmm3: +.L_abi_test_clobber_xmm3_begin: + pxor %xmm3,%xmm3 + ret +.size abi_test_clobber_xmm3,.-.L_abi_test_clobber_xmm3_begin +.globl abi_test_clobber_xmm4 +.hidden abi_test_clobber_xmm4 +.type abi_test_clobber_xmm4,@function +.align 16 +abi_test_clobber_xmm4: +.L_abi_test_clobber_xmm4_begin: + pxor %xmm4,%xmm4 + ret +.size abi_test_clobber_xmm4,.-.L_abi_test_clobber_xmm4_begin +.globl abi_test_clobber_xmm5 +.hidden abi_test_clobber_xmm5 +.type abi_test_clobber_xmm5,@function +.align 16 +abi_test_clobber_xmm5: +.L_abi_test_clobber_xmm5_begin: + pxor %xmm5,%xmm5 + ret +.size abi_test_clobber_xmm5,.-.L_abi_test_clobber_xmm5_begin +.globl abi_test_clobber_xmm6 +.hidden abi_test_clobber_xmm6 +.type abi_test_clobber_xmm6,@function +.align 16 +abi_test_clobber_xmm6: +.L_abi_test_clobber_xmm6_begin: + pxor %xmm6,%xmm6 + ret +.size abi_test_clobber_xmm6,.-.L_abi_test_clobber_xmm6_begin +.globl abi_test_clobber_xmm7 +.hidden abi_test_clobber_xmm7 +.type abi_test_clobber_xmm7,@function +.align 16 +abi_test_clobber_xmm7: +.L_abi_test_clobber_xmm7_begin: + pxor %xmm7,%xmm7 + ret +.size abi_test_clobber_xmm7,.-.L_abi_test_clobber_xmm7_begin +#endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/chacha/chacha-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/chacha/chacha-x86_64.S index 62dc77999a..b76713398d 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/chacha/chacha-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/chacha/chacha-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .extern OPENSSL_ia32cap_P @@ -38,6 +50,7 @@ .type ChaCha20_ctr32,@function .align 64 ChaCha20_ctr32: +.cfi_startproc cmpq $0,%rdx je .Lno_data movq OPENSSL_ia32cap_P+4(%rip),%r10 @@ -45,12 +58,25 @@ ChaCha20_ctr32: jnz .LChaCha20_ssse3 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset r15,-56 subq $64+24,%rsp +.cfi_adjust_cfa_offset 88 .Lctr32_body: @@ -291,20 +317,30 @@ ChaCha20_ctr32: .Ldone: leaq 64+24+48(%rsp),%rsi movq -48(%rsi),%r15 +.cfi_restore r15 movq -40(%rsi),%r14 +.cfi_restore r14 movq -32(%rsi),%r13 +.cfi_restore r13 movq -24(%rsi),%r12 +.cfi_restore r12 movq -16(%rsi),%rbp +.cfi_restore rbp movq -8(%rsi),%rbx +.cfi_restore rbx leaq (%rsi),%rsp +.cfi_adjust_cfa_offset -136 .Lno_data: .byte 0xf3,0xc3 +.cfi_endproc .size ChaCha20_ctr32,.-ChaCha20_ctr32 .type ChaCha20_ssse3,@function .align 32 ChaCha20_ssse3: .LChaCha20_ssse3: +.cfi_startproc movq %rsp,%r9 +.cfi_def_cfa_register r9 cmpq $128,%rdx ja .LChaCha20_4x @@ -430,14 +466,18 @@ ChaCha20_ssse3: .Ldone_ssse3: leaq (%r9),%rsp +.cfi_def_cfa_register rsp .Lssse3_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ChaCha20_ssse3,.-ChaCha20_ssse3 .type ChaCha20_4x,@function .align 32 ChaCha20_4x: .LChaCha20_4x: +.cfi_startproc movq %rsp,%r9 +.cfi_def_cfa_register r9 movq %r10,%r11 shrq $32,%r10 testq $32,%r10 @@ -978,14 +1018,18 @@ ChaCha20_4x: .Ldone4x: leaq (%r9),%rsp +.cfi_def_cfa_register rsp .L4x_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ChaCha20_4x,.-ChaCha20_4x .type ChaCha20_8x,@function .align 32 ChaCha20_8x: .LChaCha20_8x: +.cfi_startproc movq %rsp,%r9 +.cfi_def_cfa_register r9 subq $0x280+8,%rsp andq $-32,%rsp vzeroupper @@ -1580,7 +1624,10 @@ ChaCha20_8x: .Ldone8x: vzeroall leaq (%r9),%rsp +.cfi_def_cfa_register rsp .L8x_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ChaCha20_8x,.-ChaCha20_8x #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S index 42e25f4817..a22bee8fcf 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .data .align 16 @@ -3064,3 +3076,4 @@ aes256gcmsiv_kdf: .cfi_endproc .size aes256gcmsiv_kdf, .-aes256gcmsiv_kdf #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S index a6f5e07d9c..e313348808 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .extern OPENSSL_ia32cap_P .hidden OPENSSL_ia32cap_P @@ -8972,3 +8984,4 @@ seal_avx2_short_tail: jmp seal_sse_tail_16 .cfi_endproc #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aes-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aes-x86_64.S index ff87f9824e..47a69ec862 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aes-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aes-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .type _x86_64_AES_encrypt,@function .align 16 @@ -156,6 +168,7 @@ _x86_64_AES_encrypt: .type _x86_64_AES_encrypt_compact,@function .align 16 _x86_64_AES_encrypt_compact: +.cfi_startproc leaq 128(%r14),%r8 movl 0-128(%r8),%edi movl 32-128(%r8),%ebp @@ -325,20 +338,29 @@ _x86_64_AES_encrypt_compact: xorl 8(%r15),%ecx xorl 12(%r15),%edx .byte 0xf3,0xc3 +.cfi_endproc .size _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact .align 16 -.globl asm_AES_encrypt -.hidden asm_AES_encrypt -.type asm_AES_encrypt,@function -.hidden asm_AES_encrypt -asm_AES_encrypt: +.globl aes_nohw_encrypt +.hidden aes_nohw_encrypt +.type aes_nohw_encrypt,@function +.hidden aes_nohw_encrypt +aes_nohw_encrypt: +.cfi_startproc movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 leaq -63(%rdx),%rcx @@ -351,6 +373,7 @@ asm_AES_encrypt: movq %rsi,16(%rsp) movq %rax,24(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x18,0x06,0x23,0x08 .Lenc_prologue: movq %rdx,%r15 @@ -377,21 +400,30 @@ asm_AES_encrypt: movq 16(%rsp),%r9 movq 24(%rsp),%rsi +.cfi_def_cfa %rsi,8 movl %eax,0(%r9) movl %ebx,4(%r9) movl %ecx,8(%r9) movl %edx,12(%r9) movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lenc_epilogue: .byte 0xf3,0xc3 -.size asm_AES_encrypt,.-asm_AES_encrypt +.cfi_endproc +.size aes_nohw_encrypt,.-aes_nohw_encrypt .type _x86_64_AES_decrypt,@function .align 16 _x86_64_AES_decrypt: @@ -550,6 +582,7 @@ _x86_64_AES_decrypt: .type _x86_64_AES_decrypt_compact,@function .align 16 _x86_64_AES_decrypt_compact: +.cfi_startproc leaq 128(%r14),%r8 movl 0-128(%r8),%edi movl 32-128(%r8),%ebp @@ -771,20 +804,29 @@ _x86_64_AES_decrypt_compact: xorl 8(%r15),%ecx xorl 12(%r15),%edx .byte 0xf3,0xc3 +.cfi_endproc .size _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact .align 16 -.globl asm_AES_decrypt -.hidden asm_AES_decrypt -.type asm_AES_decrypt,@function -.hidden asm_AES_decrypt -asm_AES_decrypt: +.globl aes_nohw_decrypt +.hidden aes_nohw_decrypt +.type aes_nohw_decrypt,@function +.hidden aes_nohw_decrypt +aes_nohw_decrypt: +.cfi_startproc movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 leaq -63(%rdx),%rcx @@ -797,6 +839,7 @@ asm_AES_decrypt: movq %rsi,16(%rsp) movq %rax,24(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x18,0x06,0x23,0x08 .Ldec_prologue: movq %rdx,%r15 @@ -825,47 +868,75 @@ asm_AES_decrypt: movq 16(%rsp),%r9 movq 24(%rsp),%rsi +.cfi_def_cfa %rsi,8 movl %eax,0(%r9) movl %ebx,4(%r9) movl %ecx,8(%r9) movl %edx,12(%r9) movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Ldec_epilogue: 
.byte 0xf3,0xc3 -.size asm_AES_decrypt,.-asm_AES_decrypt +.cfi_endproc +.size aes_nohw_decrypt,.-aes_nohw_decrypt .align 16 -.globl asm_AES_set_encrypt_key -.hidden asm_AES_set_encrypt_key -.type asm_AES_set_encrypt_key,@function -asm_AES_set_encrypt_key: +.globl aes_nohw_set_encrypt_key +.hidden aes_nohw_set_encrypt_key +.type aes_nohw_set_encrypt_key,@function +aes_nohw_set_encrypt_key: +.cfi_startproc pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 subq $8,%rsp +.cfi_adjust_cfa_offset 8 .Lenc_key_prologue: call _x86_64_AES_set_encrypt_key movq 40(%rsp),%rbp +.cfi_restore %rbp movq 48(%rsp),%rbx +.cfi_restore %rbx addq $56,%rsp +.cfi_adjust_cfa_offset -56 .Lenc_key_epilogue: .byte 0xf3,0xc3 -.size asm_AES_set_encrypt_key,.-asm_AES_set_encrypt_key +.cfi_endproc +.size aes_nohw_set_encrypt_key,.-aes_nohw_set_encrypt_key .type _x86_64_AES_set_encrypt_key,@function .align 16 _x86_64_AES_set_encrypt_key: +.cfi_startproc movl %esi,%ecx movq %rdi,%rsi movq %rdx,%rdi @@ -1101,19 +1172,34 @@ _x86_64_AES_set_encrypt_key: movq $-1,%rax .Lexit: .byte 0xf3,0xc3 +.cfi_endproc .size _x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key .align 16 -.globl asm_AES_set_decrypt_key -.hidden asm_AES_set_decrypt_key -.type asm_AES_set_decrypt_key,@function -asm_AES_set_decrypt_key: +.globl aes_nohw_set_decrypt_key +.hidden aes_nohw_set_decrypt_key +.type aes_nohw_set_decrypt_key,@function +aes_nohw_set_decrypt_key: +.cfi_startproc pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 pushq %rdx +.cfi_adjust_cfa_offset 8 .Ldec_key_prologue: call _x86_64_AES_set_encrypt_key @@ -1281,32 +1367,56 @@ asm_AES_set_decrypt_key: xorq %rax,%rax .Labort: movq 8(%rsp),%r15 +.cfi_restore %r15 movq 16(%rsp),%r14 +.cfi_restore %r14 movq 24(%rsp),%r13 +.cfi_restore %r13 movq 32(%rsp),%r12 +.cfi_restore %r12 movq 40(%rsp),%rbp +.cfi_restore %rbp movq 48(%rsp),%rbx +.cfi_restore %rbx addq $56,%rsp +.cfi_adjust_cfa_offset -56 .Ldec_key_epilogue: .byte 0xf3,0xc3 -.size asm_AES_set_decrypt_key,.-asm_AES_set_decrypt_key +.cfi_endproc +.size aes_nohw_set_decrypt_key,.-aes_nohw_set_decrypt_key .align 16 -.globl asm_AES_cbc_encrypt -.hidden asm_AES_cbc_encrypt -.type asm_AES_cbc_encrypt,@function +.globl aes_nohw_cbc_encrypt +.hidden aes_nohw_cbc_encrypt +.type aes_nohw_cbc_encrypt,@function .extern OPENSSL_ia32cap_P .hidden OPENSSL_ia32cap_P -.hidden asm_AES_cbc_encrypt -asm_AES_cbc_encrypt: +.hidden aes_nohw_cbc_encrypt +aes_nohw_cbc_encrypt: +.cfi_startproc cmpq $0,%rdx je .Lcbc_epilogue pushfq + + +.cfi_adjust_cfa_offset 8 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-32 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-40 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-48 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-56 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-64 .Lcbc_prologue: cld @@ -1317,6 +1427,7 @@ asm_AES_cbc_encrypt: 
cmpq $0,%r9 cmoveq %r10,%r14 +.cfi_remember_state leaq OPENSSL_ia32cap_P(%rip),%r10 movl (%r10),%r10d cmpq $512,%rdx @@ -1352,8 +1463,10 @@ asm_AES_cbc_encrypt: .Lcbc_te_ok: xchgq %rsp,%r15 +.cfi_def_cfa_register %r15 movq %r15,16(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x40 .Lcbc_fast_body: movq %rdi,24(%rsp) movq %rsi,32(%rsp) @@ -1551,6 +1664,7 @@ asm_AES_cbc_encrypt: .align 16 .Lcbc_slow_prologue: +.cfi_restore_state leaq -88(%rsp),%rbp andq $-64,%rbp @@ -1562,8 +1676,10 @@ asm_AES_cbc_encrypt: subq %r10,%rbp xchgq %rsp,%rbp +.cfi_def_cfa_register %rbp movq %rbp,16(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x40 .Lcbc_slow_body: @@ -1735,18 +1851,30 @@ asm_AES_cbc_encrypt: .align 16 .Lcbc_exit: movq 16(%rsp),%rsi +.cfi_def_cfa %rsi,64 movq (%rsi),%r15 +.cfi_restore %r15 movq 8(%rsi),%r14 +.cfi_restore %r14 movq 16(%rsi),%r13 +.cfi_restore %r13 movq 24(%rsi),%r12 +.cfi_restore %r12 movq 32(%rsi),%rbp +.cfi_restore %rbp movq 40(%rsi),%rbx +.cfi_restore %rbx leaq 48(%rsi),%rsp +.cfi_def_cfa %rsp,16 .Lcbc_popfq: popfq + + +.cfi_adjust_cfa_offset -8 .Lcbc_epilogue: .byte 0xf3,0xc3 -.size asm_AES_cbc_encrypt,.-asm_AES_cbc_encrypt +.cfi_endproc +.size aes_nohw_cbc_encrypt,.-aes_nohw_cbc_encrypt .align 64 .LAES_Te: .long 0xa56363c6,0xa56363c6 @@ -2534,3 +2662,4 @@ asm_AES_cbc_encrypt: .byte 65,69,83,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S index e7b4c48bef..65ab5c78fe 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .type _aesni_ctr32_ghash_6x,@function @@ -544,6 +556,11 @@ _aesni_ctr32_6x: .align 32 aesni_gcm_encrypt: .cfi_startproc +#ifdef BORINGSSL_DISPATCH_TEST +.extern BORINGSSL_function_hit +.hidden BORINGSSL_function_hit + movb $1,BORINGSSL_function_hit+2(%rip) +#endif xorq %r10,%r10 @@ -832,3 +849,4 @@ aesni_gcm_encrypt: .byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-x86_64.S index 0c980a304b..b98107f369 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-x86_64.S @@ -1,12 +1,30 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .extern OPENSSL_ia32cap_P .hidden OPENSSL_ia32cap_P -.globl aesni_encrypt -.hidden aesni_encrypt -.type aesni_encrypt,@function +.globl aes_hw_encrypt +.hidden aes_hw_encrypt +.type aes_hw_encrypt,@function .align 16 -aesni_encrypt: +aes_hw_encrypt: +.cfi_startproc +#ifdef BORINGSSL_DISPATCH_TEST +.extern BORINGSSL_function_hit +.hidden BORINGSSL_function_hit + movb $1,BORINGSSL_function_hit+1(%rip) +#endif movups (%rdi),%xmm2 movl 240(%rdx),%eax movups (%rdx),%xmm0 @@ -25,13 +43,15 @@ aesni_encrypt: movups %xmm2,(%rsi) pxor %xmm2,%xmm2 .byte 0xf3,0xc3 -.size aesni_encrypt,.-aesni_encrypt +.cfi_endproc +.size aes_hw_encrypt,.-aes_hw_encrypt -.globl aesni_decrypt -.hidden aesni_decrypt -.type aesni_decrypt,@function +.globl aes_hw_decrypt +.hidden aes_hw_decrypt +.type aes_hw_decrypt,@function .align 16 -aesni_decrypt: +aes_hw_decrypt: +.cfi_startproc movups (%rdi),%xmm2 movl 240(%rdx),%eax movups (%rdx),%xmm0 @@ -50,10 +70,12 @@ aesni_decrypt: movups %xmm2,(%rsi) pxor %xmm2,%xmm2 .byte 0xf3,0xc3 -.size aesni_decrypt, .-aesni_decrypt +.cfi_endproc +.size aes_hw_decrypt, .-aes_hw_decrypt .type _aesni_encrypt2,@function .align 16 _aesni_encrypt2: +.cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -79,10 +101,12 @@ _aesni_encrypt2: .byte 102,15,56,221,208 .byte 102,15,56,221,216 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_encrypt2,.-_aesni_encrypt2 .type _aesni_decrypt2,@function .align 16 _aesni_decrypt2: +.cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -108,10 +132,12 @@ _aesni_decrypt2: .byte 102,15,56,223,208 .byte 102,15,56,223,216 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_decrypt2,.-_aesni_decrypt2 .type _aesni_encrypt3,@function .align 16 _aesni_encrypt3: +.cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -142,10 +168,12 @@ _aesni_encrypt3: .byte 102,15,56,221,216 .byte 102,15,56,221,224 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_encrypt3,.-_aesni_encrypt3 .type _aesni_decrypt3,@function .align 16 _aesni_decrypt3: +.cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -176,10 +204,12 @@ _aesni_decrypt3: .byte 102,15,56,223,216 .byte 102,15,56,223,224 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_decrypt3,.-_aesni_decrypt3 .type _aesni_encrypt4,@function .align 16 _aesni_encrypt4: +.cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -216,10 +246,12 @@ _aesni_encrypt4: .byte 102,15,56,221,224 .byte 102,15,56,221,232 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_encrypt4,.-_aesni_encrypt4 .type _aesni_decrypt4,@function .align 16 _aesni_decrypt4: +.cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -256,10 +288,12 @@ _aesni_decrypt4: .byte 102,15,56,223,224 .byte 102,15,56,223,232 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_decrypt4,.-_aesni_decrypt4 .type _aesni_encrypt6,@function .align 16 _aesni_encrypt6: +.cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -310,10 +344,12 @@ _aesni_encrypt6: .byte 102,15,56,221,240 .byte 102,15,56,221,248 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_encrypt6,.-_aesni_encrypt6 .type _aesni_decrypt6,@function .align 16 _aesni_decrypt6: +.cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -364,10 +400,12 @@ _aesni_decrypt6: .byte 102,15,56,223,240 .byte 
102,15,56,223,248 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_decrypt6,.-_aesni_decrypt6 .type _aesni_encrypt8,@function .align 16 _aesni_encrypt8: +.cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -428,10 +466,12 @@ _aesni_encrypt8: .byte 102,68,15,56,221,192 .byte 102,68,15,56,221,200 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_encrypt8,.-_aesni_encrypt8 .type _aesni_decrypt8,@function .align 16 _aesni_decrypt8: +.cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -492,12 +532,14 @@ _aesni_decrypt8: .byte 102,68,15,56,223,192 .byte 102,68,15,56,223,200 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_decrypt8,.-_aesni_decrypt8 -.globl aesni_ecb_encrypt -.hidden aesni_ecb_encrypt -.type aesni_ecb_encrypt,@function +.globl aes_hw_ecb_encrypt +.hidden aes_hw_ecb_encrypt +.type aes_hw_ecb_encrypt,@function .align 16 -aesni_ecb_encrypt: +aes_hw_ecb_encrypt: +.cfi_startproc andq $-16,%rdx jz .Lecb_ret @@ -835,174 +877,17 @@ aesni_ecb_encrypt: xorps %xmm0,%xmm0 pxor %xmm1,%xmm1 .byte 0xf3,0xc3 -.size aesni_ecb_encrypt,.-aesni_ecb_encrypt -.globl aesni_ccm64_encrypt_blocks -.hidden aesni_ccm64_encrypt_blocks -.type aesni_ccm64_encrypt_blocks,@function +.cfi_endproc +.size aes_hw_ecb_encrypt,.-aes_hw_ecb_encrypt +.globl aes_hw_ctr32_encrypt_blocks +.hidden aes_hw_ctr32_encrypt_blocks +.type aes_hw_ctr32_encrypt_blocks,@function .align 16 -aesni_ccm64_encrypt_blocks: - movl 240(%rcx),%eax - movdqu (%r8),%xmm6 - movdqa .Lincrement64(%rip),%xmm9 - movdqa .Lbswap_mask(%rip),%xmm7 - - shll $4,%eax - movl $16,%r10d - leaq 0(%rcx),%r11 - movdqu (%r9),%xmm3 - movdqa %xmm6,%xmm2 - leaq 32(%rcx,%rax,1),%rcx -.byte 102,15,56,0,247 - subq %rax,%r10 - jmp .Lccm64_enc_outer -.align 16 -.Lccm64_enc_outer: - movups (%r11),%xmm0 - movq %r10,%rax - movups (%rdi),%xmm8 - - xorps %xmm0,%xmm2 - movups 16(%r11),%xmm1 - xorps %xmm8,%xmm0 - xorps %xmm0,%xmm3 - movups 32(%r11),%xmm0 - -.Lccm64_enc2_loop: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 - movups -16(%rcx,%rax,1),%xmm0 - jnz .Lccm64_enc2_loop -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - paddq %xmm9,%xmm6 - decq %rdx -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 - - leaq 16(%rdi),%rdi - xorps %xmm2,%xmm8 - movdqa %xmm6,%xmm2 - movups %xmm8,(%rsi) -.byte 102,15,56,0,215 - leaq 16(%rsi),%rsi - jnz .Lccm64_enc_outer - - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - movups %xmm3,(%r9) - pxor %xmm3,%xmm3 - pxor %xmm8,%xmm8 - pxor %xmm6,%xmm6 - .byte 0xf3,0xc3 -.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks -.globl aesni_ccm64_decrypt_blocks -.hidden aesni_ccm64_decrypt_blocks -.type aesni_ccm64_decrypt_blocks,@function -.align 16 -aesni_ccm64_decrypt_blocks: - movl 240(%rcx),%eax - movups (%r8),%xmm6 - movdqu (%r9),%xmm3 - movdqa .Lincrement64(%rip),%xmm9 - movdqa .Lbswap_mask(%rip),%xmm7 - - movaps %xmm6,%xmm2 - movl %eax,%r10d - movq %rcx,%r11 -.byte 102,15,56,0,247 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -.Loop_enc1_5: -.byte 102,15,56,220,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz .Loop_enc1_5 -.byte 102,15,56,221,209 - shll $4,%r10d - movl $16,%eax - movups (%rdi),%xmm8 - paddq %xmm9,%xmm6 - leaq 16(%rdi),%rdi - subq %r10,%rax - leaq 32(%r11,%r10,1),%rcx - movq %rax,%r10 - jmp .Lccm64_dec_outer -.align 16 -.Lccm64_dec_outer: - xorps %xmm2,%xmm8 - movdqa %xmm6,%xmm2 - movups %xmm8,(%rsi) - leaq 16(%rsi),%rsi -.byte 
102,15,56,0,215 - - subq $1,%rdx - jz .Lccm64_dec_break - - movups (%r11),%xmm0 - movq %r10,%rax - movups 16(%r11),%xmm1 - xorps %xmm0,%xmm8 - xorps %xmm0,%xmm2 - xorps %xmm8,%xmm3 - movups 32(%r11),%xmm0 - jmp .Lccm64_dec2_loop -.align 16 -.Lccm64_dec2_loop: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 - movups -16(%rcx,%rax,1),%xmm0 - jnz .Lccm64_dec2_loop - movups (%rdi),%xmm8 - paddq %xmm9,%xmm6 -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 - leaq 16(%rdi),%rdi - jmp .Lccm64_dec_outer - -.align 16 -.Lccm64_dec_break: - - movl 240(%r11),%eax - movups (%r11),%xmm0 - movups 16(%r11),%xmm1 - xorps %xmm0,%xmm8 - leaq 32(%r11),%r11 - xorps %xmm8,%xmm3 -.Loop_enc1_6: -.byte 102,15,56,220,217 - decl %eax - movups (%r11),%xmm1 - leaq 16(%r11),%r11 - jnz .Loop_enc1_6 -.byte 102,15,56,221,217 - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - movups %xmm3,(%r9) - pxor %xmm3,%xmm3 - pxor %xmm8,%xmm8 - pxor %xmm6,%xmm6 - .byte 0xf3,0xc3 -.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks -.globl aesni_ctr32_encrypt_blocks -.hidden aesni_ctr32_encrypt_blocks -.type aesni_ctr32_encrypt_blocks,@function -.align 16 -aesni_ctr32_encrypt_blocks: +aes_hw_ctr32_encrypt_blocks: +.cfi_startproc +#ifdef BORINGSSL_DISPATCH_TEST + movb $1,BORINGSSL_function_hit(%rip) +#endif cmpq $1,%rdx jne .Lctr32_bulk @@ -1015,12 +900,12 @@ aesni_ctr32_encrypt_blocks: movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 -.Loop_enc1_7: +.Loop_enc1_5: .byte 102,15,56,220,209 decl %edx movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_enc1_7 + jnz .Loop_enc1_5 .byte 102,15,56,221,209 pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 @@ -1033,7 +918,9 @@ aesni_ctr32_encrypt_blocks: .align 16 .Lctr32_bulk: leaq (%rsp),%r11 +.cfi_def_cfa_register %r11 pushq %rbp +.cfi_offset %rbp,-16 subq $128,%rsp andq $-16,%rsp @@ -1568,1798 +1455,19 @@ aesni_ctr32_encrypt_blocks: movaps %xmm0,112(%rsp) pxor %xmm15,%xmm15 movq -8(%r11),%rbp +.cfi_restore %rbp leaq (%r11),%rsp +.cfi_def_cfa_register %rsp .Lctr32_epilogue: .byte 0xf3,0xc3 -.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks -.globl aesni_xts_encrypt -.hidden aesni_xts_encrypt -.type aesni_xts_encrypt,@function +.cfi_endproc +.size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks +.globl aes_hw_cbc_encrypt +.hidden aes_hw_cbc_encrypt +.type aes_hw_cbc_encrypt,@function .align 16 -aesni_xts_encrypt: - leaq (%rsp),%r11 - pushq %rbp - subq $112,%rsp - andq $-16,%rsp - movups (%r9),%xmm2 - movl 240(%r8),%eax - movl 240(%rcx),%r10d - movups (%r8),%xmm0 - movups 16(%r8),%xmm1 - leaq 32(%r8),%r8 - xorps %xmm0,%xmm2 -.Loop_enc1_8: -.byte 102,15,56,220,209 - decl %eax - movups (%r8),%xmm1 - leaq 16(%r8),%r8 - jnz .Loop_enc1_8 -.byte 102,15,56,221,209 - movups (%rcx),%xmm0 - movq %rcx,%rbp - movl %r10d,%eax - shll $4,%r10d - movq %rdx,%r9 - andq $-16,%rdx - - movups 16(%rcx,%r10,1),%xmm1 - - movdqa .Lxts_magic(%rip),%xmm8 - movdqa %xmm2,%xmm15 - pshufd $0x5f,%xmm2,%xmm9 - pxor %xmm0,%xmm1 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm10 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm10 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm11 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm11 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm12 - psrad $31,%xmm14 - paddq 
%xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm12 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm13 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm13 - pxor %xmm14,%xmm15 - movdqa %xmm15,%xmm14 - psrad $31,%xmm9 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pxor %xmm0,%xmm14 - pxor %xmm9,%xmm15 - movaps %xmm1,96(%rsp) - - subq $96,%rdx - jc .Lxts_enc_short - - movl $16+96,%eax - leaq 32(%rbp,%r10,1),%rcx - subq %r10,%rax - movups 16(%rbp),%xmm1 - movq %rax,%r10 - leaq .Lxts_magic(%rip),%r8 - jmp .Lxts_enc_grandloop - -.align 32 -.Lxts_enc_grandloop: - movdqu 0(%rdi),%xmm2 - movdqa %xmm0,%xmm8 - movdqu 16(%rdi),%xmm3 - pxor %xmm10,%xmm2 - movdqu 32(%rdi),%xmm4 - pxor %xmm11,%xmm3 -.byte 102,15,56,220,209 - movdqu 48(%rdi),%xmm5 - pxor %xmm12,%xmm4 -.byte 102,15,56,220,217 - movdqu 64(%rdi),%xmm6 - pxor %xmm13,%xmm5 -.byte 102,15,56,220,225 - movdqu 80(%rdi),%xmm7 - pxor %xmm15,%xmm8 - movdqa 96(%rsp),%xmm9 - pxor %xmm14,%xmm6 -.byte 102,15,56,220,233 - movups 32(%rbp),%xmm0 - leaq 96(%rdi),%rdi - pxor %xmm8,%xmm7 - - pxor %xmm9,%xmm10 -.byte 102,15,56,220,241 - pxor %xmm9,%xmm11 - movdqa %xmm10,0(%rsp) -.byte 102,15,56,220,249 - movups 48(%rbp),%xmm1 - pxor %xmm9,%xmm12 - -.byte 102,15,56,220,208 - pxor %xmm9,%xmm13 - movdqa %xmm11,16(%rsp) -.byte 102,15,56,220,216 - pxor %xmm9,%xmm14 - movdqa %xmm12,32(%rsp) -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - pxor %xmm9,%xmm8 - movdqa %xmm14,64(%rsp) -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 - movups 64(%rbp),%xmm0 - movdqa %xmm8,80(%rsp) - pshufd $0x5f,%xmm15,%xmm9 - jmp .Lxts_enc_loop6 -.align 32 -.Lxts_enc_loop6: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups -64(%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 - movups -80(%rcx,%rax,1),%xmm0 - jnz .Lxts_enc_loop6 - - movdqa (%r8),%xmm8 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 -.byte 102,15,56,220,209 - paddq %xmm15,%xmm15 - psrad $31,%xmm14 -.byte 102,15,56,220,217 - pand %xmm8,%xmm14 - movups (%rbp),%xmm10 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 - pxor %xmm14,%xmm15 - movaps %xmm10,%xmm11 -.byte 102,15,56,220,249 - movups -64(%rcx),%xmm1 - - movdqa %xmm9,%xmm14 -.byte 102,15,56,220,208 - paddd %xmm9,%xmm9 - pxor %xmm15,%xmm10 -.byte 102,15,56,220,216 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - pand %xmm8,%xmm14 - movaps %xmm11,%xmm12 -.byte 102,15,56,220,240 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 -.byte 102,15,56,220,248 - movups -48(%rcx),%xmm0 - - paddd %xmm9,%xmm9 -.byte 102,15,56,220,209 - pxor %xmm15,%xmm11 - psrad $31,%xmm14 -.byte 102,15,56,220,217 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movdqa %xmm13,48(%rsp) - pxor %xmm14,%xmm15 -.byte 102,15,56,220,241 - movaps %xmm12,%xmm13 - movdqa %xmm9,%xmm14 -.byte 102,15,56,220,249 - movups -32(%rcx),%xmm1 - - paddd %xmm9,%xmm9 -.byte 102,15,56,220,208 - pxor %xmm15,%xmm12 - psrad $31,%xmm14 -.byte 102,15,56,220,216 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 -.byte 102,15,56,220,240 - pxor %xmm14,%xmm15 - movaps %xmm13,%xmm14 -.byte 102,15,56,220,248 - - movdqa %xmm9,%xmm0 - paddd %xmm9,%xmm9 -.byte 102,15,56,220,209 - pxor 
%xmm15,%xmm13 - psrad $31,%xmm0 -.byte 102,15,56,220,217 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm0 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - pxor %xmm0,%xmm15 - movups (%rbp),%xmm0 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups 16(%rbp),%xmm1 - - pxor %xmm15,%xmm14 -.byte 102,15,56,221,84,36,0 - psrad $31,%xmm9 - paddq %xmm15,%xmm15 -.byte 102,15,56,221,92,36,16 -.byte 102,15,56,221,100,36,32 - pand %xmm8,%xmm9 - movq %r10,%rax -.byte 102,15,56,221,108,36,48 -.byte 102,15,56,221,116,36,64 -.byte 102,15,56,221,124,36,80 - pxor %xmm9,%xmm15 - - leaq 96(%rsi),%rsi - movups %xmm2,-96(%rsi) - movups %xmm3,-80(%rsi) - movups %xmm4,-64(%rsi) - movups %xmm5,-48(%rsi) - movups %xmm6,-32(%rsi) - movups %xmm7,-16(%rsi) - subq $96,%rdx - jnc .Lxts_enc_grandloop - - movl $16+96,%eax - subl %r10d,%eax - movq %rbp,%rcx - shrl $4,%eax - -.Lxts_enc_short: - - movl %eax,%r10d - pxor %xmm0,%xmm10 - addq $96,%rdx - jz .Lxts_enc_done - - pxor %xmm0,%xmm11 - cmpq $0x20,%rdx - jb .Lxts_enc_one - pxor %xmm0,%xmm12 - je .Lxts_enc_two - - pxor %xmm0,%xmm13 - cmpq $0x40,%rdx - jb .Lxts_enc_three - pxor %xmm0,%xmm14 - je .Lxts_enc_four - - movdqu (%rdi),%xmm2 - movdqu 16(%rdi),%xmm3 - movdqu 32(%rdi),%xmm4 - pxor %xmm10,%xmm2 - movdqu 48(%rdi),%xmm5 - pxor %xmm11,%xmm3 - movdqu 64(%rdi),%xmm6 - leaq 80(%rdi),%rdi - pxor %xmm12,%xmm4 - pxor %xmm13,%xmm5 - pxor %xmm14,%xmm6 - pxor %xmm7,%xmm7 - - call _aesni_encrypt6 - - xorps %xmm10,%xmm2 - movdqa %xmm15,%xmm10 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - movdqu %xmm2,(%rsi) - xorps %xmm13,%xmm5 - movdqu %xmm3,16(%rsi) - xorps %xmm14,%xmm6 - movdqu %xmm4,32(%rsi) - movdqu %xmm5,48(%rsi) - movdqu %xmm6,64(%rsi) - leaq 80(%rsi),%rsi - jmp .Lxts_enc_done - -.align 16 -.Lxts_enc_one: - movups (%rdi),%xmm2 - leaq 16(%rdi),%rdi - xorps %xmm10,%xmm2 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -.Loop_enc1_9: -.byte 102,15,56,220,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz .Loop_enc1_9 -.byte 102,15,56,221,209 - xorps %xmm10,%xmm2 - movdqa %xmm11,%xmm10 - movups %xmm2,(%rsi) - leaq 16(%rsi),%rsi - jmp .Lxts_enc_done - -.align 16 -.Lxts_enc_two: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - leaq 32(%rdi),%rdi - xorps %xmm10,%xmm2 - xorps %xmm11,%xmm3 - - call _aesni_encrypt2 - - xorps %xmm10,%xmm2 - movdqa %xmm12,%xmm10 - xorps %xmm11,%xmm3 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - leaq 32(%rsi),%rsi - jmp .Lxts_enc_done - -.align 16 -.Lxts_enc_three: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - movups 32(%rdi),%xmm4 - leaq 48(%rdi),%rdi - xorps %xmm10,%xmm2 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - - call _aesni_encrypt3 - - xorps %xmm10,%xmm2 - movdqa %xmm13,%xmm10 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - leaq 48(%rsi),%rsi - jmp .Lxts_enc_done - -.align 16 -.Lxts_enc_four: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - movups 32(%rdi),%xmm4 - xorps %xmm10,%xmm2 - movups 48(%rdi),%xmm5 - leaq 64(%rdi),%rdi - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - xorps %xmm13,%xmm5 - - call _aesni_encrypt4 - - pxor %xmm10,%xmm2 - movdqa %xmm14,%xmm10 - pxor %xmm11,%xmm3 - pxor %xmm12,%xmm4 - movdqu %xmm2,(%rsi) - pxor %xmm13,%xmm5 - movdqu %xmm3,16(%rsi) - movdqu %xmm4,32(%rsi) - movdqu %xmm5,48(%rsi) - leaq 64(%rsi),%rsi - jmp .Lxts_enc_done - -.align 16 -.Lxts_enc_done: - andq $15,%r9 - jz .Lxts_enc_ret - movq %r9,%rdx - -.Lxts_enc_steal: - movzbl (%rdi),%eax - movzbl -16(%rsi),%ecx - leaq 1(%rdi),%rdi - movb 
%al,-16(%rsi) - movb %cl,0(%rsi) - leaq 1(%rsi),%rsi - subq $1,%rdx - jnz .Lxts_enc_steal - - subq %r9,%rsi - movq %rbp,%rcx - movl %r10d,%eax - - movups -16(%rsi),%xmm2 - xorps %xmm10,%xmm2 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -.Loop_enc1_10: -.byte 102,15,56,220,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz .Loop_enc1_10 -.byte 102,15,56,221,209 - xorps %xmm10,%xmm2 - movups %xmm2,-16(%rsi) - -.Lxts_enc_ret: - xorps %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - movaps %xmm0,0(%rsp) - pxor %xmm8,%xmm8 - movaps %xmm0,16(%rsp) - pxor %xmm9,%xmm9 - movaps %xmm0,32(%rsp) - pxor %xmm10,%xmm10 - movaps %xmm0,48(%rsp) - pxor %xmm11,%xmm11 - movaps %xmm0,64(%rsp) - pxor %xmm12,%xmm12 - movaps %xmm0,80(%rsp) - pxor %xmm13,%xmm13 - movaps %xmm0,96(%rsp) - pxor %xmm14,%xmm14 - pxor %xmm15,%xmm15 - movq -8(%r11),%rbp - leaq (%r11),%rsp -.Lxts_enc_epilogue: - .byte 0xf3,0xc3 -.size aesni_xts_encrypt,.-aesni_xts_encrypt -.globl aesni_xts_decrypt -.hidden aesni_xts_decrypt -.type aesni_xts_decrypt,@function -.align 16 -aesni_xts_decrypt: - leaq (%rsp),%r11 - pushq %rbp - subq $112,%rsp - andq $-16,%rsp - movups (%r9),%xmm2 - movl 240(%r8),%eax - movl 240(%rcx),%r10d - movups (%r8),%xmm0 - movups 16(%r8),%xmm1 - leaq 32(%r8),%r8 - xorps %xmm0,%xmm2 -.Loop_enc1_11: -.byte 102,15,56,220,209 - decl %eax - movups (%r8),%xmm1 - leaq 16(%r8),%r8 - jnz .Loop_enc1_11 -.byte 102,15,56,221,209 - xorl %eax,%eax - testq $15,%rdx - setnz %al - shlq $4,%rax - subq %rax,%rdx - - movups (%rcx),%xmm0 - movq %rcx,%rbp - movl %r10d,%eax - shll $4,%r10d - movq %rdx,%r9 - andq $-16,%rdx - - movups 16(%rcx,%r10,1),%xmm1 - - movdqa .Lxts_magic(%rip),%xmm8 - movdqa %xmm2,%xmm15 - pshufd $0x5f,%xmm2,%xmm9 - pxor %xmm0,%xmm1 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm10 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm10 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm11 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm11 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm12 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm12 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm13 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm13 - pxor %xmm14,%xmm15 - movdqa %xmm15,%xmm14 - psrad $31,%xmm9 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pxor %xmm0,%xmm14 - pxor %xmm9,%xmm15 - movaps %xmm1,96(%rsp) - - subq $96,%rdx - jc .Lxts_dec_short - - movl $16+96,%eax - leaq 32(%rbp,%r10,1),%rcx - subq %r10,%rax - movups 16(%rbp),%xmm1 - movq %rax,%r10 - leaq .Lxts_magic(%rip),%r8 - jmp .Lxts_dec_grandloop - -.align 32 -.Lxts_dec_grandloop: - movdqu 0(%rdi),%xmm2 - movdqa %xmm0,%xmm8 - movdqu 16(%rdi),%xmm3 - pxor %xmm10,%xmm2 - movdqu 32(%rdi),%xmm4 - pxor %xmm11,%xmm3 -.byte 102,15,56,222,209 - movdqu 48(%rdi),%xmm5 - pxor %xmm12,%xmm4 -.byte 102,15,56,222,217 - movdqu 64(%rdi),%xmm6 - pxor %xmm13,%xmm5 -.byte 102,15,56,222,225 - movdqu 80(%rdi),%xmm7 - pxor %xmm15,%xmm8 - movdqa 96(%rsp),%xmm9 - pxor %xmm14,%xmm6 -.byte 102,15,56,222,233 - movups 32(%rbp),%xmm0 - leaq 96(%rdi),%rdi - pxor %xmm8,%xmm7 - - pxor %xmm9,%xmm10 -.byte 102,15,56,222,241 - pxor %xmm9,%xmm11 - movdqa %xmm10,0(%rsp) -.byte 102,15,56,222,249 - movups 48(%rbp),%xmm1 - pxor 
%xmm9,%xmm12 - -.byte 102,15,56,222,208 - pxor %xmm9,%xmm13 - movdqa %xmm11,16(%rsp) -.byte 102,15,56,222,216 - pxor %xmm9,%xmm14 - movdqa %xmm12,32(%rsp) -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 - pxor %xmm9,%xmm8 - movdqa %xmm14,64(%rsp) -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 - movups 64(%rbp),%xmm0 - movdqa %xmm8,80(%rsp) - pshufd $0x5f,%xmm15,%xmm9 - jmp .Lxts_dec_loop6 -.align 32 -.Lxts_dec_loop6: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups -64(%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 - movups -80(%rcx,%rax,1),%xmm0 - jnz .Lxts_dec_loop6 - - movdqa (%r8),%xmm8 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 -.byte 102,15,56,222,209 - paddq %xmm15,%xmm15 - psrad $31,%xmm14 -.byte 102,15,56,222,217 - pand %xmm8,%xmm14 - movups (%rbp),%xmm10 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 - pxor %xmm14,%xmm15 - movaps %xmm10,%xmm11 -.byte 102,15,56,222,249 - movups -64(%rcx),%xmm1 - - movdqa %xmm9,%xmm14 -.byte 102,15,56,222,208 - paddd %xmm9,%xmm9 - pxor %xmm15,%xmm10 -.byte 102,15,56,222,216 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 - pand %xmm8,%xmm14 - movaps %xmm11,%xmm12 -.byte 102,15,56,222,240 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 -.byte 102,15,56,222,248 - movups -48(%rcx),%xmm0 - - paddd %xmm9,%xmm9 -.byte 102,15,56,222,209 - pxor %xmm15,%xmm11 - psrad $31,%xmm14 -.byte 102,15,56,222,217 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - movdqa %xmm13,48(%rsp) - pxor %xmm14,%xmm15 -.byte 102,15,56,222,241 - movaps %xmm12,%xmm13 - movdqa %xmm9,%xmm14 -.byte 102,15,56,222,249 - movups -32(%rcx),%xmm1 - - paddd %xmm9,%xmm9 -.byte 102,15,56,222,208 - pxor %xmm15,%xmm12 - psrad $31,%xmm14 -.byte 102,15,56,222,216 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 - pxor %xmm14,%xmm15 - movaps %xmm13,%xmm14 -.byte 102,15,56,222,248 - - movdqa %xmm9,%xmm0 - paddd %xmm9,%xmm9 -.byte 102,15,56,222,209 - pxor %xmm15,%xmm13 - psrad $31,%xmm0 -.byte 102,15,56,222,217 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm0 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - pxor %xmm0,%xmm15 - movups (%rbp),%xmm0 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups 16(%rbp),%xmm1 - - pxor %xmm15,%xmm14 -.byte 102,15,56,223,84,36,0 - psrad $31,%xmm9 - paddq %xmm15,%xmm15 -.byte 102,15,56,223,92,36,16 -.byte 102,15,56,223,100,36,32 - pand %xmm8,%xmm9 - movq %r10,%rax -.byte 102,15,56,223,108,36,48 -.byte 102,15,56,223,116,36,64 -.byte 102,15,56,223,124,36,80 - pxor %xmm9,%xmm15 - - leaq 96(%rsi),%rsi - movups %xmm2,-96(%rsi) - movups %xmm3,-80(%rsi) - movups %xmm4,-64(%rsi) - movups %xmm5,-48(%rsi) - movups %xmm6,-32(%rsi) - movups %xmm7,-16(%rsi) - subq $96,%rdx - jnc .Lxts_dec_grandloop - - movl $16+96,%eax - subl %r10d,%eax - movq %rbp,%rcx - shrl $4,%eax - -.Lxts_dec_short: - - movl %eax,%r10d - pxor %xmm0,%xmm10 - pxor %xmm0,%xmm11 - addq $96,%rdx - jz .Lxts_dec_done - - pxor %xmm0,%xmm12 - cmpq $0x20,%rdx - jb .Lxts_dec_one - pxor %xmm0,%xmm13 - je .Lxts_dec_two - - pxor %xmm0,%xmm14 - cmpq $0x40,%rdx - jb .Lxts_dec_three - je .Lxts_dec_four - - movdqu (%rdi),%xmm2 - movdqu 16(%rdi),%xmm3 - movdqu 32(%rdi),%xmm4 - pxor %xmm10,%xmm2 - 
movdqu 48(%rdi),%xmm5 - pxor %xmm11,%xmm3 - movdqu 64(%rdi),%xmm6 - leaq 80(%rdi),%rdi - pxor %xmm12,%xmm4 - pxor %xmm13,%xmm5 - pxor %xmm14,%xmm6 - - call _aesni_decrypt6 - - xorps %xmm10,%xmm2 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - movdqu %xmm2,(%rsi) - xorps %xmm13,%xmm5 - movdqu %xmm3,16(%rsi) - xorps %xmm14,%xmm6 - movdqu %xmm4,32(%rsi) - pxor %xmm14,%xmm14 - movdqu %xmm5,48(%rsi) - pcmpgtd %xmm15,%xmm14 - movdqu %xmm6,64(%rsi) - leaq 80(%rsi),%rsi - pshufd $0x13,%xmm14,%xmm11 - andq $15,%r9 - jz .Lxts_dec_ret - - movdqa %xmm15,%xmm10 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm11 - pxor %xmm15,%xmm11 - jmp .Lxts_dec_done2 - -.align 16 -.Lxts_dec_one: - movups (%rdi),%xmm2 - leaq 16(%rdi),%rdi - xorps %xmm10,%xmm2 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -.Loop_dec1_12: -.byte 102,15,56,222,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz .Loop_dec1_12 -.byte 102,15,56,223,209 - xorps %xmm10,%xmm2 - movdqa %xmm11,%xmm10 - movups %xmm2,(%rsi) - movdqa %xmm12,%xmm11 - leaq 16(%rsi),%rsi - jmp .Lxts_dec_done - -.align 16 -.Lxts_dec_two: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - leaq 32(%rdi),%rdi - xorps %xmm10,%xmm2 - xorps %xmm11,%xmm3 - - call _aesni_decrypt2 - - xorps %xmm10,%xmm2 - movdqa %xmm12,%xmm10 - xorps %xmm11,%xmm3 - movdqa %xmm13,%xmm11 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - leaq 32(%rsi),%rsi - jmp .Lxts_dec_done - -.align 16 -.Lxts_dec_three: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - movups 32(%rdi),%xmm4 - leaq 48(%rdi),%rdi - xorps %xmm10,%xmm2 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - - call _aesni_decrypt3 - - xorps %xmm10,%xmm2 - movdqa %xmm13,%xmm10 - xorps %xmm11,%xmm3 - movdqa %xmm14,%xmm11 - xorps %xmm12,%xmm4 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - leaq 48(%rsi),%rsi - jmp .Lxts_dec_done - -.align 16 -.Lxts_dec_four: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - movups 32(%rdi),%xmm4 - xorps %xmm10,%xmm2 - movups 48(%rdi),%xmm5 - leaq 64(%rdi),%rdi - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - xorps %xmm13,%xmm5 - - call _aesni_decrypt4 - - pxor %xmm10,%xmm2 - movdqa %xmm14,%xmm10 - pxor %xmm11,%xmm3 - movdqa %xmm15,%xmm11 - pxor %xmm12,%xmm4 - movdqu %xmm2,(%rsi) - pxor %xmm13,%xmm5 - movdqu %xmm3,16(%rsi) - movdqu %xmm4,32(%rsi) - movdqu %xmm5,48(%rsi) - leaq 64(%rsi),%rsi - jmp .Lxts_dec_done - -.align 16 -.Lxts_dec_done: - andq $15,%r9 - jz .Lxts_dec_ret -.Lxts_dec_done2: - movq %r9,%rdx - movq %rbp,%rcx - movl %r10d,%eax - - movups (%rdi),%xmm2 - xorps %xmm11,%xmm2 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -.Loop_dec1_13: -.byte 102,15,56,222,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz .Loop_dec1_13 -.byte 102,15,56,223,209 - xorps %xmm11,%xmm2 - movups %xmm2,(%rsi) - -.Lxts_dec_steal: - movzbl 16(%rdi),%eax - movzbl (%rsi),%ecx - leaq 1(%rdi),%rdi - movb %al,(%rsi) - movb %cl,16(%rsi) - leaq 1(%rsi),%rsi - subq $1,%rdx - jnz .Lxts_dec_steal - - subq %r9,%rsi - movq %rbp,%rcx - movl %r10d,%eax - - movups (%rsi),%xmm2 - xorps %xmm10,%xmm2 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -.Loop_dec1_14: -.byte 102,15,56,222,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz .Loop_dec1_14 -.byte 102,15,56,223,209 - xorps %xmm10,%xmm2 - movups %xmm2,(%rsi) - -.Lxts_dec_ret: - xorps %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - 
movaps %xmm0,0(%rsp) - pxor %xmm8,%xmm8 - movaps %xmm0,16(%rsp) - pxor %xmm9,%xmm9 - movaps %xmm0,32(%rsp) - pxor %xmm10,%xmm10 - movaps %xmm0,48(%rsp) - pxor %xmm11,%xmm11 - movaps %xmm0,64(%rsp) - pxor %xmm12,%xmm12 - movaps %xmm0,80(%rsp) - pxor %xmm13,%xmm13 - movaps %xmm0,96(%rsp) - pxor %xmm14,%xmm14 - pxor %xmm15,%xmm15 - movq -8(%r11),%rbp - leaq (%r11),%rsp -.Lxts_dec_epilogue: - .byte 0xf3,0xc3 -.size aesni_xts_decrypt,.-aesni_xts_decrypt -.globl aesni_ocb_encrypt -.hidden aesni_ocb_encrypt -.type aesni_ocb_encrypt,@function -.align 32 -aesni_ocb_encrypt: - leaq (%rsp),%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - movq 8(%rax),%rbx - movq 8+8(%rax),%rbp - - movl 240(%rcx),%r10d - movq %rcx,%r11 - shll $4,%r10d - movups (%rcx),%xmm9 - movups 16(%rcx,%r10,1),%xmm1 - - movdqu (%r9),%xmm15 - pxor %xmm1,%xmm9 - pxor %xmm1,%xmm15 - - movl $16+32,%eax - leaq 32(%r11,%r10,1),%rcx - movups 16(%r11),%xmm1 - subq %r10,%rax - movq %rax,%r10 - - movdqu (%rbx),%xmm10 - movdqu (%rbp),%xmm8 - - testq $1,%r8 - jnz .Locb_enc_odd - - bsfq %r8,%r12 - addq $1,%r8 - shlq $4,%r12 - movdqu (%rbx,%r12,1),%xmm7 - movdqu (%rdi),%xmm2 - leaq 16(%rdi),%rdi - - call __ocb_encrypt1 - - movdqa %xmm7,%xmm15 - movups %xmm2,(%rsi) - leaq 16(%rsi),%rsi - subq $1,%rdx - jz .Locb_enc_done - -.Locb_enc_odd: - leaq 1(%r8),%r12 - leaq 3(%r8),%r13 - leaq 5(%r8),%r14 - leaq 6(%r8),%r8 - bsfq %r12,%r12 - bsfq %r13,%r13 - bsfq %r14,%r14 - shlq $4,%r12 - shlq $4,%r13 - shlq $4,%r14 - - subq $6,%rdx - jc .Locb_enc_short - jmp .Locb_enc_grandloop - -.align 32 -.Locb_enc_grandloop: - movdqu 0(%rdi),%xmm2 - movdqu 16(%rdi),%xmm3 - movdqu 32(%rdi),%xmm4 - movdqu 48(%rdi),%xmm5 - movdqu 64(%rdi),%xmm6 - movdqu 80(%rdi),%xmm7 - leaq 96(%rdi),%rdi - - call __ocb_encrypt6 - - movups %xmm2,0(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - movups %xmm6,64(%rsi) - movups %xmm7,80(%rsi) - leaq 96(%rsi),%rsi - subq $6,%rdx - jnc .Locb_enc_grandloop - -.Locb_enc_short: - addq $6,%rdx - jz .Locb_enc_done - - movdqu 0(%rdi),%xmm2 - cmpq $2,%rdx - jb .Locb_enc_one - movdqu 16(%rdi),%xmm3 - je .Locb_enc_two - - movdqu 32(%rdi),%xmm4 - cmpq $4,%rdx - jb .Locb_enc_three - movdqu 48(%rdi),%xmm5 - je .Locb_enc_four - - movdqu 64(%rdi),%xmm6 - pxor %xmm7,%xmm7 - - call __ocb_encrypt6 - - movdqa %xmm14,%xmm15 - movups %xmm2,0(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - movups %xmm6,64(%rsi) - - jmp .Locb_enc_done - -.align 16 -.Locb_enc_one: - movdqa %xmm10,%xmm7 - - call __ocb_encrypt1 - - movdqa %xmm7,%xmm15 - movups %xmm2,0(%rsi) - jmp .Locb_enc_done - -.align 16 -.Locb_enc_two: - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - - call __ocb_encrypt4 - - movdqa %xmm11,%xmm15 - movups %xmm2,0(%rsi) - movups %xmm3,16(%rsi) - - jmp .Locb_enc_done - -.align 16 -.Locb_enc_three: - pxor %xmm5,%xmm5 - - call __ocb_encrypt4 - - movdqa %xmm12,%xmm15 - movups %xmm2,0(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - - jmp .Locb_enc_done - -.align 16 -.Locb_enc_four: - call __ocb_encrypt4 - - movdqa %xmm13,%xmm15 - movups %xmm2,0(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - -.Locb_enc_done: - pxor %xmm0,%xmm15 - movdqu %xmm8,(%rbp) - movdqu %xmm15,(%r9) - - xorps %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - pxor %xmm8,%xmm8 - pxor %xmm9,%xmm9 - pxor %xmm10,%xmm10 - pxor %xmm11,%xmm11 - pxor %xmm12,%xmm12 - pxor %xmm13,%xmm13 - pxor 
%xmm14,%xmm14 - pxor %xmm15,%xmm15 - leaq 40(%rsp),%rax - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -.Locb_enc_epilogue: - .byte 0xf3,0xc3 -.size aesni_ocb_encrypt,.-aesni_ocb_encrypt - -.type __ocb_encrypt6,@function -.align 32 -__ocb_encrypt6: - pxor %xmm9,%xmm15 - movdqu (%rbx,%r12,1),%xmm11 - movdqa %xmm10,%xmm12 - movdqu (%rbx,%r13,1),%xmm13 - movdqa %xmm10,%xmm14 - pxor %xmm15,%xmm10 - movdqu (%rbx,%r14,1),%xmm15 - pxor %xmm10,%xmm11 - pxor %xmm2,%xmm8 - pxor %xmm10,%xmm2 - pxor %xmm11,%xmm12 - pxor %xmm3,%xmm8 - pxor %xmm11,%xmm3 - pxor %xmm12,%xmm13 - pxor %xmm4,%xmm8 - pxor %xmm12,%xmm4 - pxor %xmm13,%xmm14 - pxor %xmm5,%xmm8 - pxor %xmm13,%xmm5 - pxor %xmm14,%xmm15 - pxor %xmm6,%xmm8 - pxor %xmm14,%xmm6 - pxor %xmm7,%xmm8 - pxor %xmm15,%xmm7 - movups 32(%r11),%xmm0 - - leaq 1(%r8),%r12 - leaq 3(%r8),%r13 - leaq 5(%r8),%r14 - addq $6,%r8 - pxor %xmm9,%xmm10 - bsfq %r12,%r12 - bsfq %r13,%r13 - bsfq %r14,%r14 - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - pxor %xmm9,%xmm11 - pxor %xmm9,%xmm12 -.byte 102,15,56,220,241 - pxor %xmm9,%xmm13 - pxor %xmm9,%xmm14 -.byte 102,15,56,220,249 - movups 48(%r11),%xmm1 - pxor %xmm9,%xmm15 - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 - movups 64(%r11),%xmm0 - shlq $4,%r12 - shlq $4,%r13 - jmp .Locb_enc_loop6 - -.align 32 -.Locb_enc_loop6: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 - movups -16(%rcx,%rax,1),%xmm0 - jnz .Locb_enc_loop6 - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups 16(%r11),%xmm1 - shlq $4,%r14 - -.byte 102,65,15,56,221,210 - movdqu (%rbx),%xmm10 - movq %r10,%rax -.byte 102,65,15,56,221,219 -.byte 102,65,15,56,221,228 -.byte 102,65,15,56,221,237 -.byte 102,65,15,56,221,246 -.byte 102,65,15,56,221,255 - .byte 0xf3,0xc3 -.size __ocb_encrypt6,.-__ocb_encrypt6 - -.type __ocb_encrypt4,@function -.align 32 -__ocb_encrypt4: - pxor %xmm9,%xmm15 - movdqu (%rbx,%r12,1),%xmm11 - movdqa %xmm10,%xmm12 - movdqu (%rbx,%r13,1),%xmm13 - pxor %xmm15,%xmm10 - pxor %xmm10,%xmm11 - pxor %xmm2,%xmm8 - pxor %xmm10,%xmm2 - pxor %xmm11,%xmm12 - pxor %xmm3,%xmm8 - pxor %xmm11,%xmm3 - pxor %xmm12,%xmm13 - pxor %xmm4,%xmm8 - pxor %xmm12,%xmm4 - pxor %xmm5,%xmm8 - pxor %xmm13,%xmm5 - movups 32(%r11),%xmm0 - - pxor %xmm9,%xmm10 - pxor %xmm9,%xmm11 - pxor %xmm9,%xmm12 - pxor %xmm9,%xmm13 - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movups 48(%r11),%xmm1 - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - movups 64(%r11),%xmm0 - jmp .Locb_enc_loop4 - -.align 32 -.Locb_enc_loop4: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - movups -16(%rcx,%rax,1),%xmm0 - jnz .Locb_enc_loop4 - -.byte 102,15,56,220,209 
-.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movups 16(%r11),%xmm1 - movq %r10,%rax - -.byte 102,65,15,56,221,210 -.byte 102,65,15,56,221,219 -.byte 102,65,15,56,221,228 -.byte 102,65,15,56,221,237 - .byte 0xf3,0xc3 -.size __ocb_encrypt4,.-__ocb_encrypt4 - -.type __ocb_encrypt1,@function -.align 32 -__ocb_encrypt1: - pxor %xmm15,%xmm7 - pxor %xmm9,%xmm7 - pxor %xmm2,%xmm8 - pxor %xmm7,%xmm2 - movups 32(%r11),%xmm0 - -.byte 102,15,56,220,209 - movups 48(%r11),%xmm1 - pxor %xmm9,%xmm7 - -.byte 102,15,56,220,208 - movups 64(%r11),%xmm0 - jmp .Locb_enc_loop1 - -.align 32 -.Locb_enc_loop1: -.byte 102,15,56,220,209 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,220,208 - movups -16(%rcx,%rax,1),%xmm0 - jnz .Locb_enc_loop1 - -.byte 102,15,56,220,209 - movups 16(%r11),%xmm1 - movq %r10,%rax - -.byte 102,15,56,221,215 - .byte 0xf3,0xc3 -.size __ocb_encrypt1,.-__ocb_encrypt1 - -.globl aesni_ocb_decrypt -.hidden aesni_ocb_decrypt -.type aesni_ocb_decrypt,@function -.align 32 -aesni_ocb_decrypt: - leaq (%rsp),%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - movq 8(%rax),%rbx - movq 8+8(%rax),%rbp - - movl 240(%rcx),%r10d - movq %rcx,%r11 - shll $4,%r10d - movups (%rcx),%xmm9 - movups 16(%rcx,%r10,1),%xmm1 - - movdqu (%r9),%xmm15 - pxor %xmm1,%xmm9 - pxor %xmm1,%xmm15 - - movl $16+32,%eax - leaq 32(%r11,%r10,1),%rcx - movups 16(%r11),%xmm1 - subq %r10,%rax - movq %rax,%r10 - - movdqu (%rbx),%xmm10 - movdqu (%rbp),%xmm8 - - testq $1,%r8 - jnz .Locb_dec_odd - - bsfq %r8,%r12 - addq $1,%r8 - shlq $4,%r12 - movdqu (%rbx,%r12,1),%xmm7 - movdqu (%rdi),%xmm2 - leaq 16(%rdi),%rdi - - call __ocb_decrypt1 - - movdqa %xmm7,%xmm15 - movups %xmm2,(%rsi) - xorps %xmm2,%xmm8 - leaq 16(%rsi),%rsi - subq $1,%rdx - jz .Locb_dec_done - -.Locb_dec_odd: - leaq 1(%r8),%r12 - leaq 3(%r8),%r13 - leaq 5(%r8),%r14 - leaq 6(%r8),%r8 - bsfq %r12,%r12 - bsfq %r13,%r13 - bsfq %r14,%r14 - shlq $4,%r12 - shlq $4,%r13 - shlq $4,%r14 - - subq $6,%rdx - jc .Locb_dec_short - jmp .Locb_dec_grandloop - -.align 32 -.Locb_dec_grandloop: - movdqu 0(%rdi),%xmm2 - movdqu 16(%rdi),%xmm3 - movdqu 32(%rdi),%xmm4 - movdqu 48(%rdi),%xmm5 - movdqu 64(%rdi),%xmm6 - movdqu 80(%rdi),%xmm7 - leaq 96(%rdi),%rdi - - call __ocb_decrypt6 - - movups %xmm2,0(%rsi) - pxor %xmm2,%xmm8 - movups %xmm3,16(%rsi) - pxor %xmm3,%xmm8 - movups %xmm4,32(%rsi) - pxor %xmm4,%xmm8 - movups %xmm5,48(%rsi) - pxor %xmm5,%xmm8 - movups %xmm6,64(%rsi) - pxor %xmm6,%xmm8 - movups %xmm7,80(%rsi) - pxor %xmm7,%xmm8 - leaq 96(%rsi),%rsi - subq $6,%rdx - jnc .Locb_dec_grandloop - -.Locb_dec_short: - addq $6,%rdx - jz .Locb_dec_done - - movdqu 0(%rdi),%xmm2 - cmpq $2,%rdx - jb .Locb_dec_one - movdqu 16(%rdi),%xmm3 - je .Locb_dec_two - - movdqu 32(%rdi),%xmm4 - cmpq $4,%rdx - jb .Locb_dec_three - movdqu 48(%rdi),%xmm5 - je .Locb_dec_four - - movdqu 64(%rdi),%xmm6 - pxor %xmm7,%xmm7 - - call __ocb_decrypt6 - - movdqa %xmm14,%xmm15 - movups %xmm2,0(%rsi) - pxor %xmm2,%xmm8 - movups %xmm3,16(%rsi) - pxor %xmm3,%xmm8 - movups %xmm4,32(%rsi) - pxor %xmm4,%xmm8 - movups %xmm5,48(%rsi) - pxor %xmm5,%xmm8 - movups %xmm6,64(%rsi) - pxor %xmm6,%xmm8 - - jmp .Locb_dec_done - -.align 16 -.Locb_dec_one: - movdqa %xmm10,%xmm7 - - call __ocb_decrypt1 - - movdqa %xmm7,%xmm15 - movups %xmm2,0(%rsi) - xorps %xmm2,%xmm8 - jmp .Locb_dec_done - -.align 16 -.Locb_dec_two: - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - - call __ocb_decrypt4 - - movdqa %xmm11,%xmm15 - movups %xmm2,0(%rsi) - xorps %xmm2,%xmm8 - movups %xmm3,16(%rsi) - xorps 
%xmm3,%xmm8 - - jmp .Locb_dec_done - -.align 16 -.Locb_dec_three: - pxor %xmm5,%xmm5 - - call __ocb_decrypt4 - - movdqa %xmm12,%xmm15 - movups %xmm2,0(%rsi) - xorps %xmm2,%xmm8 - movups %xmm3,16(%rsi) - xorps %xmm3,%xmm8 - movups %xmm4,32(%rsi) - xorps %xmm4,%xmm8 - - jmp .Locb_dec_done - -.align 16 -.Locb_dec_four: - call __ocb_decrypt4 - - movdqa %xmm13,%xmm15 - movups %xmm2,0(%rsi) - pxor %xmm2,%xmm8 - movups %xmm3,16(%rsi) - pxor %xmm3,%xmm8 - movups %xmm4,32(%rsi) - pxor %xmm4,%xmm8 - movups %xmm5,48(%rsi) - pxor %xmm5,%xmm8 - -.Locb_dec_done: - pxor %xmm0,%xmm15 - movdqu %xmm8,(%rbp) - movdqu %xmm15,(%r9) - - xorps %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - pxor %xmm8,%xmm8 - pxor %xmm9,%xmm9 - pxor %xmm10,%xmm10 - pxor %xmm11,%xmm11 - pxor %xmm12,%xmm12 - pxor %xmm13,%xmm13 - pxor %xmm14,%xmm14 - pxor %xmm15,%xmm15 - leaq 40(%rsp),%rax - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -.Locb_dec_epilogue: - .byte 0xf3,0xc3 -.size aesni_ocb_decrypt,.-aesni_ocb_decrypt - -.type __ocb_decrypt6,@function -.align 32 -__ocb_decrypt6: - pxor %xmm9,%xmm15 - movdqu (%rbx,%r12,1),%xmm11 - movdqa %xmm10,%xmm12 - movdqu (%rbx,%r13,1),%xmm13 - movdqa %xmm10,%xmm14 - pxor %xmm15,%xmm10 - movdqu (%rbx,%r14,1),%xmm15 - pxor %xmm10,%xmm11 - pxor %xmm10,%xmm2 - pxor %xmm11,%xmm12 - pxor %xmm11,%xmm3 - pxor %xmm12,%xmm13 - pxor %xmm12,%xmm4 - pxor %xmm13,%xmm14 - pxor %xmm13,%xmm5 - pxor %xmm14,%xmm15 - pxor %xmm14,%xmm6 - pxor %xmm15,%xmm7 - movups 32(%r11),%xmm0 - - leaq 1(%r8),%r12 - leaq 3(%r8),%r13 - leaq 5(%r8),%r14 - addq $6,%r8 - pxor %xmm9,%xmm10 - bsfq %r12,%r12 - bsfq %r13,%r13 - bsfq %r14,%r14 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - pxor %xmm9,%xmm11 - pxor %xmm9,%xmm12 -.byte 102,15,56,222,241 - pxor %xmm9,%xmm13 - pxor %xmm9,%xmm14 -.byte 102,15,56,222,249 - movups 48(%r11),%xmm1 - pxor %xmm9,%xmm15 - -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 - movups 64(%r11),%xmm0 - shlq $4,%r12 - shlq $4,%r13 - jmp .Locb_dec_loop6 - -.align 32 -.Locb_dec_loop6: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 - movups -16(%rcx,%rax,1),%xmm0 - jnz .Locb_dec_loop6 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups 16(%r11),%xmm1 - shlq $4,%r14 - -.byte 102,65,15,56,223,210 - movdqu (%rbx),%xmm10 - movq %r10,%rax -.byte 102,65,15,56,223,219 -.byte 102,65,15,56,223,228 -.byte 102,65,15,56,223,237 -.byte 102,65,15,56,223,246 -.byte 102,65,15,56,223,255 - .byte 0xf3,0xc3 -.size __ocb_decrypt6,.-__ocb_decrypt6 - -.type __ocb_decrypt4,@function -.align 32 -__ocb_decrypt4: - pxor %xmm9,%xmm15 - movdqu (%rbx,%r12,1),%xmm11 - movdqa %xmm10,%xmm12 - movdqu (%rbx,%r13,1),%xmm13 - pxor %xmm15,%xmm10 - pxor %xmm10,%xmm11 - pxor %xmm10,%xmm2 - pxor %xmm11,%xmm12 - pxor %xmm11,%xmm3 - pxor %xmm12,%xmm13 - pxor %xmm12,%xmm4 - pxor %xmm13,%xmm5 - movups 32(%r11),%xmm0 - - 
pxor %xmm9,%xmm10 - pxor %xmm9,%xmm11 - pxor %xmm9,%xmm12 - pxor %xmm9,%xmm13 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - movups 48(%r11),%xmm1 - -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 - movups 64(%r11),%xmm0 - jmp .Locb_dec_loop4 - -.align 32 -.Locb_dec_loop4: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 - movups -16(%rcx,%rax,1),%xmm0 - jnz .Locb_dec_loop4 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - movups 16(%r11),%xmm1 - movq %r10,%rax - -.byte 102,65,15,56,223,210 -.byte 102,65,15,56,223,219 -.byte 102,65,15,56,223,228 -.byte 102,65,15,56,223,237 - .byte 0xf3,0xc3 -.size __ocb_decrypt4,.-__ocb_decrypt4 - -.type __ocb_decrypt1,@function -.align 32 -__ocb_decrypt1: - pxor %xmm15,%xmm7 - pxor %xmm9,%xmm7 - pxor %xmm7,%xmm2 - movups 32(%r11),%xmm0 - -.byte 102,15,56,222,209 - movups 48(%r11),%xmm1 - pxor %xmm9,%xmm7 - -.byte 102,15,56,222,208 - movups 64(%r11),%xmm0 - jmp .Locb_dec_loop1 - -.align 32 -.Locb_dec_loop1: -.byte 102,15,56,222,209 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,222,208 - movups -16(%rcx,%rax,1),%xmm0 - jnz .Locb_dec_loop1 - -.byte 102,15,56,222,209 - movups 16(%r11),%xmm1 - movq %r10,%rax - -.byte 102,15,56,223,215 - .byte 0xf3,0xc3 -.size __ocb_decrypt1,.-__ocb_decrypt1 -.globl aesni_cbc_encrypt -.hidden aesni_cbc_encrypt -.type aesni_cbc_encrypt,@function -.align 16 -aesni_cbc_encrypt: +aes_hw_cbc_encrypt: +.cfi_startproc testq %rdx,%rdx jz .Lcbc_ret @@ -3384,12 +1492,12 @@ aesni_cbc_encrypt: xorps %xmm0,%xmm3 leaq 32(%rcx),%rcx xorps %xmm3,%xmm2 -.Loop_enc1_15: +.Loop_enc1_6: .byte 102,15,56,220,209 decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_enc1_15 + jnz .Loop_enc1_6 .byte 102,15,56,221,209 movl %r10d,%eax movq %r11,%rcx @@ -3435,12 +1543,12 @@ aesni_cbc_encrypt: movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 -.Loop_dec1_16: +.Loop_dec1_7: .byte 102,15,56,222,209 decl %r10d movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_dec1_16 + jnz .Loop_dec1_7 .byte 102,15,56,223,209 pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 @@ -3453,7 +1561,9 @@ aesni_cbc_encrypt: .align 16 .Lcbc_decrypt_bulk: leaq (%rsp),%r11 +.cfi_def_cfa_register %r11 pushq %rbp +.cfi_offset %rbp,-16 subq $16,%rsp andq $-16,%rsp movq %rcx,%rbp @@ -3851,12 +1961,12 @@ aesni_cbc_encrypt: movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 -.Loop_dec1_17: +.Loop_dec1_8: .byte 102,15,56,222,209 decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_dec1_17 + jnz .Loop_dec1_8 .byte 102,15,56,223,209 xorps %xmm10,%xmm2 movaps %xmm11,%xmm10 @@ -3938,16 +2048,21 @@ aesni_cbc_encrypt: xorps %xmm0,%xmm0 pxor %xmm1,%xmm1 movq -8(%r11),%rbp +.cfi_restore %rbp leaq (%r11),%rsp +.cfi_def_cfa_register %rsp .Lcbc_ret: .byte 0xf3,0xc3 -.size aesni_cbc_encrypt,.-aesni_cbc_encrypt -.globl aesni_set_decrypt_key -.hidden aesni_set_decrypt_key -.type aesni_set_decrypt_key,@function +.cfi_endproc +.size aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt +.globl aes_hw_set_decrypt_key +.hidden aes_hw_set_decrypt_key +.type aes_hw_set_decrypt_key,@function .align 16 -aesni_set_decrypt_key: +aes_hw_set_decrypt_key: +.cfi_startproc .byte 0x48,0x83,0xEC,0x08 +.cfi_adjust_cfa_offset 8 call 
__aesni_set_encrypt_key shll $4,%esi testl %eax,%eax @@ -3980,16 +2095,23 @@ aesni_set_decrypt_key: pxor %xmm0,%xmm0 .Ldec_key_ret: addq $8,%rsp +.cfi_adjust_cfa_offset -8 .byte 0xf3,0xc3 +.cfi_endproc .LSEH_end_set_decrypt_key: -.size aesni_set_decrypt_key,.-aesni_set_decrypt_key -.globl aesni_set_encrypt_key -.hidden aesni_set_encrypt_key -.type aesni_set_encrypt_key,@function +.size aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key +.globl aes_hw_set_encrypt_key +.hidden aes_hw_set_encrypt_key +.type aes_hw_set_encrypt_key,@function .align 16 -aesni_set_encrypt_key: +aes_hw_set_encrypt_key: __aesni_set_encrypt_key: +.cfi_startproc +#ifdef BORINGSSL_DISPATCH_TEST + movb $1,BORINGSSL_function_hit+3(%rip) +#endif .byte 0x48,0x83,0xEC,0x08 +.cfi_adjust_cfa_offset 8 movq $-1,%rax testq %rdi,%rdi jz .Lenc_key_ret @@ -4283,7 +2405,9 @@ __aesni_set_encrypt_key: pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 addq $8,%rsp +.cfi_adjust_cfa_offset -8 .byte 0xf3,0xc3 +.cfi_endproc .LSEH_end_set_encrypt_key: .align 16 @@ -4354,7 +2478,7 @@ __aesni_set_encrypt_key: shufps $170,%xmm1,%xmm1 xorps %xmm1,%xmm2 .byte 0xf3,0xc3 -.size aesni_set_encrypt_key,.-aesni_set_encrypt_key +.size aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key .size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key .align 64 .Lbswap_mask: @@ -4379,3 +2503,4 @@ __aesni_set_encrypt_key: .byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S deleted file mode 100644 index 04b161c995..0000000000 --- a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S +++ /dev/null @@ -1,2503 +0,0 @@ -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -.text - -.extern asm_AES_encrypt -.hidden asm_AES_encrypt -.extern asm_AES_decrypt -.hidden asm_AES_decrypt - -.type _bsaes_encrypt8,@function -.align 64 -_bsaes_encrypt8: - leaq .LBS0(%rip),%r11 - - movdqa (%rax),%xmm8 - leaq 16(%rax),%rax - movdqa 80(%r11),%xmm7 - pxor %xmm8,%xmm15 - pxor %xmm8,%xmm0 - pxor %xmm8,%xmm1 - pxor %xmm8,%xmm2 -.byte 102,68,15,56,0,255 -.byte 102,15,56,0,199 - pxor %xmm8,%xmm3 - pxor %xmm8,%xmm4 -.byte 102,15,56,0,207 -.byte 102,15,56,0,215 - pxor %xmm8,%xmm5 - pxor %xmm8,%xmm6 -.byte 102,15,56,0,223 -.byte 102,15,56,0,231 -.byte 102,15,56,0,239 -.byte 102,15,56,0,247 -_bsaes_encrypt8_bitslice: - movdqa 0(%r11),%xmm7 - movdqa 16(%r11),%xmm8 - movdqa %xmm5,%xmm9 - psrlq $1,%xmm5 - movdqa %xmm3,%xmm10 - psrlq $1,%xmm3 - pxor %xmm6,%xmm5 - pxor %xmm4,%xmm3 - pand %xmm7,%xmm5 - pand %xmm7,%xmm3 - pxor %xmm5,%xmm6 - psllq $1,%xmm5 - pxor %xmm3,%xmm4 - psllq $1,%xmm3 - pxor %xmm9,%xmm5 - pxor %xmm10,%xmm3 - movdqa %xmm1,%xmm9 - psrlq $1,%xmm1 - movdqa %xmm15,%xmm10 - psrlq $1,%xmm15 - pxor %xmm2,%xmm1 - pxor %xmm0,%xmm15 - pand %xmm7,%xmm1 - pand %xmm7,%xmm15 - pxor %xmm1,%xmm2 - psllq $1,%xmm1 - pxor %xmm15,%xmm0 - psllq $1,%xmm15 - pxor %xmm9,%xmm1 - pxor %xmm10,%xmm15 - movdqa 32(%r11),%xmm7 - movdqa %xmm4,%xmm9 - psrlq $2,%xmm4 - movdqa %xmm3,%xmm10 - psrlq $2,%xmm3 - pxor %xmm6,%xmm4 - pxor %xmm5,%xmm3 - pand %xmm8,%xmm4 - pand %xmm8,%xmm3 - pxor %xmm4,%xmm6 - psllq $2,%xmm4 - pxor %xmm3,%xmm5 - psllq $2,%xmm3 - pxor %xmm9,%xmm4 - pxor %xmm10,%xmm3 - movdqa %xmm0,%xmm9 - psrlq $2,%xmm0 - movdqa %xmm15,%xmm10 
- psrlq $2,%xmm15 - pxor %xmm2,%xmm0 - pxor %xmm1,%xmm15 - pand %xmm8,%xmm0 - pand %xmm8,%xmm15 - pxor %xmm0,%xmm2 - psllq $2,%xmm0 - pxor %xmm15,%xmm1 - psllq $2,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa %xmm2,%xmm9 - psrlq $4,%xmm2 - movdqa %xmm1,%xmm10 - psrlq $4,%xmm1 - pxor %xmm6,%xmm2 - pxor %xmm5,%xmm1 - pand %xmm7,%xmm2 - pand %xmm7,%xmm1 - pxor %xmm2,%xmm6 - psllq $4,%xmm2 - pxor %xmm1,%xmm5 - psllq $4,%xmm1 - pxor %xmm9,%xmm2 - pxor %xmm10,%xmm1 - movdqa %xmm0,%xmm9 - psrlq $4,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $4,%xmm15 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pand %xmm7,%xmm0 - pand %xmm7,%xmm15 - pxor %xmm0,%xmm4 - psllq $4,%xmm0 - pxor %xmm15,%xmm3 - psllq $4,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - decl %r10d - jmp .Lenc_sbox -.align 16 -.Lenc_loop: - pxor 0(%rax),%xmm15 - pxor 16(%rax),%xmm0 - pxor 32(%rax),%xmm1 - pxor 48(%rax),%xmm2 -.byte 102,68,15,56,0,255 -.byte 102,15,56,0,199 - pxor 64(%rax),%xmm3 - pxor 80(%rax),%xmm4 -.byte 102,15,56,0,207 -.byte 102,15,56,0,215 - pxor 96(%rax),%xmm5 - pxor 112(%rax),%xmm6 -.byte 102,15,56,0,223 -.byte 102,15,56,0,231 -.byte 102,15,56,0,239 -.byte 102,15,56,0,247 - leaq 128(%rax),%rax -.Lenc_sbox: - pxor %xmm5,%xmm4 - pxor %xmm0,%xmm1 - pxor %xmm15,%xmm2 - pxor %xmm1,%xmm5 - pxor %xmm15,%xmm4 - - pxor %xmm2,%xmm5 - pxor %xmm6,%xmm2 - pxor %xmm4,%xmm6 - pxor %xmm3,%xmm2 - pxor %xmm4,%xmm3 - pxor %xmm0,%xmm2 - - pxor %xmm6,%xmm1 - pxor %xmm4,%xmm0 - movdqa %xmm6,%xmm10 - movdqa %xmm0,%xmm9 - movdqa %xmm4,%xmm8 - movdqa %xmm1,%xmm12 - movdqa %xmm5,%xmm11 - - pxor %xmm3,%xmm10 - pxor %xmm1,%xmm9 - pxor %xmm2,%xmm8 - movdqa %xmm10,%xmm13 - pxor %xmm3,%xmm12 - movdqa %xmm9,%xmm7 - pxor %xmm15,%xmm11 - movdqa %xmm10,%xmm14 - - por %xmm8,%xmm9 - por %xmm11,%xmm10 - pxor %xmm7,%xmm14 - pand %xmm11,%xmm13 - pxor %xmm8,%xmm11 - pand %xmm8,%xmm7 - pand %xmm11,%xmm14 - movdqa %xmm2,%xmm11 - pxor %xmm15,%xmm11 - pand %xmm11,%xmm12 - pxor %xmm12,%xmm10 - pxor %xmm12,%xmm9 - movdqa %xmm6,%xmm12 - movdqa %xmm4,%xmm11 - pxor %xmm0,%xmm12 - pxor %xmm5,%xmm11 - movdqa %xmm12,%xmm8 - pand %xmm11,%xmm12 - por %xmm11,%xmm8 - pxor %xmm12,%xmm7 - pxor %xmm14,%xmm10 - pxor %xmm13,%xmm9 - pxor %xmm14,%xmm8 - movdqa %xmm1,%xmm11 - pxor %xmm13,%xmm7 - movdqa %xmm3,%xmm12 - pxor %xmm13,%xmm8 - movdqa %xmm0,%xmm13 - pand %xmm2,%xmm11 - movdqa %xmm6,%xmm14 - pand %xmm15,%xmm12 - pand %xmm4,%xmm13 - por %xmm5,%xmm14 - pxor %xmm11,%xmm10 - pxor %xmm12,%xmm9 - pxor %xmm13,%xmm8 - pxor %xmm14,%xmm7 - - - - - - movdqa %xmm10,%xmm11 - pand %xmm8,%xmm10 - pxor %xmm9,%xmm11 - - movdqa %xmm7,%xmm13 - movdqa %xmm11,%xmm14 - pxor %xmm10,%xmm13 - pand %xmm13,%xmm14 - - movdqa %xmm8,%xmm12 - pxor %xmm9,%xmm14 - pxor %xmm7,%xmm12 - - pxor %xmm9,%xmm10 - - pand %xmm10,%xmm12 - - movdqa %xmm13,%xmm9 - pxor %xmm7,%xmm12 - - pxor %xmm12,%xmm9 - pxor %xmm12,%xmm8 - - pand %xmm7,%xmm9 - - pxor %xmm9,%xmm13 - pxor %xmm9,%xmm8 - - pand %xmm14,%xmm13 - - pxor %xmm11,%xmm13 - movdqa %xmm5,%xmm11 - movdqa %xmm4,%xmm7 - movdqa %xmm14,%xmm9 - pxor %xmm13,%xmm9 - pand %xmm5,%xmm9 - pxor %xmm4,%xmm5 - pand %xmm14,%xmm4 - pand %xmm13,%xmm5 - pxor %xmm4,%xmm5 - pxor %xmm9,%xmm4 - pxor %xmm15,%xmm11 - pxor %xmm2,%xmm7 - pxor %xmm12,%xmm14 - pxor %xmm8,%xmm13 - movdqa %xmm14,%xmm10 - movdqa %xmm12,%xmm9 - pxor %xmm13,%xmm10 - pxor %xmm8,%xmm9 - pand %xmm11,%xmm10 - pand %xmm15,%xmm9 - pxor %xmm7,%xmm11 - pxor %xmm2,%xmm15 - pand %xmm14,%xmm7 - pand %xmm12,%xmm2 - pand %xmm13,%xmm11 - pand %xmm8,%xmm15 - pxor %xmm11,%xmm7 - pxor %xmm2,%xmm15 - pxor %xmm10,%xmm11 - pxor 
%xmm9,%xmm2 - pxor %xmm11,%xmm5 - pxor %xmm11,%xmm15 - pxor %xmm7,%xmm4 - pxor %xmm7,%xmm2 - - movdqa %xmm6,%xmm11 - movdqa %xmm0,%xmm7 - pxor %xmm3,%xmm11 - pxor %xmm1,%xmm7 - movdqa %xmm14,%xmm10 - movdqa %xmm12,%xmm9 - pxor %xmm13,%xmm10 - pxor %xmm8,%xmm9 - pand %xmm11,%xmm10 - pand %xmm3,%xmm9 - pxor %xmm7,%xmm11 - pxor %xmm1,%xmm3 - pand %xmm14,%xmm7 - pand %xmm12,%xmm1 - pand %xmm13,%xmm11 - pand %xmm8,%xmm3 - pxor %xmm11,%xmm7 - pxor %xmm1,%xmm3 - pxor %xmm10,%xmm11 - pxor %xmm9,%xmm1 - pxor %xmm12,%xmm14 - pxor %xmm8,%xmm13 - movdqa %xmm14,%xmm10 - pxor %xmm13,%xmm10 - pand %xmm6,%xmm10 - pxor %xmm0,%xmm6 - pand %xmm14,%xmm0 - pand %xmm13,%xmm6 - pxor %xmm0,%xmm6 - pxor %xmm10,%xmm0 - pxor %xmm11,%xmm6 - pxor %xmm11,%xmm3 - pxor %xmm7,%xmm0 - pxor %xmm7,%xmm1 - pxor %xmm15,%xmm6 - pxor %xmm5,%xmm0 - pxor %xmm6,%xmm3 - pxor %xmm15,%xmm5 - pxor %xmm0,%xmm15 - - pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 - pxor %xmm2,%xmm1 - pxor %xmm4,%xmm2 - pxor %xmm4,%xmm3 - - pxor %xmm2,%xmm5 - decl %r10d - jl .Lenc_done - pshufd $0x93,%xmm15,%xmm7 - pshufd $0x93,%xmm0,%xmm8 - pxor %xmm7,%xmm15 - pshufd $0x93,%xmm3,%xmm9 - pxor %xmm8,%xmm0 - pshufd $0x93,%xmm5,%xmm10 - pxor %xmm9,%xmm3 - pshufd $0x93,%xmm2,%xmm11 - pxor %xmm10,%xmm5 - pshufd $0x93,%xmm6,%xmm12 - pxor %xmm11,%xmm2 - pshufd $0x93,%xmm1,%xmm13 - pxor %xmm12,%xmm6 - pshufd $0x93,%xmm4,%xmm14 - pxor %xmm13,%xmm1 - pxor %xmm14,%xmm4 - - pxor %xmm15,%xmm8 - pxor %xmm4,%xmm7 - pxor %xmm4,%xmm8 - pshufd $0x4E,%xmm15,%xmm15 - pxor %xmm0,%xmm9 - pshufd $0x4E,%xmm0,%xmm0 - pxor %xmm2,%xmm12 - pxor %xmm7,%xmm15 - pxor %xmm6,%xmm13 - pxor %xmm8,%xmm0 - pxor %xmm5,%xmm11 - pshufd $0x4E,%xmm2,%xmm7 - pxor %xmm1,%xmm14 - pshufd $0x4E,%xmm6,%xmm8 - pxor %xmm3,%xmm10 - pshufd $0x4E,%xmm5,%xmm2 - pxor %xmm4,%xmm10 - pshufd $0x4E,%xmm4,%xmm6 - pxor %xmm4,%xmm11 - pshufd $0x4E,%xmm1,%xmm5 - pxor %xmm11,%xmm7 - pshufd $0x4E,%xmm3,%xmm1 - pxor %xmm12,%xmm8 - pxor %xmm10,%xmm2 - pxor %xmm14,%xmm6 - pxor %xmm13,%xmm5 - movdqa %xmm7,%xmm3 - pxor %xmm9,%xmm1 - movdqa %xmm8,%xmm4 - movdqa 48(%r11),%xmm7 - jnz .Lenc_loop - movdqa 64(%r11),%xmm7 - jmp .Lenc_loop -.align 16 -.Lenc_done: - movdqa 0(%r11),%xmm7 - movdqa 16(%r11),%xmm8 - movdqa %xmm1,%xmm9 - psrlq $1,%xmm1 - movdqa %xmm2,%xmm10 - psrlq $1,%xmm2 - pxor %xmm4,%xmm1 - pxor %xmm6,%xmm2 - pand %xmm7,%xmm1 - pand %xmm7,%xmm2 - pxor %xmm1,%xmm4 - psllq $1,%xmm1 - pxor %xmm2,%xmm6 - psllq $1,%xmm2 - pxor %xmm9,%xmm1 - pxor %xmm10,%xmm2 - movdqa %xmm3,%xmm9 - psrlq $1,%xmm3 - movdqa %xmm15,%xmm10 - psrlq $1,%xmm15 - pxor %xmm5,%xmm3 - pxor %xmm0,%xmm15 - pand %xmm7,%xmm3 - pand %xmm7,%xmm15 - pxor %xmm3,%xmm5 - psllq $1,%xmm3 - pxor %xmm15,%xmm0 - psllq $1,%xmm15 - pxor %xmm9,%xmm3 - pxor %xmm10,%xmm15 - movdqa 32(%r11),%xmm7 - movdqa %xmm6,%xmm9 - psrlq $2,%xmm6 - movdqa %xmm2,%xmm10 - psrlq $2,%xmm2 - pxor %xmm4,%xmm6 - pxor %xmm1,%xmm2 - pand %xmm8,%xmm6 - pand %xmm8,%xmm2 - pxor %xmm6,%xmm4 - psllq $2,%xmm6 - pxor %xmm2,%xmm1 - psllq $2,%xmm2 - pxor %xmm9,%xmm6 - pxor %xmm10,%xmm2 - movdqa %xmm0,%xmm9 - psrlq $2,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $2,%xmm15 - pxor %xmm5,%xmm0 - pxor %xmm3,%xmm15 - pand %xmm8,%xmm0 - pand %xmm8,%xmm15 - pxor %xmm0,%xmm5 - psllq $2,%xmm0 - pxor %xmm15,%xmm3 - psllq $2,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa %xmm5,%xmm9 - psrlq $4,%xmm5 - movdqa %xmm3,%xmm10 - psrlq $4,%xmm3 - pxor %xmm4,%xmm5 - pxor %xmm1,%xmm3 - pand %xmm7,%xmm5 - pand %xmm7,%xmm3 - pxor %xmm5,%xmm4 - psllq $4,%xmm5 - pxor %xmm3,%xmm1 - psllq $4,%xmm3 - pxor %xmm9,%xmm5 - pxor %xmm10,%xmm3 - 
movdqa %xmm0,%xmm9 - psrlq $4,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $4,%xmm15 - pxor %xmm6,%xmm0 - pxor %xmm2,%xmm15 - pand %xmm7,%xmm0 - pand %xmm7,%xmm15 - pxor %xmm0,%xmm6 - psllq $4,%xmm0 - pxor %xmm15,%xmm2 - psllq $4,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa (%rax),%xmm7 - pxor %xmm7,%xmm3 - pxor %xmm7,%xmm5 - pxor %xmm7,%xmm2 - pxor %xmm7,%xmm6 - pxor %xmm7,%xmm1 - pxor %xmm7,%xmm4 - pxor %xmm7,%xmm15 - pxor %xmm7,%xmm0 - .byte 0xf3,0xc3 -.size _bsaes_encrypt8,.-_bsaes_encrypt8 - -.type _bsaes_decrypt8,@function -.align 64 -_bsaes_decrypt8: - leaq .LBS0(%rip),%r11 - - movdqa (%rax),%xmm8 - leaq 16(%rax),%rax - movdqa -48(%r11),%xmm7 - pxor %xmm8,%xmm15 - pxor %xmm8,%xmm0 - pxor %xmm8,%xmm1 - pxor %xmm8,%xmm2 -.byte 102,68,15,56,0,255 -.byte 102,15,56,0,199 - pxor %xmm8,%xmm3 - pxor %xmm8,%xmm4 -.byte 102,15,56,0,207 -.byte 102,15,56,0,215 - pxor %xmm8,%xmm5 - pxor %xmm8,%xmm6 -.byte 102,15,56,0,223 -.byte 102,15,56,0,231 -.byte 102,15,56,0,239 -.byte 102,15,56,0,247 - movdqa 0(%r11),%xmm7 - movdqa 16(%r11),%xmm8 - movdqa %xmm5,%xmm9 - psrlq $1,%xmm5 - movdqa %xmm3,%xmm10 - psrlq $1,%xmm3 - pxor %xmm6,%xmm5 - pxor %xmm4,%xmm3 - pand %xmm7,%xmm5 - pand %xmm7,%xmm3 - pxor %xmm5,%xmm6 - psllq $1,%xmm5 - pxor %xmm3,%xmm4 - psllq $1,%xmm3 - pxor %xmm9,%xmm5 - pxor %xmm10,%xmm3 - movdqa %xmm1,%xmm9 - psrlq $1,%xmm1 - movdqa %xmm15,%xmm10 - psrlq $1,%xmm15 - pxor %xmm2,%xmm1 - pxor %xmm0,%xmm15 - pand %xmm7,%xmm1 - pand %xmm7,%xmm15 - pxor %xmm1,%xmm2 - psllq $1,%xmm1 - pxor %xmm15,%xmm0 - psllq $1,%xmm15 - pxor %xmm9,%xmm1 - pxor %xmm10,%xmm15 - movdqa 32(%r11),%xmm7 - movdqa %xmm4,%xmm9 - psrlq $2,%xmm4 - movdqa %xmm3,%xmm10 - psrlq $2,%xmm3 - pxor %xmm6,%xmm4 - pxor %xmm5,%xmm3 - pand %xmm8,%xmm4 - pand %xmm8,%xmm3 - pxor %xmm4,%xmm6 - psllq $2,%xmm4 - pxor %xmm3,%xmm5 - psllq $2,%xmm3 - pxor %xmm9,%xmm4 - pxor %xmm10,%xmm3 - movdqa %xmm0,%xmm9 - psrlq $2,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $2,%xmm15 - pxor %xmm2,%xmm0 - pxor %xmm1,%xmm15 - pand %xmm8,%xmm0 - pand %xmm8,%xmm15 - pxor %xmm0,%xmm2 - psllq $2,%xmm0 - pxor %xmm15,%xmm1 - psllq $2,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa %xmm2,%xmm9 - psrlq $4,%xmm2 - movdqa %xmm1,%xmm10 - psrlq $4,%xmm1 - pxor %xmm6,%xmm2 - pxor %xmm5,%xmm1 - pand %xmm7,%xmm2 - pand %xmm7,%xmm1 - pxor %xmm2,%xmm6 - psllq $4,%xmm2 - pxor %xmm1,%xmm5 - psllq $4,%xmm1 - pxor %xmm9,%xmm2 - pxor %xmm10,%xmm1 - movdqa %xmm0,%xmm9 - psrlq $4,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $4,%xmm15 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pand %xmm7,%xmm0 - pand %xmm7,%xmm15 - pxor %xmm0,%xmm4 - psllq $4,%xmm0 - pxor %xmm15,%xmm3 - psllq $4,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - decl %r10d - jmp .Ldec_sbox -.align 16 -.Ldec_loop: - pxor 0(%rax),%xmm15 - pxor 16(%rax),%xmm0 - pxor 32(%rax),%xmm1 - pxor 48(%rax),%xmm2 -.byte 102,68,15,56,0,255 -.byte 102,15,56,0,199 - pxor 64(%rax),%xmm3 - pxor 80(%rax),%xmm4 -.byte 102,15,56,0,207 -.byte 102,15,56,0,215 - pxor 96(%rax),%xmm5 - pxor 112(%rax),%xmm6 -.byte 102,15,56,0,223 -.byte 102,15,56,0,231 -.byte 102,15,56,0,239 -.byte 102,15,56,0,247 - leaq 128(%rax),%rax -.Ldec_sbox: - pxor %xmm3,%xmm2 - - pxor %xmm6,%xmm3 - pxor %xmm6,%xmm1 - pxor %xmm3,%xmm5 - pxor %xmm5,%xmm6 - pxor %xmm6,%xmm0 - - pxor %xmm0,%xmm15 - pxor %xmm4,%xmm1 - pxor %xmm15,%xmm2 - pxor %xmm15,%xmm4 - pxor %xmm2,%xmm0 - movdqa %xmm2,%xmm10 - movdqa %xmm6,%xmm9 - movdqa %xmm0,%xmm8 - movdqa %xmm3,%xmm12 - movdqa %xmm4,%xmm11 - - pxor %xmm15,%xmm10 - pxor %xmm3,%xmm9 - pxor %xmm5,%xmm8 - movdqa %xmm10,%xmm13 - pxor 
%xmm15,%xmm12 - movdqa %xmm9,%xmm7 - pxor %xmm1,%xmm11 - movdqa %xmm10,%xmm14 - - por %xmm8,%xmm9 - por %xmm11,%xmm10 - pxor %xmm7,%xmm14 - pand %xmm11,%xmm13 - pxor %xmm8,%xmm11 - pand %xmm8,%xmm7 - pand %xmm11,%xmm14 - movdqa %xmm5,%xmm11 - pxor %xmm1,%xmm11 - pand %xmm11,%xmm12 - pxor %xmm12,%xmm10 - pxor %xmm12,%xmm9 - movdqa %xmm2,%xmm12 - movdqa %xmm0,%xmm11 - pxor %xmm6,%xmm12 - pxor %xmm4,%xmm11 - movdqa %xmm12,%xmm8 - pand %xmm11,%xmm12 - por %xmm11,%xmm8 - pxor %xmm12,%xmm7 - pxor %xmm14,%xmm10 - pxor %xmm13,%xmm9 - pxor %xmm14,%xmm8 - movdqa %xmm3,%xmm11 - pxor %xmm13,%xmm7 - movdqa %xmm15,%xmm12 - pxor %xmm13,%xmm8 - movdqa %xmm6,%xmm13 - pand %xmm5,%xmm11 - movdqa %xmm2,%xmm14 - pand %xmm1,%xmm12 - pand %xmm0,%xmm13 - por %xmm4,%xmm14 - pxor %xmm11,%xmm10 - pxor %xmm12,%xmm9 - pxor %xmm13,%xmm8 - pxor %xmm14,%xmm7 - - - - - - movdqa %xmm10,%xmm11 - pand %xmm8,%xmm10 - pxor %xmm9,%xmm11 - - movdqa %xmm7,%xmm13 - movdqa %xmm11,%xmm14 - pxor %xmm10,%xmm13 - pand %xmm13,%xmm14 - - movdqa %xmm8,%xmm12 - pxor %xmm9,%xmm14 - pxor %xmm7,%xmm12 - - pxor %xmm9,%xmm10 - - pand %xmm10,%xmm12 - - movdqa %xmm13,%xmm9 - pxor %xmm7,%xmm12 - - pxor %xmm12,%xmm9 - pxor %xmm12,%xmm8 - - pand %xmm7,%xmm9 - - pxor %xmm9,%xmm13 - pxor %xmm9,%xmm8 - - pand %xmm14,%xmm13 - - pxor %xmm11,%xmm13 - movdqa %xmm4,%xmm11 - movdqa %xmm0,%xmm7 - movdqa %xmm14,%xmm9 - pxor %xmm13,%xmm9 - pand %xmm4,%xmm9 - pxor %xmm0,%xmm4 - pand %xmm14,%xmm0 - pand %xmm13,%xmm4 - pxor %xmm0,%xmm4 - pxor %xmm9,%xmm0 - pxor %xmm1,%xmm11 - pxor %xmm5,%xmm7 - pxor %xmm12,%xmm14 - pxor %xmm8,%xmm13 - movdqa %xmm14,%xmm10 - movdqa %xmm12,%xmm9 - pxor %xmm13,%xmm10 - pxor %xmm8,%xmm9 - pand %xmm11,%xmm10 - pand %xmm1,%xmm9 - pxor %xmm7,%xmm11 - pxor %xmm5,%xmm1 - pand %xmm14,%xmm7 - pand %xmm12,%xmm5 - pand %xmm13,%xmm11 - pand %xmm8,%xmm1 - pxor %xmm11,%xmm7 - pxor %xmm5,%xmm1 - pxor %xmm10,%xmm11 - pxor %xmm9,%xmm5 - pxor %xmm11,%xmm4 - pxor %xmm11,%xmm1 - pxor %xmm7,%xmm0 - pxor %xmm7,%xmm5 - - movdqa %xmm2,%xmm11 - movdqa %xmm6,%xmm7 - pxor %xmm15,%xmm11 - pxor %xmm3,%xmm7 - movdqa %xmm14,%xmm10 - movdqa %xmm12,%xmm9 - pxor %xmm13,%xmm10 - pxor %xmm8,%xmm9 - pand %xmm11,%xmm10 - pand %xmm15,%xmm9 - pxor %xmm7,%xmm11 - pxor %xmm3,%xmm15 - pand %xmm14,%xmm7 - pand %xmm12,%xmm3 - pand %xmm13,%xmm11 - pand %xmm8,%xmm15 - pxor %xmm11,%xmm7 - pxor %xmm3,%xmm15 - pxor %xmm10,%xmm11 - pxor %xmm9,%xmm3 - pxor %xmm12,%xmm14 - pxor %xmm8,%xmm13 - movdqa %xmm14,%xmm10 - pxor %xmm13,%xmm10 - pand %xmm2,%xmm10 - pxor %xmm6,%xmm2 - pand %xmm14,%xmm6 - pand %xmm13,%xmm2 - pxor %xmm6,%xmm2 - pxor %xmm10,%xmm6 - pxor %xmm11,%xmm2 - pxor %xmm11,%xmm15 - pxor %xmm7,%xmm6 - pxor %xmm7,%xmm3 - pxor %xmm6,%xmm0 - pxor %xmm4,%xmm5 - - pxor %xmm0,%xmm3 - pxor %xmm6,%xmm1 - pxor %xmm6,%xmm4 - pxor %xmm1,%xmm3 - pxor %xmm15,%xmm6 - pxor %xmm4,%xmm3 - pxor %xmm5,%xmm2 - pxor %xmm0,%xmm5 - pxor %xmm3,%xmm2 - - pxor %xmm15,%xmm3 - pxor %xmm2,%xmm6 - decl %r10d - jl .Ldec_done - - pshufd $0x4E,%xmm15,%xmm7 - pshufd $0x4E,%xmm2,%xmm13 - pxor %xmm15,%xmm7 - pshufd $0x4E,%xmm4,%xmm14 - pxor %xmm2,%xmm13 - pshufd $0x4E,%xmm0,%xmm8 - pxor %xmm4,%xmm14 - pshufd $0x4E,%xmm5,%xmm9 - pxor %xmm0,%xmm8 - pshufd $0x4E,%xmm3,%xmm10 - pxor %xmm5,%xmm9 - pxor %xmm13,%xmm15 - pxor %xmm13,%xmm0 - pshufd $0x4E,%xmm1,%xmm11 - pxor %xmm3,%xmm10 - pxor %xmm7,%xmm5 - pxor %xmm8,%xmm3 - pshufd $0x4E,%xmm6,%xmm12 - pxor %xmm1,%xmm11 - pxor %xmm14,%xmm0 - pxor %xmm9,%xmm1 - pxor %xmm6,%xmm12 - - pxor %xmm14,%xmm5 - pxor %xmm13,%xmm3 - pxor %xmm13,%xmm1 - pxor %xmm10,%xmm6 - pxor 
%xmm11,%xmm2 - pxor %xmm14,%xmm1 - pxor %xmm14,%xmm6 - pxor %xmm12,%xmm4 - pshufd $0x93,%xmm15,%xmm7 - pshufd $0x93,%xmm0,%xmm8 - pxor %xmm7,%xmm15 - pshufd $0x93,%xmm5,%xmm9 - pxor %xmm8,%xmm0 - pshufd $0x93,%xmm3,%xmm10 - pxor %xmm9,%xmm5 - pshufd $0x93,%xmm1,%xmm11 - pxor %xmm10,%xmm3 - pshufd $0x93,%xmm6,%xmm12 - pxor %xmm11,%xmm1 - pshufd $0x93,%xmm2,%xmm13 - pxor %xmm12,%xmm6 - pshufd $0x93,%xmm4,%xmm14 - pxor %xmm13,%xmm2 - pxor %xmm14,%xmm4 - - pxor %xmm15,%xmm8 - pxor %xmm4,%xmm7 - pxor %xmm4,%xmm8 - pshufd $0x4E,%xmm15,%xmm15 - pxor %xmm0,%xmm9 - pshufd $0x4E,%xmm0,%xmm0 - pxor %xmm1,%xmm12 - pxor %xmm7,%xmm15 - pxor %xmm6,%xmm13 - pxor %xmm8,%xmm0 - pxor %xmm3,%xmm11 - pshufd $0x4E,%xmm1,%xmm7 - pxor %xmm2,%xmm14 - pshufd $0x4E,%xmm6,%xmm8 - pxor %xmm5,%xmm10 - pshufd $0x4E,%xmm3,%xmm1 - pxor %xmm4,%xmm10 - pshufd $0x4E,%xmm4,%xmm6 - pxor %xmm4,%xmm11 - pshufd $0x4E,%xmm2,%xmm3 - pxor %xmm11,%xmm7 - pshufd $0x4E,%xmm5,%xmm2 - pxor %xmm12,%xmm8 - pxor %xmm1,%xmm10 - pxor %xmm14,%xmm6 - pxor %xmm3,%xmm13 - movdqa %xmm7,%xmm3 - pxor %xmm9,%xmm2 - movdqa %xmm13,%xmm5 - movdqa %xmm8,%xmm4 - movdqa %xmm2,%xmm1 - movdqa %xmm10,%xmm2 - movdqa -16(%r11),%xmm7 - jnz .Ldec_loop - movdqa -32(%r11),%xmm7 - jmp .Ldec_loop -.align 16 -.Ldec_done: - movdqa 0(%r11),%xmm7 - movdqa 16(%r11),%xmm8 - movdqa %xmm2,%xmm9 - psrlq $1,%xmm2 - movdqa %xmm1,%xmm10 - psrlq $1,%xmm1 - pxor %xmm4,%xmm2 - pxor %xmm6,%xmm1 - pand %xmm7,%xmm2 - pand %xmm7,%xmm1 - pxor %xmm2,%xmm4 - psllq $1,%xmm2 - pxor %xmm1,%xmm6 - psllq $1,%xmm1 - pxor %xmm9,%xmm2 - pxor %xmm10,%xmm1 - movdqa %xmm5,%xmm9 - psrlq $1,%xmm5 - movdqa %xmm15,%xmm10 - psrlq $1,%xmm15 - pxor %xmm3,%xmm5 - pxor %xmm0,%xmm15 - pand %xmm7,%xmm5 - pand %xmm7,%xmm15 - pxor %xmm5,%xmm3 - psllq $1,%xmm5 - pxor %xmm15,%xmm0 - psllq $1,%xmm15 - pxor %xmm9,%xmm5 - pxor %xmm10,%xmm15 - movdqa 32(%r11),%xmm7 - movdqa %xmm6,%xmm9 - psrlq $2,%xmm6 - movdqa %xmm1,%xmm10 - psrlq $2,%xmm1 - pxor %xmm4,%xmm6 - pxor %xmm2,%xmm1 - pand %xmm8,%xmm6 - pand %xmm8,%xmm1 - pxor %xmm6,%xmm4 - psllq $2,%xmm6 - pxor %xmm1,%xmm2 - psllq $2,%xmm1 - pxor %xmm9,%xmm6 - pxor %xmm10,%xmm1 - movdqa %xmm0,%xmm9 - psrlq $2,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $2,%xmm15 - pxor %xmm3,%xmm0 - pxor %xmm5,%xmm15 - pand %xmm8,%xmm0 - pand %xmm8,%xmm15 - pxor %xmm0,%xmm3 - psllq $2,%xmm0 - pxor %xmm15,%xmm5 - psllq $2,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa %xmm3,%xmm9 - psrlq $4,%xmm3 - movdqa %xmm5,%xmm10 - psrlq $4,%xmm5 - pxor %xmm4,%xmm3 - pxor %xmm2,%xmm5 - pand %xmm7,%xmm3 - pand %xmm7,%xmm5 - pxor %xmm3,%xmm4 - psllq $4,%xmm3 - pxor %xmm5,%xmm2 - psllq $4,%xmm5 - pxor %xmm9,%xmm3 - pxor %xmm10,%xmm5 - movdqa %xmm0,%xmm9 - psrlq $4,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $4,%xmm15 - pxor %xmm6,%xmm0 - pxor %xmm1,%xmm15 - pand %xmm7,%xmm0 - pand %xmm7,%xmm15 - pxor %xmm0,%xmm6 - psllq $4,%xmm0 - pxor %xmm15,%xmm1 - psllq $4,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa (%rax),%xmm7 - pxor %xmm7,%xmm5 - pxor %xmm7,%xmm3 - pxor %xmm7,%xmm1 - pxor %xmm7,%xmm6 - pxor %xmm7,%xmm2 - pxor %xmm7,%xmm4 - pxor %xmm7,%xmm15 - pxor %xmm7,%xmm0 - .byte 0xf3,0xc3 -.size _bsaes_decrypt8,.-_bsaes_decrypt8 -.type _bsaes_key_convert,@function -.align 16 -_bsaes_key_convert: - leaq .Lmasks(%rip),%r11 - movdqu (%rcx),%xmm7 - leaq 16(%rcx),%rcx - movdqa 0(%r11),%xmm0 - movdqa 16(%r11),%xmm1 - movdqa 32(%r11),%xmm2 - movdqa 48(%r11),%xmm3 - movdqa 64(%r11),%xmm4 - pcmpeqd %xmm5,%xmm5 - - movdqu (%rcx),%xmm6 - movdqa %xmm7,(%rax) - leaq 16(%rax),%rax - decl %r10d - jmp .Lkey_loop 
-.align 16 -.Lkey_loop: -.byte 102,15,56,0,244 - - movdqa %xmm0,%xmm8 - movdqa %xmm1,%xmm9 - - pand %xmm6,%xmm8 - pand %xmm6,%xmm9 - movdqa %xmm2,%xmm10 - pcmpeqb %xmm0,%xmm8 - psllq $4,%xmm0 - movdqa %xmm3,%xmm11 - pcmpeqb %xmm1,%xmm9 - psllq $4,%xmm1 - - pand %xmm6,%xmm10 - pand %xmm6,%xmm11 - movdqa %xmm0,%xmm12 - pcmpeqb %xmm2,%xmm10 - psllq $4,%xmm2 - movdqa %xmm1,%xmm13 - pcmpeqb %xmm3,%xmm11 - psllq $4,%xmm3 - - movdqa %xmm2,%xmm14 - movdqa %xmm3,%xmm15 - pxor %xmm5,%xmm8 - pxor %xmm5,%xmm9 - - pand %xmm6,%xmm12 - pand %xmm6,%xmm13 - movdqa %xmm8,0(%rax) - pcmpeqb %xmm0,%xmm12 - psrlq $4,%xmm0 - movdqa %xmm9,16(%rax) - pcmpeqb %xmm1,%xmm13 - psrlq $4,%xmm1 - leaq 16(%rcx),%rcx - - pand %xmm6,%xmm14 - pand %xmm6,%xmm15 - movdqa %xmm10,32(%rax) - pcmpeqb %xmm2,%xmm14 - psrlq $4,%xmm2 - movdqa %xmm11,48(%rax) - pcmpeqb %xmm3,%xmm15 - psrlq $4,%xmm3 - movdqu (%rcx),%xmm6 - - pxor %xmm5,%xmm13 - pxor %xmm5,%xmm14 - movdqa %xmm12,64(%rax) - movdqa %xmm13,80(%rax) - movdqa %xmm14,96(%rax) - movdqa %xmm15,112(%rax) - leaq 128(%rax),%rax - decl %r10d - jnz .Lkey_loop - - movdqa 80(%r11),%xmm7 - - .byte 0xf3,0xc3 -.size _bsaes_key_convert,.-_bsaes_key_convert -.extern asm_AES_cbc_encrypt -.hidden asm_AES_cbc_encrypt -.globl bsaes_cbc_encrypt -.hidden bsaes_cbc_encrypt -.type bsaes_cbc_encrypt,@function -.align 16 -bsaes_cbc_encrypt: - cmpl $0,%r9d - jne asm_AES_cbc_encrypt - cmpq $128,%rdx - jb asm_AES_cbc_encrypt - - movq %rsp,%rax -.Lcbc_dec_prologue: - pushq %rbp - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - leaq -72(%rsp),%rsp - movq %rsp,%rbp - movl 240(%rcx),%eax - movq %rdi,%r12 - movq %rsi,%r13 - movq %rdx,%r14 - movq %rcx,%r15 - movq %r8,%rbx - shrq $4,%r14 - - movl %eax,%edx - shlq $7,%rax - subq $96,%rax - subq %rax,%rsp - - movq %rsp,%rax - movq %r15,%rcx - movl %edx,%r10d - call _bsaes_key_convert - pxor (%rsp),%xmm7 - movdqa %xmm6,(%rax) - movdqa %xmm7,(%rsp) - - movdqu (%rbx),%xmm14 - subq $8,%r14 -.Lcbc_dec_loop: - movdqu 0(%r12),%xmm15 - movdqu 16(%r12),%xmm0 - movdqu 32(%r12),%xmm1 - movdqu 48(%r12),%xmm2 - movdqu 64(%r12),%xmm3 - movdqu 80(%r12),%xmm4 - movq %rsp,%rax - movdqu 96(%r12),%xmm5 - movl %edx,%r10d - movdqu 112(%r12),%xmm6 - movdqa %xmm14,32(%rbp) - - call _bsaes_decrypt8 - - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm5 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm3 - movdqu 64(%r12),%xmm11 - pxor %xmm10,%xmm1 - movdqu 80(%r12),%xmm12 - pxor %xmm11,%xmm6 - movdqu 96(%r12),%xmm13 - pxor %xmm12,%xmm2 - movdqu 112(%r12),%xmm14 - pxor %xmm13,%xmm4 - movdqu %xmm15,0(%r13) - leaq 128(%r12),%r12 - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - movdqu %xmm1,64(%r13) - movdqu %xmm6,80(%r13) - movdqu %xmm2,96(%r13) - movdqu %xmm4,112(%r13) - leaq 128(%r13),%r13 - subq $8,%r14 - jnc .Lcbc_dec_loop - - addq $8,%r14 - jz .Lcbc_dec_done - - movdqu 0(%r12),%xmm15 - movq %rsp,%rax - movl %edx,%r10d - cmpq $2,%r14 - jb .Lcbc_dec_one - movdqu 16(%r12),%xmm0 - je .Lcbc_dec_two - movdqu 32(%r12),%xmm1 - cmpq $4,%r14 - jb .Lcbc_dec_three - movdqu 48(%r12),%xmm2 - je .Lcbc_dec_four - movdqu 64(%r12),%xmm3 - cmpq $6,%r14 - jb .Lcbc_dec_five - movdqu 80(%r12),%xmm4 - je .Lcbc_dec_six - movdqu 96(%r12),%xmm5 - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm5 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm3 - movdqu 64(%r12),%xmm11 - pxor 
%xmm10,%xmm1 - movdqu 80(%r12),%xmm12 - pxor %xmm11,%xmm6 - movdqu 96(%r12),%xmm14 - pxor %xmm12,%xmm2 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - movdqu %xmm1,64(%r13) - movdqu %xmm6,80(%r13) - movdqu %xmm2,96(%r13) - jmp .Lcbc_dec_done -.align 16 -.Lcbc_dec_six: - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm5 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm3 - movdqu 64(%r12),%xmm11 - pxor %xmm10,%xmm1 - movdqu 80(%r12),%xmm14 - pxor %xmm11,%xmm6 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - movdqu %xmm1,64(%r13) - movdqu %xmm6,80(%r13) - jmp .Lcbc_dec_done -.align 16 -.Lcbc_dec_five: - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm5 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm3 - movdqu 64(%r12),%xmm14 - pxor %xmm10,%xmm1 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - movdqu %xmm1,64(%r13) - jmp .Lcbc_dec_done -.align 16 -.Lcbc_dec_four: - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm5 - movdqu 48(%r12),%xmm14 - pxor %xmm9,%xmm3 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - jmp .Lcbc_dec_done -.align 16 -.Lcbc_dec_three: - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm14 - pxor %xmm8,%xmm5 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - jmp .Lcbc_dec_done -.align 16 -.Lcbc_dec_two: - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm14 - pxor %xmm7,%xmm0 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - jmp .Lcbc_dec_done -.align 16 -.Lcbc_dec_one: - leaq (%r12),%rdi - leaq 32(%rbp),%rsi - leaq (%r15),%rdx - call asm_AES_decrypt - pxor 32(%rbp),%xmm14 - movdqu %xmm14,(%r13) - movdqa %xmm15,%xmm14 - -.Lcbc_dec_done: - movdqu %xmm14,(%rbx) - leaq (%rsp),%rax - pxor %xmm0,%xmm0 -.Lcbc_dec_bzero: - movdqa %xmm0,0(%rax) - movdqa %xmm0,16(%rax) - leaq 32(%rax),%rax - cmpq %rax,%rbp - ja .Lcbc_dec_bzero - - leaq 120(%rbp),%rax - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbx - movq -8(%rax),%rbp - leaq (%rax),%rsp -.Lcbc_dec_epilogue: - .byte 0xf3,0xc3 -.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt - -.globl bsaes_ctr32_encrypt_blocks -.hidden bsaes_ctr32_encrypt_blocks -.type bsaes_ctr32_encrypt_blocks,@function -.align 16 -bsaes_ctr32_encrypt_blocks: - movq %rsp,%rax -.Lctr_enc_prologue: - pushq %rbp - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - leaq -72(%rsp),%rsp - movq %rsp,%rbp - movdqu (%r8),%xmm0 - movl 240(%rcx),%eax - movq %rdi,%r12 - movq %rsi,%r13 - movq %rdx,%r14 - movq %rcx,%r15 - movdqa %xmm0,32(%rbp) - cmpq $8,%rdx - jb .Lctr_enc_short - - movl %eax,%ebx - shlq $7,%rax - subq $96,%rax - subq %rax,%rsp - - movq %rsp,%rax - movq %r15,%rcx - movl %ebx,%r10d - call _bsaes_key_convert - pxor %xmm6,%xmm7 - movdqa %xmm7,(%rax) - - movdqa (%rsp),%xmm8 - leaq .LADD1(%rip),%r11 - movdqa 32(%rbp),%xmm15 - movdqa -32(%r11),%xmm7 
-.byte 102,68,15,56,0,199 -.byte 102,68,15,56,0,255 - movdqa %xmm8,(%rsp) - jmp .Lctr_enc_loop -.align 16 -.Lctr_enc_loop: - movdqa %xmm15,32(%rbp) - movdqa %xmm15,%xmm0 - movdqa %xmm15,%xmm1 - paddd 0(%r11),%xmm0 - movdqa %xmm15,%xmm2 - paddd 16(%r11),%xmm1 - movdqa %xmm15,%xmm3 - paddd 32(%r11),%xmm2 - movdqa %xmm15,%xmm4 - paddd 48(%r11),%xmm3 - movdqa %xmm15,%xmm5 - paddd 64(%r11),%xmm4 - movdqa %xmm15,%xmm6 - paddd 80(%r11),%xmm5 - paddd 96(%r11),%xmm6 - - - - movdqa (%rsp),%xmm8 - leaq 16(%rsp),%rax - movdqa -16(%r11),%xmm7 - pxor %xmm8,%xmm15 - pxor %xmm8,%xmm0 - pxor %xmm8,%xmm1 - pxor %xmm8,%xmm2 -.byte 102,68,15,56,0,255 -.byte 102,15,56,0,199 - pxor %xmm8,%xmm3 - pxor %xmm8,%xmm4 -.byte 102,15,56,0,207 -.byte 102,15,56,0,215 - pxor %xmm8,%xmm5 - pxor %xmm8,%xmm6 -.byte 102,15,56,0,223 -.byte 102,15,56,0,231 -.byte 102,15,56,0,239 -.byte 102,15,56,0,247 - leaq .LBS0(%rip),%r11 - movl %ebx,%r10d - - call _bsaes_encrypt8_bitslice - - subq $8,%r14 - jc .Lctr_enc_loop_done - - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - movdqu 32(%r12),%xmm9 - movdqu 48(%r12),%xmm10 - movdqu 64(%r12),%xmm11 - movdqu 80(%r12),%xmm12 - movdqu 96(%r12),%xmm13 - movdqu 112(%r12),%xmm14 - leaq 128(%r12),%r12 - pxor %xmm15,%xmm7 - movdqa 32(%rbp),%xmm15 - pxor %xmm8,%xmm0 - movdqu %xmm7,0(%r13) - pxor %xmm9,%xmm3 - movdqu %xmm0,16(%r13) - pxor %xmm10,%xmm5 - movdqu %xmm3,32(%r13) - pxor %xmm11,%xmm2 - movdqu %xmm5,48(%r13) - pxor %xmm12,%xmm6 - movdqu %xmm2,64(%r13) - pxor %xmm13,%xmm1 - movdqu %xmm6,80(%r13) - pxor %xmm14,%xmm4 - movdqu %xmm1,96(%r13) - leaq .LADD1(%rip),%r11 - movdqu %xmm4,112(%r13) - leaq 128(%r13),%r13 - paddd 112(%r11),%xmm15 - jnz .Lctr_enc_loop - - jmp .Lctr_enc_done -.align 16 -.Lctr_enc_loop_done: - addq $8,%r14 - movdqu 0(%r12),%xmm7 - pxor %xmm7,%xmm15 - movdqu %xmm15,0(%r13) - cmpq $2,%r14 - jb .Lctr_enc_done - movdqu 16(%r12),%xmm8 - pxor %xmm8,%xmm0 - movdqu %xmm0,16(%r13) - je .Lctr_enc_done - movdqu 32(%r12),%xmm9 - pxor %xmm9,%xmm3 - movdqu %xmm3,32(%r13) - cmpq $4,%r14 - jb .Lctr_enc_done - movdqu 48(%r12),%xmm10 - pxor %xmm10,%xmm5 - movdqu %xmm5,48(%r13) - je .Lctr_enc_done - movdqu 64(%r12),%xmm11 - pxor %xmm11,%xmm2 - movdqu %xmm2,64(%r13) - cmpq $6,%r14 - jb .Lctr_enc_done - movdqu 80(%r12),%xmm12 - pxor %xmm12,%xmm6 - movdqu %xmm6,80(%r13) - je .Lctr_enc_done - movdqu 96(%r12),%xmm13 - pxor %xmm13,%xmm1 - movdqu %xmm1,96(%r13) - jmp .Lctr_enc_done - -.align 16 -.Lctr_enc_short: - leaq 32(%rbp),%rdi - leaq 48(%rbp),%rsi - leaq (%r15),%rdx - call asm_AES_encrypt - movdqu (%r12),%xmm0 - leaq 16(%r12),%r12 - movl 44(%rbp),%eax - bswapl %eax - pxor 48(%rbp),%xmm0 - incl %eax - movdqu %xmm0,(%r13) - bswapl %eax - leaq 16(%r13),%r13 - movl %eax,44(%rsp) - decq %r14 - jnz .Lctr_enc_short - -.Lctr_enc_done: - leaq (%rsp),%rax - pxor %xmm0,%xmm0 -.Lctr_enc_bzero: - movdqa %xmm0,0(%rax) - movdqa %xmm0,16(%rax) - leaq 32(%rax),%rax - cmpq %rax,%rbp - ja .Lctr_enc_bzero - - leaq 120(%rbp),%rax - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbx - movq -8(%rax),%rbp - leaq (%rax),%rsp -.Lctr_enc_epilogue: - .byte 0xf3,0xc3 -.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks -.globl bsaes_xts_encrypt -.hidden bsaes_xts_encrypt -.type bsaes_xts_encrypt,@function -.align 16 -bsaes_xts_encrypt: - movq %rsp,%rax -.Lxts_enc_prologue: - pushq %rbp - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - leaq -72(%rsp),%rsp - movq %rsp,%rbp - movq %rdi,%r12 - movq %rsi,%r13 - movq %rdx,%r14 - movq %rcx,%r15 
- - leaq (%r9),%rdi - leaq 32(%rbp),%rsi - leaq (%r8),%rdx - call asm_AES_encrypt - - movl 240(%r15),%eax - movq %r14,%rbx - - movl %eax,%edx - shlq $7,%rax - subq $96,%rax - subq %rax,%rsp - - movq %rsp,%rax - movq %r15,%rcx - movl %edx,%r10d - call _bsaes_key_convert - pxor %xmm6,%xmm7 - movdqa %xmm7,(%rax) - - andq $-16,%r14 - subq $0x80,%rsp - movdqa 32(%rbp),%xmm6 - - pxor %xmm14,%xmm14 - movdqa .Lxts_magic(%rip),%xmm12 - pcmpgtd %xmm6,%xmm14 - - subq $0x80,%r14 - jc .Lxts_enc_short - jmp .Lxts_enc_loop - -.align 16 -.Lxts_enc_loop: - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm15 - movdqa %xmm6,0(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm0 - movdqa %xmm6,16(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 0(%r12),%xmm7 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm1 - movdqa %xmm6,32(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm15 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm2 - movdqa %xmm6,48(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm0 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm3 - movdqa %xmm6,64(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm4 - movdqa %xmm6,80(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 64(%r12),%xmm11 - pxor %xmm10,%xmm2 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm5 - movdqa %xmm6,96(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 80(%r12),%xmm12 - pxor %xmm11,%xmm3 - movdqu 96(%r12),%xmm13 - pxor %xmm12,%xmm4 - movdqu 112(%r12),%xmm14 - leaq 128(%r12),%r12 - movdqa %xmm6,112(%rsp) - pxor %xmm13,%xmm5 - leaq 128(%rsp),%rax - pxor %xmm14,%xmm6 - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm5 - movdqu %xmm3,32(%r13) - pxor 64(%rsp),%xmm2 - movdqu %xmm5,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm2,64(%r13) - pxor 96(%rsp),%xmm1 - movdqu %xmm6,80(%r13) - pxor 112(%rsp),%xmm4 - movdqu %xmm1,96(%r13) - movdqu %xmm4,112(%r13) - leaq 128(%r13),%r13 - - movdqa 112(%rsp),%xmm6 - pxor %xmm14,%xmm14 - movdqa .Lxts_magic(%rip),%xmm12 - pcmpgtd %xmm6,%xmm14 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - - subq $0x80,%r14 - jnc .Lxts_enc_loop - -.Lxts_enc_short: - addq $0x80,%r14 - jz .Lxts_enc_done - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm15 - movdqa %xmm6,0(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm0 - movdqa %xmm6,16(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 0(%r12),%xmm7 - cmpq $16,%r14 - je .Lxts_enc_1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm1 - movdqa %xmm6,32(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - 
pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 16(%r12),%xmm8 - cmpq $32,%r14 - je .Lxts_enc_2 - pxor %xmm7,%xmm15 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm2 - movdqa %xmm6,48(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 32(%r12),%xmm9 - cmpq $48,%r14 - je .Lxts_enc_3 - pxor %xmm8,%xmm0 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm3 - movdqa %xmm6,64(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 48(%r12),%xmm10 - cmpq $64,%r14 - je .Lxts_enc_4 - pxor %xmm9,%xmm1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm4 - movdqa %xmm6,80(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 64(%r12),%xmm11 - cmpq $80,%r14 - je .Lxts_enc_5 - pxor %xmm10,%xmm2 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm5 - movdqa %xmm6,96(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 80(%r12),%xmm12 - cmpq $96,%r14 - je .Lxts_enc_6 - pxor %xmm11,%xmm3 - movdqu 96(%r12),%xmm13 - pxor %xmm12,%xmm4 - movdqa %xmm6,112(%rsp) - leaq 112(%r12),%r12 - pxor %xmm13,%xmm5 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm5 - movdqu %xmm3,32(%r13) - pxor 64(%rsp),%xmm2 - movdqu %xmm5,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm2,64(%r13) - pxor 96(%rsp),%xmm1 - movdqu %xmm6,80(%r13) - movdqu %xmm1,96(%r13) - leaq 112(%r13),%r13 - - movdqa 112(%rsp),%xmm6 - jmp .Lxts_enc_done -.align 16 -.Lxts_enc_6: - pxor %xmm11,%xmm3 - leaq 96(%r12),%r12 - pxor %xmm12,%xmm4 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm5 - movdqu %xmm3,32(%r13) - pxor 64(%rsp),%xmm2 - movdqu %xmm5,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm2,64(%r13) - movdqu %xmm6,80(%r13) - leaq 96(%r13),%r13 - - movdqa 96(%rsp),%xmm6 - jmp .Lxts_enc_done -.align 16 -.Lxts_enc_5: - pxor %xmm10,%xmm2 - leaq 80(%r12),%r12 - pxor %xmm11,%xmm3 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm5 - movdqu %xmm3,32(%r13) - pxor 64(%rsp),%xmm2 - movdqu %xmm5,48(%r13) - movdqu %xmm2,64(%r13) - leaq 80(%r13),%r13 - - movdqa 80(%rsp),%xmm6 - jmp .Lxts_enc_done -.align 16 -.Lxts_enc_4: - pxor %xmm9,%xmm1 - leaq 64(%r12),%r12 - pxor %xmm10,%xmm2 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm5 - movdqu %xmm3,32(%r13) - movdqu %xmm5,48(%r13) - leaq 64(%r13),%r13 - - movdqa 64(%rsp),%xmm6 - jmp .Lxts_enc_done -.align 16 -.Lxts_enc_3: - pxor %xmm8,%xmm0 - leaq 48(%r12),%r12 - pxor %xmm9,%xmm1 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - movdqu %xmm3,32(%r13) - leaq 48(%r13),%r13 - - movdqa 48(%rsp),%xmm6 - jmp .Lxts_enc_done -.align 16 -.Lxts_enc_2: - pxor %xmm7,%xmm15 - leaq 32(%r12),%r12 - pxor %xmm8,%xmm0 - leaq 128(%rsp),%rax - movl %edx,%r10d - - 
call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - leaq 32(%r13),%r13 - - movdqa 32(%rsp),%xmm6 - jmp .Lxts_enc_done -.align 16 -.Lxts_enc_1: - pxor %xmm15,%xmm7 - leaq 16(%r12),%r12 - movdqa %xmm7,32(%rbp) - leaq 32(%rbp),%rdi - leaq 32(%rbp),%rsi - leaq (%r15),%rdx - call asm_AES_encrypt - pxor 32(%rbp),%xmm15 - - - - - - movdqu %xmm15,0(%r13) - leaq 16(%r13),%r13 - - movdqa 16(%rsp),%xmm6 - -.Lxts_enc_done: - andl $15,%ebx - jz .Lxts_enc_ret - movq %r13,%rdx - -.Lxts_enc_steal: - movzbl (%r12),%eax - movzbl -16(%rdx),%ecx - leaq 1(%r12),%r12 - movb %al,-16(%rdx) - movb %cl,0(%rdx) - leaq 1(%rdx),%rdx - subl $1,%ebx - jnz .Lxts_enc_steal - - movdqu -16(%r13),%xmm15 - leaq 32(%rbp),%rdi - pxor %xmm6,%xmm15 - leaq 32(%rbp),%rsi - movdqa %xmm15,32(%rbp) - leaq (%r15),%rdx - call asm_AES_encrypt - pxor 32(%rbp),%xmm6 - movdqu %xmm6,-16(%r13) - -.Lxts_enc_ret: - leaq (%rsp),%rax - pxor %xmm0,%xmm0 -.Lxts_enc_bzero: - movdqa %xmm0,0(%rax) - movdqa %xmm0,16(%rax) - leaq 32(%rax),%rax - cmpq %rax,%rbp - ja .Lxts_enc_bzero - - leaq 120(%rbp),%rax - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbx - movq -8(%rax),%rbp - leaq (%rax),%rsp -.Lxts_enc_epilogue: - .byte 0xf3,0xc3 -.size bsaes_xts_encrypt,.-bsaes_xts_encrypt - -.globl bsaes_xts_decrypt -.hidden bsaes_xts_decrypt -.type bsaes_xts_decrypt,@function -.align 16 -bsaes_xts_decrypt: - movq %rsp,%rax -.Lxts_dec_prologue: - pushq %rbp - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - leaq -72(%rsp),%rsp - movq %rsp,%rbp - movq %rdi,%r12 - movq %rsi,%r13 - movq %rdx,%r14 - movq %rcx,%r15 - - leaq (%r9),%rdi - leaq 32(%rbp),%rsi - leaq (%r8),%rdx - call asm_AES_encrypt - - movl 240(%r15),%eax - movq %r14,%rbx - - movl %eax,%edx - shlq $7,%rax - subq $96,%rax - subq %rax,%rsp - - movq %rsp,%rax - movq %r15,%rcx - movl %edx,%r10d - call _bsaes_key_convert - pxor (%rsp),%xmm7 - movdqa %xmm6,(%rax) - movdqa %xmm7,(%rsp) - - xorl %eax,%eax - andq $-16,%r14 - testl $15,%ebx - setnz %al - shlq $4,%rax - subq %rax,%r14 - - subq $0x80,%rsp - movdqa 32(%rbp),%xmm6 - - pxor %xmm14,%xmm14 - movdqa .Lxts_magic(%rip),%xmm12 - pcmpgtd %xmm6,%xmm14 - - subq $0x80,%r14 - jc .Lxts_dec_short - jmp .Lxts_dec_loop - -.align 16 -.Lxts_dec_loop: - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm15 - movdqa %xmm6,0(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm0 - movdqa %xmm6,16(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 0(%r12),%xmm7 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm1 - movdqa %xmm6,32(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm15 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm2 - movdqa %xmm6,48(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm0 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm3 - movdqa %xmm6,64(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm4 - movdqa %xmm6,80(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 
- pxor %xmm13,%xmm6 - movdqu 64(%r12),%xmm11 - pxor %xmm10,%xmm2 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm5 - movdqa %xmm6,96(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 80(%r12),%xmm12 - pxor %xmm11,%xmm3 - movdqu 96(%r12),%xmm13 - pxor %xmm12,%xmm4 - movdqu 112(%r12),%xmm14 - leaq 128(%r12),%r12 - movdqa %xmm6,112(%rsp) - pxor %xmm13,%xmm5 - leaq 128(%rsp),%rax - pxor %xmm14,%xmm6 - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm3 - movdqu %xmm5,32(%r13) - pxor 64(%rsp),%xmm1 - movdqu %xmm3,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm1,64(%r13) - pxor 96(%rsp),%xmm2 - movdqu %xmm6,80(%r13) - pxor 112(%rsp),%xmm4 - movdqu %xmm2,96(%r13) - movdqu %xmm4,112(%r13) - leaq 128(%r13),%r13 - - movdqa 112(%rsp),%xmm6 - pxor %xmm14,%xmm14 - movdqa .Lxts_magic(%rip),%xmm12 - pcmpgtd %xmm6,%xmm14 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - - subq $0x80,%r14 - jnc .Lxts_dec_loop - -.Lxts_dec_short: - addq $0x80,%r14 - jz .Lxts_dec_done - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm15 - movdqa %xmm6,0(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm0 - movdqa %xmm6,16(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 0(%r12),%xmm7 - cmpq $16,%r14 - je .Lxts_dec_1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm1 - movdqa %xmm6,32(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 16(%r12),%xmm8 - cmpq $32,%r14 - je .Lxts_dec_2 - pxor %xmm7,%xmm15 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm2 - movdqa %xmm6,48(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 32(%r12),%xmm9 - cmpq $48,%r14 - je .Lxts_dec_3 - pxor %xmm8,%xmm0 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm3 - movdqa %xmm6,64(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 48(%r12),%xmm10 - cmpq $64,%r14 - je .Lxts_dec_4 - pxor %xmm9,%xmm1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm4 - movdqa %xmm6,80(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 64(%r12),%xmm11 - cmpq $80,%r14 - je .Lxts_dec_5 - pxor %xmm10,%xmm2 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm5 - movdqa %xmm6,96(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 80(%r12),%xmm12 - cmpq $96,%r14 - je .Lxts_dec_6 - pxor %xmm11,%xmm3 - movdqu 96(%r12),%xmm13 - pxor %xmm12,%xmm4 - movdqa %xmm6,112(%rsp) - leaq 112(%r12),%r12 - pxor %xmm13,%xmm5 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm3 - movdqu %xmm5,32(%r13) - pxor 64(%rsp),%xmm1 - movdqu %xmm3,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm1,64(%r13) - pxor 96(%rsp),%xmm2 - movdqu %xmm6,80(%r13) - movdqu %xmm2,96(%r13) - leaq 112(%r13),%r13 - - movdqa 112(%rsp),%xmm6 - jmp .Lxts_dec_done -.align 16 -.Lxts_dec_6: - pxor %xmm11,%xmm3 
- leaq 96(%r12),%r12 - pxor %xmm12,%xmm4 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm3 - movdqu %xmm5,32(%r13) - pxor 64(%rsp),%xmm1 - movdqu %xmm3,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm1,64(%r13) - movdqu %xmm6,80(%r13) - leaq 96(%r13),%r13 - - movdqa 96(%rsp),%xmm6 - jmp .Lxts_dec_done -.align 16 -.Lxts_dec_5: - pxor %xmm10,%xmm2 - leaq 80(%r12),%r12 - pxor %xmm11,%xmm3 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm3 - movdqu %xmm5,32(%r13) - pxor 64(%rsp),%xmm1 - movdqu %xmm3,48(%r13) - movdqu %xmm1,64(%r13) - leaq 80(%r13),%r13 - - movdqa 80(%rsp),%xmm6 - jmp .Lxts_dec_done -.align 16 -.Lxts_dec_4: - pxor %xmm9,%xmm1 - leaq 64(%r12),%r12 - pxor %xmm10,%xmm2 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm3 - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - leaq 64(%r13),%r13 - - movdqa 64(%rsp),%xmm6 - jmp .Lxts_dec_done -.align 16 -.Lxts_dec_3: - pxor %xmm8,%xmm0 - leaq 48(%r12),%r12 - pxor %xmm9,%xmm1 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - leaq 48(%r13),%r13 - - movdqa 48(%rsp),%xmm6 - jmp .Lxts_dec_done -.align 16 -.Lxts_dec_2: - pxor %xmm7,%xmm15 - leaq 32(%r12),%r12 - pxor %xmm8,%xmm0 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - leaq 32(%r13),%r13 - - movdqa 32(%rsp),%xmm6 - jmp .Lxts_dec_done -.align 16 -.Lxts_dec_1: - pxor %xmm15,%xmm7 - leaq 16(%r12),%r12 - movdqa %xmm7,32(%rbp) - leaq 32(%rbp),%rdi - leaq 32(%rbp),%rsi - leaq (%r15),%rdx - call asm_AES_decrypt - pxor 32(%rbp),%xmm15 - - - - - - movdqu %xmm15,0(%r13) - leaq 16(%r13),%r13 - - movdqa 16(%rsp),%xmm6 - -.Lxts_dec_done: - andl $15,%ebx - jz .Lxts_dec_ret - - pxor %xmm14,%xmm14 - movdqa .Lxts_magic(%rip),%xmm12 - pcmpgtd %xmm6,%xmm14 - pshufd $0x13,%xmm14,%xmm13 - movdqa %xmm6,%xmm5 - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - movdqu (%r12),%xmm15 - pxor %xmm13,%xmm6 - - leaq 32(%rbp),%rdi - pxor %xmm6,%xmm15 - leaq 32(%rbp),%rsi - movdqa %xmm15,32(%rbp) - leaq (%r15),%rdx - call asm_AES_decrypt - pxor 32(%rbp),%xmm6 - movq %r13,%rdx - movdqu %xmm6,(%r13) - -.Lxts_dec_steal: - movzbl 16(%r12),%eax - movzbl (%rdx),%ecx - leaq 1(%r12),%r12 - movb %al,(%rdx) - movb %cl,16(%rdx) - leaq 1(%rdx),%rdx - subl $1,%ebx - jnz .Lxts_dec_steal - - movdqu (%r13),%xmm15 - leaq 32(%rbp),%rdi - pxor %xmm5,%xmm15 - leaq 32(%rbp),%rsi - movdqa %xmm15,32(%rbp) - leaq (%r15),%rdx - call asm_AES_decrypt - pxor 32(%rbp),%xmm5 - movdqu %xmm5,(%r13) - -.Lxts_dec_ret: - leaq (%rsp),%rax - pxor %xmm0,%xmm0 -.Lxts_dec_bzero: - movdqa %xmm0,0(%rax) - movdqa %xmm0,16(%rax) - leaq 32(%rax),%rax - cmpq %rax,%rbp - ja .Lxts_dec_bzero - - leaq 120(%rbp),%rax - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbx - movq -8(%rax),%rbp - leaq (%rax),%rsp -.Lxts_dec_epilogue: - .byte 0xf3,0xc3 -.size bsaes_xts_decrypt,.-bsaes_xts_decrypt -.type 
_bsaes_const,@object -.align 64 -_bsaes_const: -.LM0ISR: -.quad 0x0a0e0206070b0f03, 0x0004080c0d010509 -.LISRM0: -.quad 0x01040b0e0205080f, 0x0306090c00070a0d -.LISR: -.quad 0x0504070602010003, 0x0f0e0d0c080b0a09 -.LBS0: -.quad 0x5555555555555555, 0x5555555555555555 -.LBS1: -.quad 0x3333333333333333, 0x3333333333333333 -.LBS2: -.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f -.LSR: -.quad 0x0504070600030201, 0x0f0e0d0c0a09080b -.LSRM0: -.quad 0x0304090e00050a0f, 0x01060b0c0207080d -.LM0SR: -.quad 0x0a0e02060f03070b, 0x0004080c05090d01 -.LSWPUP: -.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 -.LSWPUPM0SR: -.quad 0x0a0d02060c03070b, 0x0004080f05090e01 -.LADD1: -.quad 0x0000000000000000, 0x0000000100000000 -.LADD2: -.quad 0x0000000000000000, 0x0000000200000000 -.LADD3: -.quad 0x0000000000000000, 0x0000000300000000 -.LADD4: -.quad 0x0000000000000000, 0x0000000400000000 -.LADD5: -.quad 0x0000000000000000, 0x0000000500000000 -.LADD6: -.quad 0x0000000000000000, 0x0000000600000000 -.LADD7: -.quad 0x0000000000000000, 0x0000000700000000 -.LADD8: -.quad 0x0000000000000000, 0x0000000800000000 -.Lxts_magic: -.long 0x87,0,1,0 -.Lmasks: -.quad 0x0101010101010101, 0x0101010101010101 -.quad 0x0202020202020202, 0x0202020202020202 -.quad 0x0404040404040404, 0x0404040404040404 -.quad 0x0808080808080808, 0x0808080808080808 -.LM0: -.quad 0x02060a0e03070b0f, 0x0004080c0105090d -.L63: -.quad 0x6363636363636363, 0x6363636363636363 -.byte 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,69,109,105,108,105,97,32,75,195,164,115,112,101,114,44,32,80,101,116,101,114,32,83,99,104,119,97,98,101,44,32,65,110,100,121,32,80,111,108,121,97,107,111,118,0 -.align 64 -.size _bsaes_const,.-_bsaes_const -#endif diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S new file mode 100644 index 0000000000..a44790b169 --- /dev/null +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S @@ -0,0 +1,427 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text + + + + + +.type gcm_gmult_ssse3, @function +.globl gcm_gmult_ssse3 +.hidden gcm_gmult_ssse3 +.align 16 +gcm_gmult_ssse3: +.cfi_startproc +.Lgmult_seh_begin: + movdqu (%rdi),%xmm0 + movdqa .Lreverse_bytes(%rip),%xmm10 + movdqa .Llow4_mask(%rip),%xmm2 + + +.byte 102,65,15,56,0,194 + + + movdqa %xmm2,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm0 + + + + + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + movq $5,%rax +.Loop_row_1: + movdqa (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz .Loop_row_1 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movq $5,%rax +.Loop_row_2: + movdqa (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz .Loop_row_2 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movq $6,%rax +.Loop_row_3: + movdqa (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz .Loop_row_3 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + +.byte 102,65,15,56,0,210 + movdqu %xmm2,(%rdi) + + + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + .byte 0xf3,0xc3 +.Lgmult_seh_end: +.cfi_endproc +.size gcm_gmult_ssse3,.-gcm_gmult_ssse3 + + + + + +.type gcm_ghash_ssse3, @function +.globl gcm_ghash_ssse3 +.hidden gcm_ghash_ssse3 +.align 16 +gcm_ghash_ssse3: +.Lghash_seh_begin: +.cfi_startproc + movdqu (%rdi),%xmm0 + movdqa .Lreverse_bytes(%rip),%xmm10 + movdqa .Llow4_mask(%rip),%xmm11 + + + andq $-16,%rcx + + + +.byte 102,65,15,56,0,194 + + + pxor %xmm3,%xmm3 +.Loop_ghash: + + movdqu (%rdx),%xmm1 +.byte 102,65,15,56,0,202 + pxor %xmm1,%xmm0 + + + movdqa %xmm11,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm11,%xmm0 + + + + + pxor %xmm2,%xmm2 + + movq $5,%rax +.Loop_row_4: + movdqa (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 
+.byte 102,15,56,0,233 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz .Loop_row_4 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movq $5,%rax +.Loop_row_5: + movdqa (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz .Loop_row_5 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movq $6,%rax +.Loop_row_6: + movdqa (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz .Loop_row_6 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movdqa %xmm2,%xmm0 + + + leaq -256(%rsi),%rsi + + + leaq 16(%rdx),%rdx + subq $16,%rcx + jnz .Loop_ghash + + +.byte 102,65,15,56,0,194 + movdqu %xmm0,(%rdi) + + + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + .byte 0xf3,0xc3 +.Lghash_seh_end: +.cfi_endproc +.size gcm_ghash_ssse3,.-gcm_ghash_ssse3 + +.align 16 + + +.Lreverse_bytes: +.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +.Llow4_mask: +.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f +#endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/ghash-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/ghash-x86_64.S index 64ef2c2db2..674e2dabed 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/ghash-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/ghash-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .extern OPENSSL_ia32cap_P .hidden OPENSSL_ia32cap_P @@ -8,13 +20,27 @@ .type gcm_gmult_4bit,@function .align 16 gcm_gmult_4bit: +.cfi_startproc pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 subq $280,%rsp +.cfi_adjust_cfa_offset 280 .Lgmult_prologue: movzbq 15(%rdi),%r8 @@ -92,23 +118,41 @@ gcm_gmult_4bit: movq %r9,(%rdi) leaq 280+48(%rsp),%rsi +.cfi_def_cfa %rsi,8 movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lgmult_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size gcm_gmult_4bit,.-gcm_gmult_4bit .globl gcm_ghash_4bit .hidden gcm_ghash_4bit .type gcm_ghash_4bit,@function .align 16 gcm_ghash_4bit: +.cfi_startproc pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 subq $280,%rsp +.cfi_adjust_cfa_offset 280 .Lghash_prologue: movq %rdx,%r14 movq %rcx,%r15 @@ -654,21 +698,31 @@ gcm_ghash_4bit: movq %r9,(%rdi) leaq 280+48(%rsp),%rsi +.cfi_def_cfa %rsi,8 movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq 0(%rsi),%rsp +.cfi_def_cfa_register %rsp .Lghash_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size gcm_ghash_4bit,.-gcm_ghash_4bit .globl gcm_init_clmul .hidden gcm_init_clmul .type gcm_init_clmul,@function .align 16 gcm_init_clmul: +.cfi_startproc .L_init_clmul: movdqu (%rsi),%xmm2 pshufd $78,%xmm2,%xmm2 @@ -820,12 +874,14 @@ gcm_init_clmul: .byte 102,15,58,15,227,8 movdqu %xmm4,80(%rdi) .byte 0xf3,0xc3 +.cfi_endproc .size gcm_init_clmul,.-gcm_init_clmul .globl gcm_gmult_clmul .hidden gcm_gmult_clmul .type gcm_gmult_clmul,@function .align 16 gcm_gmult_clmul: +.cfi_startproc .L_gmult_clmul: movdqu (%rdi),%xmm0 movdqa .Lbswap_mask(%rip),%xmm5 @@ -872,12 +928,14 @@ gcm_gmult_clmul: .byte 102,15,56,0,197 movdqu %xmm0,(%rdi) .byte 0xf3,0xc3 +.cfi_endproc .size gcm_gmult_clmul,.-gcm_gmult_clmul .globl gcm_ghash_clmul .hidden gcm_ghash_clmul .type gcm_ghash_clmul,@function .align 32 gcm_ghash_clmul: +.cfi_startproc .L_ghash_clmul: movdqa .Lbswap_mask(%rip),%xmm10 @@ -1257,12 +1315,14 @@ gcm_ghash_clmul: .byte 102,65,15,56,0,194 movdqu %xmm0,(%rdi) .byte 0xf3,0xc3 +.cfi_endproc .size gcm_ghash_clmul,.-gcm_ghash_clmul .globl gcm_init_avx .hidden gcm_init_avx .type gcm_init_avx,@function .align 32 gcm_init_avx: +.cfi_startproc vzeroupper vmovdqu (%rsi),%xmm2 @@ -1365,19 +1425,23 @@ gcm_init_avx: vzeroupper .byte 0xf3,0xc3 +.cfi_endproc .size gcm_init_avx,.-gcm_init_avx .globl gcm_gmult_avx .hidden gcm_gmult_avx .type gcm_gmult_avx,@function .align 32 gcm_gmult_avx: +.cfi_startproc jmp .L_gmult_clmul +.cfi_endproc .size gcm_gmult_avx,.-gcm_gmult_avx 
.globl gcm_ghash_avx .hidden gcm_ghash_avx .type gcm_ghash_avx,@function .align 32 gcm_ghash_avx: +.cfi_startproc vzeroupper vmovdqu (%rdi),%xmm10 @@ -1749,6 +1813,7 @@ gcm_ghash_avx: vmovdqu %xmm10,(%rdi) vzeroupper .byte 0xf3,0xc3 +.cfi_endproc .size gcm_ghash_avx,.-gcm_ghash_avx .align 64 .Lbswap_mask: @@ -1804,3 +1869,4 @@ gcm_ghash_avx: .byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/md5-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/md5-x86_64.S index 8af65047aa..04aaf057e6 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/md5-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/md5-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .align 16 @@ -6,11 +18,22 @@ .hidden md5_block_asm_data_order .type md5_block_asm_data_order,@function md5_block_asm_data_order: +.cfi_startproc pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset rbp,-16 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset rbx,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset r12,-32 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset r14,-40 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset r15,-48 .Lprologue: @@ -660,12 +683,20 @@ md5_block_asm_data_order: movl %edx,12(%rbp) movq (%rsp),%r15 +.cfi_restore r15 movq 8(%rsp),%r14 +.cfi_restore r14 movq 16(%rsp),%r12 +.cfi_restore r12 movq 24(%rsp),%rbx +.cfi_restore rbx movq 32(%rsp),%rbp +.cfi_restore rbp addq $40,%rsp +.cfi_adjust_cfa_offset -40 .Lepilogue: .byte 0xf3,0xc3 +.cfi_endproc .size md5_block_asm_data_order,.-md5_block_asm_data_order #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S index 6d21888f04..85f4899012 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .extern OPENSSL_ia32cap_P .hidden OPENSSL_ia32cap_P @@ -18,14 +30,26 @@ .quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe +.Lord: +.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000 +.LordK: +.quad 0xccd1c8aaee00bc4f + + .globl ecp_nistz256_neg .hidden ecp_nistz256_neg .type ecp_nistz256_neg,@function .align 32 ecp_nistz256_neg: +.cfi_startproc pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-24 +.Lneg_body: xorq %r8,%r8 xorq %r9,%r9 @@ -59,9 +83,15 @@ ecp_nistz256_neg: movq %r10,16(%rdi) movq %r11,24(%rdi) - popq %r13 - popq %r12 + movq 0(%rsp),%r13 +.cfi_restore %r13 + movq 8(%rsp),%r12 +.cfi_restore %r12 + leaq 16(%rsp),%rsp +.cfi_adjust_cfa_offset -16 +.Lneg_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ecp_nistz256_neg,.-ecp_nistz256_neg @@ -69,18 +99,1130 @@ ecp_nistz256_neg: +.globl ecp_nistz256_ord_mul_mont +.hidden ecp_nistz256_ord_mul_mont +.type ecp_nistz256_ord_mul_mont,@function +.align 32 +ecp_nistz256_ord_mul_mont: +.cfi_startproc + leaq OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx + cmpl $0x80100,%ecx + je .Lecp_nistz256_ord_mul_montx + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lord_mul_body: + + movq 0(%rdx),%rax + movq %rdx,%rbx + leaq .Lord(%rip),%r14 + movq .LordK(%rip),%r15 + + + movq %rax,%rcx + mulq 0(%rsi) + movq %rax,%r8 + movq %rcx,%rax + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r9 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + + movq %r8,%r13 + imulq %r15,%r8 + + movq %rdx,%r11 + mulq 24(%rsi) + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r12 + + + mulq 0(%r14) + movq %r8,%rbp + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rcx + + subq %r8,%r10 + sbbq $0,%r8 + + mulq 8(%r14) + addq %rcx,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %rbp,%rax + adcq %rdx,%r10 + movq %rbp,%rdx + adcq $0,%r8 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r11 + movq 8(%rbx),%rax + sbbq %rdx,%rbp + + addq %r8,%r11 + adcq %rbp,%r12 + adcq $0,%r13 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r9 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + + movq %r9,%rcx + imulq %r15,%r9 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + xorq %r8,%r8 + addq %rax,%r12 + movq %r9,%rax + adcq %rdx,%r13 + adcq $0,%r8 + + + mulq 0(%r14) + movq %r9,%rbp + addq %rax,%rcx + movq %r9,%rax + adcq %rdx,%rcx + + subq %r9,%r11 + sbbq $0,%r9 + + mulq 8(%r14) + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rbp,%rax + adcq %rdx,%r11 + movq %rbp,%rdx + adcq $0,%r9 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r12 + movq 16(%rbx),%rax + sbbq %rdx,%rbp + + addq 
%r9,%r12 + adcq %rbp,%r13 + adcq $0,%r8 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rcx,%rax + adcq $0,%rdx + + movq %r10,%rcx + imulq %r15,%r10 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r13 + adcq $0,%rdx + xorq %r9,%r9 + addq %rax,%r13 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + + mulq 0(%r14) + movq %r10,%rbp + addq %rax,%rcx + movq %r10,%rax + adcq %rdx,%rcx + + subq %r10,%r12 + sbbq $0,%r10 + + mulq 8(%r14) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq %rdx,%r12 + movq %rbp,%rdx + adcq $0,%r10 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r13 + movq 24(%rbx),%rax + sbbq %rdx,%rbp + + addq %r10,%r13 + adcq %rbp,%r8 + adcq $0,%r9 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %rcx,%rax + adcq $0,%rdx + + movq %r11,%rcx + imulq %r15,%r11 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r8 + adcq $0,%rdx + xorq %r10,%r10 + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + + mulq 0(%r14) + movq %r11,%rbp + addq %rax,%rcx + movq %r11,%rax + adcq %rdx,%rcx + + subq %r11,%r13 + sbbq $0,%r11 + + mulq 8(%r14) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq %rdx,%r13 + movq %rbp,%rdx + adcq $0,%r11 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r8 + sbbq %rdx,%rbp + + addq %r11,%r8 + adcq %rbp,%r9 + adcq $0,%r10 + + + movq %r12,%rsi + subq 0(%r14),%r12 + movq %r13,%r11 + sbbq 8(%r14),%r13 + movq %r8,%rcx + sbbq 16(%r14),%r8 + movq %r9,%rbp + sbbq 24(%r14),%r9 + sbbq $0,%r10 + + cmovcq %rsi,%r12 + cmovcq %r11,%r13 + cmovcq %rcx,%r8 + cmovcq %rbp,%r9 + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_mul_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont + + + + + + + +.globl ecp_nistz256_ord_sqr_mont +.hidden ecp_nistz256_ord_sqr_mont +.type ecp_nistz256_ord_sqr_mont,@function +.align 32 +ecp_nistz256_ord_sqr_mont: +.cfi_startproc + leaq OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx + cmpl $0x80100,%ecx + je .Lecp_nistz256_ord_sqr_montx + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lord_sqr_body: + + movq 0(%rsi),%r8 + movq 8(%rsi),%rax + movq 16(%rsi),%r14 + movq 24(%rsi),%r15 + leaq .Lord(%rip),%rsi + movq %rdx,%rbx + jmp .Loop_ord_sqr + +.align 32 +.Loop_ord_sqr: + + movq %rax,%rbp + mulq %r8 + movq %rax,%r9 +.byte 102,72,15,110,205 + movq %r14,%rax + movq %rdx,%r10 + + mulq %r8 + addq %rax,%r10 + movq %r15,%rax +.byte 
102,73,15,110,214 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r8 + addq %rax,%r11 + movq %r15,%rax +.byte 102,73,15,110,223 + adcq $0,%rdx + movq %rdx,%r12 + + + mulq %r14 + movq %rax,%r13 + movq %r14,%rax + movq %rdx,%r14 + + + mulq %rbp + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq %rbp + addq %rax,%r12 + adcq $0,%rdx + + addq %r15,%r12 + adcq %rdx,%r13 + adcq $0,%r14 + + + xorq %r15,%r15 + movq %r8,%rax + addq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + adcq %r14,%r14 + adcq $0,%r15 + + + mulq %rax + movq %rax,%r8 +.byte 102,72,15,126,200 + movq %rdx,%rbp + + mulq %rax + addq %rbp,%r9 + adcq %rax,%r10 +.byte 102,72,15,126,208 + adcq $0,%rdx + movq %rdx,%rbp + + mulq %rax + addq %rbp,%r11 + adcq %rax,%r12 +.byte 102,72,15,126,216 + adcq $0,%rdx + movq %rdx,%rbp + + movq %r8,%rcx + imulq 32(%rsi),%r8 + + mulq %rax + addq %rbp,%r13 + adcq %rax,%r14 + movq 0(%rsi),%rax + adcq %rdx,%r15 + + + mulq %r8 + movq %r8,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r8,%r10 + sbbq $0,%rbp + + mulq %r8 + addq %rcx,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %r8,%rax + adcq %rdx,%r10 + movq %r8,%rdx + adcq $0,%rbp + + movq %r9,%rcx + imulq 32(%rsi),%r9 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r11 + movq 0(%rsi),%rax + sbbq %rdx,%r8 + + addq %rbp,%r11 + adcq $0,%r8 + + + mulq %r9 + movq %r9,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r9,%r11 + sbbq $0,%rbp + + mulq %r9 + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %r9,%rax + adcq %rdx,%r11 + movq %r9,%rdx + adcq $0,%rbp + + movq %r10,%rcx + imulq 32(%rsi),%r10 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r8 + movq 0(%rsi),%rax + sbbq %rdx,%r9 + + addq %rbp,%r8 + adcq $0,%r9 + + + mulq %r10 + movq %r10,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r10,%r8 + sbbq $0,%rbp + + mulq %r10 + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r10,%rax + adcq %rdx,%r8 + movq %r10,%rdx + adcq $0,%rbp + + movq %r11,%rcx + imulq 32(%rsi),%r11 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r9 + movq 0(%rsi),%rax + sbbq %rdx,%r10 + + addq %rbp,%r9 + adcq $0,%r10 + + + mulq %r11 + movq %r11,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r11,%r9 + sbbq $0,%rbp + + mulq %r11 + addq %rcx,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + movq %r11,%rdx + adcq $0,%rbp + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r10 + sbbq %rdx,%r11 + + addq %rbp,%r10 + adcq $0,%r11 + + + xorq %rdx,%rdx + addq %r12,%r8 + adcq %r13,%r9 + movq %r8,%r12 + adcq %r14,%r10 + adcq %r15,%r11 + movq %r9,%rax + adcq $0,%rdx + + + subq 0(%rsi),%r8 + movq %r10,%r14 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r11,%r15 + sbbq 24(%rsi),%r11 + sbbq $0,%rdx + + cmovcq %r12,%r8 + cmovncq %r9,%rax + cmovncq %r10,%r14 + cmovncq %r11,%r15 + + decq %rbx + jnz .Loop_ord_sqr + + movq %r8,0(%rdi) + movq %rax,8(%rdi) + pxor %xmm1,%xmm1 + movq %r14,16(%rdi) + pxor %xmm2,%xmm2 + movq %r15,24(%rdi) + pxor %xmm3,%xmm3 + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_sqr_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont + +.type ecp_nistz256_ord_mul_montx,@function +.align 32 +ecp_nistz256_ord_mul_montx: +.cfi_startproc 
+.Lecp_nistz256_ord_mul_montx: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lord_mulx_body: + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + leaq -128(%rsi),%rsi + leaq .Lord-128(%rip),%r14 + movq .LordK(%rip),%r15 + + + mulxq %r9,%r8,%r9 + mulxq %r10,%rcx,%r10 + mulxq %r11,%rbp,%r11 + addq %rcx,%r9 + mulxq %r12,%rcx,%r12 + movq %r8,%rdx + mulxq %r15,%rdx,%rax + adcq %rbp,%r10 + adcq %rcx,%r11 + adcq $0,%r12 + + + xorq %r13,%r13 + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%r14),%rcx,%rbp + movq 8(%rbx),%rdx + adcxq %rcx,%r11 + adoxq %rbp,%r12 + adcxq %r8,%r12 + adoxq %r8,%r13 + adcq $0,%r13 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r9,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + adcxq %r8,%r13 + adoxq %r8,%r8 + adcq $0,%r8 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%r14),%rcx,%rbp + movq 16(%rbx),%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcxq %r9,%r13 + adoxq %r9,%r8 + adcq $0,%r8 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r10,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + adcxq %r9,%r8 + adoxq %r9,%r9 + adcq $0,%r9 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%r14),%rcx,%rbp + movq 24(%rbx),%rdx + adcxq %rcx,%r13 + adoxq %rbp,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcq $0,%r9 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r11,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r8 + adoxq %rbp,%r9 + + adcxq %r10,%r9 + adoxq %r10,%r10 + adcq $0,%r10 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%r14),%rcx,%rbp + leaq 128(%r14),%r14 + movq %r12,%rbx + adcxq %rcx,%r8 + adoxq %rbp,%r9 + movq %r13,%rdx + adcxq %r11,%r9 + adoxq %r11,%r10 + adcq $0,%r10 + + + + movq %r8,%rcx + subq 0(%r14),%r12 + sbbq 8(%r14),%r13 + sbbq 16(%r14),%r8 + movq %r9,%rbp + sbbq 24(%r14),%r9 + sbbq $0,%r10 + + cmovcq %rbx,%r12 + cmovcq %rdx,%r13 + cmovcq %rcx,%r8 + cmovcq %rbp,%r9 + + movq %r12,0(%rdi) 
+ movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_mulx_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx + +.type ecp_nistz256_ord_sqr_montx,@function +.align 32 +ecp_nistz256_ord_sqr_montx: +.cfi_startproc +.Lecp_nistz256_ord_sqr_montx: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lord_sqrx_body: + + movq %rdx,%rbx + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + leaq .Lord(%rip),%rsi + jmp .Loop_ord_sqrx + +.align 32 +.Loop_ord_sqrx: + mulxq %r14,%r9,%r10 + mulxq %r15,%rcx,%r11 + movq %rdx,%rax +.byte 102,73,15,110,206 + mulxq %r8,%rbp,%r12 + movq %r14,%rdx + addq %rcx,%r10 +.byte 102,73,15,110,215 + adcq %rbp,%r11 + adcq $0,%r12 + xorq %r13,%r13 + + mulxq %r15,%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq %r8,%rcx,%rbp + movq %r15,%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcq $0,%r13 + + mulxq %r8,%rcx,%r14 + movq %rax,%rdx +.byte 102,73,15,110,216 + xorq %r15,%r15 + adcxq %r9,%r9 + adoxq %rcx,%r13 + adcxq %r10,%r10 + adoxq %r15,%r14 + + + mulxq %rdx,%r8,%rbp +.byte 102,72,15,126,202 + adcxq %r11,%r11 + adoxq %rbp,%r9 + adcxq %r12,%r12 + mulxq %rdx,%rcx,%rax +.byte 102,72,15,126,210 + adcxq %r13,%r13 + adoxq %rcx,%r10 + adcxq %r14,%r14 + mulxq %rdx,%rcx,%rbp +.byte 0x67 +.byte 102,72,15,126,218 + adoxq %rax,%r11 + adcxq %r15,%r15 + adoxq %rcx,%r12 + adoxq %rbp,%r13 + mulxq %rdx,%rcx,%rax + adoxq %rcx,%r14 + adoxq %rax,%r15 + + + movq %r8,%rdx + mulxq 32(%rsi),%rdx,%rcx + + xorq %rax,%rax + mulxq 0(%rsi),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + mulxq 8(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + mulxq 16(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + mulxq 24(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r8 + adcxq %rax,%r8 + + + movq %r9,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adoxq %rcx,%r9 + adcxq %rbp,%r10 + mulxq 8(%rsi),%rcx,%rbp + adoxq %rcx,%r10 + adcxq %rbp,%r11 + mulxq 16(%rsi),%rcx,%rbp + adoxq %rcx,%r11 + adcxq %rbp,%r8 + mulxq 24(%rsi),%rcx,%rbp + adoxq %rcx,%r8 + adcxq %rbp,%r9 + adoxq %rax,%r9 + + + movq %r10,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + mulxq 8(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r8 + mulxq 16(%rsi),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + mulxq 24(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + adcxq %rax,%r10 + + + movq %r11,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adoxq %rcx,%r11 + adcxq %rbp,%r8 + mulxq 8(%rsi),%rcx,%rbp + adoxq %rcx,%r8 + adcxq %rbp,%r9 + mulxq 16(%rsi),%rcx,%rbp + adoxq %rcx,%r9 + adcxq %rbp,%r10 + mulxq 24(%rsi),%rcx,%rbp + adoxq %rcx,%r10 + adcxq %rbp,%r11 + adoxq %rax,%r11 + + + addq %r8,%r12 + adcq %r13,%r9 + movq %r12,%rdx + adcq %r14,%r10 + adcq %r15,%r11 + movq %r9,%r14 + adcq $0,%rax + + + subq 0(%rsi),%r12 + movq %r10,%r15 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq 
%r11,%r8 + sbbq 24(%rsi),%r11 + sbbq $0,%rax + + cmovncq %r12,%rdx + cmovncq %r9,%r14 + cmovncq %r10,%r15 + cmovncq %r11,%r8 + + decq %rbx + jnz .Loop_ord_sqrx + + movq %rdx,0(%rdi) + movq %r14,8(%rdi) + pxor %xmm1,%xmm1 + movq %r15,16(%rdi) + pxor %xmm2,%xmm2 + movq %r8,24(%rdi) + pxor %xmm3,%xmm3 + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_sqrx_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx + + + + + + .globl ecp_nistz256_mul_mont .hidden ecp_nistz256_mul_mont .type ecp_nistz256_mul_mont,@function .align 32 ecp_nistz256_mul_mont: +.cfi_startproc + leaq OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx .Lmul_mont: pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lmul_body: + cmpl $0x80100,%ecx + je .Lmul_montx movq %rdx,%rbx movq 0(%rdx),%rax movq 0(%rsi),%r9 @@ -89,19 +1231,43 @@ ecp_nistz256_mul_mont: movq 24(%rsi),%r12 call __ecp_nistz256_mul_montq + jmp .Lmul_mont_done + +.align 32 +.Lmul_montx: + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + leaq -128(%rsi),%rsi + + call __ecp_nistz256_mul_montx .Lmul_mont_done: - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lmul_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont .type __ecp_nistz256_mul_montq,@function .align 32 __ecp_nistz256_mul_montq: +.cfi_startproc movq %rax,%rbp @@ -313,6 +1479,7 @@ __ecp_nistz256_mul_montq: movq %r9,24(%rdi) .byte 0xf3,0xc3 +.cfi_endproc .size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq @@ -327,31 +1494,72 @@ __ecp_nistz256_mul_montq: .type ecp_nistz256_sqr_mont,@function .align 32 ecp_nistz256_sqr_mont: +.cfi_startproc + leaq OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lsqr_body: + cmpl $0x80100,%ecx + je .Lsqr_montx movq 0(%rsi),%rax movq 8(%rsi),%r14 movq 16(%rsi),%r15 movq 24(%rsi),%r8 call __ecp_nistz256_sqr_montq + jmp .Lsqr_mont_done + +.align 32 +.Lsqr_montx: + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + leaq -128(%rsi),%rsi + + call __ecp_nistz256_sqr_montx .Lsqr_mont_done: - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 
16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lsqr_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont .type __ecp_nistz256_sqr_montq,@function .align 32 __ecp_nistz256_sqr_montq: +.cfi_startproc movq %rax,%r13 mulq %r14 movq %rax,%r9 @@ -509,7 +1717,306 @@ __ecp_nistz256_sqr_montq: movq %r15,24(%rdi) .byte 0xf3,0xc3 +.cfi_endproc .size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq +.type __ecp_nistz256_mul_montx,@function +.align 32 +__ecp_nistz256_mul_montx: +.cfi_startproc + + + mulxq %r9,%r8,%r9 + mulxq %r10,%rcx,%r10 + movq $32,%r14 + xorq %r13,%r13 + mulxq %r11,%rbp,%r11 + movq .Lpoly+24(%rip),%r15 + adcq %rcx,%r9 + mulxq %r12,%rcx,%r12 + movq %r8,%rdx + adcq %rbp,%r10 + shlxq %r14,%r8,%rbp + adcq %rcx,%r11 + shrxq %r14,%r8,%rcx + adcq $0,%r12 + + + + addq %rbp,%r9 + adcq %rcx,%r10 + + mulxq %r15,%rcx,%rbp + movq 8(%rbx),%rdx + adcq %rcx,%r11 + adcq %rbp,%r12 + adcq $0,%r13 + xorq %r8,%r8 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r9,%rdx + adcxq %rcx,%r12 + shlxq %r14,%r9,%rcx + adoxq %rbp,%r13 + shrxq %r14,%r9,%rbp + + adcxq %r8,%r13 + adoxq %r8,%r8 + adcq $0,%r8 + + + + addq %rcx,%r10 + adcq %rbp,%r11 + + mulxq %r15,%rcx,%rbp + movq 16(%rbx),%rdx + adcq %rcx,%r12 + adcq %rbp,%r13 + adcq $0,%r8 + xorq %r9,%r9 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r10,%rdx + adcxq %rcx,%r13 + shlxq %r14,%r10,%rcx + adoxq %rbp,%r8 + shrxq %r14,%r10,%rbp + + adcxq %r9,%r8 + adoxq %r9,%r9 + adcq $0,%r9 + + + + addq %rcx,%r11 + adcq %rbp,%r12 + + mulxq %r15,%rcx,%rbp + movq 24(%rbx),%rdx + adcq %rcx,%r13 + adcq %rbp,%r8 + adcq $0,%r9 + xorq %r10,%r10 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r11,%rdx + adcxq %rcx,%r8 + shlxq %r14,%r11,%rcx + adoxq %rbp,%r9 + shrxq %r14,%r11,%rbp + + adcxq %r10,%r9 + adoxq %r10,%r10 + adcq $0,%r10 + + + + addq %rcx,%r12 + adcq %rbp,%r13 + + mulxq %r15,%rcx,%rbp + movq %r12,%rbx + movq .Lpoly+8(%rip),%r14 + adcq %rcx,%r8 + movq %r13,%rdx + adcq %rbp,%r9 + adcq $0,%r10 + + + + xorl %eax,%eax + movq %r8,%rcx + sbbq $-1,%r12 + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%rbp + sbbq %r15,%r9 + sbbq $0,%r10 + + cmovcq %rbx,%r12 + cmovcq %rdx,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %rbp,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx + +.type __ecp_nistz256_sqr_montx,@function +.align 32 +__ecp_nistz256_sqr_montx: +.cfi_startproc + mulxq %r14,%r9,%r10 + mulxq %r15,%rcx,%r11 + xorl %eax,%eax + adcq %rcx,%r10 + mulxq %r8,%rbp,%r12 + movq %r14,%rdx + adcq %rbp,%r11 + adcq $0,%r12 + xorq %r13,%r13 + + + mulxq %r15,%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq %r8,%rcx,%rbp + movq %r15,%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcq $0,%r13 + + 
+ mulxq %r8,%rcx,%r14 + movq 0+128(%rsi),%rdx + xorq %r15,%r15 + adcxq %r9,%r9 + adoxq %rcx,%r13 + adcxq %r10,%r10 + adoxq %r15,%r14 + + mulxq %rdx,%r8,%rbp + movq 8+128(%rsi),%rdx + adcxq %r11,%r11 + adoxq %rbp,%r9 + adcxq %r12,%r12 + mulxq %rdx,%rcx,%rax + movq 16+128(%rsi),%rdx + adcxq %r13,%r13 + adoxq %rcx,%r10 + adcxq %r14,%r14 +.byte 0x67 + mulxq %rdx,%rcx,%rbp + movq 24+128(%rsi),%rdx + adoxq %rax,%r11 + adcxq %r15,%r15 + adoxq %rcx,%r12 + movq $32,%rsi + adoxq %rbp,%r13 +.byte 0x67,0x67 + mulxq %rdx,%rcx,%rax + movq .Lpoly+24(%rip),%rdx + adoxq %rcx,%r14 + shlxq %rsi,%r8,%rcx + adoxq %rax,%r15 + shrxq %rsi,%r8,%rax + movq %rdx,%rbp + + + addq %rcx,%r9 + adcq %rax,%r10 + + mulxq %r8,%rcx,%r8 + adcq %rcx,%r11 + shlxq %rsi,%r9,%rcx + adcq $0,%r8 + shrxq %rsi,%r9,%rax + + + addq %rcx,%r10 + adcq %rax,%r11 + + mulxq %r9,%rcx,%r9 + adcq %rcx,%r8 + shlxq %rsi,%r10,%rcx + adcq $0,%r9 + shrxq %rsi,%r10,%rax + + + addq %rcx,%r11 + adcq %rax,%r8 + + mulxq %r10,%rcx,%r10 + adcq %rcx,%r9 + shlxq %rsi,%r11,%rcx + adcq $0,%r10 + shrxq %rsi,%r11,%rax + + + addq %rcx,%r8 + adcq %rax,%r9 + + mulxq %r11,%rcx,%r11 + adcq %rcx,%r10 + adcq $0,%r11 + + xorq %rdx,%rdx + addq %r8,%r12 + movq .Lpoly+8(%rip),%rsi + adcq %r9,%r13 + movq %r12,%r8 + adcq %r10,%r14 + adcq %r11,%r15 + movq %r13,%r9 + adcq $0,%rdx + + subq $-1,%r12 + movq %r14,%r10 + sbbq %rsi,%r13 + sbbq $0,%r14 + movq %r15,%r11 + sbbq %rbp,%r15 + sbbq $0,%rdx + + cmovcq %r8,%r12 + cmovcq %r9,%r13 + movq %r12,0(%rdi) + cmovcq %r10,%r14 + movq %r13,8(%rdi) + cmovcq %r11,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx .globl ecp_nistz256_select_w5 @@ -517,6 +2024,7 @@ __ecp_nistz256_sqr_montq: .type ecp_nistz256_select_w5,@function .align 32 ecp_nistz256_select_w5: +.cfi_startproc leaq OPENSSL_ia32cap_P(%rip),%rax movq 8(%rax),%rax testl $32,%eax @@ -572,6 +2080,8 @@ ecp_nistz256_select_w5: movdqu %xmm6,64(%rdi) movdqu %xmm7,80(%rdi) .byte 0xf3,0xc3 +.cfi_endproc +.LSEH_end_ecp_nistz256_select_w5: .size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5 @@ -581,6 +2091,7 @@ ecp_nistz256_select_w5: .type ecp_nistz256_select_w7,@function .align 32 ecp_nistz256_select_w7: +.cfi_startproc leaq OPENSSL_ia32cap_P(%rip),%rax movq 8(%rax),%rax testl $32,%eax @@ -625,12 +2136,15 @@ ecp_nistz256_select_w7: movdqu %xmm4,32(%rdi) movdqu %xmm5,48(%rdi) .byte 0xf3,0xc3 +.cfi_endproc +.LSEH_end_ecp_nistz256_select_w7: .size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 .type ecp_nistz256_avx2_select_w5,@function .align 32 ecp_nistz256_avx2_select_w5: +.cfi_startproc .Lavx2_select_w5: vzeroupper vmovdqa .LTwo(%rip),%ymm0 @@ -685,6 +2199,8 @@ ecp_nistz256_avx2_select_w5: vmovdqu %ymm4,64(%rdi) vzeroupper .byte 0xf3,0xc3 +.cfi_endproc +.LSEH_end_ecp_nistz256_avx2_select_w5: .size ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5 @@ -694,6 +2210,7 @@ ecp_nistz256_avx2_select_w5: .type ecp_nistz256_avx2_select_w7,@function .align 32 ecp_nistz256_avx2_select_w7: +.cfi_startproc .Lavx2_select_w7: vzeroupper vmovdqa .LThree(%rip),%ymm0 @@ -763,10 +2280,13 @@ ecp_nistz256_avx2_select_w7: vmovdqu %ymm3,32(%rdi) vzeroupper .byte 0xf3,0xc3 +.cfi_endproc +.LSEH_end_ecp_nistz256_avx2_select_w7: .size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7 .type __ecp_nistz256_add_toq,@function .align 32 __ecp_nistz256_add_toq: +.cfi_startproc xorq %r11,%r11 addq 0(%rbx),%r12 adcq 8(%rbx),%r13 @@ -794,11 +2314,13 @@ __ecp_nistz256_add_toq: movq %r9,24(%rdi) .byte 0xf3,0xc3 
+.cfi_endproc .size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq .type __ecp_nistz256_sub_fromq,@function .align 32 __ecp_nistz256_sub_fromq: +.cfi_startproc subq 0(%rbx),%r12 sbbq 8(%rbx),%r13 movq %r12,%rax @@ -825,11 +2347,13 @@ __ecp_nistz256_sub_fromq: movq %r9,24(%rdi) .byte 0xf3,0xc3 +.cfi_endproc .size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq .type __ecp_nistz256_subq,@function .align 32 __ecp_nistz256_subq: +.cfi_startproc subq %r12,%rax sbbq %r13,%rbp movq %rax,%r12 @@ -852,11 +2376,13 @@ __ecp_nistz256_subq: cmovnzq %r10,%r9 .byte 0xf3,0xc3 +.cfi_endproc .size __ecp_nistz256_subq,.-__ecp_nistz256_subq .type __ecp_nistz256_mul_by_2q,@function .align 32 __ecp_nistz256_mul_by_2q: +.cfi_startproc xorq %r11,%r11 addq %r12,%r12 adcq %r13,%r13 @@ -884,19 +2410,40 @@ __ecp_nistz256_mul_by_2q: movq %r9,24(%rdi) .byte 0xf3,0xc3 +.cfi_endproc .size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q .globl ecp_nistz256_point_double .hidden ecp_nistz256_point_double .type ecp_nistz256_point_double,@function .align 32 ecp_nistz256_point_double: +.cfi_startproc + leaq OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx + cmpl $0x80100,%ecx + je .Lpoint_doublex pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 subq $160+8,%rsp +.cfi_adjust_cfa_offset 32*5+8 +.Lpoint_doubleq_body: .Lpoint_double_shortcutq: movdqu 0(%rsi),%xmm0 @@ -1079,27 +2626,58 @@ ecp_nistz256_point_double: .byte 102,72,15,126,207 call __ecp_nistz256_sub_fromq - addq $160+8,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + leaq 160+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_doubleq_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ecp_nistz256_point_double,.-ecp_nistz256_point_double .globl ecp_nistz256_point_add .hidden ecp_nistz256_point_add .type ecp_nistz256_point_add,@function .align 32 ecp_nistz256_point_add: +.cfi_startproc + leaq OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx + cmpl $0x80100,%ecx + je .Lpoint_addx pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 subq $576+8,%rsp +.cfi_adjust_cfa_offset 32*18+8 +.Lpoint_addq_body: movdqu 0(%rsi),%xmm0 movdqu 16(%rsi),%xmm1 @@ -1245,15 +2823,22 @@ ecp_nistz256_point_add: orq %r8,%r12 orq %r9,%r12 -.byte 0x3e - jnz .Ladd_proceedq .byte 102,73,15,126,208 .byte 102,73,15,126,217 - testq %r8,%r8 + orq %r8,%r12 +.byte 0x3e jnz .Ladd_proceedq + + + testq %r9,%r9 jz .Ladd_doubleq + + + + + .byte 102,72,15,126,199 pxor %xmm0,%xmm0 movdqu %xmm0,0(%rdi) @@ -1269,7 +2854,9 @@ ecp_nistz256_point_add: .byte 102,72,15,126,206 .byte 102,72,15,126,199 addq $416,%rsp +.cfi_adjust_cfa_offset -416 jmp 
.Lpoint_double_shortcutq +.cfi_adjust_cfa_offset 416 .align 32 .Ladd_proceedq: @@ -1476,27 +3063,58 @@ ecp_nistz256_point_add: movdqu %xmm3,48(%rdi) .Ladd_doneq: - addq $576+8,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + leaq 576+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_addq_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ecp_nistz256_point_add,.-ecp_nistz256_point_add .globl ecp_nistz256_point_add_affine .hidden ecp_nistz256_point_add_affine .type ecp_nistz256_point_add_affine,@function .align 32 ecp_nistz256_point_add_affine: +.cfi_startproc + leaq OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx + cmpl $0x80100,%ecx + je .Lpoint_add_affinex pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 subq $480+8,%rsp +.cfi_adjust_cfa_offset 32*15+8 +.Ladd_affineq_body: movdqu 0(%rsi),%xmm0 movq %rdx,%rbx @@ -1779,13 +3397,1147 @@ ecp_nistz256_point_add_affine: movdqu %xmm2,32(%rdi) movdqu %xmm3,48(%rdi) - addq $480+8,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + leaq 480+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Ladd_affineq_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine +.type __ecp_nistz256_add_tox,@function +.align 32 +__ecp_nistz256_add_tox: +.cfi_startproc + xorq %r11,%r11 + adcq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + movq %r12,%rax + adcq 16(%rbx),%r8 + adcq 24(%rbx),%r9 + movq %r13,%rbp + adcq $0,%r11 + + xorq %r10,%r10 + sbbq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox + +.type __ecp_nistz256_sub_fromx,@function +.align 32 +__ecp_nistz256_sub_fromx: +.cfi_startproc + xorq %r11,%r11 + sbbq 0(%rbx),%r12 + sbbq 8(%rbx),%r13 + movq %r12,%rax + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + movq %r13,%rbp + sbbq $0,%r11 + + xorq %r10,%r10 + adcq $-1,%r12 + movq %r8,%rcx + adcq %r14,%r13 + adcq $0,%r8 + movq %r9,%r10 + adcq %r15,%r9 + + btq $0,%r11 + cmovncq %rax,%r12 + cmovncq %rbp,%r13 + movq %r12,0(%rdi) + cmovncq %rcx,%r8 + movq %r13,8(%rdi) + cmovncq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx + +.type __ecp_nistz256_subx,@function +.align 32 +__ecp_nistz256_subx: +.cfi_startproc + xorq %r11,%r11 + sbbq %r12,%rax + sbbq %r13,%rbp + movq %rax,%r12 + sbbq %r8,%rcx + sbbq %r9,%r10 + movq %rbp,%r13 
+ sbbq $0,%r11 + + xorq %r9,%r9 + adcq $-1,%rax + movq %rcx,%r8 + adcq %r14,%rbp + adcq $0,%rcx + movq %r10,%r9 + adcq %r15,%r10 + + btq $0,%r11 + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + cmovcq %rcx,%r8 + cmovcq %r10,%r9 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ecp_nistz256_subx,.-__ecp_nistz256_subx + +.type __ecp_nistz256_mul_by_2x,@function +.align 32 +__ecp_nistz256_mul_by_2x: +.cfi_startproc + xorq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + xorq %r10,%r10 + sbbq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x +.type ecp_nistz256_point_doublex,@function +.align 32 +ecp_nistz256_point_doublex: +.cfi_startproc +.Lpoint_doublex: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $160+8,%rsp +.cfi_adjust_cfa_offset 32*5+8 +.Lpoint_doublex_body: + +.Lpoint_double_shortcutx: + movdqu 0(%rsi),%xmm0 + movq %rsi,%rbx + movdqu 16(%rsi),%xmm1 + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r8 + movq 32+24(%rsi),%r9 + movq .Lpoly+8(%rip),%r14 + movq .Lpoly+24(%rip),%r15 + movdqa %xmm0,96(%rsp) + movdqa %xmm1,96+16(%rsp) + leaq 32(%rdi),%r10 + leaq 64(%rdi),%r11 +.byte 102,72,15,110,199 +.byte 102,73,15,110,202 +.byte 102,73,15,110,211 + + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + leaq 64-128(%rsi),%rsi + leaq 64(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 0(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 32(%rbx),%rdx + movq 64+0(%rbx),%r9 + movq 64+8(%rbx),%r10 + movq 64+16(%rbx),%r11 + movq 64+24(%rbx),%r12 + leaq 64-128(%rbx),%rsi + leaq 32(%rbx),%rbx +.byte 102,72,15,126,215 + call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_by_2x + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_tox + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 +.byte 102,72,15,126,207 + call __ecp_nistz256_sqr_montx + xorq %r9,%r9 + movq %r12,%rax + addq $-1,%r12 + movq %r13,%r10 + adcq %rsi,%r13 + movq %r14,%rcx + adcq $0,%r14 + movq %r15,%r8 + adcq %rbp,%r15 + adcq $0,%r9 + xorq %rsi,%rsi + testq $1,%rax + + cmovzq %rax,%r12 + cmovzq %r10,%r13 + cmovzq %rcx,%r14 + cmovzq %r8,%r15 + cmovzq %rsi,%r9 + + movq %r13,%rax + shrq $1,%r12 + shlq $63,%rax + movq %r14,%r10 + shrq $1,%r13 + orq %rax,%r12 + shlq $63,%r10 + movq %r15,%rcx + shrq $1,%r14 + orq %r10,%r13 + shlq $63,%rcx + movq %r12,0(%rdi) + shrq $1,%r15 + movq %r13,8(%rdi) + shlq 
$63,%r9 + orq %rcx,%r14 + orq %r9,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + movq 64(%rsp),%rdx + leaq 64(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + leaq 32(%rsp),%rbx + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_tox + + movq 96(%rsp),%rdx + leaq 96(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + movq 0+32(%rsp),%rdx + movq 8+32(%rsp),%r14 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r15 + movq 24+32(%rsp),%r8 +.byte 102,72,15,126,199 + call __ecp_nistz256_sqr_montx + + leaq 128(%rsp),%rbx + movq %r14,%r8 + movq %r15,%r9 + movq %rsi,%r14 + movq %rbp,%r15 + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 0(%rsp),%rdi + call __ecp_nistz256_subx + + movq 32(%rsp),%rdx + leaq 32(%rsp),%rbx + movq %r12,%r14 + xorl %ecx,%ecx + movq %r12,0+0(%rsp) + movq %r13,%r10 + movq %r13,0+8(%rsp) + cmovzq %r8,%r11 + movq %r8,0+16(%rsp) + leaq 0-128(%rsp),%rsi + cmovzq %r9,%r12 + movq %r9,0+24(%rsp) + movq %r14,%r9 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + +.byte 102,72,15,126,203 +.byte 102,72,15,126,207 + call __ecp_nistz256_sub_fromx + + leaq 160+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_doublex_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ecp_nistz256_point_doublex,.-ecp_nistz256_point_doublex +.type ecp_nistz256_point_addx,@function +.align 32 +ecp_nistz256_point_addx: +.cfi_startproc +.Lpoint_addx: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $576+8,%rsp +.cfi_adjust_cfa_offset 32*18+8 +.Lpoint_addx_body: + + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq %rsi,%rbx + movq %rdx,%rsi + movdqa %xmm0,384(%rsp) + movdqa %xmm1,384+16(%rsp) + movdqa %xmm2,416(%rsp) + movdqa %xmm3,416+16(%rsp) + movdqa %xmm4,448(%rsp) + movdqa %xmm5,448+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rsi),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rsi),%xmm3 + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,480(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,480+16(%rsp) + movdqu 64(%rsi),%xmm0 + movdqu 80(%rsi),%xmm1 + movdqa %xmm2,512(%rsp) + movdqa %xmm3,512+16(%rsp) + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + + leaq 64-128(%rsi),%rsi + movq %rdx,544+0(%rsp) + movq %r14,544+8(%rsp) + movq %r15,544+16(%rsp) + movq %r8,544+24(%rsp) + leaq 96(%rsp),%rdi + call 
__ecp_nistz256_sqr_montx + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm1,%xmm4 + por %xmm1,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + movq 64+0(%rbx),%rdx + movq 64+8(%rbx),%r14 + movq 64+16(%rbx),%r15 + movq 64+24(%rbx),%r8 +.byte 102,72,15,110,203 + + leaq 64-128(%rbx),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 544(%rsp),%rdx + leaq 544(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 416(%rsp),%rdx + leaq 416(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq -128+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 512(%rsp),%rdx + leaq 512(%rsp),%rbx + movq 0+256(%rsp),%r9 + movq 8+256(%rsp),%r10 + leaq -128+256(%rsp),%rsi + movq 16+256(%rsp),%r11 + movq 24+256(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 224(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + orq %r13,%r12 + movdqa %xmm4,%xmm2 + orq %r8,%r12 + orq %r9,%r12 + por %xmm5,%xmm2 +.byte 102,73,15,110,220 + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 480(%rsp),%rdx + leaq 480(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 160(%rsp),%rbx + leaq 0(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + orq %r13,%r12 + orq %r8,%r12 + orq %r9,%r12 + +.byte 102,73,15,126,208 +.byte 102,73,15,126,217 + orq %r8,%r12 +.byte 0x3e + jnz .Ladd_proceedx + + + + testq %r9,%r9 + jz .Ladd_doublex + + + + + + +.byte 102,72,15,126,199 + pxor %xmm0,%xmm0 + movdqu %xmm0,0(%rdi) + movdqu %xmm0,16(%rdi) + movdqu %xmm0,32(%rdi) + movdqu %xmm0,48(%rdi) + movdqu %xmm0,64(%rdi) + movdqu %xmm0,80(%rdi) + jmp .Ladd_donex + +.align 32 +.Ladd_doublex: +.byte 102,72,15,126,206 +.byte 102,72,15,126,199 + addq $416,%rsp +.cfi_adjust_cfa_offset -416 + jmp .Lpoint_double_shortcutx +.cfi_adjust_cfa_offset 416 + +.align 32 +.Ladd_proceedx: + movq 0+64(%rsp),%rdx + movq 8+64(%rsp),%r14 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 544(%rsp),%rdx + leaq 544(%rsp),%rbx + movq 0+352(%rsp),%r9 + movq 8+352(%rsp),%r10 + leaq -128+352(%rsp),%rsi + movq 16+352(%rsp),%r11 + movq 24+352(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 0(%rsp),%rdx + leaq 0(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + 
movq 24+32(%rsp),%r12 + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 160(%rsp),%rdx + leaq 160(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montx + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 96(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subx + + leaq 128(%rsp),%rbx + leaq 288(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 192+0(%rsp),%rax + movq 192+8(%rsp),%rbp + movq 192+16(%rsp),%rcx + movq 192+24(%rsp),%r10 + leaq 320(%rsp),%rdi + + call __ecp_nistz256_subx + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 128(%rsp),%rdx + leaq 128(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq -128+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 320(%rsp),%rdx + leaq 320(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 320(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 256(%rsp),%rbx + leaq 320(%rsp),%rdi + call __ecp_nistz256_sub_fromx + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 352(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 352+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 544(%rsp),%xmm2 + pand 544+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 480(%rsp),%xmm2 + pand 480+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 320(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 320+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 512(%rsp),%xmm2 + pand 512+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + +.Ladd_donex: + leaq 576+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_addx_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ecp_nistz256_point_addx,.-ecp_nistz256_point_addx +.type 
ecp_nistz256_point_add_affinex,@function +.align 32 +ecp_nistz256_point_add_affinex: +.cfi_startproc +.Lpoint_add_affinex: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $480+8,%rsp +.cfi_adjust_cfa_offset 32*15+8 +.Ladd_affinex_body: + + movdqu 0(%rsi),%xmm0 + movq %rdx,%rbx + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,320(%rsp) + movdqa %xmm1,320+16(%rsp) + movdqa %xmm2,352(%rsp) + movdqa %xmm3,352+16(%rsp) + movdqa %xmm4,384(%rsp) + movdqa %xmm5,384+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rbx),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rbx),%xmm1 + movdqu 32(%rbx),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rbx),%xmm3 + movdqa %xmm0,416(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,416+16(%rsp) + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + movdqa %xmm2,448(%rsp) + movdqa %xmm3,448+16(%rsp) + por %xmm2,%xmm3 + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm1,%xmm3 + + leaq 64-128(%rsi),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm3,%xmm4 + movq 0(%rbx),%rdx + + movq %r12,%r9 + por %xmm3,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + movq %r13,%r10 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + movq %r14,%r11 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + + leaq 32-128(%rsp),%rsi + movq %r15,%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 320(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 288(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 352(%rsp),%rbx + leaq 96(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+64(%rsp),%rdx + movq 8+64(%rsp),%r14 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 128(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 0+96(%rsp),%rdx + movq 8+96(%rsp),%r14 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r15 + movq 24+96(%rsp),%r8 + leaq 192(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 128(%rsp),%rdx + leaq 128(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 320(%rsp),%rdx + leaq 320(%rsp),%rbx + movq 0+128(%rsp),%r9 + movq 8+128(%rsp),%r10 + leaq -128+128(%rsp),%rsi + movq 16+128(%rsp),%r11 + movq 24+128(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 192(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq 
%r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subx + + leaq 160(%rsp),%rbx + leaq 224(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 64(%rsp),%rdi + + call __ecp_nistz256_subx + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 352(%rsp),%rdx + leaq 352(%rsp),%rbx + movq 0+160(%rsp),%r9 + movq 8+160(%rsp),%r10 + leaq -128+160(%rsp),%rsi + movq 16+160(%rsp),%r11 + movq 24+160(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 96(%rsp),%rdx + leaq 96(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 64(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 32(%rsp),%rbx + leaq 256(%rsp),%rdi + call __ecp_nistz256_sub_fromx + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand .LONE_mont(%rip),%xmm2 + pand .LONE_mont+16(%rip),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 224(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 224+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 320(%rsp),%xmm2 + pand 320+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 256(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 256+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 352(%rsp),%xmm2 + pand 352+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + + leaq 480+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Ladd_affinex_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ecp_nistz256_point_add_affinex,.-ecp_nistz256_point_add_affinex #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S new file mode 100644 index 0000000000..d072a83479 --- /dev/null +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S @@ -0,0 +1,343 @@ +# This file is generated from a 
similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text + +.type beeu_mod_inverse_vartime,@function +.hidden beeu_mod_inverse_vartime +.globl beeu_mod_inverse_vartime +.hidden beeu_mod_inverse_vartime +.align 32 +beeu_mod_inverse_vartime: +.cfi_startproc + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset rbp,-16 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset r12,-24 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset r13,-32 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset r14,-40 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset r15,-48 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset rbx,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 +.cfi_offset rsi,-64 + + subq $80,%rsp +.cfi_adjust_cfa_offset 80 + movq %rdi,0(%rsp) + + + movq $1,%r8 + xorq %r9,%r9 + xorq %r10,%r10 + xorq %r11,%r11 + xorq %rdi,%rdi + + xorq %r12,%r12 + xorq %r13,%r13 + xorq %r14,%r14 + xorq %r15,%r15 + xorq %rbp,%rbp + + + vmovdqu 0(%rsi),%xmm0 + vmovdqu 16(%rsi),%xmm1 + vmovdqu %xmm0,48(%rsp) + vmovdqu %xmm1,64(%rsp) + + vmovdqu 0(%rdx),%xmm0 + vmovdqu 16(%rdx),%xmm1 + vmovdqu %xmm0,16(%rsp) + vmovdqu %xmm1,32(%rsp) + +.Lbeeu_loop: + xorq %rbx,%rbx + orq 48(%rsp),%rbx + orq 56(%rsp),%rbx + orq 64(%rsp),%rbx + orq 72(%rsp),%rbx + jz .Lbeeu_loop_end + + + + + + + + + + + movq $1,%rcx + + +.Lbeeu_shift_loop_XB: + movq %rcx,%rbx + andq 48(%rsp),%rbx + jnz .Lbeeu_shift_loop_end_XB + + + movq $1,%rbx + andq %r8,%rbx + jz .Lshift1_0 + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq $0,%rdi + +.Lshift1_0: + shrdq $1,%r9,%r8 + shrdq $1,%r10,%r9 + shrdq $1,%r11,%r10 + shrdq $1,%rdi,%r11 + shrq $1,%rdi + + shlq $1,%rcx + + + + + + cmpq $0x8000000,%rcx + jne .Lbeeu_shift_loop_XB + +.Lbeeu_shift_loop_end_XB: + bsfq %rcx,%rcx + testq %rcx,%rcx + jz .Lbeeu_no_shift_XB + + + + movq 8+48(%rsp),%rax + movq 16+48(%rsp),%rbx + movq 24+48(%rsp),%rsi + + shrdq %cl,%rax,0+48(%rsp) + shrdq %cl,%rbx,8+48(%rsp) + shrdq %cl,%rsi,16+48(%rsp) + + shrq %cl,%rsi + movq %rsi,24+48(%rsp) + + +.Lbeeu_no_shift_XB: + + movq $1,%rcx + + +.Lbeeu_shift_loop_YA: + movq %rcx,%rbx + andq 16(%rsp),%rbx + jnz .Lbeeu_shift_loop_end_YA + + + movq $1,%rbx + andq %r12,%rbx + jz .Lshift1_1 + addq 0(%rdx),%r12 + adcq 8(%rdx),%r13 + adcq 16(%rdx),%r14 + adcq 24(%rdx),%r15 + adcq $0,%rbp + +.Lshift1_1: + shrdq $1,%r13,%r12 + shrdq $1,%r14,%r13 + shrdq $1,%r15,%r14 + shrdq $1,%rbp,%r15 + shrq $1,%rbp + + shlq $1,%rcx + + + + + + cmpq $0x8000000,%rcx + jne .Lbeeu_shift_loop_YA + +.Lbeeu_shift_loop_end_YA: + bsfq %rcx,%rcx + testq %rcx,%rcx + jz .Lbeeu_no_shift_YA + + + + movq 8+16(%rsp),%rax + movq 16+16(%rsp),%rbx + movq 24+16(%rsp),%rsi + + shrdq %cl,%rax,0+16(%rsp) + shrdq %cl,%rbx,8+16(%rsp) + shrdq %cl,%rsi,16+16(%rsp) + + shrq %cl,%rsi + movq %rsi,24+16(%rsp) + + +.Lbeeu_no_shift_YA: + + movq 48(%rsp),%rax + movq 56(%rsp),%rbx + movq 64(%rsp),%rsi + movq 72(%rsp),%rcx + subq 16(%rsp),%rax + sbbq 24(%rsp),%rbx + sbbq 32(%rsp),%rsi + sbbq 40(%rsp),%rcx + jnc .Lbeeu_B_bigger_than_A + + + movq 16(%rsp),%rax + movq 24(%rsp),%rbx + movq 32(%rsp),%rsi + movq 40(%rsp),%rcx + subq 48(%rsp),%rax + sbbq 56(%rsp),%rbx + sbbq 64(%rsp),%rsi + sbbq 72(%rsp),%rcx + movq %rax,16(%rsp) + movq %rbx,24(%rsp) + movq %rsi,32(%rsp) + movq %rcx,40(%rsp) + + + addq %r8,%r12 + 
adcq %r9,%r13 + adcq %r10,%r14 + adcq %r11,%r15 + adcq %rdi,%rbp + jmp .Lbeeu_loop + +.Lbeeu_B_bigger_than_A: + + movq %rax,48(%rsp) + movq %rbx,56(%rsp) + movq %rsi,64(%rsp) + movq %rcx,72(%rsp) + + + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rbp,%rdi + + jmp .Lbeeu_loop + +.Lbeeu_loop_end: + + + + + movq 16(%rsp),%rbx + subq $1,%rbx + orq 24(%rsp),%rbx + orq 32(%rsp),%rbx + orq 40(%rsp),%rbx + + jnz .Lbeeu_err + + + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + xorq %rdi,%rdi + +.Lbeeu_reduction_loop: + movq %r12,16(%rsp) + movq %r13,24(%rsp) + movq %r14,32(%rsp) + movq %r15,40(%rsp) + movq %rbp,48(%rsp) + + + subq %r8,%r12 + sbbq %r9,%r13 + sbbq %r10,%r14 + sbbq %r11,%r15 + sbbq $0,%rbp + + + cmovcq 16(%rsp),%r12 + cmovcq 24(%rsp),%r13 + cmovcq 32(%rsp),%r14 + cmovcq 40(%rsp),%r15 + jnc .Lbeeu_reduction_loop + + + subq %r12,%r8 + sbbq %r13,%r9 + sbbq %r14,%r10 + sbbq %r15,%r11 + +.Lbeeu_save: + + movq 0(%rsp),%rdi + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + + movq $1,%rax + jmp .Lbeeu_finish + +.Lbeeu_err: + + xorq %rax,%rax + +.Lbeeu_finish: + addq $80,%rsp +.cfi_adjust_cfa_offset -80 + popq %rsi +.cfi_adjust_cfa_offset -8 +.cfi_restore rsi + popq %rbx +.cfi_adjust_cfa_offset -8 +.cfi_restore rbx + popq %r15 +.cfi_adjust_cfa_offset -8 +.cfi_restore r15 + popq %r14 +.cfi_adjust_cfa_offset -8 +.cfi_restore r14 + popq %r13 +.cfi_adjust_cfa_offset -8 +.cfi_restore r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore r12 + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore rbp + .byte 0xf3,0xc3 +.cfi_endproc + +.size beeu_mod_inverse_vartime, .-beeu_mod_inverse_vartime +#endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S index 7c1eeb7211..18d66f6f7f 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text @@ -9,14 +21,15 @@ .type CRYPTO_rdrand,@function .align 16 CRYPTO_rdrand: +.cfi_startproc xorq %rax,%rax - - -.byte 0x48, 0x0f, 0xc7, 0xf1 +.byte 72,15,199,242 adcq %rax,%rax - movq %rcx,0(%rdi) + movq %rdx,0(%rdi) .byte 0xf3,0xc3 +.cfi_endproc +.size CRYPTO_rdrand,.-CRYPTO_rdrand @@ -27,13 +40,12 @@ CRYPTO_rdrand: .type CRYPTO_rdrand_multiple8_buf,@function .align 16 CRYPTO_rdrand_multiple8_buf: +.cfi_startproc testq %rsi,%rsi jz .Lout movq $8,%rdx .Lloop: - - -.byte 0x48, 0x0f, 0xc7, 0xf1 +.byte 72,15,199,241 jnc .Lerr movq %rcx,0(%rdi) addq %rdx,%rdi @@ -45,4 +57,7 @@ CRYPTO_rdrand_multiple8_buf: .Lerr: xorq %rax,%rax .byte 0xf3,0xc3 +.cfi_endproc +.size CRYPTO_rdrand_multiple8_buf,.-CRYPTO_rdrand_multiple8_buf #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rsaz-avx2.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rsaz-avx2.S index bc3440d55c..faccd484b0 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rsaz-avx2.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rsaz-avx2.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl rsaz_1024_sqr_avx2 @@ -77,7 +89,7 @@ rsaz_1024_sqr_avx2: vmovdqu 256-128(%rsi),%ymm8 leaq 192(%rsp),%rbx - vpbroadcastq .Land_mask(%rip),%ymm15 + vmovdqu .Land_mask(%rip),%ymm15 jmp .LOOP_GRANDE_SQR_1024 .align 32 @@ -829,10 +841,10 @@ rsaz_1024_mul_avx2: vpmuludq 192-128(%rcx),%ymm11,%ymm12 vpaddq %ymm12,%ymm6,%ymm6 vpmuludq 224-128(%rcx),%ymm11,%ymm13 - vpblendd $3,%ymm14,%ymm9,%ymm9 + vpblendd $3,%ymm14,%ymm9,%ymm12 vpaddq %ymm13,%ymm7,%ymm7 vpmuludq 256-128(%rcx),%ymm11,%ymm0 - vpaddq %ymm9,%ymm3,%ymm3 + vpaddq %ymm12,%ymm3,%ymm3 vpaddq %ymm0,%ymm8,%ymm8 movq %rbx,%rax @@ -845,7 +857,9 @@ rsaz_1024_mul_avx2: vmovdqu -8+64-128(%rsi),%ymm13 movq %r10,%rax + vpblendd $0xfc,%ymm14,%ymm9,%ymm9 imull %r8d,%eax + vpaddq %ymm9,%ymm4,%ymm4 andl $0x1fffffff,%eax imulq 16-128(%rsi),%rbx @@ -1074,7 +1088,6 @@ rsaz_1024_mul_avx2: decl %r14d jnz .Loop_mul_1024 - vpermq $0,%ymm15,%ymm15 vpaddq (%rsp),%ymm12,%ymm0 vpsrlq $29,%ymm0,%ymm12 @@ -1215,6 +1228,7 @@ rsaz_1024_mul_avx2: .type rsaz_1024_red2norm_avx2,@function .align 32 rsaz_1024_red2norm_avx2: +.cfi_startproc subq $-128,%rsi xorq %rax,%rax movq -128(%rsi),%r8 @@ -1406,6 +1420,7 @@ rsaz_1024_red2norm_avx2: movq %rax,120(%rdi) movq %r11,%rax .byte 0xf3,0xc3 +.cfi_endproc .size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2 .globl rsaz_1024_norm2red_avx2 @@ -1413,6 +1428,7 @@ rsaz_1024_red2norm_avx2: .type rsaz_1024_norm2red_avx2,@function .align 32 rsaz_1024_norm2red_avx2: +.cfi_startproc subq $-128,%rdi movq (%rsi),%r8 movl $0x1fffffff,%eax @@ -1565,12 +1581,14 @@ rsaz_1024_norm2red_avx2: movq %r8,176(%rdi) movq %r8,184(%rdi) .byte 0xf3,0xc3 +.cfi_endproc .size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2 .globl rsaz_1024_scatter5_avx2 .hidden rsaz_1024_scatter5_avx2 .type rsaz_1024_scatter5_avx2,@function .align 
32 rsaz_1024_scatter5_avx2: +.cfi_startproc vzeroupper vmovdqu .Lscatter_permd(%rip),%ymm5 shll $4,%edx @@ -1590,6 +1608,7 @@ rsaz_1024_scatter5_avx2: vzeroupper .byte 0xf3,0xc3 +.cfi_endproc .size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2 .globl rsaz_1024_gather5_avx2 @@ -1714,23 +1733,9 @@ rsaz_1024_gather5_avx2: .cfi_endproc .LSEH_end_rsaz_1024_gather5: .size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2 -.extern OPENSSL_ia32cap_P -.hidden OPENSSL_ia32cap_P -.globl rsaz_avx2_eligible -.hidden rsaz_avx2_eligible -.type rsaz_avx2_eligible,@function -.align 32 -rsaz_avx2_eligible: - leaq OPENSSL_ia32cap_P(%rip),%rax - movl 8(%rax),%eax - andl $32,%eax - shrl $5,%eax - .byte 0xf3,0xc3 -.size rsaz_avx2_eligible,.-rsaz_avx2_eligible - .align 64 .Land_mask: -.quad 0x1fffffff,0x1fffffff,0x1fffffff,-1 +.quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff .Lscatter_permd: .long 0,2,4,6,7,7,7,7 .Lgather_permd: @@ -1741,3 +1746,4 @@ rsaz_avx2_eligible: .long 4,4,4,4, 4,4,4,4 .align 64 #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha1-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha1-x86_64.S index 7f924dcc1e..a4ce81ff91 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha1-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha1-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .extern OPENSSL_ia32cap_P .hidden OPENSSL_ia32cap_P @@ -8,6 +20,7 @@ .type sha1_block_data_order,@function .align 16 sha1_block_data_order: +.cfi_startproc leaq OPENSSL_ia32cap_P(%rip),%r10 movl 0(%r10),%r9d movl 4(%r10),%r8d @@ -24,17 +37,24 @@ sha1_block_data_order: .align 16 .Lialu: movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 movq %rdi,%r8 subq $72,%rsp movq %rsi,%r9 andq $-64,%rsp movq %rdx,%r10 movq %rax,64(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xc0,0x00,0x06,0x23,0x08 .Lprologue: movl 0(%r8),%esi @@ -1229,25 +1249,40 @@ sha1_block_data_order: jnz .Lloop movq 64(%rsp),%rsi +.cfi_def_cfa %rsi,8 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue: .byte 0xf3,0xc3 +.cfi_endproc .size sha1_block_data_order,.-sha1_block_data_order .type sha1_block_data_order_ssse3,@function .align 16 sha1_block_data_order_ssse3: _ssse3_shortcut: +.cfi_startproc movq %rsp,%r11 +.cfi_def_cfa_register %r11 pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 leaq -64(%rsp),%rsp andq $-64,%rsp movq %rdi,%r8 @@ -2404,24 +2439,38 @@ _ssse3_shortcut: movl %edx,12(%r8) movl %ebp,16(%r8) movq -40(%r11),%r14 +.cfi_restore %r14 movq -32(%r11),%r13 +.cfi_restore %r13 movq -24(%r11),%r12 +.cfi_restore %r12 movq -16(%r11),%rbp +.cfi_restore %rbp movq -8(%r11),%rbx +.cfi_restore %rbx leaq (%r11),%rsp 
+.cfi_def_cfa_register %rsp .Lepilogue_ssse3: .byte 0xf3,0xc3 +.cfi_endproc .size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 .type sha1_block_data_order_avx,@function .align 16 sha1_block_data_order_avx: _avx_shortcut: +.cfi_startproc movq %rsp,%r11 +.cfi_def_cfa_register %r11 pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 leaq -64(%rsp),%rsp vzeroupper andq $-64,%rsp @@ -3518,13 +3567,20 @@ _avx_shortcut: movl %edx,12(%r8) movl %ebp,16(%r8) movq -40(%r11),%r14 +.cfi_restore %r14 movq -32(%r11),%r13 +.cfi_restore %r13 movq -24(%r11),%r12 +.cfi_restore %r12 movq -16(%r11),%rbp +.cfi_restore %rbp movq -8(%r11),%rbx +.cfi_restore %rbx leaq (%r11),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_avx: .byte 0xf3,0xc3 +.cfi_endproc .size sha1_block_data_order_avx,.-sha1_block_data_order_avx .align 64 K_XX_XX: @@ -3542,3 +3598,4 @@ K_XX_XX: .byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S index 62534be495..0bacd6a4a8 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .extern OPENSSL_ia32cap_P @@ -8,6 +20,7 @@ .type sha256_block_data_order,@function .align 16 sha256_block_data_order: +.cfi_startproc leaq OPENSSL_ia32cap_P(%rip),%r11 movl 0(%r11),%r9d movl 4(%r11),%r10d @@ -20,12 +33,19 @@ sha256_block_data_order: testl $512,%r10d jnz .Lssse3_shortcut movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 shlq $4,%rdx subq $64+32,%rsp leaq (%rsi,%rdx,4),%rdx @@ -33,7 +53,8 @@ sha256_block_data_order: movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movq %rax,64+24(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 .Lprologue: movl 0(%rdi),%eax @@ -1697,16 +1718,25 @@ sha256_block_data_order: movl %r11d,28(%rdi) jb .Lloop - movq 64+24(%rsp),%rsi + movq 88(%rsp),%rsi +.cfi_def_cfa %rsi,8 movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue: .byte 0xf3,0xc3 +.cfi_endproc .size sha256_block_data_order,.-sha256_block_data_order .align 64 .type K256,@object @@ -1754,14 +1784,22 @@ K256: .type sha256_block_data_order_ssse3,@function .align 64 sha256_block_data_order_ssse3: +.cfi_startproc .Lssse3_shortcut: movq %rsp,%rax 
+.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 shlq $4,%rdx subq $96,%rsp leaq (%rsi,%rdx,4),%rdx @@ -1769,7 +1807,8 @@ sha256_block_data_order_ssse3: movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movq %rax,64+24(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 .Lprologue_ssse3: movl 0(%rdi),%eax @@ -2835,28 +2874,45 @@ sha256_block_data_order_ssse3: movl %r11d,28(%rdi) jb .Lloop_ssse3 - movq 64+24(%rsp),%rsi + movq 88(%rsp),%rsi +.cfi_def_cfa %rsi,8 movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_ssse3: .byte 0xf3,0xc3 +.cfi_endproc .size sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3 .type sha256_block_data_order_avx,@function .align 64 sha256_block_data_order_avx: +.cfi_startproc .Lavx_shortcut: movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 shlq $4,%rdx subq $96,%rsp leaq (%rsi,%rdx,4),%rdx @@ -2864,7 +2920,8 @@ sha256_block_data_order_avx: movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movq %rax,64+24(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 .Lprologue_avx: vzeroupper @@ -3891,16 +3948,26 @@ sha256_block_data_order_avx: movl %r11d,28(%rdi) jb .Lloop_avx - movq 64+24(%rsp),%rsi + movq 88(%rsp),%rsi +.cfi_def_cfa %rsi,8 vzeroupper movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_avx: .byte 0xf3,0xc3 +.cfi_endproc .size sha256_block_data_order_avx,.-sha256_block_data_order_avx #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S index 1f1793bb0f..afc47f139b 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .extern OPENSSL_ia32cap_P @@ -8,24 +20,30 @@ .type sha512_block_data_order,@function .align 16 sha512_block_data_order: +.cfi_startproc leaq OPENSSL_ia32cap_P(%rip),%r11 movl 0(%r11),%r9d movl 4(%r11),%r10d movl 8(%r11),%r11d - testl $2048,%r10d - jnz .Lxop_shortcut andl $1073741824,%r9d andl $268435968,%r10d orl %r9d,%r10d cmpl $1342177792,%r10d je .Lavx_shortcut movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 shlq $4,%rdx subq $128+32,%rsp leaq (%rsi,%rdx,8),%rdx @@ -33,7 +51,8 @@ sha512_block_data_order: movq %rdi,128+0(%rsp) movq %rsi,128+8(%rsp) movq %rdx,128+16(%rsp) - movq %rax,128+24(%rsp) + movq %rax,152(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 .Lprologue: movq 0(%rdi),%rax @@ -1697,16 +1716,25 @@ sha512_block_data_order: movq %r11,56(%rdi) jb .Lloop - movq 128+24(%rsp),%rsi + movq 152(%rsp),%rsi +.cfi_def_cfa %rsi,8 movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue: .byte 0xf3,0xc3 +.cfi_endproc .size sha512_block_data_order,.-sha512_block_data_order .align 64 .type K512,@object @@ -1795,1100 +1823,25 @@ K512: .quad 0x0001020304050607,0x08090a0b0c0d0e0f .quad 0x0001020304050607,0x08090a0b0c0d0e0f .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.type sha512_block_data_order_xop,@function -.align 64 -sha512_block_data_order_xop: -.Lxop_shortcut: - movq %rsp,%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - shlq $4,%rdx - subq $160,%rsp - leaq (%rsi,%rdx,8),%rdx - andq $-64,%rsp - movq %rdi,128+0(%rsp) - movq %rsi,128+8(%rsp) - movq %rdx,128+16(%rsp) - movq %rax,128+24(%rsp) -.Lprologue_xop: - - vzeroupper - movq 0(%rdi),%rax - movq 8(%rdi),%rbx - movq 16(%rdi),%rcx - movq 24(%rdi),%rdx - movq 32(%rdi),%r8 - movq 40(%rdi),%r9 - movq 48(%rdi),%r10 - movq 56(%rdi),%r11 - jmp .Lloop_xop -.align 16 -.Lloop_xop: - vmovdqa K512+1280(%rip),%xmm11 - vmovdqu 0(%rsi),%xmm0 - leaq K512+128(%rip),%rbp - vmovdqu 16(%rsi),%xmm1 - vmovdqu 32(%rsi),%xmm2 - vpshufb %xmm11,%xmm0,%xmm0 - vmovdqu 48(%rsi),%xmm3 - vpshufb %xmm11,%xmm1,%xmm1 - vmovdqu 64(%rsi),%xmm4 - vpshufb %xmm11,%xmm2,%xmm2 - vmovdqu 80(%rsi),%xmm5 - vpshufb %xmm11,%xmm3,%xmm3 - vmovdqu 96(%rsi),%xmm6 - vpshufb %xmm11,%xmm4,%xmm4 - vmovdqu 112(%rsi),%xmm7 - vpshufb %xmm11,%xmm5,%xmm5 - vpaddq -128(%rbp),%xmm0,%xmm8 - vpshufb %xmm11,%xmm6,%xmm6 - vpaddq -96(%rbp),%xmm1,%xmm9 - vpshufb %xmm11,%xmm7,%xmm7 - vpaddq -64(%rbp),%xmm2,%xmm10 - vpaddq -32(%rbp),%xmm3,%xmm11 - vmovdqa %xmm8,0(%rsp) - vpaddq 0(%rbp),%xmm4,%xmm8 - vmovdqa %xmm9,16(%rsp) - vpaddq 32(%rbp),%xmm5,%xmm9 - vmovdqa %xmm10,32(%rsp) - vpaddq 64(%rbp),%xmm6,%xmm10 - vmovdqa %xmm11,48(%rsp) - vpaddq 96(%rbp),%xmm7,%xmm11 - vmovdqa %xmm8,64(%rsp) - movq %rax,%r14 - 
vmovdqa %xmm9,80(%rsp) - movq %rbx,%rdi - vmovdqa %xmm10,96(%rsp) - xorq %rcx,%rdi - vmovdqa %xmm11,112(%rsp) - movq %r8,%r13 - jmp .Lxop_00_47 - -.align 16 -.Lxop_00_47: - addq $256,%rbp - vpalignr $8,%xmm0,%xmm1,%xmm8 - rorq $23,%r13 - movq %r14,%rax - vpalignr $8,%xmm4,%xmm5,%xmm11 - movq %r9,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r8,%r13 - xorq %r10,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rax,%r14 - vpaddq %xmm11,%xmm0,%xmm0 - andq %r8,%r12 - xorq %r8,%r13 - addq 0(%rsp),%r11 - movq %rax,%r15 -.byte 143,72,120,195,209,7 - xorq %r10,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,223,3 - xorq %rax,%r14 - addq %r13,%r11 - vpxor %xmm10,%xmm8,%xmm8 - xorq %rbx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm7,%xmm10 - addq %r11,%rdx - addq %rdi,%r11 - vpaddq %xmm8,%xmm0,%xmm0 - movq %rdx,%r13 - addq %r11,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r11 - vpxor %xmm10,%xmm11,%xmm11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - vpaddq %xmm11,%xmm0,%xmm0 - addq 8(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - vpaddq -128(%rbp),%xmm0,%xmm10 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - vmovdqa %xmm10,0(%rsp) - vpalignr $8,%xmm1,%xmm2,%xmm8 - rorq $23,%r13 - movq %r14,%r10 - vpalignr $8,%xmm5,%xmm6,%xmm11 - movq %rdx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rcx,%r13 - xorq %r8,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r10,%r14 - vpaddq %xmm11,%xmm1,%xmm1 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 16(%rsp),%r9 - movq %r10,%r15 -.byte 143,72,120,195,209,7 - xorq %r8,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,216,3 - xorq %r10,%r14 - addq %r13,%r9 - vpxor %xmm10,%xmm8,%xmm8 - xorq %r11,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm0,%xmm10 - addq %r9,%rbx - addq %rdi,%r9 - vpaddq %xmm8,%xmm1,%xmm1 - movq %rbx,%r13 - addq %r9,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r9 - vpxor %xmm10,%xmm11,%xmm11 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - vpaddq %xmm11,%xmm1,%xmm1 - addq 24(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - vpaddq -96(%rbp),%xmm1,%xmm10 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - vmovdqa %xmm10,16(%rsp) - vpalignr $8,%xmm2,%xmm3,%xmm8 - rorq $23,%r13 - movq %r14,%r8 - vpalignr $8,%xmm6,%xmm7,%xmm11 - movq %rbx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rax,%r13 - xorq %rcx,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r8,%r14 - vpaddq %xmm11,%xmm2,%xmm2 - andq %rax,%r12 - xorq %rax,%r13 - addq 32(%rsp),%rdx - movq %r8,%r15 -.byte 143,72,120,195,209,7 - xorq %rcx,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,217,3 - xorq %r8,%r14 - addq %r13,%rdx - vpxor %xmm10,%xmm8,%xmm8 - xorq %r9,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm1,%xmm10 - addq %rdx,%r11 - addq %rdi,%rdx - 
vpaddq %xmm8,%xmm2,%xmm2 - movq %r11,%r13 - addq %rdx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rdx - vpxor %xmm10,%xmm11,%xmm11 - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - vpaddq %xmm11,%xmm2,%xmm2 - addq 40(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - vpaddq -64(%rbp),%xmm2,%xmm10 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - vmovdqa %xmm10,32(%rsp) - vpalignr $8,%xmm3,%xmm4,%xmm8 - rorq $23,%r13 - movq %r14,%rcx - vpalignr $8,%xmm7,%xmm0,%xmm11 - movq %r11,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r10,%r13 - xorq %rax,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rcx,%r14 - vpaddq %xmm11,%xmm3,%xmm3 - andq %r10,%r12 - xorq %r10,%r13 - addq 48(%rsp),%rbx - movq %rcx,%r15 -.byte 143,72,120,195,209,7 - xorq %rax,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,218,3 - xorq %rcx,%r14 - addq %r13,%rbx - vpxor %xmm10,%xmm8,%xmm8 - xorq %rdx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm2,%xmm10 - addq %rbx,%r9 - addq %rdi,%rbx - vpaddq %xmm8,%xmm3,%xmm3 - movq %r9,%r13 - addq %rbx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rbx - vpxor %xmm10,%xmm11,%xmm11 - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - vpaddq %xmm11,%xmm3,%xmm3 - addq 56(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - rorq $6,%r14 - vpaddq -32(%rbp),%xmm3,%xmm10 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - vmovdqa %xmm10,48(%rsp) - vpalignr $8,%xmm4,%xmm5,%xmm8 - rorq $23,%r13 - movq %r14,%rax - vpalignr $8,%xmm0,%xmm1,%xmm11 - movq %r9,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r8,%r13 - xorq %r10,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rax,%r14 - vpaddq %xmm11,%xmm4,%xmm4 - andq %r8,%r12 - xorq %r8,%r13 - addq 64(%rsp),%r11 - movq %rax,%r15 -.byte 143,72,120,195,209,7 - xorq %r10,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,219,3 - xorq %rax,%r14 - addq %r13,%r11 - vpxor %xmm10,%xmm8,%xmm8 - xorq %rbx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm3,%xmm10 - addq %r11,%rdx - addq %rdi,%r11 - vpaddq %xmm8,%xmm4,%xmm4 - movq %rdx,%r13 - addq %r11,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r11 - vpxor %xmm10,%xmm11,%xmm11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - vpaddq %xmm11,%xmm4,%xmm4 - addq 72(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - vpaddq 0(%rbp),%xmm4,%xmm10 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - vmovdqa %xmm10,64(%rsp) - vpalignr $8,%xmm5,%xmm6,%xmm8 - rorq $23,%r13 - movq %r14,%r10 - vpalignr $8,%xmm1,%xmm2,%xmm11 - movq %rdx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rcx,%r13 - xorq %r8,%r12 - 
vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r10,%r14 - vpaddq %xmm11,%xmm5,%xmm5 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 80(%rsp),%r9 - movq %r10,%r15 -.byte 143,72,120,195,209,7 - xorq %r8,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,220,3 - xorq %r10,%r14 - addq %r13,%r9 - vpxor %xmm10,%xmm8,%xmm8 - xorq %r11,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm4,%xmm10 - addq %r9,%rbx - addq %rdi,%r9 - vpaddq %xmm8,%xmm5,%xmm5 - movq %rbx,%r13 - addq %r9,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r9 - vpxor %xmm10,%xmm11,%xmm11 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - vpaddq %xmm11,%xmm5,%xmm5 - addq 88(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - vpaddq 32(%rbp),%xmm5,%xmm10 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - vmovdqa %xmm10,80(%rsp) - vpalignr $8,%xmm6,%xmm7,%xmm8 - rorq $23,%r13 - movq %r14,%r8 - vpalignr $8,%xmm2,%xmm3,%xmm11 - movq %rbx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rax,%r13 - xorq %rcx,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r8,%r14 - vpaddq %xmm11,%xmm6,%xmm6 - andq %rax,%r12 - xorq %rax,%r13 - addq 96(%rsp),%rdx - movq %r8,%r15 -.byte 143,72,120,195,209,7 - xorq %rcx,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,221,3 - xorq %r8,%r14 - addq %r13,%rdx - vpxor %xmm10,%xmm8,%xmm8 - xorq %r9,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm5,%xmm10 - addq %rdx,%r11 - addq %rdi,%rdx - vpaddq %xmm8,%xmm6,%xmm6 - movq %r11,%r13 - addq %rdx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rdx - vpxor %xmm10,%xmm11,%xmm11 - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - vpaddq %xmm11,%xmm6,%xmm6 - addq 104(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - vpaddq 64(%rbp),%xmm6,%xmm10 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - vmovdqa %xmm10,96(%rsp) - vpalignr $8,%xmm7,%xmm0,%xmm8 - rorq $23,%r13 - movq %r14,%rcx - vpalignr $8,%xmm3,%xmm4,%xmm11 - movq %r11,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r10,%r13 - xorq %rax,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rcx,%r14 - vpaddq %xmm11,%xmm7,%xmm7 - andq %r10,%r12 - xorq %r10,%r13 - addq 112(%rsp),%rbx - movq %rcx,%r15 -.byte 143,72,120,195,209,7 - xorq %rax,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,222,3 - xorq %rcx,%r14 - addq %r13,%rbx - vpxor %xmm10,%xmm8,%xmm8 - xorq %rdx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm6,%xmm10 - addq %rbx,%r9 - addq %rdi,%rbx - vpaddq %xmm8,%xmm7,%xmm7 - movq %r9,%r13 - addq %rbx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rbx - vpxor %xmm10,%xmm11,%xmm11 - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - vpaddq %xmm11,%xmm7,%xmm7 - addq 120(%rsp),%rax - movq %rbx,%rdi - xorq 
%r11,%r12 - rorq $6,%r14 - vpaddq 96(%rbp),%xmm7,%xmm10 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - vmovdqa %xmm10,112(%rsp) - cmpb $0,135(%rbp) - jne .Lxop_00_47 - rorq $23,%r13 - movq %r14,%rax - movq %r9,%r12 - rorq $5,%r14 - xorq %r8,%r13 - xorq %r10,%r12 - rorq $4,%r13 - xorq %rax,%r14 - andq %r8,%r12 - xorq %r8,%r13 - addq 0(%rsp),%r11 - movq %rax,%r15 - xorq %r10,%r12 - rorq $6,%r14 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi - xorq %rax,%r14 - addq %r13,%r11 - xorq %rbx,%rdi - rorq $28,%r14 - addq %r11,%rdx - addq %rdi,%r11 - movq %rdx,%r13 - addq %r11,%r14 - rorq $23,%r13 - movq %r14,%r11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - addq 8(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - rorq $23,%r13 - movq %r14,%r10 - movq %rdx,%r12 - rorq $5,%r14 - xorq %rcx,%r13 - xorq %r8,%r12 - rorq $4,%r13 - xorq %r10,%r14 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 16(%rsp),%r9 - movq %r10,%r15 - xorq %r8,%r12 - rorq $6,%r14 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi - xorq %r10,%r14 - addq %r13,%r9 - xorq %r11,%rdi - rorq $28,%r14 - addq %r9,%rbx - addq %rdi,%r9 - movq %rbx,%r13 - addq %r9,%r14 - rorq $23,%r13 - movq %r14,%r9 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - addq 24(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - rorq $23,%r13 - movq %r14,%r8 - movq %rbx,%r12 - rorq $5,%r14 - xorq %rax,%r13 - xorq %rcx,%r12 - rorq $4,%r13 - xorq %r8,%r14 - andq %rax,%r12 - xorq %rax,%r13 - addq 32(%rsp),%rdx - movq %r8,%r15 - xorq %rcx,%r12 - rorq $6,%r14 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi - xorq %r8,%r14 - addq %r13,%rdx - xorq %r9,%rdi - rorq $28,%r14 - addq %rdx,%r11 - addq %rdi,%rdx - movq %r11,%r13 - addq %rdx,%r14 - rorq $23,%r13 - movq %r14,%rdx - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - addq 40(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - rorq $23,%r13 - movq %r14,%rcx - movq %r11,%r12 - rorq $5,%r14 - xorq %r10,%r13 - xorq %rax,%r12 - rorq $4,%r13 - xorq %rcx,%r14 - andq %r10,%r12 - xorq %r10,%r13 - addq 48(%rsp),%rbx - movq %rcx,%r15 - xorq %rax,%r12 - rorq $6,%r14 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi - xorq %rcx,%r14 - addq %r13,%rbx - xorq %rdx,%rdi - rorq $28,%r14 - addq %rbx,%r9 - addq %rdi,%rbx - movq %r9,%r13 - addq %rbx,%r14 - rorq $23,%r13 - movq %r14,%rbx - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - addq 56(%rsp),%rax - movq %rbx,%rdi - xorq 
%r11,%r12 - rorq $6,%r14 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - rorq $23,%r13 - movq %r14,%rax - movq %r9,%r12 - rorq $5,%r14 - xorq %r8,%r13 - xorq %r10,%r12 - rorq $4,%r13 - xorq %rax,%r14 - andq %r8,%r12 - xorq %r8,%r13 - addq 64(%rsp),%r11 - movq %rax,%r15 - xorq %r10,%r12 - rorq $6,%r14 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi - xorq %rax,%r14 - addq %r13,%r11 - xorq %rbx,%rdi - rorq $28,%r14 - addq %r11,%rdx - addq %rdi,%r11 - movq %rdx,%r13 - addq %r11,%r14 - rorq $23,%r13 - movq %r14,%r11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - addq 72(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - rorq $23,%r13 - movq %r14,%r10 - movq %rdx,%r12 - rorq $5,%r14 - xorq %rcx,%r13 - xorq %r8,%r12 - rorq $4,%r13 - xorq %r10,%r14 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 80(%rsp),%r9 - movq %r10,%r15 - xorq %r8,%r12 - rorq $6,%r14 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi - xorq %r10,%r14 - addq %r13,%r9 - xorq %r11,%rdi - rorq $28,%r14 - addq %r9,%rbx - addq %rdi,%r9 - movq %rbx,%r13 - addq %r9,%r14 - rorq $23,%r13 - movq %r14,%r9 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - addq 88(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - rorq $23,%r13 - movq %r14,%r8 - movq %rbx,%r12 - rorq $5,%r14 - xorq %rax,%r13 - xorq %rcx,%r12 - rorq $4,%r13 - xorq %r8,%r14 - andq %rax,%r12 - xorq %rax,%r13 - addq 96(%rsp),%rdx - movq %r8,%r15 - xorq %rcx,%r12 - rorq $6,%r14 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi - xorq %r8,%r14 - addq %r13,%rdx - xorq %r9,%rdi - rorq $28,%r14 - addq %rdx,%r11 - addq %rdi,%rdx - movq %r11,%r13 - addq %rdx,%r14 - rorq $23,%r13 - movq %r14,%rdx - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - addq 104(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - rorq $23,%r13 - movq %r14,%rcx - movq %r11,%r12 - rorq $5,%r14 - xorq %r10,%r13 - xorq %rax,%r12 - rorq $4,%r13 - xorq %rcx,%r14 - andq %r10,%r12 - xorq %r10,%r13 - addq 112(%rsp),%rbx - movq %rcx,%r15 - xorq %rax,%r12 - rorq $6,%r14 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi - xorq %rcx,%r14 - addq %r13,%rbx - xorq %rdx,%rdi - rorq $28,%r14 - addq %rbx,%r9 - addq %rdi,%rbx - movq %r9,%r13 - addq %rbx,%r14 - rorq $23,%r13 - movq %r14,%rbx - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - addq 120(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - rorq $6,%r14 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - 
xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - movq 128+0(%rsp),%rdi - movq %r14,%rax - - addq 0(%rdi),%rax - leaq 128(%rsi),%rsi - addq 8(%rdi),%rbx - addq 16(%rdi),%rcx - addq 24(%rdi),%rdx - addq 32(%rdi),%r8 - addq 40(%rdi),%r9 - addq 48(%rdi),%r10 - addq 56(%rdi),%r11 - - cmpq 128+16(%rsp),%rsi - - movq %rax,0(%rdi) - movq %rbx,8(%rdi) - movq %rcx,16(%rdi) - movq %rdx,24(%rdi) - movq %r8,32(%rdi) - movq %r9,40(%rdi) - movq %r10,48(%rdi) - movq %r11,56(%rdi) - jb .Lloop_xop - - movq 128+24(%rsp),%rsi - vzeroupper - movq -48(%rsi),%r15 - movq -40(%rsi),%r14 - movq -32(%rsi),%r13 - movq -24(%rsi),%r12 - movq -16(%rsi),%rbp - movq -8(%rsi),%rbx - leaq (%rsi),%rsp -.Lepilogue_xop: - .byte 0xf3,0xc3 -.size sha512_block_data_order_xop,.-sha512_block_data_order_xop .type sha512_block_data_order_avx,@function .align 64 sha512_block_data_order_avx: +.cfi_startproc .Lavx_shortcut: movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 shlq $4,%rdx subq $160,%rsp leaq (%rsi,%rdx,8),%rdx @@ -2896,7 +1849,8 @@ sha512_block_data_order_avx: movq %rdi,128+0(%rsp) movq %rsi,128+8(%rsp) movq %rdx,128+16(%rsp) - movq %rax,128+24(%rsp) + movq %rax,152(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 .Lprologue_avx: vzeroupper @@ -4013,16 +2967,26 @@ sha512_block_data_order_avx: movq %r11,56(%rdi) jb .Lloop_avx - movq 128+24(%rsp),%rsi + movq 152(%rsp),%rsi +.cfi_def_cfa %rsi,8 vzeroupper movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_avx: .byte 0xf3,0xc3 +.cfi_endproc .size sha512_block_data_order_avx,.-sha512_block_data_order_avx #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S index f3a089de9c..27a34617a3 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text @@ -19,6 +31,7 @@ .type _vpaes_encrypt_core,@function .align 16 _vpaes_encrypt_core: +.cfi_startproc movq %rdx,%r9 movq $16,%r11 movl 240(%rdx),%eax @@ -99,6 +112,7 @@ _vpaes_encrypt_core: pxor %xmm4,%xmm0 .byte 102,15,56,0,193 .byte 0xf3,0xc3 +.cfi_endproc .size _vpaes_encrypt_core,.-_vpaes_encrypt_core @@ -106,9 +120,185 @@ _vpaes_encrypt_core: + + + + + + + + + + + + + + + + + + + + + + + + +.type _vpaes_encrypt_core_2x,@function +.align 16 +_vpaes_encrypt_core_2x: +.cfi_startproc + movq %rdx,%r9 + movq $16,%r11 + movl 240(%rdx),%eax + movdqa %xmm9,%xmm1 + movdqa %xmm9,%xmm7 + movdqa .Lk_ipt(%rip),%xmm2 + movdqa %xmm2,%xmm8 + pandn %xmm0,%xmm1 + pandn %xmm6,%xmm7 + movdqu (%r9),%xmm5 + + psrld $4,%xmm1 + psrld $4,%xmm7 + pand %xmm9,%xmm0 + pand %xmm9,%xmm6 +.byte 102,15,56,0,208 +.byte 102,68,15,56,0,198 + movdqa .Lk_ipt+16(%rip),%xmm0 + movdqa %xmm0,%xmm6 +.byte 102,15,56,0,193 +.byte 102,15,56,0,247 + pxor %xmm5,%xmm2 + pxor %xmm5,%xmm8 + addq $16,%r9 + pxor %xmm2,%xmm0 + pxor %xmm8,%xmm6 + leaq .Lk_mc_backward(%rip),%r10 + jmp .Lenc2x_entry + +.align 16 +.Lenc2x_loop: + + movdqa .Lk_sb1(%rip),%xmm4 + movdqa .Lk_sb1+16(%rip),%xmm0 + movdqa %xmm4,%xmm12 + movdqa %xmm0,%xmm6 +.byte 102,15,56,0,226 +.byte 102,69,15,56,0,224 +.byte 102,15,56,0,195 +.byte 102,65,15,56,0,243 + pxor %xmm5,%xmm4 + pxor %xmm5,%xmm12 + movdqa .Lk_sb2(%rip),%xmm5 + movdqa %xmm5,%xmm13 + pxor %xmm4,%xmm0 + pxor %xmm12,%xmm6 + movdqa -64(%r11,%r10,1),%xmm1 + +.byte 102,15,56,0,234 +.byte 102,69,15,56,0,232 + movdqa (%r11,%r10,1),%xmm4 + + movdqa .Lk_sb2+16(%rip),%xmm2 + movdqa %xmm2,%xmm8 +.byte 102,15,56,0,211 +.byte 102,69,15,56,0,195 + movdqa %xmm0,%xmm3 + movdqa %xmm6,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm13,%xmm8 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + addq $16,%r9 + pxor %xmm2,%xmm0 + pxor %xmm8,%xmm6 +.byte 102,15,56,0,220 +.byte 102,68,15,56,0,220 + addq $16,%r11 + pxor %xmm0,%xmm3 + pxor %xmm6,%xmm11 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + andq $0x30,%r11 + subq $1,%rax + pxor %xmm3,%xmm0 + pxor %xmm11,%xmm6 + +.Lenc2x_entry: + + movdqa %xmm9,%xmm1 + movdqa %xmm9,%xmm7 + movdqa .Lk_inv+16(%rip),%xmm5 + movdqa %xmm5,%xmm13 + pandn %xmm0,%xmm1 + pandn %xmm6,%xmm7 + psrld $4,%xmm1 + psrld $4,%xmm7 + pand %xmm9,%xmm0 + pand %xmm9,%xmm6 +.byte 102,15,56,0,232 +.byte 102,68,15,56,0,238 + movdqa %xmm10,%xmm3 + movdqa %xmm10,%xmm11 + pxor %xmm1,%xmm0 + pxor %xmm7,%xmm6 +.byte 102,15,56,0,217 +.byte 102,68,15,56,0,223 + movdqa %xmm10,%xmm4 + movdqa %xmm10,%xmm12 + pxor %xmm5,%xmm3 + pxor %xmm13,%xmm11 +.byte 102,15,56,0,224 +.byte 102,68,15,56,0,230 + movdqa %xmm10,%xmm2 + movdqa %xmm10,%xmm8 + pxor %xmm5,%xmm4 + pxor %xmm13,%xmm12 +.byte 102,15,56,0,211 +.byte 102,69,15,56,0,195 + movdqa %xmm10,%xmm3 + movdqa %xmm10,%xmm11 + pxor %xmm0,%xmm2 + pxor %xmm6,%xmm8 +.byte 102,15,56,0,220 +.byte 102,69,15,56,0,220 + movdqu (%r9),%xmm5 + + pxor %xmm1,%xmm3 + pxor %xmm7,%xmm11 + jnz .Lenc2x_loop + + + movdqa -96(%r10),%xmm4 + movdqa -80(%r10),%xmm0 + movdqa %xmm4,%xmm12 + movdqa %xmm0,%xmm6 +.byte 102,15,56,0,226 +.byte 102,69,15,56,0,224 + pxor %xmm5,%xmm4 + pxor %xmm5,%xmm12 +.byte 102,15,56,0,195 +.byte 102,65,15,56,0,243 + movdqa 64(%r11,%r10,1),%xmm1 + + pxor %xmm4,%xmm0 + pxor %xmm12,%xmm6 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + .byte 0xf3,0xc3 
+.cfi_endproc +.size _vpaes_encrypt_core_2x,.-_vpaes_encrypt_core_2x + + + + + + .type _vpaes_decrypt_core,@function .align 16 _vpaes_decrypt_core: +.cfi_startproc movq %rdx,%r9 movl 240(%rdx),%eax movdqa %xmm9,%xmm1 @@ -205,6 +395,7 @@ _vpaes_decrypt_core: pxor %xmm4,%xmm0 .byte 102,15,56,0,194 .byte 0xf3,0xc3 +.cfi_endproc .size _vpaes_decrypt_core,.-_vpaes_decrypt_core @@ -215,6 +406,7 @@ _vpaes_decrypt_core: .type _vpaes_schedule_core,@function .align 16 _vpaes_schedule_core: +.cfi_startproc @@ -381,6 +573,7 @@ _vpaes_schedule_core: pxor %xmm6,%xmm6 pxor %xmm7,%xmm7 .byte 0xf3,0xc3 +.cfi_endproc .size _vpaes_schedule_core,.-_vpaes_schedule_core @@ -400,6 +593,7 @@ _vpaes_schedule_core: .type _vpaes_schedule_192_smear,@function .align 16 _vpaes_schedule_192_smear: +.cfi_startproc pshufd $0x80,%xmm6,%xmm1 pshufd $0xFE,%xmm7,%xmm0 pxor %xmm1,%xmm6 @@ -408,6 +602,7 @@ _vpaes_schedule_192_smear: movdqa %xmm6,%xmm0 movhlps %xmm1,%xmm6 .byte 0xf3,0xc3 +.cfi_endproc .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear @@ -431,6 +626,7 @@ _vpaes_schedule_192_smear: .type _vpaes_schedule_round,@function .align 16 _vpaes_schedule_round: +.cfi_startproc pxor %xmm1,%xmm1 .byte 102,65,15,58,15,200,15 @@ -484,6 +680,7 @@ _vpaes_schedule_low_round: pxor %xmm7,%xmm0 movdqa %xmm0,%xmm7 .byte 0xf3,0xc3 +.cfi_endproc .size _vpaes_schedule_round,.-_vpaes_schedule_round @@ -498,6 +695,7 @@ _vpaes_schedule_low_round: .type _vpaes_schedule_transform,@function .align 16 _vpaes_schedule_transform: +.cfi_startproc movdqa %xmm9,%xmm1 pandn %xmm0,%xmm1 psrld $4,%xmm1 @@ -508,6 +706,7 @@ _vpaes_schedule_transform: .byte 102,15,56,0,193 pxor %xmm2,%xmm0 .byte 0xf3,0xc3 +.cfi_endproc .size _vpaes_schedule_transform,.-_vpaes_schedule_transform @@ -536,6 +735,7 @@ _vpaes_schedule_transform: .type _vpaes_schedule_mangle,@function .align 16 _vpaes_schedule_mangle: +.cfi_startproc movdqa %xmm0,%xmm4 movdqa .Lk_mc_forward(%rip),%xmm5 testq %rcx,%rcx @@ -600,6 +800,7 @@ _vpaes_schedule_mangle: andq $0x30,%r8 movdqu %xmm3,(%rdx) .byte 0xf3,0xc3 +.cfi_endproc .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle @@ -610,6 +811,13 @@ _vpaes_schedule_mangle: .type vpaes_set_encrypt_key,@function .align 16 vpaes_set_encrypt_key: +.cfi_startproc +#ifdef BORINGSSL_DISPATCH_TEST +.extern BORINGSSL_function_hit +.hidden BORINGSSL_function_hit + movb $1,BORINGSSL_function_hit+5(%rip) +#endif + movl %esi,%eax shrl $5,%eax addl $5,%eax @@ -620,6 +828,7 @@ vpaes_set_encrypt_key: call _vpaes_schedule_core xorl %eax,%eax .byte 0xf3,0xc3 +.cfi_endproc .size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key .globl vpaes_set_decrypt_key @@ -627,6 +836,7 @@ vpaes_set_encrypt_key: .type vpaes_set_decrypt_key,@function .align 16 vpaes_set_decrypt_key: +.cfi_startproc movl %esi,%eax shrl $5,%eax addl $5,%eax @@ -642,6 +852,7 @@ vpaes_set_decrypt_key: call _vpaes_schedule_core xorl %eax,%eax .byte 0xf3,0xc3 +.cfi_endproc .size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key .globl vpaes_encrypt @@ -649,11 +860,18 @@ vpaes_set_decrypt_key: .type vpaes_encrypt,@function .align 16 vpaes_encrypt: +.cfi_startproc +#ifdef BORINGSSL_DISPATCH_TEST +.extern BORINGSSL_function_hit +.hidden BORINGSSL_function_hit + movb $1,BORINGSSL_function_hit+4(%rip) +#endif movdqu (%rdi),%xmm0 call _vpaes_preheat call _vpaes_encrypt_core movdqu %xmm0,(%rsi) .byte 0xf3,0xc3 +.cfi_endproc .size vpaes_encrypt,.-vpaes_encrypt .globl vpaes_decrypt @@ -661,17 +879,20 @@ vpaes_encrypt: .type vpaes_decrypt,@function .align 16 vpaes_decrypt: +.cfi_startproc movdqu (%rdi),%xmm0 
call _vpaes_preheat call _vpaes_decrypt_core movdqu %xmm0,(%rsi) .byte 0xf3,0xc3 +.cfi_endproc .size vpaes_decrypt,.-vpaes_decrypt .globl vpaes_cbc_encrypt .hidden vpaes_cbc_encrypt .type vpaes_cbc_encrypt,@function .align 16 vpaes_cbc_encrypt: +.cfi_startproc xchgq %rcx,%rdx subq $16,%rcx jc .Lcbc_abort @@ -707,7 +928,71 @@ vpaes_cbc_encrypt: movdqu %xmm6,(%r8) .Lcbc_abort: .byte 0xf3,0xc3 +.cfi_endproc .size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt +.globl vpaes_ctr32_encrypt_blocks +.hidden vpaes_ctr32_encrypt_blocks +.type vpaes_ctr32_encrypt_blocks,@function +.align 16 +vpaes_ctr32_encrypt_blocks: +.cfi_startproc + + xchgq %rcx,%rdx + testq %rcx,%rcx + jz .Lctr32_abort + movdqu (%r8),%xmm0 + movdqa .Lctr_add_one(%rip),%xmm8 + subq %rdi,%rsi + call _vpaes_preheat + movdqa %xmm0,%xmm6 + pshufb .Lrev_ctr(%rip),%xmm6 + + testq $1,%rcx + jz .Lctr32_prep_loop + + + + movdqu (%rdi),%xmm7 + call _vpaes_encrypt_core + pxor %xmm7,%xmm0 + paddd %xmm8,%xmm6 + movdqu %xmm0,(%rsi,%rdi,1) + subq $1,%rcx + leaq 16(%rdi),%rdi + jz .Lctr32_done + +.Lctr32_prep_loop: + + + movdqa %xmm6,%xmm14 + movdqa %xmm6,%xmm15 + paddd %xmm8,%xmm15 + +.Lctr32_loop: + movdqa .Lrev_ctr(%rip),%xmm1 + movdqa %xmm14,%xmm0 + movdqa %xmm15,%xmm6 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + call _vpaes_encrypt_core_2x + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa .Lctr_add_two(%rip),%xmm3 + pxor %xmm1,%xmm0 + pxor %xmm2,%xmm6 + paddd %xmm3,%xmm14 + paddd %xmm3,%xmm15 + movdqu %xmm0,(%rsi,%rdi,1) + movdqu %xmm6,16(%rsi,%rdi,1) + subq $2,%rcx + leaq 32(%rdi),%rdi + jnz .Lctr32_loop + +.Lctr32_done: +.Lctr32_abort: + .byte 0xf3,0xc3 +.cfi_endproc +.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks @@ -717,6 +1002,7 @@ vpaes_cbc_encrypt: .type _vpaes_preheat,@function .align 16 _vpaes_preheat: +.cfi_startproc leaq .Lk_s0F(%rip),%r10 movdqa -32(%r10),%xmm10 movdqa -16(%r10),%xmm11 @@ -726,6 +1012,7 @@ _vpaes_preheat: movdqa 80(%r10),%xmm15 movdqa 96(%r10),%xmm14 .byte 0xf3,0xc3 +.cfi_endproc .size _vpaes_preheat,.-_vpaes_preheat @@ -828,7 +1115,19 @@ _vpaes_consts: .Lk_dsbo: .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C + + +.Lrev_ctr: +.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 + + +.Lctr_add_one: +.quad 0x0000000000000000, 0x0000000100000000 +.Lctr_add_two: +.quad 0x0000000000000000, 0x0000000200000000 + .byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 .align 64 .size _vpaes_consts,.-_vpaes_consts #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont.S index b32e2f0ef4..bdb4454212 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .extern OPENSSL_ia32cap_P @@ -17,6 +29,8 @@ bn_mul_mont: jnz .Lmul_enter cmpl $8,%r9d jb .Lmul_enter + leaq OPENSSL_ia32cap_P(%rip),%r11 + movl 8(%r11),%r11d cmpq %rsi,%rdx jne .Lmul4x_enter testl $7,%r9d @@ -208,31 +222,30 @@ bn_mul_mont: xorq %r14,%r14 movq (%rsp),%rax - leaq (%rsp),%rsi movq %r9,%r15 - jmp .Lsub + .align 16 -.Lsub: - sbbq (%rcx,%r14,8),%rax +.Lsub: sbbq (%rcx,%r14,8),%rax movq %rax,(%rdi,%r14,8) - movq 8(%rsi,%r14,8),%rax + movq 8(%rsp,%r14,8),%rax leaq 1(%r14),%r14 decq %r15 jnz .Lsub sbbq $0,%rax + movq $-1,%rbx + xorq %rax,%rbx xorq %r14,%r14 - andq %rax,%rsi - notq %rax - movq %rdi,%rcx - andq %rax,%rcx movq %r9,%r15 - orq %rcx,%rsi -.align 16 + .Lcopy: - movq (%rsi,%r14,8),%rax - movq %r14,(%rsp,%r14,8) - movq %rax,(%rdi,%r14,8) + movq (%rdi,%r14,8),%rcx + movq (%rsp,%r14,8),%rdx + andq %rbx,%rcx + andq %rax,%rdx + movq %r9,(%rsp,%r14,8) + orq %rcx,%rdx + movq %rdx,(%rdi,%r14,8) leaq 1(%r14),%r14 subq $1,%r15 jnz .Lcopy @@ -266,6 +279,9 @@ bn_mul4x_mont: movq %rsp,%rax .cfi_def_cfa_register %rax .Lmul4x_enter: + andl $0x80100,%r11d + cmpl $0x80100,%r11d + je .Lmulx4x_enter pushq %rbx .cfi_offset %rbx,-16 pushq %rbp @@ -603,7 +619,6 @@ bn_mul4x_mont: movq 16(%rsp,%r9,8),%rdi leaq -4(%r9),%r15 movq 0(%rsp),%rax - pxor %xmm0,%xmm0 movq 8(%rsp),%rdx shrq $2,%r15 leaq (%rsp),%rsi @@ -613,8 +628,7 @@ bn_mul4x_mont: movq 16(%rsi),%rbx movq 24(%rsi),%rbp sbbq 8(%rcx),%rdx - jmp .Lsub4x -.align 16 + .Lsub4x: movq %rax,0(%rdi,%r14,8) movq %rdx,8(%rdi,%r14,8) @@ -641,34 +655,35 @@ bn_mul4x_mont: sbbq $0,%rax movq %rbp,24(%rdi,%r14,8) - xorq %r14,%r14 - andq %rax,%rsi - notq %rax - movq %rdi,%rcx - andq %rax,%rcx - leaq -4(%r9),%r15 - orq %rcx,%rsi + pxor %xmm0,%xmm0 +.byte 102,72,15,110,224 + pcmpeqd %xmm5,%xmm5 + pshufd $0,%xmm4,%xmm4 + movq %r9,%r15 + pxor %xmm4,%xmm5 shrq $2,%r15 + xorl %eax,%eax - movdqu (%rsi),%xmm1 - movdqa %xmm0,(%rsp) - movdqu %xmm1,(%rdi) jmp .Lcopy4x .align 16 .Lcopy4x: - movdqu 16(%rsi,%r14,1),%xmm2 - movdqu 32(%rsi,%r14,1),%xmm1 - movdqa %xmm0,16(%rsp,%r14,1) - movdqu %xmm2,16(%rdi,%r14,1) - movdqa %xmm0,32(%rsp,%r14,1) - movdqu %xmm1,32(%rdi,%r14,1) - leaq 32(%r14),%r14 + movdqa (%rsp,%rax,1),%xmm1 + movdqu (%rdi,%rax,1),%xmm2 + pand %xmm4,%xmm1 + pand %xmm5,%xmm2 + movdqa 16(%rsp,%rax,1),%xmm3 + movdqa %xmm0,(%rsp,%rax,1) + por %xmm2,%xmm1 + movdqu 16(%rdi,%rax,1),%xmm2 + movdqu %xmm1,(%rdi,%rax,1) + pand %xmm4,%xmm3 + pand %xmm5,%xmm2 + movdqa %xmm0,16(%rsp,%rax,1) + por %xmm2,%xmm3 + movdqu %xmm3,16(%rdi,%rax,1) + leaq 32(%rax),%rax decq %r15 jnz .Lcopy4x - - movdqu 16(%rsi,%r14,1),%xmm2 - movdqa %xmm0,16(%rsp,%r14,1) - movdqu %xmm2,16(%rdi,%r14,1) movq 8(%rsp,%r9,8),%rsi .cfi_def_cfa %rsi, 8 movq $1,%rax @@ -690,6 +705,8 @@ bn_mul4x_mont: .byte 0xf3,0xc3 .cfi_endproc .size bn_mul4x_mont,.-bn_mul4x_mont +.extern bn_sqrx8x_internal +.hidden bn_sqrx8x_internal .extern bn_sqr8x_internal .hidden bn_sqr8x_internal @@ -774,6 +791,26 @@ bn_sqr8x_mont: pxor %xmm0,%xmm0 .byte 102,72,15,110,207 .byte 102,73,15,110,218 + leaq OPENSSL_ia32cap_P(%rip),%rax + movl 8(%rax),%eax + andl $0x80100,%eax + cmpl $0x80100,%eax + jne .Lsqr8x_nox + + call bn_sqrx8x_internal + + + + + leaq (%r8,%rcx,1),%rbx + movq %rcx,%r9 + movq %rcx,%rdx +.byte 102,72,15,126,207 + sarq $3+2,%rcx + jmp .Lsqr8x_sub + +.align 32 +.Lsqr8x_nox: call 
bn_sqr8x_internal @@ -861,6 +898,363 @@ bn_sqr8x_mont: .byte 0xf3,0xc3 .cfi_endproc .size bn_sqr8x_mont,.-bn_sqr8x_mont +.type bn_mulx4x_mont,@function +.align 32 +bn_mulx4x_mont: +.cfi_startproc + movq %rsp,%rax +.cfi_def_cfa_register %rax +.Lmulx4x_enter: + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 +.Lmulx4x_prologue: + + shll $3,%r9d + xorq %r10,%r10 + subq %r9,%r10 + movq (%r8),%r8 + leaq -72(%rsp,%r10,1),%rbp + andq $-128,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmulx4x_page_walk + jmp .Lmulx4x_page_walk_done + +.align 16 +.Lmulx4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmulx4x_page_walk +.Lmulx4x_page_walk_done: + + leaq (%rdx,%r9,1),%r10 + + + + + + + + + + + + + movq %r9,0(%rsp) + shrq $5,%r9 + movq %r10,16(%rsp) + subq $1,%r9 + movq %r8,24(%rsp) + movq %rdi,32(%rsp) + movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 + movq %r9,48(%rsp) + jmp .Lmulx4x_body + +.align 32 +.Lmulx4x_body: + leaq 8(%rdx),%rdi + movq (%rdx),%rdx + leaq 64+32(%rsp),%rbx + movq %rdx,%r9 + + mulxq 0(%rsi),%r8,%rax + mulxq 8(%rsi),%r11,%r14 + addq %rax,%r11 + movq %rdi,8(%rsp) + mulxq 16(%rsi),%r12,%r13 + adcq %r14,%r12 + adcq $0,%r13 + + movq %r8,%rdi + imulq 24(%rsp),%r8 + xorq %rbp,%rbp + + mulxq 24(%rsi),%rax,%r14 + movq %r8,%rdx + leaq 32(%rsi),%rsi + adcxq %rax,%r13 + adcxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%rdi + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 +.byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 + movq 48(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + adcxq %rax,%r12 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r12,-16(%rbx) + + jmp .Lmulx4x_1st + +.align 32 +.Lmulx4x_1st: + adcxq %rbp,%r15 + mulxq 0(%rsi),%r10,%rax + adcxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 +.byte 0x67,0x67 + movq %r8,%rdx + adcxq %rax,%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + movq %r11,-32(%rbx) + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz .Lmulx4x_1st + + movq 0(%rsp),%rax + movq 8(%rsp),%rdi + adcq %rbp,%r15 + addq %r15,%r14 + sbbq %r15,%r15 + movq %r14,-8(%rbx) + jmp .Lmulx4x_outer + +.align 32 +.Lmulx4x_outer: + movq (%rdi),%rdx + leaq 8(%rdi),%rdi + subq %rax,%rsi + movq %r15,(%rbx) + leaq 64+32(%rsp),%rbx + subq %rax,%rcx + + mulxq 0(%rsi),%r8,%r11 + xorl %ebp,%ebp + movq %rdx,%r9 + mulxq 8(%rsi),%r14,%r12 + adoxq -32(%rbx),%r8 + adcxq %r14,%r11 + mulxq 16(%rsi),%r15,%r13 + adoxq -24(%rbx),%r11 + adcxq %r15,%r12 + adoxq -16(%rbx),%r12 + adcxq %rbp,%r13 + adoxq %rbp,%r13 + + movq %rdi,8(%rsp) + movq %r8,%r15 + imulq 24(%rsp),%r8 + xorl %ebp,%ebp + + mulxq 24(%rsi),%rax,%r14 + movq %r8,%rdx + adcxq %rax,%r13 + adoxq -8(%rbx),%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + adoxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq 
%rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + leaq 32(%rcx),%rcx + adcxq %rax,%r12 + adoxq %rbp,%r15 + movq 48(%rsp),%rdi + movq %r12,-16(%rbx) + + jmp .Lmulx4x_inner + +.align 32 +.Lmulx4x_inner: + mulxq 0(%rsi),%r10,%rax + adcxq %rbp,%r15 + adoxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq 0(%rbx),%r10 + adoxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq 8(%rbx),%r11 + adoxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 + movq %r8,%rdx + adcxq 16(%rbx),%r12 + adoxq %rax,%r13 + adcxq 24(%rbx),%r13 + adoxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + adcxq %rbp,%r14 + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-32(%rbx) + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz .Lmulx4x_inner + + movq 0(%rsp),%rax + movq 8(%rsp),%rdi + adcq %rbp,%r15 + subq 0(%rbx),%rbp + adcq %r15,%r14 + sbbq %r15,%r15 + movq %r14,-8(%rbx) + + cmpq 16(%rsp),%rdi + jne .Lmulx4x_outer + + leaq 64(%rsp),%rbx + subq %rax,%rcx + negq %r15 + movq %rax,%rdx + shrq $3+2,%rax + movq 32(%rsp),%rdi + jmp .Lmulx4x_sub + +.align 32 +.Lmulx4x_sub: + movq 0(%rbx),%r11 + movq 8(%rbx),%r12 + movq 16(%rbx),%r13 + movq 24(%rbx),%r14 + leaq 32(%rbx),%rbx + sbbq 0(%rcx),%r11 + sbbq 8(%rcx),%r12 + sbbq 16(%rcx),%r13 + sbbq 24(%rcx),%r14 + leaq 32(%rcx),%rcx + movq %r11,0(%rdi) + movq %r12,8(%rdi) + movq %r13,16(%rdi) + movq %r14,24(%rdi) + leaq 32(%rdi),%rdi + decq %rax + jnz .Lmulx4x_sub + + sbbq $0,%r15 + leaq 64(%rsp),%rbx + subq %rdx,%rdi + +.byte 102,73,15,110,207 + pxor %xmm0,%xmm0 + pshufd $0,%xmm1,%xmm1 + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 + jmp .Lmulx4x_cond_copy + +.align 32 +.Lmulx4x_cond_copy: + movdqa 0(%rbx),%xmm2 + movdqa 16(%rbx),%xmm3 + leaq 32(%rbx),%rbx + movdqu 0(%rdi),%xmm4 + movdqu 16(%rdi),%xmm5 + leaq 32(%rdi),%rdi + movdqa %xmm0,-32(%rbx) + movdqa %xmm0,-16(%rbx) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-32(%rdi) + movdqu %xmm5,-16(%rdi) + subq $32,%rdx + jnz .Lmulx4x_cond_copy + + movq %rdx,(%rbx) + + movq $1,%rax + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmulx4x_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size bn_mulx4x_mont,.-bn_mulx4x_mont .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 16 #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont5.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont5.S index 208b1dca3e..c86b3b0a59 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont5.S 
+++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont5.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .extern OPENSSL_ia32cap_P @@ -15,6 +27,8 @@ bn_mul_mont_gather5: .cfi_def_cfa_register %rax testl $7,%r9d jnz .Lmul_enter + leaq OPENSSL_ia32cap_P(%rip),%r11 + movl 8(%r11),%r11d jmp .Lmul4x_enter .align 16 @@ -396,8 +410,7 @@ bn_mul_mont_gather5: movq %r9,%r15 jmp .Lsub .align 16 -.Lsub: - sbbq (%rcx,%r14,8),%rax +.Lsub: sbbq (%rcx,%r14,8),%rax movq %rax,(%rdi,%r14,8) movq 8(%rsi,%r14,8),%rax leaq 1(%r14),%r14 @@ -405,18 +418,19 @@ bn_mul_mont_gather5: jnz .Lsub sbbq $0,%rax + movq $-1,%rbx + xorq %rax,%rbx xorq %r14,%r14 - andq %rax,%rsi - notq %rax - movq %rdi,%rcx - andq %rax,%rcx movq %r9,%r15 - orq %rcx,%rsi -.align 16 + .Lcopy: - movq (%rsi,%r14,8),%rax + movq (%rdi,%r14,8),%rcx + movq (%rsp,%r14,8),%rdx + andq %rbx,%rcx + andq %rax,%rdx movq %r14,(%rsp,%r14,8) - movq %rax,(%rdi,%r14,8) + orq %rcx,%rdx + movq %rdx,(%rdi,%r14,8) leaq 1(%r14),%r14 subq $1,%r15 jnz .Lcopy @@ -451,6 +465,9 @@ bn_mul4x_mont_gather5: movq %rsp,%rax .cfi_def_cfa_register %rax .Lmul4x_enter: + andl $0x80108,%r11d + cmpl $0x80108,%r11d + je .Lmulx4x_enter pushq %rbx .cfi_offset %rbx,-16 pushq %rbp @@ -549,6 +566,7 @@ bn_mul4x_mont_gather5: .type mul4x_internal,@function .align 32 mul4x_internal: +.cfi_startproc shlq $5,%r9 movd 8(%rax),%xmm5 leaq .Linc(%rip),%rax @@ -1070,6 +1088,7 @@ mul4x_internal: movq 16(%rbp),%r14 movq 24(%rbp),%r15 jmp .Lsqr4x_sub_entry +.cfi_endproc .size mul4x_internal,.-mul4x_internal .globl bn_power5 .hidden bn_power5 @@ -1079,6 +1098,11 @@ bn_power5: .cfi_startproc movq %rsp,%rax .cfi_def_cfa_register %rax + leaq OPENSSL_ia32cap_P(%rip),%r11 + movl 8(%r11),%r11d + andl $0x80108,%r11d + cmpl $0x80108,%r11d + je .Lpowerx5_enter pushq %rbx .cfi_offset %rbx,-16 pushq %rbp @@ -1210,6 +1234,7 @@ bn_power5: .align 32 bn_sqr8x_internal: __bn_sqr8x_internal: +.cfi_startproc @@ -1984,10 +2009,12 @@ __bn_sqr8x_reduction: cmpq %rdx,%rdi jb .L8x_reduction_loop .byte 0xf3,0xc3 +.cfi_endproc .size bn_sqr8x_internal,.-bn_sqr8x_internal .type __bn_post4x_internal,@function .align 32 __bn_post4x_internal: +.cfi_startproc movq 0(%rbp),%r12 leaq (%rdi,%r9,1),%rbx movq %r9,%rcx @@ -2038,16 +2065,19 @@ __bn_post4x_internal: movq %r9,%r10 negq %r9 .byte 0xf3,0xc3 +.cfi_endproc .size __bn_post4x_internal,.-__bn_post4x_internal .globl bn_from_montgomery .hidden bn_from_montgomery .type bn_from_montgomery,@function .align 32 bn_from_montgomery: +.cfi_startproc testl $7,%r9d jz bn_from_mont8x xorl %eax,%eax .byte 0xf3,0xc3 +.cfi_endproc .size bn_from_montgomery,.-bn_from_montgomery .type bn_from_mont8x,@function @@ -2164,6 +2194,22 @@ bn_from_mont8x: .byte 0x67 movq %rcx,%rbp .byte 102,73,15,110,218 + leaq OPENSSL_ia32cap_P(%rip),%r11 + movl 8(%r11),%r11d + andl $0x80108,%r11d + cmpl $0x80108,%r11d + jne .Lfrom_mont_nox + + leaq (%rax,%r9,1),%rdi + call __bn_sqrx8x_reduction + call __bn_postx4x_internal + + pxor %xmm0,%xmm0 + leaq 48(%rsp),%rax + jmp .Lfrom_mont_zero + +.align 32 +.Lfrom_mont_nox: call __bn_sqr8x_reduction call __bn_post4x_internal @@ -2202,11 +2248,1356 @@ bn_from_mont8x: .byte 0xf3,0xc3 .cfi_endproc .size bn_from_mont8x,.-bn_from_mont8x 
+.type bn_mulx4x_mont_gather5,@function +.align 32 +bn_mulx4x_mont_gather5: +.cfi_startproc + movq %rsp,%rax +.cfi_def_cfa_register %rax +.Lmulx4x_enter: + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 +.Lmulx4x_prologue: + + shll $3,%r9d + leaq (%r9,%r9,2),%r10 + negq %r9 + movq (%r8),%r8 + + + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb .Lmulx4xsp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp .Lmulx4xsp_done + +.Lmulx4xsp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +.Lmulx4xsp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmulx4x_page_walk + jmp .Lmulx4x_page_walk_done + +.Lmulx4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmulx4x_page_walk +.Lmulx4x_page_walk_done: + + + + + + + + + + + + + + movq %r8,32(%rsp) + movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 +.Lmulx4x_body: + call mulx4x_internal + + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq $1,%rax + + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmulx4x_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 + +.type mulx4x_internal,@function +.align 32 +mulx4x_internal: +.cfi_startproc + movq %r9,8(%rsp) + movq %r9,%r10 + negq %r9 + shlq $5,%r9 + negq %r10 + leaq 128(%rdx,%r9,1),%r13 + shrq $5+5,%r9 + movd 8(%rax),%xmm5 + subq $1,%r9 + leaq .Linc(%rip),%rax + movq %r13,16+8(%rsp) + movq %r9,24+8(%rsp) + movq %rdi,56+8(%rsp) + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 88-112(%rsp,%r10,1),%r10 + leaq 128(%rdx),%rdi + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 +.byte 0x67 + movdqa %xmm1,%xmm2 +.byte 0x67 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 +.byte 0x67 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa 
%xmm1,320(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + + pand 64(%rdi),%xmm0 + pand 80(%rdi),%xmm1 + pand 96(%rdi),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%rdi),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -128(%rdi),%xmm4 + movdqa -112(%rdi),%xmm5 + movdqa -96(%rdi),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%rdi),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%rdi),%xmm4 + movdqa -48(%rdi),%xmm5 + movdqa -32(%rdi),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%rdi),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%rdi),%xmm4 + movdqa 16(%rdi),%xmm5 + movdqa 32(%rdi),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%rdi),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + pxor %xmm1,%xmm0 + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + leaq 256(%rdi),%rdi +.byte 102,72,15,126,194 + leaq 64+32+8(%rsp),%rbx + + movq %rdx,%r9 + mulxq 0(%rsi),%r8,%rax + mulxq 8(%rsi),%r11,%r12 + addq %rax,%r11 + mulxq 16(%rsi),%rax,%r13 + adcq %rax,%r12 + adcq $0,%r13 + mulxq 24(%rsi),%rax,%r14 + + movq %r8,%r15 + imulq 32+8(%rsp),%r8 + xorq %rbp,%rbp + movq %r8,%rdx + + movq %rdi,8+8(%rsp) + + leaq 32(%rsi),%rsi + adcxq %rax,%r13 + adcxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + movq 24+8(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + adcxq %rax,%r12 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r12,-16(%rbx) + jmp .Lmulx4x_1st + +.align 32 +.Lmulx4x_1st: + adcxq %rbp,%r15 + mulxq 0(%rsi),%r10,%rax + adcxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 +.byte 0x67,0x67 + movq %r8,%rdx + adcxq %rax,%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + movq %r11,-32(%rbx) + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz .Lmulx4x_1st + + movq 8(%rsp),%rax + adcq %rbp,%r15 + leaq (%rsi,%rax,1),%rsi + addq %r15,%r14 + movq 8+8(%rsp),%rdi + adcq %rbp,%rbp + movq %r14,-8(%rbx) + jmp .Lmulx4x_outer + +.align 32 +.Lmulx4x_outer: + leaq 16-256(%rbx),%r10 + pxor %xmm4,%xmm4 +.byte 0x67,0x67 + pxor %xmm5,%xmm5 + movdqa -128(%rdi),%xmm0 + movdqa -112(%rdi),%xmm1 + movdqa -96(%rdi),%xmm2 + pand 256(%r10),%xmm0 + movdqa -80(%rdi),%xmm3 + pand 272(%r10),%xmm1 + por %xmm0,%xmm4 + pand 288(%r10),%xmm2 + por %xmm1,%xmm5 + pand 304(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%rdi),%xmm0 + movdqa -48(%rdi),%xmm1 + movdqa -32(%rdi),%xmm2 + pand 320(%r10),%xmm0 + movdqa -16(%rdi),%xmm3 + pand 336(%r10),%xmm1 + por %xmm0,%xmm4 + pand 352(%r10),%xmm2 + por %xmm1,%xmm5 + pand 368(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%rdi),%xmm0 + movdqa 16(%rdi),%xmm1 + movdqa 32(%rdi),%xmm2 + pand 384(%r10),%xmm0 + movdqa 48(%rdi),%xmm3 + pand 
400(%r10),%xmm1 + por %xmm0,%xmm4 + pand 416(%r10),%xmm2 + por %xmm1,%xmm5 + pand 432(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%rdi),%xmm0 + movdqa 80(%rdi),%xmm1 + movdqa 96(%rdi),%xmm2 + pand 448(%r10),%xmm0 + movdqa 112(%rdi),%xmm3 + pand 464(%r10),%xmm1 + por %xmm0,%xmm4 + pand 480(%r10),%xmm2 + por %xmm1,%xmm5 + pand 496(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%rdi),%rdi +.byte 102,72,15,126,194 + + movq %rbp,(%rbx) + leaq 32(%rbx,%rax,1),%rbx + mulxq 0(%rsi),%r8,%r11 + xorq %rbp,%rbp + movq %rdx,%r9 + mulxq 8(%rsi),%r14,%r12 + adoxq -32(%rbx),%r8 + adcxq %r14,%r11 + mulxq 16(%rsi),%r15,%r13 + adoxq -24(%rbx),%r11 + adcxq %r15,%r12 + mulxq 24(%rsi),%rdx,%r14 + adoxq -16(%rbx),%r12 + adcxq %rdx,%r13 + leaq (%rcx,%rax,1),%rcx + leaq 32(%rsi),%rsi + adoxq -8(%rbx),%r13 + adcxq %rbp,%r14 + adoxq %rbp,%r14 + + movq %r8,%r15 + imulq 32+8(%rsp),%r8 + + movq %r8,%rdx + xorq %rbp,%rbp + movq %rdi,8+8(%rsp) + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq 24+8(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r12 + movq %r11,-24(%rbx) + adoxq %rbp,%r15 + movq %r12,-16(%rbx) + leaq 32(%rcx),%rcx + jmp .Lmulx4x_inner + +.align 32 +.Lmulx4x_inner: + mulxq 0(%rsi),%r10,%rax + adcxq %rbp,%r15 + adoxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq 0(%rbx),%r10 + adoxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq 8(%rbx),%r11 + adoxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 + movq %r8,%rdx + adcxq 16(%rbx),%r12 + adoxq %rax,%r13 + adcxq 24(%rbx),%r13 + adoxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + adcxq %rbp,%r14 + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + adoxq %r15,%r13 + movq %r11,-32(%rbx) + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + leaq 32(%rcx),%rcx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + movq %r13,-16(%rbx) + + decq %rdi + jnz .Lmulx4x_inner + + movq 0+8(%rsp),%rax + adcq %rbp,%r15 + subq 0(%rbx),%rdi + movq 8+8(%rsp),%rdi + movq 16+8(%rsp),%r10 + adcq %r15,%r14 + leaq (%rsi,%rax,1),%rsi + adcq %rbp,%rbp + movq %r14,-8(%rbx) + + cmpq %r10,%rdi + jb .Lmulx4x_outer + + movq -8(%rcx),%r10 + movq %rbp,%r8 + movq (%rcx,%rax,1),%r12 + leaq (%rcx,%rax,1),%rbp + movq %rax,%rcx + leaq (%rbx,%rax,1),%rdi + xorl %eax,%eax + xorq %r15,%r15 + subq %r14,%r10 + adcq %r15,%r15 + orq %r15,%r8 + sarq $3+2,%rcx + subq %r8,%rax + movq 56+8(%rsp),%rdx + decq %r12 + movq 8(%rbp),%r13 + xorq %r8,%r8 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp .Lsqrx4x_sub_entry +.cfi_endproc +.size mulx4x_internal,.-mulx4x_internal +.type bn_powerx5,@function +.align 32 +bn_powerx5: +.cfi_startproc + movq %rsp,%rax +.cfi_def_cfa_register %rax +.Lpowerx5_enter: + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 +.Lpowerx5_prologue: + + shll $3,%r9d + leaq (%r9,%r9,2),%r10 + negq %r9 + movq (%r8),%r8 + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb .Lpwrx_sp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp .Lpwrx_sp_done + 
+.align 32 +.Lpwrx_sp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +.Lpwrx_sp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lpwrx_page_walk + jmp .Lpwrx_page_walk_done + +.Lpwrx_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lpwrx_page_walk +.Lpwrx_page_walk_done: + + movq %r9,%r10 + negq %r9 + + + + + + + + + + + + + pxor %xmm0,%xmm0 +.byte 102,72,15,110,207 +.byte 102,72,15,110,209 +.byte 102,73,15,110,218 +.byte 102,72,15,110,226 + movq %r8,32(%rsp) + movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 +.Lpowerx5_body: + + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + + movq %r10,%r9 + movq %rsi,%rdi +.byte 102,72,15,126,209 +.byte 102,72,15,126,226 + movq 40(%rsp),%rax + + call mulx4x_internal + + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq $1,%rax + + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpowerx5_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size bn_powerx5,.-bn_powerx5 + +.globl bn_sqrx8x_internal +.hidden bn_sqrx8x_internal +.hidden bn_sqrx8x_internal +.type bn_sqrx8x_internal,@function +.align 32 +bn_sqrx8x_internal: +__bn_sqrx8x_internal: +.cfi_startproc + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + leaq 48+8(%rsp),%rdi + leaq (%rsi,%r9,1),%rbp + movq %r9,0+8(%rsp) + movq %rbp,8+8(%rsp) + jmp .Lsqr8x_zero_start + +.align 32 +.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 +.Lsqrx8x_zero: +.byte 0x3e + movdqa %xmm0,0(%rdi) + movdqa %xmm0,16(%rdi) + movdqa %xmm0,32(%rdi) + movdqa %xmm0,48(%rdi) +.Lsqr8x_zero_start: + movdqa %xmm0,64(%rdi) + movdqa %xmm0,80(%rdi) + movdqa %xmm0,96(%rdi) + movdqa %xmm0,112(%rdi) + leaq 128(%rdi),%rdi + subq $64,%r9 + jnz .Lsqrx8x_zero + + movq 0(%rsi),%rdx + + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r12,%r12 + xorq %r13,%r13 + xorq %r14,%r14 + xorq %r15,%r15 + leaq 48+8(%rsp),%rdi + xorq %rbp,%rbp + jmp .Lsqrx8x_outer_loop + +.align 32 +.Lsqrx8x_outer_loop: + mulxq 8(%rsi),%r8,%rax + adcxq %r9,%r8 + adoxq %rax,%r10 + mulxq 16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 +.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 + adcxq %r11,%r10 + adoxq %rax,%r12 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 + adcxq %r12,%r11 + adoxq %rax,%r13 + mulxq 40(%rsi),%r12,%rax + adcxq %r13,%r12 + adoxq %rax,%r14 + mulxq 48(%rsi),%r13,%rax + adcxq %r14,%r13 + adoxq %r15,%rax + mulxq 56(%rsi),%r14,%r15 + movq 8(%rsi),%rdx + adcxq %rax,%r14 + adoxq %rbp,%r15 + adcq 64(%rdi),%r15 + movq %r8,8(%rdi) + movq %r9,16(%rdi) + sbbq %rcx,%rcx + xorq %rbp,%rbp + + + mulxq 16(%rsi),%r8,%rbx + mulxq 24(%rsi),%r9,%rax + adcxq %r10,%r8 + adoxq %rbx,%r9 + mulxq 32(%rsi),%r10,%rbx + adcxq %r11,%r9 + adoxq %rax,%r10 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 + adcxq %r12,%r10 + adoxq %rbx,%r11 +.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 + adcxq %r13,%r11 + adoxq %r14,%r12 +.byte 
0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 + movq 16(%rsi),%rdx + adcxq %rax,%r12 + adoxq %rbx,%r13 + adcxq %r15,%r13 + adoxq %rbp,%r14 + adcxq %rbp,%r14 + + movq %r8,24(%rdi) + movq %r9,32(%rdi) + + mulxq 24(%rsi),%r8,%rbx + mulxq 32(%rsi),%r9,%rax + adcxq %r10,%r8 + adoxq %rbx,%r9 + mulxq 40(%rsi),%r10,%rbx + adcxq %r11,%r9 + adoxq %rax,%r10 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 + adcxq %r12,%r10 + adoxq %r13,%r11 +.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 +.byte 0x3e + movq 24(%rsi),%rdx + adcxq %rbx,%r11 + adoxq %rax,%r12 + adcxq %r14,%r12 + movq %r8,40(%rdi) + movq %r9,48(%rdi) + mulxq 32(%rsi),%r8,%rax + adoxq %rbp,%r13 + adcxq %rbp,%r13 + + mulxq 40(%rsi),%r9,%rbx + adcxq %r10,%r8 + adoxq %rax,%r9 + mulxq 48(%rsi),%r10,%rax + adcxq %r11,%r9 + adoxq %r12,%r10 + mulxq 56(%rsi),%r11,%r12 + movq 32(%rsi),%rdx + movq 40(%rsi),%r14 + adcxq %rbx,%r10 + adoxq %rax,%r11 + movq 48(%rsi),%r15 + adcxq %r13,%r11 + adoxq %rbp,%r12 + adcxq %rbp,%r12 + + movq %r8,56(%rdi) + movq %r9,64(%rdi) + + mulxq %r14,%r9,%rax + movq 56(%rsi),%r8 + adcxq %r10,%r9 + mulxq %r15,%r10,%rbx + adoxq %rax,%r10 + adcxq %r11,%r10 + mulxq %r8,%r11,%rax + movq %r14,%rdx + adoxq %rbx,%r11 + adcxq %r12,%r11 + + adcxq %rbp,%rax + + mulxq %r15,%r14,%rbx + mulxq %r8,%r12,%r13 + movq %r15,%rdx + leaq 64(%rsi),%rsi + adcxq %r14,%r11 + adoxq %rbx,%r12 + adcxq %rax,%r12 + adoxq %rbp,%r13 + +.byte 0x67,0x67 + mulxq %r8,%r8,%r14 + adcxq %r8,%r13 + adcxq %rbp,%r14 + + cmpq 8+8(%rsp),%rsi + je .Lsqrx8x_outer_break + + negq %rcx + movq $-8,%rcx + movq %rbp,%r15 + movq 64(%rdi),%r8 + adcxq 72(%rdi),%r9 + adcxq 80(%rdi),%r10 + adcxq 88(%rdi),%r11 + adcq 96(%rdi),%r12 + adcq 104(%rdi),%r13 + adcq 112(%rdi),%r14 + adcq 120(%rdi),%r15 + leaq (%rsi),%rbp + leaq 128(%rdi),%rdi + sbbq %rax,%rax + + movq -64(%rsi),%rdx + movq %rax,16+8(%rsp) + movq %rdi,24+8(%rsp) + + + xorl %eax,%eax + jmp .Lsqrx8x_loop + +.align 32 +.Lsqrx8x_loop: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rax,%rbx + adoxq %r9,%r8 + + mulxq 8(%rbp),%rax,%r9 + adcxq %rax,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rax,%r10 + adcxq %rax,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + movq %rbx,(%rdi,%rcx,8) + movl $0,%ebx + adcxq %rax,%r13 + adoxq %r15,%r14 + +.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 + movq 8(%rsi,%rcx,8),%rdx + adcxq %rax,%r14 + adoxq %rbx,%r15 + adcxq %rbx,%r15 + +.byte 0x67 + incq %rcx + jnz .Lsqrx8x_loop + + leaq 64(%rbp),%rbp + movq $-8,%rcx + cmpq 8+8(%rsp),%rbp + je .Lsqrx8x_break + + subq 16+8(%rsp),%rbx +.byte 0x66 + movq -64(%rsi),%rdx + adcxq 0(%rdi),%r8 + adcxq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi +.byte 0x67 + sbbq %rax,%rax + xorl %ebx,%ebx + movq %rax,16+8(%rsp) + jmp .Lsqrx8x_loop + +.align 32 +.Lsqrx8x_break: + xorq %rbp,%rbp + subq 16+8(%rsp),%rbx + adcxq %rbp,%r8 + movq 24+8(%rsp),%rcx + adcxq %rbp,%r9 + movq 0(%rsi),%rdx + adcq $0,%r10 + movq %r8,0(%rdi) + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + cmpq %rcx,%rdi + je .Lsqrx8x_outer_loop + + movq %r9,8(%rdi) + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + movq 
40(%rcx),%r13 + movq %r14,48(%rdi) + movq 48(%rcx),%r14 + movq %r15,56(%rdi) + movq 56(%rcx),%r15 + movq %rcx,%rdi + jmp .Lsqrx8x_outer_loop + +.align 32 +.Lsqrx8x_outer_break: + movq %r9,72(%rdi) +.byte 102,72,15,126,217 + movq %r10,80(%rdi) + movq %r11,88(%rdi) + movq %r12,96(%rdi) + movq %r13,104(%rdi) + movq %r14,112(%rdi) + leaq 48+8(%rsp),%rdi + movq (%rsi,%rcx,1),%rdx + + movq 8(%rdi),%r11 + xorq %r10,%r10 + movq 0+8(%rsp),%r9 + adoxq %r11,%r11 + movq 16(%rdi),%r12 + movq 24(%rdi),%r13 + + +.align 32 +.Lsqrx4x_shift_n_add: + mulxq %rdx,%rax,%rbx + adoxq %r12,%r12 + adcxq %r10,%rax +.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 +.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 + adoxq %r13,%r13 + adcxq %r11,%rbx + movq 40(%rdi),%r11 + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r10,%r10 + adcxq %r12,%rax + movq 16(%rsi,%rcx,1),%rdx + movq 48(%rdi),%r12 + adoxq %r11,%r11 + adcxq %r13,%rbx + movq 56(%rdi),%r13 + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r12,%r12 + adcxq %r10,%rax + movq 24(%rsi,%rcx,1),%rdx + leaq 32(%rcx),%rcx + movq 64(%rdi),%r10 + adoxq %r13,%r13 + adcxq %r11,%rbx + movq 72(%rdi),%r11 + movq %rax,32(%rdi) + movq %rbx,40(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r10,%r10 + adcxq %r12,%rax + jrcxz .Lsqrx4x_shift_n_add_break +.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 + adoxq %r11,%r11 + adcxq %r13,%rbx + movq 80(%rdi),%r12 + movq 88(%rdi),%r13 + movq %rax,48(%rdi) + movq %rbx,56(%rdi) + leaq 64(%rdi),%rdi + nop + jmp .Lsqrx4x_shift_n_add + +.align 32 +.Lsqrx4x_shift_n_add_break: + adcxq %r13,%rbx + movq %rax,48(%rdi) + movq %rbx,56(%rdi) + leaq 64(%rdi),%rdi +.byte 102,72,15,126,213 +__bn_sqrx8x_reduction: + xorl %eax,%eax + movq 32+8(%rsp),%rbx + movq 48+8(%rsp),%rdx + leaq -64(%rbp,%r9,1),%rcx + + movq %rcx,0+8(%rsp) + movq %rdi,8+8(%rsp) + + leaq 48+8(%rsp),%rdi + jmp .Lsqrx8x_reduction_loop + +.align 32 +.Lsqrx8x_reduction_loop: + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq %rdx,%r8 + imulq %rbx,%rdx + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq %rax,24+8(%rsp) + + leaq 64(%rdi),%rdi + xorq %rsi,%rsi + movq $-8,%rcx + jmp .Lsqrx8x_reduce + +.align 32 +.Lsqrx8x_reduce: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rbx,%rax + adoxq %r9,%r8 + + mulxq 8(%rbp),%rbx,%r9 + adcxq %rbx,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rbx,%r10 + adcxq %rbx,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rbx,%r11 + adcxq %rbx,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 + movq %rdx,%rax + movq %r8,%rdx + adcxq %rbx,%r11 + adoxq %r13,%r12 + + mulxq 32+8(%rsp),%rbx,%rdx + movq %rax,%rdx + movq %rax,64+48+8(%rsp,%rcx,8) + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + adcxq %rax,%r13 + adoxq %r15,%r14 + + mulxq 56(%rbp),%rax,%r15 + movq %rbx,%rdx + adcxq %rax,%r14 + adoxq %rsi,%r15 + adcxq %rsi,%r15 + +.byte 0x67,0x67,0x67 + incq %rcx + jnz .Lsqrx8x_reduce + + movq %rsi,%rax + cmpq 0+8(%rsp),%rbp + jae .Lsqrx8x_no_tail + + movq 48+8(%rsp),%rdx + addq 0(%rdi),%r8 + leaq 64(%rbp),%rbp + movq $-8,%rcx + adcxq 8(%rdi),%r9 + adcxq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi + sbbq %rax,%rax + + xorq %rsi,%rsi + movq %rax,16+8(%rsp) + jmp .Lsqrx8x_tail + +.align 32 +.Lsqrx8x_tail: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rax,%rbx + adoxq %r9,%r8 + + mulxq 8(%rbp),%rax,%r9 + adcxq %rax,%r8 + 
adoxq %r10,%r9 + + mulxq 16(%rbp),%rax,%r10 + adcxq %rax,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + adcxq %rax,%r13 + adoxq %r15,%r14 + + mulxq 56(%rbp),%rax,%r15 + movq 72+48+8(%rsp,%rcx,8),%rdx + adcxq %rax,%r14 + adoxq %rsi,%r15 + movq %rbx,(%rdi,%rcx,8) + movq %r8,%rbx + adcxq %rsi,%r15 + + incq %rcx + jnz .Lsqrx8x_tail + + cmpq 0+8(%rsp),%rbp + jae .Lsqrx8x_tail_done + + subq 16+8(%rsp),%rsi + movq 48+8(%rsp),%rdx + leaq 64(%rbp),%rbp + adcq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi + sbbq %rax,%rax + subq $8,%rcx + + xorq %rsi,%rsi + movq %rax,16+8(%rsp) + jmp .Lsqrx8x_tail + +.align 32 +.Lsqrx8x_tail_done: + xorq %rax,%rax + addq 24+8(%rsp),%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rax + + subq 16+8(%rsp),%rsi +.Lsqrx8x_no_tail: + adcq 0(%rdi),%r8 +.byte 102,72,15,126,217 + adcq 8(%rdi),%r9 + movq 56(%rbp),%rsi +.byte 102,72,15,126,213 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + adcq $0,%rax + + movq 32+8(%rsp),%rbx + movq 64(%rdi,%rcx,1),%rdx + + movq %r8,0(%rdi) + leaq 64(%rdi),%r8 + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + leaq 64(%rdi,%rcx,1),%rdi + cmpq 8+8(%rsp),%r8 + jb .Lsqrx8x_reduction_loop + .byte 0xf3,0xc3 +.cfi_endproc +.size bn_sqrx8x_internal,.-bn_sqrx8x_internal +.align 32 +.type __bn_postx4x_internal,@function +__bn_postx4x_internal: +.cfi_startproc + movq 0(%rbp),%r12 + movq %rcx,%r10 + movq %rcx,%r9 + negq %rax + sarq $3+2,%rcx + +.byte 102,72,15,126,202 +.byte 102,72,15,126,206 + decq %r12 + movq 8(%rbp),%r13 + xorq %r8,%r8 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp .Lsqrx4x_sub_entry + +.align 16 +.Lsqrx4x_sub: + movq 0(%rbp),%r12 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 +.Lsqrx4x_sub_entry: + andnq %rax,%r12,%r12 + leaq 32(%rbp),%rbp + andnq %rax,%r13,%r13 + andnq %rax,%r14,%r14 + andnq %rax,%r15,%r15 + + negq %r8 + adcq 0(%rdi),%r12 + adcq 8(%rdi),%r13 + adcq 16(%rdi),%r14 + adcq 24(%rdi),%r15 + movq %r12,0(%rdx) + leaq 32(%rdi),%rdi + movq %r13,8(%rdx) + sbbq %r8,%r8 + movq %r14,16(%rdx) + movq %r15,24(%rdx) + leaq 32(%rdx),%rdx + + incq %rcx + jnz .Lsqrx4x_sub + + negq %r9 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __bn_postx4x_internal,.-__bn_postx4x_internal .globl bn_scatter5 .hidden bn_scatter5 .type bn_scatter5,@function .align 16 bn_scatter5: +.cfi_startproc cmpl $0,%esi jz .Lscatter_epilogue leaq (%rdx,%rcx,8),%rdx @@ -2219,6 +3610,7 @@ bn_scatter5: jnz .Lscatter .Lscatter_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size bn_scatter5,.-bn_scatter5 .globl bn_gather5 @@ -2226,9 +3618,11 @@ bn_scatter5: .type bn_gather5,@function .align 32 bn_gather5: +.cfi_startproc .LSEH_begin_bn_gather5: .byte 0x4c,0x8d,0x14,0x24 +.cfi_def_cfa_register %r10 .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 leaq .Linc(%rip),%rax andq $-16,%rsp @@ -2382,8 +3776,10 @@ bn_gather5: jnz .Lgather leaq (%r10),%rsp +.cfi_def_cfa_register %rsp .byte 0xf3,0xc3 .LSEH_end_bn_gather5: +.cfi_endproc .size bn_gather5,.-bn_gather5 .align 64 .Linc: @@ 
-2391,3 +3787,4 @@ bn_gather5: .long 2,2, 2,2 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/test/trampoline-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/test/trampoline-x86_64.S new file mode 100644 index 0000000000..9f7c0d817c --- /dev/null +++ b/packager/third_party/boringssl/linux-x86_64/crypto/test/trampoline-x86_64.S @@ -0,0 +1,518 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text + + + + + + + + +.type abi_test_trampoline, @function +.globl abi_test_trampoline +.hidden abi_test_trampoline +.align 16 +abi_test_trampoline: +.Labi_test_trampoline_seh_begin: +.cfi_startproc + + + + + + + + + + subq $120,%rsp +.cfi_adjust_cfa_offset 120 +.Labi_test_trampoline_seh_prolog_alloc: + movq %r8,48(%rsp) + movq %rbx,64(%rsp) +.cfi_offset rbx, -64 +.Labi_test_trampoline_seh_prolog_rbx: + movq %rbp,72(%rsp) +.cfi_offset rbp, -56 +.Labi_test_trampoline_seh_prolog_rbp: + movq %r12,80(%rsp) +.cfi_offset r12, -48 +.Labi_test_trampoline_seh_prolog_r12: + movq %r13,88(%rsp) +.cfi_offset r13, -40 +.Labi_test_trampoline_seh_prolog_r13: + movq %r14,96(%rsp) +.cfi_offset r14, -32 +.Labi_test_trampoline_seh_prolog_r14: + movq %r15,104(%rsp) +.cfi_offset r15, -24 +.Labi_test_trampoline_seh_prolog_r15: +.Labi_test_trampoline_seh_prolog_end: + movq 0(%rsi),%rbx + movq 8(%rsi),%rbp + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + movq 32(%rsi),%r14 + movq 40(%rsi),%r15 + + movq %rdi,32(%rsp) + movq %rsi,40(%rsp) + + + + + movq %rdx,%r10 + movq %rcx,%r11 + decq %r11 + js .Largs_done + movq (%r10),%rdi + addq $8,%r10 + decq %r11 + js .Largs_done + movq (%r10),%rsi + addq $8,%r10 + decq %r11 + js .Largs_done + movq (%r10),%rdx + addq $8,%r10 + decq %r11 + js .Largs_done + movq (%r10),%rcx + addq $8,%r10 + decq %r11 + js .Largs_done + movq (%r10),%r8 + addq $8,%r10 + decq %r11 + js .Largs_done + movq (%r10),%r9 + addq $8,%r10 + leaq 0(%rsp),%rax +.Largs_loop: + decq %r11 + js .Largs_done + + + + + + + movq %r11,56(%rsp) + movq (%r10),%r11 + movq %r11,(%rax) + movq 56(%rsp),%r11 + + addq $8,%r10 + addq $8,%rax + jmp .Largs_loop + +.Largs_done: + movq 32(%rsp),%rax + movq 48(%rsp),%r10 + testq %r10,%r10 + jz .Lno_unwind + + + pushfq + orq $0x100,0(%rsp) + popfq + + + + nop +.globl abi_test_unwind_start +.hidden abi_test_unwind_start +abi_test_unwind_start: + + call *%rax +.globl abi_test_unwind_return +.hidden abi_test_unwind_return +abi_test_unwind_return: + + + + + pushfq + andq $-0x101,0(%rsp) + popfq +.globl abi_test_unwind_stop +.hidden abi_test_unwind_stop +abi_test_unwind_stop: + + jmp .Lcall_done + +.Lno_unwind: + call *%rax + +.Lcall_done: + + movq 40(%rsp),%rsi + movq %rbx,0(%rsi) + movq %rbp,8(%rsi) + movq %r12,16(%rsi) + movq %r13,24(%rsi) + movq %r14,32(%rsi) + movq %r15,40(%rsi) + movq 64(%rsp),%rbx +.cfi_restore rbx + movq 72(%rsp),%rbp +.cfi_restore rbp + movq 80(%rsp),%r12 +.cfi_restore r12 + 
movq 88(%rsp),%r13 +.cfi_restore r13 + movq 96(%rsp),%r14 +.cfi_restore r14 + movq 104(%rsp),%r15 +.cfi_restore r15 + addq $120,%rsp +.cfi_adjust_cfa_offset -120 + + + .byte 0xf3,0xc3 +.cfi_endproc +.Labi_test_trampoline_seh_end: +.size abi_test_trampoline,.-abi_test_trampoline +.type abi_test_clobber_rax, @function +.globl abi_test_clobber_rax +.hidden abi_test_clobber_rax +.align 16 +abi_test_clobber_rax: + xorq %rax,%rax + .byte 0xf3,0xc3 +.size abi_test_clobber_rax,.-abi_test_clobber_rax +.type abi_test_clobber_rbx, @function +.globl abi_test_clobber_rbx +.hidden abi_test_clobber_rbx +.align 16 +abi_test_clobber_rbx: + xorq %rbx,%rbx + .byte 0xf3,0xc3 +.size abi_test_clobber_rbx,.-abi_test_clobber_rbx +.type abi_test_clobber_rcx, @function +.globl abi_test_clobber_rcx +.hidden abi_test_clobber_rcx +.align 16 +abi_test_clobber_rcx: + xorq %rcx,%rcx + .byte 0xf3,0xc3 +.size abi_test_clobber_rcx,.-abi_test_clobber_rcx +.type abi_test_clobber_rdx, @function +.globl abi_test_clobber_rdx +.hidden abi_test_clobber_rdx +.align 16 +abi_test_clobber_rdx: + xorq %rdx,%rdx + .byte 0xf3,0xc3 +.size abi_test_clobber_rdx,.-abi_test_clobber_rdx +.type abi_test_clobber_rdi, @function +.globl abi_test_clobber_rdi +.hidden abi_test_clobber_rdi +.align 16 +abi_test_clobber_rdi: + xorq %rdi,%rdi + .byte 0xf3,0xc3 +.size abi_test_clobber_rdi,.-abi_test_clobber_rdi +.type abi_test_clobber_rsi, @function +.globl abi_test_clobber_rsi +.hidden abi_test_clobber_rsi +.align 16 +abi_test_clobber_rsi: + xorq %rsi,%rsi + .byte 0xf3,0xc3 +.size abi_test_clobber_rsi,.-abi_test_clobber_rsi +.type abi_test_clobber_rbp, @function +.globl abi_test_clobber_rbp +.hidden abi_test_clobber_rbp +.align 16 +abi_test_clobber_rbp: + xorq %rbp,%rbp + .byte 0xf3,0xc3 +.size abi_test_clobber_rbp,.-abi_test_clobber_rbp +.type abi_test_clobber_r8, @function +.globl abi_test_clobber_r8 +.hidden abi_test_clobber_r8 +.align 16 +abi_test_clobber_r8: + xorq %r8,%r8 + .byte 0xf3,0xc3 +.size abi_test_clobber_r8,.-abi_test_clobber_r8 +.type abi_test_clobber_r9, @function +.globl abi_test_clobber_r9 +.hidden abi_test_clobber_r9 +.align 16 +abi_test_clobber_r9: + xorq %r9,%r9 + .byte 0xf3,0xc3 +.size abi_test_clobber_r9,.-abi_test_clobber_r9 +.type abi_test_clobber_r10, @function +.globl abi_test_clobber_r10 +.hidden abi_test_clobber_r10 +.align 16 +abi_test_clobber_r10: + xorq %r10,%r10 + .byte 0xf3,0xc3 +.size abi_test_clobber_r10,.-abi_test_clobber_r10 +.type abi_test_clobber_r11, @function +.globl abi_test_clobber_r11 +.hidden abi_test_clobber_r11 +.align 16 +abi_test_clobber_r11: + xorq %r11,%r11 + .byte 0xf3,0xc3 +.size abi_test_clobber_r11,.-abi_test_clobber_r11 +.type abi_test_clobber_r12, @function +.globl abi_test_clobber_r12 +.hidden abi_test_clobber_r12 +.align 16 +abi_test_clobber_r12: + xorq %r12,%r12 + .byte 0xf3,0xc3 +.size abi_test_clobber_r12,.-abi_test_clobber_r12 +.type abi_test_clobber_r13, @function +.globl abi_test_clobber_r13 +.hidden abi_test_clobber_r13 +.align 16 +abi_test_clobber_r13: + xorq %r13,%r13 + .byte 0xf3,0xc3 +.size abi_test_clobber_r13,.-abi_test_clobber_r13 +.type abi_test_clobber_r14, @function +.globl abi_test_clobber_r14 +.hidden abi_test_clobber_r14 +.align 16 +abi_test_clobber_r14: + xorq %r14,%r14 + .byte 0xf3,0xc3 +.size abi_test_clobber_r14,.-abi_test_clobber_r14 +.type abi_test_clobber_r15, @function +.globl abi_test_clobber_r15 +.hidden abi_test_clobber_r15 +.align 16 +abi_test_clobber_r15: + xorq %r15,%r15 + .byte 0xf3,0xc3 +.size abi_test_clobber_r15,.-abi_test_clobber_r15 +.type 
abi_test_clobber_xmm0, @function +.globl abi_test_clobber_xmm0 +.hidden abi_test_clobber_xmm0 +.align 16 +abi_test_clobber_xmm0: + pxor %xmm0,%xmm0 + .byte 0xf3,0xc3 +.size abi_test_clobber_xmm0,.-abi_test_clobber_xmm0 +.type abi_test_clobber_xmm1, @function +.globl abi_test_clobber_xmm1 +.hidden abi_test_clobber_xmm1 +.align 16 +abi_test_clobber_xmm1: + pxor %xmm1,%xmm1 + .byte 0xf3,0xc3 +.size abi_test_clobber_xmm1,.-abi_test_clobber_xmm1 +.type abi_test_clobber_xmm2, @function +.globl abi_test_clobber_xmm2 +.hidden abi_test_clobber_xmm2 +.align 16 +abi_test_clobber_xmm2: + pxor %xmm2,%xmm2 + .byte 0xf3,0xc3 +.size abi_test_clobber_xmm2,.-abi_test_clobber_xmm2 +.type abi_test_clobber_xmm3, @function +.globl abi_test_clobber_xmm3 +.hidden abi_test_clobber_xmm3 +.align 16 +abi_test_clobber_xmm3: + pxor %xmm3,%xmm3 + .byte 0xf3,0xc3 +.size abi_test_clobber_xmm3,.-abi_test_clobber_xmm3 +.type abi_test_clobber_xmm4, @function +.globl abi_test_clobber_xmm4 +.hidden abi_test_clobber_xmm4 +.align 16 +abi_test_clobber_xmm4: + pxor %xmm4,%xmm4 + .byte 0xf3,0xc3 +.size abi_test_clobber_xmm4,.-abi_test_clobber_xmm4 +.type abi_test_clobber_xmm5, @function +.globl abi_test_clobber_xmm5 +.hidden abi_test_clobber_xmm5 +.align 16 +abi_test_clobber_xmm5: + pxor %xmm5,%xmm5 + .byte 0xf3,0xc3 +.size abi_test_clobber_xmm5,.-abi_test_clobber_xmm5 +.type abi_test_clobber_xmm6, @function +.globl abi_test_clobber_xmm6 +.hidden abi_test_clobber_xmm6 +.align 16 +abi_test_clobber_xmm6: + pxor %xmm6,%xmm6 + .byte 0xf3,0xc3 +.size abi_test_clobber_xmm6,.-abi_test_clobber_xmm6 +.type abi_test_clobber_xmm7, @function +.globl abi_test_clobber_xmm7 +.hidden abi_test_clobber_xmm7 +.align 16 +abi_test_clobber_xmm7: + pxor %xmm7,%xmm7 + .byte 0xf3,0xc3 +.size abi_test_clobber_xmm7,.-abi_test_clobber_xmm7 +.type abi_test_clobber_xmm8, @function +.globl abi_test_clobber_xmm8 +.hidden abi_test_clobber_xmm8 +.align 16 +abi_test_clobber_xmm8: + pxor %xmm8,%xmm8 + .byte 0xf3,0xc3 +.size abi_test_clobber_xmm8,.-abi_test_clobber_xmm8 +.type abi_test_clobber_xmm9, @function +.globl abi_test_clobber_xmm9 +.hidden abi_test_clobber_xmm9 +.align 16 +abi_test_clobber_xmm9: + pxor %xmm9,%xmm9 + .byte 0xf3,0xc3 +.size abi_test_clobber_xmm9,.-abi_test_clobber_xmm9 +.type abi_test_clobber_xmm10, @function +.globl abi_test_clobber_xmm10 +.hidden abi_test_clobber_xmm10 +.align 16 +abi_test_clobber_xmm10: + pxor %xmm10,%xmm10 + .byte 0xf3,0xc3 +.size abi_test_clobber_xmm10,.-abi_test_clobber_xmm10 +.type abi_test_clobber_xmm11, @function +.globl abi_test_clobber_xmm11 +.hidden abi_test_clobber_xmm11 +.align 16 +abi_test_clobber_xmm11: + pxor %xmm11,%xmm11 + .byte 0xf3,0xc3 +.size abi_test_clobber_xmm11,.-abi_test_clobber_xmm11 +.type abi_test_clobber_xmm12, @function +.globl abi_test_clobber_xmm12 +.hidden abi_test_clobber_xmm12 +.align 16 +abi_test_clobber_xmm12: + pxor %xmm12,%xmm12 + .byte 0xf3,0xc3 +.size abi_test_clobber_xmm12,.-abi_test_clobber_xmm12 +.type abi_test_clobber_xmm13, @function +.globl abi_test_clobber_xmm13 +.hidden abi_test_clobber_xmm13 +.align 16 +abi_test_clobber_xmm13: + pxor %xmm13,%xmm13 + .byte 0xf3,0xc3 +.size abi_test_clobber_xmm13,.-abi_test_clobber_xmm13 +.type abi_test_clobber_xmm14, @function +.globl abi_test_clobber_xmm14 +.hidden abi_test_clobber_xmm14 +.align 16 +abi_test_clobber_xmm14: + pxor %xmm14,%xmm14 + .byte 0xf3,0xc3 +.size abi_test_clobber_xmm14,.-abi_test_clobber_xmm14 +.type abi_test_clobber_xmm15, @function +.globl abi_test_clobber_xmm15 +.hidden abi_test_clobber_xmm15 +.align 16 
+abi_test_clobber_xmm15: + pxor %xmm15,%xmm15 + .byte 0xf3,0xc3 +.size abi_test_clobber_xmm15,.-abi_test_clobber_xmm15 + + + +.type abi_test_bad_unwind_wrong_register, @function +.globl abi_test_bad_unwind_wrong_register +.hidden abi_test_bad_unwind_wrong_register +.align 16 +abi_test_bad_unwind_wrong_register: +.cfi_startproc +.Labi_test_bad_unwind_wrong_register_seh_begin: + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-16 +.Labi_test_bad_unwind_wrong_register_seh_push_r13: + + + + nop + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + .byte 0xf3,0xc3 +.Labi_test_bad_unwind_wrong_register_seh_end: +.cfi_endproc +.size abi_test_bad_unwind_wrong_register,.-abi_test_bad_unwind_wrong_register + + + + +.type abi_test_bad_unwind_temporary, @function +.globl abi_test_bad_unwind_temporary +.hidden abi_test_bad_unwind_temporary +.align 16 +abi_test_bad_unwind_temporary: +.cfi_startproc +.Labi_test_bad_unwind_temporary_seh_begin: + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 +.Labi_test_bad_unwind_temporary_seh_push_r12: + + movq %r12,%rax + incq %rax + movq %rax,(%rsp) + + + + movq %r12,(%rsp) + + + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + .byte 0xf3,0xc3 +.Labi_test_bad_unwind_temporary_seh_end: +.cfi_endproc +.size abi_test_bad_unwind_temporary,.-abi_test_bad_unwind_temporary + + + + +.type abi_test_set_direction_flag, @function +.globl abi_test_get_and_clear_direction_flag +.hidden abi_test_get_and_clear_direction_flag +abi_test_get_and_clear_direction_flag: + pushfq + popq %rax + andq $0x400,%rax + shrq $10,%rax + cld + .byte 0xf3,0xc3 +.size abi_test_get_and_clear_direction_flag,.-abi_test_get_and_clear_direction_flag + + + +.type abi_test_set_direction_flag, @function +.globl abi_test_set_direction_flag +.hidden abi_test_set_direction_flag +abi_test_set_direction_flag: + std + .byte 0xf3,0xc3 +.size abi_test_set_direction_flag,.-abi_test_set_direction_flag +#endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/third_party/sike/asm/fp-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/third_party/sike/asm/fp-x86_64.S new file mode 100644 index 0000000000..07f708aa72 --- /dev/null +++ b/packager/third_party/boringssl/linux-x86_64/crypto/third_party/sike/asm/fp-x86_64.S @@ -0,0 +1,1871 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text + + +.Lp434x2: +.quad 0xFFFFFFFFFFFFFFFE +.quad 0xFFFFFFFFFFFFFFFF +.quad 0xFB82ECF5C5FFFFFF +.quad 0xF78CB8F062B15D47 +.quad 0xD9F8BFAD038A40AC +.quad 0x0004683E4E2EE688 + + +.Lp434p1: +.quad 0xFDC1767AE3000000 +.quad 0x7BC65C783158AEA3 +.quad 0x6CFC5FD681C52056 +.quad 0x0002341F27177344 + +.extern OPENSSL_ia32cap_P +.hidden OPENSSL_ia32cap_P +.hidden OPENSSL_ia32cap_P +.globl sike_fpadd +.hidden sike_fpadd +.type sike_fpadd,@function +sike_fpadd: +.cfi_startproc + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset r12, -16 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset r13, -24 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset r14, -32 + + xorq %rax,%rax + + movq 0(%rdi),%r8 + addq 0(%rsi),%r8 + movq 8(%rdi),%r9 + adcq 8(%rsi),%r9 + movq 16(%rdi),%r10 + adcq 16(%rsi),%r10 + movq 24(%rdi),%r11 + adcq 24(%rsi),%r11 + movq 32(%rdi),%r12 + adcq 32(%rsi),%r12 + movq 40(%rdi),%r13 + adcq 40(%rsi),%r13 + movq 48(%rdi),%r14 + adcq 48(%rsi),%r14 + + movq .Lp434x2(%rip),%rcx + subq %rcx,%r8 + movq 8+.Lp434x2(%rip),%rcx + sbbq %rcx,%r9 + sbbq %rcx,%r10 + movq 16+.Lp434x2(%rip),%rcx + sbbq %rcx,%r11 + movq 24+.Lp434x2(%rip),%rcx + sbbq %rcx,%r12 + movq 32+.Lp434x2(%rip),%rcx + sbbq %rcx,%r13 + movq 40+.Lp434x2(%rip),%rcx + sbbq %rcx,%r14 + + sbbq $0,%rax + + movq .Lp434x2(%rip),%rdi + andq %rax,%rdi + movq 8+.Lp434x2(%rip),%rsi + andq %rax,%rsi + movq 16+.Lp434x2(%rip),%rcx + andq %rax,%rcx + + addq %rdi,%r8 + movq %r8,0(%rdx) + adcq %rsi,%r9 + movq %r9,8(%rdx) + adcq %rsi,%r10 + movq %r10,16(%rdx) + adcq %rcx,%r11 + movq %r11,24(%rdx) + + setc %cl + movq 24+.Lp434x2(%rip),%r8 + andq %rax,%r8 + movq 32+.Lp434x2(%rip),%r9 + andq %rax,%r9 + movq 40+.Lp434x2(%rip),%r10 + andq %rax,%r10 + btq $0,%rcx + + adcq %r8,%r12 + movq %r12,32(%rdx) + adcq %r9,%r13 + movq %r13,40(%rdx) + adcq %r10,%r14 + movq %r14,48(%rdx) + + popq %r14 +.cfi_adjust_cfa_offset -8 + popq %r13 +.cfi_adjust_cfa_offset -8 + popq %r12 +.cfi_adjust_cfa_offset -8 + .byte 0xf3,0xc3 +.cfi_endproc +.globl sike_cswap_asm +.hidden sike_cswap_asm +.type sike_cswap_asm,@function +sike_cswap_asm: + + + movq %rdx,%xmm3 + + + + + + pshufd $68,%xmm3,%xmm3 + + movdqu 0(%rdi),%xmm0 + movdqu 0(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,0(%rdi) + movdqu %xmm1,0(%rsi) + + movdqu 16(%rdi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,16(%rdi) + movdqu %xmm1,16(%rsi) + + movdqu 32(%rdi),%xmm0 + movdqu 32(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,32(%rdi) + movdqu %xmm1,32(%rsi) + + movdqu 48(%rdi),%xmm0 + movdqu 48(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,48(%rdi) + movdqu %xmm1,48(%rsi) + + movdqu 64(%rdi),%xmm0 + movdqu 64(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,64(%rdi) + movdqu %xmm1,64(%rsi) + + movdqu 80(%rdi),%xmm0 + movdqu 80(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,80(%rdi) + movdqu %xmm1,80(%rsi) + 
+ movdqu 96(%rdi),%xmm0 + movdqu 96(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,96(%rdi) + movdqu %xmm1,96(%rsi) + + movdqu 112(%rdi),%xmm0 + movdqu 112(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,112(%rdi) + movdqu %xmm1,112(%rsi) + + movdqu 128(%rdi),%xmm0 + movdqu 128(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,128(%rdi) + movdqu %xmm1,128(%rsi) + + movdqu 144(%rdi),%xmm0 + movdqu 144(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,144(%rdi) + movdqu %xmm1,144(%rsi) + + movdqu 160(%rdi),%xmm0 + movdqu 160(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,160(%rdi) + movdqu %xmm1,160(%rsi) + + movdqu 176(%rdi),%xmm0 + movdqu 176(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,176(%rdi) + movdqu %xmm1,176(%rsi) + + movdqu 192(%rdi),%xmm0 + movdqu 192(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,192(%rdi) + movdqu %xmm1,192(%rsi) + + movdqu 208(%rdi),%xmm0 + movdqu 208(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,208(%rdi) + movdqu %xmm1,208(%rsi) + + .byte 0xf3,0xc3 +.globl sike_fpsub +.hidden sike_fpsub +.type sike_fpsub,@function +sike_fpsub: +.cfi_startproc + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset r12, -16 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset r13, -24 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset r14, -32 + + xorq %rax,%rax + + movq 0(%rdi),%r8 + subq 0(%rsi),%r8 + movq 8(%rdi),%r9 + sbbq 8(%rsi),%r9 + movq 16(%rdi),%r10 + sbbq 16(%rsi),%r10 + movq 24(%rdi),%r11 + sbbq 24(%rsi),%r11 + movq 32(%rdi),%r12 + sbbq 32(%rsi),%r12 + movq 40(%rdi),%r13 + sbbq 40(%rsi),%r13 + movq 48(%rdi),%r14 + sbbq 48(%rsi),%r14 + + sbbq $0x0,%rax + + movq .Lp434x2(%rip),%rdi + andq %rax,%rdi + movq 8+.Lp434x2(%rip),%rsi + andq %rax,%rsi + movq 16+.Lp434x2(%rip),%rcx + andq %rax,%rcx + + addq %rdi,%r8 + movq %r8,0(%rdx) + adcq %rsi,%r9 + movq %r9,8(%rdx) + adcq %rsi,%r10 + movq %r10,16(%rdx) + adcq %rcx,%r11 + movq %r11,24(%rdx) + + setc %cl + movq 24+.Lp434x2(%rip),%r8 + andq %rax,%r8 + movq 32+.Lp434x2(%rip),%r9 + andq %rax,%r9 + movq 40+.Lp434x2(%rip),%r10 + andq %rax,%r10 + btq $0x0,%rcx + + adcq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + + popq %r14 +.cfi_adjust_cfa_offset -8 + popq %r13 +.cfi_adjust_cfa_offset -8 + popq %r12 +.cfi_adjust_cfa_offset -8 + .byte 0xf3,0xc3 +.cfi_endproc +.globl sike_mpadd_asm +.hidden sike_mpadd_asm +.type sike_mpadd_asm,@function +sike_mpadd_asm: +.cfi_startproc + movq 0(%rdi),%r8; + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%rcx + addq 0(%rsi),%r8 + adcq 8(%rsi),%r9 + adcq 16(%rsi),%r10 + adcq 24(%rsi),%r11 + adcq 32(%rsi),%rcx + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %rcx,32(%rdx) + + movq 40(%rdi),%r8 + movq 48(%rdi),%r9 + adcq 40(%rsi),%r8 + adcq 48(%rsi),%r9 + movq %r8,40(%rdx) + movq %r9,48(%rdx) + .byte 0xf3,0xc3 +.cfi_endproc +.globl sike_mpsubx2_asm +.hidden sike_mpsubx2_asm 
+.type sike_mpsubx2_asm,@function +sike_mpsubx2_asm: +.cfi_startproc + xorq %rax,%rax + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%rcx + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %rcx,32(%rdx) + + movq 40(%rdi),%r8 + movq 48(%rdi),%r9 + movq 56(%rdi),%r10 + movq 64(%rdi),%r11 + movq 72(%rdi),%rcx + sbbq 40(%rsi),%r8 + sbbq 48(%rsi),%r9 + sbbq 56(%rsi),%r10 + sbbq 64(%rsi),%r11 + sbbq 72(%rsi),%rcx + movq %r8,40(%rdx) + movq %r9,48(%rdx) + movq %r10,56(%rdx) + movq %r11,64(%rdx) + movq %rcx,72(%rdx) + + movq 80(%rdi),%r8 + movq 88(%rdi),%r9 + movq 96(%rdi),%r10 + movq 104(%rdi),%r11 + sbbq 80(%rsi),%r8 + sbbq 88(%rsi),%r9 + sbbq 96(%rsi),%r10 + sbbq 104(%rsi),%r11 + sbbq $0x0,%rax + movq %r8,80(%rdx) + movq %r9,88(%rdx) + movq %r10,96(%rdx) + movq %r11,104(%rdx) + .byte 0xf3,0xc3 +.cfi_endproc +.globl sike_mpdblsubx2_asm +.hidden sike_mpdblsubx2_asm +.type sike_mpdblsubx2_asm,@function +sike_mpdblsubx2_asm: +.cfi_startproc + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset r12, -16 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset r13, -24 + + xorq %rax,%rax + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + movq 48(%rdx),%rcx + subq 0(%rdi),%r8 + sbbq 8(%rdi),%r9 + sbbq 16(%rdi),%r10 + sbbq 24(%rdi),%r11 + sbbq 32(%rdi),%r12 + sbbq 40(%rdi),%r13 + sbbq 48(%rdi),%rcx + adcq $0x0,%rax + + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%r12 + sbbq 40(%rsi),%r13 + sbbq 48(%rsi),%rcx + adcq $0x0,%rax + + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %rcx,48(%rdx) + + + movq 56(%rdx),%r8 + movq 64(%rdx),%r9 + movq 72(%rdx),%r10 + movq 80(%rdx),%r11 + movq 88(%rdx),%r12 + movq 96(%rdx),%r13 + movq 104(%rdx),%rcx + + subq %rax,%r8 + sbbq 56(%rdi),%r8 + sbbq 64(%rdi),%r9 + sbbq 72(%rdi),%r10 + sbbq 80(%rdi),%r11 + sbbq 88(%rdi),%r12 + sbbq 96(%rdi),%r13 + sbbq 104(%rdi),%rcx + + + subq 56(%rsi),%r8 + sbbq 64(%rsi),%r9 + sbbq 72(%rsi),%r10 + sbbq 80(%rsi),%r11 + sbbq 88(%rsi),%r12 + sbbq 96(%rsi),%r13 + sbbq 104(%rsi),%rcx + + + movq %r8,56(%rdx) + movq %r9,64(%rdx) + movq %r10,72(%rdx) + movq %r11,80(%rdx) + movq %r12,88(%rdx) + movq %r13,96(%rdx) + movq %rcx,104(%rdx) + + popq %r13 +.cfi_adjust_cfa_offset -8 + popq %r12 +.cfi_adjust_cfa_offset -8 + .byte 0xf3,0xc3 +.cfi_endproc + +.Lrdc_bdw: +.cfi_startproc + +.cfi_adjust_cfa_offset 32 +.cfi_offset r12, -16 +.cfi_offset r13, -24 +.cfi_offset r14, -32 +.cfi_offset r15, -40 + + xorq %rax,%rax + movq 0+0(%rdi),%rdx + mulxq 0+.Lp434p1(%rip),%r8,%r9 + mulxq 8+.Lp434p1(%rip),%r12,%r10 + mulxq 16+.Lp434p1(%rip),%r13,%r11 + + adoxq %r12,%r9 + adoxq %r13,%r10 + + mulxq 24+.Lp434p1(%rip),%r13,%r12 + adoxq %r13,%r11 + adoxq %rax,%r12 + + xorq %rax,%rax + movq 0+8(%rdi),%rdx + mulxq 0+.Lp434p1(%rip),%r13,%rcx + adcxq %r13,%r9 + adcxq %rcx,%r10 + + mulxq 8+.Lp434p1(%rip),%rcx,%r13 + adcxq %r13,%r11 + adoxq %rcx,%r10 + + mulxq 16+.Lp434p1(%rip),%rcx,%r13 + adcxq %r13,%r12 + adoxq %rcx,%r11 + + mulxq 24+.Lp434p1(%rip),%rcx,%r13 + adcxq %rax,%r13 + adoxq %rcx,%r12 + adoxq %rax,%r13 + + xorq %rcx,%rcx + addq 24(%rdi),%r8 + adcq 32(%rdi),%r9 + adcq 40(%rdi),%r10 + adcq 48(%rdi),%r11 + adcq 56(%rdi),%r12 + adcq 64(%rdi),%r13 + adcq 72(%rdi),%rcx + movq 
%r8,24(%rdi) + movq %r9,32(%rdi) + movq %r10,40(%rdi) + movq %r11,48(%rdi) + movq %r12,56(%rdi) + movq %r13,64(%rdi) + movq %rcx,72(%rdi) + movq 80(%rdi),%r8 + movq 88(%rdi),%r9 + movq 96(%rdi),%r10 + movq 104(%rdi),%r11 + adcq $0x0,%r8 + adcq $0x0,%r9 + adcq $0x0,%r10 + adcq $0x0,%r11 + movq %r8,80(%rdi) + movq %r9,88(%rdi) + movq %r10,96(%rdi) + movq %r11,104(%rdi) + + xorq %rax,%rax + movq 16+0(%rdi),%rdx + mulxq 0+.Lp434p1(%rip),%r8,%r9 + mulxq 8+.Lp434p1(%rip),%r12,%r10 + mulxq 16+.Lp434p1(%rip),%r13,%r11 + + adoxq %r12,%r9 + adoxq %r13,%r10 + + mulxq 24+.Lp434p1(%rip),%r13,%r12 + adoxq %r13,%r11 + adoxq %rax,%r12 + + xorq %rax,%rax + movq 16+8(%rdi),%rdx + mulxq 0+.Lp434p1(%rip),%r13,%rcx + adcxq %r13,%r9 + adcxq %rcx,%r10 + + mulxq 8+.Lp434p1(%rip),%rcx,%r13 + adcxq %r13,%r11 + adoxq %rcx,%r10 + + mulxq 16+.Lp434p1(%rip),%rcx,%r13 + adcxq %r13,%r12 + adoxq %rcx,%r11 + + mulxq 24+.Lp434p1(%rip),%rcx,%r13 + adcxq %rax,%r13 + adoxq %rcx,%r12 + adoxq %rax,%r13 + + xorq %rcx,%rcx + addq 40(%rdi),%r8 + adcq 48(%rdi),%r9 + adcq 56(%rdi),%r10 + adcq 64(%rdi),%r11 + adcq 72(%rdi),%r12 + adcq 80(%rdi),%r13 + adcq 88(%rdi),%rcx + movq %r8,40(%rdi) + movq %r9,48(%rdi) + movq %r10,56(%rdi) + movq %r11,64(%rdi) + movq %r12,72(%rdi) + movq %r13,80(%rdi) + movq %rcx,88(%rdi) + movq 96(%rdi),%r8 + movq 104(%rdi),%r9 + adcq $0x0,%r8 + adcq $0x0,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + + xorq %rax,%rax + movq 32+0(%rdi),%rdx + mulxq 0+.Lp434p1(%rip),%r8,%r9 + mulxq 8+.Lp434p1(%rip),%r12,%r10 + mulxq 16+.Lp434p1(%rip),%r13,%r11 + + adoxq %r12,%r9 + adoxq %r13,%r10 + + mulxq 24+.Lp434p1(%rip),%r13,%r12 + adoxq %r13,%r11 + adoxq %rax,%r12 + + xorq %rax,%rax + movq 32+8(%rdi),%rdx + mulxq 0+.Lp434p1(%rip),%r13,%rcx + adcxq %r13,%r9 + adcxq %rcx,%r10 + + mulxq 8+.Lp434p1(%rip),%rcx,%r13 + adcxq %r13,%r11 + adoxq %rcx,%r10 + + mulxq 16+.Lp434p1(%rip),%rcx,%r13 + adcxq %r13,%r12 + adoxq %rcx,%r11 + + mulxq 24+.Lp434p1(%rip),%rcx,%r13 + adcxq %rax,%r13 + adoxq %rcx,%r12 + adoxq %rax,%r13 + + xorq %rcx,%rcx + addq 56(%rdi),%r8 + adcq 64(%rdi),%r9 + adcq 72(%rdi),%r10 + adcq 80(%rdi),%r11 + adcq 88(%rdi),%r12 + adcq 96(%rdi),%r13 + adcq 104(%rdi),%rcx + movq %r8,0(%rsi) + movq %r9,8(%rsi) + movq %r10,72(%rdi) + movq %r11,80(%rdi) + movq %r12,88(%rdi) + movq %r13,96(%rdi) + movq %rcx,104(%rdi) + + xorq %rax,%rax + movq 48(%rdi),%rdx + mulxq 0+.Lp434p1(%rip),%r8,%r9 + mulxq 8+.Lp434p1(%rip),%r12,%r10 + mulxq 16+.Lp434p1(%rip),%r13,%r11 + + adoxq %r12,%r9 + adoxq %r13,%r10 + + mulxq 24+.Lp434p1(%rip),%r13,%r12 + adoxq %r13,%r11 + adoxq %rax,%r12 + + addq 72(%rdi),%r8 + adcq 80(%rdi),%r9 + adcq 88(%rdi),%r10 + adcq 96(%rdi),%r11 + adcq 104(%rdi),%r12 + movq %r8,16(%rsi) + movq %r9,24(%rsi) + movq %r10,32(%rsi) + movq %r11,40(%rsi) + movq %r12,48(%rsi) + + + popq %r15 +.cfi_adjust_cfa_offset -8 +.cfi_same_value r15 + popq %r14 +.cfi_adjust_cfa_offset -8 +.cfi_same_value r14 + popq %r13 +.cfi_adjust_cfa_offset -8 +.cfi_same_value r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_same_value r12 + .byte 0xf3,0xc3 +.cfi_endproc +.globl sike_fprdc +.hidden sike_fprdc +.type sike_fprdc,@function +sike_fprdc: +.cfi_startproc + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset r12, -16 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset r13, -24 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset r14, -32 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset r15, -40 + + + + leaq OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx + cmpl $0x80100,%ecx + je .Lrdc_bdw + + + + + movq 0+0(%rdi),%r14 + movq 
0+.Lp434p1(%rip),%rax + mulq %r14 + xorq %r10,%r10 + movq %rax,%r8 + movq %rdx,%r9 + + + movq 8+.Lp434p1(%rip),%rax + mulq %r14 + xorq %r11,%r11 + addq %rax,%r9 + adcq %rdx,%r10 + + + movq 0+8(%rdi),%rcx + movq 0+.Lp434p1(%rip),%rax + mulq %rcx + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r11 + + + xorq %r12,%r12 + movq 16+.Lp434p1(%rip),%rax + mulq %r14 + addq %rax,%r10 + adcq %rdx,%r11 + adcq $0x0,%r12 + + + movq 8+.Lp434p1(%rip),%rax + mulq %rcx + addq %rax,%r10 + adcq %rdx,%r11 + adcq $0x0,%r12 + + + movq 24+.Lp434p1(%rip),%rax + mulq %r14 + xorq %r13,%r13 + addq %rax,%r11 + adcq %rdx,%r12 + adcq $0x0,%r13 + + + movq 16+.Lp434p1(%rip),%rax + mulq %rcx + addq %rax,%r11 + adcq %rdx,%r12 + adcq $0x0,%r13 + + + movq 24+.Lp434p1(%rip),%rax + mulq %rcx + addq %rax,%r12 + adcq %rdx,%r13 + + + xorq %rcx,%rcx + addq 24(%rdi),%r8 + adcq 32(%rdi),%r9 + adcq 40(%rdi),%r10 + adcq 48(%rdi),%r11 + adcq 56(%rdi),%r12 + adcq 64(%rdi),%r13 + adcq 72(%rdi),%rcx + movq %r8,24(%rdi) + movq %r9,32(%rdi) + movq %r10,40(%rdi) + movq %r11,48(%rdi) + movq %r12,56(%rdi) + movq %r13,64(%rdi) + movq %rcx,72(%rdi) + movq 80(%rdi),%r8 + movq 88(%rdi),%r9 + movq 96(%rdi),%r10 + movq 104(%rdi),%r11 + adcq $0x0,%r8 + adcq $0x0,%r9 + adcq $0x0,%r10 + adcq $0x0,%r11 + movq %r8,80(%rdi) + movq %r9,88(%rdi) + movq %r10,96(%rdi) + movq %r11,104(%rdi) + + + movq 16+0(%rdi),%r14 + movq 0+.Lp434p1(%rip),%rax + mulq %r14 + xorq %r10,%r10 + movq %rax,%r8 + movq %rdx,%r9 + + + movq 8+.Lp434p1(%rip),%rax + mulq %r14 + xorq %r11,%r11 + addq %rax,%r9 + adcq %rdx,%r10 + + + movq 16+8(%rdi),%rcx + movq 0+.Lp434p1(%rip),%rax + mulq %rcx + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r11 + + + xorq %r12,%r12 + movq 16+.Lp434p1(%rip),%rax + mulq %r14 + addq %rax,%r10 + adcq %rdx,%r11 + adcq $0x0,%r12 + + + movq 8+.Lp434p1(%rip),%rax + mulq %rcx + addq %rax,%r10 + adcq %rdx,%r11 + adcq $0x0,%r12 + + + movq 24+.Lp434p1(%rip),%rax + mulq %r14 + xorq %r13,%r13 + addq %rax,%r11 + adcq %rdx,%r12 + adcq $0x0,%r13 + + + movq 16+.Lp434p1(%rip),%rax + mulq %rcx + addq %rax,%r11 + adcq %rdx,%r12 + adcq $0x0,%r13 + + + movq 24+.Lp434p1(%rip),%rax + mulq %rcx + addq %rax,%r12 + adcq %rdx,%r13 + + + xorq %rcx,%rcx + addq 40(%rdi),%r8 + adcq 48(%rdi),%r9 + adcq 56(%rdi),%r10 + adcq 64(%rdi),%r11 + adcq 72(%rdi),%r12 + adcq 80(%rdi),%r13 + adcq 88(%rdi),%rcx + movq %r8,40(%rdi) + movq %r9,48(%rdi) + movq %r10,56(%rdi) + movq %r11,64(%rdi) + movq %r12,72(%rdi) + movq %r13,80(%rdi) + movq %rcx,88(%rdi) + movq 96(%rdi),%r8 + movq 104(%rdi),%r9 + adcq $0x0,%r8 + adcq $0x0,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + + + movq 32+0(%rdi),%r14 + movq 0+.Lp434p1(%rip),%rax + mulq %r14 + xorq %r10,%r10 + movq %rax,%r8 + movq %rdx,%r9 + + + movq 8+.Lp434p1(%rip),%rax + mulq %r14 + xorq %r11,%r11 + addq %rax,%r9 + adcq %rdx,%r10 + + + movq 32+8(%rdi),%rcx + movq 0+.Lp434p1(%rip),%rax + mulq %rcx + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r11 + + + xorq %r12,%r12 + movq 16+.Lp434p1(%rip),%rax + mulq %r14 + addq %rax,%r10 + adcq %rdx,%r11 + adcq $0x0,%r12 + + + movq 8+.Lp434p1(%rip),%rax + mulq %rcx + addq %rax,%r10 + adcq %rdx,%r11 + adcq $0x0,%r12 + + + movq 24+.Lp434p1(%rip),%rax + mulq %r14 + xorq %r13,%r13 + addq %rax,%r11 + adcq %rdx,%r12 + adcq $0x0,%r13 + + + movq 16+.Lp434p1(%rip),%rax + mulq %rcx + addq %rax,%r11 + adcq %rdx,%r12 + adcq $0x0,%r13 + + + movq 24+.Lp434p1(%rip),%rax + mulq %rcx + addq %rax,%r12 + adcq %rdx,%r13 + + + xorq %rcx,%rcx + addq 56(%rdi),%r8 + adcq 64(%rdi),%r9 + adcq 72(%rdi),%r10 + adcq 80(%rdi),%r11 + adcq 88(%rdi),%r12 + 
adcq 96(%rdi),%r13 + adcq 104(%rdi),%rcx + movq %r8,0(%rsi) + movq %r9,8(%rsi) + movq %r10,72(%rdi) + movq %r11,80(%rdi) + movq %r12,88(%rdi) + movq %r13,96(%rdi) + movq %rcx,104(%rdi) + + movq 48(%rdi),%r13 + + xorq %r10,%r10 + movq 0+.Lp434p1(%rip),%rax + mulq %r13 + movq %rax,%r8 + movq %rdx,%r9 + + xorq %r11,%r11 + movq 8+.Lp434p1(%rip),%rax + mulq %r13 + addq %rax,%r9 + adcq %rdx,%r10 + + xorq %r12,%r12 + movq 16+.Lp434p1(%rip),%rax + mulq %r13 + addq %rax,%r10 + adcq %rdx,%r11 + + movq 24+.Lp434p1(%rip),%rax + mulq %r13 + addq %rax,%r11 + adcq %rdx,%r12 + + addq 72(%rdi),%r8 + adcq 80(%rdi),%r9 + adcq 88(%rdi),%r10 + adcq 96(%rdi),%r11 + adcq 104(%rdi),%r12 + movq %r8,16(%rsi) + movq %r9,24(%rsi) + movq %r10,32(%rsi) + movq %r11,40(%rsi) + movq %r12,48(%rsi) + + + popq %r15 +.cfi_adjust_cfa_offset -8 + popq %r14 +.cfi_adjust_cfa_offset -8 + popq %r13 +.cfi_adjust_cfa_offset -8 + popq %r12 +.cfi_adjust_cfa_offset -8 + .byte 0xf3,0xc3 +.cfi_endproc +.Lmul_bdw: +.cfi_startproc + +.cfi_adjust_cfa_offset 32 +.cfi_offset r12, -16 +.cfi_offset r13, -24 +.cfi_offset r14, -32 +.cfi_offset r15, -40 + + + movq %rdx,%rcx + xorq %rax,%rax + + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset rbx, -48 + pushq %rbp +.cfi_offset rbp, -56 +.cfi_adjust_cfa_offset 8 + subq $96,%rsp +.cfi_adjust_cfa_offset 96 + + addq 32(%rdi),%r8 + adcq 40(%rdi),%r9 + adcq 48(%rdi),%r10 + adcq $0x0,%r11 + sbbq $0x0,%rax + movq %r8,0(%rsp) + movq %r9,8(%rsp) + movq %r10,16(%rsp) + movq %r11,24(%rsp) + + + xorq %rbx,%rbx + movq 0(%rsi),%r12 + movq 8(%rsi),%r13 + movq 16(%rsi),%r14 + movq 24(%rsi),%r15 + addq 32(%rsi),%r12 + adcq 40(%rsi),%r13 + adcq 48(%rsi),%r14 + adcq $0x0,%r15 + sbbq $0x0,%rbx + movq %r12,32(%rsp) + movq %r13,40(%rsp) + movq %r14,48(%rsp) + movq %r15,56(%rsp) + + + andq %rax,%r12 + andq %rax,%r13 + andq %rax,%r14 + andq %rax,%r15 + + + andq %rbx,%r8 + andq %rbx,%r9 + andq %rbx,%r10 + andq %rbx,%r11 + + + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + movq %r8,64(%rsp) + movq %r9,72(%rsp) + movq %r10,80(%rsp) + movq %r11,88(%rsp) + + + movq 0+0(%rsp),%rdx + mulxq 32+0(%rsp),%r9,%r8 + movq %r9,0+0(%rsp) + mulxq 32+8(%rsp),%r10,%r9 + xorq %rax,%rax + adoxq %r10,%r8 + mulxq 32+16(%rsp),%r11,%r10 + adoxq %r11,%r9 + mulxq 32+24(%rsp),%r12,%r11 + adoxq %r12,%r10 + + movq 0+8(%rsp),%rdx + mulxq 32+0(%rsp),%r12,%r13 + adoxq %rax,%r11 + xorq %rax,%rax + mulxq 32+8(%rsp),%r15,%r14 + adoxq %r8,%r12 + movq %r12,0+8(%rsp) + adcxq %r15,%r13 + mulxq 32+16(%rsp),%rbx,%r15 + adcxq %rbx,%r14 + adoxq %r9,%r13 + mulxq 32+24(%rsp),%rbp,%rbx + adcxq %rbp,%r15 + adcxq %rax,%rbx + adoxq %r10,%r14 + + movq 0+16(%rsp),%rdx + mulxq 32+0(%rsp),%r8,%r9 + adoxq %r11,%r15 + adoxq %rax,%rbx + xorq %rax,%rax + mulxq 32+8(%rsp),%r11,%r10 + adoxq %r13,%r8 + movq %r8,0+16(%rsp) + adcxq %r11,%r9 + mulxq 32+16(%rsp),%r12,%r11 + adcxq %r12,%r10 + adoxq %r14,%r9 + mulxq 32+24(%rsp),%rbp,%r12 + adcxq %rbp,%r11 + adcxq %rax,%r12 + + adoxq %r15,%r10 + adoxq %rbx,%r11 + adoxq %rax,%r12 + + movq 0+24(%rsp),%rdx + mulxq 32+0(%rsp),%r8,%r13 + xorq %rax,%rax + mulxq 32+8(%rsp),%r15,%r14 + adcxq %r15,%r13 + adoxq %r8,%r9 + mulxq 32+16(%rsp),%rbx,%r15 + adcxq %rbx,%r14 + adoxq %r13,%r10 + mulxq 32+24(%rsp),%rbp,%rbx + adcxq %rbp,%r15 + adcxq %rax,%rbx + adoxq %r14,%r11 + adoxq %r15,%r12 + adoxq %rax,%rbx + movq %r9,0+24(%rsp) + movq %r10,0+32(%rsp) + movq %r11,0+40(%rsp) + movq %r12,0+48(%rsp) + movq %rbx,0+56(%rsp) + + + + movq 0+0(%rdi),%rdx + 
mulxq 0+0(%rsi),%r9,%r8 + movq %r9,0+0(%rcx) + mulxq 0+8(%rsi),%r10,%r9 + xorq %rax,%rax + adoxq %r10,%r8 + mulxq 0+16(%rsi),%r11,%r10 + adoxq %r11,%r9 + mulxq 0+24(%rsi),%r12,%r11 + adoxq %r12,%r10 + + movq 0+8(%rdi),%rdx + mulxq 0+0(%rsi),%r12,%r13 + adoxq %rax,%r11 + xorq %rax,%rax + mulxq 0+8(%rsi),%r15,%r14 + adoxq %r8,%r12 + movq %r12,0+8(%rcx) + adcxq %r15,%r13 + mulxq 0+16(%rsi),%rbx,%r15 + adcxq %rbx,%r14 + adoxq %r9,%r13 + mulxq 0+24(%rsi),%rbp,%rbx + adcxq %rbp,%r15 + adcxq %rax,%rbx + adoxq %r10,%r14 + + movq 0+16(%rdi),%rdx + mulxq 0+0(%rsi),%r8,%r9 + adoxq %r11,%r15 + adoxq %rax,%rbx + xorq %rax,%rax + mulxq 0+8(%rsi),%r11,%r10 + adoxq %r13,%r8 + movq %r8,0+16(%rcx) + adcxq %r11,%r9 + mulxq 0+16(%rsi),%r12,%r11 + adcxq %r12,%r10 + adoxq %r14,%r9 + mulxq 0+24(%rsi),%rbp,%r12 + adcxq %rbp,%r11 + adcxq %rax,%r12 + + adoxq %r15,%r10 + adoxq %rbx,%r11 + adoxq %rax,%r12 + + movq 0+24(%rdi),%rdx + mulxq 0+0(%rsi),%r8,%r13 + xorq %rax,%rax + mulxq 0+8(%rsi),%r15,%r14 + adcxq %r15,%r13 + adoxq %r8,%r9 + mulxq 0+16(%rsi),%rbx,%r15 + adcxq %rbx,%r14 + adoxq %r13,%r10 + mulxq 0+24(%rsi),%rbp,%rbx + adcxq %rbp,%r15 + adcxq %rax,%rbx + adoxq %r14,%r11 + adoxq %r15,%r12 + adoxq %rax,%rbx + movq %r9,0+24(%rcx) + movq %r10,0+32(%rcx) + movq %r11,0+40(%rcx) + movq %r12,0+48(%rcx) + movq %rbx,0+56(%rcx) + + + + movq 32+0(%rdi),%rdx + mulxq 32+0(%rsi),%r9,%r8 + movq %r9,64+0(%rcx) + mulxq 32+8(%rsi),%r10,%r9 + xorq %rax,%rax + adoxq %r10,%r8 + mulxq 32+16(%rsi),%r11,%r10 + adoxq %r11,%r9 + + movq 32+8(%rdi),%rdx + mulxq 32+0(%rsi),%r12,%r11 + adoxq %rax,%r10 + xorq %rax,%rax + + mulxq 32+8(%rsi),%r14,%r13 + adoxq %r8,%r12 + movq %r12,64+8(%rcx) + adcxq %r14,%r11 + + mulxq 32+16(%rsi),%r8,%r14 + adoxq %r9,%r11 + adcxq %r8,%r13 + adcxq %rax,%r14 + adoxq %r10,%r13 + + movq 32+16(%rdi),%rdx + mulxq 32+0(%rsi),%r8,%r9 + adoxq %rax,%r14 + xorq %rax,%rax + + mulxq 32+8(%rsi),%r10,%r12 + adoxq %r11,%r8 + movq %r8,64+16(%rcx) + adcxq %r13,%r9 + + mulxq 32+16(%rsi),%r11,%r8 + adcxq %r14,%r12 + adcxq %rax,%r8 + adoxq %r10,%r9 + adoxq %r12,%r11 + adoxq %rax,%r8 + movq %r9,64+24(%rcx) + movq %r11,64+32(%rcx) + movq %r8,64+40(%rcx) + + + + + movq 64(%rsp),%r8 + movq 72(%rsp),%r9 + movq 80(%rsp),%r10 + movq 88(%rsp),%r11 + + movq 32(%rsp),%rax + addq %rax,%r8 + movq 40(%rsp),%rax + adcq %rax,%r9 + movq 48(%rsp),%rax + adcq %rax,%r10 + movq 56(%rsp),%rax + adcq %rax,%r11 + + + movq 0(%rsp),%r12 + movq 8(%rsp),%r13 + movq 16(%rsp),%r14 + movq 24(%rsp),%r15 + subq 0(%rcx),%r12 + sbbq 8(%rcx),%r13 + sbbq 16(%rcx),%r14 + sbbq 24(%rcx),%r15 + sbbq 32(%rcx),%r8 + sbbq 40(%rcx),%r9 + sbbq 48(%rcx),%r10 + sbbq 56(%rcx),%r11 + + + subq 64(%rcx),%r12 + sbbq 72(%rcx),%r13 + sbbq 80(%rcx),%r14 + sbbq 88(%rcx),%r15 + sbbq 96(%rcx),%r8 + sbbq 104(%rcx),%r9 + sbbq $0x0,%r10 + sbbq $0x0,%r11 + + addq 32(%rcx),%r12 + movq %r12,32(%rcx) + adcq 40(%rcx),%r13 + movq %r13,40(%rcx) + adcq 48(%rcx),%r14 + movq %r14,48(%rcx) + adcq 56(%rcx),%r15 + movq %r15,56(%rcx) + adcq 64(%rcx),%r8 + movq %r8,64(%rcx) + adcq 72(%rcx),%r9 + movq %r9,72(%rcx) + adcq 80(%rcx),%r10 + movq %r10,80(%rcx) + adcq 88(%rcx),%r11 + movq %r11,88(%rcx) + movq 96(%rcx),%r12 + adcq $0x0,%r12 + movq %r12,96(%rcx) + movq 104(%rcx),%r13 + adcq $0x0,%r13 + movq %r13,104(%rcx) + + addq $96,%rsp +.cfi_adjust_cfa_offset -96 + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_same_value rbp + popq %rbx +.cfi_adjust_cfa_offset -8 +.cfi_same_value rbx + + + popq %r15 +.cfi_adjust_cfa_offset -8 +.cfi_same_value r15 + popq %r14 +.cfi_adjust_cfa_offset -8 +.cfi_same_value r14 + 
popq %r13 +.cfi_adjust_cfa_offset -8 +.cfi_same_value r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_same_value r12 + .byte 0xf3,0xc3 +.cfi_endproc + +.globl sike_mpmul +.hidden sike_mpmul +.type sike_mpmul,@function +sike_mpmul: +.cfi_startproc + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset r12, -16 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset r13, -24 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset r14, -32 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset r15, -40 + + + + leaq OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx + cmpl $0x80100,%ecx + je .Lmul_bdw + + + + movq %rdx,%rcx + + subq $112,%rsp +.cfi_adjust_cfa_offset 112 + + + xorq %rax,%rax + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + xorq %r11,%r11 + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + + sbbq $0,%rax + movq %rax,64(%rsp) + + movq %r8,0(%rcx) + movq %r9,8(%rcx) + movq %r10,16(%rcx) + movq %r11,24(%rcx) + + + xorq %rdx,%rdx + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + xorq %r15,%r15 + addq 0(%rsi),%r12 + adcq 8(%rsi),%r13 + adcq 16(%rsi),%r14 + adcq 24(%rsi),%r15 + sbbq $0x0,%rdx + + movq %rdx,72(%rsp) + + + movq (%rcx),%rax + mulq %r12 + movq %rax,(%rsp) + movq %rdx,%r8 + + xorq %r9,%r9 + movq (%rcx),%rax + mulq %r13 + addq %rax,%r8 + adcq %rdx,%r9 + + xorq %r10,%r10 + movq 8(%rcx),%rax + mulq %r12 + addq %rax,%r8 + movq %r8,8(%rsp) + adcq %rdx,%r9 + adcq $0x0,%r10 + + xorq %r8,%r8 + movq (%rcx),%rax + mulq %r14 + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq 16(%rcx),%rax + mulq %r12 + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq 8(%rcx),%rax + mulq %r13 + addq %rax,%r9 + movq %r9,16(%rsp) + adcq %rdx,%r10 + adcq $0x0,%r8 + + xorq %r9,%r9 + movq (%rcx),%rax + mulq %r15 + addq %rax,%r10 + adcq %rdx,%r8 + adcq $0x0,%r9 + + movq 24(%rcx),%rax + mulq %r12 + addq %rax,%r10 + adcq %rdx,%r8 + adcq $0x0,%r9 + + movq 8(%rcx),%rax + mulq %r14 + addq %rax,%r10 + adcq %rdx,%r8 + adcq $0x0,%r9 + + movq 16(%rcx),%rax + mulq %r13 + addq %rax,%r10 + movq %r10,24(%rsp) + adcq %rdx,%r8 + adcq $0x0,%r9 + + xorq %r10,%r10 + movq 8(%rcx),%rax + mulq %r15 + addq %rax,%r8 + adcq %rdx,%r9 + adcq $0x0,%r10 + + movq 24(%rcx),%rax + mulq %r13 + addq %rax,%r8 + adcq %rdx,%r9 + adcq $0x0,%r10 + + movq 16(%rcx),%rax + mulq %r14 + addq %rax,%r8 + movq %r8,32(%rsp) + adcq %rdx,%r9 + adcq $0x0,%r10 + + xorq %r11,%r11 + movq 16(%rcx),%rax + mulq %r15 + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r11 + + movq 24(%rcx),%rax + mulq %r14 + addq %rax,%r9 + movq %r9,40(%rsp) + adcq %rdx,%r10 + adcq $0x0,%r11 + + movq 24(%rcx),%rax + mulq %r15 + addq %rax,%r10 + movq %r10,48(%rsp) + adcq %rdx,%r11 + movq %r11,56(%rsp) + + + movq 64(%rsp),%rax + andq %rax,%r12 + andq %rax,%r13 + andq %rax,%r14 + andq %rax,%r15 + + + movq 72(%rsp),%rax + movq 0(%rcx),%r8 + andq %rax,%r8 + movq 8(%rcx),%r9 + andq %rax,%r9 + movq 16(%rcx),%r10 + andq %rax,%r10 + movq 24(%rcx),%r11 + andq %rax,%r11 + + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + adcq %r11,%r15 + + + movq 32(%rsp),%rax + addq %rax,%r12 + movq 40(%rsp),%rax + adcq %rax,%r13 + movq 48(%rsp),%rax + adcq %rax,%r14 + movq 56(%rsp),%rax + adcq %rax,%r15 + movq %r12,80(%rsp) + movq %r13,88(%rsp) + movq %r14,96(%rsp) + movq %r15,104(%rsp) + + + movq (%rdi),%r11 + movq (%rsi),%rax + mulq %r11 + xorq %r9,%r9 + movq %rax,(%rcx) + movq %rdx,%r8 + + movq 16(%rdi),%r14 + movq 8(%rsi),%rax + mulq %r11 + xorq %r10,%r10 + addq %rax,%r8 + adcq %rdx,%r9 + + movq 8(%rdi),%r12 + movq (%rsi),%rax 
+ mulq %r12 + addq %rax,%r8 + movq %r8,8(%rcx) + adcq %rdx,%r9 + adcq $0x0,%r10 + + xorq %r8,%r8 + movq 16(%rsi),%rax + mulq %r11 + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq (%rsi),%r13 + movq %r14,%rax + mulq %r13 + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq 8(%rsi),%rax + mulq %r12 + addq %rax,%r9 + movq %r9,16(%rcx) + adcq %rdx,%r10 + adcq $0x0,%r8 + + xorq %r9,%r9 + movq 24(%rsi),%rax + mulq %r11 + movq 24(%rdi),%r15 + addq %rax,%r10 + adcq %rdx,%r8 + adcq $0x0,%r9 + + movq %r15,%rax + mulq %r13 + addq %rax,%r10 + adcq %rdx,%r8 + adcq $0x0,%r9 + + movq 16(%rsi),%rax + mulq %r12 + addq %rax,%r10 + adcq %rdx,%r8 + adcq $0x0,%r9 + + movq 8(%rsi),%rax + mulq %r14 + addq %rax,%r10 + movq %r10,24(%rcx) + adcq %rdx,%r8 + adcq $0x0,%r9 + + xorq %r10,%r10 + movq 24(%rsi),%rax + mulq %r12 + addq %rax,%r8 + adcq %rdx,%r9 + adcq $0x0,%r10 + + movq 8(%rsi),%rax + mulq %r15 + addq %rax,%r8 + adcq %rdx,%r9 + adcq $0x0,%r10 + + movq 16(%rsi),%rax + mulq %r14 + addq %rax,%r8 + movq %r8,32(%rcx) + adcq %rdx,%r9 + adcq $0x0,%r10 + + xorq %r8,%r8 + movq 24(%rsi),%rax + mulq %r14 + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq 16(%rsi),%rax + mulq %r15 + addq %rax,%r9 + movq %r9,40(%rcx) + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq 24(%rsi),%rax + mulq %r15 + addq %rax,%r10 + movq %r10,48(%rcx) + adcq %rdx,%r8 + movq %r8,56(%rcx) + + + + movq 32(%rdi),%r11 + movq 32(%rsi),%rax + mulq %r11 + xorq %r9,%r9 + movq %rax,64(%rcx) + movq %rdx,%r8 + + movq 48(%rdi),%r14 + movq 40(%rsi),%rax + mulq %r11 + xorq %r10,%r10 + addq %rax,%r8 + adcq %rdx,%r9 + + movq 40(%rdi),%r12 + movq 32(%rsi),%rax + mulq %r12 + addq %rax,%r8 + movq %r8,72(%rcx) + adcq %rdx,%r9 + adcq $0x0,%r10 + + xorq %r8,%r8 + movq 48(%rsi),%rax + mulq %r11 + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq 32(%rsi),%r13 + movq %r14,%rax + mulq %r13 + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq 40(%rsi),%rax + mulq %r12 + addq %rax,%r9 + movq %r9,80(%rcx) + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq 48(%rsi),%rax + mulq %r12 + xorq %r12,%r12 + addq %rax,%r10 + adcq %rdx,%r8 + adcq $0x0,%r12 + + movq 40(%rsi),%rax + mulq %r14 + addq %rax,%r10 + adcq %rdx,%r8 + adcq $0x0,%r12 + movq %r10,88(%rcx) + + movq 48(%rsi),%rax + mulq %r14 + addq %rax,%r8 + adcq $0x0,%r12 + movq %r8,96(%rcx) + + addq %r12,%rdx + + + movq 0(%rsp),%r8 + subq 0(%rcx),%r8 + movq 8(%rsp),%r9 + sbbq 8(%rcx),%r9 + movq 16(%rsp),%r10 + sbbq 16(%rcx),%r10 + movq 24(%rsp),%r11 + sbbq 24(%rcx),%r11 + movq 80(%rsp),%r12 + sbbq 32(%rcx),%r12 + movq 88(%rsp),%r13 + sbbq 40(%rcx),%r13 + movq 96(%rsp),%r14 + sbbq 48(%rcx),%r14 + movq 104(%rsp),%r15 + sbbq 56(%rcx),%r15 + + + movq 64(%rcx),%rax + subq %rax,%r8 + movq 72(%rcx),%rax + sbbq %rax,%r9 + movq 80(%rcx),%rax + sbbq %rax,%r10 + movq 88(%rcx),%rax + sbbq %rax,%r11 + movq 96(%rcx),%rax + sbbq %rax,%r12 + sbbq %rdx,%r13 + sbbq $0x0,%r14 + sbbq $0x0,%r15 + + + addq 32(%rcx),%r8 + movq %r8,32(%rcx) + adcq 40(%rcx),%r9 + movq %r9,40(%rcx) + adcq 48(%rcx),%r10 + movq %r10,48(%rcx) + adcq 56(%rcx),%r11 + movq %r11,56(%rcx) + adcq 64(%rcx),%r12 + movq %r12,64(%rcx) + adcq 72(%rcx),%r13 + movq %r13,72(%rcx) + adcq 80(%rcx),%r14 + movq %r14,80(%rcx) + adcq 88(%rcx),%r15 + movq %r15,88(%rcx) + movq 96(%rcx),%r12 + adcq $0x0,%r12 + movq %r12,96(%rcx) + adcq $0x0,%rdx + movq %rdx,104(%rcx) + + addq $112,%rsp +.cfi_adjust_cfa_offset -112 + + + popq %r15 +.cfi_adjust_cfa_offset -8 + popq %r14 +.cfi_adjust_cfa_offset -8 + popq %r13 +.cfi_adjust_cfa_offset -8 + popq %r12 +.cfi_adjust_cfa_offset -8 + 
.byte 0xf3,0xc3 +.cfi_endproc +#endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/mac-x86/crypto/chacha/chacha-x86.S b/packager/third_party/boringssl/mac-x86/crypto/chacha/chacha-x86.S index e87467caf0..bc324888b6 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/chacha/chacha-x86.S +++ b/packager/third_party/boringssl/mac-x86/crypto/chacha/chacha-x86.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl _ChaCha20_ctr32 .private_extern _ChaCha20_ctr32 diff --git a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/aes-586.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/aes-586.S index 4046251d3a..3634f64d11 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/aes-586.S +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/aes-586.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .private_extern __x86_AES_encrypt_compact .align 4 @@ -961,11 +967,11 @@ LAES_Te: .long 16,32,64,128 .long 27,54,0,0 .long 0,0,0,0 -.globl _asm_AES_encrypt -.private_extern _asm_AES_encrypt +.globl _aes_nohw_encrypt +.private_extern _aes_nohw_encrypt .align 4 -_asm_AES_encrypt: -L_asm_AES_encrypt_begin: +_aes_nohw_encrypt: +L_aes_nohw_encrypt_begin: pushl %ebp pushl %ebx pushl %esi @@ -2145,11 +2151,11 @@ LAES_Td: .byte 200,235,187,60,131,83,153,97 .byte 23,43,4,126,186,119,214,38 .byte 225,105,20,99,85,33,12,125 -.globl _asm_AES_decrypt -.private_extern _asm_AES_decrypt +.globl _aes_nohw_decrypt +.private_extern _aes_nohw_decrypt .align 4 -_asm_AES_decrypt: -L_asm_AES_decrypt_begin: +_aes_nohw_decrypt: +L_aes_nohw_decrypt_begin: pushl %ebp pushl %ebx pushl %esi @@ -2209,11 +2215,11 @@ L011x86: popl %ebx popl %ebp ret -.globl _asm_AES_cbc_encrypt -.private_extern _asm_AES_cbc_encrypt +.globl _aes_nohw_cbc_encrypt +.private_extern _aes_nohw_cbc_encrypt .align 4 -_asm_AES_cbc_encrypt: -L_asm_AES_cbc_encrypt_begin: +_aes_nohw_cbc_encrypt: +L_aes_nohw_cbc_encrypt_begin: pushl %ebp pushl %ebx pushl %esi @@ -2970,18 +2976,18 @@ L045exit: popl %ebx popl %ebp ret -.globl _asm_AES_set_encrypt_key -.private_extern _asm_AES_set_encrypt_key +.globl _aes_nohw_set_encrypt_key +.private_extern _aes_nohw_set_encrypt_key .align 4 -_asm_AES_set_encrypt_key: -L_asm_AES_set_encrypt_key_begin: +_aes_nohw_set_encrypt_key: +L_aes_nohw_set_encrypt_key_begin: call __x86_AES_set_encrypt_key ret -.globl _asm_AES_set_decrypt_key -.private_extern _asm_AES_set_decrypt_key +.globl _aes_nohw_set_decrypt_key +.private_extern _aes_nohw_set_decrypt_key .align 4 -_asm_AES_set_decrypt_key: -L_asm_AES_set_decrypt_key_begin: +_aes_nohw_set_decrypt_key: +L_aes_nohw_set_decrypt_key_begin: call __x86_AES_set_encrypt_key cmpl $0,%eax je L054proceed diff --git a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/aesni-x86.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/aesni-x86.S index 3fe0e7543f..db7efffdf8 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/aesni-x86.S +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/aesni-x86.S @@ -1,10 +1,30 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text -.globl _aesni_encrypt -.private_extern _aesni_encrypt +#ifdef BORINGSSL_DISPATCH_TEST +#endif +.globl _aes_hw_encrypt +.private_extern _aes_hw_encrypt .align 4 -_aesni_encrypt: -L_aesni_encrypt_begin: +_aes_hw_encrypt: +L_aes_hw_encrypt_begin: +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call L000pic +L000pic: + popl %ebx + leal _BORINGSSL_function_hit+1-L000pic(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif movl 4(%esp),%eax movl 12(%esp),%edx movups (%eax),%xmm2 @@ -14,23 +34,23 @@ L_aesni_encrypt_begin: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L000enc1_loop_1: +L001enc1_loop_1: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L000enc1_loop_1 + jnz L001enc1_loop_1 .byte 102,15,56,221,209 pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 movups %xmm2,(%eax) pxor %xmm2,%xmm2 ret -.globl _aesni_decrypt -.private_extern _aesni_decrypt +.globl _aes_hw_decrypt +.private_extern _aes_hw_decrypt .align 4 -_aesni_decrypt: -L_aesni_decrypt_begin: +_aes_hw_decrypt: +L_aes_hw_decrypt_begin: movl 4(%esp),%eax movl 12(%esp),%edx movups (%eax),%xmm2 @@ -40,12 +60,12 @@ L_aesni_decrypt_begin: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L001dec1_loop_2: +L002dec1_loop_2: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L001dec1_loop_2 + jnz L002dec1_loop_2 .byte 102,15,56,223,209 pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 @@ -64,7 +84,7 @@ __aesni_encrypt2: leal 32(%edx,%ecx,1),%edx negl %ecx addl $16,%ecx -L002enc2_loop: +L003enc2_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 movups (%edx,%ecx,1),%xmm1 @@ -72,7 +92,7 @@ L002enc2_loop: .byte 102,15,56,220,208 .byte 102,15,56,220,216 movups -16(%edx,%ecx,1),%xmm0 - jnz L002enc2_loop + jnz L003enc2_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,221,208 @@ -90,7 +110,7 @@ __aesni_decrypt2: leal 32(%edx,%ecx,1),%edx negl %ecx addl $16,%ecx -L003dec2_loop: +L004dec2_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 movups (%edx,%ecx,1),%xmm1 @@ -98,7 +118,7 @@ L003dec2_loop: .byte 102,15,56,222,208 .byte 102,15,56,222,216 movups -16(%edx,%ecx,1),%xmm0 - jnz L003dec2_loop + jnz L004dec2_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,223,208 @@ -117,7 +137,7 @@ __aesni_encrypt3: leal 32(%edx,%ecx,1),%edx negl %ecx addl $16,%ecx -L004enc3_loop: +L005enc3_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -127,7 +147,7 @@ L004enc3_loop: .byte 102,15,56,220,216 .byte 102,15,56,220,224 movups -16(%edx,%ecx,1),%xmm0 - jnz L004enc3_loop + jnz L005enc3_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -148,7 +168,7 @@ __aesni_decrypt3: leal 32(%edx,%ecx,1),%edx negl %ecx addl $16,%ecx -L005dec3_loop: +L006dec3_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -158,7 +178,7 @@ L005dec3_loop: .byte 102,15,56,222,216 .byte 102,15,56,222,224 movups -16(%edx,%ecx,1),%xmm0 - jnz L005dec3_loop + jnz L006dec3_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -181,7 +201,7 @@ __aesni_encrypt4: negl %ecx .byte 15,31,64,0 addl $16,%ecx -L006enc4_loop: +L007enc4_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -193,7 +213,7 @@ L006enc4_loop: .byte 102,15,56,220,224 .byte 102,15,56,220,232 movups -16(%edx,%ecx,1),%xmm0 - jnz L006enc4_loop + jnz L007enc4_loop .byte 102,15,56,220,209 
.byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -218,7 +238,7 @@ __aesni_decrypt4: negl %ecx .byte 15,31,64,0 addl $16,%ecx -L007dec4_loop: +L008dec4_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -230,7 +250,7 @@ L007dec4_loop: .byte 102,15,56,222,224 .byte 102,15,56,222,232 movups -16(%edx,%ecx,1),%xmm0 - jnz L007dec4_loop + jnz L008dec4_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -259,13 +279,13 @@ __aesni_encrypt6: pxor %xmm0,%xmm7 movups (%edx,%ecx,1),%xmm0 addl $16,%ecx - jmp L008_aesni_encrypt6_inner + jmp L009_aesni_encrypt6_inner .align 4,0x90 -L009enc6_loop: +L010enc6_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 -L008_aesni_encrypt6_inner: +L009_aesni_encrypt6_inner: .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 @@ -279,7 +299,7 @@ L_aesni_encrypt6_enter: .byte 102,15,56,220,240 .byte 102,15,56,220,248 movups -16(%edx,%ecx,1),%xmm0 - jnz L009enc6_loop + jnz L010enc6_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -312,13 +332,13 @@ __aesni_decrypt6: pxor %xmm0,%xmm7 movups (%edx,%ecx,1),%xmm0 addl $16,%ecx - jmp L010_aesni_decrypt6_inner + jmp L011_aesni_decrypt6_inner .align 4,0x90 -L011dec6_loop: +L012dec6_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 -L010_aesni_decrypt6_inner: +L011_aesni_decrypt6_inner: .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 @@ -332,7 +352,7 @@ L_aesni_decrypt6_enter: .byte 102,15,56,222,240 .byte 102,15,56,222,248 movups -16(%edx,%ecx,1),%xmm0 - jnz L011dec6_loop + jnz L012dec6_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -346,11 +366,11 @@ L_aesni_decrypt6_enter: .byte 102,15,56,223,240 .byte 102,15,56,223,248 ret -.globl _aesni_ecb_encrypt -.private_extern _aesni_ecb_encrypt +.globl _aes_hw_ecb_encrypt +.private_extern _aes_hw_ecb_encrypt .align 4 -_aesni_ecb_encrypt: -L_aesni_ecb_encrypt_begin: +_aes_hw_ecb_encrypt: +L_aes_hw_ecb_encrypt_begin: pushl %ebp pushl %ebx pushl %esi @@ -361,14 +381,14 @@ L_aesni_ecb_encrypt_begin: movl 32(%esp),%edx movl 36(%esp),%ebx andl $-16,%eax - jz L012ecb_ret + jz L013ecb_ret movl 240(%edx),%ecx testl %ebx,%ebx - jz L013ecb_decrypt + jz L014ecb_decrypt movl %edx,%ebp movl %ecx,%ebx cmpl $96,%eax - jb L014ecb_enc_tail + jb L015ecb_enc_tail movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -377,9 +397,9 @@ L_aesni_ecb_encrypt_begin: movdqu 80(%esi),%xmm7 leal 96(%esi),%esi subl $96,%eax - jmp L015ecb_enc_loop6_enter + jmp L016ecb_enc_loop6_enter .align 4,0x90 -L016ecb_enc_loop6: +L017ecb_enc_loop6: movups %xmm2,(%edi) movdqu (%esi),%xmm2 movups %xmm3,16(%edi) @@ -394,12 +414,12 @@ L016ecb_enc_loop6: leal 96(%edi),%edi movdqu 80(%esi),%xmm7 leal 96(%esi),%esi -L015ecb_enc_loop6_enter: +L016ecb_enc_loop6_enter: call __aesni_encrypt6 movl %ebp,%edx movl %ebx,%ecx subl $96,%eax - jnc L016ecb_enc_loop6 + jnc L017ecb_enc_loop6 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) @@ -408,18 +428,18 @@ L015ecb_enc_loop6_enter: movups %xmm7,80(%edi) leal 96(%edi),%edi addl $96,%eax - jz L012ecb_ret -L014ecb_enc_tail: + jz L013ecb_ret +L015ecb_enc_tail: movups (%esi),%xmm2 cmpl $32,%eax - jb L017ecb_enc_one + jb L018ecb_enc_one movups 16(%esi),%xmm3 - je L018ecb_enc_two + je L019ecb_enc_two movups 32(%esi),%xmm4 cmpl $64,%eax - jb L019ecb_enc_three + jb L020ecb_enc_three movups 48(%esi),%xmm5 - je L020ecb_enc_four + je L021ecb_enc_four movups 
64(%esi),%xmm6 xorps %xmm7,%xmm7 call __aesni_encrypt6 @@ -428,49 +448,49 @@ L014ecb_enc_tail: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp L012ecb_ret + jmp L013ecb_ret .align 4,0x90 -L017ecb_enc_one: +L018ecb_enc_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L021enc1_loop_3: +L022enc1_loop_3: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L021enc1_loop_3 + jnz L022enc1_loop_3 .byte 102,15,56,221,209 movups %xmm2,(%edi) - jmp L012ecb_ret + jmp L013ecb_ret .align 4,0x90 -L018ecb_enc_two: +L019ecb_enc_two: call __aesni_encrypt2 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp L012ecb_ret + jmp L013ecb_ret .align 4,0x90 -L019ecb_enc_three: +L020ecb_enc_three: call __aesni_encrypt3 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp L012ecb_ret + jmp L013ecb_ret .align 4,0x90 -L020ecb_enc_four: +L021ecb_enc_four: call __aesni_encrypt4 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) movups %xmm5,48(%edi) - jmp L012ecb_ret + jmp L013ecb_ret .align 4,0x90 -L013ecb_decrypt: +L014ecb_decrypt: movl %edx,%ebp movl %ecx,%ebx cmpl $96,%eax - jb L022ecb_dec_tail + jb L023ecb_dec_tail movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -479,9 +499,9 @@ L013ecb_decrypt: movdqu 80(%esi),%xmm7 leal 96(%esi),%esi subl $96,%eax - jmp L023ecb_dec_loop6_enter + jmp L024ecb_dec_loop6_enter .align 4,0x90 -L024ecb_dec_loop6: +L025ecb_dec_loop6: movups %xmm2,(%edi) movdqu (%esi),%xmm2 movups %xmm3,16(%edi) @@ -496,12 +516,12 @@ L024ecb_dec_loop6: leal 96(%edi),%edi movdqu 80(%esi),%xmm7 leal 96(%esi),%esi -L023ecb_dec_loop6_enter: +L024ecb_dec_loop6_enter: call __aesni_decrypt6 movl %ebp,%edx movl %ebx,%ecx subl $96,%eax - jnc L024ecb_dec_loop6 + jnc L025ecb_dec_loop6 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) @@ -510,18 +530,18 @@ L023ecb_dec_loop6_enter: movups %xmm7,80(%edi) leal 96(%edi),%edi addl $96,%eax - jz L012ecb_ret -L022ecb_dec_tail: + jz L013ecb_ret +L023ecb_dec_tail: movups (%esi),%xmm2 cmpl $32,%eax - jb L025ecb_dec_one + jb L026ecb_dec_one movups 16(%esi),%xmm3 - je L026ecb_dec_two + je L027ecb_dec_two movups 32(%esi),%xmm4 cmpl $64,%eax - jb L027ecb_dec_three + jb L028ecb_dec_three movups 48(%esi),%xmm5 - je L028ecb_dec_four + je L029ecb_dec_four movups 64(%esi),%xmm6 xorps %xmm7,%xmm7 call __aesni_decrypt6 @@ -530,43 +550,43 @@ L022ecb_dec_tail: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp L012ecb_ret + jmp L013ecb_ret .align 4,0x90 -L025ecb_dec_one: +L026ecb_dec_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L029dec1_loop_4: +L030dec1_loop_4: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L029dec1_loop_4 + jnz L030dec1_loop_4 .byte 102,15,56,223,209 movups %xmm2,(%edi) - jmp L012ecb_ret + jmp L013ecb_ret .align 4,0x90 -L026ecb_dec_two: +L027ecb_dec_two: call __aesni_decrypt2 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp L012ecb_ret + jmp L013ecb_ret .align 4,0x90 -L027ecb_dec_three: +L028ecb_dec_three: call __aesni_decrypt3 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp L012ecb_ret + jmp L013ecb_ret .align 4,0x90 -L028ecb_dec_four: +L029ecb_dec_four: call __aesni_decrypt4 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) movups %xmm5,48(%edi) -L012ecb_ret: +L013ecb_ret: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 @@ -580,11 +600,11 @@ L012ecb_ret: popl %ebx popl %ebp ret -.globl 
_aesni_ccm64_encrypt_blocks -.private_extern _aesni_ccm64_encrypt_blocks +.globl _aes_hw_ccm64_encrypt_blocks +.private_extern _aes_hw_ccm64_encrypt_blocks .align 4 -_aesni_ccm64_encrypt_blocks: -L_aesni_ccm64_encrypt_blocks_begin: +_aes_hw_ccm64_encrypt_blocks: +L_aes_hw_ccm64_encrypt_blocks_begin: pushl %ebp pushl %ebx pushl %esi @@ -620,7 +640,7 @@ L_aesni_ccm64_encrypt_blocks_begin: leal 32(%edx,%ecx,1),%edx subl %ecx,%ebx .byte 102,15,56,0,253 -L030ccm64_enc_outer: +L031ccm64_enc_outer: movups (%ebp),%xmm0 movl %ebx,%ecx movups (%esi),%xmm6 @@ -629,7 +649,7 @@ L030ccm64_enc_outer: xorps %xmm6,%xmm0 xorps %xmm0,%xmm3 movups 32(%ebp),%xmm0 -L031ccm64_enc2_loop: +L032ccm64_enc2_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 movups (%edx,%ecx,1),%xmm1 @@ -637,7 +657,7 @@ L031ccm64_enc2_loop: .byte 102,15,56,220,208 .byte 102,15,56,220,216 movups -16(%edx,%ecx,1),%xmm0 - jnz L031ccm64_enc2_loop + jnz L032ccm64_enc2_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 paddq 16(%esp),%xmm7 @@ -650,7 +670,7 @@ L031ccm64_enc2_loop: movups %xmm6,(%edi) .byte 102,15,56,0,213 leal 16(%edi),%edi - jnz L030ccm64_enc_outer + jnz L031ccm64_enc_outer movl 48(%esp),%esp movl 40(%esp),%edi movups %xmm3,(%edi) @@ -667,11 +687,11 @@ L031ccm64_enc2_loop: popl %ebx popl %ebp ret -.globl _aesni_ccm64_decrypt_blocks -.private_extern _aesni_ccm64_decrypt_blocks +.globl _aes_hw_ccm64_decrypt_blocks +.private_extern _aes_hw_ccm64_decrypt_blocks .align 4 -_aesni_ccm64_decrypt_blocks: -L_aesni_ccm64_decrypt_blocks_begin: +_aes_hw_ccm64_decrypt_blocks: +L_aes_hw_ccm64_decrypt_blocks_begin: pushl %ebp pushl %ebx pushl %esi @@ -708,12 +728,12 @@ L_aesni_ccm64_decrypt_blocks_begin: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L032enc1_loop_5: +L033enc1_loop_5: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L032enc1_loop_5 + jnz L033enc1_loop_5 .byte 102,15,56,221,209 shll $4,%ebx movl $16,%ecx @@ -723,16 +743,16 @@ L032enc1_loop_5: subl %ebx,%ecx leal 32(%ebp,%ebx,1),%edx movl %ecx,%ebx - jmp L033ccm64_dec_outer + jmp L034ccm64_dec_outer .align 4,0x90 -L033ccm64_dec_outer: +L034ccm64_dec_outer: xorps %xmm2,%xmm6 movdqa %xmm7,%xmm2 movups %xmm6,(%edi) leal 16(%edi),%edi .byte 102,15,56,0,213 subl $1,%eax - jz L034ccm64_dec_break + jz L035ccm64_dec_break movups (%ebp),%xmm0 movl %ebx,%ecx movups 16(%ebp),%xmm1 @@ -740,7 +760,7 @@ L033ccm64_dec_outer: xorps %xmm0,%xmm2 xorps %xmm6,%xmm3 movups 32(%ebp),%xmm0 -L035ccm64_dec2_loop: +L036ccm64_dec2_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 movups (%edx,%ecx,1),%xmm1 @@ -748,7 +768,7 @@ L035ccm64_dec2_loop: .byte 102,15,56,220,208 .byte 102,15,56,220,216 movups -16(%edx,%ecx,1),%xmm0 - jnz L035ccm64_dec2_loop + jnz L036ccm64_dec2_loop movups (%esi),%xmm6 paddq 16(%esp),%xmm7 .byte 102,15,56,220,209 @@ -756,9 +776,9 @@ L035ccm64_dec2_loop: .byte 102,15,56,221,208 .byte 102,15,56,221,216 leal 16(%esi),%esi - jmp L033ccm64_dec_outer + jmp L034ccm64_dec_outer .align 4,0x90 -L034ccm64_dec_break: +L035ccm64_dec_break: movl 240(%ebp),%ecx movl %ebp,%edx movups (%edx),%xmm0 @@ -766,12 +786,12 @@ L034ccm64_dec_break: xorps %xmm0,%xmm6 leal 32(%edx),%edx xorps %xmm6,%xmm3 -L036enc1_loop_6: +L037enc1_loop_6: .byte 102,15,56,220,217 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L036enc1_loop_6 + jnz L037enc1_loop_6 .byte 102,15,56,221,217 movl 48(%esp),%esp movl 40(%esp),%edi @@ -789,15 +809,27 @@ L036enc1_loop_6: popl %ebx popl %ebp ret -.globl _aesni_ctr32_encrypt_blocks -.private_extern 
_aesni_ctr32_encrypt_blocks +.globl _aes_hw_ctr32_encrypt_blocks +.private_extern _aes_hw_ctr32_encrypt_blocks .align 4 -_aesni_ctr32_encrypt_blocks: -L_aesni_ctr32_encrypt_blocks_begin: +_aes_hw_ctr32_encrypt_blocks: +L_aes_hw_ctr32_encrypt_blocks_begin: pushl %ebp pushl %ebx pushl %esi pushl %edi +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call L038pic +L038pic: + popl %ebx + leal _BORINGSSL_function_hit+0-L038pic(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif movl 20(%esp),%esi movl 24(%esp),%edi movl 28(%esp),%eax @@ -808,7 +840,7 @@ L_aesni_ctr32_encrypt_blocks_begin: andl $-16,%esp movl %ebp,80(%esp) cmpl $1,%eax - je L037ctr32_one_shortcut + je L039ctr32_one_shortcut movdqu (%ebx),%xmm7 movl $202182159,(%esp) movl $134810123,4(%esp) @@ -846,7 +878,7 @@ L_aesni_ctr32_encrypt_blocks_begin: pshufd $192,%xmm0,%xmm2 pshufd $128,%xmm0,%xmm3 cmpl $6,%eax - jb L038ctr32_tail + jb L040ctr32_tail pxor %xmm6,%xmm7 shll $4,%ecx movl $16,%ebx @@ -855,9 +887,9 @@ L_aesni_ctr32_encrypt_blocks_begin: subl %ecx,%ebx leal 32(%edx,%ecx,1),%edx subl $6,%eax - jmp L039ctr32_loop6 + jmp L041ctr32_loop6 .align 4,0x90 -L039ctr32_loop6: +L041ctr32_loop6: pshufd $64,%xmm0,%xmm4 movdqa 32(%esp),%xmm0 pshufd $192,%xmm1,%xmm5 @@ -911,27 +943,27 @@ L039ctr32_loop6: leal 96(%edi),%edi pshufd $128,%xmm0,%xmm3 subl $6,%eax - jnc L039ctr32_loop6 + jnc L041ctr32_loop6 addl $6,%eax - jz L040ctr32_ret + jz L042ctr32_ret movdqu (%ebp),%xmm7 movl %ebp,%edx pxor 32(%esp),%xmm7 movl 240(%ebp),%ecx -L038ctr32_tail: +L040ctr32_tail: por %xmm7,%xmm2 cmpl $2,%eax - jb L041ctr32_one + jb L043ctr32_one pshufd $64,%xmm0,%xmm4 por %xmm7,%xmm3 - je L042ctr32_two + je L044ctr32_two pshufd $192,%xmm1,%xmm5 por %xmm7,%xmm4 cmpl $4,%eax - jb L043ctr32_three + jb L045ctr32_three pshufd $128,%xmm1,%xmm6 por %xmm7,%xmm5 - je L044ctr32_four + je L046ctr32_four por %xmm7,%xmm6 call __aesni_encrypt6 movups (%esi),%xmm1 @@ -949,29 +981,29 @@ L038ctr32_tail: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp L040ctr32_ret + jmp L042ctr32_ret .align 4,0x90 -L037ctr32_one_shortcut: +L039ctr32_one_shortcut: movups (%ebx),%xmm2 movl 240(%edx),%ecx -L041ctr32_one: +L043ctr32_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L045enc1_loop_7: +L047enc1_loop_7: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L045enc1_loop_7 + jnz L047enc1_loop_7 .byte 102,15,56,221,209 movups (%esi),%xmm6 xorps %xmm2,%xmm6 movups %xmm6,(%edi) - jmp L040ctr32_ret + jmp L042ctr32_ret .align 4,0x90 -L042ctr32_two: +L044ctr32_two: call __aesni_encrypt2 movups (%esi),%xmm5 movups 16(%esi),%xmm6 @@ -979,9 +1011,9 @@ L042ctr32_two: xorps %xmm6,%xmm3 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp L040ctr32_ret + jmp L042ctr32_ret .align 4,0x90 -L043ctr32_three: +L045ctr32_three: call __aesni_encrypt3 movups (%esi),%xmm5 movups 16(%esi),%xmm6 @@ -992,9 +1024,9 @@ L043ctr32_three: xorps %xmm7,%xmm4 movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp L040ctr32_ret + jmp L042ctr32_ret .align 4,0x90 -L044ctr32_four: +L046ctr32_four: call __aesni_encrypt4 movups (%esi),%xmm6 movups 16(%esi),%xmm7 @@ -1008,7 +1040,7 @@ L044ctr32_four: xorps %xmm0,%xmm5 movups %xmm4,32(%edi) movups %xmm5,48(%edi) -L040ctr32_ret: +L042ctr32_ret: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 @@ -1026,11 +1058,11 @@ L040ctr32_ret: popl %ebx popl %ebp ret -.globl _aesni_xts_encrypt -.private_extern _aesni_xts_encrypt +.globl _aes_hw_xts_encrypt +.private_extern 
_aes_hw_xts_encrypt .align 4 -_aesni_xts_encrypt: -L_aesni_xts_encrypt_begin: +_aes_hw_xts_encrypt: +L_aes_hw_xts_encrypt_begin: pushl %ebp pushl %ebx pushl %esi @@ -1043,12 +1075,12 @@ L_aesni_xts_encrypt_begin: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L046enc1_loop_8: +L048enc1_loop_8: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L046enc1_loop_8 + jnz L048enc1_loop_8 .byte 102,15,56,221,209 movl 20(%esp),%esi movl 24(%esp),%edi @@ -1072,14 +1104,14 @@ L046enc1_loop_8: movl %edx,%ebp movl %ecx,%ebx subl $96,%eax - jc L047xts_enc_short + jc L049xts_enc_short shll $4,%ecx movl $16,%ebx subl %ecx,%ebx leal 32(%edx,%ecx,1),%edx - jmp L048xts_enc_loop6 + jmp L050xts_enc_loop6 .align 4,0x90 -L048xts_enc_loop6: +L050xts_enc_loop6: pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,(%esp) @@ -1168,23 +1200,23 @@ L048xts_enc_loop6: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 subl $96,%eax - jnc L048xts_enc_loop6 + jnc L050xts_enc_loop6 movl 240(%ebp),%ecx movl %ebp,%edx movl %ecx,%ebx -L047xts_enc_short: +L049xts_enc_short: addl $96,%eax - jz L049xts_enc_done6x + jz L051xts_enc_done6x movdqa %xmm1,%xmm5 cmpl $32,%eax - jb L050xts_enc_one + jb L052xts_enc_one pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 - je L051xts_enc_two + je L053xts_enc_two pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm6 @@ -1193,7 +1225,7 @@ L047xts_enc_short: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 cmpl $64,%eax - jb L052xts_enc_three + jb L054xts_enc_three pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm7 @@ -1203,7 +1235,7 @@ L047xts_enc_short: pxor %xmm2,%xmm1 movdqa %xmm5,(%esp) movdqa %xmm6,16(%esp) - je L053xts_enc_four + je L055xts_enc_four movdqa %xmm7,32(%esp) pshufd $19,%xmm0,%xmm7 movdqa %xmm1,48(%esp) @@ -1235,9 +1267,9 @@ L047xts_enc_short: movups %xmm5,48(%edi) movups %xmm6,64(%edi) leal 80(%edi),%edi - jmp L054xts_enc_done + jmp L056xts_enc_done .align 4,0x90 -L050xts_enc_one: +L052xts_enc_one: movups (%esi),%xmm2 leal 16(%esi),%esi xorps %xmm5,%xmm2 @@ -1245,20 +1277,20 @@ L050xts_enc_one: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L055enc1_loop_9: +L057enc1_loop_9: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L055enc1_loop_9 + jnz L057enc1_loop_9 .byte 102,15,56,221,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) leal 16(%edi),%edi movdqa %xmm5,%xmm1 - jmp L054xts_enc_done + jmp L056xts_enc_done .align 4,0x90 -L051xts_enc_two: +L053xts_enc_two: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1272,9 +1304,9 @@ L051xts_enc_two: movups %xmm3,16(%edi) leal 32(%edi),%edi movdqa %xmm6,%xmm1 - jmp L054xts_enc_done + jmp L056xts_enc_done .align 4,0x90 -L052xts_enc_three: +L054xts_enc_three: movaps %xmm1,%xmm7 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1292,9 +1324,9 @@ L052xts_enc_three: movups %xmm4,32(%edi) leal 48(%edi),%edi movdqa %xmm7,%xmm1 - jmp L054xts_enc_done + jmp L056xts_enc_done .align 4,0x90 -L053xts_enc_four: +L055xts_enc_four: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1316,28 +1348,28 @@ L053xts_enc_four: movups %xmm5,48(%edi) leal 64(%edi),%edi movdqa %xmm6,%xmm1 - jmp L054xts_enc_done + jmp L056xts_enc_done .align 4,0x90 -L049xts_enc_done6x: +L051xts_enc_done6x: movl 112(%esp),%eax andl $15,%eax - jz L056xts_enc_ret + jz L058xts_enc_ret movdqa %xmm1,%xmm5 movl %eax,112(%esp) - jmp L057xts_enc_steal + jmp L059xts_enc_steal .align 4,0x90 -L054xts_enc_done: +L056xts_enc_done: movl 
112(%esp),%eax pxor %xmm0,%xmm0 andl $15,%eax - jz L056xts_enc_ret + jz L058xts_enc_ret pcmpgtd %xmm1,%xmm0 movl %eax,112(%esp) pshufd $19,%xmm0,%xmm5 paddq %xmm1,%xmm1 pand 96(%esp),%xmm5 pxor %xmm1,%xmm5 -L057xts_enc_steal: +L059xts_enc_steal: movzbl (%esi),%ecx movzbl -16(%edi),%edx leal 1(%esi),%esi @@ -1345,7 +1377,7 @@ L057xts_enc_steal: movb %dl,(%edi) leal 1(%edi),%edi subl $1,%eax - jnz L057xts_enc_steal + jnz L059xts_enc_steal subl 112(%esp),%edi movl %ebp,%edx movl %ebx,%ecx @@ -1355,16 +1387,16 @@ L057xts_enc_steal: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L058enc1_loop_10: +L060enc1_loop_10: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L058enc1_loop_10 + jnz L060enc1_loop_10 .byte 102,15,56,221,209 xorps %xmm5,%xmm2 movups %xmm2,-16(%edi) -L056xts_enc_ret: +L058xts_enc_ret: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 @@ -1385,11 +1417,11 @@ L056xts_enc_ret: popl %ebx popl %ebp ret -.globl _aesni_xts_decrypt -.private_extern _aesni_xts_decrypt +.globl _aes_hw_xts_decrypt +.private_extern _aes_hw_xts_decrypt .align 4 -_aesni_xts_decrypt: -L_aesni_xts_decrypt_begin: +_aes_hw_xts_decrypt: +L_aes_hw_xts_decrypt_begin: pushl %ebp pushl %ebx pushl %esi @@ -1402,12 +1434,12 @@ L_aesni_xts_decrypt_begin: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L059enc1_loop_11: +L061enc1_loop_11: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L059enc1_loop_11 + jnz L061enc1_loop_11 .byte 102,15,56,221,209 movl 20(%esp),%esi movl 24(%esp),%edi @@ -1436,14 +1468,14 @@ L059enc1_loop_11: pcmpgtd %xmm1,%xmm0 andl $-16,%eax subl $96,%eax - jc L060xts_dec_short + jc L062xts_dec_short shll $4,%ecx movl $16,%ebx subl %ecx,%ebx leal 32(%edx,%ecx,1),%edx - jmp L061xts_dec_loop6 + jmp L063xts_dec_loop6 .align 4,0x90 -L061xts_dec_loop6: +L063xts_dec_loop6: pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,(%esp) @@ -1532,23 +1564,23 @@ L061xts_dec_loop6: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 subl $96,%eax - jnc L061xts_dec_loop6 + jnc L063xts_dec_loop6 movl 240(%ebp),%ecx movl %ebp,%edx movl %ecx,%ebx -L060xts_dec_short: +L062xts_dec_short: addl $96,%eax - jz L062xts_dec_done6x + jz L064xts_dec_done6x movdqa %xmm1,%xmm5 cmpl $32,%eax - jb L063xts_dec_one + jb L065xts_dec_one pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 - je L064xts_dec_two + je L066xts_dec_two pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm6 @@ -1557,7 +1589,7 @@ L060xts_dec_short: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 cmpl $64,%eax - jb L065xts_dec_three + jb L067xts_dec_three pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm7 @@ -1567,7 +1599,7 @@ L060xts_dec_short: pxor %xmm2,%xmm1 movdqa %xmm5,(%esp) movdqa %xmm6,16(%esp) - je L066xts_dec_four + je L068xts_dec_four movdqa %xmm7,32(%esp) pshufd $19,%xmm0,%xmm7 movdqa %xmm1,48(%esp) @@ -1599,9 +1631,9 @@ L060xts_dec_short: movups %xmm5,48(%edi) movups %xmm6,64(%edi) leal 80(%edi),%edi - jmp L067xts_dec_done + jmp L069xts_dec_done .align 4,0x90 -L063xts_dec_one: +L065xts_dec_one: movups (%esi),%xmm2 leal 16(%esi),%esi xorps %xmm5,%xmm2 @@ -1609,20 +1641,20 @@ L063xts_dec_one: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L068dec1_loop_12: +L070dec1_loop_12: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L068dec1_loop_12 + jnz L070dec1_loop_12 .byte 102,15,56,223,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) leal 16(%edi),%edi movdqa %xmm5,%xmm1 - jmp 
L067xts_dec_done + jmp L069xts_dec_done .align 4,0x90 -L064xts_dec_two: +L066xts_dec_two: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1636,9 +1668,9 @@ L064xts_dec_two: movups %xmm3,16(%edi) leal 32(%edi),%edi movdqa %xmm6,%xmm1 - jmp L067xts_dec_done + jmp L069xts_dec_done .align 4,0x90 -L065xts_dec_three: +L067xts_dec_three: movaps %xmm1,%xmm7 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1656,9 +1688,9 @@ L065xts_dec_three: movups %xmm4,32(%edi) leal 48(%edi),%edi movdqa %xmm7,%xmm1 - jmp L067xts_dec_done + jmp L069xts_dec_done .align 4,0x90 -L066xts_dec_four: +L068xts_dec_four: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1680,20 +1712,20 @@ L066xts_dec_four: movups %xmm5,48(%edi) leal 64(%edi),%edi movdqa %xmm6,%xmm1 - jmp L067xts_dec_done + jmp L069xts_dec_done .align 4,0x90 -L062xts_dec_done6x: +L064xts_dec_done6x: movl 112(%esp),%eax andl $15,%eax - jz L069xts_dec_ret + jz L071xts_dec_ret movl %eax,112(%esp) - jmp L070xts_dec_only_one_more + jmp L072xts_dec_only_one_more .align 4,0x90 -L067xts_dec_done: +L069xts_dec_done: movl 112(%esp),%eax pxor %xmm0,%xmm0 andl $15,%eax - jz L069xts_dec_ret + jz L071xts_dec_ret pcmpgtd %xmm1,%xmm0 movl %eax,112(%esp) pshufd $19,%xmm0,%xmm2 @@ -1703,7 +1735,7 @@ L067xts_dec_done: pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 -L070xts_dec_only_one_more: +L072xts_dec_only_one_more: pshufd $19,%xmm0,%xmm5 movdqa %xmm1,%xmm6 paddq %xmm1,%xmm1 @@ -1717,16 +1749,16 @@ L070xts_dec_only_one_more: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L071dec1_loop_13: +L073dec1_loop_13: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L071dec1_loop_13 + jnz L073dec1_loop_13 .byte 102,15,56,223,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) -L072xts_dec_steal: +L074xts_dec_steal: movzbl 16(%esi),%ecx movzbl (%edi),%edx leal 1(%esi),%esi @@ -1734,7 +1766,7 @@ L072xts_dec_steal: movb %dl,16(%edi) leal 1(%edi),%edi subl $1,%eax - jnz L072xts_dec_steal + jnz L074xts_dec_steal subl 112(%esp),%edi movl %ebp,%edx movl %ebx,%ecx @@ -1744,16 +1776,16 @@ L072xts_dec_steal: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L073dec1_loop_14: +L075dec1_loop_14: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L073dec1_loop_14 + jnz L075dec1_loop_14 .byte 102,15,56,223,209 xorps %xmm6,%xmm2 movups %xmm2,(%edi) -L069xts_dec_ret: +L071xts_dec_ret: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 @@ -1774,11 +1806,11 @@ L069xts_dec_ret: popl %ebx popl %ebp ret -.globl _aesni_cbc_encrypt -.private_extern _aesni_cbc_encrypt +.globl _aes_hw_cbc_encrypt +.private_extern _aes_hw_cbc_encrypt .align 4 -_aesni_cbc_encrypt: -L_aesni_cbc_encrypt_begin: +_aes_hw_cbc_encrypt: +L_aes_hw_cbc_encrypt_begin: pushl %ebp pushl %ebx pushl %esi @@ -1792,7 +1824,7 @@ L_aesni_cbc_encrypt_begin: movl 32(%esp),%edx movl 36(%esp),%ebp testl %eax,%eax - jz L074cbc_abort + jz L076cbc_abort cmpl $0,40(%esp) xchgl %esp,%ebx movups (%ebp),%xmm7 @@ -1800,14 +1832,14 @@ L_aesni_cbc_encrypt_begin: movl %edx,%ebp movl %ebx,16(%esp) movl %ecx,%ebx - je L075cbc_decrypt + je L077cbc_decrypt movaps %xmm7,%xmm2 cmpl $16,%eax - jb L076cbc_enc_tail + jb L078cbc_enc_tail subl $16,%eax - jmp L077cbc_enc_loop + jmp L079cbc_enc_loop .align 4,0x90 -L077cbc_enc_loop: +L079cbc_enc_loop: movups (%esi),%xmm7 leal 16(%esi),%esi movups (%edx),%xmm0 @@ -1815,25 +1847,25 @@ L077cbc_enc_loop: xorps %xmm0,%xmm7 leal 32(%edx),%edx xorps %xmm7,%xmm2 -L078enc1_loop_15: +L080enc1_loop_15: .byte 102,15,56,220,209 
decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L078enc1_loop_15 + jnz L080enc1_loop_15 .byte 102,15,56,221,209 movl %ebx,%ecx movl %ebp,%edx movups %xmm2,(%edi) leal 16(%edi),%edi subl $16,%eax - jnc L077cbc_enc_loop + jnc L079cbc_enc_loop addl $16,%eax - jnz L076cbc_enc_tail + jnz L078cbc_enc_tail movaps %xmm2,%xmm7 pxor %xmm2,%xmm2 - jmp L079cbc_ret -L076cbc_enc_tail: + jmp L081cbc_ret +L078cbc_enc_tail: movl %eax,%ecx .long 2767451785 movl $16,%ecx @@ -1844,20 +1876,20 @@ L076cbc_enc_tail: movl %ebx,%ecx movl %edi,%esi movl %ebp,%edx - jmp L077cbc_enc_loop + jmp L079cbc_enc_loop .align 4,0x90 -L075cbc_decrypt: +L077cbc_decrypt: cmpl $80,%eax - jbe L080cbc_dec_tail + jbe L082cbc_dec_tail movaps %xmm7,(%esp) subl $80,%eax - jmp L081cbc_dec_loop6_enter + jmp L083cbc_dec_loop6_enter .align 4,0x90 -L082cbc_dec_loop6: +L084cbc_dec_loop6: movaps %xmm0,(%esp) movups %xmm7,(%edi) leal 16(%edi),%edi -L081cbc_dec_loop6_enter: +L083cbc_dec_loop6_enter: movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -1887,28 +1919,28 @@ L081cbc_dec_loop6_enter: movups %xmm6,64(%edi) leal 80(%edi),%edi subl $96,%eax - ja L082cbc_dec_loop6 + ja L084cbc_dec_loop6 movaps %xmm7,%xmm2 movaps %xmm0,%xmm7 addl $80,%eax - jle L083cbc_dec_clear_tail_collected + jle L085cbc_dec_clear_tail_collected movups %xmm2,(%edi) leal 16(%edi),%edi -L080cbc_dec_tail: +L082cbc_dec_tail: movups (%esi),%xmm2 movaps %xmm2,%xmm6 cmpl $16,%eax - jbe L084cbc_dec_one + jbe L086cbc_dec_one movups 16(%esi),%xmm3 movaps %xmm3,%xmm5 cmpl $32,%eax - jbe L085cbc_dec_two + jbe L087cbc_dec_two movups 32(%esi),%xmm4 cmpl $48,%eax - jbe L086cbc_dec_three + jbe L088cbc_dec_three movups 48(%esi),%xmm5 cmpl $64,%eax - jbe L087cbc_dec_four + jbe L089cbc_dec_four movups 64(%esi),%xmm6 movaps %xmm7,(%esp) movups (%esi),%xmm2 @@ -1935,26 +1967,26 @@ L080cbc_dec_tail: movaps %xmm6,%xmm2 pxor %xmm6,%xmm6 subl $80,%eax - jmp L088cbc_dec_tail_collected + jmp L090cbc_dec_tail_collected .align 4,0x90 -L084cbc_dec_one: +L086cbc_dec_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L089dec1_loop_16: +L091dec1_loop_16: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L089dec1_loop_16 + jnz L091dec1_loop_16 .byte 102,15,56,223,209 xorps %xmm7,%xmm2 movaps %xmm6,%xmm7 subl $16,%eax - jmp L088cbc_dec_tail_collected + jmp L090cbc_dec_tail_collected .align 4,0x90 -L085cbc_dec_two: +L087cbc_dec_two: call __aesni_decrypt2 xorps %xmm7,%xmm2 xorps %xmm6,%xmm3 @@ -1964,9 +1996,9 @@ L085cbc_dec_two: leal 16(%edi),%edi movaps %xmm5,%xmm7 subl $32,%eax - jmp L088cbc_dec_tail_collected + jmp L090cbc_dec_tail_collected .align 4,0x90 -L086cbc_dec_three: +L088cbc_dec_three: call __aesni_decrypt3 xorps %xmm7,%xmm2 xorps %xmm6,%xmm3 @@ -1979,9 +2011,9 @@ L086cbc_dec_three: leal 32(%edi),%edi movups 32(%esi),%xmm7 subl $48,%eax - jmp L088cbc_dec_tail_collected + jmp L090cbc_dec_tail_collected .align 4,0x90 -L087cbc_dec_four: +L089cbc_dec_four: call __aesni_decrypt4 movups 16(%esi),%xmm1 movups 32(%esi),%xmm0 @@ -1999,21 +2031,21 @@ L087cbc_dec_four: movaps %xmm5,%xmm2 pxor %xmm5,%xmm5 subl $64,%eax - jmp L088cbc_dec_tail_collected + jmp L090cbc_dec_tail_collected .align 4,0x90 -L083cbc_dec_clear_tail_collected: +L085cbc_dec_clear_tail_collected: pxor %xmm3,%xmm3 pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 pxor %xmm6,%xmm6 -L088cbc_dec_tail_collected: +L090cbc_dec_tail_collected: andl $15,%eax - jnz L090cbc_dec_tail_partial + jnz L092cbc_dec_tail_partial movups %xmm2,(%edi) pxor %xmm0,%xmm0 - jmp L079cbc_ret + 
jmp L081cbc_ret .align 4,0x90 -L090cbc_dec_tail_partial: +L092cbc_dec_tail_partial: movaps %xmm2,(%esp) pxor %xmm0,%xmm0 movl $16,%ecx @@ -2021,14 +2053,14 @@ L090cbc_dec_tail_partial: subl %eax,%ecx .long 2767451785 movdqa %xmm2,(%esp) -L079cbc_ret: +L081cbc_ret: movl 16(%esp),%esp movl 36(%esp),%ebp pxor %xmm2,%xmm2 pxor %xmm1,%xmm1 movups %xmm7,(%ebp) pxor %xmm7,%xmm7 -L074cbc_abort: +L076cbc_abort: popl %edi popl %esi popl %ebx @@ -2040,13 +2072,13 @@ __aesni_set_encrypt_key: pushl %ebp pushl %ebx testl %eax,%eax - jz L091bad_pointer + jz L093bad_pointer testl %edx,%edx - jz L091bad_pointer - call L092pic -L092pic: + jz L093bad_pointer + call L094pic +L094pic: popl %ebx - leal Lkey_const-L092pic(%ebx),%ebx + leal Lkey_const-L094pic(%ebx),%ebx movl L_OPENSSL_ia32cap_P$non_lazy_ptr-Lkey_const(%ebx),%ebp movups (%eax),%xmm0 xorps %xmm4,%xmm4 @@ -2054,45 +2086,45 @@ L092pic: leal 16(%edx),%edx andl $268437504,%ebp cmpl $256,%ecx - je L09314rounds + je L09514rounds cmpl $192,%ecx - je L09412rounds + je L09612rounds cmpl $128,%ecx - jne L095bad_keybits + jne L097bad_keybits .align 4,0x90 -L09610rounds: +L09810rounds: cmpl $268435456,%ebp - je L09710rounds_alt + je L09910rounds_alt movl $9,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,200,1 - call L098key_128_cold + call L100key_128_cold .byte 102,15,58,223,200,2 - call L099key_128 + call L101key_128 .byte 102,15,58,223,200,4 - call L099key_128 + call L101key_128 .byte 102,15,58,223,200,8 - call L099key_128 + call L101key_128 .byte 102,15,58,223,200,16 - call L099key_128 + call L101key_128 .byte 102,15,58,223,200,32 - call L099key_128 + call L101key_128 .byte 102,15,58,223,200,64 - call L099key_128 + call L101key_128 .byte 102,15,58,223,200,128 - call L099key_128 + call L101key_128 .byte 102,15,58,223,200,27 - call L099key_128 + call L101key_128 .byte 102,15,58,223,200,54 - call L099key_128 + call L101key_128 movups %xmm0,(%edx) movl %ecx,80(%edx) - jmp L100good_key + jmp L102good_key .align 4,0x90 -L099key_128: +L101key_128: movups %xmm0,(%edx) leal 16(%edx),%edx -L098key_128_cold: +L100key_128_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -2101,13 +2133,13 @@ L098key_128_cold: xorps %xmm1,%xmm0 ret .align 4,0x90 -L09710rounds_alt: +L09910rounds_alt: movdqa (%ebx),%xmm5 movl $8,%ecx movdqa 32(%ebx),%xmm4 movdqa %xmm0,%xmm2 movdqu %xmm0,-16(%edx) -L101loop_key128: +L103loop_key128: .byte 102,15,56,0,197 .byte 102,15,56,221,196 pslld $1,%xmm4 @@ -2123,7 +2155,7 @@ L101loop_key128: movdqu %xmm0,-16(%edx) movdqa %xmm0,%xmm2 decl %ecx - jnz L101loop_key128 + jnz L103loop_key128 movdqa 48(%ebx),%xmm4 .byte 102,15,56,0,197 .byte 102,15,56,221,196 @@ -2151,41 +2183,41 @@ L101loop_key128: movdqu %xmm0,16(%edx) movl $9,%ecx movl %ecx,96(%edx) - jmp L100good_key + jmp L102good_key .align 4,0x90 -L09412rounds: +L09612rounds: movq 16(%eax),%xmm2 cmpl $268435456,%ebp - je L10212rounds_alt + je L10412rounds_alt movl $11,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,202,1 - call L103key_192a_cold + call L105key_192a_cold .byte 102,15,58,223,202,2 - call L104key_192b + call L106key_192b .byte 102,15,58,223,202,4 - call L105key_192a + call L107key_192a .byte 102,15,58,223,202,8 - call L104key_192b + call L106key_192b .byte 102,15,58,223,202,16 - call L105key_192a + call L107key_192a .byte 102,15,58,223,202,32 - call L104key_192b + call L106key_192b .byte 102,15,58,223,202,64 - call L105key_192a + call L107key_192a .byte 102,15,58,223,202,128 - call L104key_192b + call L106key_192b movups %xmm0,(%edx) movl %ecx,48(%edx) - jmp 
L100good_key + jmp L102good_key .align 4,0x90 -L105key_192a: +L107key_192a: movups %xmm0,(%edx) leal 16(%edx),%edx .align 4,0x90 -L103key_192a_cold: +L105key_192a_cold: movaps %xmm2,%xmm5 -L106key_192b_warm: +L108key_192b_warm: shufps $16,%xmm0,%xmm4 movdqa %xmm2,%xmm3 xorps %xmm4,%xmm0 @@ -2199,21 +2231,21 @@ L106key_192b_warm: pxor %xmm3,%xmm2 ret .align 4,0x90 -L104key_192b: +L106key_192b: movaps %xmm0,%xmm3 shufps $68,%xmm0,%xmm5 movups %xmm5,(%edx) shufps $78,%xmm2,%xmm3 movups %xmm3,16(%edx) leal 32(%edx),%edx - jmp L106key_192b_warm + jmp L108key_192b_warm .align 4,0x90 -L10212rounds_alt: +L10412rounds_alt: movdqa 16(%ebx),%xmm5 movdqa 32(%ebx),%xmm4 movl $8,%ecx movdqu %xmm0,-16(%edx) -L107loop_key192: +L109loop_key192: movq %xmm2,(%edx) movdqa %xmm2,%xmm1 .byte 102,15,56,0,213 @@ -2235,54 +2267,54 @@ L107loop_key192: pxor %xmm3,%xmm2 movdqu %xmm0,-16(%edx) decl %ecx - jnz L107loop_key192 + jnz L109loop_key192 movl $11,%ecx movl %ecx,32(%edx) - jmp L100good_key + jmp L102good_key .align 4,0x90 -L09314rounds: +L09514rounds: movups 16(%eax),%xmm2 leal 16(%edx),%edx cmpl $268435456,%ebp - je L10814rounds_alt + je L11014rounds_alt movl $13,%ecx movups %xmm0,-32(%edx) movups %xmm2,-16(%edx) .byte 102,15,58,223,202,1 - call L109key_256a_cold + call L111key_256a_cold .byte 102,15,58,223,200,1 - call L110key_256b + call L112key_256b .byte 102,15,58,223,202,2 - call L111key_256a + call L113key_256a .byte 102,15,58,223,200,2 - call L110key_256b + call L112key_256b .byte 102,15,58,223,202,4 - call L111key_256a + call L113key_256a .byte 102,15,58,223,200,4 - call L110key_256b + call L112key_256b .byte 102,15,58,223,202,8 - call L111key_256a + call L113key_256a .byte 102,15,58,223,200,8 - call L110key_256b + call L112key_256b .byte 102,15,58,223,202,16 - call L111key_256a + call L113key_256a .byte 102,15,58,223,200,16 - call L110key_256b + call L112key_256b .byte 102,15,58,223,202,32 - call L111key_256a + call L113key_256a .byte 102,15,58,223,200,32 - call L110key_256b + call L112key_256b .byte 102,15,58,223,202,64 - call L111key_256a + call L113key_256a movups %xmm0,(%edx) movl %ecx,16(%edx) xorl %eax,%eax - jmp L100good_key + jmp L102good_key .align 4,0x90 -L111key_256a: +L113key_256a: movups %xmm2,(%edx) leal 16(%edx),%edx -L109key_256a_cold: +L111key_256a_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -2291,7 +2323,7 @@ L109key_256a_cold: xorps %xmm1,%xmm0 ret .align 4,0x90 -L110key_256b: +L112key_256b: movups %xmm0,(%edx) leal 16(%edx),%edx shufps $16,%xmm2,%xmm4 @@ -2302,14 +2334,14 @@ L110key_256b: xorps %xmm1,%xmm2 ret .align 4,0x90 -L10814rounds_alt: +L11014rounds_alt: movdqa (%ebx),%xmm5 movdqa 32(%ebx),%xmm4 movl $7,%ecx movdqu %xmm0,-32(%edx) movdqa %xmm2,%xmm1 movdqu %xmm2,-16(%edx) -L112loop_key256: +L114loop_key256: .byte 102,15,56,0,213 .byte 102,15,56,221,212 movdqa %xmm0,%xmm3 @@ -2323,7 +2355,7 @@ L112loop_key256: pxor %xmm2,%xmm0 movdqu %xmm0,(%edx) decl %ecx - jz L113done_key256 + jz L115done_key256 pshufd $255,%xmm0,%xmm2 pxor %xmm3,%xmm3 .byte 102,15,56,221,211 @@ -2338,11 +2370,11 @@ L112loop_key256: movdqu %xmm2,16(%edx) leal 32(%edx),%edx movdqa %xmm2,%xmm1 - jmp L112loop_key256 -L113done_key256: + jmp L114loop_key256 +L115done_key256: movl $13,%ecx movl %ecx,16(%edx) -L100good_key: +L102good_key: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 @@ -2354,33 +2386,45 @@ L100good_key: popl %ebp ret .align 2,0x90 -L091bad_pointer: +L093bad_pointer: movl $-1,%eax popl %ebx popl %ebp ret .align 2,0x90 -L095bad_keybits: +L097bad_keybits: pxor 
%xmm0,%xmm0 movl $-2,%eax popl %ebx popl %ebp ret -.globl _aesni_set_encrypt_key -.private_extern _aesni_set_encrypt_key +.globl _aes_hw_set_encrypt_key +.private_extern _aes_hw_set_encrypt_key .align 4 -_aesni_set_encrypt_key: -L_aesni_set_encrypt_key_begin: +_aes_hw_set_encrypt_key: +L_aes_hw_set_encrypt_key_begin: +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call L116pic +L116pic: + popl %ebx + leal _BORINGSSL_function_hit+3-L116pic(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif movl 4(%esp),%eax movl 8(%esp),%ecx movl 12(%esp),%edx call __aesni_set_encrypt_key ret -.globl _aesni_set_decrypt_key -.private_extern _aesni_set_decrypt_key +.globl _aes_hw_set_decrypt_key +.private_extern _aes_hw_set_decrypt_key .align 4 -_aesni_set_decrypt_key: -L_aesni_set_decrypt_key_begin: +_aes_hw_set_decrypt_key: +L_aes_hw_set_decrypt_key_begin: movl 4(%esp),%eax movl 8(%esp),%ecx movl 12(%esp),%edx @@ -2388,7 +2432,7 @@ L_aesni_set_decrypt_key_begin: movl 12(%esp),%edx shll $4,%ecx testl %eax,%eax - jnz L114dec_key_ret + jnz L117dec_key_ret leal 16(%edx,%ecx,1),%eax movups (%edx),%xmm0 movups (%eax),%xmm1 @@ -2396,7 +2440,7 @@ L_aesni_set_decrypt_key_begin: movups %xmm1,(%edx) leal 16(%edx),%edx leal -16(%eax),%eax -L115dec_key_inverse: +L118dec_key_inverse: movups (%edx),%xmm0 movups (%eax),%xmm1 .byte 102,15,56,219,192 @@ -2406,14 +2450,14 @@ L115dec_key_inverse: movups %xmm0,16(%eax) movups %xmm1,-16(%edx) cmpl %edx,%eax - ja L115dec_key_inverse + ja L118dec_key_inverse movups (%edx),%xmm0 .byte 102,15,56,219,192 movups %xmm0,(%edx) pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 xorl %eax,%eax -L114dec_key_ret: +L117dec_key_ret: ret .align 6,0x90 Lkey_const: diff --git a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/bn-586.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/bn-586.S index d1be040546..7d0462b51f 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/bn-586.S +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/bn-586.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl _bn_mul_add_words .private_extern _bn_mul_add_words diff --git a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/co-586.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/co-586.S index 858ba3743e..578ca70b0c 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/co-586.S +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/co-586.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl _bn_mul_comba8 .private_extern _bn_mul_comba8 diff --git a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/ghash-ssse3-x86.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/ghash-ssse3-x86.S new file mode 100644 index 0000000000..f059e2839a --- /dev/null +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/ghash-ssse3-x86.S @@ -0,0 +1,289 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text +.globl _gcm_gmult_ssse3 +.private_extern _gcm_gmult_ssse3 +.align 4 +_gcm_gmult_ssse3: +L_gcm_gmult_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%edi + movl 24(%esp),%esi + movdqu (%edi),%xmm0 + call L000pic_point +L000pic_point: + popl %eax + movdqa Lreverse_bytes-L000pic_point(%eax),%xmm7 + movdqa Llow4_mask-L000pic_point(%eax),%xmm2 +.byte 102,15,56,0,199 + movdqa %xmm2,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm0 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + movl $5,%eax +L001loop_row_1: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz L001loop_row_1 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movl $5,%eax +L002loop_row_2: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz L002loop_row_2 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movl $6,%eax +L003loop_row_3: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz L003loop_row_3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 +.byte 102,15,56,0,215 + movdqu %xmm2,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _gcm_ghash_ssse3 +.private_extern _gcm_ghash_ssse3 +.align 4 +_gcm_ghash_ssse3: +L_gcm_ghash_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%edx + movl 32(%esp),%ecx + movdqu (%edi),%xmm0 + call L004pic_point +L004pic_point: + popl %ebx + movdqa Lreverse_bytes-L004pic_point(%ebx),%xmm7 + andl $-16,%ecx +.byte 102,15,56,0,199 + pxor %xmm3,%xmm3 +L005loop_ghash: + movdqa Llow4_mask-L004pic_point(%ebx),%xmm2 + movdqu (%edx),%xmm1 +.byte 102,15,56,0,207 + pxor %xmm1,%xmm0 + movdqa %xmm2,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm0 + pxor %xmm2,%xmm2 + movl $5,%eax +L006loop_row_4: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq 
$60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz L006loop_row_4 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movl $5,%eax +L007loop_row_5: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz L007loop_row_5 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movl $6,%eax +L008loop_row_6: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz L008loop_row_6 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movdqa %xmm2,%xmm0 + leal -256(%esi),%esi + leal 16(%edx),%edx + subl $16,%ecx + jnz L005loop_ghash +.byte 102,15,56,0,199 + movdqu %xmm0,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 4,0x90 +Lreverse_bytes: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.align 4,0x90 +Llow4_mask: +.long 252645135,252645135,252645135,252645135 +#endif diff --git a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/ghash-x86.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/ghash-x86.S index 320cd42b1a..e13bf3e858 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/ghash-x86.S +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/ghash-x86.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl _gcm_gmult_4bit_mmx .private_extern _gcm_gmult_4bit_mmx diff --git a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/md5-586.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/md5-586.S index 795e42e5c8..391acbd123 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/md5-586.S +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/md5-586.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl _md5_block_asm_data_order .private_extern _md5_block_asm_data_order diff --git a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/sha1-586.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/sha1-586.S index efb6f52e32..89c5d168e5 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/sha1-586.S +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/sha1-586.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl _sha1_block_data_order .private_extern _sha1_block_data_order diff --git a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/sha256-586.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/sha256-586.S index 7f15397e15..a974488943 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/sha256-586.S +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/sha256-586.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl _sha256_block_data_order .private_extern _sha256_block_data_order diff --git a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/sha512-586.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/sha512-586.S index f65cb1086a..a08e6ef5d7 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/sha512-586.S +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/sha512-586.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl _sha512_block_data_order .private_extern _sha512_block_data_order diff --git a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/vpaes-x86.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/vpaes-x86.S index f49e9f0a81..6b5a88b304 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/vpaes-x86.S +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/vpaes-x86.S @@ -1,5 +1,13 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text +#ifdef BORINGSSL_DISPATCH_TEST +#endif .align 6,0x90 L_vpaes_consts: .long 218628480,235210255,168496130,67568393 @@ -460,6 +468,18 @@ L_vpaes_set_encrypt_key_begin: pushl %ebx pushl %esi pushl %edi +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call L016pic +L016pic: + popl %ebx + leal _BORINGSSL_function_hit+5-L016pic(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif movl 20(%esp),%esi leal -56(%esp),%ebx movl 24(%esp),%eax @@ -473,9 +493,9 @@ L_vpaes_set_encrypt_key_begin: movl %ebx,240(%edx) movl $48,%ecx movl $0,%edi - leal L_vpaes_consts+0x30-L016pic_point,%ebp + leal L_vpaes_consts+0x30-L017pic_point,%ebp call __vpaes_schedule_core -L016pic_point: +L017pic_point: movl 48(%esp),%esp xorl %eax,%eax popl %edi @@ -510,9 +530,9 @@ L_vpaes_set_decrypt_key_begin: shrl $1,%ecx andl $32,%ecx xorl $32,%ecx - leal L_vpaes_consts+0x30-L017pic_point,%ebp + leal L_vpaes_consts+0x30-L018pic_point,%ebp call __vpaes_schedule_core -L017pic_point: +L018pic_point: movl 48(%esp),%esp xorl %eax,%eax popl %edi @@ -529,9 +549,21 @@ L_vpaes_encrypt_begin: pushl %ebx pushl %esi pushl %edi - leal L_vpaes_consts+0x30-L018pic_point,%ebp +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call L019pic +L019pic: + popl %ebx + leal _BORINGSSL_function_hit+4-L019pic(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + leal L_vpaes_consts+0x30-L020pic_point,%ebp call __vpaes_preheat -L018pic_point: +L020pic_point: movl 20(%esp),%esi leal -56(%esp),%ebx movl 24(%esp),%edi @@ -557,9 +589,9 @@ L_vpaes_decrypt_begin: pushl %ebx pushl %esi pushl %edi - leal L_vpaes_consts+0x30-L019pic_point,%ebp + leal L_vpaes_consts+0x30-L021pic_point,%ebp call __vpaes_preheat -L019pic_point: +L021pic_point: movl 20(%esp),%esi leal -56(%esp),%ebx movl 24(%esp),%edi @@ -590,7 +622,7 @@ L_vpaes_cbc_encrypt_begin: movl 28(%esp),%eax movl 32(%esp),%edx subl $16,%eax - jc L020cbc_abort + jc L022cbc_abort leal -56(%esp),%ebx movl 36(%esp),%ebp andl $-16,%ebx @@ -603,14 +635,14 @@ L_vpaes_cbc_encrypt_begin: movl %edx,4(%esp) movl %ebp,8(%esp) movl %eax,%edi - leal L_vpaes_consts+0x30-L021pic_point,%ebp + leal L_vpaes_consts+0x30-L023pic_point,%ebp call __vpaes_preheat -L021pic_point: +L023pic_point: cmpl $0,%ecx - je L022cbc_dec_loop - jmp L023cbc_enc_loop + je L024cbc_dec_loop + jmp L025cbc_enc_loop .align 4,0x90 -L023cbc_enc_loop: +L025cbc_enc_loop: movdqu (%esi),%xmm0 pxor %xmm1,%xmm0 call __vpaes_encrypt_core @@ -620,10 +652,10 @@ L023cbc_enc_loop: movdqu %xmm0,(%ebx,%esi,1) leal 16(%esi),%esi subl $16,%edi - jnc L023cbc_enc_loop - jmp L024cbc_done + jnc L025cbc_enc_loop + jmp L026cbc_done .align 4,0x90 -L022cbc_dec_loop: +L024cbc_dec_loop: movdqu (%esi),%xmm0 movdqa %xmm1,16(%esp) movdqa %xmm0,32(%esp) @@ -635,12 +667,12 @@ L022cbc_dec_loop: movdqu %xmm0,(%ebx,%esi,1) leal 16(%esi),%esi subl $16,%edi - jnc L022cbc_dec_loop -L024cbc_done: + jnc L024cbc_dec_loop +L026cbc_done: movl 8(%esp),%ebx movl 48(%esp),%esp movdqu %xmm1,(%ebx) -L020cbc_abort: +L022cbc_abort: popl %edi popl %esi popl %ebx diff --git a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/x86-mont.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/x86-mont.S index e7353ae252..3ef8774ed5 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/x86-mont.S +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/x86-mont.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl 
script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl _bn_mul_mont .private_extern _bn_mul_mont @@ -445,16 +451,18 @@ L017sub: leal 1(%edx),%edx jge L017sub sbbl $0,%eax - andl %eax,%esi - notl %eax - movl %edi,%ebp - andl %eax,%ebp - orl %ebp,%esi + movl $-1,%edx + xorl %eax,%edx + jmp L018copy .align 4,0x90 L018copy: - movl (%esi,%ebx,4),%eax - movl %eax,(%edi,%ebx,4) + movl 32(%esp,%ebx,4),%esi + movl (%edi,%ebx,4),%ebp movl %ecx,32(%esp,%ebx,4) + andl %eax,%esi + andl %edx,%ebp + orl %esi,%ebp + movl %ebp,(%edi,%ebx,4) decl %ebx jge L018copy movl 24(%esp),%esp diff --git a/packager/third_party/boringssl/mac-x86/crypto/test/trampoline-x86.S b/packager/third_party/boringssl/mac-x86/crypto/test/trampoline-x86.S new file mode 100644 index 0000000000..601f2f0151 --- /dev/null +++ b/packager/third_party/boringssl/mac-x86/crypto/test/trampoline-x86.S @@ -0,0 +1,169 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text +.globl _abi_test_trampoline +.private_extern _abi_test_trampoline +.align 4 +_abi_test_trampoline: +L_abi_test_trampoline_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 24(%esp),%ecx + movl (%ecx),%esi + movl 4(%ecx),%edi + movl 8(%ecx),%ebx + movl 12(%ecx),%ebp + subl $44,%esp + movl 72(%esp),%eax + xorl %ecx,%ecx +L000loop: + cmpl 76(%esp),%ecx + jae L001loop_done + movl (%eax,%ecx,4),%edx + movl %edx,(%esp,%ecx,4) + addl $1,%ecx + jmp L000loop +L001loop_done: + call *64(%esp) + addl $44,%esp + movl 24(%esp),%ecx + movl %esi,(%ecx) + movl %edi,4(%ecx) + movl %ebx,8(%ecx) + movl %ebp,12(%ecx) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _abi_test_get_and_clear_direction_flag +.private_extern _abi_test_get_and_clear_direction_flag +.align 4 +_abi_test_get_and_clear_direction_flag: +L_abi_test_get_and_clear_direction_flag_begin: + pushfl + popl %eax + andl $1024,%eax + shrl $10,%eax + cld + ret +.globl _abi_test_set_direction_flag +.private_extern _abi_test_set_direction_flag +.align 4 +_abi_test_set_direction_flag: +L_abi_test_set_direction_flag_begin: + std + ret +.globl _abi_test_clobber_eax +.private_extern _abi_test_clobber_eax +.align 4 +_abi_test_clobber_eax: +L_abi_test_clobber_eax_begin: + xorl %eax,%eax + ret +.globl _abi_test_clobber_ebx +.private_extern _abi_test_clobber_ebx +.align 4 +_abi_test_clobber_ebx: +L_abi_test_clobber_ebx_begin: + xorl %ebx,%ebx + ret +.globl _abi_test_clobber_ecx +.private_extern _abi_test_clobber_ecx +.align 4 +_abi_test_clobber_ecx: +L_abi_test_clobber_ecx_begin: + xorl %ecx,%ecx + ret +.globl _abi_test_clobber_edx +.private_extern _abi_test_clobber_edx +.align 4 +_abi_test_clobber_edx: +L_abi_test_clobber_edx_begin: + xorl %edx,%edx + ret +.globl _abi_test_clobber_edi +.private_extern _abi_test_clobber_edi +.align 4 +_abi_test_clobber_edi: +L_abi_test_clobber_edi_begin: + xorl %edi,%edi + ret +.globl _abi_test_clobber_esi +.private_extern _abi_test_clobber_esi +.align 4 +_abi_test_clobber_esi: +L_abi_test_clobber_esi_begin: + xorl %esi,%esi + ret +.globl _abi_test_clobber_ebp +.private_extern _abi_test_clobber_ebp +.align 4 +_abi_test_clobber_ebp: +L_abi_test_clobber_ebp_begin: + xorl %ebp,%ebp + ret +.globl _abi_test_clobber_xmm0 +.private_extern _abi_test_clobber_xmm0 +.align 4 +_abi_test_clobber_xmm0: +L_abi_test_clobber_xmm0_begin: + pxor %xmm0,%xmm0 + ret +.globl 
_abi_test_clobber_xmm1 +.private_extern _abi_test_clobber_xmm1 +.align 4 +_abi_test_clobber_xmm1: +L_abi_test_clobber_xmm1_begin: + pxor %xmm1,%xmm1 + ret +.globl _abi_test_clobber_xmm2 +.private_extern _abi_test_clobber_xmm2 +.align 4 +_abi_test_clobber_xmm2: +L_abi_test_clobber_xmm2_begin: + pxor %xmm2,%xmm2 + ret +.globl _abi_test_clobber_xmm3 +.private_extern _abi_test_clobber_xmm3 +.align 4 +_abi_test_clobber_xmm3: +L_abi_test_clobber_xmm3_begin: + pxor %xmm3,%xmm3 + ret +.globl _abi_test_clobber_xmm4 +.private_extern _abi_test_clobber_xmm4 +.align 4 +_abi_test_clobber_xmm4: +L_abi_test_clobber_xmm4_begin: + pxor %xmm4,%xmm4 + ret +.globl _abi_test_clobber_xmm5 +.private_extern _abi_test_clobber_xmm5 +.align 4 +_abi_test_clobber_xmm5: +L_abi_test_clobber_xmm5_begin: + pxor %xmm5,%xmm5 + ret +.globl _abi_test_clobber_xmm6 +.private_extern _abi_test_clobber_xmm6 +.align 4 +_abi_test_clobber_xmm6: +L_abi_test_clobber_xmm6_begin: + pxor %xmm6,%xmm6 + ret +.globl _abi_test_clobber_xmm7 +.private_extern _abi_test_clobber_xmm7 +.align 4 +_abi_test_clobber_xmm7: +L_abi_test_clobber_xmm7_begin: + pxor %xmm7,%xmm7 + ret +#endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/chacha/chacha-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/chacha/chacha-x86_64.S index 30edc7b5e2..10b1ad9520 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/chacha/chacha-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/chacha/chacha-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text @@ -37,6 +49,7 @@ L$sixteen: .p2align 6 _ChaCha20_ctr32: + cmpq $0,%rdx je L$no_data movq _OPENSSL_ia32cap_P+4(%rip),%r10 @@ -44,12 +57,19 @@ _ChaCha20_ctr32: jnz L$ChaCha20_ssse3 pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $64+24,%rsp + L$ctr32_body: @@ -290,20 +310,30 @@ L$oop_tail: L$done: leaq 64+24+48(%rsp),%rsi movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$no_data: .byte 0xf3,0xc3 + .p2align 5 ChaCha20_ssse3: L$ChaCha20_ssse3: + movq %rsp,%r9 + cmpq $128,%rdx ja L$ChaCha20_4x @@ -429,14 +459,18 @@ L$oop_tail_ssse3: L$done_ssse3: leaq (%r9),%rsp + L$ssse3_epilogue: .byte 0xf3,0xc3 + .p2align 5 ChaCha20_4x: L$ChaCha20_4x: + movq %rsp,%r9 + movq %r10,%r11 shrq $32,%r10 testq $32,%r10 @@ -977,14 +1011,18 @@ L$oop_tail4x: L$done4x: leaq (%r9),%rsp + L$4x_epilogue: .byte 0xf3,0xc3 + .p2align 5 ChaCha20_8x: L$ChaCha20_8x: + movq %rsp,%r9 + subq $0x280+8,%rsp andq $-32,%rsp vzeroupper @@ -1579,7 +1617,9 @@ L$oop_tail8x: L$done8x: vzeroall leaq (%r9),%rsp + L$8x_epilogue: .byte 0xf3,0xc3 + #endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S index c8a5262c8d..0c921b37b5 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .data .p2align 4 diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S index c90447ac45..e50227ae38 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aes-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aes-x86_64.S index c7c4829fa0..8875d0abbb 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aes-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aes-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .p2align 4 @@ -156,6 +168,7 @@ L$enc_loop: .p2align 4 _x86_64_AES_encrypt_compact: + leaq 128(%r14),%r8 movl 0-128(%r8),%edi movl 32-128(%r8),%ebp @@ -326,21 +339,30 @@ L$enc_compact_done: xorl 12(%r15),%edx .byte 0xf3,0xc3 -.p2align 4 -.globl _asm_AES_encrypt -.private_extern _asm_AES_encrypt -.private_extern _asm_AES_encrypt -_asm_AES_encrypt: +.p2align 4 +.globl _aes_nohw_encrypt +.private_extern _aes_nohw_encrypt + +.private_extern _aes_nohw_encrypt +_aes_nohw_encrypt: + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + leaq -63(%rdx),%rcx andq $-64,%rsp subq %rsp,%rcx @@ -351,6 +373,7 @@ _asm_AES_encrypt: movq %rsi,16(%rsp) movq %rax,24(%rsp) + L$enc_prologue: movq %rdx,%r15 @@ -377,22 +400,31 @@ L$enc_prologue: movq 16(%rsp),%r9 movq 24(%rsp),%rsi + movl %eax,0(%r9) movl %ebx,4(%r9) movl %ecx,8(%r9) movl %edx,12(%r9) movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$enc_epilogue: .byte 0xf3,0xc3 + .p2align 4 _x86_64_AES_decrypt: xorl 0(%r15),%eax @@ -550,6 +582,7 @@ L$dec_loop: .p2align 4 _x86_64_AES_decrypt_compact: + leaq 128(%r14),%r8 movl 0-128(%r8),%edi movl 32-128(%r8),%ebp @@ -772,21 +805,30 @@ L$dec_compact_done: xorl 12(%r15),%edx .byte 0xf3,0xc3 -.p2align 4 -.globl _asm_AES_decrypt -.private_extern _asm_AES_decrypt -.private_extern _asm_AES_decrypt -_asm_AES_decrypt: +.p2align 4 +.globl _aes_nohw_decrypt +.private_extern _aes_nohw_decrypt + +.private_extern _aes_nohw_decrypt +_aes_nohw_decrypt: + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + leaq -63(%rdx),%rcx andq $-64,%rsp subq %rsp,%rcx @@ -797,6 +839,7 @@ _asm_AES_decrypt: movq 
%rsi,16(%rsp) movq %rax,24(%rsp) + L$dec_prologue: movq %rdx,%r15 @@ -825,47 +868,69 @@ L$dec_prologue: movq 16(%rsp),%r9 movq 24(%rsp),%rsi + movl %eax,0(%r9) movl %ebx,4(%r9) movl %ecx,8(%r9) movl %edx,12(%r9) movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$dec_epilogue: .byte 0xf3,0xc3 -.p2align 4 -.globl _asm_AES_set_encrypt_key -.private_extern _asm_AES_set_encrypt_key -_asm_AES_set_encrypt_key: +.p2align 4 +.globl _aes_nohw_set_encrypt_key +.private_extern _aes_nohw_set_encrypt_key + +_aes_nohw_set_encrypt_key: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $8,%rsp + L$enc_key_prologue: call _x86_64_AES_set_encrypt_key movq 40(%rsp),%rbp + movq 48(%rsp),%rbx + addq $56,%rsp + L$enc_key_epilogue: .byte 0xf3,0xc3 + .p2align 4 _x86_64_AES_set_encrypt_key: + movl %esi,%ecx movq %rdi,%rsi movq %rdx,%rdi @@ -1102,18 +1167,27 @@ L$badpointer: L$exit: .byte 0xf3,0xc3 -.p2align 4 -.globl _asm_AES_set_decrypt_key -.private_extern _asm_AES_set_decrypt_key -_asm_AES_set_decrypt_key: +.p2align 4 +.globl _aes_nohw_set_decrypt_key +.private_extern _aes_nohw_set_decrypt_key + +_aes_nohw_set_decrypt_key: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rdx + L$dec_key_prologue: call _x86_64_AES_set_encrypt_key @@ -1281,31 +1355,49 @@ L$permute: xorq %rax,%rax L$abort: movq 8(%rsp),%r15 + movq 16(%rsp),%r14 + movq 24(%rsp),%r13 + movq 32(%rsp),%r12 + movq 40(%rsp),%rbp + movq 48(%rsp),%rbx + addq $56,%rsp + L$dec_key_epilogue: .byte 0xf3,0xc3 + .p2align 4 -.globl _asm_AES_cbc_encrypt -.private_extern _asm_AES_cbc_encrypt +.globl _aes_nohw_cbc_encrypt +.private_extern _aes_nohw_cbc_encrypt -.private_extern _asm_AES_cbc_encrypt -_asm_AES_cbc_encrypt: +.private_extern _aes_nohw_cbc_encrypt +_aes_nohw_cbc_encrypt: + cmpq $0,%rdx je L$cbc_epilogue pushfq + + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + L$cbc_prologue: cld @@ -1316,6 +1408,7 @@ L$cbc_prologue: cmpq $0,%r9 cmoveq %r10,%r14 + leaq _OPENSSL_ia32cap_P(%rip),%r10 movl (%r10),%r10d cmpq $512,%rdx @@ -1352,7 +1445,9 @@ L$cbc_te_ok: xchgq %rsp,%r15 + movq %r15,16(%rsp) + L$cbc_fast_body: movq %rdi,24(%rsp) movq %rsi,32(%rsp) @@ -1551,6 +1646,7 @@ L$cbc_fast_cleanup: .p2align 4 L$cbc_slow_prologue: + leaq -88(%rsp),%rbp andq $-64,%rbp @@ -1562,7 +1658,9 @@ L$cbc_slow_prologue: xchgq %rsp,%rbp + movq %rbp,16(%rsp) + L$cbc_slow_body: @@ -1734,18 +1832,30 @@ L$cbc_slow_dec_partial: .p2align 4 L$cbc_exit: movq 16(%rsp),%rsi + movq (%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp + L$cbc_popfq: popfq + + + L$cbc_epilogue: .byte 0xf3,0xc3 + .p2align 6 L$AES_Te: .long 0xa56363c6,0xa56363c6 diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S index 2513904cf1..b08a2fbbf9 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text @@ -544,6 +556,10 @@ L$handle_ctr32_2: .p2align 5 _aesni_gcm_encrypt: +#ifdef BORINGSSL_DISPATCH_TEST + + movb $1,_BORINGSSL_function_hit+2(%rip) +#endif xorq %r10,%r10 diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aesni-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aesni-x86_64.S index 4ee0dc49c2..58e072ee1b 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aesni-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aesni-x86_64.S @@ -1,11 +1,28 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text -.globl _aesni_encrypt -.private_extern _aesni_encrypt +.globl _aes_hw_encrypt +.private_extern _aes_hw_encrypt .p2align 4 -_aesni_encrypt: +_aes_hw_encrypt: + +#ifdef BORINGSSL_DISPATCH_TEST + + movb $1,_BORINGSSL_function_hit+1(%rip) +#endif movups (%rdi),%xmm2 movl 240(%rdx),%eax movups (%rdx),%xmm0 @@ -26,11 +43,13 @@ L$oop_enc1_1: .byte 0xf3,0xc3 -.globl _aesni_decrypt -.private_extern _aesni_decrypt + +.globl _aes_hw_decrypt +.private_extern _aes_hw_decrypt .p2align 4 -_aesni_decrypt: +_aes_hw_decrypt: + movups (%rdi),%xmm2 movl 240(%rdx),%eax movups (%rdx),%xmm0 @@ -51,8 +70,10 @@ L$oop_dec1_2: .byte 0xf3,0xc3 + .p2align 4 _aesni_encrypt2: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -80,8 +101,10 @@ L$enc_loop2: .byte 0xf3,0xc3 + .p2align 4 _aesni_decrypt2: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -109,8 +132,10 @@ L$dec_loop2: .byte 0xf3,0xc3 + .p2align 4 _aesni_encrypt3: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -143,8 +168,10 @@ L$enc_loop3: .byte 0xf3,0xc3 + .p2align 4 _aesni_decrypt3: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -177,8 +204,10 @@ L$dec_loop3: .byte 0xf3,0xc3 + .p2align 4 _aesni_encrypt4: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -217,8 +246,10 @@ L$enc_loop4: .byte 0xf3,0xc3 + .p2align 4 _aesni_decrypt4: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -257,8 +288,10 @@ L$dec_loop4: .byte 0xf3,0xc3 + .p2align 4 _aesni_encrypt6: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -311,8 +344,10 @@ L$enc_loop6_enter: .byte 0xf3,0xc3 + .p2align 4 _aesni_decrypt6: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -365,8 +400,10 @@ L$dec_loop6_enter: .byte 0xf3,0xc3 + .p2align 4 _aesni_encrypt8: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -429,8 +466,10 @@ L$enc_loop8_enter: .byte 0xf3,0xc3 + .p2align 4 _aesni_decrypt8: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -492,11 +531,13 @@ L$dec_loop8_enter: .byte 102,68,15,56,223,200 .byte 0xf3,0xc3 -.globl _aesni_ecb_encrypt -.private_extern _aesni_ecb_encrypt + +.globl _aes_hw_ecb_encrypt +.private_extern _aes_hw_ecb_encrypt .p2align 4 -_aesni_ecb_encrypt: +_aes_hw_ecb_encrypt: + andq $-16,%rdx jz L$ecb_ret @@ -835,173 +876,16 @@ L$ecb_ret: pxor %xmm1,%xmm1 .byte 0xf3,0xc3 -.globl _aesni_ccm64_encrypt_blocks -.private_extern 
_aesni_ccm64_encrypt_blocks + +.globl _aes_hw_ctr32_encrypt_blocks +.private_extern _aes_hw_ctr32_encrypt_blocks .p2align 4 -_aesni_ccm64_encrypt_blocks: - movl 240(%rcx),%eax - movdqu (%r8),%xmm6 - movdqa L$increment64(%rip),%xmm9 - movdqa L$bswap_mask(%rip),%xmm7 +_aes_hw_ctr32_encrypt_blocks: - shll $4,%eax - movl $16,%r10d - leaq 0(%rcx),%r11 - movdqu (%r9),%xmm3 - movdqa %xmm6,%xmm2 - leaq 32(%rcx,%rax,1),%rcx -.byte 102,15,56,0,247 - subq %rax,%r10 - jmp L$ccm64_enc_outer -.p2align 4 -L$ccm64_enc_outer: - movups (%r11),%xmm0 - movq %r10,%rax - movups (%rdi),%xmm8 - - xorps %xmm0,%xmm2 - movups 16(%r11),%xmm1 - xorps %xmm8,%xmm0 - xorps %xmm0,%xmm3 - movups 32(%r11),%xmm0 - -L$ccm64_enc2_loop: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 - movups -16(%rcx,%rax,1),%xmm0 - jnz L$ccm64_enc2_loop -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - paddq %xmm9,%xmm6 - decq %rdx -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 - - leaq 16(%rdi),%rdi - xorps %xmm2,%xmm8 - movdqa %xmm6,%xmm2 - movups %xmm8,(%rsi) -.byte 102,15,56,0,215 - leaq 16(%rsi),%rsi - jnz L$ccm64_enc_outer - - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - movups %xmm3,(%r9) - pxor %xmm3,%xmm3 - pxor %xmm8,%xmm8 - pxor %xmm6,%xmm6 - .byte 0xf3,0xc3 - -.globl _aesni_ccm64_decrypt_blocks -.private_extern _aesni_ccm64_decrypt_blocks - -.p2align 4 -_aesni_ccm64_decrypt_blocks: - movl 240(%rcx),%eax - movups (%r8),%xmm6 - movdqu (%r9),%xmm3 - movdqa L$increment64(%rip),%xmm9 - movdqa L$bswap_mask(%rip),%xmm7 - - movaps %xmm6,%xmm2 - movl %eax,%r10d - movq %rcx,%r11 -.byte 102,15,56,0,247 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -L$oop_enc1_5: -.byte 102,15,56,220,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz L$oop_enc1_5 -.byte 102,15,56,221,209 - shll $4,%r10d - movl $16,%eax - movups (%rdi),%xmm8 - paddq %xmm9,%xmm6 - leaq 16(%rdi),%rdi - subq %r10,%rax - leaq 32(%r11,%r10,1),%rcx - movq %rax,%r10 - jmp L$ccm64_dec_outer -.p2align 4 -L$ccm64_dec_outer: - xorps %xmm2,%xmm8 - movdqa %xmm6,%xmm2 - movups %xmm8,(%rsi) - leaq 16(%rsi),%rsi -.byte 102,15,56,0,215 - - subq $1,%rdx - jz L$ccm64_dec_break - - movups (%r11),%xmm0 - movq %r10,%rax - movups 16(%r11),%xmm1 - xorps %xmm0,%xmm8 - xorps %xmm0,%xmm2 - xorps %xmm8,%xmm3 - movups 32(%r11),%xmm0 - jmp L$ccm64_dec2_loop -.p2align 4 -L$ccm64_dec2_loop: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 - movups -16(%rcx,%rax,1),%xmm0 - jnz L$ccm64_dec2_loop - movups (%rdi),%xmm8 - paddq %xmm9,%xmm6 -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 - leaq 16(%rdi),%rdi - jmp L$ccm64_dec_outer - -.p2align 4 -L$ccm64_dec_break: - - movl 240(%r11),%eax - movups (%r11),%xmm0 - movups 16(%r11),%xmm1 - xorps %xmm0,%xmm8 - leaq 32(%r11),%r11 - xorps %xmm8,%xmm3 -L$oop_enc1_6: -.byte 102,15,56,220,217 - decl %eax - movups (%r11),%xmm1 - leaq 16(%r11),%r11 - jnz L$oop_enc1_6 -.byte 102,15,56,221,217 - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - movups %xmm3,(%r9) - pxor %xmm3,%xmm3 - pxor %xmm8,%xmm8 - pxor %xmm6,%xmm6 - .byte 0xf3,0xc3 - -.globl _aesni_ctr32_encrypt_blocks -.private_extern _aesni_ctr32_encrypt_blocks - -.p2align 4 -_aesni_ctr32_encrypt_blocks: +#ifdef BORINGSSL_DISPATCH_TEST + movb $1,_BORINGSSL_function_hit(%rip) +#endif cmpq $1,%rdx jne 
L$ctr32_bulk @@ -1014,12 +898,12 @@ _aesni_ctr32_encrypt_blocks: movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 -L$oop_enc1_7: +L$oop_enc1_5: .byte 102,15,56,220,209 decl %edx movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz L$oop_enc1_7 + jnz L$oop_enc1_5 .byte 102,15,56,221,209 pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 @@ -1032,7 +916,9 @@ L$oop_enc1_7: .p2align 4 L$ctr32_bulk: leaq (%rsp),%r11 + pushq %rbp + subq $128,%rsp andq $-16,%rsp @@ -1567,1798 +1453,19 @@ L$ctr32_done: movaps %xmm0,112(%rsp) pxor %xmm15,%xmm15 movq -8(%r11),%rbp + leaq (%r11),%rsp + L$ctr32_epilogue: .byte 0xf3,0xc3 -.globl _aesni_xts_encrypt -.private_extern _aesni_xts_encrypt + +.globl _aes_hw_cbc_encrypt +.private_extern _aes_hw_cbc_encrypt .p2align 4 -_aesni_xts_encrypt: - leaq (%rsp),%r11 - pushq %rbp - subq $112,%rsp - andq $-16,%rsp - movups (%r9),%xmm2 - movl 240(%r8),%eax - movl 240(%rcx),%r10d - movups (%r8),%xmm0 - movups 16(%r8),%xmm1 - leaq 32(%r8),%r8 - xorps %xmm0,%xmm2 -L$oop_enc1_8: -.byte 102,15,56,220,209 - decl %eax - movups (%r8),%xmm1 - leaq 16(%r8),%r8 - jnz L$oop_enc1_8 -.byte 102,15,56,221,209 - movups (%rcx),%xmm0 - movq %rcx,%rbp - movl %r10d,%eax - shll $4,%r10d - movq %rdx,%r9 - andq $-16,%rdx +_aes_hw_cbc_encrypt: - movups 16(%rcx,%r10,1),%xmm1 - - movdqa L$xts_magic(%rip),%xmm8 - movdqa %xmm2,%xmm15 - pshufd $0x5f,%xmm2,%xmm9 - pxor %xmm0,%xmm1 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm10 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm10 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm11 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm11 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm12 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm12 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm13 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm13 - pxor %xmm14,%xmm15 - movdqa %xmm15,%xmm14 - psrad $31,%xmm9 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pxor %xmm0,%xmm14 - pxor %xmm9,%xmm15 - movaps %xmm1,96(%rsp) - - subq $96,%rdx - jc L$xts_enc_short - - movl $16+96,%eax - leaq 32(%rbp,%r10,1),%rcx - subq %r10,%rax - movups 16(%rbp),%xmm1 - movq %rax,%r10 - leaq L$xts_magic(%rip),%r8 - jmp L$xts_enc_grandloop - -.p2align 5 -L$xts_enc_grandloop: - movdqu 0(%rdi),%xmm2 - movdqa %xmm0,%xmm8 - movdqu 16(%rdi),%xmm3 - pxor %xmm10,%xmm2 - movdqu 32(%rdi),%xmm4 - pxor %xmm11,%xmm3 -.byte 102,15,56,220,209 - movdqu 48(%rdi),%xmm5 - pxor %xmm12,%xmm4 -.byte 102,15,56,220,217 - movdqu 64(%rdi),%xmm6 - pxor %xmm13,%xmm5 -.byte 102,15,56,220,225 - movdqu 80(%rdi),%xmm7 - pxor %xmm15,%xmm8 - movdqa 96(%rsp),%xmm9 - pxor %xmm14,%xmm6 -.byte 102,15,56,220,233 - movups 32(%rbp),%xmm0 - leaq 96(%rdi),%rdi - pxor %xmm8,%xmm7 - - pxor %xmm9,%xmm10 -.byte 102,15,56,220,241 - pxor %xmm9,%xmm11 - movdqa %xmm10,0(%rsp) -.byte 102,15,56,220,249 - movups 48(%rbp),%xmm1 - pxor %xmm9,%xmm12 - -.byte 102,15,56,220,208 - pxor %xmm9,%xmm13 - movdqa %xmm11,16(%rsp) -.byte 102,15,56,220,216 - pxor %xmm9,%xmm14 - movdqa %xmm12,32(%rsp) -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - pxor %xmm9,%xmm8 - movdqa %xmm14,64(%rsp) -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 - movups 64(%rbp),%xmm0 - movdqa %xmm8,80(%rsp) - pshufd $0x5f,%xmm15,%xmm9 - jmp L$xts_enc_loop6 -.p2align 5 -L$xts_enc_loop6: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 
102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups -64(%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 - movups -80(%rcx,%rax,1),%xmm0 - jnz L$xts_enc_loop6 - - movdqa (%r8),%xmm8 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 -.byte 102,15,56,220,209 - paddq %xmm15,%xmm15 - psrad $31,%xmm14 -.byte 102,15,56,220,217 - pand %xmm8,%xmm14 - movups (%rbp),%xmm10 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 - pxor %xmm14,%xmm15 - movaps %xmm10,%xmm11 -.byte 102,15,56,220,249 - movups -64(%rcx),%xmm1 - - movdqa %xmm9,%xmm14 -.byte 102,15,56,220,208 - paddd %xmm9,%xmm9 - pxor %xmm15,%xmm10 -.byte 102,15,56,220,216 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - pand %xmm8,%xmm14 - movaps %xmm11,%xmm12 -.byte 102,15,56,220,240 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 -.byte 102,15,56,220,248 - movups -48(%rcx),%xmm0 - - paddd %xmm9,%xmm9 -.byte 102,15,56,220,209 - pxor %xmm15,%xmm11 - psrad $31,%xmm14 -.byte 102,15,56,220,217 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movdqa %xmm13,48(%rsp) - pxor %xmm14,%xmm15 -.byte 102,15,56,220,241 - movaps %xmm12,%xmm13 - movdqa %xmm9,%xmm14 -.byte 102,15,56,220,249 - movups -32(%rcx),%xmm1 - - paddd %xmm9,%xmm9 -.byte 102,15,56,220,208 - pxor %xmm15,%xmm12 - psrad $31,%xmm14 -.byte 102,15,56,220,216 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 -.byte 102,15,56,220,240 - pxor %xmm14,%xmm15 - movaps %xmm13,%xmm14 -.byte 102,15,56,220,248 - - movdqa %xmm9,%xmm0 - paddd %xmm9,%xmm9 -.byte 102,15,56,220,209 - pxor %xmm15,%xmm13 - psrad $31,%xmm0 -.byte 102,15,56,220,217 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm0 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - pxor %xmm0,%xmm15 - movups (%rbp),%xmm0 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups 16(%rbp),%xmm1 - - pxor %xmm15,%xmm14 -.byte 102,15,56,221,84,36,0 - psrad $31,%xmm9 - paddq %xmm15,%xmm15 -.byte 102,15,56,221,92,36,16 -.byte 102,15,56,221,100,36,32 - pand %xmm8,%xmm9 - movq %r10,%rax -.byte 102,15,56,221,108,36,48 -.byte 102,15,56,221,116,36,64 -.byte 102,15,56,221,124,36,80 - pxor %xmm9,%xmm15 - - leaq 96(%rsi),%rsi - movups %xmm2,-96(%rsi) - movups %xmm3,-80(%rsi) - movups %xmm4,-64(%rsi) - movups %xmm5,-48(%rsi) - movups %xmm6,-32(%rsi) - movups %xmm7,-16(%rsi) - subq $96,%rdx - jnc L$xts_enc_grandloop - - movl $16+96,%eax - subl %r10d,%eax - movq %rbp,%rcx - shrl $4,%eax - -L$xts_enc_short: - - movl %eax,%r10d - pxor %xmm0,%xmm10 - addq $96,%rdx - jz L$xts_enc_done - - pxor %xmm0,%xmm11 - cmpq $0x20,%rdx - jb L$xts_enc_one - pxor %xmm0,%xmm12 - je L$xts_enc_two - - pxor %xmm0,%xmm13 - cmpq $0x40,%rdx - jb L$xts_enc_three - pxor %xmm0,%xmm14 - je L$xts_enc_four - - movdqu (%rdi),%xmm2 - movdqu 16(%rdi),%xmm3 - movdqu 32(%rdi),%xmm4 - pxor %xmm10,%xmm2 - movdqu 48(%rdi),%xmm5 - pxor %xmm11,%xmm3 - movdqu 64(%rdi),%xmm6 - leaq 80(%rdi),%rdi - pxor %xmm12,%xmm4 - pxor %xmm13,%xmm5 - pxor %xmm14,%xmm6 - pxor %xmm7,%xmm7 - - call _aesni_encrypt6 - - xorps %xmm10,%xmm2 - movdqa %xmm15,%xmm10 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - movdqu %xmm2,(%rsi) - xorps %xmm13,%xmm5 - movdqu %xmm3,16(%rsi) - xorps %xmm14,%xmm6 - movdqu %xmm4,32(%rsi) - movdqu %xmm5,48(%rsi) - movdqu %xmm6,64(%rsi) - leaq 80(%rsi),%rsi - jmp L$xts_enc_done - -.p2align 4 -L$xts_enc_one: - 
movups (%rdi),%xmm2 - leaq 16(%rdi),%rdi - xorps %xmm10,%xmm2 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -L$oop_enc1_9: -.byte 102,15,56,220,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz L$oop_enc1_9 -.byte 102,15,56,221,209 - xorps %xmm10,%xmm2 - movdqa %xmm11,%xmm10 - movups %xmm2,(%rsi) - leaq 16(%rsi),%rsi - jmp L$xts_enc_done - -.p2align 4 -L$xts_enc_two: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - leaq 32(%rdi),%rdi - xorps %xmm10,%xmm2 - xorps %xmm11,%xmm3 - - call _aesni_encrypt2 - - xorps %xmm10,%xmm2 - movdqa %xmm12,%xmm10 - xorps %xmm11,%xmm3 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - leaq 32(%rsi),%rsi - jmp L$xts_enc_done - -.p2align 4 -L$xts_enc_three: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - movups 32(%rdi),%xmm4 - leaq 48(%rdi),%rdi - xorps %xmm10,%xmm2 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - - call _aesni_encrypt3 - - xorps %xmm10,%xmm2 - movdqa %xmm13,%xmm10 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - leaq 48(%rsi),%rsi - jmp L$xts_enc_done - -.p2align 4 -L$xts_enc_four: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - movups 32(%rdi),%xmm4 - xorps %xmm10,%xmm2 - movups 48(%rdi),%xmm5 - leaq 64(%rdi),%rdi - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - xorps %xmm13,%xmm5 - - call _aesni_encrypt4 - - pxor %xmm10,%xmm2 - movdqa %xmm14,%xmm10 - pxor %xmm11,%xmm3 - pxor %xmm12,%xmm4 - movdqu %xmm2,(%rsi) - pxor %xmm13,%xmm5 - movdqu %xmm3,16(%rsi) - movdqu %xmm4,32(%rsi) - movdqu %xmm5,48(%rsi) - leaq 64(%rsi),%rsi - jmp L$xts_enc_done - -.p2align 4 -L$xts_enc_done: - andq $15,%r9 - jz L$xts_enc_ret - movq %r9,%rdx - -L$xts_enc_steal: - movzbl (%rdi),%eax - movzbl -16(%rsi),%ecx - leaq 1(%rdi),%rdi - movb %al,-16(%rsi) - movb %cl,0(%rsi) - leaq 1(%rsi),%rsi - subq $1,%rdx - jnz L$xts_enc_steal - - subq %r9,%rsi - movq %rbp,%rcx - movl %r10d,%eax - - movups -16(%rsi),%xmm2 - xorps %xmm10,%xmm2 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -L$oop_enc1_10: -.byte 102,15,56,220,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz L$oop_enc1_10 -.byte 102,15,56,221,209 - xorps %xmm10,%xmm2 - movups %xmm2,-16(%rsi) - -L$xts_enc_ret: - xorps %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - movaps %xmm0,0(%rsp) - pxor %xmm8,%xmm8 - movaps %xmm0,16(%rsp) - pxor %xmm9,%xmm9 - movaps %xmm0,32(%rsp) - pxor %xmm10,%xmm10 - movaps %xmm0,48(%rsp) - pxor %xmm11,%xmm11 - movaps %xmm0,64(%rsp) - pxor %xmm12,%xmm12 - movaps %xmm0,80(%rsp) - pxor %xmm13,%xmm13 - movaps %xmm0,96(%rsp) - pxor %xmm14,%xmm14 - pxor %xmm15,%xmm15 - movq -8(%r11),%rbp - leaq (%r11),%rsp -L$xts_enc_epilogue: - .byte 0xf3,0xc3 - -.globl _aesni_xts_decrypt -.private_extern _aesni_xts_decrypt - -.p2align 4 -_aesni_xts_decrypt: - leaq (%rsp),%r11 - pushq %rbp - subq $112,%rsp - andq $-16,%rsp - movups (%r9),%xmm2 - movl 240(%r8),%eax - movl 240(%rcx),%r10d - movups (%r8),%xmm0 - movups 16(%r8),%xmm1 - leaq 32(%r8),%r8 - xorps %xmm0,%xmm2 -L$oop_enc1_11: -.byte 102,15,56,220,209 - decl %eax - movups (%r8),%xmm1 - leaq 16(%r8),%r8 - jnz L$oop_enc1_11 -.byte 102,15,56,221,209 - xorl %eax,%eax - testq $15,%rdx - setnz %al - shlq $4,%rax - subq %rax,%rdx - - movups (%rcx),%xmm0 - movq %rcx,%rbp - movl %r10d,%eax - shll $4,%r10d - movq %rdx,%r9 - andq $-16,%rdx - - movups 16(%rcx,%r10,1),%xmm1 - - movdqa L$xts_magic(%rip),%xmm8 - 
movdqa %xmm2,%xmm15 - pshufd $0x5f,%xmm2,%xmm9 - pxor %xmm0,%xmm1 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm10 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm10 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm11 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm11 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm12 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm12 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm13 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm13 - pxor %xmm14,%xmm15 - movdqa %xmm15,%xmm14 - psrad $31,%xmm9 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pxor %xmm0,%xmm14 - pxor %xmm9,%xmm15 - movaps %xmm1,96(%rsp) - - subq $96,%rdx - jc L$xts_dec_short - - movl $16+96,%eax - leaq 32(%rbp,%r10,1),%rcx - subq %r10,%rax - movups 16(%rbp),%xmm1 - movq %rax,%r10 - leaq L$xts_magic(%rip),%r8 - jmp L$xts_dec_grandloop - -.p2align 5 -L$xts_dec_grandloop: - movdqu 0(%rdi),%xmm2 - movdqa %xmm0,%xmm8 - movdqu 16(%rdi),%xmm3 - pxor %xmm10,%xmm2 - movdqu 32(%rdi),%xmm4 - pxor %xmm11,%xmm3 -.byte 102,15,56,222,209 - movdqu 48(%rdi),%xmm5 - pxor %xmm12,%xmm4 -.byte 102,15,56,222,217 - movdqu 64(%rdi),%xmm6 - pxor %xmm13,%xmm5 -.byte 102,15,56,222,225 - movdqu 80(%rdi),%xmm7 - pxor %xmm15,%xmm8 - movdqa 96(%rsp),%xmm9 - pxor %xmm14,%xmm6 -.byte 102,15,56,222,233 - movups 32(%rbp),%xmm0 - leaq 96(%rdi),%rdi - pxor %xmm8,%xmm7 - - pxor %xmm9,%xmm10 -.byte 102,15,56,222,241 - pxor %xmm9,%xmm11 - movdqa %xmm10,0(%rsp) -.byte 102,15,56,222,249 - movups 48(%rbp),%xmm1 - pxor %xmm9,%xmm12 - -.byte 102,15,56,222,208 - pxor %xmm9,%xmm13 - movdqa %xmm11,16(%rsp) -.byte 102,15,56,222,216 - pxor %xmm9,%xmm14 - movdqa %xmm12,32(%rsp) -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 - pxor %xmm9,%xmm8 - movdqa %xmm14,64(%rsp) -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 - movups 64(%rbp),%xmm0 - movdqa %xmm8,80(%rsp) - pshufd $0x5f,%xmm15,%xmm9 - jmp L$xts_dec_loop6 -.p2align 5 -L$xts_dec_loop6: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups -64(%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 - movups -80(%rcx,%rax,1),%xmm0 - jnz L$xts_dec_loop6 - - movdqa (%r8),%xmm8 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 -.byte 102,15,56,222,209 - paddq %xmm15,%xmm15 - psrad $31,%xmm14 -.byte 102,15,56,222,217 - pand %xmm8,%xmm14 - movups (%rbp),%xmm10 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 - pxor %xmm14,%xmm15 - movaps %xmm10,%xmm11 -.byte 102,15,56,222,249 - movups -64(%rcx),%xmm1 - - movdqa %xmm9,%xmm14 -.byte 102,15,56,222,208 - paddd %xmm9,%xmm9 - pxor %xmm15,%xmm10 -.byte 102,15,56,222,216 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 - pand %xmm8,%xmm14 - movaps %xmm11,%xmm12 -.byte 102,15,56,222,240 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 -.byte 102,15,56,222,248 - movups -48(%rcx),%xmm0 - - paddd %xmm9,%xmm9 -.byte 102,15,56,222,209 - pxor %xmm15,%xmm11 - psrad $31,%xmm14 -.byte 102,15,56,222,217 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - movdqa %xmm13,48(%rsp) - pxor %xmm14,%xmm15 
-.byte 102,15,56,222,241 - movaps %xmm12,%xmm13 - movdqa %xmm9,%xmm14 -.byte 102,15,56,222,249 - movups -32(%rcx),%xmm1 - - paddd %xmm9,%xmm9 -.byte 102,15,56,222,208 - pxor %xmm15,%xmm12 - psrad $31,%xmm14 -.byte 102,15,56,222,216 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 - pxor %xmm14,%xmm15 - movaps %xmm13,%xmm14 -.byte 102,15,56,222,248 - - movdqa %xmm9,%xmm0 - paddd %xmm9,%xmm9 -.byte 102,15,56,222,209 - pxor %xmm15,%xmm13 - psrad $31,%xmm0 -.byte 102,15,56,222,217 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm0 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - pxor %xmm0,%xmm15 - movups (%rbp),%xmm0 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups 16(%rbp),%xmm1 - - pxor %xmm15,%xmm14 -.byte 102,15,56,223,84,36,0 - psrad $31,%xmm9 - paddq %xmm15,%xmm15 -.byte 102,15,56,223,92,36,16 -.byte 102,15,56,223,100,36,32 - pand %xmm8,%xmm9 - movq %r10,%rax -.byte 102,15,56,223,108,36,48 -.byte 102,15,56,223,116,36,64 -.byte 102,15,56,223,124,36,80 - pxor %xmm9,%xmm15 - - leaq 96(%rsi),%rsi - movups %xmm2,-96(%rsi) - movups %xmm3,-80(%rsi) - movups %xmm4,-64(%rsi) - movups %xmm5,-48(%rsi) - movups %xmm6,-32(%rsi) - movups %xmm7,-16(%rsi) - subq $96,%rdx - jnc L$xts_dec_grandloop - - movl $16+96,%eax - subl %r10d,%eax - movq %rbp,%rcx - shrl $4,%eax - -L$xts_dec_short: - - movl %eax,%r10d - pxor %xmm0,%xmm10 - pxor %xmm0,%xmm11 - addq $96,%rdx - jz L$xts_dec_done - - pxor %xmm0,%xmm12 - cmpq $0x20,%rdx - jb L$xts_dec_one - pxor %xmm0,%xmm13 - je L$xts_dec_two - - pxor %xmm0,%xmm14 - cmpq $0x40,%rdx - jb L$xts_dec_three - je L$xts_dec_four - - movdqu (%rdi),%xmm2 - movdqu 16(%rdi),%xmm3 - movdqu 32(%rdi),%xmm4 - pxor %xmm10,%xmm2 - movdqu 48(%rdi),%xmm5 - pxor %xmm11,%xmm3 - movdqu 64(%rdi),%xmm6 - leaq 80(%rdi),%rdi - pxor %xmm12,%xmm4 - pxor %xmm13,%xmm5 - pxor %xmm14,%xmm6 - - call _aesni_decrypt6 - - xorps %xmm10,%xmm2 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - movdqu %xmm2,(%rsi) - xorps %xmm13,%xmm5 - movdqu %xmm3,16(%rsi) - xorps %xmm14,%xmm6 - movdqu %xmm4,32(%rsi) - pxor %xmm14,%xmm14 - movdqu %xmm5,48(%rsi) - pcmpgtd %xmm15,%xmm14 - movdqu %xmm6,64(%rsi) - leaq 80(%rsi),%rsi - pshufd $0x13,%xmm14,%xmm11 - andq $15,%r9 - jz L$xts_dec_ret - - movdqa %xmm15,%xmm10 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm11 - pxor %xmm15,%xmm11 - jmp L$xts_dec_done2 - -.p2align 4 -L$xts_dec_one: - movups (%rdi),%xmm2 - leaq 16(%rdi),%rdi - xorps %xmm10,%xmm2 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -L$oop_dec1_12: -.byte 102,15,56,222,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz L$oop_dec1_12 -.byte 102,15,56,223,209 - xorps %xmm10,%xmm2 - movdqa %xmm11,%xmm10 - movups %xmm2,(%rsi) - movdqa %xmm12,%xmm11 - leaq 16(%rsi),%rsi - jmp L$xts_dec_done - -.p2align 4 -L$xts_dec_two: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - leaq 32(%rdi),%rdi - xorps %xmm10,%xmm2 - xorps %xmm11,%xmm3 - - call _aesni_decrypt2 - - xorps %xmm10,%xmm2 - movdqa %xmm12,%xmm10 - xorps %xmm11,%xmm3 - movdqa %xmm13,%xmm11 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - leaq 32(%rsi),%rsi - jmp L$xts_dec_done - -.p2align 4 -L$xts_dec_three: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - movups 32(%rdi),%xmm4 - leaq 48(%rdi),%rdi - xorps %xmm10,%xmm2 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - - call _aesni_decrypt3 - - xorps %xmm10,%xmm2 - movdqa %xmm13,%xmm10 - xorps %xmm11,%xmm3 - movdqa %xmm14,%xmm11 - xorps %xmm12,%xmm4 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups 
%xmm4,32(%rsi) - leaq 48(%rsi),%rsi - jmp L$xts_dec_done - -.p2align 4 -L$xts_dec_four: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - movups 32(%rdi),%xmm4 - xorps %xmm10,%xmm2 - movups 48(%rdi),%xmm5 - leaq 64(%rdi),%rdi - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - xorps %xmm13,%xmm5 - - call _aesni_decrypt4 - - pxor %xmm10,%xmm2 - movdqa %xmm14,%xmm10 - pxor %xmm11,%xmm3 - movdqa %xmm15,%xmm11 - pxor %xmm12,%xmm4 - movdqu %xmm2,(%rsi) - pxor %xmm13,%xmm5 - movdqu %xmm3,16(%rsi) - movdqu %xmm4,32(%rsi) - movdqu %xmm5,48(%rsi) - leaq 64(%rsi),%rsi - jmp L$xts_dec_done - -.p2align 4 -L$xts_dec_done: - andq $15,%r9 - jz L$xts_dec_ret -L$xts_dec_done2: - movq %r9,%rdx - movq %rbp,%rcx - movl %r10d,%eax - - movups (%rdi),%xmm2 - xorps %xmm11,%xmm2 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -L$oop_dec1_13: -.byte 102,15,56,222,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz L$oop_dec1_13 -.byte 102,15,56,223,209 - xorps %xmm11,%xmm2 - movups %xmm2,(%rsi) - -L$xts_dec_steal: - movzbl 16(%rdi),%eax - movzbl (%rsi),%ecx - leaq 1(%rdi),%rdi - movb %al,(%rsi) - movb %cl,16(%rsi) - leaq 1(%rsi),%rsi - subq $1,%rdx - jnz L$xts_dec_steal - - subq %r9,%rsi - movq %rbp,%rcx - movl %r10d,%eax - - movups (%rsi),%xmm2 - xorps %xmm10,%xmm2 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -L$oop_dec1_14: -.byte 102,15,56,222,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz L$oop_dec1_14 -.byte 102,15,56,223,209 - xorps %xmm10,%xmm2 - movups %xmm2,(%rsi) - -L$xts_dec_ret: - xorps %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - movaps %xmm0,0(%rsp) - pxor %xmm8,%xmm8 - movaps %xmm0,16(%rsp) - pxor %xmm9,%xmm9 - movaps %xmm0,32(%rsp) - pxor %xmm10,%xmm10 - movaps %xmm0,48(%rsp) - pxor %xmm11,%xmm11 - movaps %xmm0,64(%rsp) - pxor %xmm12,%xmm12 - movaps %xmm0,80(%rsp) - pxor %xmm13,%xmm13 - movaps %xmm0,96(%rsp) - pxor %xmm14,%xmm14 - pxor %xmm15,%xmm15 - movq -8(%r11),%rbp - leaq (%r11),%rsp -L$xts_dec_epilogue: - .byte 0xf3,0xc3 - -.globl _aesni_ocb_encrypt -.private_extern _aesni_ocb_encrypt - -.p2align 5 -_aesni_ocb_encrypt: - leaq (%rsp),%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - movq 8(%rax),%rbx - movq 8+8(%rax),%rbp - - movl 240(%rcx),%r10d - movq %rcx,%r11 - shll $4,%r10d - movups (%rcx),%xmm9 - movups 16(%rcx,%r10,1),%xmm1 - - movdqu (%r9),%xmm15 - pxor %xmm1,%xmm9 - pxor %xmm1,%xmm15 - - movl $16+32,%eax - leaq 32(%r11,%r10,1),%rcx - movups 16(%r11),%xmm1 - subq %r10,%rax - movq %rax,%r10 - - movdqu (%rbx),%xmm10 - movdqu (%rbp),%xmm8 - - testq $1,%r8 - jnz L$ocb_enc_odd - - bsfq %r8,%r12 - addq $1,%r8 - shlq $4,%r12 - movdqu (%rbx,%r12,1),%xmm7 - movdqu (%rdi),%xmm2 - leaq 16(%rdi),%rdi - - call __ocb_encrypt1 - - movdqa %xmm7,%xmm15 - movups %xmm2,(%rsi) - leaq 16(%rsi),%rsi - subq $1,%rdx - jz L$ocb_enc_done - -L$ocb_enc_odd: - leaq 1(%r8),%r12 - leaq 3(%r8),%r13 - leaq 5(%r8),%r14 - leaq 6(%r8),%r8 - bsfq %r12,%r12 - bsfq %r13,%r13 - bsfq %r14,%r14 - shlq $4,%r12 - shlq $4,%r13 - shlq $4,%r14 - - subq $6,%rdx - jc L$ocb_enc_short - jmp L$ocb_enc_grandloop - -.p2align 5 -L$ocb_enc_grandloop: - movdqu 0(%rdi),%xmm2 - movdqu 16(%rdi),%xmm3 - movdqu 32(%rdi),%xmm4 - movdqu 48(%rdi),%xmm5 - movdqu 64(%rdi),%xmm6 - movdqu 80(%rdi),%xmm7 - leaq 96(%rdi),%rdi - - call __ocb_encrypt6 - - movups %xmm2,0(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups 
%xmm5,48(%rsi) - movups %xmm6,64(%rsi) - movups %xmm7,80(%rsi) - leaq 96(%rsi),%rsi - subq $6,%rdx - jnc L$ocb_enc_grandloop - -L$ocb_enc_short: - addq $6,%rdx - jz L$ocb_enc_done - - movdqu 0(%rdi),%xmm2 - cmpq $2,%rdx - jb L$ocb_enc_one - movdqu 16(%rdi),%xmm3 - je L$ocb_enc_two - - movdqu 32(%rdi),%xmm4 - cmpq $4,%rdx - jb L$ocb_enc_three - movdqu 48(%rdi),%xmm5 - je L$ocb_enc_four - - movdqu 64(%rdi),%xmm6 - pxor %xmm7,%xmm7 - - call __ocb_encrypt6 - - movdqa %xmm14,%xmm15 - movups %xmm2,0(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - movups %xmm6,64(%rsi) - - jmp L$ocb_enc_done - -.p2align 4 -L$ocb_enc_one: - movdqa %xmm10,%xmm7 - - call __ocb_encrypt1 - - movdqa %xmm7,%xmm15 - movups %xmm2,0(%rsi) - jmp L$ocb_enc_done - -.p2align 4 -L$ocb_enc_two: - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - - call __ocb_encrypt4 - - movdqa %xmm11,%xmm15 - movups %xmm2,0(%rsi) - movups %xmm3,16(%rsi) - - jmp L$ocb_enc_done - -.p2align 4 -L$ocb_enc_three: - pxor %xmm5,%xmm5 - - call __ocb_encrypt4 - - movdqa %xmm12,%xmm15 - movups %xmm2,0(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - - jmp L$ocb_enc_done - -.p2align 4 -L$ocb_enc_four: - call __ocb_encrypt4 - - movdqa %xmm13,%xmm15 - movups %xmm2,0(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - -L$ocb_enc_done: - pxor %xmm0,%xmm15 - movdqu %xmm8,(%rbp) - movdqu %xmm15,(%r9) - - xorps %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - pxor %xmm8,%xmm8 - pxor %xmm9,%xmm9 - pxor %xmm10,%xmm10 - pxor %xmm11,%xmm11 - pxor %xmm12,%xmm12 - pxor %xmm13,%xmm13 - pxor %xmm14,%xmm14 - pxor %xmm15,%xmm15 - leaq 40(%rsp),%rax - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -L$ocb_enc_epilogue: - .byte 0xf3,0xc3 - - - -.p2align 5 -__ocb_encrypt6: - pxor %xmm9,%xmm15 - movdqu (%rbx,%r12,1),%xmm11 - movdqa %xmm10,%xmm12 - movdqu (%rbx,%r13,1),%xmm13 - movdqa %xmm10,%xmm14 - pxor %xmm15,%xmm10 - movdqu (%rbx,%r14,1),%xmm15 - pxor %xmm10,%xmm11 - pxor %xmm2,%xmm8 - pxor %xmm10,%xmm2 - pxor %xmm11,%xmm12 - pxor %xmm3,%xmm8 - pxor %xmm11,%xmm3 - pxor %xmm12,%xmm13 - pxor %xmm4,%xmm8 - pxor %xmm12,%xmm4 - pxor %xmm13,%xmm14 - pxor %xmm5,%xmm8 - pxor %xmm13,%xmm5 - pxor %xmm14,%xmm15 - pxor %xmm6,%xmm8 - pxor %xmm14,%xmm6 - pxor %xmm7,%xmm8 - pxor %xmm15,%xmm7 - movups 32(%r11),%xmm0 - - leaq 1(%r8),%r12 - leaq 3(%r8),%r13 - leaq 5(%r8),%r14 - addq $6,%r8 - pxor %xmm9,%xmm10 - bsfq %r12,%r12 - bsfq %r13,%r13 - bsfq %r14,%r14 - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - pxor %xmm9,%xmm11 - pxor %xmm9,%xmm12 -.byte 102,15,56,220,241 - pxor %xmm9,%xmm13 - pxor %xmm9,%xmm14 -.byte 102,15,56,220,249 - movups 48(%r11),%xmm1 - pxor %xmm9,%xmm15 - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 - movups 64(%r11),%xmm0 - shlq $4,%r12 - shlq $4,%r13 - jmp L$ocb_enc_loop6 - -.p2align 5 -L$ocb_enc_loop6: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 - movups -16(%rcx,%rax,1),%xmm0 - jnz 
L$ocb_enc_loop6 - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups 16(%r11),%xmm1 - shlq $4,%r14 - -.byte 102,65,15,56,221,210 - movdqu (%rbx),%xmm10 - movq %r10,%rax -.byte 102,65,15,56,221,219 -.byte 102,65,15,56,221,228 -.byte 102,65,15,56,221,237 -.byte 102,65,15,56,221,246 -.byte 102,65,15,56,221,255 - .byte 0xf3,0xc3 - - - -.p2align 5 -__ocb_encrypt4: - pxor %xmm9,%xmm15 - movdqu (%rbx,%r12,1),%xmm11 - movdqa %xmm10,%xmm12 - movdqu (%rbx,%r13,1),%xmm13 - pxor %xmm15,%xmm10 - pxor %xmm10,%xmm11 - pxor %xmm2,%xmm8 - pxor %xmm10,%xmm2 - pxor %xmm11,%xmm12 - pxor %xmm3,%xmm8 - pxor %xmm11,%xmm3 - pxor %xmm12,%xmm13 - pxor %xmm4,%xmm8 - pxor %xmm12,%xmm4 - pxor %xmm5,%xmm8 - pxor %xmm13,%xmm5 - movups 32(%r11),%xmm0 - - pxor %xmm9,%xmm10 - pxor %xmm9,%xmm11 - pxor %xmm9,%xmm12 - pxor %xmm9,%xmm13 - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movups 48(%r11),%xmm1 - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - movups 64(%r11),%xmm0 - jmp L$ocb_enc_loop4 - -.p2align 5 -L$ocb_enc_loop4: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - movups -16(%rcx,%rax,1),%xmm0 - jnz L$ocb_enc_loop4 - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movups 16(%r11),%xmm1 - movq %r10,%rax - -.byte 102,65,15,56,221,210 -.byte 102,65,15,56,221,219 -.byte 102,65,15,56,221,228 -.byte 102,65,15,56,221,237 - .byte 0xf3,0xc3 - - - -.p2align 5 -__ocb_encrypt1: - pxor %xmm15,%xmm7 - pxor %xmm9,%xmm7 - pxor %xmm2,%xmm8 - pxor %xmm7,%xmm2 - movups 32(%r11),%xmm0 - -.byte 102,15,56,220,209 - movups 48(%r11),%xmm1 - pxor %xmm9,%xmm7 - -.byte 102,15,56,220,208 - movups 64(%r11),%xmm0 - jmp L$ocb_enc_loop1 - -.p2align 5 -L$ocb_enc_loop1: -.byte 102,15,56,220,209 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,220,208 - movups -16(%rcx,%rax,1),%xmm0 - jnz L$ocb_enc_loop1 - -.byte 102,15,56,220,209 - movups 16(%r11),%xmm1 - movq %r10,%rax - -.byte 102,15,56,221,215 - .byte 0xf3,0xc3 - - -.globl _aesni_ocb_decrypt -.private_extern _aesni_ocb_decrypt - -.p2align 5 -_aesni_ocb_decrypt: - leaq (%rsp),%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - movq 8(%rax),%rbx - movq 8+8(%rax),%rbp - - movl 240(%rcx),%r10d - movq %rcx,%r11 - shll $4,%r10d - movups (%rcx),%xmm9 - movups 16(%rcx,%r10,1),%xmm1 - - movdqu (%r9),%xmm15 - pxor %xmm1,%xmm9 - pxor %xmm1,%xmm15 - - movl $16+32,%eax - leaq 32(%r11,%r10,1),%rcx - movups 16(%r11),%xmm1 - subq %r10,%rax - movq %rax,%r10 - - movdqu (%rbx),%xmm10 - movdqu (%rbp),%xmm8 - - testq $1,%r8 - jnz L$ocb_dec_odd - - bsfq %r8,%r12 - addq $1,%r8 - shlq $4,%r12 - movdqu (%rbx,%r12,1),%xmm7 - movdqu (%rdi),%xmm2 - leaq 16(%rdi),%rdi - - call __ocb_decrypt1 - - movdqa %xmm7,%xmm15 - movups %xmm2,(%rsi) - xorps %xmm2,%xmm8 - leaq 16(%rsi),%rsi - subq $1,%rdx - jz L$ocb_dec_done - -L$ocb_dec_odd: - leaq 1(%r8),%r12 - leaq 3(%r8),%r13 - leaq 5(%r8),%r14 - leaq 6(%r8),%r8 - bsfq %r12,%r12 - bsfq %r13,%r13 - bsfq %r14,%r14 - shlq $4,%r12 - shlq $4,%r13 - shlq $4,%r14 - - subq $6,%rdx - jc L$ocb_dec_short - jmp L$ocb_dec_grandloop - -.p2align 5 -L$ocb_dec_grandloop: - movdqu 0(%rdi),%xmm2 - movdqu 
16(%rdi),%xmm3 - movdqu 32(%rdi),%xmm4 - movdqu 48(%rdi),%xmm5 - movdqu 64(%rdi),%xmm6 - movdqu 80(%rdi),%xmm7 - leaq 96(%rdi),%rdi - - call __ocb_decrypt6 - - movups %xmm2,0(%rsi) - pxor %xmm2,%xmm8 - movups %xmm3,16(%rsi) - pxor %xmm3,%xmm8 - movups %xmm4,32(%rsi) - pxor %xmm4,%xmm8 - movups %xmm5,48(%rsi) - pxor %xmm5,%xmm8 - movups %xmm6,64(%rsi) - pxor %xmm6,%xmm8 - movups %xmm7,80(%rsi) - pxor %xmm7,%xmm8 - leaq 96(%rsi),%rsi - subq $6,%rdx - jnc L$ocb_dec_grandloop - -L$ocb_dec_short: - addq $6,%rdx - jz L$ocb_dec_done - - movdqu 0(%rdi),%xmm2 - cmpq $2,%rdx - jb L$ocb_dec_one - movdqu 16(%rdi),%xmm3 - je L$ocb_dec_two - - movdqu 32(%rdi),%xmm4 - cmpq $4,%rdx - jb L$ocb_dec_three - movdqu 48(%rdi),%xmm5 - je L$ocb_dec_four - - movdqu 64(%rdi),%xmm6 - pxor %xmm7,%xmm7 - - call __ocb_decrypt6 - - movdqa %xmm14,%xmm15 - movups %xmm2,0(%rsi) - pxor %xmm2,%xmm8 - movups %xmm3,16(%rsi) - pxor %xmm3,%xmm8 - movups %xmm4,32(%rsi) - pxor %xmm4,%xmm8 - movups %xmm5,48(%rsi) - pxor %xmm5,%xmm8 - movups %xmm6,64(%rsi) - pxor %xmm6,%xmm8 - - jmp L$ocb_dec_done - -.p2align 4 -L$ocb_dec_one: - movdqa %xmm10,%xmm7 - - call __ocb_decrypt1 - - movdqa %xmm7,%xmm15 - movups %xmm2,0(%rsi) - xorps %xmm2,%xmm8 - jmp L$ocb_dec_done - -.p2align 4 -L$ocb_dec_two: - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - - call __ocb_decrypt4 - - movdqa %xmm11,%xmm15 - movups %xmm2,0(%rsi) - xorps %xmm2,%xmm8 - movups %xmm3,16(%rsi) - xorps %xmm3,%xmm8 - - jmp L$ocb_dec_done - -.p2align 4 -L$ocb_dec_three: - pxor %xmm5,%xmm5 - - call __ocb_decrypt4 - - movdqa %xmm12,%xmm15 - movups %xmm2,0(%rsi) - xorps %xmm2,%xmm8 - movups %xmm3,16(%rsi) - xorps %xmm3,%xmm8 - movups %xmm4,32(%rsi) - xorps %xmm4,%xmm8 - - jmp L$ocb_dec_done - -.p2align 4 -L$ocb_dec_four: - call __ocb_decrypt4 - - movdqa %xmm13,%xmm15 - movups %xmm2,0(%rsi) - pxor %xmm2,%xmm8 - movups %xmm3,16(%rsi) - pxor %xmm3,%xmm8 - movups %xmm4,32(%rsi) - pxor %xmm4,%xmm8 - movups %xmm5,48(%rsi) - pxor %xmm5,%xmm8 - -L$ocb_dec_done: - pxor %xmm0,%xmm15 - movdqu %xmm8,(%rbp) - movdqu %xmm15,(%r9) - - xorps %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - pxor %xmm8,%xmm8 - pxor %xmm9,%xmm9 - pxor %xmm10,%xmm10 - pxor %xmm11,%xmm11 - pxor %xmm12,%xmm12 - pxor %xmm13,%xmm13 - pxor %xmm14,%xmm14 - pxor %xmm15,%xmm15 - leaq 40(%rsp),%rax - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -L$ocb_dec_epilogue: - .byte 0xf3,0xc3 - - - -.p2align 5 -__ocb_decrypt6: - pxor %xmm9,%xmm15 - movdqu (%rbx,%r12,1),%xmm11 - movdqa %xmm10,%xmm12 - movdqu (%rbx,%r13,1),%xmm13 - movdqa %xmm10,%xmm14 - pxor %xmm15,%xmm10 - movdqu (%rbx,%r14,1),%xmm15 - pxor %xmm10,%xmm11 - pxor %xmm10,%xmm2 - pxor %xmm11,%xmm12 - pxor %xmm11,%xmm3 - pxor %xmm12,%xmm13 - pxor %xmm12,%xmm4 - pxor %xmm13,%xmm14 - pxor %xmm13,%xmm5 - pxor %xmm14,%xmm15 - pxor %xmm14,%xmm6 - pxor %xmm15,%xmm7 - movups 32(%r11),%xmm0 - - leaq 1(%r8),%r12 - leaq 3(%r8),%r13 - leaq 5(%r8),%r14 - addq $6,%r8 - pxor %xmm9,%xmm10 - bsfq %r12,%r12 - bsfq %r13,%r13 - bsfq %r14,%r14 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - pxor %xmm9,%xmm11 - pxor %xmm9,%xmm12 -.byte 102,15,56,222,241 - pxor %xmm9,%xmm13 - pxor %xmm9,%xmm14 -.byte 102,15,56,222,249 - movups 48(%r11),%xmm1 - pxor %xmm9,%xmm15 - -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 
102,15,56,222,240 -.byte 102,15,56,222,248 - movups 64(%r11),%xmm0 - shlq $4,%r12 - shlq $4,%r13 - jmp L$ocb_dec_loop6 - -.p2align 5 -L$ocb_dec_loop6: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 - movups -16(%rcx,%rax,1),%xmm0 - jnz L$ocb_dec_loop6 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups 16(%r11),%xmm1 - shlq $4,%r14 - -.byte 102,65,15,56,223,210 - movdqu (%rbx),%xmm10 - movq %r10,%rax -.byte 102,65,15,56,223,219 -.byte 102,65,15,56,223,228 -.byte 102,65,15,56,223,237 -.byte 102,65,15,56,223,246 -.byte 102,65,15,56,223,255 - .byte 0xf3,0xc3 - - - -.p2align 5 -__ocb_decrypt4: - pxor %xmm9,%xmm15 - movdqu (%rbx,%r12,1),%xmm11 - movdqa %xmm10,%xmm12 - movdqu (%rbx,%r13,1),%xmm13 - pxor %xmm15,%xmm10 - pxor %xmm10,%xmm11 - pxor %xmm10,%xmm2 - pxor %xmm11,%xmm12 - pxor %xmm11,%xmm3 - pxor %xmm12,%xmm13 - pxor %xmm12,%xmm4 - pxor %xmm13,%xmm5 - movups 32(%r11),%xmm0 - - pxor %xmm9,%xmm10 - pxor %xmm9,%xmm11 - pxor %xmm9,%xmm12 - pxor %xmm9,%xmm13 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - movups 48(%r11),%xmm1 - -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 - movups 64(%r11),%xmm0 - jmp L$ocb_dec_loop4 - -.p2align 5 -L$ocb_dec_loop4: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 - movups -16(%rcx,%rax,1),%xmm0 - jnz L$ocb_dec_loop4 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - movups 16(%r11),%xmm1 - movq %r10,%rax - -.byte 102,65,15,56,223,210 -.byte 102,65,15,56,223,219 -.byte 102,65,15,56,223,228 -.byte 102,65,15,56,223,237 - .byte 0xf3,0xc3 - - - -.p2align 5 -__ocb_decrypt1: - pxor %xmm15,%xmm7 - pxor %xmm9,%xmm7 - pxor %xmm7,%xmm2 - movups 32(%r11),%xmm0 - -.byte 102,15,56,222,209 - movups 48(%r11),%xmm1 - pxor %xmm9,%xmm7 - -.byte 102,15,56,222,208 - movups 64(%r11),%xmm0 - jmp L$ocb_dec_loop1 - -.p2align 5 -L$ocb_dec_loop1: -.byte 102,15,56,222,209 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,222,208 - movups -16(%rcx,%rax,1),%xmm0 - jnz L$ocb_dec_loop1 - -.byte 102,15,56,222,209 - movups 16(%r11),%xmm1 - movq %r10,%rax - -.byte 102,15,56,223,215 - .byte 0xf3,0xc3 - -.globl _aesni_cbc_encrypt -.private_extern _aesni_cbc_encrypt - -.p2align 4 -_aesni_cbc_encrypt: testq %rdx,%rdx jz L$cbc_ret @@ -3383,12 +1490,12 @@ L$cbc_enc_loop: xorps %xmm0,%xmm3 leaq 32(%rcx),%rcx xorps %xmm3,%xmm2 -L$oop_enc1_15: +L$oop_enc1_6: .byte 102,15,56,220,209 decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz L$oop_enc1_15 + jnz L$oop_enc1_6 .byte 102,15,56,221,209 movl %r10d,%eax movq %r11,%rcx @@ -3434,12 +1541,12 @@ L$cbc_decrypt: movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 -L$oop_dec1_16: +L$oop_dec1_7: .byte 102,15,56,222,209 decl %r10d movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz L$oop_dec1_16 + jnz L$oop_dec1_7 .byte 102,15,56,223,209 pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 @@ -3452,7 
+1559,9 @@ L$oop_dec1_16: .p2align 4 L$cbc_decrypt_bulk: leaq (%rsp),%r11 + pushq %rbp + subq $16,%rsp andq $-16,%rsp movq %rcx,%rbp @@ -3850,12 +1959,12 @@ L$cbc_dec_one: movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 -L$oop_dec1_17: +L$oop_dec1_8: .byte 102,15,56,222,209 decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz L$oop_dec1_17 + jnz L$oop_dec1_8 .byte 102,15,56,223,209 xorps %xmm10,%xmm2 movaps %xmm11,%xmm10 @@ -3937,16 +2046,21 @@ L$cbc_dec_ret: xorps %xmm0,%xmm0 pxor %xmm1,%xmm1 movq -8(%r11),%rbp + leaq (%r11),%rsp + L$cbc_ret: .byte 0xf3,0xc3 -.globl _aesni_set_decrypt_key -.private_extern _aesni_set_decrypt_key + +.globl _aes_hw_set_decrypt_key +.private_extern _aes_hw_set_decrypt_key .p2align 4 -_aesni_set_decrypt_key: +_aes_hw_set_decrypt_key: + .byte 0x48,0x83,0xEC,0x08 + call __aesni_set_encrypt_key shll $4,%esi testl %eax,%eax @@ -3979,16 +2093,23 @@ L$dec_key_inverse: pxor %xmm0,%xmm0 L$dec_key_ret: addq $8,%rsp + .byte 0xf3,0xc3 + L$SEH_end_set_decrypt_key: -.globl _aesni_set_encrypt_key -.private_extern _aesni_set_encrypt_key +.globl _aes_hw_set_encrypt_key +.private_extern _aes_hw_set_encrypt_key .p2align 4 -_aesni_set_encrypt_key: +_aes_hw_set_encrypt_key: __aesni_set_encrypt_key: + +#ifdef BORINGSSL_DISPATCH_TEST + movb $1,_BORINGSSL_function_hit+3(%rip) +#endif .byte 0x48,0x83,0xEC,0x08 + movq $-1,%rax testq %rdi,%rdi jz L$enc_key_ret @@ -4282,7 +2403,9 @@ L$enc_key_ret: pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 addq $8,%rsp + .byte 0xf3,0xc3 + L$SEH_end_set_encrypt_key: .p2align 4 diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S deleted file mode 100644 index 195abd3b5c..0000000000 --- a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S +++ /dev/null @@ -1,2500 +0,0 @@ -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -.text - - - - - -.p2align 6 -_bsaes_encrypt8: - leaq L$BS0(%rip),%r11 - - movdqa (%rax),%xmm8 - leaq 16(%rax),%rax - movdqa 80(%r11),%xmm7 - pxor %xmm8,%xmm15 - pxor %xmm8,%xmm0 - pxor %xmm8,%xmm1 - pxor %xmm8,%xmm2 -.byte 102,68,15,56,0,255 -.byte 102,15,56,0,199 - pxor %xmm8,%xmm3 - pxor %xmm8,%xmm4 -.byte 102,15,56,0,207 -.byte 102,15,56,0,215 - pxor %xmm8,%xmm5 - pxor %xmm8,%xmm6 -.byte 102,15,56,0,223 -.byte 102,15,56,0,231 -.byte 102,15,56,0,239 -.byte 102,15,56,0,247 -_bsaes_encrypt8_bitslice: - movdqa 0(%r11),%xmm7 - movdqa 16(%r11),%xmm8 - movdqa %xmm5,%xmm9 - psrlq $1,%xmm5 - movdqa %xmm3,%xmm10 - psrlq $1,%xmm3 - pxor %xmm6,%xmm5 - pxor %xmm4,%xmm3 - pand %xmm7,%xmm5 - pand %xmm7,%xmm3 - pxor %xmm5,%xmm6 - psllq $1,%xmm5 - pxor %xmm3,%xmm4 - psllq $1,%xmm3 - pxor %xmm9,%xmm5 - pxor %xmm10,%xmm3 - movdqa %xmm1,%xmm9 - psrlq $1,%xmm1 - movdqa %xmm15,%xmm10 - psrlq $1,%xmm15 - pxor %xmm2,%xmm1 - pxor %xmm0,%xmm15 - pand %xmm7,%xmm1 - pand %xmm7,%xmm15 - pxor %xmm1,%xmm2 - psllq $1,%xmm1 - pxor %xmm15,%xmm0 - psllq $1,%xmm15 - pxor %xmm9,%xmm1 - pxor %xmm10,%xmm15 - movdqa 32(%r11),%xmm7 - movdqa %xmm4,%xmm9 - psrlq $2,%xmm4 - movdqa %xmm3,%xmm10 - psrlq $2,%xmm3 - pxor %xmm6,%xmm4 - pxor %xmm5,%xmm3 - pand %xmm8,%xmm4 - pand %xmm8,%xmm3 - pxor %xmm4,%xmm6 - psllq $2,%xmm4 - pxor %xmm3,%xmm5 - psllq $2,%xmm3 - pxor %xmm9,%xmm4 - pxor %xmm10,%xmm3 - movdqa %xmm0,%xmm9 - psrlq $2,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $2,%xmm15 - pxor %xmm2,%xmm0 - pxor %xmm1,%xmm15 - pand %xmm8,%xmm0 - pand %xmm8,%xmm15 - pxor %xmm0,%xmm2 - psllq $2,%xmm0 - pxor %xmm15,%xmm1 - psllq $2,%xmm15 - pxor %xmm9,%xmm0 
- pxor %xmm10,%xmm15 - movdqa %xmm2,%xmm9 - psrlq $4,%xmm2 - movdqa %xmm1,%xmm10 - psrlq $4,%xmm1 - pxor %xmm6,%xmm2 - pxor %xmm5,%xmm1 - pand %xmm7,%xmm2 - pand %xmm7,%xmm1 - pxor %xmm2,%xmm6 - psllq $4,%xmm2 - pxor %xmm1,%xmm5 - psllq $4,%xmm1 - pxor %xmm9,%xmm2 - pxor %xmm10,%xmm1 - movdqa %xmm0,%xmm9 - psrlq $4,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $4,%xmm15 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pand %xmm7,%xmm0 - pand %xmm7,%xmm15 - pxor %xmm0,%xmm4 - psllq $4,%xmm0 - pxor %xmm15,%xmm3 - psllq $4,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - decl %r10d - jmp L$enc_sbox -.p2align 4 -L$enc_loop: - pxor 0(%rax),%xmm15 - pxor 16(%rax),%xmm0 - pxor 32(%rax),%xmm1 - pxor 48(%rax),%xmm2 -.byte 102,68,15,56,0,255 -.byte 102,15,56,0,199 - pxor 64(%rax),%xmm3 - pxor 80(%rax),%xmm4 -.byte 102,15,56,0,207 -.byte 102,15,56,0,215 - pxor 96(%rax),%xmm5 - pxor 112(%rax),%xmm6 -.byte 102,15,56,0,223 -.byte 102,15,56,0,231 -.byte 102,15,56,0,239 -.byte 102,15,56,0,247 - leaq 128(%rax),%rax -L$enc_sbox: - pxor %xmm5,%xmm4 - pxor %xmm0,%xmm1 - pxor %xmm15,%xmm2 - pxor %xmm1,%xmm5 - pxor %xmm15,%xmm4 - - pxor %xmm2,%xmm5 - pxor %xmm6,%xmm2 - pxor %xmm4,%xmm6 - pxor %xmm3,%xmm2 - pxor %xmm4,%xmm3 - pxor %xmm0,%xmm2 - - pxor %xmm6,%xmm1 - pxor %xmm4,%xmm0 - movdqa %xmm6,%xmm10 - movdqa %xmm0,%xmm9 - movdqa %xmm4,%xmm8 - movdqa %xmm1,%xmm12 - movdqa %xmm5,%xmm11 - - pxor %xmm3,%xmm10 - pxor %xmm1,%xmm9 - pxor %xmm2,%xmm8 - movdqa %xmm10,%xmm13 - pxor %xmm3,%xmm12 - movdqa %xmm9,%xmm7 - pxor %xmm15,%xmm11 - movdqa %xmm10,%xmm14 - - por %xmm8,%xmm9 - por %xmm11,%xmm10 - pxor %xmm7,%xmm14 - pand %xmm11,%xmm13 - pxor %xmm8,%xmm11 - pand %xmm8,%xmm7 - pand %xmm11,%xmm14 - movdqa %xmm2,%xmm11 - pxor %xmm15,%xmm11 - pand %xmm11,%xmm12 - pxor %xmm12,%xmm10 - pxor %xmm12,%xmm9 - movdqa %xmm6,%xmm12 - movdqa %xmm4,%xmm11 - pxor %xmm0,%xmm12 - pxor %xmm5,%xmm11 - movdqa %xmm12,%xmm8 - pand %xmm11,%xmm12 - por %xmm11,%xmm8 - pxor %xmm12,%xmm7 - pxor %xmm14,%xmm10 - pxor %xmm13,%xmm9 - pxor %xmm14,%xmm8 - movdqa %xmm1,%xmm11 - pxor %xmm13,%xmm7 - movdqa %xmm3,%xmm12 - pxor %xmm13,%xmm8 - movdqa %xmm0,%xmm13 - pand %xmm2,%xmm11 - movdqa %xmm6,%xmm14 - pand %xmm15,%xmm12 - pand %xmm4,%xmm13 - por %xmm5,%xmm14 - pxor %xmm11,%xmm10 - pxor %xmm12,%xmm9 - pxor %xmm13,%xmm8 - pxor %xmm14,%xmm7 - - - - - - movdqa %xmm10,%xmm11 - pand %xmm8,%xmm10 - pxor %xmm9,%xmm11 - - movdqa %xmm7,%xmm13 - movdqa %xmm11,%xmm14 - pxor %xmm10,%xmm13 - pand %xmm13,%xmm14 - - movdqa %xmm8,%xmm12 - pxor %xmm9,%xmm14 - pxor %xmm7,%xmm12 - - pxor %xmm9,%xmm10 - - pand %xmm10,%xmm12 - - movdqa %xmm13,%xmm9 - pxor %xmm7,%xmm12 - - pxor %xmm12,%xmm9 - pxor %xmm12,%xmm8 - - pand %xmm7,%xmm9 - - pxor %xmm9,%xmm13 - pxor %xmm9,%xmm8 - - pand %xmm14,%xmm13 - - pxor %xmm11,%xmm13 - movdqa %xmm5,%xmm11 - movdqa %xmm4,%xmm7 - movdqa %xmm14,%xmm9 - pxor %xmm13,%xmm9 - pand %xmm5,%xmm9 - pxor %xmm4,%xmm5 - pand %xmm14,%xmm4 - pand %xmm13,%xmm5 - pxor %xmm4,%xmm5 - pxor %xmm9,%xmm4 - pxor %xmm15,%xmm11 - pxor %xmm2,%xmm7 - pxor %xmm12,%xmm14 - pxor %xmm8,%xmm13 - movdqa %xmm14,%xmm10 - movdqa %xmm12,%xmm9 - pxor %xmm13,%xmm10 - pxor %xmm8,%xmm9 - pand %xmm11,%xmm10 - pand %xmm15,%xmm9 - pxor %xmm7,%xmm11 - pxor %xmm2,%xmm15 - pand %xmm14,%xmm7 - pand %xmm12,%xmm2 - pand %xmm13,%xmm11 - pand %xmm8,%xmm15 - pxor %xmm11,%xmm7 - pxor %xmm2,%xmm15 - pxor %xmm10,%xmm11 - pxor %xmm9,%xmm2 - pxor %xmm11,%xmm5 - pxor %xmm11,%xmm15 - pxor %xmm7,%xmm4 - pxor %xmm7,%xmm2 - - movdqa %xmm6,%xmm11 - movdqa %xmm0,%xmm7 - pxor %xmm3,%xmm11 - pxor %xmm1,%xmm7 - movdqa 
%xmm14,%xmm10 - movdqa %xmm12,%xmm9 - pxor %xmm13,%xmm10 - pxor %xmm8,%xmm9 - pand %xmm11,%xmm10 - pand %xmm3,%xmm9 - pxor %xmm7,%xmm11 - pxor %xmm1,%xmm3 - pand %xmm14,%xmm7 - pand %xmm12,%xmm1 - pand %xmm13,%xmm11 - pand %xmm8,%xmm3 - pxor %xmm11,%xmm7 - pxor %xmm1,%xmm3 - pxor %xmm10,%xmm11 - pxor %xmm9,%xmm1 - pxor %xmm12,%xmm14 - pxor %xmm8,%xmm13 - movdqa %xmm14,%xmm10 - pxor %xmm13,%xmm10 - pand %xmm6,%xmm10 - pxor %xmm0,%xmm6 - pand %xmm14,%xmm0 - pand %xmm13,%xmm6 - pxor %xmm0,%xmm6 - pxor %xmm10,%xmm0 - pxor %xmm11,%xmm6 - pxor %xmm11,%xmm3 - pxor %xmm7,%xmm0 - pxor %xmm7,%xmm1 - pxor %xmm15,%xmm6 - pxor %xmm5,%xmm0 - pxor %xmm6,%xmm3 - pxor %xmm15,%xmm5 - pxor %xmm0,%xmm15 - - pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 - pxor %xmm2,%xmm1 - pxor %xmm4,%xmm2 - pxor %xmm4,%xmm3 - - pxor %xmm2,%xmm5 - decl %r10d - jl L$enc_done - pshufd $0x93,%xmm15,%xmm7 - pshufd $0x93,%xmm0,%xmm8 - pxor %xmm7,%xmm15 - pshufd $0x93,%xmm3,%xmm9 - pxor %xmm8,%xmm0 - pshufd $0x93,%xmm5,%xmm10 - pxor %xmm9,%xmm3 - pshufd $0x93,%xmm2,%xmm11 - pxor %xmm10,%xmm5 - pshufd $0x93,%xmm6,%xmm12 - pxor %xmm11,%xmm2 - pshufd $0x93,%xmm1,%xmm13 - pxor %xmm12,%xmm6 - pshufd $0x93,%xmm4,%xmm14 - pxor %xmm13,%xmm1 - pxor %xmm14,%xmm4 - - pxor %xmm15,%xmm8 - pxor %xmm4,%xmm7 - pxor %xmm4,%xmm8 - pshufd $0x4E,%xmm15,%xmm15 - pxor %xmm0,%xmm9 - pshufd $0x4E,%xmm0,%xmm0 - pxor %xmm2,%xmm12 - pxor %xmm7,%xmm15 - pxor %xmm6,%xmm13 - pxor %xmm8,%xmm0 - pxor %xmm5,%xmm11 - pshufd $0x4E,%xmm2,%xmm7 - pxor %xmm1,%xmm14 - pshufd $0x4E,%xmm6,%xmm8 - pxor %xmm3,%xmm10 - pshufd $0x4E,%xmm5,%xmm2 - pxor %xmm4,%xmm10 - pshufd $0x4E,%xmm4,%xmm6 - pxor %xmm4,%xmm11 - pshufd $0x4E,%xmm1,%xmm5 - pxor %xmm11,%xmm7 - pshufd $0x4E,%xmm3,%xmm1 - pxor %xmm12,%xmm8 - pxor %xmm10,%xmm2 - pxor %xmm14,%xmm6 - pxor %xmm13,%xmm5 - movdqa %xmm7,%xmm3 - pxor %xmm9,%xmm1 - movdqa %xmm8,%xmm4 - movdqa 48(%r11),%xmm7 - jnz L$enc_loop - movdqa 64(%r11),%xmm7 - jmp L$enc_loop -.p2align 4 -L$enc_done: - movdqa 0(%r11),%xmm7 - movdqa 16(%r11),%xmm8 - movdqa %xmm1,%xmm9 - psrlq $1,%xmm1 - movdqa %xmm2,%xmm10 - psrlq $1,%xmm2 - pxor %xmm4,%xmm1 - pxor %xmm6,%xmm2 - pand %xmm7,%xmm1 - pand %xmm7,%xmm2 - pxor %xmm1,%xmm4 - psllq $1,%xmm1 - pxor %xmm2,%xmm6 - psllq $1,%xmm2 - pxor %xmm9,%xmm1 - pxor %xmm10,%xmm2 - movdqa %xmm3,%xmm9 - psrlq $1,%xmm3 - movdqa %xmm15,%xmm10 - psrlq $1,%xmm15 - pxor %xmm5,%xmm3 - pxor %xmm0,%xmm15 - pand %xmm7,%xmm3 - pand %xmm7,%xmm15 - pxor %xmm3,%xmm5 - psllq $1,%xmm3 - pxor %xmm15,%xmm0 - psllq $1,%xmm15 - pxor %xmm9,%xmm3 - pxor %xmm10,%xmm15 - movdqa 32(%r11),%xmm7 - movdqa %xmm6,%xmm9 - psrlq $2,%xmm6 - movdqa %xmm2,%xmm10 - psrlq $2,%xmm2 - pxor %xmm4,%xmm6 - pxor %xmm1,%xmm2 - pand %xmm8,%xmm6 - pand %xmm8,%xmm2 - pxor %xmm6,%xmm4 - psllq $2,%xmm6 - pxor %xmm2,%xmm1 - psllq $2,%xmm2 - pxor %xmm9,%xmm6 - pxor %xmm10,%xmm2 - movdqa %xmm0,%xmm9 - psrlq $2,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $2,%xmm15 - pxor %xmm5,%xmm0 - pxor %xmm3,%xmm15 - pand %xmm8,%xmm0 - pand %xmm8,%xmm15 - pxor %xmm0,%xmm5 - psllq $2,%xmm0 - pxor %xmm15,%xmm3 - psllq $2,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa %xmm5,%xmm9 - psrlq $4,%xmm5 - movdqa %xmm3,%xmm10 - psrlq $4,%xmm3 - pxor %xmm4,%xmm5 - pxor %xmm1,%xmm3 - pand %xmm7,%xmm5 - pand %xmm7,%xmm3 - pxor %xmm5,%xmm4 - psllq $4,%xmm5 - pxor %xmm3,%xmm1 - psllq $4,%xmm3 - pxor %xmm9,%xmm5 - pxor %xmm10,%xmm3 - movdqa %xmm0,%xmm9 - psrlq $4,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $4,%xmm15 - pxor %xmm6,%xmm0 - pxor %xmm2,%xmm15 - pand %xmm7,%xmm0 - pand %xmm7,%xmm15 - pxor %xmm0,%xmm6 - psllq 
$4,%xmm0 - pxor %xmm15,%xmm2 - psllq $4,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa (%rax),%xmm7 - pxor %xmm7,%xmm3 - pxor %xmm7,%xmm5 - pxor %xmm7,%xmm2 - pxor %xmm7,%xmm6 - pxor %xmm7,%xmm1 - pxor %xmm7,%xmm4 - pxor %xmm7,%xmm15 - pxor %xmm7,%xmm0 - .byte 0xf3,0xc3 - - - -.p2align 6 -_bsaes_decrypt8: - leaq L$BS0(%rip),%r11 - - movdqa (%rax),%xmm8 - leaq 16(%rax),%rax - movdqa -48(%r11),%xmm7 - pxor %xmm8,%xmm15 - pxor %xmm8,%xmm0 - pxor %xmm8,%xmm1 - pxor %xmm8,%xmm2 -.byte 102,68,15,56,0,255 -.byte 102,15,56,0,199 - pxor %xmm8,%xmm3 - pxor %xmm8,%xmm4 -.byte 102,15,56,0,207 -.byte 102,15,56,0,215 - pxor %xmm8,%xmm5 - pxor %xmm8,%xmm6 -.byte 102,15,56,0,223 -.byte 102,15,56,0,231 -.byte 102,15,56,0,239 -.byte 102,15,56,0,247 - movdqa 0(%r11),%xmm7 - movdqa 16(%r11),%xmm8 - movdqa %xmm5,%xmm9 - psrlq $1,%xmm5 - movdqa %xmm3,%xmm10 - psrlq $1,%xmm3 - pxor %xmm6,%xmm5 - pxor %xmm4,%xmm3 - pand %xmm7,%xmm5 - pand %xmm7,%xmm3 - pxor %xmm5,%xmm6 - psllq $1,%xmm5 - pxor %xmm3,%xmm4 - psllq $1,%xmm3 - pxor %xmm9,%xmm5 - pxor %xmm10,%xmm3 - movdqa %xmm1,%xmm9 - psrlq $1,%xmm1 - movdqa %xmm15,%xmm10 - psrlq $1,%xmm15 - pxor %xmm2,%xmm1 - pxor %xmm0,%xmm15 - pand %xmm7,%xmm1 - pand %xmm7,%xmm15 - pxor %xmm1,%xmm2 - psllq $1,%xmm1 - pxor %xmm15,%xmm0 - psllq $1,%xmm15 - pxor %xmm9,%xmm1 - pxor %xmm10,%xmm15 - movdqa 32(%r11),%xmm7 - movdqa %xmm4,%xmm9 - psrlq $2,%xmm4 - movdqa %xmm3,%xmm10 - psrlq $2,%xmm3 - pxor %xmm6,%xmm4 - pxor %xmm5,%xmm3 - pand %xmm8,%xmm4 - pand %xmm8,%xmm3 - pxor %xmm4,%xmm6 - psllq $2,%xmm4 - pxor %xmm3,%xmm5 - psllq $2,%xmm3 - pxor %xmm9,%xmm4 - pxor %xmm10,%xmm3 - movdqa %xmm0,%xmm9 - psrlq $2,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $2,%xmm15 - pxor %xmm2,%xmm0 - pxor %xmm1,%xmm15 - pand %xmm8,%xmm0 - pand %xmm8,%xmm15 - pxor %xmm0,%xmm2 - psllq $2,%xmm0 - pxor %xmm15,%xmm1 - psllq $2,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa %xmm2,%xmm9 - psrlq $4,%xmm2 - movdqa %xmm1,%xmm10 - psrlq $4,%xmm1 - pxor %xmm6,%xmm2 - pxor %xmm5,%xmm1 - pand %xmm7,%xmm2 - pand %xmm7,%xmm1 - pxor %xmm2,%xmm6 - psllq $4,%xmm2 - pxor %xmm1,%xmm5 - psllq $4,%xmm1 - pxor %xmm9,%xmm2 - pxor %xmm10,%xmm1 - movdqa %xmm0,%xmm9 - psrlq $4,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $4,%xmm15 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pand %xmm7,%xmm0 - pand %xmm7,%xmm15 - pxor %xmm0,%xmm4 - psllq $4,%xmm0 - pxor %xmm15,%xmm3 - psllq $4,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - decl %r10d - jmp L$dec_sbox -.p2align 4 -L$dec_loop: - pxor 0(%rax),%xmm15 - pxor 16(%rax),%xmm0 - pxor 32(%rax),%xmm1 - pxor 48(%rax),%xmm2 -.byte 102,68,15,56,0,255 -.byte 102,15,56,0,199 - pxor 64(%rax),%xmm3 - pxor 80(%rax),%xmm4 -.byte 102,15,56,0,207 -.byte 102,15,56,0,215 - pxor 96(%rax),%xmm5 - pxor 112(%rax),%xmm6 -.byte 102,15,56,0,223 -.byte 102,15,56,0,231 -.byte 102,15,56,0,239 -.byte 102,15,56,0,247 - leaq 128(%rax),%rax -L$dec_sbox: - pxor %xmm3,%xmm2 - - pxor %xmm6,%xmm3 - pxor %xmm6,%xmm1 - pxor %xmm3,%xmm5 - pxor %xmm5,%xmm6 - pxor %xmm6,%xmm0 - - pxor %xmm0,%xmm15 - pxor %xmm4,%xmm1 - pxor %xmm15,%xmm2 - pxor %xmm15,%xmm4 - pxor %xmm2,%xmm0 - movdqa %xmm2,%xmm10 - movdqa %xmm6,%xmm9 - movdqa %xmm0,%xmm8 - movdqa %xmm3,%xmm12 - movdqa %xmm4,%xmm11 - - pxor %xmm15,%xmm10 - pxor %xmm3,%xmm9 - pxor %xmm5,%xmm8 - movdqa %xmm10,%xmm13 - pxor %xmm15,%xmm12 - movdqa %xmm9,%xmm7 - pxor %xmm1,%xmm11 - movdqa %xmm10,%xmm14 - - por %xmm8,%xmm9 - por %xmm11,%xmm10 - pxor %xmm7,%xmm14 - pand %xmm11,%xmm13 - pxor %xmm8,%xmm11 - pand %xmm8,%xmm7 - pand %xmm11,%xmm14 - movdqa %xmm5,%xmm11 - pxor 
%xmm1,%xmm11 - pand %xmm11,%xmm12 - pxor %xmm12,%xmm10 - pxor %xmm12,%xmm9 - movdqa %xmm2,%xmm12 - movdqa %xmm0,%xmm11 - pxor %xmm6,%xmm12 - pxor %xmm4,%xmm11 - movdqa %xmm12,%xmm8 - pand %xmm11,%xmm12 - por %xmm11,%xmm8 - pxor %xmm12,%xmm7 - pxor %xmm14,%xmm10 - pxor %xmm13,%xmm9 - pxor %xmm14,%xmm8 - movdqa %xmm3,%xmm11 - pxor %xmm13,%xmm7 - movdqa %xmm15,%xmm12 - pxor %xmm13,%xmm8 - movdqa %xmm6,%xmm13 - pand %xmm5,%xmm11 - movdqa %xmm2,%xmm14 - pand %xmm1,%xmm12 - pand %xmm0,%xmm13 - por %xmm4,%xmm14 - pxor %xmm11,%xmm10 - pxor %xmm12,%xmm9 - pxor %xmm13,%xmm8 - pxor %xmm14,%xmm7 - - - - - - movdqa %xmm10,%xmm11 - pand %xmm8,%xmm10 - pxor %xmm9,%xmm11 - - movdqa %xmm7,%xmm13 - movdqa %xmm11,%xmm14 - pxor %xmm10,%xmm13 - pand %xmm13,%xmm14 - - movdqa %xmm8,%xmm12 - pxor %xmm9,%xmm14 - pxor %xmm7,%xmm12 - - pxor %xmm9,%xmm10 - - pand %xmm10,%xmm12 - - movdqa %xmm13,%xmm9 - pxor %xmm7,%xmm12 - - pxor %xmm12,%xmm9 - pxor %xmm12,%xmm8 - - pand %xmm7,%xmm9 - - pxor %xmm9,%xmm13 - pxor %xmm9,%xmm8 - - pand %xmm14,%xmm13 - - pxor %xmm11,%xmm13 - movdqa %xmm4,%xmm11 - movdqa %xmm0,%xmm7 - movdqa %xmm14,%xmm9 - pxor %xmm13,%xmm9 - pand %xmm4,%xmm9 - pxor %xmm0,%xmm4 - pand %xmm14,%xmm0 - pand %xmm13,%xmm4 - pxor %xmm0,%xmm4 - pxor %xmm9,%xmm0 - pxor %xmm1,%xmm11 - pxor %xmm5,%xmm7 - pxor %xmm12,%xmm14 - pxor %xmm8,%xmm13 - movdqa %xmm14,%xmm10 - movdqa %xmm12,%xmm9 - pxor %xmm13,%xmm10 - pxor %xmm8,%xmm9 - pand %xmm11,%xmm10 - pand %xmm1,%xmm9 - pxor %xmm7,%xmm11 - pxor %xmm5,%xmm1 - pand %xmm14,%xmm7 - pand %xmm12,%xmm5 - pand %xmm13,%xmm11 - pand %xmm8,%xmm1 - pxor %xmm11,%xmm7 - pxor %xmm5,%xmm1 - pxor %xmm10,%xmm11 - pxor %xmm9,%xmm5 - pxor %xmm11,%xmm4 - pxor %xmm11,%xmm1 - pxor %xmm7,%xmm0 - pxor %xmm7,%xmm5 - - movdqa %xmm2,%xmm11 - movdqa %xmm6,%xmm7 - pxor %xmm15,%xmm11 - pxor %xmm3,%xmm7 - movdqa %xmm14,%xmm10 - movdqa %xmm12,%xmm9 - pxor %xmm13,%xmm10 - pxor %xmm8,%xmm9 - pand %xmm11,%xmm10 - pand %xmm15,%xmm9 - pxor %xmm7,%xmm11 - pxor %xmm3,%xmm15 - pand %xmm14,%xmm7 - pand %xmm12,%xmm3 - pand %xmm13,%xmm11 - pand %xmm8,%xmm15 - pxor %xmm11,%xmm7 - pxor %xmm3,%xmm15 - pxor %xmm10,%xmm11 - pxor %xmm9,%xmm3 - pxor %xmm12,%xmm14 - pxor %xmm8,%xmm13 - movdqa %xmm14,%xmm10 - pxor %xmm13,%xmm10 - pand %xmm2,%xmm10 - pxor %xmm6,%xmm2 - pand %xmm14,%xmm6 - pand %xmm13,%xmm2 - pxor %xmm6,%xmm2 - pxor %xmm10,%xmm6 - pxor %xmm11,%xmm2 - pxor %xmm11,%xmm15 - pxor %xmm7,%xmm6 - pxor %xmm7,%xmm3 - pxor %xmm6,%xmm0 - pxor %xmm4,%xmm5 - - pxor %xmm0,%xmm3 - pxor %xmm6,%xmm1 - pxor %xmm6,%xmm4 - pxor %xmm1,%xmm3 - pxor %xmm15,%xmm6 - pxor %xmm4,%xmm3 - pxor %xmm5,%xmm2 - pxor %xmm0,%xmm5 - pxor %xmm3,%xmm2 - - pxor %xmm15,%xmm3 - pxor %xmm2,%xmm6 - decl %r10d - jl L$dec_done - - pshufd $0x4E,%xmm15,%xmm7 - pshufd $0x4E,%xmm2,%xmm13 - pxor %xmm15,%xmm7 - pshufd $0x4E,%xmm4,%xmm14 - pxor %xmm2,%xmm13 - pshufd $0x4E,%xmm0,%xmm8 - pxor %xmm4,%xmm14 - pshufd $0x4E,%xmm5,%xmm9 - pxor %xmm0,%xmm8 - pshufd $0x4E,%xmm3,%xmm10 - pxor %xmm5,%xmm9 - pxor %xmm13,%xmm15 - pxor %xmm13,%xmm0 - pshufd $0x4E,%xmm1,%xmm11 - pxor %xmm3,%xmm10 - pxor %xmm7,%xmm5 - pxor %xmm8,%xmm3 - pshufd $0x4E,%xmm6,%xmm12 - pxor %xmm1,%xmm11 - pxor %xmm14,%xmm0 - pxor %xmm9,%xmm1 - pxor %xmm6,%xmm12 - - pxor %xmm14,%xmm5 - pxor %xmm13,%xmm3 - pxor %xmm13,%xmm1 - pxor %xmm10,%xmm6 - pxor %xmm11,%xmm2 - pxor %xmm14,%xmm1 - pxor %xmm14,%xmm6 - pxor %xmm12,%xmm4 - pshufd $0x93,%xmm15,%xmm7 - pshufd $0x93,%xmm0,%xmm8 - pxor %xmm7,%xmm15 - pshufd $0x93,%xmm5,%xmm9 - pxor %xmm8,%xmm0 - pshufd $0x93,%xmm3,%xmm10 - pxor %xmm9,%xmm5 - pshufd 
$0x93,%xmm1,%xmm11 - pxor %xmm10,%xmm3 - pshufd $0x93,%xmm6,%xmm12 - pxor %xmm11,%xmm1 - pshufd $0x93,%xmm2,%xmm13 - pxor %xmm12,%xmm6 - pshufd $0x93,%xmm4,%xmm14 - pxor %xmm13,%xmm2 - pxor %xmm14,%xmm4 - - pxor %xmm15,%xmm8 - pxor %xmm4,%xmm7 - pxor %xmm4,%xmm8 - pshufd $0x4E,%xmm15,%xmm15 - pxor %xmm0,%xmm9 - pshufd $0x4E,%xmm0,%xmm0 - pxor %xmm1,%xmm12 - pxor %xmm7,%xmm15 - pxor %xmm6,%xmm13 - pxor %xmm8,%xmm0 - pxor %xmm3,%xmm11 - pshufd $0x4E,%xmm1,%xmm7 - pxor %xmm2,%xmm14 - pshufd $0x4E,%xmm6,%xmm8 - pxor %xmm5,%xmm10 - pshufd $0x4E,%xmm3,%xmm1 - pxor %xmm4,%xmm10 - pshufd $0x4E,%xmm4,%xmm6 - pxor %xmm4,%xmm11 - pshufd $0x4E,%xmm2,%xmm3 - pxor %xmm11,%xmm7 - pshufd $0x4E,%xmm5,%xmm2 - pxor %xmm12,%xmm8 - pxor %xmm1,%xmm10 - pxor %xmm14,%xmm6 - pxor %xmm3,%xmm13 - movdqa %xmm7,%xmm3 - pxor %xmm9,%xmm2 - movdqa %xmm13,%xmm5 - movdqa %xmm8,%xmm4 - movdqa %xmm2,%xmm1 - movdqa %xmm10,%xmm2 - movdqa -16(%r11),%xmm7 - jnz L$dec_loop - movdqa -32(%r11),%xmm7 - jmp L$dec_loop -.p2align 4 -L$dec_done: - movdqa 0(%r11),%xmm7 - movdqa 16(%r11),%xmm8 - movdqa %xmm2,%xmm9 - psrlq $1,%xmm2 - movdqa %xmm1,%xmm10 - psrlq $1,%xmm1 - pxor %xmm4,%xmm2 - pxor %xmm6,%xmm1 - pand %xmm7,%xmm2 - pand %xmm7,%xmm1 - pxor %xmm2,%xmm4 - psllq $1,%xmm2 - pxor %xmm1,%xmm6 - psllq $1,%xmm1 - pxor %xmm9,%xmm2 - pxor %xmm10,%xmm1 - movdqa %xmm5,%xmm9 - psrlq $1,%xmm5 - movdqa %xmm15,%xmm10 - psrlq $1,%xmm15 - pxor %xmm3,%xmm5 - pxor %xmm0,%xmm15 - pand %xmm7,%xmm5 - pand %xmm7,%xmm15 - pxor %xmm5,%xmm3 - psllq $1,%xmm5 - pxor %xmm15,%xmm0 - psllq $1,%xmm15 - pxor %xmm9,%xmm5 - pxor %xmm10,%xmm15 - movdqa 32(%r11),%xmm7 - movdqa %xmm6,%xmm9 - psrlq $2,%xmm6 - movdqa %xmm1,%xmm10 - psrlq $2,%xmm1 - pxor %xmm4,%xmm6 - pxor %xmm2,%xmm1 - pand %xmm8,%xmm6 - pand %xmm8,%xmm1 - pxor %xmm6,%xmm4 - psllq $2,%xmm6 - pxor %xmm1,%xmm2 - psllq $2,%xmm1 - pxor %xmm9,%xmm6 - pxor %xmm10,%xmm1 - movdqa %xmm0,%xmm9 - psrlq $2,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $2,%xmm15 - pxor %xmm3,%xmm0 - pxor %xmm5,%xmm15 - pand %xmm8,%xmm0 - pand %xmm8,%xmm15 - pxor %xmm0,%xmm3 - psllq $2,%xmm0 - pxor %xmm15,%xmm5 - psllq $2,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa %xmm3,%xmm9 - psrlq $4,%xmm3 - movdqa %xmm5,%xmm10 - psrlq $4,%xmm5 - pxor %xmm4,%xmm3 - pxor %xmm2,%xmm5 - pand %xmm7,%xmm3 - pand %xmm7,%xmm5 - pxor %xmm3,%xmm4 - psllq $4,%xmm3 - pxor %xmm5,%xmm2 - psllq $4,%xmm5 - pxor %xmm9,%xmm3 - pxor %xmm10,%xmm5 - movdqa %xmm0,%xmm9 - psrlq $4,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $4,%xmm15 - pxor %xmm6,%xmm0 - pxor %xmm1,%xmm15 - pand %xmm7,%xmm0 - pand %xmm7,%xmm15 - pxor %xmm0,%xmm6 - psllq $4,%xmm0 - pxor %xmm15,%xmm1 - psllq $4,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa (%rax),%xmm7 - pxor %xmm7,%xmm5 - pxor %xmm7,%xmm3 - pxor %xmm7,%xmm1 - pxor %xmm7,%xmm6 - pxor %xmm7,%xmm2 - pxor %xmm7,%xmm4 - pxor %xmm7,%xmm15 - pxor %xmm7,%xmm0 - .byte 0xf3,0xc3 - - -.p2align 4 -_bsaes_key_convert: - leaq L$masks(%rip),%r11 - movdqu (%rcx),%xmm7 - leaq 16(%rcx),%rcx - movdqa 0(%r11),%xmm0 - movdqa 16(%r11),%xmm1 - movdqa 32(%r11),%xmm2 - movdqa 48(%r11),%xmm3 - movdqa 64(%r11),%xmm4 - pcmpeqd %xmm5,%xmm5 - - movdqu (%rcx),%xmm6 - movdqa %xmm7,(%rax) - leaq 16(%rax),%rax - decl %r10d - jmp L$key_loop -.p2align 4 -L$key_loop: -.byte 102,15,56,0,244 - - movdqa %xmm0,%xmm8 - movdqa %xmm1,%xmm9 - - pand %xmm6,%xmm8 - pand %xmm6,%xmm9 - movdqa %xmm2,%xmm10 - pcmpeqb %xmm0,%xmm8 - psllq $4,%xmm0 - movdqa %xmm3,%xmm11 - pcmpeqb %xmm1,%xmm9 - psllq $4,%xmm1 - - pand %xmm6,%xmm10 - pand %xmm6,%xmm11 - movdqa %xmm0,%xmm12 - pcmpeqb 
%xmm2,%xmm10 - psllq $4,%xmm2 - movdqa %xmm1,%xmm13 - pcmpeqb %xmm3,%xmm11 - psllq $4,%xmm3 - - movdqa %xmm2,%xmm14 - movdqa %xmm3,%xmm15 - pxor %xmm5,%xmm8 - pxor %xmm5,%xmm9 - - pand %xmm6,%xmm12 - pand %xmm6,%xmm13 - movdqa %xmm8,0(%rax) - pcmpeqb %xmm0,%xmm12 - psrlq $4,%xmm0 - movdqa %xmm9,16(%rax) - pcmpeqb %xmm1,%xmm13 - psrlq $4,%xmm1 - leaq 16(%rcx),%rcx - - pand %xmm6,%xmm14 - pand %xmm6,%xmm15 - movdqa %xmm10,32(%rax) - pcmpeqb %xmm2,%xmm14 - psrlq $4,%xmm2 - movdqa %xmm11,48(%rax) - pcmpeqb %xmm3,%xmm15 - psrlq $4,%xmm3 - movdqu (%rcx),%xmm6 - - pxor %xmm5,%xmm13 - pxor %xmm5,%xmm14 - movdqa %xmm12,64(%rax) - movdqa %xmm13,80(%rax) - movdqa %xmm14,96(%rax) - movdqa %xmm15,112(%rax) - leaq 128(%rax),%rax - decl %r10d - jnz L$key_loop - - movdqa 80(%r11),%xmm7 - - .byte 0xf3,0xc3 - - -.globl _bsaes_cbc_encrypt -.private_extern _bsaes_cbc_encrypt - -.p2align 4 -_bsaes_cbc_encrypt: - cmpl $0,%r9d - jne _asm_AES_cbc_encrypt - cmpq $128,%rdx - jb _asm_AES_cbc_encrypt - - movq %rsp,%rax -L$cbc_dec_prologue: - pushq %rbp - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - leaq -72(%rsp),%rsp - movq %rsp,%rbp - movl 240(%rcx),%eax - movq %rdi,%r12 - movq %rsi,%r13 - movq %rdx,%r14 - movq %rcx,%r15 - movq %r8,%rbx - shrq $4,%r14 - - movl %eax,%edx - shlq $7,%rax - subq $96,%rax - subq %rax,%rsp - - movq %rsp,%rax - movq %r15,%rcx - movl %edx,%r10d - call _bsaes_key_convert - pxor (%rsp),%xmm7 - movdqa %xmm6,(%rax) - movdqa %xmm7,(%rsp) - - movdqu (%rbx),%xmm14 - subq $8,%r14 -L$cbc_dec_loop: - movdqu 0(%r12),%xmm15 - movdqu 16(%r12),%xmm0 - movdqu 32(%r12),%xmm1 - movdqu 48(%r12),%xmm2 - movdqu 64(%r12),%xmm3 - movdqu 80(%r12),%xmm4 - movq %rsp,%rax - movdqu 96(%r12),%xmm5 - movl %edx,%r10d - movdqu 112(%r12),%xmm6 - movdqa %xmm14,32(%rbp) - - call _bsaes_decrypt8 - - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm5 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm3 - movdqu 64(%r12),%xmm11 - pxor %xmm10,%xmm1 - movdqu 80(%r12),%xmm12 - pxor %xmm11,%xmm6 - movdqu 96(%r12),%xmm13 - pxor %xmm12,%xmm2 - movdqu 112(%r12),%xmm14 - pxor %xmm13,%xmm4 - movdqu %xmm15,0(%r13) - leaq 128(%r12),%r12 - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - movdqu %xmm1,64(%r13) - movdqu %xmm6,80(%r13) - movdqu %xmm2,96(%r13) - movdqu %xmm4,112(%r13) - leaq 128(%r13),%r13 - subq $8,%r14 - jnc L$cbc_dec_loop - - addq $8,%r14 - jz L$cbc_dec_done - - movdqu 0(%r12),%xmm15 - movq %rsp,%rax - movl %edx,%r10d - cmpq $2,%r14 - jb L$cbc_dec_one - movdqu 16(%r12),%xmm0 - je L$cbc_dec_two - movdqu 32(%r12),%xmm1 - cmpq $4,%r14 - jb L$cbc_dec_three - movdqu 48(%r12),%xmm2 - je L$cbc_dec_four - movdqu 64(%r12),%xmm3 - cmpq $6,%r14 - jb L$cbc_dec_five - movdqu 80(%r12),%xmm4 - je L$cbc_dec_six - movdqu 96(%r12),%xmm5 - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm5 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm3 - movdqu 64(%r12),%xmm11 - pxor %xmm10,%xmm1 - movdqu 80(%r12),%xmm12 - pxor %xmm11,%xmm6 - movdqu 96(%r12),%xmm14 - pxor %xmm12,%xmm2 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - movdqu %xmm1,64(%r13) - movdqu %xmm6,80(%r13) - movdqu %xmm2,96(%r13) - jmp L$cbc_dec_done -.p2align 4 -L$cbc_dec_six: - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor 
%xmm7,%xmm0 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm5 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm3 - movdqu 64(%r12),%xmm11 - pxor %xmm10,%xmm1 - movdqu 80(%r12),%xmm14 - pxor %xmm11,%xmm6 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - movdqu %xmm1,64(%r13) - movdqu %xmm6,80(%r13) - jmp L$cbc_dec_done -.p2align 4 -L$cbc_dec_five: - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm5 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm3 - movdqu 64(%r12),%xmm14 - pxor %xmm10,%xmm1 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - movdqu %xmm1,64(%r13) - jmp L$cbc_dec_done -.p2align 4 -L$cbc_dec_four: - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm5 - movdqu 48(%r12),%xmm14 - pxor %xmm9,%xmm3 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - jmp L$cbc_dec_done -.p2align 4 -L$cbc_dec_three: - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm14 - pxor %xmm8,%xmm5 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - jmp L$cbc_dec_done -.p2align 4 -L$cbc_dec_two: - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm14 - pxor %xmm7,%xmm0 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - jmp L$cbc_dec_done -.p2align 4 -L$cbc_dec_one: - leaq (%r12),%rdi - leaq 32(%rbp),%rsi - leaq (%r15),%rdx - call _asm_AES_decrypt - pxor 32(%rbp),%xmm14 - movdqu %xmm14,(%r13) - movdqa %xmm15,%xmm14 - -L$cbc_dec_done: - movdqu %xmm14,(%rbx) - leaq (%rsp),%rax - pxor %xmm0,%xmm0 -L$cbc_dec_bzero: - movdqa %xmm0,0(%rax) - movdqa %xmm0,16(%rax) - leaq 32(%rax),%rax - cmpq %rax,%rbp - ja L$cbc_dec_bzero - - leaq 120(%rbp),%rax - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbx - movq -8(%rax),%rbp - leaq (%rax),%rsp -L$cbc_dec_epilogue: - .byte 0xf3,0xc3 - - -.globl _bsaes_ctr32_encrypt_blocks -.private_extern _bsaes_ctr32_encrypt_blocks - -.p2align 4 -_bsaes_ctr32_encrypt_blocks: - movq %rsp,%rax -L$ctr_enc_prologue: - pushq %rbp - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - leaq -72(%rsp),%rsp - movq %rsp,%rbp - movdqu (%r8),%xmm0 - movl 240(%rcx),%eax - movq %rdi,%r12 - movq %rsi,%r13 - movq %rdx,%r14 - movq %rcx,%r15 - movdqa %xmm0,32(%rbp) - cmpq $8,%rdx - jb L$ctr_enc_short - - movl %eax,%ebx - shlq $7,%rax - subq $96,%rax - subq %rax,%rsp - - movq %rsp,%rax - movq %r15,%rcx - movl %ebx,%r10d - call _bsaes_key_convert - pxor %xmm6,%xmm7 - movdqa %xmm7,(%rax) - - movdqa (%rsp),%xmm8 - leaq L$ADD1(%rip),%r11 - movdqa 32(%rbp),%xmm15 - movdqa -32(%r11),%xmm7 -.byte 102,68,15,56,0,199 -.byte 102,68,15,56,0,255 - movdqa %xmm8,(%rsp) - jmp L$ctr_enc_loop -.p2align 4 -L$ctr_enc_loop: - movdqa %xmm15,32(%rbp) - movdqa %xmm15,%xmm0 - movdqa %xmm15,%xmm1 - paddd 0(%r11),%xmm0 - movdqa %xmm15,%xmm2 - paddd 16(%r11),%xmm1 - movdqa %xmm15,%xmm3 - paddd 32(%r11),%xmm2 - movdqa %xmm15,%xmm4 - paddd 48(%r11),%xmm3 - movdqa %xmm15,%xmm5 - paddd 64(%r11),%xmm4 - movdqa %xmm15,%xmm6 - paddd 80(%r11),%xmm5 - paddd 96(%r11),%xmm6 - - - - movdqa (%rsp),%xmm8 - leaq 16(%rsp),%rax - 
movdqa -16(%r11),%xmm7 - pxor %xmm8,%xmm15 - pxor %xmm8,%xmm0 - pxor %xmm8,%xmm1 - pxor %xmm8,%xmm2 -.byte 102,68,15,56,0,255 -.byte 102,15,56,0,199 - pxor %xmm8,%xmm3 - pxor %xmm8,%xmm4 -.byte 102,15,56,0,207 -.byte 102,15,56,0,215 - pxor %xmm8,%xmm5 - pxor %xmm8,%xmm6 -.byte 102,15,56,0,223 -.byte 102,15,56,0,231 -.byte 102,15,56,0,239 -.byte 102,15,56,0,247 - leaq L$BS0(%rip),%r11 - movl %ebx,%r10d - - call _bsaes_encrypt8_bitslice - - subq $8,%r14 - jc L$ctr_enc_loop_done - - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - movdqu 32(%r12),%xmm9 - movdqu 48(%r12),%xmm10 - movdqu 64(%r12),%xmm11 - movdqu 80(%r12),%xmm12 - movdqu 96(%r12),%xmm13 - movdqu 112(%r12),%xmm14 - leaq 128(%r12),%r12 - pxor %xmm15,%xmm7 - movdqa 32(%rbp),%xmm15 - pxor %xmm8,%xmm0 - movdqu %xmm7,0(%r13) - pxor %xmm9,%xmm3 - movdqu %xmm0,16(%r13) - pxor %xmm10,%xmm5 - movdqu %xmm3,32(%r13) - pxor %xmm11,%xmm2 - movdqu %xmm5,48(%r13) - pxor %xmm12,%xmm6 - movdqu %xmm2,64(%r13) - pxor %xmm13,%xmm1 - movdqu %xmm6,80(%r13) - pxor %xmm14,%xmm4 - movdqu %xmm1,96(%r13) - leaq L$ADD1(%rip),%r11 - movdqu %xmm4,112(%r13) - leaq 128(%r13),%r13 - paddd 112(%r11),%xmm15 - jnz L$ctr_enc_loop - - jmp L$ctr_enc_done -.p2align 4 -L$ctr_enc_loop_done: - addq $8,%r14 - movdqu 0(%r12),%xmm7 - pxor %xmm7,%xmm15 - movdqu %xmm15,0(%r13) - cmpq $2,%r14 - jb L$ctr_enc_done - movdqu 16(%r12),%xmm8 - pxor %xmm8,%xmm0 - movdqu %xmm0,16(%r13) - je L$ctr_enc_done - movdqu 32(%r12),%xmm9 - pxor %xmm9,%xmm3 - movdqu %xmm3,32(%r13) - cmpq $4,%r14 - jb L$ctr_enc_done - movdqu 48(%r12),%xmm10 - pxor %xmm10,%xmm5 - movdqu %xmm5,48(%r13) - je L$ctr_enc_done - movdqu 64(%r12),%xmm11 - pxor %xmm11,%xmm2 - movdqu %xmm2,64(%r13) - cmpq $6,%r14 - jb L$ctr_enc_done - movdqu 80(%r12),%xmm12 - pxor %xmm12,%xmm6 - movdqu %xmm6,80(%r13) - je L$ctr_enc_done - movdqu 96(%r12),%xmm13 - pxor %xmm13,%xmm1 - movdqu %xmm1,96(%r13) - jmp L$ctr_enc_done - -.p2align 4 -L$ctr_enc_short: - leaq 32(%rbp),%rdi - leaq 48(%rbp),%rsi - leaq (%r15),%rdx - call _asm_AES_encrypt - movdqu (%r12),%xmm0 - leaq 16(%r12),%r12 - movl 44(%rbp),%eax - bswapl %eax - pxor 48(%rbp),%xmm0 - incl %eax - movdqu %xmm0,(%r13) - bswapl %eax - leaq 16(%r13),%r13 - movl %eax,44(%rsp) - decq %r14 - jnz L$ctr_enc_short - -L$ctr_enc_done: - leaq (%rsp),%rax - pxor %xmm0,%xmm0 -L$ctr_enc_bzero: - movdqa %xmm0,0(%rax) - movdqa %xmm0,16(%rax) - leaq 32(%rax),%rax - cmpq %rax,%rbp - ja L$ctr_enc_bzero - - leaq 120(%rbp),%rax - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbx - movq -8(%rax),%rbp - leaq (%rax),%rsp -L$ctr_enc_epilogue: - .byte 0xf3,0xc3 - -.globl _bsaes_xts_encrypt -.private_extern _bsaes_xts_encrypt - -.p2align 4 -_bsaes_xts_encrypt: - movq %rsp,%rax -L$xts_enc_prologue: - pushq %rbp - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - leaq -72(%rsp),%rsp - movq %rsp,%rbp - movq %rdi,%r12 - movq %rsi,%r13 - movq %rdx,%r14 - movq %rcx,%r15 - - leaq (%r9),%rdi - leaq 32(%rbp),%rsi - leaq (%r8),%rdx - call _asm_AES_encrypt - - movl 240(%r15),%eax - movq %r14,%rbx - - movl %eax,%edx - shlq $7,%rax - subq $96,%rax - subq %rax,%rsp - - movq %rsp,%rax - movq %r15,%rcx - movl %edx,%r10d - call _bsaes_key_convert - pxor %xmm6,%xmm7 - movdqa %xmm7,(%rax) - - andq $-16,%r14 - subq $0x80,%rsp - movdqa 32(%rbp),%xmm6 - - pxor %xmm14,%xmm14 - movdqa L$xts_magic(%rip),%xmm12 - pcmpgtd %xmm6,%xmm14 - - subq $0x80,%r14 - jc L$xts_enc_short - jmp L$xts_enc_loop - -.p2align 4 -L$xts_enc_loop: - pshufd $0x13,%xmm14,%xmm13 - pxor 
%xmm14,%xmm14 - movdqa %xmm6,%xmm15 - movdqa %xmm6,0(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm0 - movdqa %xmm6,16(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 0(%r12),%xmm7 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm1 - movdqa %xmm6,32(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm15 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm2 - movdqa %xmm6,48(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm0 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm3 - movdqa %xmm6,64(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm4 - movdqa %xmm6,80(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 64(%r12),%xmm11 - pxor %xmm10,%xmm2 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm5 - movdqa %xmm6,96(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 80(%r12),%xmm12 - pxor %xmm11,%xmm3 - movdqu 96(%r12),%xmm13 - pxor %xmm12,%xmm4 - movdqu 112(%r12),%xmm14 - leaq 128(%r12),%r12 - movdqa %xmm6,112(%rsp) - pxor %xmm13,%xmm5 - leaq 128(%rsp),%rax - pxor %xmm14,%xmm6 - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm5 - movdqu %xmm3,32(%r13) - pxor 64(%rsp),%xmm2 - movdqu %xmm5,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm2,64(%r13) - pxor 96(%rsp),%xmm1 - movdqu %xmm6,80(%r13) - pxor 112(%rsp),%xmm4 - movdqu %xmm1,96(%r13) - movdqu %xmm4,112(%r13) - leaq 128(%r13),%r13 - - movdqa 112(%rsp),%xmm6 - pxor %xmm14,%xmm14 - movdqa L$xts_magic(%rip),%xmm12 - pcmpgtd %xmm6,%xmm14 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - - subq $0x80,%r14 - jnc L$xts_enc_loop - -L$xts_enc_short: - addq $0x80,%r14 - jz L$xts_enc_done - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm15 - movdqa %xmm6,0(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm0 - movdqa %xmm6,16(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 0(%r12),%xmm7 - cmpq $16,%r14 - je L$xts_enc_1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm1 - movdqa %xmm6,32(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 16(%r12),%xmm8 - cmpq $32,%r14 - je L$xts_enc_2 - pxor %xmm7,%xmm15 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm2 - movdqa %xmm6,48(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 32(%r12),%xmm9 - cmpq $48,%r14 - je L$xts_enc_3 - pxor %xmm8,%xmm0 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm3 - movdqa %xmm6,64(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 48(%r12),%xmm10 - cmpq 
$64,%r14 - je L$xts_enc_4 - pxor %xmm9,%xmm1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm4 - movdqa %xmm6,80(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 64(%r12),%xmm11 - cmpq $80,%r14 - je L$xts_enc_5 - pxor %xmm10,%xmm2 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm5 - movdqa %xmm6,96(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 80(%r12),%xmm12 - cmpq $96,%r14 - je L$xts_enc_6 - pxor %xmm11,%xmm3 - movdqu 96(%r12),%xmm13 - pxor %xmm12,%xmm4 - movdqa %xmm6,112(%rsp) - leaq 112(%r12),%r12 - pxor %xmm13,%xmm5 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm5 - movdqu %xmm3,32(%r13) - pxor 64(%rsp),%xmm2 - movdqu %xmm5,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm2,64(%r13) - pxor 96(%rsp),%xmm1 - movdqu %xmm6,80(%r13) - movdqu %xmm1,96(%r13) - leaq 112(%r13),%r13 - - movdqa 112(%rsp),%xmm6 - jmp L$xts_enc_done -.p2align 4 -L$xts_enc_6: - pxor %xmm11,%xmm3 - leaq 96(%r12),%r12 - pxor %xmm12,%xmm4 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm5 - movdqu %xmm3,32(%r13) - pxor 64(%rsp),%xmm2 - movdqu %xmm5,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm2,64(%r13) - movdqu %xmm6,80(%r13) - leaq 96(%r13),%r13 - - movdqa 96(%rsp),%xmm6 - jmp L$xts_enc_done -.p2align 4 -L$xts_enc_5: - pxor %xmm10,%xmm2 - leaq 80(%r12),%r12 - pxor %xmm11,%xmm3 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm5 - movdqu %xmm3,32(%r13) - pxor 64(%rsp),%xmm2 - movdqu %xmm5,48(%r13) - movdqu %xmm2,64(%r13) - leaq 80(%r13),%r13 - - movdqa 80(%rsp),%xmm6 - jmp L$xts_enc_done -.p2align 4 -L$xts_enc_4: - pxor %xmm9,%xmm1 - leaq 64(%r12),%r12 - pxor %xmm10,%xmm2 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm5 - movdqu %xmm3,32(%r13) - movdqu %xmm5,48(%r13) - leaq 64(%r13),%r13 - - movdqa 64(%rsp),%xmm6 - jmp L$xts_enc_done -.p2align 4 -L$xts_enc_3: - pxor %xmm8,%xmm0 - leaq 48(%r12),%r12 - pxor %xmm9,%xmm1 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - movdqu %xmm3,32(%r13) - leaq 48(%r13),%r13 - - movdqa 48(%rsp),%xmm6 - jmp L$xts_enc_done -.p2align 4 -L$xts_enc_2: - pxor %xmm7,%xmm15 - leaq 32(%r12),%r12 - pxor %xmm8,%xmm0 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - leaq 32(%r13),%r13 - - movdqa 32(%rsp),%xmm6 - jmp L$xts_enc_done -.p2align 4 -L$xts_enc_1: - pxor %xmm15,%xmm7 - leaq 16(%r12),%r12 - movdqa %xmm7,32(%rbp) - leaq 32(%rbp),%rdi - leaq 32(%rbp),%rsi - leaq (%r15),%rdx - call _asm_AES_encrypt - pxor 32(%rbp),%xmm15 - - - - - - movdqu %xmm15,0(%r13) - leaq 16(%r13),%r13 - - movdqa 16(%rsp),%xmm6 - -L$xts_enc_done: - andl $15,%ebx - jz L$xts_enc_ret - movq %r13,%rdx - -L$xts_enc_steal: - movzbl (%r12),%eax - 
movzbl -16(%rdx),%ecx - leaq 1(%r12),%r12 - movb %al,-16(%rdx) - movb %cl,0(%rdx) - leaq 1(%rdx),%rdx - subl $1,%ebx - jnz L$xts_enc_steal - - movdqu -16(%r13),%xmm15 - leaq 32(%rbp),%rdi - pxor %xmm6,%xmm15 - leaq 32(%rbp),%rsi - movdqa %xmm15,32(%rbp) - leaq (%r15),%rdx - call _asm_AES_encrypt - pxor 32(%rbp),%xmm6 - movdqu %xmm6,-16(%r13) - -L$xts_enc_ret: - leaq (%rsp),%rax - pxor %xmm0,%xmm0 -L$xts_enc_bzero: - movdqa %xmm0,0(%rax) - movdqa %xmm0,16(%rax) - leaq 32(%rax),%rax - cmpq %rax,%rbp - ja L$xts_enc_bzero - - leaq 120(%rbp),%rax - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbx - movq -8(%rax),%rbp - leaq (%rax),%rsp -L$xts_enc_epilogue: - .byte 0xf3,0xc3 - - -.globl _bsaes_xts_decrypt -.private_extern _bsaes_xts_decrypt - -.p2align 4 -_bsaes_xts_decrypt: - movq %rsp,%rax -L$xts_dec_prologue: - pushq %rbp - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - leaq -72(%rsp),%rsp - movq %rsp,%rbp - movq %rdi,%r12 - movq %rsi,%r13 - movq %rdx,%r14 - movq %rcx,%r15 - - leaq (%r9),%rdi - leaq 32(%rbp),%rsi - leaq (%r8),%rdx - call _asm_AES_encrypt - - movl 240(%r15),%eax - movq %r14,%rbx - - movl %eax,%edx - shlq $7,%rax - subq $96,%rax - subq %rax,%rsp - - movq %rsp,%rax - movq %r15,%rcx - movl %edx,%r10d - call _bsaes_key_convert - pxor (%rsp),%xmm7 - movdqa %xmm6,(%rax) - movdqa %xmm7,(%rsp) - - xorl %eax,%eax - andq $-16,%r14 - testl $15,%ebx - setnz %al - shlq $4,%rax - subq %rax,%r14 - - subq $0x80,%rsp - movdqa 32(%rbp),%xmm6 - - pxor %xmm14,%xmm14 - movdqa L$xts_magic(%rip),%xmm12 - pcmpgtd %xmm6,%xmm14 - - subq $0x80,%r14 - jc L$xts_dec_short - jmp L$xts_dec_loop - -.p2align 4 -L$xts_dec_loop: - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm15 - movdqa %xmm6,0(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm0 - movdqa %xmm6,16(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 0(%r12),%xmm7 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm1 - movdqa %xmm6,32(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm15 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm2 - movdqa %xmm6,48(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm0 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm3 - movdqa %xmm6,64(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm4 - movdqa %xmm6,80(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 64(%r12),%xmm11 - pxor %xmm10,%xmm2 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm5 - movdqa %xmm6,96(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 80(%r12),%xmm12 - pxor %xmm11,%xmm3 - movdqu 96(%r12),%xmm13 - pxor %xmm12,%xmm4 - movdqu 112(%r12),%xmm14 - leaq 128(%r12),%r12 - movdqa %xmm6,112(%rsp) - pxor %xmm13,%xmm5 - leaq 128(%rsp),%rax - pxor %xmm14,%xmm6 - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - pxor 
48(%rsp),%xmm3 - movdqu %xmm5,32(%r13) - pxor 64(%rsp),%xmm1 - movdqu %xmm3,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm1,64(%r13) - pxor 96(%rsp),%xmm2 - movdqu %xmm6,80(%r13) - pxor 112(%rsp),%xmm4 - movdqu %xmm2,96(%r13) - movdqu %xmm4,112(%r13) - leaq 128(%r13),%r13 - - movdqa 112(%rsp),%xmm6 - pxor %xmm14,%xmm14 - movdqa L$xts_magic(%rip),%xmm12 - pcmpgtd %xmm6,%xmm14 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - - subq $0x80,%r14 - jnc L$xts_dec_loop - -L$xts_dec_short: - addq $0x80,%r14 - jz L$xts_dec_done - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm15 - movdqa %xmm6,0(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm0 - movdqa %xmm6,16(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 0(%r12),%xmm7 - cmpq $16,%r14 - je L$xts_dec_1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm1 - movdqa %xmm6,32(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 16(%r12),%xmm8 - cmpq $32,%r14 - je L$xts_dec_2 - pxor %xmm7,%xmm15 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm2 - movdqa %xmm6,48(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 32(%r12),%xmm9 - cmpq $48,%r14 - je L$xts_dec_3 - pxor %xmm8,%xmm0 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm3 - movdqa %xmm6,64(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 48(%r12),%xmm10 - cmpq $64,%r14 - je L$xts_dec_4 - pxor %xmm9,%xmm1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm4 - movdqa %xmm6,80(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 64(%r12),%xmm11 - cmpq $80,%r14 - je L$xts_dec_5 - pxor %xmm10,%xmm2 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm5 - movdqa %xmm6,96(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 80(%r12),%xmm12 - cmpq $96,%r14 - je L$xts_dec_6 - pxor %xmm11,%xmm3 - movdqu 96(%r12),%xmm13 - pxor %xmm12,%xmm4 - movdqa %xmm6,112(%rsp) - leaq 112(%r12),%r12 - pxor %xmm13,%xmm5 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm3 - movdqu %xmm5,32(%r13) - pxor 64(%rsp),%xmm1 - movdqu %xmm3,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm1,64(%r13) - pxor 96(%rsp),%xmm2 - movdqu %xmm6,80(%r13) - movdqu %xmm2,96(%r13) - leaq 112(%r13),%r13 - - movdqa 112(%rsp),%xmm6 - jmp L$xts_dec_done -.p2align 4 -L$xts_dec_6: - pxor %xmm11,%xmm3 - leaq 96(%r12),%r12 - pxor %xmm12,%xmm4 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm3 - movdqu %xmm5,32(%r13) - pxor 64(%rsp),%xmm1 - movdqu %xmm3,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm1,64(%r13) - movdqu %xmm6,80(%r13) - leaq 96(%r13),%r13 - - movdqa 96(%rsp),%xmm6 - jmp L$xts_dec_done -.p2align 4 -L$xts_dec_5: - pxor %xmm10,%xmm2 - leaq 80(%r12),%r12 - pxor %xmm11,%xmm3 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - 
pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm3 - movdqu %xmm5,32(%r13) - pxor 64(%rsp),%xmm1 - movdqu %xmm3,48(%r13) - movdqu %xmm1,64(%r13) - leaq 80(%r13),%r13 - - movdqa 80(%rsp),%xmm6 - jmp L$xts_dec_done -.p2align 4 -L$xts_dec_4: - pxor %xmm9,%xmm1 - leaq 64(%r12),%r12 - pxor %xmm10,%xmm2 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm3 - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - leaq 64(%r13),%r13 - - movdqa 64(%rsp),%xmm6 - jmp L$xts_dec_done -.p2align 4 -L$xts_dec_3: - pxor %xmm8,%xmm0 - leaq 48(%r12),%r12 - pxor %xmm9,%xmm1 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - leaq 48(%r13),%r13 - - movdqa 48(%rsp),%xmm6 - jmp L$xts_dec_done -.p2align 4 -L$xts_dec_2: - pxor %xmm7,%xmm15 - leaq 32(%r12),%r12 - pxor %xmm8,%xmm0 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - leaq 32(%r13),%r13 - - movdqa 32(%rsp),%xmm6 - jmp L$xts_dec_done -.p2align 4 -L$xts_dec_1: - pxor %xmm15,%xmm7 - leaq 16(%r12),%r12 - movdqa %xmm7,32(%rbp) - leaq 32(%rbp),%rdi - leaq 32(%rbp),%rsi - leaq (%r15),%rdx - call _asm_AES_decrypt - pxor 32(%rbp),%xmm15 - - - - - - movdqu %xmm15,0(%r13) - leaq 16(%r13),%r13 - - movdqa 16(%rsp),%xmm6 - -L$xts_dec_done: - andl $15,%ebx - jz L$xts_dec_ret - - pxor %xmm14,%xmm14 - movdqa L$xts_magic(%rip),%xmm12 - pcmpgtd %xmm6,%xmm14 - pshufd $0x13,%xmm14,%xmm13 - movdqa %xmm6,%xmm5 - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - movdqu (%r12),%xmm15 - pxor %xmm13,%xmm6 - - leaq 32(%rbp),%rdi - pxor %xmm6,%xmm15 - leaq 32(%rbp),%rsi - movdqa %xmm15,32(%rbp) - leaq (%r15),%rdx - call _asm_AES_decrypt - pxor 32(%rbp),%xmm6 - movq %r13,%rdx - movdqu %xmm6,(%r13) - -L$xts_dec_steal: - movzbl 16(%r12),%eax - movzbl (%rdx),%ecx - leaq 1(%r12),%r12 - movb %al,(%rdx) - movb %cl,16(%rdx) - leaq 1(%rdx),%rdx - subl $1,%ebx - jnz L$xts_dec_steal - - movdqu (%r13),%xmm15 - leaq 32(%rbp),%rdi - pxor %xmm5,%xmm15 - leaq 32(%rbp),%rsi - movdqa %xmm15,32(%rbp) - leaq (%r15),%rdx - call _asm_AES_decrypt - pxor 32(%rbp),%xmm5 - movdqu %xmm5,(%r13) - -L$xts_dec_ret: - leaq (%rsp),%rax - pxor %xmm0,%xmm0 -L$xts_dec_bzero: - movdqa %xmm0,0(%rax) - movdqa %xmm0,16(%rax) - leaq 32(%rax),%rax - cmpq %rax,%rbp - ja L$xts_dec_bzero - - leaq 120(%rbp),%rax - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbx - movq -8(%rax),%rbp - leaq (%rax),%rsp -L$xts_dec_epilogue: - .byte 0xf3,0xc3 - - -.p2align 6 -_bsaes_const: -L$M0ISR: -.quad 0x0a0e0206070b0f03, 0x0004080c0d010509 -L$ISRM0: -.quad 0x01040b0e0205080f, 0x0306090c00070a0d -L$ISR: -.quad 0x0504070602010003, 0x0f0e0d0c080b0a09 -L$BS0: -.quad 0x5555555555555555, 0x5555555555555555 -L$BS1: -.quad 0x3333333333333333, 0x3333333333333333 -L$BS2: -.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f -L$SR: -.quad 0x0504070600030201, 0x0f0e0d0c0a09080b -L$SRM0: -.quad 0x0304090e00050a0f, 0x01060b0c0207080d -L$M0SR: -.quad 0x0a0e02060f03070b, 0x0004080c05090d01 -L$SWPUP: -.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 -L$SWPUPM0SR: -.quad 0x0a0d02060c03070b, 0x0004080f05090e01 -L$ADD1: -.quad 0x0000000000000000, 0x0000000100000000 
-L$ADD2: -.quad 0x0000000000000000, 0x0000000200000000 -L$ADD3: -.quad 0x0000000000000000, 0x0000000300000000 -L$ADD4: -.quad 0x0000000000000000, 0x0000000400000000 -L$ADD5: -.quad 0x0000000000000000, 0x0000000500000000 -L$ADD6: -.quad 0x0000000000000000, 0x0000000600000000 -L$ADD7: -.quad 0x0000000000000000, 0x0000000700000000 -L$ADD8: -.quad 0x0000000000000000, 0x0000000800000000 -L$xts_magic: -.long 0x87,0,1,0 -L$masks: -.quad 0x0101010101010101, 0x0101010101010101 -.quad 0x0202020202020202, 0x0202020202020202 -.quad 0x0404040404040404, 0x0404040404040404 -.quad 0x0808080808080808, 0x0808080808080808 -L$M0: -.quad 0x02060a0e03070b0f, 0x0004080c0105090d -L$63: -.quad 0x6363636363636363, 0x6363636363636363 -.byte 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,69,109,105,108,105,97,32,75,195,164,115,112,101,114,44,32,80,101,116,101,114,32,83,99,104,119,97,98,101,44,32,65,110,100,121,32,80,111,108,121,97,107,111,118,0 -.p2align 6 - -#endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S new file mode 100644 index 0000000000..1b9129f2dd --- /dev/null +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S @@ -0,0 +1,426 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text + + + + + + +.globl _gcm_gmult_ssse3 +.private_extern _gcm_gmult_ssse3 +.p2align 4 +_gcm_gmult_ssse3: + +L$gmult_seh_begin: + movdqu (%rdi),%xmm0 + movdqa L$reverse_bytes(%rip),%xmm10 + movdqa L$low4_mask(%rip),%xmm2 + + +.byte 102,65,15,56,0,194 + + + movdqa %xmm2,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm0 + + + + + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + movq $5,%rax +L$oop_row_1: + movdqa (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz L$oop_row_1 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movq $5,%rax +L$oop_row_2: + movdqa (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz L$oop_row_2 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movq $6,%rax +L$oop_row_3: + movdqa (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 +.byte 
102,15,56,0,224 +.byte 102,15,56,0,233 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz L$oop_row_3 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + +.byte 102,65,15,56,0,210 + movdqu %xmm2,(%rdi) + + + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + .byte 0xf3,0xc3 +L$gmult_seh_end: + + + + + + + + +.globl _gcm_ghash_ssse3 +.private_extern _gcm_ghash_ssse3 +.p2align 4 +_gcm_ghash_ssse3: +L$ghash_seh_begin: + + movdqu (%rdi),%xmm0 + movdqa L$reverse_bytes(%rip),%xmm10 + movdqa L$low4_mask(%rip),%xmm11 + + + andq $-16,%rcx + + + +.byte 102,65,15,56,0,194 + + + pxor %xmm3,%xmm3 +L$oop_ghash: + + movdqu (%rdx),%xmm1 +.byte 102,65,15,56,0,202 + pxor %xmm1,%xmm0 + + + movdqa %xmm11,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm11,%xmm0 + + + + + pxor %xmm2,%xmm2 + + movq $5,%rax +L$oop_row_4: + movdqa (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz L$oop_row_4 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movq $5,%rax +L$oop_row_5: + movdqa (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz L$oop_row_5 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movq $6,%rax +L$oop_row_6: + movdqa (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz L$oop_row_6 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movdqa %xmm2,%xmm0 + + + leaq -256(%rsi),%rsi + + + leaq 16(%rdx),%rdx + subq $16,%rcx + jnz L$oop_ghash + + +.byte 102,65,15,56,0,194 + movdqu %xmm0,(%rdi) + + + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + .byte 0xf3,0xc3 +L$ghash_seh_end: + + + +.p2align 4 + + +L$reverse_bytes: +.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +L$low4_mask: +.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f +#endif diff --git 
a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/ghash-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/ghash-x86_64.S index 78b88cc28d..d7dcf5d61f 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/ghash-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/ghash-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text @@ -7,13 +19,21 @@ .p2align 4 _gcm_gmult_4bit: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $280,%rsp + L$gmult_prologue: movzbq 15(%rdi),%r8 @@ -91,23 +111,35 @@ L$break1: movq %r9,(%rdi) leaq 280+48(%rsp),%rsi + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$gmult_epilogue: .byte 0xf3,0xc3 + .globl _gcm_ghash_4bit .private_extern _gcm_ghash_4bit .p2align 4 _gcm_ghash_4bit: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $280,%rsp + L$ghash_prologue: movq %rdx,%r14 movq %rcx,%r15 @@ -653,21 +685,31 @@ L$outer_loop: movq %r9,(%rdi) leaq 280+48(%rsp),%rsi + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq 0(%rsi),%rsp + L$ghash_epilogue: .byte 0xf3,0xc3 + .globl _gcm_init_clmul .private_extern _gcm_init_clmul .p2align 4 _gcm_init_clmul: + L$_init_clmul: movdqu (%rsi),%xmm2 pshufd $78,%xmm2,%xmm2 @@ -820,11 +862,13 @@ L$_init_clmul: movdqu %xmm4,80(%rdi) .byte 0xf3,0xc3 + .globl _gcm_gmult_clmul .private_extern _gcm_gmult_clmul .p2align 4 _gcm_gmult_clmul: + L$_gmult_clmul: movdqu (%rdi),%xmm0 movdqa L$bswap_mask(%rip),%xmm5 @@ -872,11 +916,13 @@ L$_gmult_clmul: movdqu %xmm0,(%rdi) .byte 0xf3,0xc3 + .globl _gcm_ghash_clmul .private_extern _gcm_ghash_clmul .p2align 5 _gcm_ghash_clmul: + L$_ghash_clmul: movdqa L$bswap_mask(%rip),%xmm10 @@ -1257,11 +1303,13 @@ L$done: movdqu %xmm0,(%rdi) .byte 0xf3,0xc3 + .globl _gcm_init_avx .private_extern _gcm_init_avx .p2align 5 _gcm_init_avx: + vzeroupper vmovdqu (%rsi),%xmm2 @@ -1365,18 +1413,22 @@ L$init_start_avx: vzeroupper .byte 0xf3,0xc3 + .globl _gcm_gmult_avx .private_extern _gcm_gmult_avx .p2align 5 _gcm_gmult_avx: + jmp L$_gmult_clmul + .globl _gcm_ghash_avx .private_extern _gcm_ghash_avx .p2align 5 _gcm_ghash_avx: + vzeroupper vmovdqu (%rdi),%xmm10 @@ -1749,6 +1801,7 @@ L$tail_no_xor_avx: vzeroupper .byte 0xf3,0xc3 + .p2align 6 L$bswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/md5-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/md5-x86_64.S index 776c116046..cfb4180da3 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/md5-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/md5-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .p2align 4 @@ -6,11 +18,17 @@ .private_extern _md5_block_asm_data_order _md5_block_asm_data_order: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r14 + pushq %r15 + L$prologue: @@ -660,12 +678,19 @@ L$end: movl %edx,12(%rbp) movq (%rsp),%r15 + movq 8(%rsp),%r14 + movq 16(%rsp),%r12 + movq 24(%rsp),%rbx + movq 32(%rsp),%rbp + addq $40,%rsp + L$epilogue: .byte 0xf3,0xc3 + #endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/p256-x86_64-asm.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/p256-x86_64-asm.S index f7875772ad..1f4a93132f 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/p256-x86_64-asm.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/p256-x86_64-asm.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text @@ -17,15 +29,25 @@ L$ONE_mont: .quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe +L$ord: +.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000 +L$ordK: +.quad 0xccd1c8aaee00bc4f + + .globl _ecp_nistz256_neg .private_extern _ecp_nistz256_neg .p2align 5 _ecp_nistz256_neg: + pushq %r12 + pushq %r13 +L$neg_body: + xorq %r8,%r8 xorq %r9,%r9 xorq %r10,%r10 @@ -58,8 +80,13 @@ _ecp_nistz256_neg: movq %r10,16(%rdi) movq %r11,24(%rdi) - popq %r13 - popq %r12 + movq 0(%rsp),%r13 + + movq 8(%rsp),%r12 + + leaq 16(%rsp),%rsp + +L$neg_epilogue: .byte 0xf3,0xc3 @@ -68,18 +95,1101 @@ _ecp_nistz256_neg: + +.globl _ecp_nistz256_ord_mul_mont +.private_extern _ecp_nistz256_ord_mul_mont + +.p2align 5 +_ecp_nistz256_ord_mul_mont: + + leaq _OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx + cmpl $0x80100,%ecx + je L$ecp_nistz256_ord_mul_montx + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$ord_mul_body: + + movq 0(%rdx),%rax + movq %rdx,%rbx + leaq L$ord(%rip),%r14 + movq L$ordK(%rip),%r15 + + + movq %rax,%rcx + mulq 0(%rsi) + movq %rax,%r8 + movq %rcx,%rax + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r9 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + + movq %r8,%r13 + imulq %r15,%r8 + + movq %rdx,%r11 + mulq 24(%rsi) + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r12 + + + mulq 0(%r14) + movq %r8,%rbp + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rcx + + subq %r8,%r10 + sbbq $0,%r8 + + mulq 8(%r14) + addq %rcx,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %rbp,%rax + adcq %rdx,%r10 + movq %rbp,%rdx + adcq $0,%r8 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r11 + movq 8(%rbx),%rax + sbbq %rdx,%rbp + + addq %r8,%r11 + adcq %rbp,%r12 + adcq $0,%r13 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r9 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq 
%rcx,%rax + adcq $0,%rdx + + movq %r9,%rcx + imulq %r15,%r9 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + xorq %r8,%r8 + addq %rax,%r12 + movq %r9,%rax + adcq %rdx,%r13 + adcq $0,%r8 + + + mulq 0(%r14) + movq %r9,%rbp + addq %rax,%rcx + movq %r9,%rax + adcq %rdx,%rcx + + subq %r9,%r11 + sbbq $0,%r9 + + mulq 8(%r14) + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rbp,%rax + adcq %rdx,%r11 + movq %rbp,%rdx + adcq $0,%r9 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r12 + movq 16(%rbx),%rax + sbbq %rdx,%rbp + + addq %r9,%r12 + adcq %rbp,%r13 + adcq $0,%r8 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rcx,%rax + adcq $0,%rdx + + movq %r10,%rcx + imulq %r15,%r10 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r13 + adcq $0,%rdx + xorq %r9,%r9 + addq %rax,%r13 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + + mulq 0(%r14) + movq %r10,%rbp + addq %rax,%rcx + movq %r10,%rax + adcq %rdx,%rcx + + subq %r10,%r12 + sbbq $0,%r10 + + mulq 8(%r14) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq %rdx,%r12 + movq %rbp,%rdx + adcq $0,%r10 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r13 + movq 24(%rbx),%rax + sbbq %rdx,%rbp + + addq %r10,%r13 + adcq %rbp,%r8 + adcq $0,%r9 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %rcx,%rax + adcq $0,%rdx + + movq %r11,%rcx + imulq %r15,%r11 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r8 + adcq $0,%rdx + xorq %r10,%r10 + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + + mulq 0(%r14) + movq %r11,%rbp + addq %rax,%rcx + movq %r11,%rax + adcq %rdx,%rcx + + subq %r11,%r13 + sbbq $0,%r11 + + mulq 8(%r14) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq %rdx,%r13 + movq %rbp,%rdx + adcq $0,%r11 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r8 + sbbq %rdx,%rbp + + addq %r11,%r8 + adcq %rbp,%r9 + adcq $0,%r10 + + + movq %r12,%rsi + subq 0(%r14),%r12 + movq %r13,%r11 + sbbq 8(%r14),%r13 + movq %r8,%rcx + sbbq 16(%r14),%r8 + movq %r9,%rbp + sbbq 24(%r14),%r9 + sbbq $0,%r10 + + cmovcq %rsi,%r12 + cmovcq %r11,%r13 + cmovcq %rcx,%r8 + cmovcq %rbp,%r9 + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$ord_mul_epilogue: + .byte 0xf3,0xc3 + + + + + + + + + +.globl _ecp_nistz256_ord_sqr_mont +.private_extern _ecp_nistz256_ord_sqr_mont + +.p2align 5 +_ecp_nistz256_ord_sqr_mont: + + leaq _OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx + cmpl $0x80100,%ecx + je L$ecp_nistz256_ord_sqr_montx + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$ord_sqr_body: + + movq 0(%rsi),%r8 + movq 8(%rsi),%rax + movq 16(%rsi),%r14 + movq 24(%rsi),%r15 + leaq L$ord(%rip),%rsi + movq %rdx,%rbx + jmp L$oop_ord_sqr + +.p2align 5 +L$oop_ord_sqr: + + movq %rax,%rbp + mulq %r8 + movq %rax,%r9 +.byte 102,72,15,110,205 + movq %r14,%rax + movq %rdx,%r10 + + mulq %r8 + addq 
%rax,%r10 + movq %r15,%rax +.byte 102,73,15,110,214 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r8 + addq %rax,%r11 + movq %r15,%rax +.byte 102,73,15,110,223 + adcq $0,%rdx + movq %rdx,%r12 + + + mulq %r14 + movq %rax,%r13 + movq %r14,%rax + movq %rdx,%r14 + + + mulq %rbp + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq %rbp + addq %rax,%r12 + adcq $0,%rdx + + addq %r15,%r12 + adcq %rdx,%r13 + adcq $0,%r14 + + + xorq %r15,%r15 + movq %r8,%rax + addq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + adcq %r14,%r14 + adcq $0,%r15 + + + mulq %rax + movq %rax,%r8 +.byte 102,72,15,126,200 + movq %rdx,%rbp + + mulq %rax + addq %rbp,%r9 + adcq %rax,%r10 +.byte 102,72,15,126,208 + adcq $0,%rdx + movq %rdx,%rbp + + mulq %rax + addq %rbp,%r11 + adcq %rax,%r12 +.byte 102,72,15,126,216 + adcq $0,%rdx + movq %rdx,%rbp + + movq %r8,%rcx + imulq 32(%rsi),%r8 + + mulq %rax + addq %rbp,%r13 + adcq %rax,%r14 + movq 0(%rsi),%rax + adcq %rdx,%r15 + + + mulq %r8 + movq %r8,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r8,%r10 + sbbq $0,%rbp + + mulq %r8 + addq %rcx,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %r8,%rax + adcq %rdx,%r10 + movq %r8,%rdx + adcq $0,%rbp + + movq %r9,%rcx + imulq 32(%rsi),%r9 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r11 + movq 0(%rsi),%rax + sbbq %rdx,%r8 + + addq %rbp,%r11 + adcq $0,%r8 + + + mulq %r9 + movq %r9,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r9,%r11 + sbbq $0,%rbp + + mulq %r9 + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %r9,%rax + adcq %rdx,%r11 + movq %r9,%rdx + adcq $0,%rbp + + movq %r10,%rcx + imulq 32(%rsi),%r10 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r8 + movq 0(%rsi),%rax + sbbq %rdx,%r9 + + addq %rbp,%r8 + adcq $0,%r9 + + + mulq %r10 + movq %r10,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r10,%r8 + sbbq $0,%rbp + + mulq %r10 + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r10,%rax + adcq %rdx,%r8 + movq %r10,%rdx + adcq $0,%rbp + + movq %r11,%rcx + imulq 32(%rsi),%r11 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r9 + movq 0(%rsi),%rax + sbbq %rdx,%r10 + + addq %rbp,%r9 + adcq $0,%r10 + + + mulq %r11 + movq %r11,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r11,%r9 + sbbq $0,%rbp + + mulq %r11 + addq %rcx,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + movq %r11,%rdx + adcq $0,%rbp + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r10 + sbbq %rdx,%r11 + + addq %rbp,%r10 + adcq $0,%r11 + + + xorq %rdx,%rdx + addq %r12,%r8 + adcq %r13,%r9 + movq %r8,%r12 + adcq %r14,%r10 + adcq %r15,%r11 + movq %r9,%rax + adcq $0,%rdx + + + subq 0(%rsi),%r8 + movq %r10,%r14 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r11,%r15 + sbbq 24(%rsi),%r11 + sbbq $0,%rdx + + cmovcq %r12,%r8 + cmovncq %r9,%rax + cmovncq %r10,%r14 + cmovncq %r11,%r15 + + decq %rbx + jnz L$oop_ord_sqr + + movq %r8,0(%rdi) + movq %rax,8(%rdi) + pxor %xmm1,%xmm1 + movq %r14,16(%rdi) + pxor %xmm2,%xmm2 + movq %r15,24(%rdi) + pxor %xmm3,%xmm3 + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$ord_sqr_epilogue: + .byte 0xf3,0xc3 + + + + +.p2align 5 +ecp_nistz256_ord_mul_montx: + +L$ecp_nistz256_ord_mul_montx: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$ord_mulx_body: + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 
16(%rsi),%r11 + movq 24(%rsi),%r12 + leaq -128(%rsi),%rsi + leaq L$ord-128(%rip),%r14 + movq L$ordK(%rip),%r15 + + + mulxq %r9,%r8,%r9 + mulxq %r10,%rcx,%r10 + mulxq %r11,%rbp,%r11 + addq %rcx,%r9 + mulxq %r12,%rcx,%r12 + movq %r8,%rdx + mulxq %r15,%rdx,%rax + adcq %rbp,%r10 + adcq %rcx,%r11 + adcq $0,%r12 + + + xorq %r13,%r13 + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%r14),%rcx,%rbp + movq 8(%rbx),%rdx + adcxq %rcx,%r11 + adoxq %rbp,%r12 + adcxq %r8,%r12 + adoxq %r8,%r13 + adcq $0,%r13 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r9,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + adcxq %r8,%r13 + adoxq %r8,%r8 + adcq $0,%r8 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%r14),%rcx,%rbp + movq 16(%rbx),%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcxq %r9,%r13 + adoxq %r9,%r8 + adcq $0,%r8 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r10,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + adcxq %r9,%r8 + adoxq %r9,%r9 + adcq $0,%r9 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%r14),%rcx,%rbp + movq 24(%rbx),%rdx + adcxq %rcx,%r13 + adoxq %rbp,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcq $0,%r9 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r11,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r8 + adoxq %rbp,%r9 + + adcxq %r10,%r9 + adoxq %r10,%r10 + adcq $0,%r10 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%r14),%rcx,%rbp + leaq 128(%r14),%r14 + movq %r12,%rbx + adcxq %rcx,%r8 + adoxq %rbp,%r9 + movq %r13,%rdx + adcxq %r11,%r9 + adoxq %r11,%r10 + adcq $0,%r10 + + + + movq %r8,%rcx + subq 0(%r14),%r12 + sbbq 8(%r14),%r13 + sbbq 16(%r14),%r8 + movq %r9,%rbp + sbbq 24(%r14),%r9 + sbbq $0,%r10 + + cmovcq %rbx,%r12 + cmovcq %rdx,%r13 + cmovcq %rcx,%r8 + cmovcq %rbp,%r9 + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$ord_mulx_epilogue: + .byte 0xf3,0xc3 + + + + +.p2align 5 +ecp_nistz256_ord_sqr_montx: + +L$ecp_nistz256_ord_sqr_montx: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$ord_sqrx_body: + + movq %rdx,%rbx + movq 0(%rsi),%rdx + movq 
8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + leaq L$ord(%rip),%rsi + jmp L$oop_ord_sqrx + +.p2align 5 +L$oop_ord_sqrx: + mulxq %r14,%r9,%r10 + mulxq %r15,%rcx,%r11 + movq %rdx,%rax +.byte 102,73,15,110,206 + mulxq %r8,%rbp,%r12 + movq %r14,%rdx + addq %rcx,%r10 +.byte 102,73,15,110,215 + adcq %rbp,%r11 + adcq $0,%r12 + xorq %r13,%r13 + + mulxq %r15,%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq %r8,%rcx,%rbp + movq %r15,%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcq $0,%r13 + + mulxq %r8,%rcx,%r14 + movq %rax,%rdx +.byte 102,73,15,110,216 + xorq %r15,%r15 + adcxq %r9,%r9 + adoxq %rcx,%r13 + adcxq %r10,%r10 + adoxq %r15,%r14 + + + mulxq %rdx,%r8,%rbp +.byte 102,72,15,126,202 + adcxq %r11,%r11 + adoxq %rbp,%r9 + adcxq %r12,%r12 + mulxq %rdx,%rcx,%rax +.byte 102,72,15,126,210 + adcxq %r13,%r13 + adoxq %rcx,%r10 + adcxq %r14,%r14 + mulxq %rdx,%rcx,%rbp +.byte 0x67 +.byte 102,72,15,126,218 + adoxq %rax,%r11 + adcxq %r15,%r15 + adoxq %rcx,%r12 + adoxq %rbp,%r13 + mulxq %rdx,%rcx,%rax + adoxq %rcx,%r14 + adoxq %rax,%r15 + + + movq %r8,%rdx + mulxq 32(%rsi),%rdx,%rcx + + xorq %rax,%rax + mulxq 0(%rsi),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + mulxq 8(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + mulxq 16(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + mulxq 24(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r8 + adcxq %rax,%r8 + + + movq %r9,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adoxq %rcx,%r9 + adcxq %rbp,%r10 + mulxq 8(%rsi),%rcx,%rbp + adoxq %rcx,%r10 + adcxq %rbp,%r11 + mulxq 16(%rsi),%rcx,%rbp + adoxq %rcx,%r11 + adcxq %rbp,%r8 + mulxq 24(%rsi),%rcx,%rbp + adoxq %rcx,%r8 + adcxq %rbp,%r9 + adoxq %rax,%r9 + + + movq %r10,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + mulxq 8(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r8 + mulxq 16(%rsi),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + mulxq 24(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + adcxq %rax,%r10 + + + movq %r11,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adoxq %rcx,%r11 + adcxq %rbp,%r8 + mulxq 8(%rsi),%rcx,%rbp + adoxq %rcx,%r8 + adcxq %rbp,%r9 + mulxq 16(%rsi),%rcx,%rbp + adoxq %rcx,%r9 + adcxq %rbp,%r10 + mulxq 24(%rsi),%rcx,%rbp + adoxq %rcx,%r10 + adcxq %rbp,%r11 + adoxq %rax,%r11 + + + addq %r8,%r12 + adcq %r13,%r9 + movq %r12,%rdx + adcq %r14,%r10 + adcq %r15,%r11 + movq %r9,%r14 + adcq $0,%rax + + + subq 0(%rsi),%r12 + movq %r10,%r15 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r11,%r8 + sbbq 24(%rsi),%r11 + sbbq $0,%rax + + cmovncq %r12,%rdx + cmovncq %r9,%r14 + cmovncq %r10,%r15 + cmovncq %r11,%r8 + + decq %rbx + jnz L$oop_ord_sqrx + + movq %rdx,0(%rdi) + movq %r14,8(%rdi) + pxor %xmm1,%xmm1 + movq %r15,16(%rdi) + pxor %xmm2,%xmm2 + movq %r8,24(%rdi) + pxor %xmm3,%xmm3 + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$ord_sqrx_epilogue: + .byte 0xf3,0xc3 + + + + + + + + .globl _ecp_nistz256_mul_mont .private_extern _ecp_nistz256_mul_mont .p2align 5 _ecp_nistz256_mul_mont: + + leaq _OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx L$mul_mont: pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +L$mul_body: + cmpl $0x80100,%ecx + je L$mul_montx movq %rdx,%rbx movq 0(%rdx),%rax movq 0(%rsi),%r9 @@ -88,21 +1198,45 @@ L$mul_mont: movq 24(%rsi),%r12 call __ecp_nistz256_mul_montq + jmp L$mul_mont_done + +.p2align 5 +L$mul_montx: + 
movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + leaq -128(%rsi),%rsi + + call __ecp_nistz256_mul_montx L$mul_mont_done: - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$mul_epilogue: .byte 0xf3,0xc3 + .p2align 5 __ecp_nistz256_mul_montq: + movq %rax,%rbp mulq %r9 movq L$poly+8(%rip),%r14 @@ -321,36 +1455,72 @@ __ecp_nistz256_mul_montq: + .globl _ecp_nistz256_sqr_mont .private_extern _ecp_nistz256_sqr_mont .p2align 5 _ecp_nistz256_sqr_mont: + + leaq _OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +L$sqr_body: + cmpl $0x80100,%ecx + je L$sqr_montx movq 0(%rsi),%rax movq 8(%rsi),%r14 movq 16(%rsi),%r15 movq 24(%rsi),%r8 call __ecp_nistz256_sqr_montq + jmp L$sqr_mont_done + +.p2align 5 +L$sqr_montx: + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + leaq -128(%rsi),%rsi + + call __ecp_nistz256_sqr_montx L$sqr_mont_done: - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$sqr_epilogue: .byte 0xf3,0xc3 + .p2align 5 __ecp_nistz256_sqr_montq: + movq %rax,%r13 mulq %r14 movq %rax,%r9 @@ -511,11 +1681,311 @@ __ecp_nistz256_sqr_montq: +.p2align 5 +__ecp_nistz256_mul_montx: + + + + mulxq %r9,%r8,%r9 + mulxq %r10,%rcx,%r10 + movq $32,%r14 + xorq %r13,%r13 + mulxq %r11,%rbp,%r11 + movq L$poly+24(%rip),%r15 + adcq %rcx,%r9 + mulxq %r12,%rcx,%r12 + movq %r8,%rdx + adcq %rbp,%r10 + shlxq %r14,%r8,%rbp + adcq %rcx,%r11 + shrxq %r14,%r8,%rcx + adcq $0,%r12 + + + + addq %rbp,%r9 + adcq %rcx,%r10 + + mulxq %r15,%rcx,%rbp + movq 8(%rbx),%rdx + adcq %rcx,%r11 + adcq %rbp,%r12 + adcq $0,%r13 + xorq %r8,%r8 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r9,%rdx + adcxq %rcx,%r12 + shlxq %r14,%r9,%rcx + adoxq %rbp,%r13 + shrxq %r14,%r9,%rbp + + adcxq %r8,%r13 + adoxq %r8,%r8 + adcq $0,%r8 + + + + addq %rcx,%r10 + adcq %rbp,%r11 + + mulxq %r15,%rcx,%rbp + movq 16(%rbx),%rdx + adcq %rcx,%r12 + adcq %rbp,%r13 + adcq $0,%r8 + xorq %r9,%r9 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r10,%rdx + adcxq %rcx,%r13 + shlxq %r14,%r10,%rcx + adoxq %rbp,%r8 + shrxq %r14,%r10,%rbp + + adcxq %r9,%r8 + adoxq %r9,%r9 + adcq $0,%r9 + + + + addq %rcx,%r11 + adcq %rbp,%r12 + + mulxq %r15,%rcx,%rbp + movq 24(%rbx),%rdx + adcq %rcx,%r13 + adcq %rbp,%r8 + adcq $0,%r9 + xorq %r10,%r10 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r11,%rdx + adcxq %rcx,%r8 + shlxq %r14,%r11,%rcx + adoxq %rbp,%r9 + shrxq %r14,%r11,%rbp + + adcxq %r10,%r9 + adoxq %r10,%r10 + adcq $0,%r10 + + + + addq 
%rcx,%r12 + adcq %rbp,%r13 + + mulxq %r15,%rcx,%rbp + movq %r12,%rbx + movq L$poly+8(%rip),%r14 + adcq %rcx,%r8 + movq %r13,%rdx + adcq %rbp,%r9 + adcq $0,%r10 + + + + xorl %eax,%eax + movq %r8,%rcx + sbbq $-1,%r12 + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%rbp + sbbq %r15,%r9 + sbbq $0,%r10 + + cmovcq %rbx,%r12 + cmovcq %rdx,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %rbp,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 + + + + +.p2align 5 +__ecp_nistz256_sqr_montx: + + mulxq %r14,%r9,%r10 + mulxq %r15,%rcx,%r11 + xorl %eax,%eax + adcq %rcx,%r10 + mulxq %r8,%rbp,%r12 + movq %r14,%rdx + adcq %rbp,%r11 + adcq $0,%r12 + xorq %r13,%r13 + + + mulxq %r15,%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq %r8,%rcx,%rbp + movq %r15,%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcq $0,%r13 + + + mulxq %r8,%rcx,%r14 + movq 0+128(%rsi),%rdx + xorq %r15,%r15 + adcxq %r9,%r9 + adoxq %rcx,%r13 + adcxq %r10,%r10 + adoxq %r15,%r14 + + mulxq %rdx,%r8,%rbp + movq 8+128(%rsi),%rdx + adcxq %r11,%r11 + adoxq %rbp,%r9 + adcxq %r12,%r12 + mulxq %rdx,%rcx,%rax + movq 16+128(%rsi),%rdx + adcxq %r13,%r13 + adoxq %rcx,%r10 + adcxq %r14,%r14 +.byte 0x67 + mulxq %rdx,%rcx,%rbp + movq 24+128(%rsi),%rdx + adoxq %rax,%r11 + adcxq %r15,%r15 + adoxq %rcx,%r12 + movq $32,%rsi + adoxq %rbp,%r13 +.byte 0x67,0x67 + mulxq %rdx,%rcx,%rax + movq L$poly+24(%rip),%rdx + adoxq %rcx,%r14 + shlxq %rsi,%r8,%rcx + adoxq %rax,%r15 + shrxq %rsi,%r8,%rax + movq %rdx,%rbp + + + addq %rcx,%r9 + adcq %rax,%r10 + + mulxq %r8,%rcx,%r8 + adcq %rcx,%r11 + shlxq %rsi,%r9,%rcx + adcq $0,%r8 + shrxq %rsi,%r9,%rax + + + addq %rcx,%r10 + adcq %rax,%r11 + + mulxq %r9,%rcx,%r9 + adcq %rcx,%r8 + shlxq %rsi,%r10,%rcx + adcq $0,%r9 + shrxq %rsi,%r10,%rax + + + addq %rcx,%r11 + adcq %rax,%r8 + + mulxq %r10,%rcx,%r10 + adcq %rcx,%r9 + shlxq %rsi,%r11,%rcx + adcq $0,%r10 + shrxq %rsi,%r11,%rax + + + addq %rcx,%r8 + adcq %rax,%r9 + + mulxq %r11,%rcx,%r11 + adcq %rcx,%r10 + adcq $0,%r11 + + xorq %rdx,%rdx + addq %r8,%r12 + movq L$poly+8(%rip),%rsi + adcq %r9,%r13 + movq %r12,%r8 + adcq %r10,%r14 + adcq %r11,%r15 + movq %r13,%r9 + adcq $0,%rdx + + subq $-1,%r12 + movq %r14,%r10 + sbbq %rsi,%r13 + sbbq $0,%r14 + movq %r15,%r11 + sbbq %rbp,%r15 + sbbq $0,%rdx + + cmovcq %r8,%r12 + cmovcq %r9,%r13 + movq %r12,0(%rdi) + cmovcq %r10,%r14 + movq %r13,8(%rdi) + cmovcq %r11,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + + .byte 0xf3,0xc3 + + + + .globl _ecp_nistz256_select_w5 .private_extern _ecp_nistz256_select_w5 .p2align 5 _ecp_nistz256_select_w5: + leaq _OPENSSL_ia32cap_P(%rip),%rax movq 8(%rax),%rax testl $32,%eax @@ -572,6 +2042,8 @@ L$select_loop_sse_w5: movdqu %xmm7,80(%rdi) .byte 0xf3,0xc3 +L$SEH_end_ecp_nistz256_select_w5: + @@ -580,6 +2052,7 @@ L$select_loop_sse_w5: .p2align 5 _ecp_nistz256_select_w7: + leaq _OPENSSL_ia32cap_P(%rip),%rax movq 8(%rax),%rax testl $32,%eax @@ -625,11 +2098,14 @@ L$select_loop_sse_w7: movdqu %xmm5,48(%rdi) .byte 0xf3,0xc3 +L$SEH_end_ecp_nistz256_select_w7: + .p2align 5 ecp_nistz256_avx2_select_w5: + L$avx2_select_w5: vzeroupper vmovdqa L$Two(%rip),%ymm0 @@ -685,6 +2161,8 @@ L$select_loop_avx2_w5: vzeroupper .byte 0xf3,0xc3 +L$SEH_end_ecp_nistz256_avx2_select_w5: + @@ -693,6 +2171,7 @@ L$select_loop_avx2_w5: .p2align 5 _ecp_nistz256_avx2_select_w7: + L$avx2_select_w7: vzeroupper vmovdqa L$Three(%rip),%ymm0 @@ -763,9 +2242,12 @@ L$select_loop_avx2_w7: vzeroupper .byte 0xf3,0xc3 +L$SEH_end_ecp_nistz256_avx2_select_w7: + .p2align 5 __ecp_nistz256_add_toq: + xorq %r11,%r11 addq 
0(%rbx),%r12 adcq 8(%rbx),%r13 @@ -796,8 +2278,10 @@ __ecp_nistz256_add_toq: + .p2align 5 __ecp_nistz256_sub_fromq: + subq 0(%rbx),%r12 sbbq 8(%rbx),%r13 movq %r12,%rax @@ -827,8 +2311,10 @@ __ecp_nistz256_sub_fromq: + .p2align 5 __ecp_nistz256_subq: + subq %r12,%rax sbbq %r13,%rbp movq %rax,%r12 @@ -854,8 +2340,10 @@ __ecp_nistz256_subq: + .p2align 5 __ecp_nistz256_mul_by_2q: + xorq %r11,%r11 addq %r12,%r12 adcq %r13,%r13 @@ -884,19 +2372,34 @@ __ecp_nistz256_mul_by_2q: .byte 0xf3,0xc3 + .globl _ecp_nistz256_point_double .private_extern _ecp_nistz256_point_double .p2align 5 _ecp_nistz256_point_double: + + leaq _OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx + cmpl $0x80100,%ecx + je L$point_doublex pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $160+8,%rsp +L$point_doubleq_body: + L$point_double_shortcutq: movdqu 0(%rsi),%xmm0 movq %rsi,%rbx @@ -1078,28 +2581,53 @@ L$point_double_shortcutq: .byte 102,72,15,126,207 call __ecp_nistz256_sub_fromq - addq $160+8,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + leaq 160+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$point_doubleq_epilogue: .byte 0xf3,0xc3 + .globl _ecp_nistz256_point_add .private_extern _ecp_nistz256_point_add .p2align 5 _ecp_nistz256_point_add: + + leaq _OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx + cmpl $0x80100,%ecx + je L$point_addx pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $576+8,%rsp +L$point_addq_body: + movdqu 0(%rsi),%xmm0 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 @@ -1244,15 +2772,22 @@ _ecp_nistz256_point_add: orq %r8,%r12 orq %r9,%r12 -.byte 0x3e - jnz L$add_proceedq .byte 102,73,15,126,208 .byte 102,73,15,126,217 - testq %r8,%r8 + orq %r8,%r12 +.byte 0x3e jnz L$add_proceedq + + + testq %r9,%r9 jz L$add_doubleq + + + + + .byte 102,72,15,126,199 pxor %xmm0,%xmm0 movdqu %xmm0,0(%rdi) @@ -1268,8 +2803,10 @@ L$add_doubleq: .byte 102,72,15,126,206 .byte 102,72,15,126,199 addq $416,%rsp + jmp L$point_double_shortcutq + .p2align 5 L$add_proceedq: movq 0+64(%rsp),%rax @@ -1475,28 +3012,53 @@ L$add_proceedq: movdqu %xmm3,48(%rdi) L$add_doneq: - addq $576+8,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + leaq 576+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$point_addq_epilogue: .byte 0xf3,0xc3 + .globl _ecp_nistz256_point_add_affine .private_extern _ecp_nistz256_point_add_affine .p2align 5 _ecp_nistz256_point_add_affine: + + leaq _OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx + cmpl $0x80100,%ecx + je L$point_add_affinex pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $480+8,%rsp +L$add_affineq_body: + movdqu 0(%rsi),%xmm0 movq %rdx,%rbx movdqu 16(%rsi),%xmm1 @@ -1778,13 +3340,1128 @@ _ecp_nistz256_point_add_affine: movdqu %xmm2,32(%rdi) movdqu %xmm3,48(%rdi) - addq $480+8,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + leaq 480+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$add_affineq_epilogue: .byte 0xf3,0xc3 + + +.p2align 5 
+__ecp_nistz256_add_tox: + + xorq %r11,%r11 + adcq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + movq %r12,%rax + adcq 16(%rbx),%r8 + adcq 24(%rbx),%r9 + movq %r13,%rbp + adcq $0,%r11 + + xorq %r10,%r10 + sbbq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 + + + + +.p2align 5 +__ecp_nistz256_sub_fromx: + + xorq %r11,%r11 + sbbq 0(%rbx),%r12 + sbbq 8(%rbx),%r13 + movq %r12,%rax + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + movq %r13,%rbp + sbbq $0,%r11 + + xorq %r10,%r10 + adcq $-1,%r12 + movq %r8,%rcx + adcq %r14,%r13 + adcq $0,%r8 + movq %r9,%r10 + adcq %r15,%r9 + + btq $0,%r11 + cmovncq %rax,%r12 + cmovncq %rbp,%r13 + movq %r12,0(%rdi) + cmovncq %rcx,%r8 + movq %r13,8(%rdi) + cmovncq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 + + + + +.p2align 5 +__ecp_nistz256_subx: + + xorq %r11,%r11 + sbbq %r12,%rax + sbbq %r13,%rbp + movq %rax,%r12 + sbbq %r8,%rcx + sbbq %r9,%r10 + movq %rbp,%r13 + sbbq $0,%r11 + + xorq %r9,%r9 + adcq $-1,%rax + movq %rcx,%r8 + adcq %r14,%rbp + adcq $0,%rcx + movq %r10,%r9 + adcq %r15,%r10 + + btq $0,%r11 + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + cmovcq %rcx,%r8 + cmovcq %r10,%r9 + + .byte 0xf3,0xc3 + + + + +.p2align 5 +__ecp_nistz256_mul_by_2x: + + xorq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + xorq %r10,%r10 + sbbq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 + + + +.p2align 5 +ecp_nistz256_point_doublex: + +L$point_doublex: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $160+8,%rsp + +L$point_doublex_body: + +L$point_double_shortcutx: + movdqu 0(%rsi),%xmm0 + movq %rsi,%rbx + movdqu 16(%rsi),%xmm1 + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r8 + movq 32+24(%rsi),%r9 + movq L$poly+8(%rip),%r14 + movq L$poly+24(%rip),%r15 + movdqa %xmm0,96(%rsp) + movdqa %xmm1,96+16(%rsp) + leaq 32(%rdi),%r10 + leaq 64(%rdi),%r11 +.byte 102,72,15,110,199 +.byte 102,73,15,110,202 +.byte 102,73,15,110,211 + + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + leaq 64-128(%rsi),%rsi + leaq 64(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 0(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 32(%rbx),%rdx + movq 64+0(%rbx),%r9 + movq 64+8(%rbx),%r10 + movq 64+16(%rbx),%r11 + movq 64+24(%rbx),%r12 + leaq 64-128(%rbx),%rsi + leaq 32(%rbx),%rbx +.byte 102,72,15,126,215 + call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_by_2x + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_tox + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 
24+0(%rsp),%r8 +.byte 102,72,15,126,207 + call __ecp_nistz256_sqr_montx + xorq %r9,%r9 + movq %r12,%rax + addq $-1,%r12 + movq %r13,%r10 + adcq %rsi,%r13 + movq %r14,%rcx + adcq $0,%r14 + movq %r15,%r8 + adcq %rbp,%r15 + adcq $0,%r9 + xorq %rsi,%rsi + testq $1,%rax + + cmovzq %rax,%r12 + cmovzq %r10,%r13 + cmovzq %rcx,%r14 + cmovzq %r8,%r15 + cmovzq %rsi,%r9 + + movq %r13,%rax + shrq $1,%r12 + shlq $63,%rax + movq %r14,%r10 + shrq $1,%r13 + orq %rax,%r12 + shlq $63,%r10 + movq %r15,%rcx + shrq $1,%r14 + orq %r10,%r13 + shlq $63,%rcx + movq %r12,0(%rdi) + shrq $1,%r15 + movq %r13,8(%rdi) + shlq $63,%r9 + orq %rcx,%r14 + orq %r9,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + movq 64(%rsp),%rdx + leaq 64(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + leaq 32(%rsp),%rbx + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_tox + + movq 96(%rsp),%rdx + leaq 96(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + movq 0+32(%rsp),%rdx + movq 8+32(%rsp),%r14 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r15 + movq 24+32(%rsp),%r8 +.byte 102,72,15,126,199 + call __ecp_nistz256_sqr_montx + + leaq 128(%rsp),%rbx + movq %r14,%r8 + movq %r15,%r9 + movq %rsi,%r14 + movq %rbp,%r15 + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 0(%rsp),%rdi + call __ecp_nistz256_subx + + movq 32(%rsp),%rdx + leaq 32(%rsp),%rbx + movq %r12,%r14 + xorl %ecx,%ecx + movq %r12,0+0(%rsp) + movq %r13,%r10 + movq %r13,0+8(%rsp) + cmovzq %r8,%r11 + movq %r8,0+16(%rsp) + leaq 0-128(%rsp),%rsi + cmovzq %r9,%r12 + movq %r9,0+24(%rsp) + movq %r14,%r9 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + +.byte 102,72,15,126,203 +.byte 102,72,15,126,207 + call __ecp_nistz256_sub_fromx + + leaq 160+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$point_doublex_epilogue: + .byte 0xf3,0xc3 + + + +.p2align 5 +ecp_nistz256_point_addx: + +L$point_addx: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $576+8,%rsp + +L$point_addx_body: + + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq %rsi,%rbx + movq %rdx,%rsi + movdqa %xmm0,384(%rsp) + movdqa %xmm1,384+16(%rsp) + movdqa %xmm2,416(%rsp) + movdqa %xmm3,416+16(%rsp) + movdqa %xmm4,448(%rsp) + movdqa %xmm5,448+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rsi),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rsi),%xmm3 + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,480(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,480+16(%rsp) + movdqu 64(%rsi),%xmm0 + movdqu 80(%rsi),%xmm1 + movdqa %xmm2,512(%rsp) + movdqa %xmm3,512+16(%rsp) + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + + leaq 64-128(%rsi),%rsi + movq %rdx,544+0(%rsp) + movq %r14,544+8(%rsp) + movq %r15,544+16(%rsp) + movq %r8,544+24(%rsp) + leaq 96(%rsp),%rdi + 
call __ecp_nistz256_sqr_montx + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm1,%xmm4 + por %xmm1,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + movq 64+0(%rbx),%rdx + movq 64+8(%rbx),%r14 + movq 64+16(%rbx),%r15 + movq 64+24(%rbx),%r8 +.byte 102,72,15,110,203 + + leaq 64-128(%rbx),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 544(%rsp),%rdx + leaq 544(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 416(%rsp),%rdx + leaq 416(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq -128+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 512(%rsp),%rdx + leaq 512(%rsp),%rbx + movq 0+256(%rsp),%r9 + movq 8+256(%rsp),%r10 + leaq -128+256(%rsp),%rsi + movq 16+256(%rsp),%r11 + movq 24+256(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 224(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + orq %r13,%r12 + movdqa %xmm4,%xmm2 + orq %r8,%r12 + orq %r9,%r12 + por %xmm5,%xmm2 +.byte 102,73,15,110,220 + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 480(%rsp),%rdx + leaq 480(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 160(%rsp),%rbx + leaq 0(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + orq %r13,%r12 + orq %r8,%r12 + orq %r9,%r12 + +.byte 102,73,15,126,208 +.byte 102,73,15,126,217 + orq %r8,%r12 +.byte 0x3e + jnz L$add_proceedx + + + + testq %r9,%r9 + jz L$add_doublex + + + + + + +.byte 102,72,15,126,199 + pxor %xmm0,%xmm0 + movdqu %xmm0,0(%rdi) + movdqu %xmm0,16(%rdi) + movdqu %xmm0,32(%rdi) + movdqu %xmm0,48(%rdi) + movdqu %xmm0,64(%rdi) + movdqu %xmm0,80(%rdi) + jmp L$add_donex + +.p2align 5 +L$add_doublex: +.byte 102,72,15,126,206 +.byte 102,72,15,126,199 + addq $416,%rsp + + jmp L$point_double_shortcutx + + +.p2align 5 +L$add_proceedx: + movq 0+64(%rsp),%rdx + movq 8+64(%rsp),%r14 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 544(%rsp),%rdx + leaq 544(%rsp),%rbx + movq 0+352(%rsp),%r9 + movq 8+352(%rsp),%r10 + leaq -128+352(%rsp),%rsi + movq 16+352(%rsp),%r11 + movq 24+352(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 0(%rsp),%rdx + leaq 0(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 128(%rsp),%rdi + 
call __ecp_nistz256_mul_montx + + movq 160(%rsp),%rdx + leaq 160(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montx + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 96(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subx + + leaq 128(%rsp),%rbx + leaq 288(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 192+0(%rsp),%rax + movq 192+8(%rsp),%rbp + movq 192+16(%rsp),%rcx + movq 192+24(%rsp),%r10 + leaq 320(%rsp),%rdi + + call __ecp_nistz256_subx + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 128(%rsp),%rdx + leaq 128(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq -128+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 320(%rsp),%rdx + leaq 320(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 320(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 256(%rsp),%rbx + leaq 320(%rsp),%rdi + call __ecp_nistz256_sub_fromx + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 352(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 352+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 544(%rsp),%xmm2 + pand 544+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 480(%rsp),%xmm2 + pand 480+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 320(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 320+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 512(%rsp),%xmm2 + pand 512+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + +L$add_donex: + leaq 576+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$point_addx_epilogue: + .byte 0xf3,0xc3 + + + +.p2align 5 +ecp_nistz256_point_add_affinex: + +L$point_add_affinex: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $480+8,%rsp + +L$add_affinex_body: + + movdqu 0(%rsi),%xmm0 + movq %rdx,%rbx + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + 
movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,320(%rsp) + movdqa %xmm1,320+16(%rsp) + movdqa %xmm2,352(%rsp) + movdqa %xmm3,352+16(%rsp) + movdqa %xmm4,384(%rsp) + movdqa %xmm5,384+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rbx),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rbx),%xmm1 + movdqu 32(%rbx),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rbx),%xmm3 + movdqa %xmm0,416(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,416+16(%rsp) + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + movdqa %xmm2,448(%rsp) + movdqa %xmm3,448+16(%rsp) + por %xmm2,%xmm3 + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm1,%xmm3 + + leaq 64-128(%rsi),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm3,%xmm4 + movq 0(%rbx),%rdx + + movq %r12,%r9 + por %xmm3,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + movq %r13,%r10 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + movq %r14,%r11 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + + leaq 32-128(%rsp),%rsi + movq %r15,%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 320(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 288(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 352(%rsp),%rbx + leaq 96(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+64(%rsp),%rdx + movq 8+64(%rsp),%r14 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 128(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 0+96(%rsp),%rdx + movq 8+96(%rsp),%r14 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r15 + movq 24+96(%rsp),%r8 + leaq 192(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 128(%rsp),%rdx + leaq 128(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 320(%rsp),%rdx + leaq 320(%rsp),%rbx + movq 0+128(%rsp),%r9 + movq 8+128(%rsp),%r10 + leaq -128+128(%rsp),%rsi + movq 16+128(%rsp),%r11 + movq 24+128(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 192(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subx + + leaq 160(%rsp),%rbx + leaq 224(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 64(%rsp),%rdi + + call __ecp_nistz256_subx + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 
352(%rsp),%rdx + leaq 352(%rsp),%rbx + movq 0+160(%rsp),%r9 + movq 8+160(%rsp),%r10 + leaq -128+160(%rsp),%rsi + movq 16+160(%rsp),%r11 + movq 24+160(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 96(%rsp),%rdx + leaq 96(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 64(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 32(%rsp),%rbx + leaq 256(%rsp),%rdi + call __ecp_nistz256_sub_fromx + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand L$ONE_mont(%rip),%xmm2 + pand L$ONE_mont+16(%rip),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 224(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 224+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 320(%rsp),%xmm2 + pand 320+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 256(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 256+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 352(%rsp),%xmm2 + pand 352+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + + leaq 480+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$add_affinex_epilogue: + .byte 0xf3,0xc3 + + #endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S new file mode 100644 index 0000000000..66fcfa3305 --- /dev/null +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S @@ -0,0 +1,328 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text + + +.private_extern _beeu_mod_inverse_vartime +.globl _beeu_mod_inverse_vartime +.private_extern _beeu_mod_inverse_vartime +.p2align 5 +_beeu_mod_inverse_vartime: + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rbx + + pushq %rsi + + + subq $80,%rsp + + movq %rdi,0(%rsp) + + + movq $1,%r8 + xorq %r9,%r9 + xorq %r10,%r10 + xorq %r11,%r11 + xorq %rdi,%rdi + + xorq %r12,%r12 + xorq %r13,%r13 + xorq %r14,%r14 + xorq %r15,%r15 + xorq %rbp,%rbp + + + vmovdqu 0(%rsi),%xmm0 + vmovdqu 16(%rsi),%xmm1 + vmovdqu %xmm0,48(%rsp) + vmovdqu %xmm1,64(%rsp) + + vmovdqu 0(%rdx),%xmm0 + vmovdqu 16(%rdx),%xmm1 + vmovdqu %xmm0,16(%rsp) + vmovdqu %xmm1,32(%rsp) + +L$beeu_loop: + xorq %rbx,%rbx + orq 48(%rsp),%rbx + orq 56(%rsp),%rbx + orq 64(%rsp),%rbx + orq 72(%rsp),%rbx + jz L$beeu_loop_end + + + + + + + + + + + movq $1,%rcx + + +L$beeu_shift_loop_XB: + movq %rcx,%rbx + andq 48(%rsp),%rbx + jnz L$beeu_shift_loop_end_XB + + + movq $1,%rbx + andq %r8,%rbx + jz L$shift1_0 + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq $0,%rdi + +L$shift1_0: + shrdq $1,%r9,%r8 + shrdq $1,%r10,%r9 + shrdq $1,%r11,%r10 + shrdq $1,%rdi,%r11 + shrq $1,%rdi + + shlq $1,%rcx + + + + + + cmpq $0x8000000,%rcx + jne L$beeu_shift_loop_XB + +L$beeu_shift_loop_end_XB: + bsfq %rcx,%rcx + testq %rcx,%rcx + jz L$beeu_no_shift_XB + + + + movq 8+48(%rsp),%rax + movq 16+48(%rsp),%rbx + movq 24+48(%rsp),%rsi + + shrdq %cl,%rax,0+48(%rsp) + shrdq %cl,%rbx,8+48(%rsp) + shrdq %cl,%rsi,16+48(%rsp) + + shrq %cl,%rsi + movq %rsi,24+48(%rsp) + + +L$beeu_no_shift_XB: + + movq $1,%rcx + + +L$beeu_shift_loop_YA: + movq %rcx,%rbx + andq 16(%rsp),%rbx + jnz L$beeu_shift_loop_end_YA + + + movq $1,%rbx + andq %r12,%rbx + jz L$shift1_1 + addq 0(%rdx),%r12 + adcq 8(%rdx),%r13 + adcq 16(%rdx),%r14 + adcq 24(%rdx),%r15 + adcq $0,%rbp + +L$shift1_1: + shrdq $1,%r13,%r12 + shrdq $1,%r14,%r13 + shrdq $1,%r15,%r14 + shrdq $1,%rbp,%r15 + shrq $1,%rbp + + shlq $1,%rcx + + + + + + cmpq $0x8000000,%rcx + jne L$beeu_shift_loop_YA + +L$beeu_shift_loop_end_YA: + bsfq %rcx,%rcx + testq %rcx,%rcx + jz L$beeu_no_shift_YA + + + + movq 8+16(%rsp),%rax + movq 16+16(%rsp),%rbx + movq 24+16(%rsp),%rsi + + shrdq %cl,%rax,0+16(%rsp) + shrdq %cl,%rbx,8+16(%rsp) + shrdq %cl,%rsi,16+16(%rsp) + + shrq %cl,%rsi + movq %rsi,24+16(%rsp) + + +L$beeu_no_shift_YA: + + movq 48(%rsp),%rax + movq 56(%rsp),%rbx + movq 64(%rsp),%rsi + movq 72(%rsp),%rcx + subq 16(%rsp),%rax + sbbq 24(%rsp),%rbx + sbbq 32(%rsp),%rsi + sbbq 40(%rsp),%rcx + jnc L$beeu_B_bigger_than_A + + + movq 16(%rsp),%rax + movq 24(%rsp),%rbx + movq 32(%rsp),%rsi + movq 40(%rsp),%rcx + subq 48(%rsp),%rax + sbbq 56(%rsp),%rbx + sbbq 64(%rsp),%rsi + sbbq 72(%rsp),%rcx + movq %rax,16(%rsp) + movq %rbx,24(%rsp) + movq %rsi,32(%rsp) + movq %rcx,40(%rsp) + + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + adcq %r11,%r15 + adcq %rdi,%rbp + jmp L$beeu_loop + +L$beeu_B_bigger_than_A: + + movq %rax,48(%rsp) + movq %rbx,56(%rsp) + movq %rsi,64(%rsp) + movq %rcx,72(%rsp) + + + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rbp,%rdi + + jmp L$beeu_loop + +L$beeu_loop_end: + + + + + movq 16(%rsp),%rbx + subq $1,%rbx + orq 24(%rsp),%rbx + orq 32(%rsp),%rbx + orq 40(%rsp),%rbx + + jnz L$beeu_err + + + + + 
movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + xorq %rdi,%rdi + +L$beeu_reduction_loop: + movq %r12,16(%rsp) + movq %r13,24(%rsp) + movq %r14,32(%rsp) + movq %r15,40(%rsp) + movq %rbp,48(%rsp) + + + subq %r8,%r12 + sbbq %r9,%r13 + sbbq %r10,%r14 + sbbq %r11,%r15 + sbbq $0,%rbp + + + cmovcq 16(%rsp),%r12 + cmovcq 24(%rsp),%r13 + cmovcq 32(%rsp),%r14 + cmovcq 40(%rsp),%r15 + jnc L$beeu_reduction_loop + + + subq %r12,%r8 + sbbq %r13,%r9 + sbbq %r14,%r10 + sbbq %r15,%r11 + +L$beeu_save: + + movq 0(%rsp),%rdi + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + + movq $1,%rax + jmp L$beeu_finish + +L$beeu_err: + + xorq %rax,%rax + +L$beeu_finish: + addq $80,%rsp + + popq %rsi + + popq %rbx + + popq %r15 + + popq %r14 + + popq %r13 + + popq %r12 + + popq %rbp + + .byte 0xf3,0xc3 + + + +#endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/rdrand-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/rdrand-x86_64.S index b259286f6e..f6f2be7ae1 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/rdrand-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/rdrand-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text @@ -9,31 +21,31 @@ .p2align 4 _CRYPTO_rdrand: + xorq %rax,%rax - - -.byte 0x48, 0x0f, 0xc7, 0xf1 +.byte 72,15,199,242 adcq %rax,%rax - movq %rcx,0(%rdi) + movq %rdx,0(%rdi) .byte 0xf3,0xc3 + + .globl _CRYPTO_rdrand_multiple8_buf .private_extern _CRYPTO_rdrand_multiple8_buf .p2align 4 _CRYPTO_rdrand_multiple8_buf: + testq %rsi,%rsi jz L$out movq $8,%rdx L$loop: - - -.byte 0x48, 0x0f, 0xc7, 0xf1 +.byte 72,15,199,241 jnc L$err movq %rcx,0(%rdi) addq %rdx,%rdi @@ -45,4 +57,6 @@ L$out: L$err: xorq %rax,%rax .byte 0xf3,0xc3 + + #endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/rsaz-avx2.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/rsaz-avx2.S index 6eb7afc510..e9cae78c5d 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/rsaz-avx2.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/rsaz-avx2.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl _rsaz_1024_sqr_avx2 @@ -77,7 +89,7 @@ L$sqr_1024_no_n_copy: vmovdqu 256-128(%rsi),%ymm8 leaq 192(%rsp),%rbx - vpbroadcastq L$and_mask(%rip),%ymm15 + vmovdqu L$and_mask(%rip),%ymm15 jmp L$OOP_GRANDE_SQR_1024 .p2align 5 @@ -829,10 +841,10 @@ L$oop_mul_1024: vpmuludq 192-128(%rcx),%ymm11,%ymm12 vpaddq %ymm12,%ymm6,%ymm6 vpmuludq 224-128(%rcx),%ymm11,%ymm13 - vpblendd $3,%ymm14,%ymm9,%ymm9 + vpblendd $3,%ymm14,%ymm9,%ymm12 vpaddq %ymm13,%ymm7,%ymm7 vpmuludq 256-128(%rcx),%ymm11,%ymm0 - vpaddq %ymm9,%ymm3,%ymm3 + vpaddq %ymm12,%ymm3,%ymm3 vpaddq %ymm0,%ymm8,%ymm8 movq %rbx,%rax @@ -845,7 +857,9 @@ L$oop_mul_1024: vmovdqu -8+64-128(%rsi),%ymm13 movq %r10,%rax + vpblendd $0xfc,%ymm14,%ymm9,%ymm9 imull %r8d,%eax + vpaddq %ymm9,%ymm4,%ymm4 andl $0x1fffffff,%eax imulq 16-128(%rsi),%rbx @@ -1074,7 +1088,6 @@ L$oop_mul_1024: decl %r14d jnz L$oop_mul_1024 - vpermq $0,%ymm15,%ymm15 vpaddq (%rsp),%ymm12,%ymm0 vpsrlq $29,%ymm0,%ymm12 @@ -1215,6 +1228,7 @@ L$mul_1024_epilogue: .p2align 5 _rsaz_1024_red2norm_avx2: + subq $-128,%rsi xorq %rax,%rax movq -128(%rsi),%r8 @@ -1408,11 +1422,13 @@ _rsaz_1024_red2norm_avx2: .byte 0xf3,0xc3 + .globl _rsaz_1024_norm2red_avx2 .private_extern _rsaz_1024_norm2red_avx2 .p2align 5 _rsaz_1024_norm2red_avx2: + subq $-128,%rdi movq (%rsi),%r8 movl $0x1fffffff,%eax @@ -1566,11 +1582,13 @@ _rsaz_1024_norm2red_avx2: movq %r8,184(%rdi) .byte 0xf3,0xc3 + .globl _rsaz_1024_scatter5_avx2 .private_extern _rsaz_1024_scatter5_avx2 .p2align 5 _rsaz_1024_scatter5_avx2: + vzeroupper vmovdqu L$scatter_permd(%rip),%ymm5 shll $4,%edx @@ -1592,6 +1610,7 @@ L$oop_scatter_1024: .byte 0xf3,0xc3 + .globl _rsaz_1024_gather5_avx2 .private_extern _rsaz_1024_gather5_avx2 @@ -1714,22 +1733,9 @@ L$oop_gather_1024: L$SEH_end_rsaz_1024_gather5: - -.globl _rsaz_avx2_eligible -.private_extern _rsaz_avx2_eligible - -.p2align 5 -_rsaz_avx2_eligible: - leaq _OPENSSL_ia32cap_P(%rip),%rax - movl 8(%rax),%eax - andl $32,%eax - shrl $5,%eax - .byte 0xf3,0xc3 - - .p2align 6 L$and_mask: -.quad 0x1fffffff,0x1fffffff,0x1fffffff,-1 +.quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff L$scatter_permd: .long 0,2,4,6,7,7,7,7 L$gather_permd: diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha1-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha1-x86_64.S index c22431c89f..ace121e359 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha1-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha1-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text @@ -7,6 +19,7 @@ .p2align 4 _sha1_block_data_order: + leaq _OPENSSL_ia32cap_P(%rip),%r10 movl 0(%r10),%r9d movl 4(%r10),%r8d @@ -23,17 +36,24 @@ _sha1_block_data_order: .p2align 4 L$ialu: movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + movq %rdi,%r8 subq $72,%rsp movq %rsi,%r9 andq $-64,%rsp movq %rdx,%r10 movq %rax,64(%rsp) + L$prologue: movl 0(%r8),%esi @@ -1228,25 +1248,40 @@ L$loop: jnz L$loop movq 64(%rsp),%rsi + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$epilogue: .byte 0xf3,0xc3 + .p2align 4 sha1_block_data_order_ssse3: _ssse3_shortcut: + movq %rsp,%r11 + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + leaq -64(%rsp),%rsp andq $-64,%rsp movq %rdi,%r8 @@ -2403,24 +2438,38 @@ L$done_ssse3: movl %edx,12(%r8) movl %ebp,16(%r8) movq -40(%r11),%r14 + movq -32(%r11),%r13 + movq -24(%r11),%r12 + movq -16(%r11),%rbp + movq -8(%r11),%rbx + leaq (%r11),%rsp + L$epilogue_ssse3: .byte 0xf3,0xc3 + .p2align 4 sha1_block_data_order_avx: _avx_shortcut: + movq %rsp,%r11 + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + leaq -64(%rsp),%rsp vzeroupper andq $-64,%rsp @@ -3517,14 +3566,21 @@ L$done_avx: movl %edx,12(%r8) movl %ebp,16(%r8) movq -40(%r11),%r14 + movq -32(%r11),%r13 + movq -24(%r11),%r12 + movq -16(%r11),%rbp + movq -8(%r11),%rbx + leaq (%r11),%rsp + L$epilogue_avx: .byte 0xf3,0xc3 + .p2align 6 K_XX_XX: .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha256-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha256-x86_64.S index ac6559e074..5e46e81c16 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha256-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha256-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text @@ -7,6 +19,7 @@ .p2align 4 _sha256_block_data_order: + leaq _OPENSSL_ia32cap_P(%rip),%r11 movl 0(%r11),%r9d movl 4(%r11),%r10d @@ -19,12 +32,19 @@ _sha256_block_data_order: testl $512,%r10d jnz L$ssse3_shortcut movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + shlq $4,%rdx subq $64+32,%rsp leaq (%rsi,%rdx,4),%rdx @@ -32,7 +52,8 @@ _sha256_block_data_order: movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movq %rax,64+24(%rsp) + movq %rax,88(%rsp) + L$prologue: movl 0(%rdi),%eax @@ -1696,17 +1717,26 @@ L$rounds_16_xx: movl %r11d,28(%rdi) jb L$loop - movq 64+24(%rsp),%rsi + movq 88(%rsp),%rsi + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$epilogue: .byte 0xf3,0xc3 + .p2align 6 K256: @@ -1753,14 +1783,22 @@ K256: .p2align 6 sha256_block_data_order_ssse3: + L$ssse3_shortcut: movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + shlq $4,%rdx subq $96,%rsp leaq (%rsi,%rdx,4),%rdx @@ -1768,7 +1806,8 @@ L$ssse3_shortcut: movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movq %rax,64+24(%rsp) + movq %rax,88(%rsp) + L$prologue_ssse3: movl 0(%rdi),%eax @@ -2834,28 +2873,45 @@ L$ssse3_00_47: movl %r11d,28(%rdi) jb L$loop_ssse3 - movq 64+24(%rsp),%rsi + movq 88(%rsp),%rsi + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$epilogue_ssse3: .byte 0xf3,0xc3 + .p2align 6 sha256_block_data_order_avx: + L$avx_shortcut: movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + shlq $4,%rdx subq $96,%rsp leaq (%rsi,%rdx,4),%rdx @@ -2863,7 +2919,8 @@ L$avx_shortcut: movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movq %rax,64+24(%rsp) + movq %rax,88(%rsp) + L$prologue_avx: vzeroupper @@ -3890,16 +3947,25 @@ L$avx_00_47: movl %r11d,28(%rdi) jb L$loop_avx - movq 64+24(%rsp),%rsi + movq 88(%rsp),%rsi + vzeroupper movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$epilogue_avx: .byte 0xf3,0xc3 + #endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha512-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha512-x86_64.S index 0b738e6f45..c550e794ac 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha512-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha512-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text @@ -7,24 +19,30 @@ .p2align 4 _sha512_block_data_order: + leaq _OPENSSL_ia32cap_P(%rip),%r11 movl 0(%r11),%r9d movl 4(%r11),%r10d movl 8(%r11),%r11d - testl $2048,%r10d - jnz L$xop_shortcut andl $1073741824,%r9d andl $268435968,%r10d orl %r9d,%r10d cmpl $1342177792,%r10d je L$avx_shortcut movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + shlq $4,%rdx subq $128+32,%rsp leaq (%rsi,%rdx,8),%rdx @@ -32,7 +50,8 @@ _sha512_block_data_order: movq %rdi,128+0(%rsp) movq %rsi,128+8(%rsp) movq %rdx,128+16(%rsp) - movq %rax,128+24(%rsp) + movq %rax,152(%rsp) + L$prologue: movq 0(%rdi),%rax @@ -1696,17 +1715,26 @@ L$rounds_16_xx: movq %r11,56(%rdi) jb L$loop - movq 128+24(%rsp),%rsi + movq 152(%rsp),%rsi + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$epilogue: .byte 0xf3,0xc3 + .p2align 6 K512: @@ -1795,1099 +1823,24 @@ K512: .quad 0x0001020304050607,0x08090a0b0c0d0e0f .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.p2align 6 -sha512_block_data_order_xop: -L$xop_shortcut: - movq %rsp,%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - shlq $4,%rdx - subq $160,%rsp - leaq (%rsi,%rdx,8),%rdx - andq $-64,%rsp - movq %rdi,128+0(%rsp) - movq %rsi,128+8(%rsp) - movq %rdx,128+16(%rsp) - movq %rax,128+24(%rsp) -L$prologue_xop: - - vzeroupper - movq 0(%rdi),%rax - movq 8(%rdi),%rbx - movq 16(%rdi),%rcx - movq 24(%rdi),%rdx - movq 32(%rdi),%r8 - movq 40(%rdi),%r9 - movq 48(%rdi),%r10 - movq 56(%rdi),%r11 - jmp L$loop_xop -.p2align 4 -L$loop_xop: - vmovdqa K512+1280(%rip),%xmm11 - vmovdqu 0(%rsi),%xmm0 - leaq K512+128(%rip),%rbp - vmovdqu 16(%rsi),%xmm1 - vmovdqu 32(%rsi),%xmm2 - vpshufb %xmm11,%xmm0,%xmm0 - vmovdqu 48(%rsi),%xmm3 - vpshufb %xmm11,%xmm1,%xmm1 - vmovdqu 64(%rsi),%xmm4 - vpshufb %xmm11,%xmm2,%xmm2 - vmovdqu 80(%rsi),%xmm5 - vpshufb %xmm11,%xmm3,%xmm3 - vmovdqu 96(%rsi),%xmm6 - vpshufb %xmm11,%xmm4,%xmm4 - vmovdqu 112(%rsi),%xmm7 - vpshufb %xmm11,%xmm5,%xmm5 - vpaddq -128(%rbp),%xmm0,%xmm8 - vpshufb %xmm11,%xmm6,%xmm6 - vpaddq -96(%rbp),%xmm1,%xmm9 - vpshufb %xmm11,%xmm7,%xmm7 - vpaddq -64(%rbp),%xmm2,%xmm10 - vpaddq -32(%rbp),%xmm3,%xmm11 - vmovdqa %xmm8,0(%rsp) - vpaddq 0(%rbp),%xmm4,%xmm8 - vmovdqa %xmm9,16(%rsp) - vpaddq 32(%rbp),%xmm5,%xmm9 - vmovdqa %xmm10,32(%rsp) - vpaddq 64(%rbp),%xmm6,%xmm10 - vmovdqa %xmm11,48(%rsp) - vpaddq 96(%rbp),%xmm7,%xmm11 - vmovdqa %xmm8,64(%rsp) - movq %rax,%r14 - vmovdqa %xmm9,80(%rsp) - movq %rbx,%rdi - vmovdqa %xmm10,96(%rsp) - xorq %rcx,%rdi - vmovdqa %xmm11,112(%rsp) - movq %r8,%r13 - jmp L$xop_00_47 - -.p2align 4 -L$xop_00_47: - addq $256,%rbp - vpalignr $8,%xmm0,%xmm1,%xmm8 - rorq $23,%r13 - movq %r14,%rax - vpalignr $8,%xmm4,%xmm5,%xmm11 - movq %r9,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r8,%r13 - xorq %r10,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rax,%r14 - vpaddq %xmm11,%xmm0,%xmm0 - andq %r8,%r12 - xorq %r8,%r13 - addq 0(%rsp),%r11 - movq %rax,%r15 -.byte 143,72,120,195,209,7 - xorq %r10,%r12 - rorq $6,%r14 - 
vpxor %xmm9,%xmm8,%xmm8 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,223,3 - xorq %rax,%r14 - addq %r13,%r11 - vpxor %xmm10,%xmm8,%xmm8 - xorq %rbx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm7,%xmm10 - addq %r11,%rdx - addq %rdi,%r11 - vpaddq %xmm8,%xmm0,%xmm0 - movq %rdx,%r13 - addq %r11,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r11 - vpxor %xmm10,%xmm11,%xmm11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - vpaddq %xmm11,%xmm0,%xmm0 - addq 8(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - vpaddq -128(%rbp),%xmm0,%xmm10 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - vmovdqa %xmm10,0(%rsp) - vpalignr $8,%xmm1,%xmm2,%xmm8 - rorq $23,%r13 - movq %r14,%r10 - vpalignr $8,%xmm5,%xmm6,%xmm11 - movq %rdx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rcx,%r13 - xorq %r8,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r10,%r14 - vpaddq %xmm11,%xmm1,%xmm1 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 16(%rsp),%r9 - movq %r10,%r15 -.byte 143,72,120,195,209,7 - xorq %r8,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,216,3 - xorq %r10,%r14 - addq %r13,%r9 - vpxor %xmm10,%xmm8,%xmm8 - xorq %r11,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm0,%xmm10 - addq %r9,%rbx - addq %rdi,%r9 - vpaddq %xmm8,%xmm1,%xmm1 - movq %rbx,%r13 - addq %r9,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r9 - vpxor %xmm10,%xmm11,%xmm11 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - vpaddq %xmm11,%xmm1,%xmm1 - addq 24(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - vpaddq -96(%rbp),%xmm1,%xmm10 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - vmovdqa %xmm10,16(%rsp) - vpalignr $8,%xmm2,%xmm3,%xmm8 - rorq $23,%r13 - movq %r14,%r8 - vpalignr $8,%xmm6,%xmm7,%xmm11 - movq %rbx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rax,%r13 - xorq %rcx,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r8,%r14 - vpaddq %xmm11,%xmm2,%xmm2 - andq %rax,%r12 - xorq %rax,%r13 - addq 32(%rsp),%rdx - movq %r8,%r15 -.byte 143,72,120,195,209,7 - xorq %rcx,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,217,3 - xorq %r8,%r14 - addq %r13,%rdx - vpxor %xmm10,%xmm8,%xmm8 - xorq %r9,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm1,%xmm10 - addq %rdx,%r11 - addq %rdi,%rdx - vpaddq %xmm8,%xmm2,%xmm2 - movq %r11,%r13 - addq %rdx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rdx - vpxor %xmm10,%xmm11,%xmm11 - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - vpaddq %xmm11,%xmm2,%xmm2 - addq 40(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - vpaddq -64(%rbp),%xmm2,%xmm10 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq 
%r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - vmovdqa %xmm10,32(%rsp) - vpalignr $8,%xmm3,%xmm4,%xmm8 - rorq $23,%r13 - movq %r14,%rcx - vpalignr $8,%xmm7,%xmm0,%xmm11 - movq %r11,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r10,%r13 - xorq %rax,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rcx,%r14 - vpaddq %xmm11,%xmm3,%xmm3 - andq %r10,%r12 - xorq %r10,%r13 - addq 48(%rsp),%rbx - movq %rcx,%r15 -.byte 143,72,120,195,209,7 - xorq %rax,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,218,3 - xorq %rcx,%r14 - addq %r13,%rbx - vpxor %xmm10,%xmm8,%xmm8 - xorq %rdx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm2,%xmm10 - addq %rbx,%r9 - addq %rdi,%rbx - vpaddq %xmm8,%xmm3,%xmm3 - movq %r9,%r13 - addq %rbx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rbx - vpxor %xmm10,%xmm11,%xmm11 - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - vpaddq %xmm11,%xmm3,%xmm3 - addq 56(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - rorq $6,%r14 - vpaddq -32(%rbp),%xmm3,%xmm10 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - vmovdqa %xmm10,48(%rsp) - vpalignr $8,%xmm4,%xmm5,%xmm8 - rorq $23,%r13 - movq %r14,%rax - vpalignr $8,%xmm0,%xmm1,%xmm11 - movq %r9,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r8,%r13 - xorq %r10,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rax,%r14 - vpaddq %xmm11,%xmm4,%xmm4 - andq %r8,%r12 - xorq %r8,%r13 - addq 64(%rsp),%r11 - movq %rax,%r15 -.byte 143,72,120,195,209,7 - xorq %r10,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,219,3 - xorq %rax,%r14 - addq %r13,%r11 - vpxor %xmm10,%xmm8,%xmm8 - xorq %rbx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm3,%xmm10 - addq %r11,%rdx - addq %rdi,%r11 - vpaddq %xmm8,%xmm4,%xmm4 - movq %rdx,%r13 - addq %r11,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r11 - vpxor %xmm10,%xmm11,%xmm11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - vpaddq %xmm11,%xmm4,%xmm4 - addq 72(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - vpaddq 0(%rbp),%xmm4,%xmm10 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - vmovdqa %xmm10,64(%rsp) - vpalignr $8,%xmm5,%xmm6,%xmm8 - rorq $23,%r13 - movq %r14,%r10 - vpalignr $8,%xmm1,%xmm2,%xmm11 - movq %rdx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rcx,%r13 - xorq %r8,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r10,%r14 - vpaddq %xmm11,%xmm5,%xmm5 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 80(%rsp),%r9 - movq %r10,%r15 -.byte 143,72,120,195,209,7 - xorq %r8,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,220,3 - xorq %r10,%r14 - addq %r13,%r9 - vpxor %xmm10,%xmm8,%xmm8 - xorq %r11,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm4,%xmm10 - addq %r9,%rbx - addq %rdi,%r9 - vpaddq %xmm8,%xmm5,%xmm5 - movq %rbx,%r13 - addq %r9,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq 
%r14,%r9 - vpxor %xmm10,%xmm11,%xmm11 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - vpaddq %xmm11,%xmm5,%xmm5 - addq 88(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - vpaddq 32(%rbp),%xmm5,%xmm10 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - vmovdqa %xmm10,80(%rsp) - vpalignr $8,%xmm6,%xmm7,%xmm8 - rorq $23,%r13 - movq %r14,%r8 - vpalignr $8,%xmm2,%xmm3,%xmm11 - movq %rbx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rax,%r13 - xorq %rcx,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r8,%r14 - vpaddq %xmm11,%xmm6,%xmm6 - andq %rax,%r12 - xorq %rax,%r13 - addq 96(%rsp),%rdx - movq %r8,%r15 -.byte 143,72,120,195,209,7 - xorq %rcx,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,221,3 - xorq %r8,%r14 - addq %r13,%rdx - vpxor %xmm10,%xmm8,%xmm8 - xorq %r9,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm5,%xmm10 - addq %rdx,%r11 - addq %rdi,%rdx - vpaddq %xmm8,%xmm6,%xmm6 - movq %r11,%r13 - addq %rdx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rdx - vpxor %xmm10,%xmm11,%xmm11 - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - vpaddq %xmm11,%xmm6,%xmm6 - addq 104(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - vpaddq 64(%rbp),%xmm6,%xmm10 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - vmovdqa %xmm10,96(%rsp) - vpalignr $8,%xmm7,%xmm0,%xmm8 - rorq $23,%r13 - movq %r14,%rcx - vpalignr $8,%xmm3,%xmm4,%xmm11 - movq %r11,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r10,%r13 - xorq %rax,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rcx,%r14 - vpaddq %xmm11,%xmm7,%xmm7 - andq %r10,%r12 - xorq %r10,%r13 - addq 112(%rsp),%rbx - movq %rcx,%r15 -.byte 143,72,120,195,209,7 - xorq %rax,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,222,3 - xorq %rcx,%r14 - addq %r13,%rbx - vpxor %xmm10,%xmm8,%xmm8 - xorq %rdx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm6,%xmm10 - addq %rbx,%r9 - addq %rdi,%rbx - vpaddq %xmm8,%xmm7,%xmm7 - movq %r9,%r13 - addq %rbx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rbx - vpxor %xmm10,%xmm11,%xmm11 - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - vpaddq %xmm11,%xmm7,%xmm7 - addq 120(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - rorq $6,%r14 - vpaddq 96(%rbp),%xmm7,%xmm10 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - vmovdqa %xmm10,112(%rsp) - cmpb $0,135(%rbp) - jne L$xop_00_47 - rorq $23,%r13 - movq %r14,%rax - movq %r9,%r12 - rorq $5,%r14 - xorq %r8,%r13 - xorq %r10,%r12 - rorq $4,%r13 - xorq %rax,%r14 - andq %r8,%r12 - xorq %r8,%r13 - addq 0(%rsp),%r11 - movq %rax,%r15 - xorq %r10,%r12 - rorq $6,%r14 - xorq %rbx,%r15 - addq %r12,%r11 - rorq 
$14,%r13 - andq %r15,%rdi - xorq %rax,%r14 - addq %r13,%r11 - xorq %rbx,%rdi - rorq $28,%r14 - addq %r11,%rdx - addq %rdi,%r11 - movq %rdx,%r13 - addq %r11,%r14 - rorq $23,%r13 - movq %r14,%r11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - addq 8(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - rorq $23,%r13 - movq %r14,%r10 - movq %rdx,%r12 - rorq $5,%r14 - xorq %rcx,%r13 - xorq %r8,%r12 - rorq $4,%r13 - xorq %r10,%r14 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 16(%rsp),%r9 - movq %r10,%r15 - xorq %r8,%r12 - rorq $6,%r14 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi - xorq %r10,%r14 - addq %r13,%r9 - xorq %r11,%rdi - rorq $28,%r14 - addq %r9,%rbx - addq %rdi,%r9 - movq %rbx,%r13 - addq %r9,%r14 - rorq $23,%r13 - movq %r14,%r9 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - addq 24(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - rorq $23,%r13 - movq %r14,%r8 - movq %rbx,%r12 - rorq $5,%r14 - xorq %rax,%r13 - xorq %rcx,%r12 - rorq $4,%r13 - xorq %r8,%r14 - andq %rax,%r12 - xorq %rax,%r13 - addq 32(%rsp),%rdx - movq %r8,%r15 - xorq %rcx,%r12 - rorq $6,%r14 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi - xorq %r8,%r14 - addq %r13,%rdx - xorq %r9,%rdi - rorq $28,%r14 - addq %rdx,%r11 - addq %rdi,%rdx - movq %r11,%r13 - addq %rdx,%r14 - rorq $23,%r13 - movq %r14,%rdx - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - addq 40(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - rorq $23,%r13 - movq %r14,%rcx - movq %r11,%r12 - rorq $5,%r14 - xorq %r10,%r13 - xorq %rax,%r12 - rorq $4,%r13 - xorq %rcx,%r14 - andq %r10,%r12 - xorq %r10,%r13 - addq 48(%rsp),%rbx - movq %rcx,%r15 - xorq %rax,%r12 - rorq $6,%r14 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi - xorq %rcx,%r14 - addq %r13,%rbx - xorq %rdx,%rdi - rorq $28,%r14 - addq %rbx,%r9 - addq %rdi,%rbx - movq %r9,%r13 - addq %rbx,%r14 - rorq $23,%r13 - movq %r14,%rbx - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - addq 56(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - rorq $6,%r14 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - rorq $23,%r13 - movq %r14,%rax - movq %r9,%r12 - rorq $5,%r14 - xorq %r8,%r13 - xorq %r10,%r12 - rorq $4,%r13 - xorq %rax,%r14 - andq %r8,%r12 - xorq %r8,%r13 - addq 64(%rsp),%r11 - movq %rax,%r15 - xorq %r10,%r12 - rorq $6,%r14 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi - xorq %rax,%r14 - addq %r13,%r11 - xorq %rbx,%rdi - rorq $28,%r14 - addq 
%r11,%rdx - addq %rdi,%r11 - movq %rdx,%r13 - addq %r11,%r14 - rorq $23,%r13 - movq %r14,%r11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - addq 72(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - rorq $23,%r13 - movq %r14,%r10 - movq %rdx,%r12 - rorq $5,%r14 - xorq %rcx,%r13 - xorq %r8,%r12 - rorq $4,%r13 - xorq %r10,%r14 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 80(%rsp),%r9 - movq %r10,%r15 - xorq %r8,%r12 - rorq $6,%r14 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi - xorq %r10,%r14 - addq %r13,%r9 - xorq %r11,%rdi - rorq $28,%r14 - addq %r9,%rbx - addq %rdi,%r9 - movq %rbx,%r13 - addq %r9,%r14 - rorq $23,%r13 - movq %r14,%r9 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - addq 88(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - rorq $23,%r13 - movq %r14,%r8 - movq %rbx,%r12 - rorq $5,%r14 - xorq %rax,%r13 - xorq %rcx,%r12 - rorq $4,%r13 - xorq %r8,%r14 - andq %rax,%r12 - xorq %rax,%r13 - addq 96(%rsp),%rdx - movq %r8,%r15 - xorq %rcx,%r12 - rorq $6,%r14 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi - xorq %r8,%r14 - addq %r13,%rdx - xorq %r9,%rdi - rorq $28,%r14 - addq %rdx,%r11 - addq %rdi,%rdx - movq %r11,%r13 - addq %rdx,%r14 - rorq $23,%r13 - movq %r14,%rdx - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - addq 104(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - rorq $23,%r13 - movq %r14,%rcx - movq %r11,%r12 - rorq $5,%r14 - xorq %r10,%r13 - xorq %rax,%r12 - rorq $4,%r13 - xorq %rcx,%r14 - andq %r10,%r12 - xorq %r10,%r13 - addq 112(%rsp),%rbx - movq %rcx,%r15 - xorq %rax,%r12 - rorq $6,%r14 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi - xorq %rcx,%r14 - addq %r13,%rbx - xorq %rdx,%rdi - rorq $28,%r14 - addq %rbx,%r9 - addq %rdi,%rbx - movq %r9,%r13 - addq %rbx,%r14 - rorq $23,%r13 - movq %r14,%rbx - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - addq 120(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - rorq $6,%r14 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - movq 128+0(%rsp),%rdi - movq %r14,%rax - - addq 0(%rdi),%rax - leaq 128(%rsi),%rsi - addq 8(%rdi),%rbx - addq 16(%rdi),%rcx - addq 24(%rdi),%rdx - addq 32(%rdi),%r8 - addq 40(%rdi),%r9 - addq 48(%rdi),%r10 - addq 56(%rdi),%r11 - - cmpq 128+16(%rsp),%rsi - - movq %rax,0(%rdi) - movq %rbx,8(%rdi) - movq %rcx,16(%rdi) - movq %rdx,24(%rdi) - movq %r8,32(%rdi) - movq %r9,40(%rdi) - movq %r10,48(%rdi) - movq %r11,56(%rdi) - jb L$loop_xop - - movq 128+24(%rsp),%rsi - 
vzeroupper - movq -48(%rsi),%r15 - movq -40(%rsi),%r14 - movq -32(%rsi),%r13 - movq -24(%rsi),%r12 - movq -16(%rsi),%rbp - movq -8(%rsi),%rbx - leaq (%rsi),%rsp -L$epilogue_xop: - .byte 0xf3,0xc3 - - .p2align 6 sha512_block_data_order_avx: + L$avx_shortcut: movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + shlq $4,%rdx subq $160,%rsp leaq (%rsi,%rdx,8),%rdx @@ -2895,7 +1848,8 @@ L$avx_shortcut: movq %rdi,128+0(%rsp) movq %rsi,128+8(%rsp) movq %rdx,128+16(%rsp) - movq %rax,128+24(%rsp) + movq %rax,152(%rsp) + L$prologue_avx: vzeroupper @@ -4012,16 +2966,25 @@ L$avx_00_47: movq %r11,56(%rdi) jb L$loop_avx - movq 128+24(%rsp),%rsi + movq 152(%rsp),%rsi + vzeroupper movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$epilogue_avx: .byte 0xf3,0xc3 + #endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S index 867df68b4b..cd52d67e60 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text @@ -19,6 +31,7 @@ .p2align 4 _vpaes_encrypt_core: + movq %rdx,%r9 movq $16,%r11 movl 240(%rdx),%eax @@ -107,8 +120,185 @@ L$enc_entry: + + + + + + + + + + + + + + + + + + + + + + + + + +.p2align 4 +_vpaes_encrypt_core_2x: + + movq %rdx,%r9 + movq $16,%r11 + movl 240(%rdx),%eax + movdqa %xmm9,%xmm1 + movdqa %xmm9,%xmm7 + movdqa L$k_ipt(%rip),%xmm2 + movdqa %xmm2,%xmm8 + pandn %xmm0,%xmm1 + pandn %xmm6,%xmm7 + movdqu (%r9),%xmm5 + + psrld $4,%xmm1 + psrld $4,%xmm7 + pand %xmm9,%xmm0 + pand %xmm9,%xmm6 +.byte 102,15,56,0,208 +.byte 102,68,15,56,0,198 + movdqa L$k_ipt+16(%rip),%xmm0 + movdqa %xmm0,%xmm6 +.byte 102,15,56,0,193 +.byte 102,15,56,0,247 + pxor %xmm5,%xmm2 + pxor %xmm5,%xmm8 + addq $16,%r9 + pxor %xmm2,%xmm0 + pxor %xmm8,%xmm6 + leaq L$k_mc_backward(%rip),%r10 + jmp L$enc2x_entry + +.p2align 4 +L$enc2x_loop: + + movdqa L$k_sb1(%rip),%xmm4 + movdqa L$k_sb1+16(%rip),%xmm0 + movdqa %xmm4,%xmm12 + movdqa %xmm0,%xmm6 +.byte 102,15,56,0,226 +.byte 102,69,15,56,0,224 +.byte 102,15,56,0,195 +.byte 102,65,15,56,0,243 + pxor %xmm5,%xmm4 + pxor %xmm5,%xmm12 + movdqa L$k_sb2(%rip),%xmm5 + movdqa %xmm5,%xmm13 + pxor %xmm4,%xmm0 + pxor %xmm12,%xmm6 + movdqa -64(%r11,%r10,1),%xmm1 + +.byte 102,15,56,0,234 +.byte 102,69,15,56,0,232 + movdqa (%r11,%r10,1),%xmm4 + + movdqa L$k_sb2+16(%rip),%xmm2 + movdqa %xmm2,%xmm8 +.byte 102,15,56,0,211 +.byte 102,69,15,56,0,195 + movdqa %xmm0,%xmm3 + movdqa %xmm6,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm13,%xmm8 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + addq $16,%r9 + pxor %xmm2,%xmm0 + pxor %xmm8,%xmm6 +.byte 102,15,56,0,220 +.byte 102,68,15,56,0,220 + addq $16,%r11 + pxor %xmm0,%xmm3 + pxor %xmm6,%xmm11 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + andq $0x30,%r11 + subq $1,%rax + pxor %xmm3,%xmm0 + pxor %xmm11,%xmm6 + +L$enc2x_entry: + + movdqa %xmm9,%xmm1 + movdqa %xmm9,%xmm7 + movdqa L$k_inv+16(%rip),%xmm5 + movdqa %xmm5,%xmm13 + pandn 
%xmm0,%xmm1 + pandn %xmm6,%xmm7 + psrld $4,%xmm1 + psrld $4,%xmm7 + pand %xmm9,%xmm0 + pand %xmm9,%xmm6 +.byte 102,15,56,0,232 +.byte 102,68,15,56,0,238 + movdqa %xmm10,%xmm3 + movdqa %xmm10,%xmm11 + pxor %xmm1,%xmm0 + pxor %xmm7,%xmm6 +.byte 102,15,56,0,217 +.byte 102,68,15,56,0,223 + movdqa %xmm10,%xmm4 + movdqa %xmm10,%xmm12 + pxor %xmm5,%xmm3 + pxor %xmm13,%xmm11 +.byte 102,15,56,0,224 +.byte 102,68,15,56,0,230 + movdqa %xmm10,%xmm2 + movdqa %xmm10,%xmm8 + pxor %xmm5,%xmm4 + pxor %xmm13,%xmm12 +.byte 102,15,56,0,211 +.byte 102,69,15,56,0,195 + movdqa %xmm10,%xmm3 + movdqa %xmm10,%xmm11 + pxor %xmm0,%xmm2 + pxor %xmm6,%xmm8 +.byte 102,15,56,0,220 +.byte 102,69,15,56,0,220 + movdqu (%r9),%xmm5 + + pxor %xmm1,%xmm3 + pxor %xmm7,%xmm11 + jnz L$enc2x_loop + + + movdqa -96(%r10),%xmm4 + movdqa -80(%r10),%xmm0 + movdqa %xmm4,%xmm12 + movdqa %xmm0,%xmm6 +.byte 102,15,56,0,226 +.byte 102,69,15,56,0,224 + pxor %xmm5,%xmm4 + pxor %xmm5,%xmm12 +.byte 102,15,56,0,195 +.byte 102,65,15,56,0,243 + movdqa 64(%r11,%r10,1),%xmm1 + + pxor %xmm4,%xmm0 + pxor %xmm12,%xmm6 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + .byte 0xf3,0xc3 + + + + + + + + + .p2align 4 _vpaes_decrypt_core: + movq %rdx,%r9 movl 240(%rdx),%eax movdqa %xmm9,%xmm1 @@ -213,6 +403,7 @@ L$dec_entry: + .p2align 4 _vpaes_schedule_core: @@ -220,6 +411,7 @@ _vpaes_schedule_core: + call _vpaes_preheat movdqa L$k_rcon(%rip),%xmm8 movdqu (%rdi),%xmm0 @@ -398,8 +590,10 @@ L$schedule_mangle_last_dec: + .p2align 4 _vpaes_schedule_192_smear: + pshufd $0x80,%xmm6,%xmm1 pshufd $0xFE,%xmm7,%xmm0 pxor %xmm1,%xmm6 @@ -427,11 +621,13 @@ _vpaes_schedule_192_smear: + .p2align 4 _vpaes_schedule_round: + pxor %xmm1,%xmm1 .byte 102,65,15,58,15,200,15 .byte 102,69,15,58,15,192,15 @@ -496,8 +692,10 @@ _vpaes_schedule_low_round: + .p2align 4 _vpaes_schedule_transform: + movdqa %xmm9,%xmm1 pandn %xmm0,%xmm1 psrld $4,%xmm1 @@ -532,10 +730,12 @@ _vpaes_schedule_transform: + .p2align 4 _vpaes_schedule_mangle: + movdqa %xmm0,%xmm4 movdqa L$k_mc_forward(%rip),%xmm5 testq %rcx,%rcx @@ -605,11 +805,18 @@ L$schedule_mangle_both: + .globl _vpaes_set_encrypt_key .private_extern _vpaes_set_encrypt_key .p2align 4 _vpaes_set_encrypt_key: + +#ifdef BORINGSSL_DISPATCH_TEST + + movb $1,_BORINGSSL_function_hit+5(%rip) +#endif + movl %esi,%eax shrl $5,%eax addl $5,%eax @@ -622,11 +829,13 @@ _vpaes_set_encrypt_key: .byte 0xf3,0xc3 + .globl _vpaes_set_decrypt_key .private_extern _vpaes_set_decrypt_key .p2align 4 _vpaes_set_decrypt_key: + movl %esi,%eax shrl $5,%eax addl $5,%eax @@ -644,11 +853,17 @@ _vpaes_set_decrypt_key: .byte 0xf3,0xc3 + .globl _vpaes_encrypt .private_extern _vpaes_encrypt .p2align 4 _vpaes_encrypt: + +#ifdef BORINGSSL_DISPATCH_TEST + + movb $1,_BORINGSSL_function_hit+4(%rip) +#endif movdqu (%rdi),%xmm0 call _vpaes_preheat call _vpaes_encrypt_core @@ -656,22 +871,26 @@ _vpaes_encrypt: .byte 0xf3,0xc3 + .globl _vpaes_decrypt .private_extern _vpaes_decrypt .p2align 4 _vpaes_decrypt: + movdqu (%rdi),%xmm0 call _vpaes_preheat call _vpaes_decrypt_core movdqu %xmm0,(%rsi) .byte 0xf3,0xc3 + .globl _vpaes_cbc_encrypt .private_extern _vpaes_cbc_encrypt .p2align 4 _vpaes_cbc_encrypt: + xchgq %rcx,%rdx subq $16,%rcx jc L$cbc_abort @@ -709,6 +928,70 @@ L$cbc_abort: .byte 0xf3,0xc3 +.globl _vpaes_ctr32_encrypt_blocks +.private_extern _vpaes_ctr32_encrypt_blocks + +.p2align 4 +_vpaes_ctr32_encrypt_blocks: + + + xchgq %rcx,%rdx + testq %rcx,%rcx + jz L$ctr32_abort + movdqu (%r8),%xmm0 + movdqa L$ctr_add_one(%rip),%xmm8 + subq %rdi,%rsi + call _vpaes_preheat + movdqa 
%xmm0,%xmm6 + pshufb L$rev_ctr(%rip),%xmm6 + + testq $1,%rcx + jz L$ctr32_prep_loop + + + + movdqu (%rdi),%xmm7 + call _vpaes_encrypt_core + pxor %xmm7,%xmm0 + paddd %xmm8,%xmm6 + movdqu %xmm0,(%rsi,%rdi,1) + subq $1,%rcx + leaq 16(%rdi),%rdi + jz L$ctr32_done + +L$ctr32_prep_loop: + + + movdqa %xmm6,%xmm14 + movdqa %xmm6,%xmm15 + paddd %xmm8,%xmm15 + +L$ctr32_loop: + movdqa L$rev_ctr(%rip),%xmm1 + movdqa %xmm14,%xmm0 + movdqa %xmm15,%xmm6 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + call _vpaes_encrypt_core_2x + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa L$ctr_add_two(%rip),%xmm3 + pxor %xmm1,%xmm0 + pxor %xmm2,%xmm6 + paddd %xmm3,%xmm14 + paddd %xmm3,%xmm15 + movdqu %xmm0,(%rsi,%rdi,1) + movdqu %xmm6,16(%rsi,%rdi,1) + subq $2,%rcx + leaq 32(%rdi),%rdi + jnz L$ctr32_loop + +L$ctr32_done: +L$ctr32_abort: + .byte 0xf3,0xc3 + + + @@ -717,6 +1000,7 @@ L$cbc_abort: .p2align 4 _vpaes_preheat: + leaq L$k_s0F(%rip),%r10 movdqa -32(%r10),%xmm10 movdqa -16(%r10),%xmm11 @@ -733,6 +1017,7 @@ _vpaes_preheat: + .p2align 6 _vpaes_consts: L$k_inv: @@ -828,6 +1113,17 @@ L$k_dsbe: L$k_dsbo: .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C + + +L$rev_ctr: +.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 + + +L$ctr_add_one: +.quad 0x0000000000000000, 0x0000000100000000 +L$ctr_add_two: +.quad 0x0000000000000000, 0x0000000200000000 + .byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 .p2align 6 diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/x86_64-mont.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/x86_64-mont.S index 4904417a20..8d6444cb6f 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/x86_64-mont.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/x86_64-mont.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text @@ -16,6 +28,8 @@ _bn_mul_mont: jnz L$mul_enter cmpl $8,%r9d jb L$mul_enter + leaq _OPENSSL_ia32cap_P(%rip),%r11 + movl 8(%r11),%r11d cmpq %rsi,%rdx jne L$mul4x_enter testl $7,%r9d @@ -207,31 +221,30 @@ L$inner_enter: xorq %r14,%r14 movq (%rsp),%rax - leaq (%rsp),%rsi movq %r9,%r15 - jmp L$sub + .p2align 4 -L$sub: - sbbq (%rcx,%r14,8),%rax +L$sub: sbbq (%rcx,%r14,8),%rax movq %rax,(%rdi,%r14,8) - movq 8(%rsi,%r14,8),%rax + movq 8(%rsp,%r14,8),%rax leaq 1(%r14),%r14 decq %r15 jnz L$sub sbbq $0,%rax + movq $-1,%rbx + xorq %rax,%rbx xorq %r14,%r14 - andq %rax,%rsi - notq %rax - movq %rdi,%rcx - andq %rax,%rcx movq %r9,%r15 - orq %rcx,%rsi -.p2align 4 + L$copy: - movq (%rsi,%r14,8),%rax - movq %r14,(%rsp,%r14,8) - movq %rax,(%rdi,%r14,8) + movq (%rdi,%r14,8),%rcx + movq (%rsp,%r14,8),%rdx + andq %rbx,%rcx + andq %rax,%rdx + movq %r9,(%rsp,%r14,8) + orq %rcx,%rdx + movq %rdx,(%rdi,%r14,8) leaq 1(%r14),%r14 subq $1,%r15 jnz L$copy @@ -265,6 +278,9 @@ bn_mul4x_mont: movq %rsp,%rax L$mul4x_enter: + andl $0x80100,%r11d + cmpl $0x80100,%r11d + je L$mulx4x_enter pushq %rbx pushq %rbp @@ -602,7 +618,6 @@ L$inner4x: movq 16(%rsp,%r9,8),%rdi leaq -4(%r9),%r15 movq 0(%rsp),%rax - pxor %xmm0,%xmm0 movq 8(%rsp),%rdx shrq $2,%r15 leaq (%rsp),%rsi @@ -612,8 +627,7 @@ L$inner4x: movq 16(%rsi),%rbx movq 24(%rsi),%rbp sbbq 8(%rcx),%rdx - jmp L$sub4x -.p2align 4 + L$sub4x: movq %rax,0(%rdi,%r14,8) movq %rdx,8(%rdi,%r14,8) @@ -640,34 +654,35 @@ L$sub4x: sbbq $0,%rax movq %rbp,24(%rdi,%r14,8) - xorq %r14,%r14 - andq %rax,%rsi - notq %rax - movq %rdi,%rcx - andq %rax,%rcx - leaq -4(%r9),%r15 - orq %rcx,%rsi + pxor %xmm0,%xmm0 +.byte 102,72,15,110,224 + pcmpeqd %xmm5,%xmm5 + pshufd $0,%xmm4,%xmm4 + movq %r9,%r15 + pxor %xmm4,%xmm5 shrq $2,%r15 + xorl %eax,%eax - movdqu (%rsi),%xmm1 - movdqa %xmm0,(%rsp) - movdqu %xmm1,(%rdi) jmp L$copy4x .p2align 4 L$copy4x: - movdqu 16(%rsi,%r14,1),%xmm2 - movdqu 32(%rsi,%r14,1),%xmm1 - movdqa %xmm0,16(%rsp,%r14,1) - movdqu %xmm2,16(%rdi,%r14,1) - movdqa %xmm0,32(%rsp,%r14,1) - movdqu %xmm1,32(%rdi,%r14,1) - leaq 32(%r14),%r14 + movdqa (%rsp,%rax,1),%xmm1 + movdqu (%rdi,%rax,1),%xmm2 + pand %xmm4,%xmm1 + pand %xmm5,%xmm2 + movdqa 16(%rsp,%rax,1),%xmm3 + movdqa %xmm0,(%rsp,%rax,1) + por %xmm2,%xmm1 + movdqu 16(%rdi,%rax,1),%xmm2 + movdqu %xmm1,(%rdi,%rax,1) + pand %xmm4,%xmm3 + pand %xmm5,%xmm2 + movdqa %xmm0,16(%rsp,%rax,1) + por %xmm2,%xmm3 + movdqu %xmm3,16(%rdi,%rax,1) + leaq 32(%rax),%rax decq %r15 jnz L$copy4x - - movdqu 16(%rsi,%r14,1),%xmm2 - movdqa %xmm0,16(%rsp,%r14,1) - movdqu %xmm2,16(%rdi,%r14,1) movq 8(%rsp,%r9,8),%rsi movq $1,%rax @@ -692,6 +707,7 @@ L$mul4x_epilogue: + .p2align 5 bn_sqr8x_mont: @@ -772,6 +788,26 @@ L$sqr8x_body: pxor %xmm0,%xmm0 .byte 102,72,15,110,207 .byte 102,73,15,110,218 + leaq _OPENSSL_ia32cap_P(%rip),%rax + movl 8(%rax),%eax + andl $0x80100,%eax + cmpl $0x80100,%eax + jne L$sqr8x_nox + + call _bn_sqrx8x_internal + + + + + leaq (%r8,%rcx,1),%rbx + movq %rcx,%r9 + movq %rcx,%rdx +.byte 102,72,15,126,207 + sarq $3+2,%rcx + jmp L$sqr8x_sub + +.p2align 5 +L$sqr8x_nox: call _bn_sqr8x_internal @@ -859,6 +895,362 @@ L$sqr8x_epilogue: .byte 0xf3,0xc3 + +.p2align 5 +bn_mulx4x_mont: + + movq %rsp,%rax + +L$mulx4x_enter: + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + 
+L$mulx4x_prologue: + + shll $3,%r9d + xorq %r10,%r10 + subq %r9,%r10 + movq (%r8),%r8 + leaq -72(%rsp,%r10,1),%rbp + andq $-128,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$mulx4x_page_walk + jmp L$mulx4x_page_walk_done + +.p2align 4 +L$mulx4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$mulx4x_page_walk +L$mulx4x_page_walk_done: + + leaq (%rdx,%r9,1),%r10 + + + + + + + + + + + + + movq %r9,0(%rsp) + shrq $5,%r9 + movq %r10,16(%rsp) + subq $1,%r9 + movq %r8,24(%rsp) + movq %rdi,32(%rsp) + movq %rax,40(%rsp) + + movq %r9,48(%rsp) + jmp L$mulx4x_body + +.p2align 5 +L$mulx4x_body: + leaq 8(%rdx),%rdi + movq (%rdx),%rdx + leaq 64+32(%rsp),%rbx + movq %rdx,%r9 + + mulxq 0(%rsi),%r8,%rax + mulxq 8(%rsi),%r11,%r14 + addq %rax,%r11 + movq %rdi,8(%rsp) + mulxq 16(%rsi),%r12,%r13 + adcq %r14,%r12 + adcq $0,%r13 + + movq %r8,%rdi + imulq 24(%rsp),%r8 + xorq %rbp,%rbp + + mulxq 24(%rsi),%rax,%r14 + movq %r8,%rdx + leaq 32(%rsi),%rsi + adcxq %rax,%r13 + adcxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%rdi + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 +.byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 + movq 48(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + adcxq %rax,%r12 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r12,-16(%rbx) + + jmp L$mulx4x_1st + +.p2align 5 +L$mulx4x_1st: + adcxq %rbp,%r15 + mulxq 0(%rsi),%r10,%rax + adcxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 +.byte 0x67,0x67 + movq %r8,%rdx + adcxq %rax,%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + movq %r11,-32(%rbx) + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz L$mulx4x_1st + + movq 0(%rsp),%rax + movq 8(%rsp),%rdi + adcq %rbp,%r15 + addq %r15,%r14 + sbbq %r15,%r15 + movq %r14,-8(%rbx) + jmp L$mulx4x_outer + +.p2align 5 +L$mulx4x_outer: + movq (%rdi),%rdx + leaq 8(%rdi),%rdi + subq %rax,%rsi + movq %r15,(%rbx) + leaq 64+32(%rsp),%rbx + subq %rax,%rcx + + mulxq 0(%rsi),%r8,%r11 + xorl %ebp,%ebp + movq %rdx,%r9 + mulxq 8(%rsi),%r14,%r12 + adoxq -32(%rbx),%r8 + adcxq %r14,%r11 + mulxq 16(%rsi),%r15,%r13 + adoxq -24(%rbx),%r11 + adcxq %r15,%r12 + adoxq -16(%rbx),%r12 + adcxq %rbp,%r13 + adoxq %rbp,%r13 + + movq %rdi,8(%rsp) + movq %r8,%r15 + imulq 24(%rsp),%r8 + xorl %ebp,%ebp + + mulxq 24(%rsi),%rax,%r14 + movq %r8,%rdx + adcxq %rax,%r13 + adoxq -8(%rbx),%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + adoxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + leaq 32(%rcx),%rcx + adcxq %rax,%r12 + adoxq %rbp,%r15 + movq 48(%rsp),%rdi + movq %r12,-16(%rbx) + + jmp L$mulx4x_inner + +.p2align 5 +L$mulx4x_inner: + mulxq 0(%rsi),%r10,%rax + adcxq %rbp,%r15 + adoxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq 0(%rbx),%r10 + adoxq 
%rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq 8(%rbx),%r11 + adoxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 + movq %r8,%rdx + adcxq 16(%rbx),%r12 + adoxq %rax,%r13 + adcxq 24(%rbx),%r13 + adoxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + adcxq %rbp,%r14 + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-32(%rbx) + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz L$mulx4x_inner + + movq 0(%rsp),%rax + movq 8(%rsp),%rdi + adcq %rbp,%r15 + subq 0(%rbx),%rbp + adcq %r15,%r14 + sbbq %r15,%r15 + movq %r14,-8(%rbx) + + cmpq 16(%rsp),%rdi + jne L$mulx4x_outer + + leaq 64(%rsp),%rbx + subq %rax,%rcx + negq %r15 + movq %rax,%rdx + shrq $3+2,%rax + movq 32(%rsp),%rdi + jmp L$mulx4x_sub + +.p2align 5 +L$mulx4x_sub: + movq 0(%rbx),%r11 + movq 8(%rbx),%r12 + movq 16(%rbx),%r13 + movq 24(%rbx),%r14 + leaq 32(%rbx),%rbx + sbbq 0(%rcx),%r11 + sbbq 8(%rcx),%r12 + sbbq 16(%rcx),%r13 + sbbq 24(%rcx),%r14 + leaq 32(%rcx),%rcx + movq %r11,0(%rdi) + movq %r12,8(%rdi) + movq %r13,16(%rdi) + movq %r14,24(%rdi) + leaq 32(%rdi),%rdi + decq %rax + jnz L$mulx4x_sub + + sbbq $0,%r15 + leaq 64(%rsp),%rbx + subq %rdx,%rdi + +.byte 102,73,15,110,207 + pxor %xmm0,%xmm0 + pshufd $0,%xmm1,%xmm1 + movq 40(%rsp),%rsi + + jmp L$mulx4x_cond_copy + +.p2align 5 +L$mulx4x_cond_copy: + movdqa 0(%rbx),%xmm2 + movdqa 16(%rbx),%xmm3 + leaq 32(%rbx),%rbx + movdqu 0(%rdi),%xmm4 + movdqu 16(%rdi),%xmm5 + leaq 32(%rdi),%rdi + movdqa %xmm0,-32(%rbx) + movdqa %xmm0,-16(%rbx) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-32(%rdi) + movdqu %xmm5,-16(%rdi) + subq $32,%rdx + jnz L$mulx4x_cond_copy + + movq %rdx,(%rbx) + + movq $1,%rax + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$mulx4x_epilogue: + .byte 0xf3,0xc3 + + .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .p2align 4 #endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/x86_64-mont5.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/x86_64-mont5.S index abc65f1192..4bd36feae4 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/x86_64-mont5.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/x86_64-mont5.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text @@ -14,6 +26,8 @@ _bn_mul_mont_gather5: testl $7,%r9d jnz L$mul_enter + leaq _OPENSSL_ia32cap_P(%rip),%r11 + movl 8(%r11),%r11d jmp L$mul4x_enter .p2align 4 @@ -395,8 +409,7 @@ L$inner_enter: movq %r9,%r15 jmp L$sub .p2align 4 -L$sub: - sbbq (%rcx,%r14,8),%rax +L$sub: sbbq (%rcx,%r14,8),%rax movq %rax,(%rdi,%r14,8) movq 8(%rsi,%r14,8),%rax leaq 1(%r14),%r14 @@ -404,18 +417,19 @@ L$sub: jnz L$sub sbbq $0,%rax + movq $-1,%rbx + xorq %rax,%rbx xorq %r14,%r14 - andq %rax,%rsi - notq %rax - movq %rdi,%rcx - andq %rax,%rcx movq %r9,%r15 - orq %rcx,%rsi -.p2align 4 + L$copy: - movq (%rsi,%r14,8),%rax + movq (%rdi,%r14,8),%rcx + movq (%rsp,%r14,8),%rdx + andq %rbx,%rcx + andq %rax,%rdx movq %r14,(%rsp,%r14,8) - movq %rax,(%rdi,%r14,8) + orq %rcx,%rdx + movq %rdx,(%rdi,%r14,8) leaq 1(%r14),%r14 subq $1,%r15 jnz L$copy @@ -450,6 +464,9 @@ bn_mul4x_mont_gather5: movq %rsp,%rax L$mul4x_enter: + andl $0x80108,%r11d + cmpl $0x80108,%r11d + je L$mulx4x_enter pushq %rbx pushq %rbp @@ -548,6 +565,7 @@ L$mul4x_epilogue: .p2align 5 mul4x_internal: + shlq $5,%r9 movd 8(%rax),%xmm5 leaq L$inc(%rip),%rax @@ -1070,6 +1088,7 @@ L$inner4x: movq 24(%rbp),%r15 jmp L$sqr4x_sub_entry + .globl _bn_power5 .private_extern _bn_power5 @@ -1078,6 +1097,11 @@ _bn_power5: movq %rsp,%rax + leaq _OPENSSL_ia32cap_P(%rip),%r11 + movl 8(%r11),%r11d + andl $0x80108,%r11d + cmpl $0x80108,%r11d + je L$powerx5_enter pushq %rbx pushq %rbp @@ -1280,6 +1304,7 @@ __bn_sqr8x_internal: + leaq 32(%r10),%rbp @@ -1985,8 +2010,10 @@ L$8x_no_tail: .byte 0xf3,0xc3 + .p2align 5 __bn_post4x_internal: + movq 0(%rbp),%r12 leaq (%rdi,%r9,1),%rbx movq %r9,%rcx @@ -2038,11 +2065,13 @@ L$sqr4x_sub_entry: negq %r9 .byte 0xf3,0xc3 + .globl _bn_from_montgomery .private_extern _bn_from_montgomery .p2align 5 _bn_from_montgomery: + testl $7,%r9d jz bn_from_mont8x xorl %eax,%eax @@ -2050,6 +2079,7 @@ _bn_from_montgomery: + .p2align 5 bn_from_mont8x: @@ -2163,6 +2193,22 @@ L$mul_by_1: .byte 0x67 movq %rcx,%rbp .byte 102,73,15,110,218 + leaq _OPENSSL_ia32cap_P(%rip),%r11 + movl 8(%r11),%r11d + andl $0x80108,%r11d + cmpl $0x80108,%r11d + jne L$from_mont_nox + + leaq (%rax,%r9,1),%rdi + call __bn_sqrx8x_reduction + call __bn_postx4x_internal + + pxor %xmm0,%xmm0 + leaq 48(%rsp),%rax + jmp L$from_mont_zero + +.p2align 5 +L$from_mont_nox: call __bn_sqr8x_reduction call __bn_post4x_internal @@ -2201,11 +2247,1356 @@ L$from_epilogue: .byte 0xf3,0xc3 + +.p2align 5 +bn_mulx4x_mont_gather5: + + movq %rsp,%rax + +L$mulx4x_enter: + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$mulx4x_prologue: + + shll $3,%r9d + leaq (%r9,%r9,2),%r10 + negq %r9 + movq (%r8),%r8 + + + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb L$mulx4xsp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp L$mulx4xsp_done + +L$mulx4xsp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +L$mulx4xsp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$mulx4x_page_walk + jmp L$mulx4x_page_walk_done + +L$mulx4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + 
ja L$mulx4x_page_walk +L$mulx4x_page_walk_done: + + + + + + + + + + + + + + movq %r8,32(%rsp) + movq %rax,40(%rsp) + +L$mulx4x_body: + call mulx4x_internal + + movq 40(%rsp),%rsi + + movq $1,%rax + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$mulx4x_epilogue: + .byte 0xf3,0xc3 + + + + +.p2align 5 +mulx4x_internal: + + movq %r9,8(%rsp) + movq %r9,%r10 + negq %r9 + shlq $5,%r9 + negq %r10 + leaq 128(%rdx,%r9,1),%r13 + shrq $5+5,%r9 + movd 8(%rax),%xmm5 + subq $1,%r9 + leaq L$inc(%rip),%rax + movq %r13,16+8(%rsp) + movq %r9,24+8(%rsp) + movq %rdi,56+8(%rsp) + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 88-112(%rsp,%r10,1),%r10 + leaq 128(%rdx),%rdi + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 +.byte 0x67 + movdqa %xmm1,%xmm2 +.byte 0x67 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 +.byte 0x67 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,320(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + + pand 64(%rdi),%xmm0 + pand 80(%rdi),%xmm1 + pand 96(%rdi),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%rdi),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -128(%rdi),%xmm4 + movdqa -112(%rdi),%xmm5 + movdqa -96(%rdi),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%rdi),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%rdi),%xmm4 + movdqa -48(%rdi),%xmm5 + movdqa -32(%rdi),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%rdi),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%rdi),%xmm4 + movdqa 16(%rdi),%xmm5 + movdqa 32(%rdi),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%rdi),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + pxor %xmm1,%xmm0 + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + leaq 256(%rdi),%rdi +.byte 102,72,15,126,194 + leaq 64+32+8(%rsp),%rbx + + movq %rdx,%r9 + mulxq 0(%rsi),%r8,%rax + mulxq 8(%rsi),%r11,%r12 + addq %rax,%r11 + mulxq 16(%rsi),%rax,%r13 + adcq %rax,%r12 + adcq $0,%r13 + mulxq 24(%rsi),%rax,%r14 + + movq %r8,%r15 + imulq 32+8(%rsp),%r8 + xorq %rbp,%rbp + movq %r8,%rdx + + movq 
%rdi,8+8(%rsp) + + leaq 32(%rsi),%rsi + adcxq %rax,%r13 + adcxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + movq 24+8(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + adcxq %rax,%r12 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r12,-16(%rbx) + jmp L$mulx4x_1st + +.p2align 5 +L$mulx4x_1st: + adcxq %rbp,%r15 + mulxq 0(%rsi),%r10,%rax + adcxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 +.byte 0x67,0x67 + movq %r8,%rdx + adcxq %rax,%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + movq %r11,-32(%rbx) + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz L$mulx4x_1st + + movq 8(%rsp),%rax + adcq %rbp,%r15 + leaq (%rsi,%rax,1),%rsi + addq %r15,%r14 + movq 8+8(%rsp),%rdi + adcq %rbp,%rbp + movq %r14,-8(%rbx) + jmp L$mulx4x_outer + +.p2align 5 +L$mulx4x_outer: + leaq 16-256(%rbx),%r10 + pxor %xmm4,%xmm4 +.byte 0x67,0x67 + pxor %xmm5,%xmm5 + movdqa -128(%rdi),%xmm0 + movdqa -112(%rdi),%xmm1 + movdqa -96(%rdi),%xmm2 + pand 256(%r10),%xmm0 + movdqa -80(%rdi),%xmm3 + pand 272(%r10),%xmm1 + por %xmm0,%xmm4 + pand 288(%r10),%xmm2 + por %xmm1,%xmm5 + pand 304(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%rdi),%xmm0 + movdqa -48(%rdi),%xmm1 + movdqa -32(%rdi),%xmm2 + pand 320(%r10),%xmm0 + movdqa -16(%rdi),%xmm3 + pand 336(%r10),%xmm1 + por %xmm0,%xmm4 + pand 352(%r10),%xmm2 + por %xmm1,%xmm5 + pand 368(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%rdi),%xmm0 + movdqa 16(%rdi),%xmm1 + movdqa 32(%rdi),%xmm2 + pand 384(%r10),%xmm0 + movdqa 48(%rdi),%xmm3 + pand 400(%r10),%xmm1 + por %xmm0,%xmm4 + pand 416(%r10),%xmm2 + por %xmm1,%xmm5 + pand 432(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%rdi),%xmm0 + movdqa 80(%rdi),%xmm1 + movdqa 96(%rdi),%xmm2 + pand 448(%r10),%xmm0 + movdqa 112(%rdi),%xmm3 + pand 464(%r10),%xmm1 + por %xmm0,%xmm4 + pand 480(%r10),%xmm2 + por %xmm1,%xmm5 + pand 496(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%rdi),%rdi +.byte 102,72,15,126,194 + + movq %rbp,(%rbx) + leaq 32(%rbx,%rax,1),%rbx + mulxq 0(%rsi),%r8,%r11 + xorq %rbp,%rbp + movq %rdx,%r9 + mulxq 8(%rsi),%r14,%r12 + adoxq -32(%rbx),%r8 + adcxq %r14,%r11 + mulxq 16(%rsi),%r15,%r13 + adoxq -24(%rbx),%r11 + adcxq %r15,%r12 + mulxq 24(%rsi),%rdx,%r14 + adoxq -16(%rbx),%r12 + adcxq %rdx,%r13 + leaq (%rcx,%rax,1),%rcx + leaq 32(%rsi),%rsi + adoxq -8(%rbx),%r13 + adcxq %rbp,%r14 + adoxq %rbp,%r14 + + movq %r8,%r15 + imulq 32+8(%rsp),%r8 + + movq %r8,%rdx + xorq %rbp,%rbp + movq %rdi,8+8(%rsp) + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq 24+8(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r12 + movq %r11,-24(%rbx) + adoxq %rbp,%r15 + movq %r12,-16(%rbx) + leaq 32(%rcx),%rcx + jmp 
L$mulx4x_inner + +.p2align 5 +L$mulx4x_inner: + mulxq 0(%rsi),%r10,%rax + adcxq %rbp,%r15 + adoxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq 0(%rbx),%r10 + adoxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq 8(%rbx),%r11 + adoxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 + movq %r8,%rdx + adcxq 16(%rbx),%r12 + adoxq %rax,%r13 + adcxq 24(%rbx),%r13 + adoxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + adcxq %rbp,%r14 + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + adoxq %r15,%r13 + movq %r11,-32(%rbx) + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + leaq 32(%rcx),%rcx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + movq %r13,-16(%rbx) + + decq %rdi + jnz L$mulx4x_inner + + movq 0+8(%rsp),%rax + adcq %rbp,%r15 + subq 0(%rbx),%rdi + movq 8+8(%rsp),%rdi + movq 16+8(%rsp),%r10 + adcq %r15,%r14 + leaq (%rsi,%rax,1),%rsi + adcq %rbp,%rbp + movq %r14,-8(%rbx) + + cmpq %r10,%rdi + jb L$mulx4x_outer + + movq -8(%rcx),%r10 + movq %rbp,%r8 + movq (%rcx,%rax,1),%r12 + leaq (%rcx,%rax,1),%rbp + movq %rax,%rcx + leaq (%rbx,%rax,1),%rdi + xorl %eax,%eax + xorq %r15,%r15 + subq %r14,%r10 + adcq %r15,%r15 + orq %r15,%r8 + sarq $3+2,%rcx + subq %r8,%rax + movq 56+8(%rsp),%rdx + decq %r12 + movq 8(%rbp),%r13 + xorq %r8,%r8 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp L$sqrx4x_sub_entry + + + +.p2align 5 +bn_powerx5: + + movq %rsp,%rax + +L$powerx5_enter: + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$powerx5_prologue: + + shll $3,%r9d + leaq (%r9,%r9,2),%r10 + negq %r9 + movq (%r8),%r8 + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb L$pwrx_sp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp L$pwrx_sp_done + +.p2align 5 +L$pwrx_sp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +L$pwrx_sp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$pwrx_page_walk + jmp L$pwrx_page_walk_done + +L$pwrx_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$pwrx_page_walk +L$pwrx_page_walk_done: + + movq %r9,%r10 + negq %r9 + + + + + + + + + + + + + pxor %xmm0,%xmm0 +.byte 102,72,15,110,207 +.byte 102,72,15,110,209 +.byte 102,73,15,110,218 +.byte 102,72,15,110,226 + movq %r8,32(%rsp) + movq %rax,40(%rsp) + +L$powerx5_body: + + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + + movq %r10,%r9 + movq %rsi,%rdi +.byte 102,72,15,126,209 +.byte 102,72,15,126,226 + movq 40(%rsp),%rax + + call mulx4x_internal + + movq 40(%rsp),%rsi + + movq $1,%rax + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$powerx5_epilogue: + .byte 0xf3,0xc3 + + + +.globl _bn_sqrx8x_internal +.private_extern _bn_sqrx8x_internal +.private_extern _bn_sqrx8x_internal + +.p2align 5 +_bn_sqrx8x_internal: +__bn_sqrx8x_internal: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + leaq 
48+8(%rsp),%rdi + leaq (%rsi,%r9,1),%rbp + movq %r9,0+8(%rsp) + movq %rbp,8+8(%rsp) + jmp L$sqr8x_zero_start + +.p2align 5 +.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 +L$sqrx8x_zero: +.byte 0x3e + movdqa %xmm0,0(%rdi) + movdqa %xmm0,16(%rdi) + movdqa %xmm0,32(%rdi) + movdqa %xmm0,48(%rdi) +L$sqr8x_zero_start: + movdqa %xmm0,64(%rdi) + movdqa %xmm0,80(%rdi) + movdqa %xmm0,96(%rdi) + movdqa %xmm0,112(%rdi) + leaq 128(%rdi),%rdi + subq $64,%r9 + jnz L$sqrx8x_zero + + movq 0(%rsi),%rdx + + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r12,%r12 + xorq %r13,%r13 + xorq %r14,%r14 + xorq %r15,%r15 + leaq 48+8(%rsp),%rdi + xorq %rbp,%rbp + jmp L$sqrx8x_outer_loop + +.p2align 5 +L$sqrx8x_outer_loop: + mulxq 8(%rsi),%r8,%rax + adcxq %r9,%r8 + adoxq %rax,%r10 + mulxq 16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 +.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 + adcxq %r11,%r10 + adoxq %rax,%r12 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 + adcxq %r12,%r11 + adoxq %rax,%r13 + mulxq 40(%rsi),%r12,%rax + adcxq %r13,%r12 + adoxq %rax,%r14 + mulxq 48(%rsi),%r13,%rax + adcxq %r14,%r13 + adoxq %r15,%rax + mulxq 56(%rsi),%r14,%r15 + movq 8(%rsi),%rdx + adcxq %rax,%r14 + adoxq %rbp,%r15 + adcq 64(%rdi),%r15 + movq %r8,8(%rdi) + movq %r9,16(%rdi) + sbbq %rcx,%rcx + xorq %rbp,%rbp + + + mulxq 16(%rsi),%r8,%rbx + mulxq 24(%rsi),%r9,%rax + adcxq %r10,%r8 + adoxq %rbx,%r9 + mulxq 32(%rsi),%r10,%rbx + adcxq %r11,%r9 + adoxq %rax,%r10 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 + adcxq %r12,%r10 + adoxq %rbx,%r11 +.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 + adcxq %r13,%r11 + adoxq %r14,%r12 +.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 + movq 16(%rsi),%rdx + adcxq %rax,%r12 + adoxq %rbx,%r13 + adcxq %r15,%r13 + adoxq %rbp,%r14 + adcxq %rbp,%r14 + + movq %r8,24(%rdi) + movq %r9,32(%rdi) + + mulxq 24(%rsi),%r8,%rbx + mulxq 32(%rsi),%r9,%rax + adcxq %r10,%r8 + adoxq %rbx,%r9 + mulxq 40(%rsi),%r10,%rbx + adcxq %r11,%r9 + adoxq %rax,%r10 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 + adcxq %r12,%r10 + adoxq %r13,%r11 +.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 +.byte 0x3e + movq 24(%rsi),%rdx + adcxq %rbx,%r11 + adoxq %rax,%r12 + adcxq %r14,%r12 + movq %r8,40(%rdi) + movq %r9,48(%rdi) + mulxq 32(%rsi),%r8,%rax + adoxq %rbp,%r13 + adcxq %rbp,%r13 + + mulxq 40(%rsi),%r9,%rbx + adcxq %r10,%r8 + adoxq %rax,%r9 + mulxq 48(%rsi),%r10,%rax + adcxq %r11,%r9 + adoxq %r12,%r10 + mulxq 56(%rsi),%r11,%r12 + movq 32(%rsi),%rdx + movq 40(%rsi),%r14 + adcxq %rbx,%r10 + adoxq %rax,%r11 + movq 48(%rsi),%r15 + adcxq %r13,%r11 + adoxq %rbp,%r12 + adcxq %rbp,%r12 + + movq %r8,56(%rdi) + movq %r9,64(%rdi) + + mulxq %r14,%r9,%rax + movq 56(%rsi),%r8 + adcxq %r10,%r9 + mulxq %r15,%r10,%rbx + adoxq %rax,%r10 + adcxq %r11,%r10 + mulxq %r8,%r11,%rax + movq %r14,%rdx + adoxq %rbx,%r11 + adcxq %r12,%r11 + + adcxq %rbp,%rax + + mulxq %r15,%r14,%rbx + mulxq %r8,%r12,%r13 + movq %r15,%rdx + leaq 64(%rsi),%rsi + adcxq %r14,%r11 + adoxq %rbx,%r12 + adcxq %rax,%r12 + adoxq %rbp,%r13 + +.byte 0x67,0x67 + mulxq %r8,%r8,%r14 + adcxq %r8,%r13 + adcxq %rbp,%r14 + + cmpq 8+8(%rsp),%rsi + je L$sqrx8x_outer_break + + negq %rcx + movq $-8,%rcx + movq %rbp,%r15 + movq 64(%rdi),%r8 + adcxq 72(%rdi),%r9 + adcxq 80(%rdi),%r10 + adcxq 88(%rdi),%r11 + adcq 96(%rdi),%r12 + adcq 104(%rdi),%r13 + adcq 112(%rdi),%r14 + adcq 120(%rdi),%r15 + leaq (%rsi),%rbp + leaq 128(%rdi),%rdi + sbbq %rax,%rax + + movq -64(%rsi),%rdx + movq %rax,16+8(%rsp) + movq %rdi,24+8(%rsp) + + + xorl %eax,%eax + jmp 
L$sqrx8x_loop + +.p2align 5 +L$sqrx8x_loop: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rax,%rbx + adoxq %r9,%r8 + + mulxq 8(%rbp),%rax,%r9 + adcxq %rax,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rax,%r10 + adcxq %rax,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + movq %rbx,(%rdi,%rcx,8) + movl $0,%ebx + adcxq %rax,%r13 + adoxq %r15,%r14 + +.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 + movq 8(%rsi,%rcx,8),%rdx + adcxq %rax,%r14 + adoxq %rbx,%r15 + adcxq %rbx,%r15 + +.byte 0x67 + incq %rcx + jnz L$sqrx8x_loop + + leaq 64(%rbp),%rbp + movq $-8,%rcx + cmpq 8+8(%rsp),%rbp + je L$sqrx8x_break + + subq 16+8(%rsp),%rbx +.byte 0x66 + movq -64(%rsi),%rdx + adcxq 0(%rdi),%r8 + adcxq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi +.byte 0x67 + sbbq %rax,%rax + xorl %ebx,%ebx + movq %rax,16+8(%rsp) + jmp L$sqrx8x_loop + +.p2align 5 +L$sqrx8x_break: + xorq %rbp,%rbp + subq 16+8(%rsp),%rbx + adcxq %rbp,%r8 + movq 24+8(%rsp),%rcx + adcxq %rbp,%r9 + movq 0(%rsi),%rdx + adcq $0,%r10 + movq %r8,0(%rdi) + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + cmpq %rcx,%rdi + je L$sqrx8x_outer_loop + + movq %r9,8(%rdi) + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + movq 40(%rcx),%r13 + movq %r14,48(%rdi) + movq 48(%rcx),%r14 + movq %r15,56(%rdi) + movq 56(%rcx),%r15 + movq %rcx,%rdi + jmp L$sqrx8x_outer_loop + +.p2align 5 +L$sqrx8x_outer_break: + movq %r9,72(%rdi) +.byte 102,72,15,126,217 + movq %r10,80(%rdi) + movq %r11,88(%rdi) + movq %r12,96(%rdi) + movq %r13,104(%rdi) + movq %r14,112(%rdi) + leaq 48+8(%rsp),%rdi + movq (%rsi,%rcx,1),%rdx + + movq 8(%rdi),%r11 + xorq %r10,%r10 + movq 0+8(%rsp),%r9 + adoxq %r11,%r11 + movq 16(%rdi),%r12 + movq 24(%rdi),%r13 + + +.p2align 5 +L$sqrx4x_shift_n_add: + mulxq %rdx,%rax,%rbx + adoxq %r12,%r12 + adcxq %r10,%rax +.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 +.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 + adoxq %r13,%r13 + adcxq %r11,%rbx + movq 40(%rdi),%r11 + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r10,%r10 + adcxq %r12,%rax + movq 16(%rsi,%rcx,1),%rdx + movq 48(%rdi),%r12 + adoxq %r11,%r11 + adcxq %r13,%rbx + movq 56(%rdi),%r13 + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r12,%r12 + adcxq %r10,%rax + movq 24(%rsi,%rcx,1),%rdx + leaq 32(%rcx),%rcx + movq 64(%rdi),%r10 + adoxq %r13,%r13 + adcxq %r11,%rbx + movq 72(%rdi),%r11 + movq %rax,32(%rdi) + movq %rbx,40(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r10,%r10 + adcxq %r12,%rax + jrcxz L$sqrx4x_shift_n_add_break +.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 + adoxq %r11,%r11 + adcxq %r13,%rbx + movq 80(%rdi),%r12 + movq 88(%rdi),%r13 + movq %rax,48(%rdi) + movq %rbx,56(%rdi) + leaq 64(%rdi),%rdi + nop + jmp L$sqrx4x_shift_n_add + +.p2align 5 +L$sqrx4x_shift_n_add_break: + adcxq %r13,%rbx + movq %rax,48(%rdi) + movq %rbx,56(%rdi) + leaq 64(%rdi),%rdi +.byte 102,72,15,126,213 +__bn_sqrx8x_reduction: + xorl %eax,%eax + movq 32+8(%rsp),%rbx + movq 48+8(%rsp),%rdx + leaq -64(%rbp,%r9,1),%rcx + + movq %rcx,0+8(%rsp) + movq %rdi,8+8(%rsp) + + leaq 48+8(%rsp),%rdi + jmp 
L$sqrx8x_reduction_loop + +.p2align 5 +L$sqrx8x_reduction_loop: + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq %rdx,%r8 + imulq %rbx,%rdx + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq %rax,24+8(%rsp) + + leaq 64(%rdi),%rdi + xorq %rsi,%rsi + movq $-8,%rcx + jmp L$sqrx8x_reduce + +.p2align 5 +L$sqrx8x_reduce: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rbx,%rax + adoxq %r9,%r8 + + mulxq 8(%rbp),%rbx,%r9 + adcxq %rbx,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rbx,%r10 + adcxq %rbx,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rbx,%r11 + adcxq %rbx,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 + movq %rdx,%rax + movq %r8,%rdx + adcxq %rbx,%r11 + adoxq %r13,%r12 + + mulxq 32+8(%rsp),%rbx,%rdx + movq %rax,%rdx + movq %rax,64+48+8(%rsp,%rcx,8) + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + adcxq %rax,%r13 + adoxq %r15,%r14 + + mulxq 56(%rbp),%rax,%r15 + movq %rbx,%rdx + adcxq %rax,%r14 + adoxq %rsi,%r15 + adcxq %rsi,%r15 + +.byte 0x67,0x67,0x67 + incq %rcx + jnz L$sqrx8x_reduce + + movq %rsi,%rax + cmpq 0+8(%rsp),%rbp + jae L$sqrx8x_no_tail + + movq 48+8(%rsp),%rdx + addq 0(%rdi),%r8 + leaq 64(%rbp),%rbp + movq $-8,%rcx + adcxq 8(%rdi),%r9 + adcxq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi + sbbq %rax,%rax + + xorq %rsi,%rsi + movq %rax,16+8(%rsp) + jmp L$sqrx8x_tail + +.p2align 5 +L$sqrx8x_tail: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rax,%rbx + adoxq %r9,%r8 + + mulxq 8(%rbp),%rax,%r9 + adcxq %rax,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rax,%r10 + adcxq %rax,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + adcxq %rax,%r13 + adoxq %r15,%r14 + + mulxq 56(%rbp),%rax,%r15 + movq 72+48+8(%rsp,%rcx,8),%rdx + adcxq %rax,%r14 + adoxq %rsi,%r15 + movq %rbx,(%rdi,%rcx,8) + movq %r8,%rbx + adcxq %rsi,%r15 + + incq %rcx + jnz L$sqrx8x_tail + + cmpq 0+8(%rsp),%rbp + jae L$sqrx8x_tail_done + + subq 16+8(%rsp),%rsi + movq 48+8(%rsp),%rdx + leaq 64(%rbp),%rbp + adcq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi + sbbq %rax,%rax + subq $8,%rcx + + xorq %rsi,%rsi + movq %rax,16+8(%rsp) + jmp L$sqrx8x_tail + +.p2align 5 +L$sqrx8x_tail_done: + xorq %rax,%rax + addq 24+8(%rsp),%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rax + + subq 16+8(%rsp),%rsi +L$sqrx8x_no_tail: + adcq 0(%rdi),%r8 +.byte 102,72,15,126,217 + adcq 8(%rdi),%r9 + movq 56(%rbp),%rsi +.byte 102,72,15,126,213 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + adcq $0,%rax + + movq 32+8(%rsp),%rbx + movq 64(%rdi,%rcx,1),%rdx + + movq %r8,0(%rdi) + leaq 64(%rdi),%r8 + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + leaq 64(%rdi,%rcx,1),%rdi + cmpq 8+8(%rsp),%r8 + jb L$sqrx8x_reduction_loop + .byte 0xf3,0xc3 + + +.p2align 5 + +__bn_postx4x_internal: + + movq 0(%rbp),%r12 + movq %rcx,%r10 + movq %rcx,%r9 + 
negq %rax + sarq $3+2,%rcx + +.byte 102,72,15,126,202 +.byte 102,72,15,126,206 + decq %r12 + movq 8(%rbp),%r13 + xorq %r8,%r8 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp L$sqrx4x_sub_entry + +.p2align 4 +L$sqrx4x_sub: + movq 0(%rbp),%r12 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 +L$sqrx4x_sub_entry: + andnq %rax,%r12,%r12 + leaq 32(%rbp),%rbp + andnq %rax,%r13,%r13 + andnq %rax,%r14,%r14 + andnq %rax,%r15,%r15 + + negq %r8 + adcq 0(%rdi),%r12 + adcq 8(%rdi),%r13 + adcq 16(%rdi),%r14 + adcq 24(%rdi),%r15 + movq %r12,0(%rdx) + leaq 32(%rdi),%rdi + movq %r13,8(%rdx) + sbbq %r8,%r8 + movq %r14,16(%rdx) + movq %r15,24(%rdx) + leaq 32(%rdx),%rdx + + incq %rcx + jnz L$sqrx4x_sub + + negq %r9 + + .byte 0xf3,0xc3 + + .globl _bn_scatter5 .private_extern _bn_scatter5 .p2align 4 _bn_scatter5: + cmpl $0,%esi jz L$scatter_epilogue leaq (%rdx,%rcx,8),%rdx @@ -2220,14 +3611,17 @@ L$scatter_epilogue: .byte 0xf3,0xc3 + .globl _bn_gather5 .private_extern _bn_gather5 .p2align 5 _bn_gather5: + L$SEH_begin_bn_gather5: .byte 0x4c,0x8d,0x14,0x24 + .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 leaq L$inc(%rip),%rax andq $-16,%rsp @@ -2381,9 +3775,11 @@ L$gather: jnz L$gather leaq (%r10),%rsp + .byte 0xf3,0xc3 L$SEH_end_bn_gather5: + .p2align 6 L$inc: .long 0,0, 1,1 diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/test/trampoline-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/test/trampoline-x86_64.S new file mode 100644 index 0000000000..863e6b0452 --- /dev/null +++ b/packager/third_party/boringssl/mac-x86_64/crypto/test/trampoline-x86_64.S @@ -0,0 +1,513 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text + + + + + + + + + +.globl _abi_test_trampoline +.private_extern _abi_test_trampoline +.p2align 4 +_abi_test_trampoline: +L$abi_test_trampoline_seh_begin: + + + + + + + + + + + subq $120,%rsp + +L$abi_test_trampoline_seh_prolog_alloc: + movq %r8,48(%rsp) + movq %rbx,64(%rsp) + +L$abi_test_trampoline_seh_prolog_rbx: + movq %rbp,72(%rsp) + +L$abi_test_trampoline_seh_prolog_rbp: + movq %r12,80(%rsp) + +L$abi_test_trampoline_seh_prolog_r12: + movq %r13,88(%rsp) + +L$abi_test_trampoline_seh_prolog_r13: + movq %r14,96(%rsp) + +L$abi_test_trampoline_seh_prolog_r14: + movq %r15,104(%rsp) + +L$abi_test_trampoline_seh_prolog_r15: +L$abi_test_trampoline_seh_prolog_end: + movq 0(%rsi),%rbx + movq 8(%rsi),%rbp + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + movq 32(%rsi),%r14 + movq 40(%rsi),%r15 + + movq %rdi,32(%rsp) + movq %rsi,40(%rsp) + + + + + movq %rdx,%r10 + movq %rcx,%r11 + decq %r11 + js L$args_done + movq (%r10),%rdi + addq $8,%r10 + decq %r11 + js L$args_done + movq (%r10),%rsi + addq $8,%r10 + decq %r11 + js L$args_done + movq (%r10),%rdx + addq $8,%r10 + decq %r11 + js L$args_done + movq (%r10),%rcx + addq $8,%r10 + decq %r11 + js L$args_done + movq (%r10),%r8 + addq $8,%r10 + decq %r11 + js L$args_done + movq (%r10),%r9 + addq $8,%r10 + leaq 0(%rsp),%rax +L$args_loop: + decq %r11 + js L$args_done + + + + + + + movq %r11,56(%rsp) + movq (%r10),%r11 + movq %r11,(%rax) + movq 56(%rsp),%r11 + + addq $8,%r10 + addq $8,%rax + jmp L$args_loop + +L$args_done: + movq 32(%rsp),%rax + movq 48(%rsp),%r10 + testq %r10,%r10 + jz L$no_unwind + + + pushfq + orq $0x100,0(%rsp) + 
popfq + + + + nop +.globl _abi_test_unwind_start +.private_extern _abi_test_unwind_start +_abi_test_unwind_start: + + call *%rax +.globl _abi_test_unwind_return +.private_extern _abi_test_unwind_return +_abi_test_unwind_return: + + + + + pushfq + andq $-0x101,0(%rsp) + popfq +.globl _abi_test_unwind_stop +.private_extern _abi_test_unwind_stop +_abi_test_unwind_stop: + + jmp L$call_done + +L$no_unwind: + call *%rax + +L$call_done: + + movq 40(%rsp),%rsi + movq %rbx,0(%rsi) + movq %rbp,8(%rsi) + movq %r12,16(%rsi) + movq %r13,24(%rsi) + movq %r14,32(%rsi) + movq %r15,40(%rsi) + movq 64(%rsp),%rbx + + movq 72(%rsp),%rbp + + movq 80(%rsp),%r12 + + movq 88(%rsp),%r13 + + movq 96(%rsp),%r14 + + movq 104(%rsp),%r15 + + addq $120,%rsp + + + + .byte 0xf3,0xc3 + +L$abi_test_trampoline_seh_end: + + +.globl _abi_test_clobber_rax +.private_extern _abi_test_clobber_rax +.p2align 4 +_abi_test_clobber_rax: + xorq %rax,%rax + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_rbx +.private_extern _abi_test_clobber_rbx +.p2align 4 +_abi_test_clobber_rbx: + xorq %rbx,%rbx + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_rcx +.private_extern _abi_test_clobber_rcx +.p2align 4 +_abi_test_clobber_rcx: + xorq %rcx,%rcx + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_rdx +.private_extern _abi_test_clobber_rdx +.p2align 4 +_abi_test_clobber_rdx: + xorq %rdx,%rdx + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_rdi +.private_extern _abi_test_clobber_rdi +.p2align 4 +_abi_test_clobber_rdi: + xorq %rdi,%rdi + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_rsi +.private_extern _abi_test_clobber_rsi +.p2align 4 +_abi_test_clobber_rsi: + xorq %rsi,%rsi + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_rbp +.private_extern _abi_test_clobber_rbp +.p2align 4 +_abi_test_clobber_rbp: + xorq %rbp,%rbp + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_r8 +.private_extern _abi_test_clobber_r8 +.p2align 4 +_abi_test_clobber_r8: + xorq %r8,%r8 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_r9 +.private_extern _abi_test_clobber_r9 +.p2align 4 +_abi_test_clobber_r9: + xorq %r9,%r9 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_r10 +.private_extern _abi_test_clobber_r10 +.p2align 4 +_abi_test_clobber_r10: + xorq %r10,%r10 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_r11 +.private_extern _abi_test_clobber_r11 +.p2align 4 +_abi_test_clobber_r11: + xorq %r11,%r11 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_r12 +.private_extern _abi_test_clobber_r12 +.p2align 4 +_abi_test_clobber_r12: + xorq %r12,%r12 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_r13 +.private_extern _abi_test_clobber_r13 +.p2align 4 +_abi_test_clobber_r13: + xorq %r13,%r13 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_r14 +.private_extern _abi_test_clobber_r14 +.p2align 4 +_abi_test_clobber_r14: + xorq %r14,%r14 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_r15 +.private_extern _abi_test_clobber_r15 +.p2align 4 +_abi_test_clobber_r15: + xorq %r15,%r15 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_xmm0 +.private_extern _abi_test_clobber_xmm0 +.p2align 4 +_abi_test_clobber_xmm0: + pxor %xmm0,%xmm0 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_xmm1 +.private_extern _abi_test_clobber_xmm1 +.p2align 4 +_abi_test_clobber_xmm1: + pxor %xmm1,%xmm1 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_xmm2 +.private_extern _abi_test_clobber_xmm2 +.p2align 4 +_abi_test_clobber_xmm2: + pxor %xmm2,%xmm2 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_xmm3 +.private_extern _abi_test_clobber_xmm3 +.p2align 4 +_abi_test_clobber_xmm3: + pxor %xmm3,%xmm3 + .byte 0xf3,0xc3 + + +.globl 
_abi_test_clobber_xmm4 +.private_extern _abi_test_clobber_xmm4 +.p2align 4 +_abi_test_clobber_xmm4: + pxor %xmm4,%xmm4 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_xmm5 +.private_extern _abi_test_clobber_xmm5 +.p2align 4 +_abi_test_clobber_xmm5: + pxor %xmm5,%xmm5 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_xmm6 +.private_extern _abi_test_clobber_xmm6 +.p2align 4 +_abi_test_clobber_xmm6: + pxor %xmm6,%xmm6 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_xmm7 +.private_extern _abi_test_clobber_xmm7 +.p2align 4 +_abi_test_clobber_xmm7: + pxor %xmm7,%xmm7 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_xmm8 +.private_extern _abi_test_clobber_xmm8 +.p2align 4 +_abi_test_clobber_xmm8: + pxor %xmm8,%xmm8 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_xmm9 +.private_extern _abi_test_clobber_xmm9 +.p2align 4 +_abi_test_clobber_xmm9: + pxor %xmm9,%xmm9 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_xmm10 +.private_extern _abi_test_clobber_xmm10 +.p2align 4 +_abi_test_clobber_xmm10: + pxor %xmm10,%xmm10 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_xmm11 +.private_extern _abi_test_clobber_xmm11 +.p2align 4 +_abi_test_clobber_xmm11: + pxor %xmm11,%xmm11 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_xmm12 +.private_extern _abi_test_clobber_xmm12 +.p2align 4 +_abi_test_clobber_xmm12: + pxor %xmm12,%xmm12 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_xmm13 +.private_extern _abi_test_clobber_xmm13 +.p2align 4 +_abi_test_clobber_xmm13: + pxor %xmm13,%xmm13 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_xmm14 +.private_extern _abi_test_clobber_xmm14 +.p2align 4 +_abi_test_clobber_xmm14: + pxor %xmm14,%xmm14 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_xmm15 +.private_extern _abi_test_clobber_xmm15 +.p2align 4 +_abi_test_clobber_xmm15: + pxor %xmm15,%xmm15 + .byte 0xf3,0xc3 + + + + + +.globl _abi_test_bad_unwind_wrong_register +.private_extern _abi_test_bad_unwind_wrong_register +.p2align 4 +_abi_test_bad_unwind_wrong_register: + +L$abi_test_bad_unwind_wrong_register_seh_begin: + pushq %r12 + +L$abi_test_bad_unwind_wrong_register_seh_push_r13: + + + + nop + popq %r12 + + .byte 0xf3,0xc3 +L$abi_test_bad_unwind_wrong_register_seh_end: + + + + + + + +.globl _abi_test_bad_unwind_temporary +.private_extern _abi_test_bad_unwind_temporary +.p2align 4 +_abi_test_bad_unwind_temporary: + +L$abi_test_bad_unwind_temporary_seh_begin: + pushq %r12 + +L$abi_test_bad_unwind_temporary_seh_push_r12: + + movq %r12,%rax + incq %rax + movq %rax,(%rsp) + + + + movq %r12,(%rsp) + + + popq %r12 + + .byte 0xf3,0xc3 +L$abi_test_bad_unwind_temporary_seh_end: + + + + + + + +.globl _abi_test_get_and_clear_direction_flag +.private_extern _abi_test_get_and_clear_direction_flag +_abi_test_get_and_clear_direction_flag: + pushfq + popq %rax + andq $0x400,%rax + shrq $10,%rax + cld + .byte 0xf3,0xc3 + + + + + +.globl _abi_test_set_direction_flag +.private_extern _abi_test_set_direction_flag +_abi_test_set_direction_flag: + std + .byte 0xf3,0xc3 + +#endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/third_party/sike/asm/fp-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/third_party/sike/asm/fp-x86_64.S new file mode 100644 index 0000000000..f1e7ea4f63 --- /dev/null +++ b/packager/third_party/boringssl/mac-x86_64/crypto/third_party/sike/asm/fp-x86_64.S @@ -0,0 +1,1869 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text + + +L$p434x2: +.quad 0xFFFFFFFFFFFFFFFE +.quad 0xFFFFFFFFFFFFFFFF +.quad 0xFB82ECF5C5FFFFFF +.quad 0xF78CB8F062B15D47 +.quad 0xD9F8BFAD038A40AC +.quad 0x0004683E4E2EE688 + + +L$p434p1: +.quad 0xFDC1767AE3000000 +.quad 0x7BC65C783158AEA3 +.quad 0x6CFC5FD681C52056 +.quad 0x0002341F27177344 + + +.private_extern _OPENSSL_ia32cap_P +.globl _sike_fpadd +.private_extern _sike_fpadd + +_sike_fpadd: + + pushq %r12 + + + pushq %r13 + + + pushq %r14 + + + + xorq %rax,%rax + + movq 0(%rdi),%r8 + addq 0(%rsi),%r8 + movq 8(%rdi),%r9 + adcq 8(%rsi),%r9 + movq 16(%rdi),%r10 + adcq 16(%rsi),%r10 + movq 24(%rdi),%r11 + adcq 24(%rsi),%r11 + movq 32(%rdi),%r12 + adcq 32(%rsi),%r12 + movq 40(%rdi),%r13 + adcq 40(%rsi),%r13 + movq 48(%rdi),%r14 + adcq 48(%rsi),%r14 + + movq L$p434x2(%rip),%rcx + subq %rcx,%r8 + movq 8+L$p434x2(%rip),%rcx + sbbq %rcx,%r9 + sbbq %rcx,%r10 + movq 16+L$p434x2(%rip),%rcx + sbbq %rcx,%r11 + movq 24+L$p434x2(%rip),%rcx + sbbq %rcx,%r12 + movq 32+L$p434x2(%rip),%rcx + sbbq %rcx,%r13 + movq 40+L$p434x2(%rip),%rcx + sbbq %rcx,%r14 + + sbbq $0,%rax + + movq L$p434x2(%rip),%rdi + andq %rax,%rdi + movq 8+L$p434x2(%rip),%rsi + andq %rax,%rsi + movq 16+L$p434x2(%rip),%rcx + andq %rax,%rcx + + addq %rdi,%r8 + movq %r8,0(%rdx) + adcq %rsi,%r9 + movq %r9,8(%rdx) + adcq %rsi,%r10 + movq %r10,16(%rdx) + adcq %rcx,%r11 + movq %r11,24(%rdx) + + setc %cl + movq 24+L$p434x2(%rip),%r8 + andq %rax,%r8 + movq 32+L$p434x2(%rip),%r9 + andq %rax,%r9 + movq 40+L$p434x2(%rip),%r10 + andq %rax,%r10 + btq $0,%rcx + + adcq %r8,%r12 + movq %r12,32(%rdx) + adcq %r9,%r13 + movq %r13,40(%rdx) + adcq %r10,%r14 + movq %r14,48(%rdx) + + popq %r14 + + popq %r13 + + popq %r12 + + .byte 0xf3,0xc3 + +.globl _sike_cswap_asm +.private_extern _sike_cswap_asm + +_sike_cswap_asm: + + + movq %rdx,%xmm3 + + + + + + pshufd $68,%xmm3,%xmm3 + + movdqu 0(%rdi),%xmm0 + movdqu 0(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,0(%rdi) + movdqu %xmm1,0(%rsi) + + movdqu 16(%rdi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,16(%rdi) + movdqu %xmm1,16(%rsi) + + movdqu 32(%rdi),%xmm0 + movdqu 32(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,32(%rdi) + movdqu %xmm1,32(%rsi) + + movdqu 48(%rdi),%xmm0 + movdqu 48(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,48(%rdi) + movdqu %xmm1,48(%rsi) + + movdqu 64(%rdi),%xmm0 + movdqu 64(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,64(%rdi) + movdqu %xmm1,64(%rsi) + + movdqu 80(%rdi),%xmm0 + movdqu 80(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,80(%rdi) + movdqu %xmm1,80(%rsi) + + movdqu 96(%rdi),%xmm0 + movdqu 96(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,96(%rdi) + movdqu %xmm1,96(%rsi) + + movdqu 112(%rdi),%xmm0 + movdqu 112(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor 
%xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,112(%rdi) + movdqu %xmm1,112(%rsi) + + movdqu 128(%rdi),%xmm0 + movdqu 128(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,128(%rdi) + movdqu %xmm1,128(%rsi) + + movdqu 144(%rdi),%xmm0 + movdqu 144(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,144(%rdi) + movdqu %xmm1,144(%rsi) + + movdqu 160(%rdi),%xmm0 + movdqu 160(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,160(%rdi) + movdqu %xmm1,160(%rsi) + + movdqu 176(%rdi),%xmm0 + movdqu 176(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,176(%rdi) + movdqu %xmm1,176(%rsi) + + movdqu 192(%rdi),%xmm0 + movdqu 192(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,192(%rdi) + movdqu %xmm1,192(%rsi) + + movdqu 208(%rdi),%xmm0 + movdqu 208(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,208(%rdi) + movdqu %xmm1,208(%rsi) + + .byte 0xf3,0xc3 +.globl _sike_fpsub +.private_extern _sike_fpsub + +_sike_fpsub: + + pushq %r12 + + + pushq %r13 + + + pushq %r14 + + + + xorq %rax,%rax + + movq 0(%rdi),%r8 + subq 0(%rsi),%r8 + movq 8(%rdi),%r9 + sbbq 8(%rsi),%r9 + movq 16(%rdi),%r10 + sbbq 16(%rsi),%r10 + movq 24(%rdi),%r11 + sbbq 24(%rsi),%r11 + movq 32(%rdi),%r12 + sbbq 32(%rsi),%r12 + movq 40(%rdi),%r13 + sbbq 40(%rsi),%r13 + movq 48(%rdi),%r14 + sbbq 48(%rsi),%r14 + + sbbq $0x0,%rax + + movq L$p434x2(%rip),%rdi + andq %rax,%rdi + movq 8+L$p434x2(%rip),%rsi + andq %rax,%rsi + movq 16+L$p434x2(%rip),%rcx + andq %rax,%rcx + + addq %rdi,%r8 + movq %r8,0(%rdx) + adcq %rsi,%r9 + movq %r9,8(%rdx) + adcq %rsi,%r10 + movq %r10,16(%rdx) + adcq %rcx,%r11 + movq %r11,24(%rdx) + + setc %cl + movq 24+L$p434x2(%rip),%r8 + andq %rax,%r8 + movq 32+L$p434x2(%rip),%r9 + andq %rax,%r9 + movq 40+L$p434x2(%rip),%r10 + andq %rax,%r10 + btq $0x0,%rcx + + adcq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + + popq %r14 + + popq %r13 + + popq %r12 + + .byte 0xf3,0xc3 + +.globl _sike_mpadd_asm +.private_extern _sike_mpadd_asm + +_sike_mpadd_asm: + + movq 0(%rdi),%r8; + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%rcx + addq 0(%rsi),%r8 + adcq 8(%rsi),%r9 + adcq 16(%rsi),%r10 + adcq 24(%rsi),%r11 + adcq 32(%rsi),%rcx + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %rcx,32(%rdx) + + movq 40(%rdi),%r8 + movq 48(%rdi),%r9 + adcq 40(%rsi),%r8 + adcq 48(%rsi),%r9 + movq %r8,40(%rdx) + movq %r9,48(%rdx) + .byte 0xf3,0xc3 + +.globl _sike_mpsubx2_asm +.private_extern _sike_mpsubx2_asm + +_sike_mpsubx2_asm: + + xorq %rax,%rax + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%rcx + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %rcx,32(%rdx) + + movq 40(%rdi),%r8 + movq 48(%rdi),%r9 + movq 56(%rdi),%r10 + movq 64(%rdi),%r11 + movq 72(%rdi),%rcx + sbbq 40(%rsi),%r8 + sbbq 48(%rsi),%r9 + sbbq 56(%rsi),%r10 + sbbq 64(%rsi),%r11 + sbbq 72(%rsi),%rcx + movq %r8,40(%rdx) + movq %r9,48(%rdx) + 
movq %r10,56(%rdx) + movq %r11,64(%rdx) + movq %rcx,72(%rdx) + + movq 80(%rdi),%r8 + movq 88(%rdi),%r9 + movq 96(%rdi),%r10 + movq 104(%rdi),%r11 + sbbq 80(%rsi),%r8 + sbbq 88(%rsi),%r9 + sbbq 96(%rsi),%r10 + sbbq 104(%rsi),%r11 + sbbq $0x0,%rax + movq %r8,80(%rdx) + movq %r9,88(%rdx) + movq %r10,96(%rdx) + movq %r11,104(%rdx) + .byte 0xf3,0xc3 + +.globl _sike_mpdblsubx2_asm +.private_extern _sike_mpdblsubx2_asm + +_sike_mpdblsubx2_asm: + + pushq %r12 + + + pushq %r13 + + + + xorq %rax,%rax + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + movq 48(%rdx),%rcx + subq 0(%rdi),%r8 + sbbq 8(%rdi),%r9 + sbbq 16(%rdi),%r10 + sbbq 24(%rdi),%r11 + sbbq 32(%rdi),%r12 + sbbq 40(%rdi),%r13 + sbbq 48(%rdi),%rcx + adcq $0x0,%rax + + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%r12 + sbbq 40(%rsi),%r13 + sbbq 48(%rsi),%rcx + adcq $0x0,%rax + + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %rcx,48(%rdx) + + + movq 56(%rdx),%r8 + movq 64(%rdx),%r9 + movq 72(%rdx),%r10 + movq 80(%rdx),%r11 + movq 88(%rdx),%r12 + movq 96(%rdx),%r13 + movq 104(%rdx),%rcx + + subq %rax,%r8 + sbbq 56(%rdi),%r8 + sbbq 64(%rdi),%r9 + sbbq 72(%rdi),%r10 + sbbq 80(%rdi),%r11 + sbbq 88(%rdi),%r12 + sbbq 96(%rdi),%r13 + sbbq 104(%rdi),%rcx + + + subq 56(%rsi),%r8 + sbbq 64(%rsi),%r9 + sbbq 72(%rsi),%r10 + sbbq 80(%rsi),%r11 + sbbq 88(%rsi),%r12 + sbbq 96(%rsi),%r13 + sbbq 104(%rsi),%rcx + + + movq %r8,56(%rdx) + movq %r9,64(%rdx) + movq %r10,72(%rdx) + movq %r11,80(%rdx) + movq %r12,88(%rdx) + movq %r13,96(%rdx) + movq %rcx,104(%rdx) + + popq %r13 + + popq %r12 + + .byte 0xf3,0xc3 + + +L$rdc_bdw: + + + + + + + + + xorq %rax,%rax + movq 0+0(%rdi),%rdx + mulxq 0+L$p434p1(%rip),%r8,%r9 + mulxq 8+L$p434p1(%rip),%r12,%r10 + mulxq 16+L$p434p1(%rip),%r13,%r11 + + adoxq %r12,%r9 + adoxq %r13,%r10 + + mulxq 24+L$p434p1(%rip),%r13,%r12 + adoxq %r13,%r11 + adoxq %rax,%r12 + + xorq %rax,%rax + movq 0+8(%rdi),%rdx + mulxq 0+L$p434p1(%rip),%r13,%rcx + adcxq %r13,%r9 + adcxq %rcx,%r10 + + mulxq 8+L$p434p1(%rip),%rcx,%r13 + adcxq %r13,%r11 + adoxq %rcx,%r10 + + mulxq 16+L$p434p1(%rip),%rcx,%r13 + adcxq %r13,%r12 + adoxq %rcx,%r11 + + mulxq 24+L$p434p1(%rip),%rcx,%r13 + adcxq %rax,%r13 + adoxq %rcx,%r12 + adoxq %rax,%r13 + + xorq %rcx,%rcx + addq 24(%rdi),%r8 + adcq 32(%rdi),%r9 + adcq 40(%rdi),%r10 + adcq 48(%rdi),%r11 + adcq 56(%rdi),%r12 + adcq 64(%rdi),%r13 + adcq 72(%rdi),%rcx + movq %r8,24(%rdi) + movq %r9,32(%rdi) + movq %r10,40(%rdi) + movq %r11,48(%rdi) + movq %r12,56(%rdi) + movq %r13,64(%rdi) + movq %rcx,72(%rdi) + movq 80(%rdi),%r8 + movq 88(%rdi),%r9 + movq 96(%rdi),%r10 + movq 104(%rdi),%r11 + adcq $0x0,%r8 + adcq $0x0,%r9 + adcq $0x0,%r10 + adcq $0x0,%r11 + movq %r8,80(%rdi) + movq %r9,88(%rdi) + movq %r10,96(%rdi) + movq %r11,104(%rdi) + + xorq %rax,%rax + movq 16+0(%rdi),%rdx + mulxq 0+L$p434p1(%rip),%r8,%r9 + mulxq 8+L$p434p1(%rip),%r12,%r10 + mulxq 16+L$p434p1(%rip),%r13,%r11 + + adoxq %r12,%r9 + adoxq %r13,%r10 + + mulxq 24+L$p434p1(%rip),%r13,%r12 + adoxq %r13,%r11 + adoxq %rax,%r12 + + xorq %rax,%rax + movq 16+8(%rdi),%rdx + mulxq 0+L$p434p1(%rip),%r13,%rcx + adcxq %r13,%r9 + adcxq %rcx,%r10 + + mulxq 8+L$p434p1(%rip),%rcx,%r13 + adcxq %r13,%r11 + adoxq %rcx,%r10 + + mulxq 16+L$p434p1(%rip),%rcx,%r13 + adcxq %r13,%r12 + adoxq %rcx,%r11 + + mulxq 24+L$p434p1(%rip),%rcx,%r13 + adcxq %rax,%r13 + adoxq %rcx,%r12 + adoxq 
%rax,%r13 + + xorq %rcx,%rcx + addq 40(%rdi),%r8 + adcq 48(%rdi),%r9 + adcq 56(%rdi),%r10 + adcq 64(%rdi),%r11 + adcq 72(%rdi),%r12 + adcq 80(%rdi),%r13 + adcq 88(%rdi),%rcx + movq %r8,40(%rdi) + movq %r9,48(%rdi) + movq %r10,56(%rdi) + movq %r11,64(%rdi) + movq %r12,72(%rdi) + movq %r13,80(%rdi) + movq %rcx,88(%rdi) + movq 96(%rdi),%r8 + movq 104(%rdi),%r9 + adcq $0x0,%r8 + adcq $0x0,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + + xorq %rax,%rax + movq 32+0(%rdi),%rdx + mulxq 0+L$p434p1(%rip),%r8,%r9 + mulxq 8+L$p434p1(%rip),%r12,%r10 + mulxq 16+L$p434p1(%rip),%r13,%r11 + + adoxq %r12,%r9 + adoxq %r13,%r10 + + mulxq 24+L$p434p1(%rip),%r13,%r12 + adoxq %r13,%r11 + adoxq %rax,%r12 + + xorq %rax,%rax + movq 32+8(%rdi),%rdx + mulxq 0+L$p434p1(%rip),%r13,%rcx + adcxq %r13,%r9 + adcxq %rcx,%r10 + + mulxq 8+L$p434p1(%rip),%rcx,%r13 + adcxq %r13,%r11 + adoxq %rcx,%r10 + + mulxq 16+L$p434p1(%rip),%rcx,%r13 + adcxq %r13,%r12 + adoxq %rcx,%r11 + + mulxq 24+L$p434p1(%rip),%rcx,%r13 + adcxq %rax,%r13 + adoxq %rcx,%r12 + adoxq %rax,%r13 + + xorq %rcx,%rcx + addq 56(%rdi),%r8 + adcq 64(%rdi),%r9 + adcq 72(%rdi),%r10 + adcq 80(%rdi),%r11 + adcq 88(%rdi),%r12 + adcq 96(%rdi),%r13 + adcq 104(%rdi),%rcx + movq %r8,0(%rsi) + movq %r9,8(%rsi) + movq %r10,72(%rdi) + movq %r11,80(%rdi) + movq %r12,88(%rdi) + movq %r13,96(%rdi) + movq %rcx,104(%rdi) + + xorq %rax,%rax + movq 48(%rdi),%rdx + mulxq 0+L$p434p1(%rip),%r8,%r9 + mulxq 8+L$p434p1(%rip),%r12,%r10 + mulxq 16+L$p434p1(%rip),%r13,%r11 + + adoxq %r12,%r9 + adoxq %r13,%r10 + + mulxq 24+L$p434p1(%rip),%r13,%r12 + adoxq %r13,%r11 + adoxq %rax,%r12 + + addq 72(%rdi),%r8 + adcq 80(%rdi),%r9 + adcq 88(%rdi),%r10 + adcq 96(%rdi),%r11 + adcq 104(%rdi),%r12 + movq %r8,16(%rsi) + movq %r9,24(%rsi) + movq %r10,32(%rsi) + movq %r11,40(%rsi) + movq %r12,48(%rsi) + + + popq %r15 + + + popq %r14 + + + popq %r13 + + + popq %r12 + + + .byte 0xf3,0xc3 + +.globl _sike_fprdc +.private_extern _sike_fprdc + +_sike_fprdc: + + pushq %r12 + + + pushq %r13 + + + pushq %r14 + + + pushq %r15 + + + + + + leaq _OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx + cmpl $0x80100,%ecx + je L$rdc_bdw + + + + + movq 0+0(%rdi),%r14 + movq 0+L$p434p1(%rip),%rax + mulq %r14 + xorq %r10,%r10 + movq %rax,%r8 + movq %rdx,%r9 + + + movq 8+L$p434p1(%rip),%rax + mulq %r14 + xorq %r11,%r11 + addq %rax,%r9 + adcq %rdx,%r10 + + + movq 0+8(%rdi),%rcx + movq 0+L$p434p1(%rip),%rax + mulq %rcx + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r11 + + + xorq %r12,%r12 + movq 16+L$p434p1(%rip),%rax + mulq %r14 + addq %rax,%r10 + adcq %rdx,%r11 + adcq $0x0,%r12 + + + movq 8+L$p434p1(%rip),%rax + mulq %rcx + addq %rax,%r10 + adcq %rdx,%r11 + adcq $0x0,%r12 + + + movq 24+L$p434p1(%rip),%rax + mulq %r14 + xorq %r13,%r13 + addq %rax,%r11 + adcq %rdx,%r12 + adcq $0x0,%r13 + + + movq 16+L$p434p1(%rip),%rax + mulq %rcx + addq %rax,%r11 + adcq %rdx,%r12 + adcq $0x0,%r13 + + + movq 24+L$p434p1(%rip),%rax + mulq %rcx + addq %rax,%r12 + adcq %rdx,%r13 + + + xorq %rcx,%rcx + addq 24(%rdi),%r8 + adcq 32(%rdi),%r9 + adcq 40(%rdi),%r10 + adcq 48(%rdi),%r11 + adcq 56(%rdi),%r12 + adcq 64(%rdi),%r13 + adcq 72(%rdi),%rcx + movq %r8,24(%rdi) + movq %r9,32(%rdi) + movq %r10,40(%rdi) + movq %r11,48(%rdi) + movq %r12,56(%rdi) + movq %r13,64(%rdi) + movq %rcx,72(%rdi) + movq 80(%rdi),%r8 + movq 88(%rdi),%r9 + movq 96(%rdi),%r10 + movq 104(%rdi),%r11 + adcq $0x0,%r8 + adcq $0x0,%r9 + adcq $0x0,%r10 + adcq $0x0,%r11 + movq %r8,80(%rdi) + movq %r9,88(%rdi) + movq %r10,96(%rdi) + movq %r11,104(%rdi) + + + movq 
16+0(%rdi),%r14 + movq 0+L$p434p1(%rip),%rax + mulq %r14 + xorq %r10,%r10 + movq %rax,%r8 + movq %rdx,%r9 + + + movq 8+L$p434p1(%rip),%rax + mulq %r14 + xorq %r11,%r11 + addq %rax,%r9 + adcq %rdx,%r10 + + + movq 16+8(%rdi),%rcx + movq 0+L$p434p1(%rip),%rax + mulq %rcx + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r11 + + + xorq %r12,%r12 + movq 16+L$p434p1(%rip),%rax + mulq %r14 + addq %rax,%r10 + adcq %rdx,%r11 + adcq $0x0,%r12 + + + movq 8+L$p434p1(%rip),%rax + mulq %rcx + addq %rax,%r10 + adcq %rdx,%r11 + adcq $0x0,%r12 + + + movq 24+L$p434p1(%rip),%rax + mulq %r14 + xorq %r13,%r13 + addq %rax,%r11 + adcq %rdx,%r12 + adcq $0x0,%r13 + + + movq 16+L$p434p1(%rip),%rax + mulq %rcx + addq %rax,%r11 + adcq %rdx,%r12 + adcq $0x0,%r13 + + + movq 24+L$p434p1(%rip),%rax + mulq %rcx + addq %rax,%r12 + adcq %rdx,%r13 + + + xorq %rcx,%rcx + addq 40(%rdi),%r8 + adcq 48(%rdi),%r9 + adcq 56(%rdi),%r10 + adcq 64(%rdi),%r11 + adcq 72(%rdi),%r12 + adcq 80(%rdi),%r13 + adcq 88(%rdi),%rcx + movq %r8,40(%rdi) + movq %r9,48(%rdi) + movq %r10,56(%rdi) + movq %r11,64(%rdi) + movq %r12,72(%rdi) + movq %r13,80(%rdi) + movq %rcx,88(%rdi) + movq 96(%rdi),%r8 + movq 104(%rdi),%r9 + adcq $0x0,%r8 + adcq $0x0,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + + + movq 32+0(%rdi),%r14 + movq 0+L$p434p1(%rip),%rax + mulq %r14 + xorq %r10,%r10 + movq %rax,%r8 + movq %rdx,%r9 + + + movq 8+L$p434p1(%rip),%rax + mulq %r14 + xorq %r11,%r11 + addq %rax,%r9 + adcq %rdx,%r10 + + + movq 32+8(%rdi),%rcx + movq 0+L$p434p1(%rip),%rax + mulq %rcx + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r11 + + + xorq %r12,%r12 + movq 16+L$p434p1(%rip),%rax + mulq %r14 + addq %rax,%r10 + adcq %rdx,%r11 + adcq $0x0,%r12 + + + movq 8+L$p434p1(%rip),%rax + mulq %rcx + addq %rax,%r10 + adcq %rdx,%r11 + adcq $0x0,%r12 + + + movq 24+L$p434p1(%rip),%rax + mulq %r14 + xorq %r13,%r13 + addq %rax,%r11 + adcq %rdx,%r12 + adcq $0x0,%r13 + + + movq 16+L$p434p1(%rip),%rax + mulq %rcx + addq %rax,%r11 + adcq %rdx,%r12 + adcq $0x0,%r13 + + + movq 24+L$p434p1(%rip),%rax + mulq %rcx + addq %rax,%r12 + adcq %rdx,%r13 + + + xorq %rcx,%rcx + addq 56(%rdi),%r8 + adcq 64(%rdi),%r9 + adcq 72(%rdi),%r10 + adcq 80(%rdi),%r11 + adcq 88(%rdi),%r12 + adcq 96(%rdi),%r13 + adcq 104(%rdi),%rcx + movq %r8,0(%rsi) + movq %r9,8(%rsi) + movq %r10,72(%rdi) + movq %r11,80(%rdi) + movq %r12,88(%rdi) + movq %r13,96(%rdi) + movq %rcx,104(%rdi) + + movq 48(%rdi),%r13 + + xorq %r10,%r10 + movq 0+L$p434p1(%rip),%rax + mulq %r13 + movq %rax,%r8 + movq %rdx,%r9 + + xorq %r11,%r11 + movq 8+L$p434p1(%rip),%rax + mulq %r13 + addq %rax,%r9 + adcq %rdx,%r10 + + xorq %r12,%r12 + movq 16+L$p434p1(%rip),%rax + mulq %r13 + addq %rax,%r10 + adcq %rdx,%r11 + + movq 24+L$p434p1(%rip),%rax + mulq %r13 + addq %rax,%r11 + adcq %rdx,%r12 + + addq 72(%rdi),%r8 + adcq 80(%rdi),%r9 + adcq 88(%rdi),%r10 + adcq 96(%rdi),%r11 + adcq 104(%rdi),%r12 + movq %r8,16(%rsi) + movq %r9,24(%rsi) + movq %r10,32(%rsi) + movq %r11,40(%rsi) + movq %r12,48(%rsi) + + + popq %r15 + + popq %r14 + + popq %r13 + + popq %r12 + + .byte 0xf3,0xc3 + +L$mul_bdw: + + + + + + + + + + movq %rdx,%rcx + xorq %rax,%rax + + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + + pushq %rbx + + + pushq %rbp + + + subq $96,%rsp + + + addq 32(%rdi),%r8 + adcq 40(%rdi),%r9 + adcq 48(%rdi),%r10 + adcq $0x0,%r11 + sbbq $0x0,%rax + movq %r8,0(%rsp) + movq %r9,8(%rsp) + movq %r10,16(%rsp) + movq %r11,24(%rsp) + + + xorq %rbx,%rbx + movq 0(%rsi),%r12 + movq 8(%rsi),%r13 + movq 16(%rsi),%r14 + movq 24(%rsi),%r15 + addq 
32(%rsi),%r12 + adcq 40(%rsi),%r13 + adcq 48(%rsi),%r14 + adcq $0x0,%r15 + sbbq $0x0,%rbx + movq %r12,32(%rsp) + movq %r13,40(%rsp) + movq %r14,48(%rsp) + movq %r15,56(%rsp) + + + andq %rax,%r12 + andq %rax,%r13 + andq %rax,%r14 + andq %rax,%r15 + + + andq %rbx,%r8 + andq %rbx,%r9 + andq %rbx,%r10 + andq %rbx,%r11 + + + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + movq %r8,64(%rsp) + movq %r9,72(%rsp) + movq %r10,80(%rsp) + movq %r11,88(%rsp) + + + movq 0+0(%rsp),%rdx + mulxq 32+0(%rsp),%r9,%r8 + movq %r9,0+0(%rsp) + mulxq 32+8(%rsp),%r10,%r9 + xorq %rax,%rax + adoxq %r10,%r8 + mulxq 32+16(%rsp),%r11,%r10 + adoxq %r11,%r9 + mulxq 32+24(%rsp),%r12,%r11 + adoxq %r12,%r10 + + movq 0+8(%rsp),%rdx + mulxq 32+0(%rsp),%r12,%r13 + adoxq %rax,%r11 + xorq %rax,%rax + mulxq 32+8(%rsp),%r15,%r14 + adoxq %r8,%r12 + movq %r12,0+8(%rsp) + adcxq %r15,%r13 + mulxq 32+16(%rsp),%rbx,%r15 + adcxq %rbx,%r14 + adoxq %r9,%r13 + mulxq 32+24(%rsp),%rbp,%rbx + adcxq %rbp,%r15 + adcxq %rax,%rbx + adoxq %r10,%r14 + + movq 0+16(%rsp),%rdx + mulxq 32+0(%rsp),%r8,%r9 + adoxq %r11,%r15 + adoxq %rax,%rbx + xorq %rax,%rax + mulxq 32+8(%rsp),%r11,%r10 + adoxq %r13,%r8 + movq %r8,0+16(%rsp) + adcxq %r11,%r9 + mulxq 32+16(%rsp),%r12,%r11 + adcxq %r12,%r10 + adoxq %r14,%r9 + mulxq 32+24(%rsp),%rbp,%r12 + adcxq %rbp,%r11 + adcxq %rax,%r12 + + adoxq %r15,%r10 + adoxq %rbx,%r11 + adoxq %rax,%r12 + + movq 0+24(%rsp),%rdx + mulxq 32+0(%rsp),%r8,%r13 + xorq %rax,%rax + mulxq 32+8(%rsp),%r15,%r14 + adcxq %r15,%r13 + adoxq %r8,%r9 + mulxq 32+16(%rsp),%rbx,%r15 + adcxq %rbx,%r14 + adoxq %r13,%r10 + mulxq 32+24(%rsp),%rbp,%rbx + adcxq %rbp,%r15 + adcxq %rax,%rbx + adoxq %r14,%r11 + adoxq %r15,%r12 + adoxq %rax,%rbx + movq %r9,0+24(%rsp) + movq %r10,0+32(%rsp) + movq %r11,0+40(%rsp) + movq %r12,0+48(%rsp) + movq %rbx,0+56(%rsp) + + + + movq 0+0(%rdi),%rdx + mulxq 0+0(%rsi),%r9,%r8 + movq %r9,0+0(%rcx) + mulxq 0+8(%rsi),%r10,%r9 + xorq %rax,%rax + adoxq %r10,%r8 + mulxq 0+16(%rsi),%r11,%r10 + adoxq %r11,%r9 + mulxq 0+24(%rsi),%r12,%r11 + adoxq %r12,%r10 + + movq 0+8(%rdi),%rdx + mulxq 0+0(%rsi),%r12,%r13 + adoxq %rax,%r11 + xorq %rax,%rax + mulxq 0+8(%rsi),%r15,%r14 + adoxq %r8,%r12 + movq %r12,0+8(%rcx) + adcxq %r15,%r13 + mulxq 0+16(%rsi),%rbx,%r15 + adcxq %rbx,%r14 + adoxq %r9,%r13 + mulxq 0+24(%rsi),%rbp,%rbx + adcxq %rbp,%r15 + adcxq %rax,%rbx + adoxq %r10,%r14 + + movq 0+16(%rdi),%rdx + mulxq 0+0(%rsi),%r8,%r9 + adoxq %r11,%r15 + adoxq %rax,%rbx + xorq %rax,%rax + mulxq 0+8(%rsi),%r11,%r10 + adoxq %r13,%r8 + movq %r8,0+16(%rcx) + adcxq %r11,%r9 + mulxq 0+16(%rsi),%r12,%r11 + adcxq %r12,%r10 + adoxq %r14,%r9 + mulxq 0+24(%rsi),%rbp,%r12 + adcxq %rbp,%r11 + adcxq %rax,%r12 + + adoxq %r15,%r10 + adoxq %rbx,%r11 + adoxq %rax,%r12 + + movq 0+24(%rdi),%rdx + mulxq 0+0(%rsi),%r8,%r13 + xorq %rax,%rax + mulxq 0+8(%rsi),%r15,%r14 + adcxq %r15,%r13 + adoxq %r8,%r9 + mulxq 0+16(%rsi),%rbx,%r15 + adcxq %rbx,%r14 + adoxq %r13,%r10 + mulxq 0+24(%rsi),%rbp,%rbx + adcxq %rbp,%r15 + adcxq %rax,%rbx + adoxq %r14,%r11 + adoxq %r15,%r12 + adoxq %rax,%rbx + movq %r9,0+24(%rcx) + movq %r10,0+32(%rcx) + movq %r11,0+40(%rcx) + movq %r12,0+48(%rcx) + movq %rbx,0+56(%rcx) + + + + movq 32+0(%rdi),%rdx + mulxq 32+0(%rsi),%r9,%r8 + movq %r9,64+0(%rcx) + mulxq 32+8(%rsi),%r10,%r9 + xorq %rax,%rax + adoxq %r10,%r8 + mulxq 32+16(%rsi),%r11,%r10 + adoxq %r11,%r9 + + movq 32+8(%rdi),%rdx + mulxq 32+0(%rsi),%r12,%r11 + adoxq %rax,%r10 + xorq %rax,%rax + + mulxq 32+8(%rsi),%r14,%r13 + adoxq %r8,%r12 + movq %r12,64+8(%rcx) + adcxq %r14,%r11 + + 
mulxq 32+16(%rsi),%r8,%r14 + adoxq %r9,%r11 + adcxq %r8,%r13 + adcxq %rax,%r14 + adoxq %r10,%r13 + + movq 32+16(%rdi),%rdx + mulxq 32+0(%rsi),%r8,%r9 + adoxq %rax,%r14 + xorq %rax,%rax + + mulxq 32+8(%rsi),%r10,%r12 + adoxq %r11,%r8 + movq %r8,64+16(%rcx) + adcxq %r13,%r9 + + mulxq 32+16(%rsi),%r11,%r8 + adcxq %r14,%r12 + adcxq %rax,%r8 + adoxq %r10,%r9 + adoxq %r12,%r11 + adoxq %rax,%r8 + movq %r9,64+24(%rcx) + movq %r11,64+32(%rcx) + movq %r8,64+40(%rcx) + + + + + movq 64(%rsp),%r8 + movq 72(%rsp),%r9 + movq 80(%rsp),%r10 + movq 88(%rsp),%r11 + + movq 32(%rsp),%rax + addq %rax,%r8 + movq 40(%rsp),%rax + adcq %rax,%r9 + movq 48(%rsp),%rax + adcq %rax,%r10 + movq 56(%rsp),%rax + adcq %rax,%r11 + + + movq 0(%rsp),%r12 + movq 8(%rsp),%r13 + movq 16(%rsp),%r14 + movq 24(%rsp),%r15 + subq 0(%rcx),%r12 + sbbq 8(%rcx),%r13 + sbbq 16(%rcx),%r14 + sbbq 24(%rcx),%r15 + sbbq 32(%rcx),%r8 + sbbq 40(%rcx),%r9 + sbbq 48(%rcx),%r10 + sbbq 56(%rcx),%r11 + + + subq 64(%rcx),%r12 + sbbq 72(%rcx),%r13 + sbbq 80(%rcx),%r14 + sbbq 88(%rcx),%r15 + sbbq 96(%rcx),%r8 + sbbq 104(%rcx),%r9 + sbbq $0x0,%r10 + sbbq $0x0,%r11 + + addq 32(%rcx),%r12 + movq %r12,32(%rcx) + adcq 40(%rcx),%r13 + movq %r13,40(%rcx) + adcq 48(%rcx),%r14 + movq %r14,48(%rcx) + adcq 56(%rcx),%r15 + movq %r15,56(%rcx) + adcq 64(%rcx),%r8 + movq %r8,64(%rcx) + adcq 72(%rcx),%r9 + movq %r9,72(%rcx) + adcq 80(%rcx),%r10 + movq %r10,80(%rcx) + adcq 88(%rcx),%r11 + movq %r11,88(%rcx) + movq 96(%rcx),%r12 + adcq $0x0,%r12 + movq %r12,96(%rcx) + movq 104(%rcx),%r13 + adcq $0x0,%r13 + movq %r13,104(%rcx) + + addq $96,%rsp + + popq %rbp + + + popq %rbx + + + + + popq %r15 + + + popq %r14 + + + popq %r13 + + + popq %r12 + + + .byte 0xf3,0xc3 + + +.globl _sike_mpmul +.private_extern _sike_mpmul + +_sike_mpmul: + + pushq %r12 + + + pushq %r13 + + + pushq %r14 + + + pushq %r15 + + + + + + leaq _OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx + cmpl $0x80100,%ecx + je L$mul_bdw + + + + movq %rdx,%rcx + + subq $112,%rsp + + + + xorq %rax,%rax + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + xorq %r11,%r11 + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + + sbbq $0,%rax + movq %rax,64(%rsp) + + movq %r8,0(%rcx) + movq %r9,8(%rcx) + movq %r10,16(%rcx) + movq %r11,24(%rcx) + + + xorq %rdx,%rdx + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + xorq %r15,%r15 + addq 0(%rsi),%r12 + adcq 8(%rsi),%r13 + adcq 16(%rsi),%r14 + adcq 24(%rsi),%r15 + sbbq $0x0,%rdx + + movq %rdx,72(%rsp) + + + movq (%rcx),%rax + mulq %r12 + movq %rax,(%rsp) + movq %rdx,%r8 + + xorq %r9,%r9 + movq (%rcx),%rax + mulq %r13 + addq %rax,%r8 + adcq %rdx,%r9 + + xorq %r10,%r10 + movq 8(%rcx),%rax + mulq %r12 + addq %rax,%r8 + movq %r8,8(%rsp) + adcq %rdx,%r9 + adcq $0x0,%r10 + + xorq %r8,%r8 + movq (%rcx),%rax + mulq %r14 + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq 16(%rcx),%rax + mulq %r12 + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq 8(%rcx),%rax + mulq %r13 + addq %rax,%r9 + movq %r9,16(%rsp) + adcq %rdx,%r10 + adcq $0x0,%r8 + + xorq %r9,%r9 + movq (%rcx),%rax + mulq %r15 + addq %rax,%r10 + adcq %rdx,%r8 + adcq $0x0,%r9 + + movq 24(%rcx),%rax + mulq %r12 + addq %rax,%r10 + adcq %rdx,%r8 + adcq $0x0,%r9 + + movq 8(%rcx),%rax + mulq %r14 + addq %rax,%r10 + adcq %rdx,%r8 + adcq $0x0,%r9 + + movq 16(%rcx),%rax + mulq %r13 + addq %rax,%r10 + movq %r10,24(%rsp) + adcq %rdx,%r8 + adcq $0x0,%r9 + + xorq %r10,%r10 + movq 8(%rcx),%rax + mulq %r15 + addq %rax,%r8 + adcq %rdx,%r9 + adcq $0x0,%r10 
+ + movq 24(%rcx),%rax + mulq %r13 + addq %rax,%r8 + adcq %rdx,%r9 + adcq $0x0,%r10 + + movq 16(%rcx),%rax + mulq %r14 + addq %rax,%r8 + movq %r8,32(%rsp) + adcq %rdx,%r9 + adcq $0x0,%r10 + + xorq %r11,%r11 + movq 16(%rcx),%rax + mulq %r15 + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r11 + + movq 24(%rcx),%rax + mulq %r14 + addq %rax,%r9 + movq %r9,40(%rsp) + adcq %rdx,%r10 + adcq $0x0,%r11 + + movq 24(%rcx),%rax + mulq %r15 + addq %rax,%r10 + movq %r10,48(%rsp) + adcq %rdx,%r11 + movq %r11,56(%rsp) + + + movq 64(%rsp),%rax + andq %rax,%r12 + andq %rax,%r13 + andq %rax,%r14 + andq %rax,%r15 + + + movq 72(%rsp),%rax + movq 0(%rcx),%r8 + andq %rax,%r8 + movq 8(%rcx),%r9 + andq %rax,%r9 + movq 16(%rcx),%r10 + andq %rax,%r10 + movq 24(%rcx),%r11 + andq %rax,%r11 + + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + adcq %r11,%r15 + + + movq 32(%rsp),%rax + addq %rax,%r12 + movq 40(%rsp),%rax + adcq %rax,%r13 + movq 48(%rsp),%rax + adcq %rax,%r14 + movq 56(%rsp),%rax + adcq %rax,%r15 + movq %r12,80(%rsp) + movq %r13,88(%rsp) + movq %r14,96(%rsp) + movq %r15,104(%rsp) + + + movq (%rdi),%r11 + movq (%rsi),%rax + mulq %r11 + xorq %r9,%r9 + movq %rax,(%rcx) + movq %rdx,%r8 + + movq 16(%rdi),%r14 + movq 8(%rsi),%rax + mulq %r11 + xorq %r10,%r10 + addq %rax,%r8 + adcq %rdx,%r9 + + movq 8(%rdi),%r12 + movq (%rsi),%rax + mulq %r12 + addq %rax,%r8 + movq %r8,8(%rcx) + adcq %rdx,%r9 + adcq $0x0,%r10 + + xorq %r8,%r8 + movq 16(%rsi),%rax + mulq %r11 + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq (%rsi),%r13 + movq %r14,%rax + mulq %r13 + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq 8(%rsi),%rax + mulq %r12 + addq %rax,%r9 + movq %r9,16(%rcx) + adcq %rdx,%r10 + adcq $0x0,%r8 + + xorq %r9,%r9 + movq 24(%rsi),%rax + mulq %r11 + movq 24(%rdi),%r15 + addq %rax,%r10 + adcq %rdx,%r8 + adcq $0x0,%r9 + + movq %r15,%rax + mulq %r13 + addq %rax,%r10 + adcq %rdx,%r8 + adcq $0x0,%r9 + + movq 16(%rsi),%rax + mulq %r12 + addq %rax,%r10 + adcq %rdx,%r8 + adcq $0x0,%r9 + + movq 8(%rsi),%rax + mulq %r14 + addq %rax,%r10 + movq %r10,24(%rcx) + adcq %rdx,%r8 + adcq $0x0,%r9 + + xorq %r10,%r10 + movq 24(%rsi),%rax + mulq %r12 + addq %rax,%r8 + adcq %rdx,%r9 + adcq $0x0,%r10 + + movq 8(%rsi),%rax + mulq %r15 + addq %rax,%r8 + adcq %rdx,%r9 + adcq $0x0,%r10 + + movq 16(%rsi),%rax + mulq %r14 + addq %rax,%r8 + movq %r8,32(%rcx) + adcq %rdx,%r9 + adcq $0x0,%r10 + + xorq %r8,%r8 + movq 24(%rsi),%rax + mulq %r14 + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq 16(%rsi),%rax + mulq %r15 + addq %rax,%r9 + movq %r9,40(%rcx) + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq 24(%rsi),%rax + mulq %r15 + addq %rax,%r10 + movq %r10,48(%rcx) + adcq %rdx,%r8 + movq %r8,56(%rcx) + + + + movq 32(%rdi),%r11 + movq 32(%rsi),%rax + mulq %r11 + xorq %r9,%r9 + movq %rax,64(%rcx) + movq %rdx,%r8 + + movq 48(%rdi),%r14 + movq 40(%rsi),%rax + mulq %r11 + xorq %r10,%r10 + addq %rax,%r8 + adcq %rdx,%r9 + + movq 40(%rdi),%r12 + movq 32(%rsi),%rax + mulq %r12 + addq %rax,%r8 + movq %r8,72(%rcx) + adcq %rdx,%r9 + adcq $0x0,%r10 + + xorq %r8,%r8 + movq 48(%rsi),%rax + mulq %r11 + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq 32(%rsi),%r13 + movq %r14,%rax + mulq %r13 + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq 40(%rsi),%rax + mulq %r12 + addq %rax,%r9 + movq %r9,80(%rcx) + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq 48(%rsi),%rax + mulq %r12 + xorq %r12,%r12 + addq %rax,%r10 + adcq %rdx,%r8 + adcq $0x0,%r12 + + movq 40(%rsi),%rax + mulq %r14 + addq %rax,%r10 + adcq %rdx,%r8 + adcq $0x0,%r12 + movq %r10,88(%rcx) + 
+ movq 48(%rsi),%rax + mulq %r14 + addq %rax,%r8 + adcq $0x0,%r12 + movq %r8,96(%rcx) + + addq %r12,%rdx + + + movq 0(%rsp),%r8 + subq 0(%rcx),%r8 + movq 8(%rsp),%r9 + sbbq 8(%rcx),%r9 + movq 16(%rsp),%r10 + sbbq 16(%rcx),%r10 + movq 24(%rsp),%r11 + sbbq 24(%rcx),%r11 + movq 80(%rsp),%r12 + sbbq 32(%rcx),%r12 + movq 88(%rsp),%r13 + sbbq 40(%rcx),%r13 + movq 96(%rsp),%r14 + sbbq 48(%rcx),%r14 + movq 104(%rsp),%r15 + sbbq 56(%rcx),%r15 + + + movq 64(%rcx),%rax + subq %rax,%r8 + movq 72(%rcx),%rax + sbbq %rax,%r9 + movq 80(%rcx),%rax + sbbq %rax,%r10 + movq 88(%rcx),%rax + sbbq %rax,%r11 + movq 96(%rcx),%rax + sbbq %rax,%r12 + sbbq %rdx,%r13 + sbbq $0x0,%r14 + sbbq $0x0,%r15 + + + addq 32(%rcx),%r8 + movq %r8,32(%rcx) + adcq 40(%rcx),%r9 + movq %r9,40(%rcx) + adcq 48(%rcx),%r10 + movq %r10,48(%rcx) + adcq 56(%rcx),%r11 + movq %r11,56(%rcx) + adcq 64(%rcx),%r12 + movq %r12,64(%rcx) + adcq 72(%rcx),%r13 + movq %r13,72(%rcx) + adcq 80(%rcx),%r14 + movq %r14,80(%rcx) + adcq 88(%rcx),%r15 + movq %r15,88(%rcx) + movq 96(%rcx),%r12 + adcq $0x0,%r12 + movq %r12,96(%rcx) + adcq $0x0,%rdx + movq %rdx,104(%rcx) + + addq $112,%rsp + + + + popq %r15 + + popq %r14 + + popq %r13 + + popq %r12 + + .byte 0xf3,0xc3 + +#endif diff --git a/packager/third_party/boringssl/roll_boringssl.py b/packager/third_party/boringssl/roll_boringssl.py index 41c2ed1e50..f1f009c2f7 100755 --- a/packager/third_party/boringssl/roll_boringssl.py +++ b/packager/third_party/boringssl/roll_boringssl.py @@ -30,7 +30,6 @@ GENERATED_FILES = [ 'BUILD.generated.gni', 'BUILD.generated_tests.gni', 'boringssl.gypi', - 'boringssl_tests.gypi', 'err_data.c', ] @@ -91,10 +90,12 @@ def main(): # Clear the old generated files. for (osname, arch, _, _, _) in generate_build_files.OS_ARCH_COMBOS: path = os.path.join(BORINGSSL_PATH, osname + '-' + arch) - shutil.rmtree(path) + if os.path.exists(path): + shutil.rmtree(path) for file in GENERATED_FILES: path = os.path.join(BORINGSSL_PATH, file) - os.unlink(path) + if os.path.exists(path): + os.unlink(path) # Generate new ones. subprocess.check_call(['python', diff --git a/packager/third_party/boringssl/win-x86/crypto/chacha/chacha-x86.asm b/packager/third_party/boringssl/win-x86/crypto/chacha/chacha-x86.asm index 3ba31a2b35..7b59adf1db 100644 --- a/packager/third_party/boringssl/win-x86/crypto/chacha/chacha-x86.asm +++ b/packager/third_party/boringssl/win-x86/crypto/chacha/chacha-x86.asm @@ -1,3 +1,9 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif %ifidn __OUTPUT_FORMAT__,obj section code use32 class=code align=64 %elifidn __OUTPUT_FORMAT__,win32 diff --git a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/aes-586.asm b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/aes-586.asm index 42ca0267e7..c3a47d88f2 100644 --- a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/aes-586.asm +++ b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/aes-586.asm @@ -1,3 +1,9 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif %ifidn __OUTPUT_FORMAT__,obj section code use32 class=code align=64 %elifidn __OUTPUT_FORMAT__,win32 @@ -972,10 +978,10 @@ dd 1,2,4,8 dd 16,32,64,128 dd 27,54,0,0 dd 0,0,0,0 -global _asm_AES_encrypt +global _aes_nohw_encrypt align 16 -_asm_AES_encrypt: -L$_asm_AES_encrypt_begin: +_aes_nohw_encrypt: +L$_aes_nohw_encrypt_begin: push ebp push ebx push esi @@ -2152,10 +2158,10 @@ db 160,224,59,77,174,42,245,176 db 200,235,187,60,131,83,153,97 db 23,43,4,126,186,119,214,38 db 225,105,20,99,85,33,12,125 -global _asm_AES_decrypt +global _aes_nohw_decrypt align 16 -_asm_AES_decrypt: -L$_asm_AES_decrypt_begin: +_aes_nohw_decrypt: +L$_aes_nohw_decrypt_begin: push ebp push ebx push esi @@ -2215,10 +2221,10 @@ L$011x86: pop ebx pop ebp ret -global _asm_AES_cbc_encrypt +global _aes_nohw_cbc_encrypt align 16 -_asm_AES_cbc_encrypt: -L$_asm_AES_cbc_encrypt_begin: +_aes_nohw_cbc_encrypt: +L$_aes_nohw_cbc_encrypt_begin: push ebp push ebx push esi @@ -2974,16 +2980,16 @@ L$045exit: pop ebx pop ebp ret -global _asm_AES_set_encrypt_key +global _aes_nohw_set_encrypt_key align 16 -_asm_AES_set_encrypt_key: -L$_asm_AES_set_encrypt_key_begin: +_aes_nohw_set_encrypt_key: +L$_aes_nohw_set_encrypt_key_begin: call __x86_AES_set_encrypt_key ret -global _asm_AES_set_decrypt_key +global _aes_nohw_set_decrypt_key align 16 -_asm_AES_set_decrypt_key: -L$_asm_AES_set_decrypt_key_begin: +_aes_nohw_set_decrypt_key: +L$_aes_nohw_set_decrypt_key_begin: call __x86_AES_set_encrypt_key cmp eax,0 je NEAR L$054proceed diff --git a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/aesni-x86.asm b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/aesni-x86.asm index a9a595653f..0272fce460 100644 --- a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/aesni-x86.asm +++ b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/aesni-x86.asm @@ -1,3 +1,9 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif %ifidn __OUTPUT_FORMAT__,obj section code use32 class=code align=64 %elifidn __OUTPUT_FORMAT__,win32 @@ -15,10 +21,25 @@ section .text code align=64 section .text code %endif ;extern _OPENSSL_ia32cap_P -global _aesni_encrypt +%ifdef BORINGSSL_DISPATCH_TEST +extern _BORINGSSL_function_hit +%endif +global _aes_hw_encrypt align 16 -_aesni_encrypt: -L$_aesni_encrypt_begin: +_aes_hw_encrypt: +L$_aes_hw_encrypt_begin: +%ifdef BORINGSSL_DISPATCH_TEST + push ebx + push edx + call L$000pic +L$000pic: + pop ebx + lea ebx,[(_BORINGSSL_function_hit+1-L$000pic)+ebx] + mov edx,1 + mov BYTE [ebx],dl + pop edx + pop ebx +%endif mov eax,DWORD [4+esp] mov edx,DWORD [12+esp] movups xmm2,[eax] @@ -28,22 +49,22 @@ L$_aesni_encrypt_begin: movups xmm1,[16+edx] lea edx,[32+edx] xorps xmm2,xmm0 -L$000enc1_loop_1: +L$001enc1_loop_1: db 102,15,56,220,209 dec ecx movups xmm1,[edx] lea edx,[16+edx] - jnz NEAR L$000enc1_loop_1 + jnz NEAR L$001enc1_loop_1 db 102,15,56,221,209 pxor xmm0,xmm0 pxor xmm1,xmm1 movups [eax],xmm2 pxor xmm2,xmm2 ret -global _aesni_decrypt +global _aes_hw_decrypt align 16 -_aesni_decrypt: -L$_aesni_decrypt_begin: +_aes_hw_decrypt: +L$_aes_hw_decrypt_begin: mov eax,DWORD [4+esp] mov edx,DWORD [12+esp] movups xmm2,[eax] @@ -53,12 +74,12 @@ L$_aesni_decrypt_begin: movups xmm1,[16+edx] lea edx,[32+edx] xorps xmm2,xmm0 -L$001dec1_loop_2: +L$002dec1_loop_2: db 102,15,56,222,209 dec ecx movups xmm1,[edx] lea edx,[16+edx] - jnz NEAR L$001dec1_loop_2 + jnz NEAR L$002dec1_loop_2 db 102,15,56,223,209 pxor xmm0,xmm0 pxor xmm1,xmm1 @@ -76,7 +97,7 @@ __aesni_encrypt2: lea edx,[32+ecx*1+edx] neg ecx add ecx,16 -L$002enc2_loop: +L$003enc2_loop: db 102,15,56,220,209 db 102,15,56,220,217 movups xmm1,[ecx*1+edx] @@ -84,7 +105,7 @@ db 102,15,56,220,217 db 102,15,56,220,208 db 102,15,56,220,216 movups xmm0,[ecx*1+edx-16] - jnz NEAR L$002enc2_loop + jnz NEAR L$003enc2_loop db 102,15,56,220,209 db 102,15,56,220,217 db 102,15,56,221,208 @@ -101,7 +122,7 @@ __aesni_decrypt2: lea edx,[32+ecx*1+edx] neg ecx add ecx,16 -L$003dec2_loop: +L$004dec2_loop: db 102,15,56,222,209 db 102,15,56,222,217 movups xmm1,[ecx*1+edx] @@ -109,7 +130,7 @@ db 102,15,56,222,217 db 102,15,56,222,208 db 102,15,56,222,216 movups xmm0,[ecx*1+edx-16] - jnz NEAR L$003dec2_loop + jnz NEAR L$004dec2_loop db 102,15,56,222,209 db 102,15,56,222,217 db 102,15,56,223,208 @@ -127,7 +148,7 @@ __aesni_encrypt3: lea edx,[32+ecx*1+edx] neg ecx add ecx,16 -L$004enc3_loop: +L$005enc3_loop: db 102,15,56,220,209 db 102,15,56,220,217 db 102,15,56,220,225 @@ -137,7 +158,7 @@ db 102,15,56,220,208 db 102,15,56,220,216 db 102,15,56,220,224 movups xmm0,[ecx*1+edx-16] - jnz NEAR L$004enc3_loop + jnz NEAR L$005enc3_loop db 102,15,56,220,209 db 102,15,56,220,217 db 102,15,56,220,225 @@ -157,7 +178,7 @@ __aesni_decrypt3: lea edx,[32+ecx*1+edx] neg ecx add ecx,16 -L$005dec3_loop: +L$006dec3_loop: db 102,15,56,222,209 db 102,15,56,222,217 db 102,15,56,222,225 @@ -167,7 +188,7 @@ db 102,15,56,222,208 db 102,15,56,222,216 db 102,15,56,222,224 movups xmm0,[ecx*1+edx-16] - jnz NEAR L$005dec3_loop + jnz NEAR L$006dec3_loop db 102,15,56,222,209 db 102,15,56,222,217 db 102,15,56,222,225 @@ -189,7 +210,7 @@ __aesni_encrypt4: neg ecx db 15,31,64,0 add ecx,16 -L$006enc4_loop: +L$007enc4_loop: db 102,15,56,220,209 db 102,15,56,220,217 db 102,15,56,220,225 @@ -201,7 +222,7 @@ db 102,15,56,220,216 db 102,15,56,220,224 db 102,15,56,220,232 movups xmm0,[ecx*1+edx-16] - jnz NEAR L$006enc4_loop + jnz NEAR 
L$007enc4_loop db 102,15,56,220,209 db 102,15,56,220,217 db 102,15,56,220,225 @@ -225,7 +246,7 @@ __aesni_decrypt4: neg ecx db 15,31,64,0 add ecx,16 -L$007dec4_loop: +L$008dec4_loop: db 102,15,56,222,209 db 102,15,56,222,217 db 102,15,56,222,225 @@ -237,7 +258,7 @@ db 102,15,56,222,216 db 102,15,56,222,224 db 102,15,56,222,232 movups xmm0,[ecx*1+edx-16] - jnz NEAR L$007dec4_loop + jnz NEAR L$008dec4_loop db 102,15,56,222,209 db 102,15,56,222,217 db 102,15,56,222,225 @@ -265,13 +286,13 @@ db 102,15,56,220,225 pxor xmm7,xmm0 movups xmm0,[ecx*1+edx] add ecx,16 - jmp NEAR L$008_aesni_encrypt6_inner + jmp NEAR L$009_aesni_encrypt6_inner align 16 -L$009enc6_loop: +L$010enc6_loop: db 102,15,56,220,209 db 102,15,56,220,217 db 102,15,56,220,225 -L$008_aesni_encrypt6_inner: +L$009_aesni_encrypt6_inner: db 102,15,56,220,233 db 102,15,56,220,241 db 102,15,56,220,249 @@ -285,7 +306,7 @@ db 102,15,56,220,232 db 102,15,56,220,240 db 102,15,56,220,248 movups xmm0,[ecx*1+edx-16] - jnz NEAR L$009enc6_loop + jnz NEAR L$010enc6_loop db 102,15,56,220,209 db 102,15,56,220,217 db 102,15,56,220,225 @@ -317,13 +338,13 @@ db 102,15,56,222,225 pxor xmm7,xmm0 movups xmm0,[ecx*1+edx] add ecx,16 - jmp NEAR L$010_aesni_decrypt6_inner + jmp NEAR L$011_aesni_decrypt6_inner align 16 -L$011dec6_loop: +L$012dec6_loop: db 102,15,56,222,209 db 102,15,56,222,217 db 102,15,56,222,225 -L$010_aesni_decrypt6_inner: +L$011_aesni_decrypt6_inner: db 102,15,56,222,233 db 102,15,56,222,241 db 102,15,56,222,249 @@ -337,7 +358,7 @@ db 102,15,56,222,232 db 102,15,56,222,240 db 102,15,56,222,248 movups xmm0,[ecx*1+edx-16] - jnz NEAR L$011dec6_loop + jnz NEAR L$012dec6_loop db 102,15,56,222,209 db 102,15,56,222,217 db 102,15,56,222,225 @@ -351,10 +372,10 @@ db 102,15,56,223,232 db 102,15,56,223,240 db 102,15,56,223,248 ret -global _aesni_ecb_encrypt +global _aes_hw_ecb_encrypt align 16 -_aesni_ecb_encrypt: -L$_aesni_ecb_encrypt_begin: +_aes_hw_ecb_encrypt: +L$_aes_hw_ecb_encrypt_begin: push ebp push ebx push esi @@ -365,14 +386,14 @@ L$_aesni_ecb_encrypt_begin: mov edx,DWORD [32+esp] mov ebx,DWORD [36+esp] and eax,-16 - jz NEAR L$012ecb_ret + jz NEAR L$013ecb_ret mov ecx,DWORD [240+edx] test ebx,ebx - jz NEAR L$013ecb_decrypt + jz NEAR L$014ecb_decrypt mov ebp,edx mov ebx,ecx cmp eax,96 - jb NEAR L$014ecb_enc_tail + jb NEAR L$015ecb_enc_tail movdqu xmm2,[esi] movdqu xmm3,[16+esi] movdqu xmm4,[32+esi] @@ -381,9 +402,9 @@ L$_aesni_ecb_encrypt_begin: movdqu xmm7,[80+esi] lea esi,[96+esi] sub eax,96 - jmp NEAR L$015ecb_enc_loop6_enter + jmp NEAR L$016ecb_enc_loop6_enter align 16 -L$016ecb_enc_loop6: +L$017ecb_enc_loop6: movups [edi],xmm2 movdqu xmm2,[esi] movups [16+edi],xmm3 @@ -398,12 +419,12 @@ L$016ecb_enc_loop6: lea edi,[96+edi] movdqu xmm7,[80+esi] lea esi,[96+esi] -L$015ecb_enc_loop6_enter: +L$016ecb_enc_loop6_enter: call __aesni_encrypt6 mov edx,ebp mov ecx,ebx sub eax,96 - jnc NEAR L$016ecb_enc_loop6 + jnc NEAR L$017ecb_enc_loop6 movups [edi],xmm2 movups [16+edi],xmm3 movups [32+edi],xmm4 @@ -412,18 +433,18 @@ L$015ecb_enc_loop6_enter: movups [80+edi],xmm7 lea edi,[96+edi] add eax,96 - jz NEAR L$012ecb_ret -L$014ecb_enc_tail: + jz NEAR L$013ecb_ret +L$015ecb_enc_tail: movups xmm2,[esi] cmp eax,32 - jb NEAR L$017ecb_enc_one + jb NEAR L$018ecb_enc_one movups xmm3,[16+esi] - je NEAR L$018ecb_enc_two + je NEAR L$019ecb_enc_two movups xmm4,[32+esi] cmp eax,64 - jb NEAR L$019ecb_enc_three + jb NEAR L$020ecb_enc_three movups xmm5,[48+esi] - je NEAR L$020ecb_enc_four + je NEAR L$021ecb_enc_four movups xmm6,[64+esi] xorps xmm7,xmm7 call 
__aesni_encrypt6 @@ -432,49 +453,49 @@ L$014ecb_enc_tail: movups [32+edi],xmm4 movups [48+edi],xmm5 movups [64+edi],xmm6 - jmp NEAR L$012ecb_ret + jmp NEAR L$013ecb_ret align 16 -L$017ecb_enc_one: +L$018ecb_enc_one: movups xmm0,[edx] movups xmm1,[16+edx] lea edx,[32+edx] xorps xmm2,xmm0 -L$021enc1_loop_3: +L$022enc1_loop_3: db 102,15,56,220,209 dec ecx movups xmm1,[edx] lea edx,[16+edx] - jnz NEAR L$021enc1_loop_3 + jnz NEAR L$022enc1_loop_3 db 102,15,56,221,209 movups [edi],xmm2 - jmp NEAR L$012ecb_ret + jmp NEAR L$013ecb_ret align 16 -L$018ecb_enc_two: +L$019ecb_enc_two: call __aesni_encrypt2 movups [edi],xmm2 movups [16+edi],xmm3 - jmp NEAR L$012ecb_ret + jmp NEAR L$013ecb_ret align 16 -L$019ecb_enc_three: +L$020ecb_enc_three: call __aesni_encrypt3 movups [edi],xmm2 movups [16+edi],xmm3 movups [32+edi],xmm4 - jmp NEAR L$012ecb_ret + jmp NEAR L$013ecb_ret align 16 -L$020ecb_enc_four: +L$021ecb_enc_four: call __aesni_encrypt4 movups [edi],xmm2 movups [16+edi],xmm3 movups [32+edi],xmm4 movups [48+edi],xmm5 - jmp NEAR L$012ecb_ret + jmp NEAR L$013ecb_ret align 16 -L$013ecb_decrypt: +L$014ecb_decrypt: mov ebp,edx mov ebx,ecx cmp eax,96 - jb NEAR L$022ecb_dec_tail + jb NEAR L$023ecb_dec_tail movdqu xmm2,[esi] movdqu xmm3,[16+esi] movdqu xmm4,[32+esi] @@ -483,9 +504,9 @@ L$013ecb_decrypt: movdqu xmm7,[80+esi] lea esi,[96+esi] sub eax,96 - jmp NEAR L$023ecb_dec_loop6_enter + jmp NEAR L$024ecb_dec_loop6_enter align 16 -L$024ecb_dec_loop6: +L$025ecb_dec_loop6: movups [edi],xmm2 movdqu xmm2,[esi] movups [16+edi],xmm3 @@ -500,12 +521,12 @@ L$024ecb_dec_loop6: lea edi,[96+edi] movdqu xmm7,[80+esi] lea esi,[96+esi] -L$023ecb_dec_loop6_enter: +L$024ecb_dec_loop6_enter: call __aesni_decrypt6 mov edx,ebp mov ecx,ebx sub eax,96 - jnc NEAR L$024ecb_dec_loop6 + jnc NEAR L$025ecb_dec_loop6 movups [edi],xmm2 movups [16+edi],xmm3 movups [32+edi],xmm4 @@ -514,18 +535,18 @@ L$023ecb_dec_loop6_enter: movups [80+edi],xmm7 lea edi,[96+edi] add eax,96 - jz NEAR L$012ecb_ret -L$022ecb_dec_tail: + jz NEAR L$013ecb_ret +L$023ecb_dec_tail: movups xmm2,[esi] cmp eax,32 - jb NEAR L$025ecb_dec_one + jb NEAR L$026ecb_dec_one movups xmm3,[16+esi] - je NEAR L$026ecb_dec_two + je NEAR L$027ecb_dec_two movups xmm4,[32+esi] cmp eax,64 - jb NEAR L$027ecb_dec_three + jb NEAR L$028ecb_dec_three movups xmm5,[48+esi] - je NEAR L$028ecb_dec_four + je NEAR L$029ecb_dec_four movups xmm6,[64+esi] xorps xmm7,xmm7 call __aesni_decrypt6 @@ -534,43 +555,43 @@ L$022ecb_dec_tail: movups [32+edi],xmm4 movups [48+edi],xmm5 movups [64+edi],xmm6 - jmp NEAR L$012ecb_ret + jmp NEAR L$013ecb_ret align 16 -L$025ecb_dec_one: +L$026ecb_dec_one: movups xmm0,[edx] movups xmm1,[16+edx] lea edx,[32+edx] xorps xmm2,xmm0 -L$029dec1_loop_4: +L$030dec1_loop_4: db 102,15,56,222,209 dec ecx movups xmm1,[edx] lea edx,[16+edx] - jnz NEAR L$029dec1_loop_4 + jnz NEAR L$030dec1_loop_4 db 102,15,56,223,209 movups [edi],xmm2 - jmp NEAR L$012ecb_ret + jmp NEAR L$013ecb_ret align 16 -L$026ecb_dec_two: +L$027ecb_dec_two: call __aesni_decrypt2 movups [edi],xmm2 movups [16+edi],xmm3 - jmp NEAR L$012ecb_ret + jmp NEAR L$013ecb_ret align 16 -L$027ecb_dec_three: +L$028ecb_dec_three: call __aesni_decrypt3 movups [edi],xmm2 movups [16+edi],xmm3 movups [32+edi],xmm4 - jmp NEAR L$012ecb_ret + jmp NEAR L$013ecb_ret align 16 -L$028ecb_dec_four: +L$029ecb_dec_four: call __aesni_decrypt4 movups [edi],xmm2 movups [16+edi],xmm3 movups [32+edi],xmm4 movups [48+edi],xmm5 -L$012ecb_ret: +L$013ecb_ret: pxor xmm0,xmm0 pxor xmm1,xmm1 pxor xmm2,xmm2 @@ -584,10 +605,10 @@ L$012ecb_ret: pop ebx 
pop ebp ret -global _aesni_ccm64_encrypt_blocks +global _aes_hw_ccm64_encrypt_blocks align 16 -_aesni_ccm64_encrypt_blocks: -L$_aesni_ccm64_encrypt_blocks_begin: +_aes_hw_ccm64_encrypt_blocks: +L$_aes_hw_ccm64_encrypt_blocks_begin: push ebp push ebx push esi @@ -623,7 +644,7 @@ L$_aesni_ccm64_encrypt_blocks_begin: lea edx,[32+ecx*1+edx] sub ebx,ecx db 102,15,56,0,253 -L$030ccm64_enc_outer: +L$031ccm64_enc_outer: movups xmm0,[ebp] mov ecx,ebx movups xmm6,[esi] @@ -632,7 +653,7 @@ L$030ccm64_enc_outer: xorps xmm0,xmm6 xorps xmm3,xmm0 movups xmm0,[32+ebp] -L$031ccm64_enc2_loop: +L$032ccm64_enc2_loop: db 102,15,56,220,209 db 102,15,56,220,217 movups xmm1,[ecx*1+edx] @@ -640,7 +661,7 @@ db 102,15,56,220,217 db 102,15,56,220,208 db 102,15,56,220,216 movups xmm0,[ecx*1+edx-16] - jnz NEAR L$031ccm64_enc2_loop + jnz NEAR L$032ccm64_enc2_loop db 102,15,56,220,209 db 102,15,56,220,217 paddq xmm7,[16+esp] @@ -653,7 +674,7 @@ db 102,15,56,221,216 movups [edi],xmm6 db 102,15,56,0,213 lea edi,[16+edi] - jnz NEAR L$030ccm64_enc_outer + jnz NEAR L$031ccm64_enc_outer mov esp,DWORD [48+esp] mov edi,DWORD [40+esp] movups [edi],xmm3 @@ -670,10 +691,10 @@ db 102,15,56,0,213 pop ebx pop ebp ret -global _aesni_ccm64_decrypt_blocks +global _aes_hw_ccm64_decrypt_blocks align 16 -_aesni_ccm64_decrypt_blocks: -L$_aesni_ccm64_decrypt_blocks_begin: +_aes_hw_ccm64_decrypt_blocks: +L$_aes_hw_ccm64_decrypt_blocks_begin: push ebp push ebx push esi @@ -710,12 +731,12 @@ db 102,15,56,0,253 movups xmm1,[16+edx] lea edx,[32+edx] xorps xmm2,xmm0 -L$032enc1_loop_5: +L$033enc1_loop_5: db 102,15,56,220,209 dec ecx movups xmm1,[edx] lea edx,[16+edx] - jnz NEAR L$032enc1_loop_5 + jnz NEAR L$033enc1_loop_5 db 102,15,56,221,209 shl ebx,4 mov ecx,16 @@ -725,16 +746,16 @@ db 102,15,56,221,209 sub ecx,ebx lea edx,[32+ebx*1+ebp] mov ebx,ecx - jmp NEAR L$033ccm64_dec_outer + jmp NEAR L$034ccm64_dec_outer align 16 -L$033ccm64_dec_outer: +L$034ccm64_dec_outer: xorps xmm6,xmm2 movdqa xmm2,xmm7 movups [edi],xmm6 lea edi,[16+edi] db 102,15,56,0,213 sub eax,1 - jz NEAR L$034ccm64_dec_break + jz NEAR L$035ccm64_dec_break movups xmm0,[ebp] mov ecx,ebx movups xmm1,[16+ebp] @@ -742,7 +763,7 @@ db 102,15,56,0,213 xorps xmm2,xmm0 xorps xmm3,xmm6 movups xmm0,[32+ebp] -L$035ccm64_dec2_loop: +L$036ccm64_dec2_loop: db 102,15,56,220,209 db 102,15,56,220,217 movups xmm1,[ecx*1+edx] @@ -750,7 +771,7 @@ db 102,15,56,220,217 db 102,15,56,220,208 db 102,15,56,220,216 movups xmm0,[ecx*1+edx-16] - jnz NEAR L$035ccm64_dec2_loop + jnz NEAR L$036ccm64_dec2_loop movups xmm6,[esi] paddq xmm7,[16+esp] db 102,15,56,220,209 @@ -758,9 +779,9 @@ db 102,15,56,220,217 db 102,15,56,221,208 db 102,15,56,221,216 lea esi,[16+esi] - jmp NEAR L$033ccm64_dec_outer + jmp NEAR L$034ccm64_dec_outer align 16 -L$034ccm64_dec_break: +L$035ccm64_dec_break: mov ecx,DWORD [240+ebp] mov edx,ebp movups xmm0,[edx] @@ -768,12 +789,12 @@ L$034ccm64_dec_break: xorps xmm6,xmm0 lea edx,[32+edx] xorps xmm3,xmm6 -L$036enc1_loop_6: +L$037enc1_loop_6: db 102,15,56,220,217 dec ecx movups xmm1,[edx] lea edx,[16+edx] - jnz NEAR L$036enc1_loop_6 + jnz NEAR L$037enc1_loop_6 db 102,15,56,221,217 mov esp,DWORD [48+esp] mov edi,DWORD [40+esp] @@ -791,14 +812,26 @@ db 102,15,56,221,217 pop ebx pop ebp ret -global _aesni_ctr32_encrypt_blocks +global _aes_hw_ctr32_encrypt_blocks align 16 -_aesni_ctr32_encrypt_blocks: -L$_aesni_ctr32_encrypt_blocks_begin: +_aes_hw_ctr32_encrypt_blocks: +L$_aes_hw_ctr32_encrypt_blocks_begin: push ebp push ebx push esi push edi +%ifdef BORINGSSL_DISPATCH_TEST + push ebx + push edx + 
call L$038pic +L$038pic: + pop ebx + lea ebx,[(_BORINGSSL_function_hit+0-L$038pic)+ebx] + mov edx,1 + mov BYTE [ebx],dl + pop edx + pop ebx +%endif mov esi,DWORD [20+esp] mov edi,DWORD [24+esp] mov eax,DWORD [28+esp] @@ -809,7 +842,7 @@ L$_aesni_ctr32_encrypt_blocks_begin: and esp,-16 mov DWORD [80+esp],ebp cmp eax,1 - je NEAR L$037ctr32_one_shortcut + je NEAR L$039ctr32_one_shortcut movdqu xmm7,[ebx] mov DWORD [esp],202182159 mov DWORD [4+esp],134810123 @@ -847,7 +880,7 @@ db 102,15,56,0,202 pshufd xmm2,xmm0,192 pshufd xmm3,xmm0,128 cmp eax,6 - jb NEAR L$038ctr32_tail + jb NEAR L$040ctr32_tail pxor xmm7,xmm6 shl ecx,4 mov ebx,16 @@ -856,9 +889,9 @@ db 102,15,56,0,202 sub ebx,ecx lea edx,[32+ecx*1+edx] sub eax,6 - jmp NEAR L$039ctr32_loop6 + jmp NEAR L$041ctr32_loop6 align 16 -L$039ctr32_loop6: +L$041ctr32_loop6: pshufd xmm4,xmm0,64 movdqa xmm0,[32+esp] pshufd xmm5,xmm1,192 @@ -912,27 +945,27 @@ db 102,15,56,0,202 lea edi,[96+edi] pshufd xmm3,xmm0,128 sub eax,6 - jnc NEAR L$039ctr32_loop6 + jnc NEAR L$041ctr32_loop6 add eax,6 - jz NEAR L$040ctr32_ret + jz NEAR L$042ctr32_ret movdqu xmm7,[ebp] mov edx,ebp pxor xmm7,[32+esp] mov ecx,DWORD [240+ebp] -L$038ctr32_tail: +L$040ctr32_tail: por xmm2,xmm7 cmp eax,2 - jb NEAR L$041ctr32_one + jb NEAR L$043ctr32_one pshufd xmm4,xmm0,64 por xmm3,xmm7 - je NEAR L$042ctr32_two + je NEAR L$044ctr32_two pshufd xmm5,xmm1,192 por xmm4,xmm7 cmp eax,4 - jb NEAR L$043ctr32_three + jb NEAR L$045ctr32_three pshufd xmm6,xmm1,128 por xmm5,xmm7 - je NEAR L$044ctr32_four + je NEAR L$046ctr32_four por xmm6,xmm7 call __aesni_encrypt6 movups xmm1,[esi] @@ -950,29 +983,29 @@ L$038ctr32_tail: movups [32+edi],xmm4 movups [48+edi],xmm5 movups [64+edi],xmm6 - jmp NEAR L$040ctr32_ret + jmp NEAR L$042ctr32_ret align 16 -L$037ctr32_one_shortcut: +L$039ctr32_one_shortcut: movups xmm2,[ebx] mov ecx,DWORD [240+edx] -L$041ctr32_one: +L$043ctr32_one: movups xmm0,[edx] movups xmm1,[16+edx] lea edx,[32+edx] xorps xmm2,xmm0 -L$045enc1_loop_7: +L$047enc1_loop_7: db 102,15,56,220,209 dec ecx movups xmm1,[edx] lea edx,[16+edx] - jnz NEAR L$045enc1_loop_7 + jnz NEAR L$047enc1_loop_7 db 102,15,56,221,209 movups xmm6,[esi] xorps xmm6,xmm2 movups [edi],xmm6 - jmp NEAR L$040ctr32_ret + jmp NEAR L$042ctr32_ret align 16 -L$042ctr32_two: +L$044ctr32_two: call __aesni_encrypt2 movups xmm5,[esi] movups xmm6,[16+esi] @@ -980,9 +1013,9 @@ L$042ctr32_two: xorps xmm3,xmm6 movups [edi],xmm2 movups [16+edi],xmm3 - jmp NEAR L$040ctr32_ret + jmp NEAR L$042ctr32_ret align 16 -L$043ctr32_three: +L$045ctr32_three: call __aesni_encrypt3 movups xmm5,[esi] movups xmm6,[16+esi] @@ -993,9 +1026,9 @@ L$043ctr32_three: xorps xmm4,xmm7 movups [16+edi],xmm3 movups [32+edi],xmm4 - jmp NEAR L$040ctr32_ret + jmp NEAR L$042ctr32_ret align 16 -L$044ctr32_four: +L$046ctr32_four: call __aesni_encrypt4 movups xmm6,[esi] movups xmm7,[16+esi] @@ -1009,7 +1042,7 @@ L$044ctr32_four: xorps xmm5,xmm0 movups [32+edi],xmm4 movups [48+edi],xmm5 -L$040ctr32_ret: +L$042ctr32_ret: pxor xmm0,xmm0 pxor xmm1,xmm1 pxor xmm2,xmm2 @@ -1027,10 +1060,10 @@ L$040ctr32_ret: pop ebx pop ebp ret -global _aesni_xts_encrypt +global _aes_hw_xts_encrypt align 16 -_aesni_xts_encrypt: -L$_aesni_xts_encrypt_begin: +_aes_hw_xts_encrypt: +L$_aes_hw_xts_encrypt_begin: push ebp push ebx push esi @@ -1043,12 +1076,12 @@ L$_aesni_xts_encrypt_begin: movups xmm1,[16+edx] lea edx,[32+edx] xorps xmm2,xmm0 -L$046enc1_loop_8: +L$048enc1_loop_8: db 102,15,56,220,209 dec ecx movups xmm1,[edx] lea edx,[16+edx] - jnz NEAR L$046enc1_loop_8 + jnz NEAR L$048enc1_loop_8 db 
102,15,56,221,209 mov esi,DWORD [20+esp] mov edi,DWORD [24+esp] @@ -1072,14 +1105,14 @@ db 102,15,56,221,209 mov ebp,edx mov ebx,ecx sub eax,96 - jc NEAR L$047xts_enc_short + jc NEAR L$049xts_enc_short shl ecx,4 mov ebx,16 sub ebx,ecx lea edx,[32+ecx*1+edx] - jmp NEAR L$048xts_enc_loop6 + jmp NEAR L$050xts_enc_loop6 align 16 -L$048xts_enc_loop6: +L$050xts_enc_loop6: pshufd xmm2,xmm0,19 pxor xmm0,xmm0 movdqa [esp],xmm1 @@ -1168,23 +1201,23 @@ db 102,15,56,220,249 pcmpgtd xmm0,xmm1 pxor xmm1,xmm2 sub eax,96 - jnc NEAR L$048xts_enc_loop6 + jnc NEAR L$050xts_enc_loop6 mov ecx,DWORD [240+ebp] mov edx,ebp mov ebx,ecx -L$047xts_enc_short: +L$049xts_enc_short: add eax,96 - jz NEAR L$049xts_enc_done6x + jz NEAR L$051xts_enc_done6x movdqa xmm5,xmm1 cmp eax,32 - jb NEAR L$050xts_enc_one + jb NEAR L$052xts_enc_one pshufd xmm2,xmm0,19 pxor xmm0,xmm0 paddq xmm1,xmm1 pand xmm2,xmm3 pcmpgtd xmm0,xmm1 pxor xmm1,xmm2 - je NEAR L$051xts_enc_two + je NEAR L$053xts_enc_two pshufd xmm2,xmm0,19 pxor xmm0,xmm0 movdqa xmm6,xmm1 @@ -1193,7 +1226,7 @@ L$047xts_enc_short: pcmpgtd xmm0,xmm1 pxor xmm1,xmm2 cmp eax,64 - jb NEAR L$052xts_enc_three + jb NEAR L$054xts_enc_three pshufd xmm2,xmm0,19 pxor xmm0,xmm0 movdqa xmm7,xmm1 @@ -1203,7 +1236,7 @@ L$047xts_enc_short: pxor xmm1,xmm2 movdqa [esp],xmm5 movdqa [16+esp],xmm6 - je NEAR L$053xts_enc_four + je NEAR L$055xts_enc_four movdqa [32+esp],xmm7 pshufd xmm7,xmm0,19 movdqa [48+esp],xmm1 @@ -1235,9 +1268,9 @@ L$047xts_enc_short: movups [48+edi],xmm5 movups [64+edi],xmm6 lea edi,[80+edi] - jmp NEAR L$054xts_enc_done + jmp NEAR L$056xts_enc_done align 16 -L$050xts_enc_one: +L$052xts_enc_one: movups xmm2,[esi] lea esi,[16+esi] xorps xmm2,xmm5 @@ -1245,20 +1278,20 @@ L$050xts_enc_one: movups xmm1,[16+edx] lea edx,[32+edx] xorps xmm2,xmm0 -L$055enc1_loop_9: +L$057enc1_loop_9: db 102,15,56,220,209 dec ecx movups xmm1,[edx] lea edx,[16+edx] - jnz NEAR L$055enc1_loop_9 + jnz NEAR L$057enc1_loop_9 db 102,15,56,221,209 xorps xmm2,xmm5 movups [edi],xmm2 lea edi,[16+edi] movdqa xmm1,xmm5 - jmp NEAR L$054xts_enc_done + jmp NEAR L$056xts_enc_done align 16 -L$051xts_enc_two: +L$053xts_enc_two: movaps xmm6,xmm1 movups xmm2,[esi] movups xmm3,[16+esi] @@ -1272,9 +1305,9 @@ L$051xts_enc_two: movups [16+edi],xmm3 lea edi,[32+edi] movdqa xmm1,xmm6 - jmp NEAR L$054xts_enc_done + jmp NEAR L$056xts_enc_done align 16 -L$052xts_enc_three: +L$054xts_enc_three: movaps xmm7,xmm1 movups xmm2,[esi] movups xmm3,[16+esi] @@ -1292,9 +1325,9 @@ L$052xts_enc_three: movups [32+edi],xmm4 lea edi,[48+edi] movdqa xmm1,xmm7 - jmp NEAR L$054xts_enc_done + jmp NEAR L$056xts_enc_done align 16 -L$053xts_enc_four: +L$055xts_enc_four: movaps xmm6,xmm1 movups xmm2,[esi] movups xmm3,[16+esi] @@ -1316,28 +1349,28 @@ L$053xts_enc_four: movups [48+edi],xmm5 lea edi,[64+edi] movdqa xmm1,xmm6 - jmp NEAR L$054xts_enc_done + jmp NEAR L$056xts_enc_done align 16 -L$049xts_enc_done6x: +L$051xts_enc_done6x: mov eax,DWORD [112+esp] and eax,15 - jz NEAR L$056xts_enc_ret + jz NEAR L$058xts_enc_ret movdqa xmm5,xmm1 mov DWORD [112+esp],eax - jmp NEAR L$057xts_enc_steal + jmp NEAR L$059xts_enc_steal align 16 -L$054xts_enc_done: +L$056xts_enc_done: mov eax,DWORD [112+esp] pxor xmm0,xmm0 and eax,15 - jz NEAR L$056xts_enc_ret + jz NEAR L$058xts_enc_ret pcmpgtd xmm0,xmm1 mov DWORD [112+esp],eax pshufd xmm5,xmm0,19 paddq xmm1,xmm1 pand xmm5,[96+esp] pxor xmm5,xmm1 -L$057xts_enc_steal: +L$059xts_enc_steal: movzx ecx,BYTE [esi] movzx edx,BYTE [edi-16] lea esi,[1+esi] @@ -1345,7 +1378,7 @@ L$057xts_enc_steal: mov BYTE [edi],dl lea edi,[1+edi] 
sub eax,1 - jnz NEAR L$057xts_enc_steal + jnz NEAR L$059xts_enc_steal sub edi,DWORD [112+esp] mov edx,ebp mov ecx,ebx @@ -1355,16 +1388,16 @@ L$057xts_enc_steal: movups xmm1,[16+edx] lea edx,[32+edx] xorps xmm2,xmm0 -L$058enc1_loop_10: +L$060enc1_loop_10: db 102,15,56,220,209 dec ecx movups xmm1,[edx] lea edx,[16+edx] - jnz NEAR L$058enc1_loop_10 + jnz NEAR L$060enc1_loop_10 db 102,15,56,221,209 xorps xmm2,xmm5 movups [edi-16],xmm2 -L$056xts_enc_ret: +L$058xts_enc_ret: pxor xmm0,xmm0 pxor xmm1,xmm1 pxor xmm2,xmm2 @@ -1385,10 +1418,10 @@ L$056xts_enc_ret: pop ebx pop ebp ret -global _aesni_xts_decrypt +global _aes_hw_xts_decrypt align 16 -_aesni_xts_decrypt: -L$_aesni_xts_decrypt_begin: +_aes_hw_xts_decrypt: +L$_aes_hw_xts_decrypt_begin: push ebp push ebx push esi @@ -1401,12 +1434,12 @@ L$_aesni_xts_decrypt_begin: movups xmm1,[16+edx] lea edx,[32+edx] xorps xmm2,xmm0 -L$059enc1_loop_11: +L$061enc1_loop_11: db 102,15,56,220,209 dec ecx movups xmm1,[edx] lea edx,[16+edx] - jnz NEAR L$059enc1_loop_11 + jnz NEAR L$061enc1_loop_11 db 102,15,56,221,209 mov esi,DWORD [20+esp] mov edi,DWORD [24+esp] @@ -1435,14 +1468,14 @@ db 102,15,56,221,209 pcmpgtd xmm0,xmm1 and eax,-16 sub eax,96 - jc NEAR L$060xts_dec_short + jc NEAR L$062xts_dec_short shl ecx,4 mov ebx,16 sub ebx,ecx lea edx,[32+ecx*1+edx] - jmp NEAR L$061xts_dec_loop6 + jmp NEAR L$063xts_dec_loop6 align 16 -L$061xts_dec_loop6: +L$063xts_dec_loop6: pshufd xmm2,xmm0,19 pxor xmm0,xmm0 movdqa [esp],xmm1 @@ -1531,23 +1564,23 @@ db 102,15,56,222,249 pcmpgtd xmm0,xmm1 pxor xmm1,xmm2 sub eax,96 - jnc NEAR L$061xts_dec_loop6 + jnc NEAR L$063xts_dec_loop6 mov ecx,DWORD [240+ebp] mov edx,ebp mov ebx,ecx -L$060xts_dec_short: +L$062xts_dec_short: add eax,96 - jz NEAR L$062xts_dec_done6x + jz NEAR L$064xts_dec_done6x movdqa xmm5,xmm1 cmp eax,32 - jb NEAR L$063xts_dec_one + jb NEAR L$065xts_dec_one pshufd xmm2,xmm0,19 pxor xmm0,xmm0 paddq xmm1,xmm1 pand xmm2,xmm3 pcmpgtd xmm0,xmm1 pxor xmm1,xmm2 - je NEAR L$064xts_dec_two + je NEAR L$066xts_dec_two pshufd xmm2,xmm0,19 pxor xmm0,xmm0 movdqa xmm6,xmm1 @@ -1556,7 +1589,7 @@ L$060xts_dec_short: pcmpgtd xmm0,xmm1 pxor xmm1,xmm2 cmp eax,64 - jb NEAR L$065xts_dec_three + jb NEAR L$067xts_dec_three pshufd xmm2,xmm0,19 pxor xmm0,xmm0 movdqa xmm7,xmm1 @@ -1566,7 +1599,7 @@ L$060xts_dec_short: pxor xmm1,xmm2 movdqa [esp],xmm5 movdqa [16+esp],xmm6 - je NEAR L$066xts_dec_four + je NEAR L$068xts_dec_four movdqa [32+esp],xmm7 pshufd xmm7,xmm0,19 movdqa [48+esp],xmm1 @@ -1598,9 +1631,9 @@ L$060xts_dec_short: movups [48+edi],xmm5 movups [64+edi],xmm6 lea edi,[80+edi] - jmp NEAR L$067xts_dec_done + jmp NEAR L$069xts_dec_done align 16 -L$063xts_dec_one: +L$065xts_dec_one: movups xmm2,[esi] lea esi,[16+esi] xorps xmm2,xmm5 @@ -1608,20 +1641,20 @@ L$063xts_dec_one: movups xmm1,[16+edx] lea edx,[32+edx] xorps xmm2,xmm0 -L$068dec1_loop_12: +L$070dec1_loop_12: db 102,15,56,222,209 dec ecx movups xmm1,[edx] lea edx,[16+edx] - jnz NEAR L$068dec1_loop_12 + jnz NEAR L$070dec1_loop_12 db 102,15,56,223,209 xorps xmm2,xmm5 movups [edi],xmm2 lea edi,[16+edi] movdqa xmm1,xmm5 - jmp NEAR L$067xts_dec_done + jmp NEAR L$069xts_dec_done align 16 -L$064xts_dec_two: +L$066xts_dec_two: movaps xmm6,xmm1 movups xmm2,[esi] movups xmm3,[16+esi] @@ -1635,9 +1668,9 @@ L$064xts_dec_two: movups [16+edi],xmm3 lea edi,[32+edi] movdqa xmm1,xmm6 - jmp NEAR L$067xts_dec_done + jmp NEAR L$069xts_dec_done align 16 -L$065xts_dec_three: +L$067xts_dec_three: movaps xmm7,xmm1 movups xmm2,[esi] movups xmm3,[16+esi] @@ -1655,9 +1688,9 @@ L$065xts_dec_three: movups 
[32+edi],xmm4 lea edi,[48+edi] movdqa xmm1,xmm7 - jmp NEAR L$067xts_dec_done + jmp NEAR L$069xts_dec_done align 16 -L$066xts_dec_four: +L$068xts_dec_four: movaps xmm6,xmm1 movups xmm2,[esi] movups xmm3,[16+esi] @@ -1679,20 +1712,20 @@ L$066xts_dec_four: movups [48+edi],xmm5 lea edi,[64+edi] movdqa xmm1,xmm6 - jmp NEAR L$067xts_dec_done + jmp NEAR L$069xts_dec_done align 16 -L$062xts_dec_done6x: +L$064xts_dec_done6x: mov eax,DWORD [112+esp] and eax,15 - jz NEAR L$069xts_dec_ret + jz NEAR L$071xts_dec_ret mov DWORD [112+esp],eax - jmp NEAR L$070xts_dec_only_one_more + jmp NEAR L$072xts_dec_only_one_more align 16 -L$067xts_dec_done: +L$069xts_dec_done: mov eax,DWORD [112+esp] pxor xmm0,xmm0 and eax,15 - jz NEAR L$069xts_dec_ret + jz NEAR L$071xts_dec_ret pcmpgtd xmm0,xmm1 mov DWORD [112+esp],eax pshufd xmm2,xmm0,19 @@ -1702,7 +1735,7 @@ L$067xts_dec_done: pand xmm2,xmm3 pcmpgtd xmm0,xmm1 pxor xmm1,xmm2 -L$070xts_dec_only_one_more: +L$072xts_dec_only_one_more: pshufd xmm5,xmm0,19 movdqa xmm6,xmm1 paddq xmm1,xmm1 @@ -1716,16 +1749,16 @@ L$070xts_dec_only_one_more: movups xmm1,[16+edx] lea edx,[32+edx] xorps xmm2,xmm0 -L$071dec1_loop_13: +L$073dec1_loop_13: db 102,15,56,222,209 dec ecx movups xmm1,[edx] lea edx,[16+edx] - jnz NEAR L$071dec1_loop_13 + jnz NEAR L$073dec1_loop_13 db 102,15,56,223,209 xorps xmm2,xmm5 movups [edi],xmm2 -L$072xts_dec_steal: +L$074xts_dec_steal: movzx ecx,BYTE [16+esi] movzx edx,BYTE [edi] lea esi,[1+esi] @@ -1733,7 +1766,7 @@ L$072xts_dec_steal: mov BYTE [16+edi],dl lea edi,[1+edi] sub eax,1 - jnz NEAR L$072xts_dec_steal + jnz NEAR L$074xts_dec_steal sub edi,DWORD [112+esp] mov edx,ebp mov ecx,ebx @@ -1743,16 +1776,16 @@ L$072xts_dec_steal: movups xmm1,[16+edx] lea edx,[32+edx] xorps xmm2,xmm0 -L$073dec1_loop_14: +L$075dec1_loop_14: db 102,15,56,222,209 dec ecx movups xmm1,[edx] lea edx,[16+edx] - jnz NEAR L$073dec1_loop_14 + jnz NEAR L$075dec1_loop_14 db 102,15,56,223,209 xorps xmm2,xmm6 movups [edi],xmm2 -L$069xts_dec_ret: +L$071xts_dec_ret: pxor xmm0,xmm0 pxor xmm1,xmm1 pxor xmm2,xmm2 @@ -1773,10 +1806,10 @@ L$069xts_dec_ret: pop ebx pop ebp ret -global _aesni_cbc_encrypt +global _aes_hw_cbc_encrypt align 16 -_aesni_cbc_encrypt: -L$_aesni_cbc_encrypt_begin: +_aes_hw_cbc_encrypt: +L$_aes_hw_cbc_encrypt_begin: push ebp push ebx push esi @@ -1790,7 +1823,7 @@ L$_aesni_cbc_encrypt_begin: mov edx,DWORD [32+esp] mov ebp,DWORD [36+esp] test eax,eax - jz NEAR L$074cbc_abort + jz NEAR L$076cbc_abort cmp DWORD [40+esp],0 xchg ebx,esp movups xmm7,[ebp] @@ -1798,14 +1831,14 @@ L$_aesni_cbc_encrypt_begin: mov ebp,edx mov DWORD [16+esp],ebx mov ebx,ecx - je NEAR L$075cbc_decrypt + je NEAR L$077cbc_decrypt movaps xmm2,xmm7 cmp eax,16 - jb NEAR L$076cbc_enc_tail + jb NEAR L$078cbc_enc_tail sub eax,16 - jmp NEAR L$077cbc_enc_loop + jmp NEAR L$079cbc_enc_loop align 16 -L$077cbc_enc_loop: +L$079cbc_enc_loop: movups xmm7,[esi] lea esi,[16+esi] movups xmm0,[edx] @@ -1813,25 +1846,25 @@ L$077cbc_enc_loop: xorps xmm7,xmm0 lea edx,[32+edx] xorps xmm2,xmm7 -L$078enc1_loop_15: +L$080enc1_loop_15: db 102,15,56,220,209 dec ecx movups xmm1,[edx] lea edx,[16+edx] - jnz NEAR L$078enc1_loop_15 + jnz NEAR L$080enc1_loop_15 db 102,15,56,221,209 mov ecx,ebx mov edx,ebp movups [edi],xmm2 lea edi,[16+edi] sub eax,16 - jnc NEAR L$077cbc_enc_loop + jnc NEAR L$079cbc_enc_loop add eax,16 - jnz NEAR L$076cbc_enc_tail + jnz NEAR L$078cbc_enc_tail movaps xmm7,xmm2 pxor xmm2,xmm2 - jmp NEAR L$079cbc_ret -L$076cbc_enc_tail: + jmp NEAR L$081cbc_ret +L$078cbc_enc_tail: mov ecx,eax dd 2767451785 mov ecx,16 @@ 
-1842,20 +1875,20 @@ dd 2868115081 mov ecx,ebx mov esi,edi mov edx,ebp - jmp NEAR L$077cbc_enc_loop + jmp NEAR L$079cbc_enc_loop align 16 -L$075cbc_decrypt: +L$077cbc_decrypt: cmp eax,80 - jbe NEAR L$080cbc_dec_tail + jbe NEAR L$082cbc_dec_tail movaps [esp],xmm7 sub eax,80 - jmp NEAR L$081cbc_dec_loop6_enter + jmp NEAR L$083cbc_dec_loop6_enter align 16 -L$082cbc_dec_loop6: +L$084cbc_dec_loop6: movaps [esp],xmm0 movups [edi],xmm7 lea edi,[16+edi] -L$081cbc_dec_loop6_enter: +L$083cbc_dec_loop6_enter: movdqu xmm2,[esi] movdqu xmm3,[16+esi] movdqu xmm4,[32+esi] @@ -1885,28 +1918,28 @@ L$081cbc_dec_loop6_enter: movups [64+edi],xmm6 lea edi,[80+edi] sub eax,96 - ja NEAR L$082cbc_dec_loop6 + ja NEAR L$084cbc_dec_loop6 movaps xmm2,xmm7 movaps xmm7,xmm0 add eax,80 - jle NEAR L$083cbc_dec_clear_tail_collected + jle NEAR L$085cbc_dec_clear_tail_collected movups [edi],xmm2 lea edi,[16+edi] -L$080cbc_dec_tail: +L$082cbc_dec_tail: movups xmm2,[esi] movaps xmm6,xmm2 cmp eax,16 - jbe NEAR L$084cbc_dec_one + jbe NEAR L$086cbc_dec_one movups xmm3,[16+esi] movaps xmm5,xmm3 cmp eax,32 - jbe NEAR L$085cbc_dec_two + jbe NEAR L$087cbc_dec_two movups xmm4,[32+esi] cmp eax,48 - jbe NEAR L$086cbc_dec_three + jbe NEAR L$088cbc_dec_three movups xmm5,[48+esi] cmp eax,64 - jbe NEAR L$087cbc_dec_four + jbe NEAR L$089cbc_dec_four movups xmm6,[64+esi] movaps [esp],xmm7 movups xmm2,[esi] @@ -1933,26 +1966,26 @@ L$080cbc_dec_tail: movaps xmm2,xmm6 pxor xmm6,xmm6 sub eax,80 - jmp NEAR L$088cbc_dec_tail_collected + jmp NEAR L$090cbc_dec_tail_collected align 16 -L$084cbc_dec_one: +L$086cbc_dec_one: movups xmm0,[edx] movups xmm1,[16+edx] lea edx,[32+edx] xorps xmm2,xmm0 -L$089dec1_loop_16: +L$091dec1_loop_16: db 102,15,56,222,209 dec ecx movups xmm1,[edx] lea edx,[16+edx] - jnz NEAR L$089dec1_loop_16 + jnz NEAR L$091dec1_loop_16 db 102,15,56,223,209 xorps xmm2,xmm7 movaps xmm7,xmm6 sub eax,16 - jmp NEAR L$088cbc_dec_tail_collected + jmp NEAR L$090cbc_dec_tail_collected align 16 -L$085cbc_dec_two: +L$087cbc_dec_two: call __aesni_decrypt2 xorps xmm2,xmm7 xorps xmm3,xmm6 @@ -1962,9 +1995,9 @@ L$085cbc_dec_two: lea edi,[16+edi] movaps xmm7,xmm5 sub eax,32 - jmp NEAR L$088cbc_dec_tail_collected + jmp NEAR L$090cbc_dec_tail_collected align 16 -L$086cbc_dec_three: +L$088cbc_dec_three: call __aesni_decrypt3 xorps xmm2,xmm7 xorps xmm3,xmm6 @@ -1977,9 +2010,9 @@ L$086cbc_dec_three: lea edi,[32+edi] movups xmm7,[32+esi] sub eax,48 - jmp NEAR L$088cbc_dec_tail_collected + jmp NEAR L$090cbc_dec_tail_collected align 16 -L$087cbc_dec_four: +L$089cbc_dec_four: call __aesni_decrypt4 movups xmm1,[16+esi] movups xmm0,[32+esi] @@ -1997,21 +2030,21 @@ L$087cbc_dec_four: movaps xmm2,xmm5 pxor xmm5,xmm5 sub eax,64 - jmp NEAR L$088cbc_dec_tail_collected + jmp NEAR L$090cbc_dec_tail_collected align 16 -L$083cbc_dec_clear_tail_collected: +L$085cbc_dec_clear_tail_collected: pxor xmm3,xmm3 pxor xmm4,xmm4 pxor xmm5,xmm5 pxor xmm6,xmm6 -L$088cbc_dec_tail_collected: +L$090cbc_dec_tail_collected: and eax,15 - jnz NEAR L$090cbc_dec_tail_partial + jnz NEAR L$092cbc_dec_tail_partial movups [edi],xmm2 pxor xmm0,xmm0 - jmp NEAR L$079cbc_ret + jmp NEAR L$081cbc_ret align 16 -L$090cbc_dec_tail_partial: +L$092cbc_dec_tail_partial: movaps [esp],xmm2 pxor xmm0,xmm0 mov ecx,16 @@ -2019,14 +2052,14 @@ L$090cbc_dec_tail_partial: sub ecx,eax dd 2767451785 movdqa [esp],xmm2 -L$079cbc_ret: +L$081cbc_ret: mov esp,DWORD [16+esp] mov ebp,DWORD [36+esp] pxor xmm2,xmm2 pxor xmm1,xmm1 movups [ebp],xmm7 pxor xmm7,xmm7 -L$074cbc_abort: +L$076cbc_abort: pop edi pop esi pop ebx @@ 
-2037,13 +2070,13 @@ __aesni_set_encrypt_key: push ebp push ebx test eax,eax - jz NEAR L$091bad_pointer + jz NEAR L$093bad_pointer test edx,edx - jz NEAR L$091bad_pointer - call L$092pic -L$092pic: + jz NEAR L$093bad_pointer + call L$094pic +L$094pic: pop ebx - lea ebx,[(L$key_const-L$092pic)+ebx] + lea ebx,[(L$key_const-L$094pic)+ebx] lea ebp,[_OPENSSL_ia32cap_P] movups xmm0,[eax] xorps xmm4,xmm4 @@ -2051,45 +2084,45 @@ L$092pic: lea edx,[16+edx] and ebp,268437504 cmp ecx,256 - je NEAR L$09314rounds + je NEAR L$09514rounds cmp ecx,192 - je NEAR L$09412rounds + je NEAR L$09612rounds cmp ecx,128 - jne NEAR L$095bad_keybits + jne NEAR L$097bad_keybits align 16 -L$09610rounds: +L$09810rounds: cmp ebp,268435456 - je NEAR L$09710rounds_alt + je NEAR L$09910rounds_alt mov ecx,9 movups [edx-16],xmm0 db 102,15,58,223,200,1 - call L$098key_128_cold + call L$100key_128_cold db 102,15,58,223,200,2 - call L$099key_128 + call L$101key_128 db 102,15,58,223,200,4 - call L$099key_128 + call L$101key_128 db 102,15,58,223,200,8 - call L$099key_128 + call L$101key_128 db 102,15,58,223,200,16 - call L$099key_128 + call L$101key_128 db 102,15,58,223,200,32 - call L$099key_128 + call L$101key_128 db 102,15,58,223,200,64 - call L$099key_128 + call L$101key_128 db 102,15,58,223,200,128 - call L$099key_128 + call L$101key_128 db 102,15,58,223,200,27 - call L$099key_128 + call L$101key_128 db 102,15,58,223,200,54 - call L$099key_128 + call L$101key_128 movups [edx],xmm0 mov DWORD [80+edx],ecx - jmp NEAR L$100good_key + jmp NEAR L$102good_key align 16 -L$099key_128: +L$101key_128: movups [edx],xmm0 lea edx,[16+edx] -L$098key_128_cold: +L$100key_128_cold: shufps xmm4,xmm0,16 xorps xmm0,xmm4 shufps xmm4,xmm0,140 @@ -2098,13 +2131,13 @@ L$098key_128_cold: xorps xmm0,xmm1 ret align 16 -L$09710rounds_alt: +L$09910rounds_alt: movdqa xmm5,[ebx] mov ecx,8 movdqa xmm4,[32+ebx] movdqa xmm2,xmm0 movdqu [edx-16],xmm0 -L$101loop_key128: +L$103loop_key128: db 102,15,56,0,197 db 102,15,56,221,196 pslld xmm4,1 @@ -2120,7 +2153,7 @@ db 102,15,56,221,196 movdqu [edx-16],xmm0 movdqa xmm2,xmm0 dec ecx - jnz NEAR L$101loop_key128 + jnz NEAR L$103loop_key128 movdqa xmm4,[48+ebx] db 102,15,56,0,197 db 102,15,56,221,196 @@ -2148,41 +2181,41 @@ db 102,15,56,221,196 movdqu [16+edx],xmm0 mov ecx,9 mov DWORD [96+edx],ecx - jmp NEAR L$100good_key + jmp NEAR L$102good_key align 16 -L$09412rounds: +L$09612rounds: movq xmm2,[16+eax] cmp ebp,268435456 - je NEAR L$10212rounds_alt + je NEAR L$10412rounds_alt mov ecx,11 movups [edx-16],xmm0 db 102,15,58,223,202,1 - call L$103key_192a_cold + call L$105key_192a_cold db 102,15,58,223,202,2 - call L$104key_192b + call L$106key_192b db 102,15,58,223,202,4 - call L$105key_192a + call L$107key_192a db 102,15,58,223,202,8 - call L$104key_192b + call L$106key_192b db 102,15,58,223,202,16 - call L$105key_192a + call L$107key_192a db 102,15,58,223,202,32 - call L$104key_192b + call L$106key_192b db 102,15,58,223,202,64 - call L$105key_192a + call L$107key_192a db 102,15,58,223,202,128 - call L$104key_192b + call L$106key_192b movups [edx],xmm0 mov DWORD [48+edx],ecx - jmp NEAR L$100good_key + jmp NEAR L$102good_key align 16 -L$105key_192a: +L$107key_192a: movups [edx],xmm0 lea edx,[16+edx] align 16 -L$103key_192a_cold: +L$105key_192a_cold: movaps xmm5,xmm2 -L$106key_192b_warm: +L$108key_192b_warm: shufps xmm4,xmm0,16 movdqa xmm3,xmm2 xorps xmm0,xmm4 @@ -2196,21 +2229,21 @@ L$106key_192b_warm: pxor xmm2,xmm3 ret align 16 -L$104key_192b: +L$106key_192b: movaps xmm3,xmm0 shufps xmm5,xmm0,68 movups [edx],xmm5 shufps 
xmm3,xmm2,78 movups [16+edx],xmm3 lea edx,[32+edx] - jmp NEAR L$106key_192b_warm + jmp NEAR L$108key_192b_warm align 16 -L$10212rounds_alt: +L$10412rounds_alt: movdqa xmm5,[16+ebx] movdqa xmm4,[32+ebx] mov ecx,8 movdqu [edx-16],xmm0 -L$107loop_key192: +L$109loop_key192: movq [edx],xmm2 movdqa xmm1,xmm2 db 102,15,56,0,213 @@ -2232,54 +2265,54 @@ db 102,15,56,221,212 pxor xmm2,xmm3 movdqu [edx-16],xmm0 dec ecx - jnz NEAR L$107loop_key192 + jnz NEAR L$109loop_key192 mov ecx,11 mov DWORD [32+edx],ecx - jmp NEAR L$100good_key + jmp NEAR L$102good_key align 16 -L$09314rounds: +L$09514rounds: movups xmm2,[16+eax] lea edx,[16+edx] cmp ebp,268435456 - je NEAR L$10814rounds_alt + je NEAR L$11014rounds_alt mov ecx,13 movups [edx-32],xmm0 movups [edx-16],xmm2 db 102,15,58,223,202,1 - call L$109key_256a_cold + call L$111key_256a_cold db 102,15,58,223,200,1 - call L$110key_256b + call L$112key_256b db 102,15,58,223,202,2 - call L$111key_256a + call L$113key_256a db 102,15,58,223,200,2 - call L$110key_256b + call L$112key_256b db 102,15,58,223,202,4 - call L$111key_256a + call L$113key_256a db 102,15,58,223,200,4 - call L$110key_256b + call L$112key_256b db 102,15,58,223,202,8 - call L$111key_256a + call L$113key_256a db 102,15,58,223,200,8 - call L$110key_256b + call L$112key_256b db 102,15,58,223,202,16 - call L$111key_256a + call L$113key_256a db 102,15,58,223,200,16 - call L$110key_256b + call L$112key_256b db 102,15,58,223,202,32 - call L$111key_256a + call L$113key_256a db 102,15,58,223,200,32 - call L$110key_256b + call L$112key_256b db 102,15,58,223,202,64 - call L$111key_256a + call L$113key_256a movups [edx],xmm0 mov DWORD [16+edx],ecx xor eax,eax - jmp NEAR L$100good_key + jmp NEAR L$102good_key align 16 -L$111key_256a: +L$113key_256a: movups [edx],xmm2 lea edx,[16+edx] -L$109key_256a_cold: +L$111key_256a_cold: shufps xmm4,xmm0,16 xorps xmm0,xmm4 shufps xmm4,xmm0,140 @@ -2288,7 +2321,7 @@ L$109key_256a_cold: xorps xmm0,xmm1 ret align 16 -L$110key_256b: +L$112key_256b: movups [edx],xmm0 lea edx,[16+edx] shufps xmm4,xmm2,16 @@ -2299,14 +2332,14 @@ L$110key_256b: xorps xmm2,xmm1 ret align 16 -L$10814rounds_alt: +L$11014rounds_alt: movdqa xmm5,[ebx] movdqa xmm4,[32+ebx] mov ecx,7 movdqu [edx-32],xmm0 movdqa xmm1,xmm2 movdqu [edx-16],xmm2 -L$112loop_key256: +L$114loop_key256: db 102,15,56,0,213 db 102,15,56,221,212 movdqa xmm3,xmm0 @@ -2320,7 +2353,7 @@ db 102,15,56,221,212 pxor xmm0,xmm2 movdqu [edx],xmm0 dec ecx - jz NEAR L$113done_key256 + jz NEAR L$115done_key256 pshufd xmm2,xmm0,255 pxor xmm3,xmm3 db 102,15,56,221,211 @@ -2335,11 +2368,11 @@ db 102,15,56,221,211 movdqu [16+edx],xmm2 lea edx,[32+edx] movdqa xmm1,xmm2 - jmp NEAR L$112loop_key256 -L$113done_key256: + jmp NEAR L$114loop_key256 +L$115done_key256: mov ecx,13 mov DWORD [16+edx],ecx -L$100good_key: +L$102good_key: pxor xmm0,xmm0 pxor xmm1,xmm1 pxor xmm2,xmm2 @@ -2351,31 +2384,43 @@ L$100good_key: pop ebp ret align 4 -L$091bad_pointer: +L$093bad_pointer: mov eax,-1 pop ebx pop ebp ret align 4 -L$095bad_keybits: +L$097bad_keybits: pxor xmm0,xmm0 mov eax,-2 pop ebx pop ebp ret -global _aesni_set_encrypt_key +global _aes_hw_set_encrypt_key align 16 -_aesni_set_encrypt_key: -L$_aesni_set_encrypt_key_begin: +_aes_hw_set_encrypt_key: +L$_aes_hw_set_encrypt_key_begin: +%ifdef BORINGSSL_DISPATCH_TEST + push ebx + push edx + call L$116pic +L$116pic: + pop ebx + lea ebx,[(_BORINGSSL_function_hit+3-L$116pic)+ebx] + mov edx,1 + mov BYTE [ebx],dl + pop edx + pop ebx +%endif mov eax,DWORD [4+esp] mov ecx,DWORD [8+esp] mov edx,DWORD [12+esp] call 
__aesni_set_encrypt_key ret -global _aesni_set_decrypt_key +global _aes_hw_set_decrypt_key align 16 -_aesni_set_decrypt_key: -L$_aesni_set_decrypt_key_begin: +_aes_hw_set_decrypt_key: +L$_aes_hw_set_decrypt_key_begin: mov eax,DWORD [4+esp] mov ecx,DWORD [8+esp] mov edx,DWORD [12+esp] @@ -2383,7 +2428,7 @@ L$_aesni_set_decrypt_key_begin: mov edx,DWORD [12+esp] shl ecx,4 test eax,eax - jnz NEAR L$114dec_key_ret + jnz NEAR L$117dec_key_ret lea eax,[16+ecx*1+edx] movups xmm0,[edx] movups xmm1,[eax] @@ -2391,7 +2436,7 @@ L$_aesni_set_decrypt_key_begin: movups [edx],xmm1 lea edx,[16+edx] lea eax,[eax-16] -L$115dec_key_inverse: +L$118dec_key_inverse: movups xmm0,[edx] movups xmm1,[eax] db 102,15,56,219,192 @@ -2401,14 +2446,14 @@ db 102,15,56,219,201 movups [16+eax],xmm0 movups [edx-16],xmm1 cmp eax,edx - ja NEAR L$115dec_key_inverse + ja NEAR L$118dec_key_inverse movups xmm0,[edx] db 102,15,56,219,192 movups [edx],xmm0 pxor xmm0,xmm0 pxor xmm1,xmm1 xor eax,eax -L$114dec_key_ret: +L$117dec_key_ret: ret align 64 L$key_const: diff --git a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/bn-586.asm b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/bn-586.asm index b222040aca..a87f86d12f 100644 --- a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/bn-586.asm +++ b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/bn-586.asm @@ -1,3 +1,9 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif %ifidn __OUTPUT_FORMAT__,obj section code use32 class=code align=64 %elifidn __OUTPUT_FORMAT__,win32 diff --git a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/co-586.asm b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/co-586.asm index 5780dc841b..b6784bf928 100644 --- a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/co-586.asm +++ b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/co-586.asm @@ -1,3 +1,9 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif %ifidn __OUTPUT_FORMAT__,obj section code use32 class=code align=64 %elifidn __OUTPUT_FORMAT__,win32 diff --git a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/ghash-ssse3-x86.asm b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/ghash-ssse3-x86.asm new file mode 100644 index 0000000000..1d07be0aea --- /dev/null +++ b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/ghash-ssse3-x86.asm @@ -0,0 +1,300 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +%ifdef __YASM_VERSION_ID__ +%if __YASM_VERSION_ID__ < 01010000h +%error yasm version 1.1.0 or later needed. +%endif +; Yasm automatically includes .00 and complains about redefining it. 
+; https://www.tortall.net/projects/yasm/manual/html/objfmt-win32-safeseh.html +%else +$@feat.00 equ 1 +%endif +section .text code align=64 +%else +section .text code +%endif +global _gcm_gmult_ssse3 +align 16 +_gcm_gmult_ssse3: +L$_gcm_gmult_ssse3_begin: + push ebp + push ebx + push esi + push edi + mov edi,DWORD [20+esp] + mov esi,DWORD [24+esp] + movdqu xmm0,[edi] + call L$000pic_point +L$000pic_point: + pop eax + movdqa xmm7,[(L$reverse_bytes-L$000pic_point)+eax] + movdqa xmm2,[(L$low4_mask-L$000pic_point)+eax] +db 102,15,56,0,199 + movdqa xmm1,xmm2 + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm2 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + mov eax,5 +L$001loop_row_1: + movdqa xmm4,[esi] + lea esi,[16+esi] + movdqa xmm6,xmm2 +db 102,15,58,15,243,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + movdqa xmm5,xmm4 +db 102,15,56,0,224 +db 102,15,56,0,233 + pxor xmm2,xmm5 + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + sub eax,1 + jnz NEAR L$001loop_row_1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + mov eax,5 +L$002loop_row_2: + movdqa xmm4,[esi] + lea esi,[16+esi] + movdqa xmm6,xmm2 +db 102,15,58,15,243,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + movdqa xmm5,xmm4 +db 102,15,56,0,224 +db 102,15,56,0,233 + pxor xmm2,xmm5 + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + sub eax,1 + jnz NEAR L$002loop_row_2 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + mov eax,6 +L$003loop_row_3: + movdqa xmm4,[esi] + lea esi,[16+esi] + movdqa xmm6,xmm2 +db 102,15,58,15,243,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + movdqa xmm5,xmm4 +db 102,15,56,0,224 +db 102,15,56,0,233 + pxor xmm2,xmm5 + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + sub eax,1 + jnz NEAR L$003loop_row_3 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 +db 102,15,56,0,215 + movdqu [edi],xmm2 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + pop edi + pop esi + pop ebx + pop ebp + ret +global _gcm_ghash_ssse3 +align 16 +_gcm_ghash_ssse3: +L$_gcm_ghash_ssse3_begin: + push ebp + push ebx + push esi + push edi + mov edi,DWORD [20+esp] + mov esi,DWORD [24+esp] + mov edx,DWORD [28+esp] + mov ecx,DWORD [32+esp] + movdqu xmm0,[edi] + call L$004pic_point +L$004pic_point: + pop ebx + movdqa xmm7,[(L$reverse_bytes-L$004pic_point)+ebx] + and ecx,-16 +db 102,15,56,0,199 + pxor xmm3,xmm3 +L$005loop_ghash: + movdqa xmm2,[(L$low4_mask-L$004pic_point)+ebx] + movdqu xmm1,[edx] +db 102,15,56,0,207 + pxor xmm0,xmm1 + movdqa xmm1,xmm2 + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm2 + pxor xmm2,xmm2 + mov eax,5 +L$006loop_row_4: + movdqa xmm4,[esi] + lea esi,[16+esi] + movdqa xmm6,xmm2 +db 102,15,58,15,243,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + movdqa xmm5,xmm4 +db 102,15,56,0,224 +db 102,15,56,0,233 + pxor xmm2,xmm5 + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + sub eax,1 + jnz NEAR L$006loop_row_4 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + 
psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + mov eax,5 +L$007loop_row_5: + movdqa xmm4,[esi] + lea esi,[16+esi] + movdqa xmm6,xmm2 +db 102,15,58,15,243,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + movdqa xmm5,xmm4 +db 102,15,56,0,224 +db 102,15,56,0,233 + pxor xmm2,xmm5 + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + sub eax,1 + jnz NEAR L$007loop_row_5 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + mov eax,6 +L$008loop_row_6: + movdqa xmm4,[esi] + lea esi,[16+esi] + movdqa xmm6,xmm2 +db 102,15,58,15,243,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + movdqa xmm5,xmm4 +db 102,15,56,0,224 +db 102,15,56,0,233 + pxor xmm2,xmm5 + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + sub eax,1 + jnz NEAR L$008loop_row_6 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + movdqa xmm0,xmm2 + lea esi,[esi-256] + lea edx,[16+edx] + sub ecx,16 + jnz NEAR L$005loop_ghash +db 102,15,56,0,199 + movdqu [edi],xmm0 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + pop edi + pop esi + pop ebx + pop ebp + ret +align 16 +L$reverse_bytes: +db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +align 16 +L$low4_mask: +dd 252645135,252645135,252645135,252645135 diff --git a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/ghash-x86.asm b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/ghash-x86.asm index 1d350d6a7f..753c472f04 100644 --- a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/ghash-x86.asm +++ b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/ghash-x86.asm @@ -1,3 +1,9 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif %ifidn __OUTPUT_FORMAT__,obj section code use32 class=code align=64 %elifidn __OUTPUT_FORMAT__,win32 diff --git a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/md5-586.asm b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/md5-586.asm index 67ee21651f..c051923082 100644 --- a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/md5-586.asm +++ b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/md5-586.asm @@ -1,3 +1,9 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif %ifidn __OUTPUT_FORMAT__,obj section code use32 class=code align=64 %elifidn __OUTPUT_FORMAT__,win32 diff --git a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/sha1-586.asm b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/sha1-586.asm index cee8c62626..0afe894e52 100644 --- a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/sha1-586.asm +++ b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/sha1-586.asm @@ -1,3 +1,9 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif %ifidn __OUTPUT_FORMAT__,obj section code use32 class=code align=64 %elifidn __OUTPUT_FORMAT__,win32 diff --git a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/sha256-586.asm b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/sha256-586.asm index 3e7cfcca07..b5dc26ba71 100644 --- a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/sha256-586.asm +++ b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/sha256-586.asm @@ -1,3 +1,9 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif %ifidn __OUTPUT_FORMAT__,obj section code use32 class=code align=64 %elifidn __OUTPUT_FORMAT__,win32 diff --git a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/sha512-586.asm b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/sha512-586.asm index 88ed0b380d..3e6b0680bc 100644 --- a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/sha512-586.asm +++ b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/sha512-586.asm @@ -1,3 +1,9 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif %ifidn __OUTPUT_FORMAT__,obj section code use32 class=code align=64 %elifidn __OUTPUT_FORMAT__,win32 diff --git a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/vpaes-x86.asm b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/vpaes-x86.asm index b08b05637d..81b8b8330f 100644 --- a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/vpaes-x86.asm +++ b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/vpaes-x86.asm @@ -1,3 +1,9 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif %ifidn __OUTPUT_FORMAT__,obj section code use32 class=code align=64 %elifidn __OUTPUT_FORMAT__,win32 @@ -14,6 +20,9 @@ section .text code align=64 %else section .text code %endif +%ifdef BORINGSSL_DISPATCH_TEST +extern _BORINGSSL_function_hit +%endif align 64 L$_vpaes_consts: dd 218628480,235210255,168496130,67568393 @@ -465,6 +474,18 @@ L$_vpaes_set_encrypt_key_begin: push ebx push esi push edi +%ifdef BORINGSSL_DISPATCH_TEST + push ebx + push edx + call L$016pic +L$016pic: + pop ebx + lea ebx,[(_BORINGSSL_function_hit+5-L$016pic)+ebx] + mov edx,1 + mov BYTE [ebx],dl + pop edx + pop ebx +%endif mov esi,DWORD [20+esp] lea ebx,[esp-56] mov eax,DWORD [24+esp] @@ -478,9 +499,9 @@ L$_vpaes_set_encrypt_key_begin: mov DWORD [240+edx],ebx mov ecx,48 mov edi,0 - lea ebp,[(L$_vpaes_consts+0x30-L$016pic_point)] + lea ebp,[(L$_vpaes_consts+0x30-L$017pic_point)] call __vpaes_schedule_core -L$016pic_point: +L$017pic_point: mov esp,DWORD [48+esp] xor eax,eax pop edi @@ -514,9 +535,9 @@ L$_vpaes_set_decrypt_key_begin: shr ecx,1 and ecx,32 xor ecx,32 - lea ebp,[(L$_vpaes_consts+0x30-L$017pic_point)] + lea ebp,[(L$_vpaes_consts+0x30-L$018pic_point)] call __vpaes_schedule_core -L$017pic_point: +L$018pic_point: mov esp,DWORD [48+esp] xor eax,eax pop edi @@ -532,9 +553,21 @@ L$_vpaes_encrypt_begin: push ebx push esi push edi - lea ebp,[(L$_vpaes_consts+0x30-L$018pic_point)] +%ifdef BORINGSSL_DISPATCH_TEST + push ebx + push edx + call L$019pic +L$019pic: + pop ebx + lea ebx,[(_BORINGSSL_function_hit+4-L$019pic)+ebx] + mov edx,1 + mov BYTE [ebx],dl + pop edx + pop ebx +%endif + lea ebp,[(L$_vpaes_consts+0x30-L$020pic_point)] call __vpaes_preheat -L$018pic_point: +L$020pic_point: mov esi,DWORD [20+esp] lea ebx,[esp-56] mov edi,DWORD [24+esp] @@ -559,9 +592,9 @@ L$_vpaes_decrypt_begin: push ebx push esi push edi - lea ebp,[(L$_vpaes_consts+0x30-L$019pic_point)] + lea ebp,[(L$_vpaes_consts+0x30-L$021pic_point)] call __vpaes_preheat -L$019pic_point: +L$021pic_point: mov esi,DWORD [20+esp] lea ebx,[esp-56] mov edi,DWORD [24+esp] @@ -591,7 +624,7 @@ L$_vpaes_cbc_encrypt_begin: mov eax,DWORD [28+esp] mov edx,DWORD [32+esp] sub eax,16 - jc NEAR L$020cbc_abort + jc NEAR L$022cbc_abort lea ebx,[esp-56] mov ebp,DWORD [36+esp] and ebx,-16 @@ -604,14 +637,14 @@ L$_vpaes_cbc_encrypt_begin: mov DWORD [4+esp],edx mov DWORD [8+esp],ebp mov edi,eax - lea ebp,[(L$_vpaes_consts+0x30-L$021pic_point)] + lea ebp,[(L$_vpaes_consts+0x30-L$023pic_point)] call __vpaes_preheat -L$021pic_point: +L$023pic_point: cmp ecx,0 - je NEAR L$022cbc_dec_loop - jmp NEAR L$023cbc_enc_loop + je NEAR L$024cbc_dec_loop + jmp NEAR L$025cbc_enc_loop align 16 -L$023cbc_enc_loop: +L$025cbc_enc_loop: movdqu xmm0,[esi] pxor xmm0,xmm1 call __vpaes_encrypt_core @@ -621,10 +654,10 @@ L$023cbc_enc_loop: movdqu [esi*1+ebx],xmm0 lea esi,[16+esi] sub edi,16 - jnc NEAR L$023cbc_enc_loop - jmp NEAR L$024cbc_done + jnc NEAR L$025cbc_enc_loop + jmp NEAR L$026cbc_done align 16 -L$022cbc_dec_loop: +L$024cbc_dec_loop: movdqu xmm0,[esi] movdqa [16+esp],xmm1 movdqa [32+esp],xmm0 @@ -636,12 +669,12 @@ L$022cbc_dec_loop: movdqu [esi*1+ebx],xmm0 lea esi,[16+esi] sub edi,16 - jnc NEAR L$022cbc_dec_loop -L$024cbc_done: + jnc NEAR L$024cbc_dec_loop +L$026cbc_done: mov ebx,DWORD [8+esp] mov esp,DWORD [48+esp] movdqu [ebx],xmm1 -L$020cbc_abort: +L$022cbc_abort: pop edi pop esi pop ebx diff --git a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/x86-mont.asm 
b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/x86-mont.asm index b1a4d59429..6a15ed944b 100644 --- a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/x86-mont.asm +++ b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/x86-mont.asm @@ -1,3 +1,9 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif %ifidn __OUTPUT_FORMAT__,obj section code use32 class=code align=64 %elifidn __OUTPUT_FORMAT__,win32 @@ -456,16 +462,18 @@ L$016sub: lea edx,[1+edx] jge NEAR L$016sub sbb eax,0 - and esi,eax - not eax - mov ebp,edi - and ebp,eax - or esi,ebp + mov edx,-1 + xor edx,eax + jmp NEAR L$017copy align 16 L$017copy: - mov eax,DWORD [ebx*4+esi] - mov DWORD [ebx*4+edi],eax + mov esi,DWORD [32+ebx*4+esp] + mov ebp,DWORD [ebx*4+edi] mov DWORD [32+ebx*4+esp],ecx + and esi,eax + and ebp,edx + or ebp,esi + mov DWORD [ebx*4+edi],ebp dec ebx jge NEAR L$017copy mov esp,DWORD [24+esp] diff --git a/packager/third_party/boringssl/win-x86/crypto/test/trampoline-x86.asm b/packager/third_party/boringssl/win-x86/crypto/test/trampoline-x86.asm new file mode 100644 index 0000000000..e5c7d3f7fa --- /dev/null +++ b/packager/third_party/boringssl/win-x86/crypto/test/trampoline-x86.asm @@ -0,0 +1,164 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +%ifdef __YASM_VERSION_ID__ +%if __YASM_VERSION_ID__ < 01010000h +%error yasm version 1.1.0 or later needed. +%endif +; Yasm automatically includes .00 and complains about redefining it. 
+; https://www.tortall.net/projects/yasm/manual/html/objfmt-win32-safeseh.html +%else +$@feat.00 equ 1 +%endif +section .text code align=64 +%else +section .text code +%endif +global _abi_test_trampoline +align 16 +_abi_test_trampoline: +L$_abi_test_trampoline_begin: + push ebp + push ebx + push esi + push edi + mov ecx,DWORD [24+esp] + mov esi,DWORD [ecx] + mov edi,DWORD [4+ecx] + mov ebx,DWORD [8+ecx] + mov ebp,DWORD [12+ecx] + sub esp,44 + mov eax,DWORD [72+esp] + xor ecx,ecx +L$000loop: + cmp ecx,DWORD [76+esp] + jae NEAR L$001loop_done + mov edx,DWORD [ecx*4+eax] + mov DWORD [ecx*4+esp],edx + add ecx,1 + jmp NEAR L$000loop +L$001loop_done: + call DWORD [64+esp] + add esp,44 + mov ecx,DWORD [24+esp] + mov DWORD [ecx],esi + mov DWORD [4+ecx],edi + mov DWORD [8+ecx],ebx + mov DWORD [12+ecx],ebp + pop edi + pop esi + pop ebx + pop ebp + ret +global _abi_test_get_and_clear_direction_flag +align 16 +_abi_test_get_and_clear_direction_flag: +L$_abi_test_get_and_clear_direction_flag_begin: + pushfd + pop eax + and eax,1024 + shr eax,10 + cld + ret +global _abi_test_set_direction_flag +align 16 +_abi_test_set_direction_flag: +L$_abi_test_set_direction_flag_begin: + std + ret +global _abi_test_clobber_eax +align 16 +_abi_test_clobber_eax: +L$_abi_test_clobber_eax_begin: + xor eax,eax + ret +global _abi_test_clobber_ebx +align 16 +_abi_test_clobber_ebx: +L$_abi_test_clobber_ebx_begin: + xor ebx,ebx + ret +global _abi_test_clobber_ecx +align 16 +_abi_test_clobber_ecx: +L$_abi_test_clobber_ecx_begin: + xor ecx,ecx + ret +global _abi_test_clobber_edx +align 16 +_abi_test_clobber_edx: +L$_abi_test_clobber_edx_begin: + xor edx,edx + ret +global _abi_test_clobber_edi +align 16 +_abi_test_clobber_edi: +L$_abi_test_clobber_edi_begin: + xor edi,edi + ret +global _abi_test_clobber_esi +align 16 +_abi_test_clobber_esi: +L$_abi_test_clobber_esi_begin: + xor esi,esi + ret +global _abi_test_clobber_ebp +align 16 +_abi_test_clobber_ebp: +L$_abi_test_clobber_ebp_begin: + xor ebp,ebp + ret +global _abi_test_clobber_xmm0 +align 16 +_abi_test_clobber_xmm0: +L$_abi_test_clobber_xmm0_begin: + pxor xmm0,xmm0 + ret +global _abi_test_clobber_xmm1 +align 16 +_abi_test_clobber_xmm1: +L$_abi_test_clobber_xmm1_begin: + pxor xmm1,xmm1 + ret +global _abi_test_clobber_xmm2 +align 16 +_abi_test_clobber_xmm2: +L$_abi_test_clobber_xmm2_begin: + pxor xmm2,xmm2 + ret +global _abi_test_clobber_xmm3 +align 16 +_abi_test_clobber_xmm3: +L$_abi_test_clobber_xmm3_begin: + pxor xmm3,xmm3 + ret +global _abi_test_clobber_xmm4 +align 16 +_abi_test_clobber_xmm4: +L$_abi_test_clobber_xmm4_begin: + pxor xmm4,xmm4 + ret +global _abi_test_clobber_xmm5 +align 16 +_abi_test_clobber_xmm5: +L$_abi_test_clobber_xmm5_begin: + pxor xmm5,xmm5 + ret +global _abi_test_clobber_xmm6 +align 16 +_abi_test_clobber_xmm6: +L$_abi_test_clobber_xmm6_begin: + pxor xmm6,xmm6 + ret +global _abi_test_clobber_xmm7 +align 16 +_abi_test_clobber_xmm7: +L$_abi_test_clobber_xmm7_begin: + pxor xmm7,xmm7 + ret diff --git a/packager/third_party/boringssl/win-x86_64/crypto/chacha/chacha-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/chacha/chacha-x86_64.asm index cb36246891..a3c29381e3 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/chacha/chacha-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/chacha/chacha-x86_64.asm @@ -1,7 +1,14 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif section .text code align=64 @@ -55,6 +62,7 @@ $L$SEH_begin_ChaCha20_ctr32: mov r8,QWORD[40+rsp] + cmp rdx,0 je NEAR $L$no_data mov r10,QWORD[((OPENSSL_ia32cap_P+4))] @@ -62,12 +70,19 @@ $L$SEH_begin_ChaCha20_ctr32: jnz NEAR $L$ChaCha20_ssse3 push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + sub rsp,64+24 + $L$ctr32_body: @@ -308,16 +323,24 @@ $L$oop_tail: $L$done: lea rsi,[((64+24+48))+rsp] mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$no_data: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ChaCha20_ctr32: ALIGN 32 @@ -334,7 +357,9 @@ $L$SEH_begin_ChaCha20_ssse3: $L$ChaCha20_ssse3: + mov r9,rsp + cmp rdx,128 ja NEAR $L$ChaCha20_4x @@ -465,10 +490,12 @@ $L$done_ssse3: movaps xmm6,XMMWORD[((-40))+r9] movaps xmm7,XMMWORD[((-24))+r9] lea rsp,[r9] + $L$ssse3_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ChaCha20_ssse3: ALIGN 32 @@ -485,7 +512,9 @@ $L$SEH_begin_ChaCha20_4x: $L$ChaCha20_4x: + mov r9,rsp + mov r11,r10 shr r10,32 test r10,32 @@ -1047,10 +1076,12 @@ $L$done4x: movaps xmm14,XMMWORD[((-40))+r9] movaps xmm15,XMMWORD[((-24))+r9] lea rsp,[r9] + $L$4x_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ChaCha20_4x: ALIGN 32 @@ -1067,7 +1098,9 @@ $L$SEH_begin_ChaCha20_8x: $L$ChaCha20_8x: + mov r9,rsp + sub rsp,0x280+168 and rsp,-32 movaps XMMWORD[(-168)+r9],xmm6 @@ -1683,10 +1716,12 @@ $L$done8x: movaps xmm14,XMMWORD[((-40))+r9] movaps xmm15,XMMWORD[((-24))+r9] lea rsp,[r9] + $L$8x_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ChaCha20_8x: EXTERN __imp_RtlVirtualUnwind diff --git a/packager/third_party/boringssl/win-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.asm index 56dc2060a4..e711826b14 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.asm @@ -1,7 +1,14 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif section .data data align=8 diff --git a/packager/third_party/boringssl/win-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.asm index ab8cf92b72..b1159ae098 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.asm @@ -1,7 +1,16 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif +section .text code align=64 + global dummy_chacha20_poly1305_asm dummy_chacha20_poly1305_asm: diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/aes-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/aes-x86_64.asm index f6a4edfa0f..329185ee67 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/aes-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/aes-x86_64.asm @@ -1,7 +1,14 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif section .text code align=64 @@ -160,6 +167,7 @@ DB 0xf3,0xc3 ALIGN 16 _x86_64_AES_encrypt_compact: + lea r8,[128+r14] mov edi,DWORD[((0-128))+r8] mov ebp,DWORD[((32-128))+r8] @@ -330,29 +338,38 @@ $L$enc_compact_done: xor edx,DWORD[12+r15] DB 0xf3,0xc3 + ALIGN 16 -global asm_AES_encrypt +global aes_nohw_encrypt -asm_AES_encrypt: +aes_nohw_encrypt: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp -$L$SEH_begin_asm_AES_encrypt: +$L$SEH_begin_aes_nohw_encrypt: mov rdi,rcx mov rsi,rdx mov rdx,r8 + mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + lea rcx,[((-63))+rdx] and rsp,-64 sub rcx,rsp @@ -363,6 +380,7 @@ $L$SEH_begin_asm_AES_encrypt: mov QWORD[16+rsp],rsi mov QWORD[24+rsp],rax + $L$enc_prologue: mov r15,rdx @@ -389,23 +407,32 @@ $L$enc_prologue: mov r9,QWORD[16+rsp] mov rsi,QWORD[24+rsp] + mov DWORD[r9],eax mov DWORD[4+r9],ebx mov DWORD[8+r9],ecx mov DWORD[12+r9],edx mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$enc_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret -$L$SEH_end_asm_AES_encrypt: + +$L$SEH_end_aes_nohw_encrypt: ALIGN 16 _x86_64_AES_decrypt: @@ -564,6 +591,7 @@ DB 0xf3,0xc3 ALIGN 16 _x86_64_AES_decrypt_compact: + lea r8,[128+r14] mov edi,DWORD[((0-128))+r8] mov ebp,DWORD[((32-128))+r8] @@ -786,29 +814,38 @@ $L$dec_compact_done: xor edx,DWORD[12+r15] DB 0xf3,0xc3 + ALIGN 16 -global asm_AES_decrypt +global aes_nohw_decrypt -asm_AES_decrypt: +aes_nohw_decrypt: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp -$L$SEH_begin_asm_AES_decrypt: +$L$SEH_begin_aes_nohw_decrypt: mov rdi,rcx mov rsi,rdx mov rdx,r8 + mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + lea rcx,[((-63))+rdx] and rsp,-64 sub rcx,rsp @@ -819,6 +856,7 @@ $L$SEH_begin_asm_AES_decrypt: mov QWORD[16+rsp],rsi mov QWORD[24+rsp],rax + $L$dec_prologue: mov r15,rdx @@ -847,59 +885,81 @@ $L$dec_prologue: mov r9,QWORD[16+rsp] mov rsi,QWORD[24+rsp] + mov DWORD[r9],eax mov DWORD[4+r9],ebx mov DWORD[8+r9],ecx mov DWORD[12+r9],edx mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$dec_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret -$L$SEH_end_asm_AES_decrypt: -ALIGN 16 -global asm_AES_set_encrypt_key -asm_AES_set_encrypt_key: +$L$SEH_end_aes_nohw_decrypt: +ALIGN 16 +global aes_nohw_set_encrypt_key + 
+aes_nohw_set_encrypt_key: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp -$L$SEH_begin_asm_AES_set_encrypt_key: +$L$SEH_begin_aes_nohw_set_encrypt_key: mov rdi,rcx mov rsi,rdx mov rdx,r8 + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + sub rsp,8 + $L$enc_key_prologue: call _x86_64_AES_set_encrypt_key mov rbp,QWORD[40+rsp] + mov rbx,QWORD[48+rsp] + add rsp,56 + $L$enc_key_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret -$L$SEH_end_asm_AES_set_encrypt_key: + +$L$SEH_end_aes_nohw_set_encrypt_key: ALIGN 16 _x86_64_AES_set_encrypt_key: + mov ecx,esi mov rsi,rdi mov rdi,rdx @@ -1136,26 +1196,35 @@ $L$badpointer: $L$exit: DB 0xf3,0xc3 -ALIGN 16 -global asm_AES_set_decrypt_key -asm_AES_set_decrypt_key: +ALIGN 16 +global aes_nohw_set_decrypt_key + +aes_nohw_set_decrypt_key: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp -$L$SEH_begin_asm_AES_set_decrypt_key: +$L$SEH_begin_aes_nohw_set_decrypt_key: mov rdi,rcx mov rsi,rdx mov rdx,r8 + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + push rdx + $L$dec_key_prologue: call _x86_64_AES_set_encrypt_key @@ -1323,27 +1392,35 @@ $L$permute: xor rax,rax $L$abort: mov r15,QWORD[8+rsp] + mov r14,QWORD[16+rsp] + mov r13,QWORD[24+rsp] + mov r12,QWORD[32+rsp] + mov rbp,QWORD[40+rsp] + mov rbx,QWORD[48+rsp] + add rsp,56 + $L$dec_key_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret -$L$SEH_end_asm_AES_set_decrypt_key: + +$L$SEH_end_aes_nohw_set_decrypt_key: ALIGN 16 -global asm_AES_cbc_encrypt +global aes_nohw_cbc_encrypt EXTERN OPENSSL_ia32cap_P -asm_AES_cbc_encrypt: +aes_nohw_cbc_encrypt: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp -$L$SEH_begin_asm_AES_cbc_encrypt: +$L$SEH_begin_aes_nohw_cbc_encrypt: mov rdi,rcx mov rsi,rdx mov rdx,r8 @@ -1352,15 +1429,25 @@ $L$SEH_begin_asm_AES_cbc_encrypt: mov r9,QWORD[48+rsp] + cmp rdx,0 je NEAR $L$cbc_epilogue pushfq + + + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + $L$cbc_prologue: cld @@ -1371,6 +1458,7 @@ $L$cbc_prologue: cmp r9,0 cmove r14,r10 + lea r10,[OPENSSL_ia32cap_P] mov r10d,DWORD[r10] cmp rdx,512 @@ -1407,7 +1495,9 @@ $L$cbc_te_ok: xchg r15,rsp + mov QWORD[16+rsp],r15 + $L$cbc_fast_body: mov QWORD[24+rsp],rdi mov QWORD[32+rsp],rsi @@ -1606,6 +1696,7 @@ $L$cbc_fast_cleanup: ALIGN 16 $L$cbc_slow_prologue: + lea rbp,[((-88))+rsp] and rbp,-64 @@ -1617,7 +1708,9 @@ $L$cbc_slow_prologue: xchg rbp,rsp + mov QWORD[16+rsp],rbp + $L$cbc_slow_body: @@ -1789,20 +1882,32 @@ $L$cbc_slow_dec_partial: ALIGN 16 $L$cbc_exit: mov rsi,QWORD[16+rsp] + mov r15,QWORD[rsi] + mov r14,QWORD[8+rsi] + mov r13,QWORD[16+rsi] + mov r12,QWORD[24+rsi] + mov rbp,QWORD[32+rsi] + mov rbx,QWORD[40+rsi] + lea rsp,[48+rsi] + $L$cbc_popfq: popfq + + + $L$cbc_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret -$L$SEH_end_asm_AES_cbc_encrypt: + +$L$SEH_end_aes_nohw_cbc_encrypt: ALIGN 64 $L$AES_Te: DD 0xa56363c6,0xa56363c6 @@ -2814,44 +2919,44 @@ $L$common_seh_exit: section .pdata rdata align=4 ALIGN 4 - DD $L$SEH_begin_asm_AES_encrypt wrt ..imagebase - DD $L$SEH_end_asm_AES_encrypt wrt ..imagebase - DD $L$SEH_info_asm_AES_encrypt wrt ..imagebase + DD $L$SEH_begin_aes_nohw_encrypt wrt ..imagebase + DD $L$SEH_end_aes_nohw_encrypt wrt ..imagebase + DD $L$SEH_info_aes_nohw_encrypt wrt ..imagebase - DD $L$SEH_begin_asm_AES_decrypt wrt ..imagebase - DD $L$SEH_end_asm_AES_decrypt wrt ..imagebase - DD 
$L$SEH_info_asm_AES_decrypt wrt ..imagebase + DD $L$SEH_begin_aes_nohw_decrypt wrt ..imagebase + DD $L$SEH_end_aes_nohw_decrypt wrt ..imagebase + DD $L$SEH_info_aes_nohw_decrypt wrt ..imagebase - DD $L$SEH_begin_asm_AES_set_encrypt_key wrt ..imagebase - DD $L$SEH_end_asm_AES_set_encrypt_key wrt ..imagebase - DD $L$SEH_info_asm_AES_set_encrypt_key wrt ..imagebase + DD $L$SEH_begin_aes_nohw_set_encrypt_key wrt ..imagebase + DD $L$SEH_end_aes_nohw_set_encrypt_key wrt ..imagebase + DD $L$SEH_info_aes_nohw_set_encrypt_key wrt ..imagebase - DD $L$SEH_begin_asm_AES_set_decrypt_key wrt ..imagebase - DD $L$SEH_end_asm_AES_set_decrypt_key wrt ..imagebase - DD $L$SEH_info_asm_AES_set_decrypt_key wrt ..imagebase + DD $L$SEH_begin_aes_nohw_set_decrypt_key wrt ..imagebase + DD $L$SEH_end_aes_nohw_set_decrypt_key wrt ..imagebase + DD $L$SEH_info_aes_nohw_set_decrypt_key wrt ..imagebase - DD $L$SEH_begin_asm_AES_cbc_encrypt wrt ..imagebase - DD $L$SEH_end_asm_AES_cbc_encrypt wrt ..imagebase - DD $L$SEH_info_asm_AES_cbc_encrypt wrt ..imagebase + DD $L$SEH_begin_aes_nohw_cbc_encrypt wrt ..imagebase + DD $L$SEH_end_aes_nohw_cbc_encrypt wrt ..imagebase + DD $L$SEH_info_aes_nohw_cbc_encrypt wrt ..imagebase section .xdata rdata align=8 ALIGN 8 -$L$SEH_info_asm_AES_encrypt: +$L$SEH_info_aes_nohw_encrypt: DB 9,0,0,0 DD block_se_handler wrt ..imagebase DD $L$enc_prologue wrt ..imagebase,$L$enc_epilogue wrt ..imagebase -$L$SEH_info_asm_AES_decrypt: +$L$SEH_info_aes_nohw_decrypt: DB 9,0,0,0 DD block_se_handler wrt ..imagebase DD $L$dec_prologue wrt ..imagebase,$L$dec_epilogue wrt ..imagebase -$L$SEH_info_asm_AES_set_encrypt_key: +$L$SEH_info_aes_nohw_set_encrypt_key: DB 9,0,0,0 DD key_se_handler wrt ..imagebase DD $L$enc_key_prologue wrt ..imagebase,$L$enc_key_epilogue wrt ..imagebase -$L$SEH_info_asm_AES_set_decrypt_key: +$L$SEH_info_aes_nohw_set_decrypt_key: DB 9,0,0,0 DD key_se_handler wrt ..imagebase DD $L$dec_key_prologue wrt ..imagebase,$L$dec_key_epilogue wrt ..imagebase -$L$SEH_info_asm_AES_cbc_encrypt: +$L$SEH_info_aes_nohw_cbc_encrypt: DB 9,0,0,0 DD cbc_se_handler wrt ..imagebase diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.asm index 63bcd48dcb..2b51a26849 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.asm @@ -1,7 +1,14 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif section .text code align=64 @@ -594,6 +601,10 @@ $L$SEH_begin_aesni_gcm_encrypt: +%ifdef BORINGSSL_DISPATCH_TEST +EXTERN BORINGSSL_function_hit + mov BYTE[((BORINGSSL_function_hit+2))],1 +%endif xor r10,r10 diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/aesni-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/aesni-x86_64.asm index 13e9c5e5b6..342c1523ee 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/aesni-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/aesni-x86_64.asm @@ -1,14 +1,26 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif section .text code align=64 EXTERN OPENSSL_ia32cap_P -global aesni_encrypt +global aes_hw_encrypt ALIGN 16 -aesni_encrypt: +aes_hw_encrypt: + +%ifdef BORINGSSL_DISPATCH_TEST +EXTERN BORINGSSL_function_hit + mov BYTE[((BORINGSSL_function_hit+1))],1 +%endif movups xmm2,XMMWORD[rcx] mov eax,DWORD[240+r8] movups xmm0,XMMWORD[r8] @@ -29,10 +41,12 @@ DB 102,15,56,221,209 DB 0F3h,0C3h ;repret -global aesni_decrypt + +global aes_hw_decrypt ALIGN 16 -aesni_decrypt: +aes_hw_decrypt: + movups xmm2,XMMWORD[rcx] mov eax,DWORD[240+r8] movups xmm0,XMMWORD[r8] @@ -53,8 +67,10 @@ DB 102,15,56,223,209 DB 0F3h,0C3h ;repret + ALIGN 16 _aesni_encrypt2: + movups xmm0,XMMWORD[rcx] shl eax,4 movups xmm1,XMMWORD[16+rcx] @@ -82,8 +98,10 @@ DB 102,15,56,221,216 DB 0F3h,0C3h ;repret + ALIGN 16 _aesni_decrypt2: + movups xmm0,XMMWORD[rcx] shl eax,4 movups xmm1,XMMWORD[16+rcx] @@ -111,8 +129,10 @@ DB 102,15,56,223,216 DB 0F3h,0C3h ;repret + ALIGN 16 _aesni_encrypt3: + movups xmm0,XMMWORD[rcx] shl eax,4 movups xmm1,XMMWORD[16+rcx] @@ -145,8 +165,10 @@ DB 102,15,56,221,224 DB 0F3h,0C3h ;repret + ALIGN 16 _aesni_decrypt3: + movups xmm0,XMMWORD[rcx] shl eax,4 movups xmm1,XMMWORD[16+rcx] @@ -179,8 +201,10 @@ DB 102,15,56,223,224 DB 0F3h,0C3h ;repret + ALIGN 16 _aesni_encrypt4: + movups xmm0,XMMWORD[rcx] shl eax,4 movups xmm1,XMMWORD[16+rcx] @@ -219,8 +243,10 @@ DB 102,15,56,221,232 DB 0F3h,0C3h ;repret + ALIGN 16 _aesni_decrypt4: + movups xmm0,XMMWORD[rcx] shl eax,4 movups xmm1,XMMWORD[16+rcx] @@ -259,8 +285,10 @@ DB 102,15,56,223,232 DB 0F3h,0C3h ;repret + ALIGN 16 _aesni_encrypt6: + movups xmm0,XMMWORD[rcx] shl eax,4 movups xmm1,XMMWORD[16+rcx] @@ -313,8 +341,10 @@ DB 102,15,56,221,248 DB 0F3h,0C3h ;repret + ALIGN 16 _aesni_decrypt6: + movups xmm0,XMMWORD[rcx] shl eax,4 movups xmm1,XMMWORD[16+rcx] @@ -367,8 +397,10 @@ DB 102,15,56,223,248 DB 0F3h,0C3h ;repret + ALIGN 16 _aesni_encrypt8: + movups xmm0,XMMWORD[rcx] shl eax,4 movups xmm1,XMMWORD[16+rcx] @@ -431,8 +463,10 @@ DB 102,68,15,56,221,200 DB 0F3h,0C3h ;repret + ALIGN 16 _aesni_decrypt8: + movups xmm0,XMMWORD[rcx] shl eax,4 movups xmm1,XMMWORD[16+rcx] @@ -494,14 +528,15 @@ DB 102,68,15,56,223,192 DB 102,68,15,56,223,200 DB 0F3h,0C3h ;repret -global aesni_ecb_encrypt + +global aes_hw_ecb_encrypt ALIGN 16 -aesni_ecb_encrypt: +aes_hw_ecb_encrypt: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp -$L$SEH_begin_aesni_ecb_encrypt: +$L$SEH_begin_aes_hw_ecb_encrypt: mov rdi,rcx mov rsi,rdx mov rdx,r8 @@ -509,6 +544,7 @@ $L$SEH_begin_aesni_ecb_encrypt: mov r8,QWORD[40+rsp] + lea rsp,[((-88))+rsp] movaps XMMWORD[rsp],xmm6 movaps XMMWORD[16+rsp],xmm7 @@ -864,235 +900,16 @@ $L$ecb_enc_ret: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret -$L$SEH_end_aesni_ecb_encrypt: -global aesni_ccm64_encrypt_blocks + +$L$SEH_end_aes_hw_ecb_encrypt: +global aes_hw_ctr32_encrypt_blocks ALIGN 16 -aesni_ccm64_encrypt_blocks: +aes_hw_ctr32_encrypt_blocks: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp -$L$SEH_begin_aesni_ccm64_encrypt_blocks: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - mov r9,QWORD[48+rsp] - - - lea rsp,[((-88))+rsp] - movaps XMMWORD[rsp],xmm6 - movaps XMMWORD[16+rsp],xmm7 - movaps XMMWORD[32+rsp],xmm8 - movaps XMMWORD[48+rsp],xmm9 -$L$ccm64_enc_body: - mov eax,DWORD[240+rcx] - movdqu xmm6,XMMWORD[r8] - movdqa 
xmm9,XMMWORD[$L$increment64] - movdqa xmm7,XMMWORD[$L$bswap_mask] - - shl eax,4 - mov r10d,16 - lea r11,[rcx] - movdqu xmm3,XMMWORD[r9] - movdqa xmm2,xmm6 - lea rcx,[32+rax*1+rcx] -DB 102,15,56,0,247 - sub r10,rax - jmp NEAR $L$ccm64_enc_outer -ALIGN 16 -$L$ccm64_enc_outer: - movups xmm0,XMMWORD[r11] - mov rax,r10 - movups xmm8,XMMWORD[rdi] - - xorps xmm2,xmm0 - movups xmm1,XMMWORD[16+r11] - xorps xmm0,xmm8 - xorps xmm3,xmm0 - movups xmm0,XMMWORD[32+r11] - -$L$ccm64_enc2_loop: -DB 102,15,56,220,209 -DB 102,15,56,220,217 - movups xmm1,XMMWORD[rax*1+rcx] - add rax,32 -DB 102,15,56,220,208 -DB 102,15,56,220,216 - movups xmm0,XMMWORD[((-16))+rax*1+rcx] - jnz NEAR $L$ccm64_enc2_loop -DB 102,15,56,220,209 -DB 102,15,56,220,217 - paddq xmm6,xmm9 - dec rdx -DB 102,15,56,221,208 -DB 102,15,56,221,216 - - lea rdi,[16+rdi] - xorps xmm8,xmm2 - movdqa xmm2,xmm6 - movups XMMWORD[rsi],xmm8 -DB 102,15,56,0,215 - lea rsi,[16+rsi] - jnz NEAR $L$ccm64_enc_outer - - pxor xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - movups XMMWORD[r9],xmm3 - pxor xmm3,xmm3 - pxor xmm8,xmm8 - pxor xmm6,xmm6 - movaps xmm6,XMMWORD[rsp] - movaps XMMWORD[rsp],xmm0 - movaps xmm7,XMMWORD[16+rsp] - movaps XMMWORD[16+rsp],xmm0 - movaps xmm8,XMMWORD[32+rsp] - movaps XMMWORD[32+rsp],xmm0 - movaps xmm9,XMMWORD[48+rsp] - movaps XMMWORD[48+rsp],xmm0 - lea rsp,[88+rsp] -$L$ccm64_enc_ret: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret -$L$SEH_end_aesni_ccm64_encrypt_blocks: -global aesni_ccm64_decrypt_blocks - -ALIGN 16 -aesni_ccm64_decrypt_blocks: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_aesni_ccm64_decrypt_blocks: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - mov r9,QWORD[48+rsp] - - - lea rsp,[((-88))+rsp] - movaps XMMWORD[rsp],xmm6 - movaps XMMWORD[16+rsp],xmm7 - movaps XMMWORD[32+rsp],xmm8 - movaps XMMWORD[48+rsp],xmm9 -$L$ccm64_dec_body: - mov eax,DWORD[240+rcx] - movups xmm6,XMMWORD[r8] - movdqu xmm3,XMMWORD[r9] - movdqa xmm9,XMMWORD[$L$increment64] - movdqa xmm7,XMMWORD[$L$bswap_mask] - - movaps xmm2,xmm6 - mov r10d,eax - mov r11,rcx -DB 102,15,56,0,247 - movups xmm0,XMMWORD[rcx] - movups xmm1,XMMWORD[16+rcx] - lea rcx,[32+rcx] - xorps xmm2,xmm0 -$L$oop_enc1_5: -DB 102,15,56,220,209 - dec eax - movups xmm1,XMMWORD[rcx] - lea rcx,[16+rcx] - jnz NEAR $L$oop_enc1_5 -DB 102,15,56,221,209 - shl r10d,4 - mov eax,16 - movups xmm8,XMMWORD[rdi] - paddq xmm6,xmm9 - lea rdi,[16+rdi] - sub rax,r10 - lea rcx,[32+r10*1+r11] - mov r10,rax - jmp NEAR $L$ccm64_dec_outer -ALIGN 16 -$L$ccm64_dec_outer: - xorps xmm8,xmm2 - movdqa xmm2,xmm6 - movups XMMWORD[rsi],xmm8 - lea rsi,[16+rsi] -DB 102,15,56,0,215 - - sub rdx,1 - jz NEAR $L$ccm64_dec_break - - movups xmm0,XMMWORD[r11] - mov rax,r10 - movups xmm1,XMMWORD[16+r11] - xorps xmm8,xmm0 - xorps xmm2,xmm0 - xorps xmm3,xmm8 - movups xmm0,XMMWORD[32+r11] - jmp NEAR $L$ccm64_dec2_loop -ALIGN 16 -$L$ccm64_dec2_loop: -DB 102,15,56,220,209 -DB 102,15,56,220,217 - movups xmm1,XMMWORD[rax*1+rcx] - add rax,32 -DB 102,15,56,220,208 -DB 102,15,56,220,216 - movups xmm0,XMMWORD[((-16))+rax*1+rcx] - jnz NEAR $L$ccm64_dec2_loop - movups xmm8,XMMWORD[rdi] - paddq xmm6,xmm9 -DB 102,15,56,220,209 -DB 102,15,56,220,217 -DB 102,15,56,221,208 -DB 102,15,56,221,216 - lea rdi,[16+rdi] - jmp NEAR $L$ccm64_dec_outer - -ALIGN 16 -$L$ccm64_dec_break: - - mov eax,DWORD[240+r11] - movups xmm0,XMMWORD[r11] - movups xmm1,XMMWORD[16+r11] - xorps xmm8,xmm0 - lea r11,[32+r11] - xorps xmm3,xmm8 -$L$oop_enc1_6: -DB 
102,15,56,220,217 - dec eax - movups xmm1,XMMWORD[r11] - lea r11,[16+r11] - jnz NEAR $L$oop_enc1_6 -DB 102,15,56,221,217 - pxor xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - movups XMMWORD[r9],xmm3 - pxor xmm3,xmm3 - pxor xmm8,xmm8 - pxor xmm6,xmm6 - movaps xmm6,XMMWORD[rsp] - movaps XMMWORD[rsp],xmm0 - movaps xmm7,XMMWORD[16+rsp] - movaps XMMWORD[16+rsp],xmm0 - movaps xmm8,XMMWORD[32+rsp] - movaps XMMWORD[32+rsp],xmm0 - movaps xmm9,XMMWORD[48+rsp] - movaps XMMWORD[48+rsp],xmm0 - lea rsp,[88+rsp] -$L$ccm64_dec_ret: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret -$L$SEH_end_aesni_ccm64_decrypt_blocks: -global aesni_ctr32_encrypt_blocks - -ALIGN 16 -aesni_ctr32_encrypt_blocks: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_aesni_ctr32_encrypt_blocks: +$L$SEH_begin_aes_hw_ctr32_encrypt_blocks: mov rdi,rcx mov rsi,rdx mov rdx,r8 @@ -1100,6 +917,10 @@ $L$SEH_begin_aesni_ctr32_encrypt_blocks: mov r8,QWORD[40+rsp] + +%ifdef BORINGSSL_DISPATCH_TEST + mov BYTE[BORINGSSL_function_hit],1 +%endif cmp rdx,1 jne NEAR $L$ctr32_bulk @@ -1112,12 +933,12 @@ $L$SEH_begin_aesni_ctr32_encrypt_blocks: movups xmm1,XMMWORD[16+rcx] lea rcx,[32+rcx] xorps xmm2,xmm0 -$L$oop_enc1_7: +$L$oop_enc1_5: DB 102,15,56,220,209 dec edx movups xmm1,XMMWORD[rcx] lea rcx,[16+rcx] - jnz NEAR $L$oop_enc1_7 + jnz NEAR $L$oop_enc1_5 DB 102,15,56,221,209 pxor xmm0,xmm0 pxor xmm1,xmm1 @@ -1130,7 +951,9 @@ DB 102,15,56,221,209 ALIGN 16 $L$ctr32_bulk: lea r11,[rsp] + push rbp + sub rsp,288 and rsp,-16 movaps XMMWORD[(-168)+r11],xmm6 @@ -1686,20 +1509,23 @@ $L$ctr32_done: movaps XMMWORD[96+rsp],xmm0 movaps XMMWORD[112+rsp],xmm0 mov rbp,QWORD[((-8))+r11] + lea rsp,[r11] + $L$ctr32_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret -$L$SEH_end_aesni_ctr32_encrypt_blocks: -global aesni_xts_encrypt + +$L$SEH_end_aes_hw_ctr32_encrypt_blocks: +global aes_hw_cbc_encrypt ALIGN 16 -aesni_xts_encrypt: +aes_hw_cbc_encrypt: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp -$L$SEH_begin_aesni_xts_encrypt: +$L$SEH_begin_aes_hw_cbc_encrypt: mov rdi,rcx mov rsi,rdx mov rdx,r8 @@ -1708,1928 +1534,6 @@ $L$SEH_begin_aesni_xts_encrypt: mov r9,QWORD[48+rsp] - lea r11,[rsp] - push rbp - sub rsp,272 - and rsp,-16 - movaps XMMWORD[(-168)+r11],xmm6 - movaps XMMWORD[(-152)+r11],xmm7 - movaps XMMWORD[(-136)+r11],xmm8 - movaps XMMWORD[(-120)+r11],xmm9 - movaps XMMWORD[(-104)+r11],xmm10 - movaps XMMWORD[(-88)+r11],xmm11 - movaps XMMWORD[(-72)+r11],xmm12 - movaps XMMWORD[(-56)+r11],xmm13 - movaps XMMWORD[(-40)+r11],xmm14 - movaps XMMWORD[(-24)+r11],xmm15 -$L$xts_enc_body: - movups xmm2,XMMWORD[r9] - mov eax,DWORD[240+r8] - mov r10d,DWORD[240+rcx] - movups xmm0,XMMWORD[r8] - movups xmm1,XMMWORD[16+r8] - lea r8,[32+r8] - xorps xmm2,xmm0 -$L$oop_enc1_8: -DB 102,15,56,220,209 - dec eax - movups xmm1,XMMWORD[r8] - lea r8,[16+r8] - jnz NEAR $L$oop_enc1_8 -DB 102,15,56,221,209 - movups xmm0,XMMWORD[rcx] - mov rbp,rcx - mov eax,r10d - shl r10d,4 - mov r9,rdx - and rdx,-16 - - movups xmm1,XMMWORD[16+r10*1+rcx] - - movdqa xmm8,XMMWORD[$L$xts_magic] - movdqa xmm15,xmm2 - pshufd xmm9,xmm2,0x5f - pxor xmm1,xmm0 - movdqa xmm14,xmm9 - paddd xmm9,xmm9 - movdqa xmm10,xmm15 - psrad xmm14,31 - paddq xmm15,xmm15 - pand xmm14,xmm8 - pxor xmm10,xmm0 - pxor xmm15,xmm14 - movdqa xmm14,xmm9 - paddd xmm9,xmm9 - movdqa xmm11,xmm15 - psrad xmm14,31 - paddq xmm15,xmm15 - pand xmm14,xmm8 - pxor xmm11,xmm0 - pxor xmm15,xmm14 - movdqa xmm14,xmm9 - paddd 
xmm9,xmm9 - movdqa xmm12,xmm15 - psrad xmm14,31 - paddq xmm15,xmm15 - pand xmm14,xmm8 - pxor xmm12,xmm0 - pxor xmm15,xmm14 - movdqa xmm14,xmm9 - paddd xmm9,xmm9 - movdqa xmm13,xmm15 - psrad xmm14,31 - paddq xmm15,xmm15 - pand xmm14,xmm8 - pxor xmm13,xmm0 - pxor xmm15,xmm14 - movdqa xmm14,xmm15 - psrad xmm9,31 - paddq xmm15,xmm15 - pand xmm9,xmm8 - pxor xmm14,xmm0 - pxor xmm15,xmm9 - movaps XMMWORD[96+rsp],xmm1 - - sub rdx,16*6 - jc NEAR $L$xts_enc_short - - mov eax,16+96 - lea rcx,[32+r10*1+rbp] - sub rax,r10 - movups xmm1,XMMWORD[16+rbp] - mov r10,rax - lea r8,[$L$xts_magic] - jmp NEAR $L$xts_enc_grandloop - -ALIGN 32 -$L$xts_enc_grandloop: - movdqu xmm2,XMMWORD[rdi] - movdqa xmm8,xmm0 - movdqu xmm3,XMMWORD[16+rdi] - pxor xmm2,xmm10 - movdqu xmm4,XMMWORD[32+rdi] - pxor xmm3,xmm11 -DB 102,15,56,220,209 - movdqu xmm5,XMMWORD[48+rdi] - pxor xmm4,xmm12 -DB 102,15,56,220,217 - movdqu xmm6,XMMWORD[64+rdi] - pxor xmm5,xmm13 -DB 102,15,56,220,225 - movdqu xmm7,XMMWORD[80+rdi] - pxor xmm8,xmm15 - movdqa xmm9,XMMWORD[96+rsp] - pxor xmm6,xmm14 -DB 102,15,56,220,233 - movups xmm0,XMMWORD[32+rbp] - lea rdi,[96+rdi] - pxor xmm7,xmm8 - - pxor xmm10,xmm9 -DB 102,15,56,220,241 - pxor xmm11,xmm9 - movdqa XMMWORD[rsp],xmm10 -DB 102,15,56,220,249 - movups xmm1,XMMWORD[48+rbp] - pxor xmm12,xmm9 - -DB 102,15,56,220,208 - pxor xmm13,xmm9 - movdqa XMMWORD[16+rsp],xmm11 -DB 102,15,56,220,216 - pxor xmm14,xmm9 - movdqa XMMWORD[32+rsp],xmm12 -DB 102,15,56,220,224 -DB 102,15,56,220,232 - pxor xmm8,xmm9 - movdqa XMMWORD[64+rsp],xmm14 -DB 102,15,56,220,240 -DB 102,15,56,220,248 - movups xmm0,XMMWORD[64+rbp] - movdqa XMMWORD[80+rsp],xmm8 - pshufd xmm9,xmm15,0x5f - jmp NEAR $L$xts_enc_loop6 -ALIGN 32 -$L$xts_enc_loop6: -DB 102,15,56,220,209 -DB 102,15,56,220,217 -DB 102,15,56,220,225 -DB 102,15,56,220,233 -DB 102,15,56,220,241 -DB 102,15,56,220,249 - movups xmm1,XMMWORD[((-64))+rax*1+rcx] - add rax,32 - -DB 102,15,56,220,208 -DB 102,15,56,220,216 -DB 102,15,56,220,224 -DB 102,15,56,220,232 -DB 102,15,56,220,240 -DB 102,15,56,220,248 - movups xmm0,XMMWORD[((-80))+rax*1+rcx] - jnz NEAR $L$xts_enc_loop6 - - movdqa xmm8,XMMWORD[r8] - movdqa xmm14,xmm9 - paddd xmm9,xmm9 -DB 102,15,56,220,209 - paddq xmm15,xmm15 - psrad xmm14,31 -DB 102,15,56,220,217 - pand xmm14,xmm8 - movups xmm10,XMMWORD[rbp] -DB 102,15,56,220,225 -DB 102,15,56,220,233 -DB 102,15,56,220,241 - pxor xmm15,xmm14 - movaps xmm11,xmm10 -DB 102,15,56,220,249 - movups xmm1,XMMWORD[((-64))+rcx] - - movdqa xmm14,xmm9 -DB 102,15,56,220,208 - paddd xmm9,xmm9 - pxor xmm10,xmm15 -DB 102,15,56,220,216 - psrad xmm14,31 - paddq xmm15,xmm15 -DB 102,15,56,220,224 -DB 102,15,56,220,232 - pand xmm14,xmm8 - movaps xmm12,xmm11 -DB 102,15,56,220,240 - pxor xmm15,xmm14 - movdqa xmm14,xmm9 -DB 102,15,56,220,248 - movups xmm0,XMMWORD[((-48))+rcx] - - paddd xmm9,xmm9 -DB 102,15,56,220,209 - pxor xmm11,xmm15 - psrad xmm14,31 -DB 102,15,56,220,217 - paddq xmm15,xmm15 - pand xmm14,xmm8 -DB 102,15,56,220,225 -DB 102,15,56,220,233 - movdqa XMMWORD[48+rsp],xmm13 - pxor xmm15,xmm14 -DB 102,15,56,220,241 - movaps xmm13,xmm12 - movdqa xmm14,xmm9 -DB 102,15,56,220,249 - movups xmm1,XMMWORD[((-32))+rcx] - - paddd xmm9,xmm9 -DB 102,15,56,220,208 - pxor xmm12,xmm15 - psrad xmm14,31 -DB 102,15,56,220,216 - paddq xmm15,xmm15 - pand xmm14,xmm8 -DB 102,15,56,220,224 -DB 102,15,56,220,232 -DB 102,15,56,220,240 - pxor xmm15,xmm14 - movaps xmm14,xmm13 -DB 102,15,56,220,248 - - movdqa xmm0,xmm9 - paddd xmm9,xmm9 -DB 102,15,56,220,209 - pxor xmm13,xmm15 - psrad xmm0,31 -DB 102,15,56,220,217 - paddq 
xmm15,xmm15 - pand xmm0,xmm8 -DB 102,15,56,220,225 -DB 102,15,56,220,233 - pxor xmm15,xmm0 - movups xmm0,XMMWORD[rbp] -DB 102,15,56,220,241 -DB 102,15,56,220,249 - movups xmm1,XMMWORD[16+rbp] - - pxor xmm14,xmm15 -DB 102,15,56,221,84,36,0 - psrad xmm9,31 - paddq xmm15,xmm15 -DB 102,15,56,221,92,36,16 -DB 102,15,56,221,100,36,32 - pand xmm9,xmm8 - mov rax,r10 -DB 102,15,56,221,108,36,48 -DB 102,15,56,221,116,36,64 -DB 102,15,56,221,124,36,80 - pxor xmm15,xmm9 - - lea rsi,[96+rsi] - movups XMMWORD[(-96)+rsi],xmm2 - movups XMMWORD[(-80)+rsi],xmm3 - movups XMMWORD[(-64)+rsi],xmm4 - movups XMMWORD[(-48)+rsi],xmm5 - movups XMMWORD[(-32)+rsi],xmm6 - movups XMMWORD[(-16)+rsi],xmm7 - sub rdx,16*6 - jnc NEAR $L$xts_enc_grandloop - - mov eax,16+96 - sub eax,r10d - mov rcx,rbp - shr eax,4 - -$L$xts_enc_short: - - mov r10d,eax - pxor xmm10,xmm0 - add rdx,16*6 - jz NEAR $L$xts_enc_done - - pxor xmm11,xmm0 - cmp rdx,0x20 - jb NEAR $L$xts_enc_one - pxor xmm12,xmm0 - je NEAR $L$xts_enc_two - - pxor xmm13,xmm0 - cmp rdx,0x40 - jb NEAR $L$xts_enc_three - pxor xmm14,xmm0 - je NEAR $L$xts_enc_four - - movdqu xmm2,XMMWORD[rdi] - movdqu xmm3,XMMWORD[16+rdi] - movdqu xmm4,XMMWORD[32+rdi] - pxor xmm2,xmm10 - movdqu xmm5,XMMWORD[48+rdi] - pxor xmm3,xmm11 - movdqu xmm6,XMMWORD[64+rdi] - lea rdi,[80+rdi] - pxor xmm4,xmm12 - pxor xmm5,xmm13 - pxor xmm6,xmm14 - pxor xmm7,xmm7 - - call _aesni_encrypt6 - - xorps xmm2,xmm10 - movdqa xmm10,xmm15 - xorps xmm3,xmm11 - xorps xmm4,xmm12 - movdqu XMMWORD[rsi],xmm2 - xorps xmm5,xmm13 - movdqu XMMWORD[16+rsi],xmm3 - xorps xmm6,xmm14 - movdqu XMMWORD[32+rsi],xmm4 - movdqu XMMWORD[48+rsi],xmm5 - movdqu XMMWORD[64+rsi],xmm6 - lea rsi,[80+rsi] - jmp NEAR $L$xts_enc_done - -ALIGN 16 -$L$xts_enc_one: - movups xmm2,XMMWORD[rdi] - lea rdi,[16+rdi] - xorps xmm2,xmm10 - movups xmm0,XMMWORD[rcx] - movups xmm1,XMMWORD[16+rcx] - lea rcx,[32+rcx] - xorps xmm2,xmm0 -$L$oop_enc1_9: -DB 102,15,56,220,209 - dec eax - movups xmm1,XMMWORD[rcx] - lea rcx,[16+rcx] - jnz NEAR $L$oop_enc1_9 -DB 102,15,56,221,209 - xorps xmm2,xmm10 - movdqa xmm10,xmm11 - movups XMMWORD[rsi],xmm2 - lea rsi,[16+rsi] - jmp NEAR $L$xts_enc_done - -ALIGN 16 -$L$xts_enc_two: - movups xmm2,XMMWORD[rdi] - movups xmm3,XMMWORD[16+rdi] - lea rdi,[32+rdi] - xorps xmm2,xmm10 - xorps xmm3,xmm11 - - call _aesni_encrypt2 - - xorps xmm2,xmm10 - movdqa xmm10,xmm12 - xorps xmm3,xmm11 - movups XMMWORD[rsi],xmm2 - movups XMMWORD[16+rsi],xmm3 - lea rsi,[32+rsi] - jmp NEAR $L$xts_enc_done - -ALIGN 16 -$L$xts_enc_three: - movups xmm2,XMMWORD[rdi] - movups xmm3,XMMWORD[16+rdi] - movups xmm4,XMMWORD[32+rdi] - lea rdi,[48+rdi] - xorps xmm2,xmm10 - xorps xmm3,xmm11 - xorps xmm4,xmm12 - - call _aesni_encrypt3 - - xorps xmm2,xmm10 - movdqa xmm10,xmm13 - xorps xmm3,xmm11 - xorps xmm4,xmm12 - movups XMMWORD[rsi],xmm2 - movups XMMWORD[16+rsi],xmm3 - movups XMMWORD[32+rsi],xmm4 - lea rsi,[48+rsi] - jmp NEAR $L$xts_enc_done - -ALIGN 16 -$L$xts_enc_four: - movups xmm2,XMMWORD[rdi] - movups xmm3,XMMWORD[16+rdi] - movups xmm4,XMMWORD[32+rdi] - xorps xmm2,xmm10 - movups xmm5,XMMWORD[48+rdi] - lea rdi,[64+rdi] - xorps xmm3,xmm11 - xorps xmm4,xmm12 - xorps xmm5,xmm13 - - call _aesni_encrypt4 - - pxor xmm2,xmm10 - movdqa xmm10,xmm14 - pxor xmm3,xmm11 - pxor xmm4,xmm12 - movdqu XMMWORD[rsi],xmm2 - pxor xmm5,xmm13 - movdqu XMMWORD[16+rsi],xmm3 - movdqu XMMWORD[32+rsi],xmm4 - movdqu XMMWORD[48+rsi],xmm5 - lea rsi,[64+rsi] - jmp NEAR $L$xts_enc_done - -ALIGN 16 -$L$xts_enc_done: - and r9,15 - jz NEAR $L$xts_enc_ret - mov rdx,r9 - -$L$xts_enc_steal: - movzx 
eax,BYTE[rdi] - movzx ecx,BYTE[((-16))+rsi] - lea rdi,[1+rdi] - mov BYTE[((-16))+rsi],al - mov BYTE[rsi],cl - lea rsi,[1+rsi] - sub rdx,1 - jnz NEAR $L$xts_enc_steal - - sub rsi,r9 - mov rcx,rbp - mov eax,r10d - - movups xmm2,XMMWORD[((-16))+rsi] - xorps xmm2,xmm10 - movups xmm0,XMMWORD[rcx] - movups xmm1,XMMWORD[16+rcx] - lea rcx,[32+rcx] - xorps xmm2,xmm0 -$L$oop_enc1_10: -DB 102,15,56,220,209 - dec eax - movups xmm1,XMMWORD[rcx] - lea rcx,[16+rcx] - jnz NEAR $L$oop_enc1_10 -DB 102,15,56,221,209 - xorps xmm2,xmm10 - movups XMMWORD[(-16)+rsi],xmm2 - -$L$xts_enc_ret: - xorps xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - pxor xmm3,xmm3 - pxor xmm4,xmm4 - pxor xmm5,xmm5 - movaps xmm6,XMMWORD[((-168))+r11] - movaps XMMWORD[(-168)+r11],xmm0 - movaps xmm7,XMMWORD[((-152))+r11] - movaps XMMWORD[(-152)+r11],xmm0 - movaps xmm8,XMMWORD[((-136))+r11] - movaps XMMWORD[(-136)+r11],xmm0 - movaps xmm9,XMMWORD[((-120))+r11] - movaps XMMWORD[(-120)+r11],xmm0 - movaps xmm10,XMMWORD[((-104))+r11] - movaps XMMWORD[(-104)+r11],xmm0 - movaps xmm11,XMMWORD[((-88))+r11] - movaps XMMWORD[(-88)+r11],xmm0 - movaps xmm12,XMMWORD[((-72))+r11] - movaps XMMWORD[(-72)+r11],xmm0 - movaps xmm13,XMMWORD[((-56))+r11] - movaps XMMWORD[(-56)+r11],xmm0 - movaps xmm14,XMMWORD[((-40))+r11] - movaps XMMWORD[(-40)+r11],xmm0 - movaps xmm15,XMMWORD[((-24))+r11] - movaps XMMWORD[(-24)+r11],xmm0 - movaps XMMWORD[rsp],xmm0 - movaps XMMWORD[16+rsp],xmm0 - movaps XMMWORD[32+rsp],xmm0 - movaps XMMWORD[48+rsp],xmm0 - movaps XMMWORD[64+rsp],xmm0 - movaps XMMWORD[80+rsp],xmm0 - movaps XMMWORD[96+rsp],xmm0 - mov rbp,QWORD[((-8))+r11] - lea rsp,[r11] -$L$xts_enc_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret -$L$SEH_end_aesni_xts_encrypt: -global aesni_xts_decrypt - -ALIGN 16 -aesni_xts_decrypt: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_aesni_xts_decrypt: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - mov r9,QWORD[48+rsp] - - - lea r11,[rsp] - push rbp - sub rsp,272 - and rsp,-16 - movaps XMMWORD[(-168)+r11],xmm6 - movaps XMMWORD[(-152)+r11],xmm7 - movaps XMMWORD[(-136)+r11],xmm8 - movaps XMMWORD[(-120)+r11],xmm9 - movaps XMMWORD[(-104)+r11],xmm10 - movaps XMMWORD[(-88)+r11],xmm11 - movaps XMMWORD[(-72)+r11],xmm12 - movaps XMMWORD[(-56)+r11],xmm13 - movaps XMMWORD[(-40)+r11],xmm14 - movaps XMMWORD[(-24)+r11],xmm15 -$L$xts_dec_body: - movups xmm2,XMMWORD[r9] - mov eax,DWORD[240+r8] - mov r10d,DWORD[240+rcx] - movups xmm0,XMMWORD[r8] - movups xmm1,XMMWORD[16+r8] - lea r8,[32+r8] - xorps xmm2,xmm0 -$L$oop_enc1_11: -DB 102,15,56,220,209 - dec eax - movups xmm1,XMMWORD[r8] - lea r8,[16+r8] - jnz NEAR $L$oop_enc1_11 -DB 102,15,56,221,209 - xor eax,eax - test rdx,15 - setnz al - shl rax,4 - sub rdx,rax - - movups xmm0,XMMWORD[rcx] - mov rbp,rcx - mov eax,r10d - shl r10d,4 - mov r9,rdx - and rdx,-16 - - movups xmm1,XMMWORD[16+r10*1+rcx] - - movdqa xmm8,XMMWORD[$L$xts_magic] - movdqa xmm15,xmm2 - pshufd xmm9,xmm2,0x5f - pxor xmm1,xmm0 - movdqa xmm14,xmm9 - paddd xmm9,xmm9 - movdqa xmm10,xmm15 - psrad xmm14,31 - paddq xmm15,xmm15 - pand xmm14,xmm8 - pxor xmm10,xmm0 - pxor xmm15,xmm14 - movdqa xmm14,xmm9 - paddd xmm9,xmm9 - movdqa xmm11,xmm15 - psrad xmm14,31 - paddq xmm15,xmm15 - pand xmm14,xmm8 - pxor xmm11,xmm0 - pxor xmm15,xmm14 - movdqa xmm14,xmm9 - paddd xmm9,xmm9 - movdqa xmm12,xmm15 - psrad xmm14,31 - paddq xmm15,xmm15 - pand xmm14,xmm8 - pxor xmm12,xmm0 - pxor xmm15,xmm14 - movdqa xmm14,xmm9 - paddd xmm9,xmm9 - movdqa 
xmm13,xmm15 - psrad xmm14,31 - paddq xmm15,xmm15 - pand xmm14,xmm8 - pxor xmm13,xmm0 - pxor xmm15,xmm14 - movdqa xmm14,xmm15 - psrad xmm9,31 - paddq xmm15,xmm15 - pand xmm9,xmm8 - pxor xmm14,xmm0 - pxor xmm15,xmm9 - movaps XMMWORD[96+rsp],xmm1 - - sub rdx,16*6 - jc NEAR $L$xts_dec_short - - mov eax,16+96 - lea rcx,[32+r10*1+rbp] - sub rax,r10 - movups xmm1,XMMWORD[16+rbp] - mov r10,rax - lea r8,[$L$xts_magic] - jmp NEAR $L$xts_dec_grandloop - -ALIGN 32 -$L$xts_dec_grandloop: - movdqu xmm2,XMMWORD[rdi] - movdqa xmm8,xmm0 - movdqu xmm3,XMMWORD[16+rdi] - pxor xmm2,xmm10 - movdqu xmm4,XMMWORD[32+rdi] - pxor xmm3,xmm11 -DB 102,15,56,222,209 - movdqu xmm5,XMMWORD[48+rdi] - pxor xmm4,xmm12 -DB 102,15,56,222,217 - movdqu xmm6,XMMWORD[64+rdi] - pxor xmm5,xmm13 -DB 102,15,56,222,225 - movdqu xmm7,XMMWORD[80+rdi] - pxor xmm8,xmm15 - movdqa xmm9,XMMWORD[96+rsp] - pxor xmm6,xmm14 -DB 102,15,56,222,233 - movups xmm0,XMMWORD[32+rbp] - lea rdi,[96+rdi] - pxor xmm7,xmm8 - - pxor xmm10,xmm9 -DB 102,15,56,222,241 - pxor xmm11,xmm9 - movdqa XMMWORD[rsp],xmm10 -DB 102,15,56,222,249 - movups xmm1,XMMWORD[48+rbp] - pxor xmm12,xmm9 - -DB 102,15,56,222,208 - pxor xmm13,xmm9 - movdqa XMMWORD[16+rsp],xmm11 -DB 102,15,56,222,216 - pxor xmm14,xmm9 - movdqa XMMWORD[32+rsp],xmm12 -DB 102,15,56,222,224 -DB 102,15,56,222,232 - pxor xmm8,xmm9 - movdqa XMMWORD[64+rsp],xmm14 -DB 102,15,56,222,240 -DB 102,15,56,222,248 - movups xmm0,XMMWORD[64+rbp] - movdqa XMMWORD[80+rsp],xmm8 - pshufd xmm9,xmm15,0x5f - jmp NEAR $L$xts_dec_loop6 -ALIGN 32 -$L$xts_dec_loop6: -DB 102,15,56,222,209 -DB 102,15,56,222,217 -DB 102,15,56,222,225 -DB 102,15,56,222,233 -DB 102,15,56,222,241 -DB 102,15,56,222,249 - movups xmm1,XMMWORD[((-64))+rax*1+rcx] - add rax,32 - -DB 102,15,56,222,208 -DB 102,15,56,222,216 -DB 102,15,56,222,224 -DB 102,15,56,222,232 -DB 102,15,56,222,240 -DB 102,15,56,222,248 - movups xmm0,XMMWORD[((-80))+rax*1+rcx] - jnz NEAR $L$xts_dec_loop6 - - movdqa xmm8,XMMWORD[r8] - movdqa xmm14,xmm9 - paddd xmm9,xmm9 -DB 102,15,56,222,209 - paddq xmm15,xmm15 - psrad xmm14,31 -DB 102,15,56,222,217 - pand xmm14,xmm8 - movups xmm10,XMMWORD[rbp] -DB 102,15,56,222,225 -DB 102,15,56,222,233 -DB 102,15,56,222,241 - pxor xmm15,xmm14 - movaps xmm11,xmm10 -DB 102,15,56,222,249 - movups xmm1,XMMWORD[((-64))+rcx] - - movdqa xmm14,xmm9 -DB 102,15,56,222,208 - paddd xmm9,xmm9 - pxor xmm10,xmm15 -DB 102,15,56,222,216 - psrad xmm14,31 - paddq xmm15,xmm15 -DB 102,15,56,222,224 -DB 102,15,56,222,232 - pand xmm14,xmm8 - movaps xmm12,xmm11 -DB 102,15,56,222,240 - pxor xmm15,xmm14 - movdqa xmm14,xmm9 -DB 102,15,56,222,248 - movups xmm0,XMMWORD[((-48))+rcx] - - paddd xmm9,xmm9 -DB 102,15,56,222,209 - pxor xmm11,xmm15 - psrad xmm14,31 -DB 102,15,56,222,217 - paddq xmm15,xmm15 - pand xmm14,xmm8 -DB 102,15,56,222,225 -DB 102,15,56,222,233 - movdqa XMMWORD[48+rsp],xmm13 - pxor xmm15,xmm14 -DB 102,15,56,222,241 - movaps xmm13,xmm12 - movdqa xmm14,xmm9 -DB 102,15,56,222,249 - movups xmm1,XMMWORD[((-32))+rcx] - - paddd xmm9,xmm9 -DB 102,15,56,222,208 - pxor xmm12,xmm15 - psrad xmm14,31 -DB 102,15,56,222,216 - paddq xmm15,xmm15 - pand xmm14,xmm8 -DB 102,15,56,222,224 -DB 102,15,56,222,232 -DB 102,15,56,222,240 - pxor xmm15,xmm14 - movaps xmm14,xmm13 -DB 102,15,56,222,248 - - movdqa xmm0,xmm9 - paddd xmm9,xmm9 -DB 102,15,56,222,209 - pxor xmm13,xmm15 - psrad xmm0,31 -DB 102,15,56,222,217 - paddq xmm15,xmm15 - pand xmm0,xmm8 -DB 102,15,56,222,225 -DB 102,15,56,222,233 - pxor xmm15,xmm0 - movups xmm0,XMMWORD[rbp] -DB 102,15,56,222,241 -DB 102,15,56,222,249 - movups 
xmm1,XMMWORD[16+rbp] - - pxor xmm14,xmm15 -DB 102,15,56,223,84,36,0 - psrad xmm9,31 - paddq xmm15,xmm15 -DB 102,15,56,223,92,36,16 -DB 102,15,56,223,100,36,32 - pand xmm9,xmm8 - mov rax,r10 -DB 102,15,56,223,108,36,48 -DB 102,15,56,223,116,36,64 -DB 102,15,56,223,124,36,80 - pxor xmm15,xmm9 - - lea rsi,[96+rsi] - movups XMMWORD[(-96)+rsi],xmm2 - movups XMMWORD[(-80)+rsi],xmm3 - movups XMMWORD[(-64)+rsi],xmm4 - movups XMMWORD[(-48)+rsi],xmm5 - movups XMMWORD[(-32)+rsi],xmm6 - movups XMMWORD[(-16)+rsi],xmm7 - sub rdx,16*6 - jnc NEAR $L$xts_dec_grandloop - - mov eax,16+96 - sub eax,r10d - mov rcx,rbp - shr eax,4 - -$L$xts_dec_short: - - mov r10d,eax - pxor xmm10,xmm0 - pxor xmm11,xmm0 - add rdx,16*6 - jz NEAR $L$xts_dec_done - - pxor xmm12,xmm0 - cmp rdx,0x20 - jb NEAR $L$xts_dec_one - pxor xmm13,xmm0 - je NEAR $L$xts_dec_two - - pxor xmm14,xmm0 - cmp rdx,0x40 - jb NEAR $L$xts_dec_three - je NEAR $L$xts_dec_four - - movdqu xmm2,XMMWORD[rdi] - movdqu xmm3,XMMWORD[16+rdi] - movdqu xmm4,XMMWORD[32+rdi] - pxor xmm2,xmm10 - movdqu xmm5,XMMWORD[48+rdi] - pxor xmm3,xmm11 - movdqu xmm6,XMMWORD[64+rdi] - lea rdi,[80+rdi] - pxor xmm4,xmm12 - pxor xmm5,xmm13 - pxor xmm6,xmm14 - - call _aesni_decrypt6 - - xorps xmm2,xmm10 - xorps xmm3,xmm11 - xorps xmm4,xmm12 - movdqu XMMWORD[rsi],xmm2 - xorps xmm5,xmm13 - movdqu XMMWORD[16+rsi],xmm3 - xorps xmm6,xmm14 - movdqu XMMWORD[32+rsi],xmm4 - pxor xmm14,xmm14 - movdqu XMMWORD[48+rsi],xmm5 - pcmpgtd xmm14,xmm15 - movdqu XMMWORD[64+rsi],xmm6 - lea rsi,[80+rsi] - pshufd xmm11,xmm14,0x13 - and r9,15 - jz NEAR $L$xts_dec_ret - - movdqa xmm10,xmm15 - paddq xmm15,xmm15 - pand xmm11,xmm8 - pxor xmm11,xmm15 - jmp NEAR $L$xts_dec_done2 - -ALIGN 16 -$L$xts_dec_one: - movups xmm2,XMMWORD[rdi] - lea rdi,[16+rdi] - xorps xmm2,xmm10 - movups xmm0,XMMWORD[rcx] - movups xmm1,XMMWORD[16+rcx] - lea rcx,[32+rcx] - xorps xmm2,xmm0 -$L$oop_dec1_12: -DB 102,15,56,222,209 - dec eax - movups xmm1,XMMWORD[rcx] - lea rcx,[16+rcx] - jnz NEAR $L$oop_dec1_12 -DB 102,15,56,223,209 - xorps xmm2,xmm10 - movdqa xmm10,xmm11 - movups XMMWORD[rsi],xmm2 - movdqa xmm11,xmm12 - lea rsi,[16+rsi] - jmp NEAR $L$xts_dec_done - -ALIGN 16 -$L$xts_dec_two: - movups xmm2,XMMWORD[rdi] - movups xmm3,XMMWORD[16+rdi] - lea rdi,[32+rdi] - xorps xmm2,xmm10 - xorps xmm3,xmm11 - - call _aesni_decrypt2 - - xorps xmm2,xmm10 - movdqa xmm10,xmm12 - xorps xmm3,xmm11 - movdqa xmm11,xmm13 - movups XMMWORD[rsi],xmm2 - movups XMMWORD[16+rsi],xmm3 - lea rsi,[32+rsi] - jmp NEAR $L$xts_dec_done - -ALIGN 16 -$L$xts_dec_three: - movups xmm2,XMMWORD[rdi] - movups xmm3,XMMWORD[16+rdi] - movups xmm4,XMMWORD[32+rdi] - lea rdi,[48+rdi] - xorps xmm2,xmm10 - xorps xmm3,xmm11 - xorps xmm4,xmm12 - - call _aesni_decrypt3 - - xorps xmm2,xmm10 - movdqa xmm10,xmm13 - xorps xmm3,xmm11 - movdqa xmm11,xmm14 - xorps xmm4,xmm12 - movups XMMWORD[rsi],xmm2 - movups XMMWORD[16+rsi],xmm3 - movups XMMWORD[32+rsi],xmm4 - lea rsi,[48+rsi] - jmp NEAR $L$xts_dec_done - -ALIGN 16 -$L$xts_dec_four: - movups xmm2,XMMWORD[rdi] - movups xmm3,XMMWORD[16+rdi] - movups xmm4,XMMWORD[32+rdi] - xorps xmm2,xmm10 - movups xmm5,XMMWORD[48+rdi] - lea rdi,[64+rdi] - xorps xmm3,xmm11 - xorps xmm4,xmm12 - xorps xmm5,xmm13 - - call _aesni_decrypt4 - - pxor xmm2,xmm10 - movdqa xmm10,xmm14 - pxor xmm3,xmm11 - movdqa xmm11,xmm15 - pxor xmm4,xmm12 - movdqu XMMWORD[rsi],xmm2 - pxor xmm5,xmm13 - movdqu XMMWORD[16+rsi],xmm3 - movdqu XMMWORD[32+rsi],xmm4 - movdqu XMMWORD[48+rsi],xmm5 - lea rsi,[64+rsi] - jmp NEAR $L$xts_dec_done - -ALIGN 16 -$L$xts_dec_done: - and r9,15 - jz NEAR 
$L$xts_dec_ret -$L$xts_dec_done2: - mov rdx,r9 - mov rcx,rbp - mov eax,r10d - - movups xmm2,XMMWORD[rdi] - xorps xmm2,xmm11 - movups xmm0,XMMWORD[rcx] - movups xmm1,XMMWORD[16+rcx] - lea rcx,[32+rcx] - xorps xmm2,xmm0 -$L$oop_dec1_13: -DB 102,15,56,222,209 - dec eax - movups xmm1,XMMWORD[rcx] - lea rcx,[16+rcx] - jnz NEAR $L$oop_dec1_13 -DB 102,15,56,223,209 - xorps xmm2,xmm11 - movups XMMWORD[rsi],xmm2 - -$L$xts_dec_steal: - movzx eax,BYTE[16+rdi] - movzx ecx,BYTE[rsi] - lea rdi,[1+rdi] - mov BYTE[rsi],al - mov BYTE[16+rsi],cl - lea rsi,[1+rsi] - sub rdx,1 - jnz NEAR $L$xts_dec_steal - - sub rsi,r9 - mov rcx,rbp - mov eax,r10d - - movups xmm2,XMMWORD[rsi] - xorps xmm2,xmm10 - movups xmm0,XMMWORD[rcx] - movups xmm1,XMMWORD[16+rcx] - lea rcx,[32+rcx] - xorps xmm2,xmm0 -$L$oop_dec1_14: -DB 102,15,56,222,209 - dec eax - movups xmm1,XMMWORD[rcx] - lea rcx,[16+rcx] - jnz NEAR $L$oop_dec1_14 -DB 102,15,56,223,209 - xorps xmm2,xmm10 - movups XMMWORD[rsi],xmm2 - -$L$xts_dec_ret: - xorps xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - pxor xmm3,xmm3 - pxor xmm4,xmm4 - pxor xmm5,xmm5 - movaps xmm6,XMMWORD[((-168))+r11] - movaps XMMWORD[(-168)+r11],xmm0 - movaps xmm7,XMMWORD[((-152))+r11] - movaps XMMWORD[(-152)+r11],xmm0 - movaps xmm8,XMMWORD[((-136))+r11] - movaps XMMWORD[(-136)+r11],xmm0 - movaps xmm9,XMMWORD[((-120))+r11] - movaps XMMWORD[(-120)+r11],xmm0 - movaps xmm10,XMMWORD[((-104))+r11] - movaps XMMWORD[(-104)+r11],xmm0 - movaps xmm11,XMMWORD[((-88))+r11] - movaps XMMWORD[(-88)+r11],xmm0 - movaps xmm12,XMMWORD[((-72))+r11] - movaps XMMWORD[(-72)+r11],xmm0 - movaps xmm13,XMMWORD[((-56))+r11] - movaps XMMWORD[(-56)+r11],xmm0 - movaps xmm14,XMMWORD[((-40))+r11] - movaps XMMWORD[(-40)+r11],xmm0 - movaps xmm15,XMMWORD[((-24))+r11] - movaps XMMWORD[(-24)+r11],xmm0 - movaps XMMWORD[rsp],xmm0 - movaps XMMWORD[16+rsp],xmm0 - movaps XMMWORD[32+rsp],xmm0 - movaps XMMWORD[48+rsp],xmm0 - movaps XMMWORD[64+rsp],xmm0 - movaps XMMWORD[80+rsp],xmm0 - movaps XMMWORD[96+rsp],xmm0 - mov rbp,QWORD[((-8))+r11] - lea rsp,[r11] -$L$xts_dec_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret -$L$SEH_end_aesni_xts_decrypt: -global aesni_ocb_encrypt - -ALIGN 32 -aesni_ocb_encrypt: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_aesni_ocb_encrypt: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - mov r9,QWORD[48+rsp] - - - lea rax,[rsp] - push rbx - push rbp - push r12 - push r13 - push r14 - lea rsp,[((-160))+rsp] - movaps XMMWORD[rsp],xmm6 - movaps XMMWORD[16+rsp],xmm7 - movaps XMMWORD[32+rsp],xmm8 - movaps XMMWORD[48+rsp],xmm9 - movaps XMMWORD[64+rsp],xmm10 - movaps XMMWORD[80+rsp],xmm11 - movaps XMMWORD[96+rsp],xmm12 - movaps XMMWORD[112+rsp],xmm13 - movaps XMMWORD[128+rsp],xmm14 - movaps XMMWORD[144+rsp],xmm15 -$L$ocb_enc_body: - mov rbx,QWORD[56+rax] - mov rbp,QWORD[((56+8))+rax] - - mov r10d,DWORD[240+rcx] - mov r11,rcx - shl r10d,4 - movups xmm9,XMMWORD[rcx] - movups xmm1,XMMWORD[16+r10*1+rcx] - - movdqu xmm15,XMMWORD[r9] - pxor xmm9,xmm1 - pxor xmm15,xmm1 - - mov eax,16+32 - lea rcx,[32+r10*1+r11] - movups xmm1,XMMWORD[16+r11] - sub rax,r10 - mov r10,rax - - movdqu xmm10,XMMWORD[rbx] - movdqu xmm8,XMMWORD[rbp] - - test r8,1 - jnz NEAR $L$ocb_enc_odd - - bsf r12,r8 - add r8,1 - shl r12,4 - movdqu xmm7,XMMWORD[r12*1+rbx] - movdqu xmm2,XMMWORD[rdi] - lea rdi,[16+rdi] - - call __ocb_encrypt1 - - movdqa xmm15,xmm7 - movups XMMWORD[rsi],xmm2 - lea rsi,[16+rsi] - sub rdx,1 - jz NEAR $L$ocb_enc_done - 
-$L$ocb_enc_odd: - lea r12,[1+r8] - lea r13,[3+r8] - lea r14,[5+r8] - lea r8,[6+r8] - bsf r12,r12 - bsf r13,r13 - bsf r14,r14 - shl r12,4 - shl r13,4 - shl r14,4 - - sub rdx,6 - jc NEAR $L$ocb_enc_short - jmp NEAR $L$ocb_enc_grandloop - -ALIGN 32 -$L$ocb_enc_grandloop: - movdqu xmm2,XMMWORD[rdi] - movdqu xmm3,XMMWORD[16+rdi] - movdqu xmm4,XMMWORD[32+rdi] - movdqu xmm5,XMMWORD[48+rdi] - movdqu xmm6,XMMWORD[64+rdi] - movdqu xmm7,XMMWORD[80+rdi] - lea rdi,[96+rdi] - - call __ocb_encrypt6 - - movups XMMWORD[rsi],xmm2 - movups XMMWORD[16+rsi],xmm3 - movups XMMWORD[32+rsi],xmm4 - movups XMMWORD[48+rsi],xmm5 - movups XMMWORD[64+rsi],xmm6 - movups XMMWORD[80+rsi],xmm7 - lea rsi,[96+rsi] - sub rdx,6 - jnc NEAR $L$ocb_enc_grandloop - -$L$ocb_enc_short: - add rdx,6 - jz NEAR $L$ocb_enc_done - - movdqu xmm2,XMMWORD[rdi] - cmp rdx,2 - jb NEAR $L$ocb_enc_one - movdqu xmm3,XMMWORD[16+rdi] - je NEAR $L$ocb_enc_two - - movdqu xmm4,XMMWORD[32+rdi] - cmp rdx,4 - jb NEAR $L$ocb_enc_three - movdqu xmm5,XMMWORD[48+rdi] - je NEAR $L$ocb_enc_four - - movdqu xmm6,XMMWORD[64+rdi] - pxor xmm7,xmm7 - - call __ocb_encrypt6 - - movdqa xmm15,xmm14 - movups XMMWORD[rsi],xmm2 - movups XMMWORD[16+rsi],xmm3 - movups XMMWORD[32+rsi],xmm4 - movups XMMWORD[48+rsi],xmm5 - movups XMMWORD[64+rsi],xmm6 - - jmp NEAR $L$ocb_enc_done - -ALIGN 16 -$L$ocb_enc_one: - movdqa xmm7,xmm10 - - call __ocb_encrypt1 - - movdqa xmm15,xmm7 - movups XMMWORD[rsi],xmm2 - jmp NEAR $L$ocb_enc_done - -ALIGN 16 -$L$ocb_enc_two: - pxor xmm4,xmm4 - pxor xmm5,xmm5 - - call __ocb_encrypt4 - - movdqa xmm15,xmm11 - movups XMMWORD[rsi],xmm2 - movups XMMWORD[16+rsi],xmm3 - - jmp NEAR $L$ocb_enc_done - -ALIGN 16 -$L$ocb_enc_three: - pxor xmm5,xmm5 - - call __ocb_encrypt4 - - movdqa xmm15,xmm12 - movups XMMWORD[rsi],xmm2 - movups XMMWORD[16+rsi],xmm3 - movups XMMWORD[32+rsi],xmm4 - - jmp NEAR $L$ocb_enc_done - -ALIGN 16 -$L$ocb_enc_four: - call __ocb_encrypt4 - - movdqa xmm15,xmm13 - movups XMMWORD[rsi],xmm2 - movups XMMWORD[16+rsi],xmm3 - movups XMMWORD[32+rsi],xmm4 - movups XMMWORD[48+rsi],xmm5 - -$L$ocb_enc_done: - pxor xmm15,xmm0 - movdqu XMMWORD[rbp],xmm8 - movdqu XMMWORD[r9],xmm15 - - xorps xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - pxor xmm3,xmm3 - pxor xmm4,xmm4 - pxor xmm5,xmm5 - movaps xmm6,XMMWORD[rsp] - movaps XMMWORD[rsp],xmm0 - movaps xmm7,XMMWORD[16+rsp] - movaps XMMWORD[16+rsp],xmm0 - movaps xmm8,XMMWORD[32+rsp] - movaps XMMWORD[32+rsp],xmm0 - movaps xmm9,XMMWORD[48+rsp] - movaps XMMWORD[48+rsp],xmm0 - movaps xmm10,XMMWORD[64+rsp] - movaps XMMWORD[64+rsp],xmm0 - movaps xmm11,XMMWORD[80+rsp] - movaps XMMWORD[80+rsp],xmm0 - movaps xmm12,XMMWORD[96+rsp] - movaps XMMWORD[96+rsp],xmm0 - movaps xmm13,XMMWORD[112+rsp] - movaps XMMWORD[112+rsp],xmm0 - movaps xmm14,XMMWORD[128+rsp] - movaps XMMWORD[128+rsp],xmm0 - movaps xmm15,XMMWORD[144+rsp] - movaps XMMWORD[144+rsp],xmm0 - lea rax,[((160+40))+rsp] -$L$ocb_enc_pop: - mov r14,QWORD[((-40))+rax] - mov r13,QWORD[((-32))+rax] - mov r12,QWORD[((-24))+rax] - mov rbp,QWORD[((-16))+rax] - mov rbx,QWORD[((-8))+rax] - lea rsp,[rax] -$L$ocb_enc_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret -$L$SEH_end_aesni_ocb_encrypt: - - -ALIGN 32 -__ocb_encrypt6: - pxor xmm15,xmm9 - movdqu xmm11,XMMWORD[r12*1+rbx] - movdqa xmm12,xmm10 - movdqu xmm13,XMMWORD[r13*1+rbx] - movdqa xmm14,xmm10 - pxor xmm10,xmm15 - movdqu xmm15,XMMWORD[r14*1+rbx] - pxor xmm11,xmm10 - pxor xmm8,xmm2 - pxor xmm2,xmm10 - pxor xmm12,xmm11 - pxor xmm8,xmm3 - pxor xmm3,xmm11 - pxor xmm13,xmm12 - pxor 
xmm8,xmm4 - pxor xmm4,xmm12 - pxor xmm14,xmm13 - pxor xmm8,xmm5 - pxor xmm5,xmm13 - pxor xmm15,xmm14 - pxor xmm8,xmm6 - pxor xmm6,xmm14 - pxor xmm8,xmm7 - pxor xmm7,xmm15 - movups xmm0,XMMWORD[32+r11] - - lea r12,[1+r8] - lea r13,[3+r8] - lea r14,[5+r8] - add r8,6 - pxor xmm10,xmm9 - bsf r12,r12 - bsf r13,r13 - bsf r14,r14 - -DB 102,15,56,220,209 -DB 102,15,56,220,217 -DB 102,15,56,220,225 -DB 102,15,56,220,233 - pxor xmm11,xmm9 - pxor xmm12,xmm9 -DB 102,15,56,220,241 - pxor xmm13,xmm9 - pxor xmm14,xmm9 -DB 102,15,56,220,249 - movups xmm1,XMMWORD[48+r11] - pxor xmm15,xmm9 - -DB 102,15,56,220,208 -DB 102,15,56,220,216 -DB 102,15,56,220,224 -DB 102,15,56,220,232 -DB 102,15,56,220,240 -DB 102,15,56,220,248 - movups xmm0,XMMWORD[64+r11] - shl r12,4 - shl r13,4 - jmp NEAR $L$ocb_enc_loop6 - -ALIGN 32 -$L$ocb_enc_loop6: -DB 102,15,56,220,209 -DB 102,15,56,220,217 -DB 102,15,56,220,225 -DB 102,15,56,220,233 -DB 102,15,56,220,241 -DB 102,15,56,220,249 - movups xmm1,XMMWORD[rax*1+rcx] - add rax,32 - -DB 102,15,56,220,208 -DB 102,15,56,220,216 -DB 102,15,56,220,224 -DB 102,15,56,220,232 -DB 102,15,56,220,240 -DB 102,15,56,220,248 - movups xmm0,XMMWORD[((-16))+rax*1+rcx] - jnz NEAR $L$ocb_enc_loop6 - -DB 102,15,56,220,209 -DB 102,15,56,220,217 -DB 102,15,56,220,225 -DB 102,15,56,220,233 -DB 102,15,56,220,241 -DB 102,15,56,220,249 - movups xmm1,XMMWORD[16+r11] - shl r14,4 - -DB 102,65,15,56,221,210 - movdqu xmm10,XMMWORD[rbx] - mov rax,r10 -DB 102,65,15,56,221,219 -DB 102,65,15,56,221,228 -DB 102,65,15,56,221,237 -DB 102,65,15,56,221,246 -DB 102,65,15,56,221,255 - DB 0F3h,0C3h ;repret - - - -ALIGN 32 -__ocb_encrypt4: - pxor xmm15,xmm9 - movdqu xmm11,XMMWORD[r12*1+rbx] - movdqa xmm12,xmm10 - movdqu xmm13,XMMWORD[r13*1+rbx] - pxor xmm10,xmm15 - pxor xmm11,xmm10 - pxor xmm8,xmm2 - pxor xmm2,xmm10 - pxor xmm12,xmm11 - pxor xmm8,xmm3 - pxor xmm3,xmm11 - pxor xmm13,xmm12 - pxor xmm8,xmm4 - pxor xmm4,xmm12 - pxor xmm8,xmm5 - pxor xmm5,xmm13 - movups xmm0,XMMWORD[32+r11] - - pxor xmm10,xmm9 - pxor xmm11,xmm9 - pxor xmm12,xmm9 - pxor xmm13,xmm9 - -DB 102,15,56,220,209 -DB 102,15,56,220,217 -DB 102,15,56,220,225 -DB 102,15,56,220,233 - movups xmm1,XMMWORD[48+r11] - -DB 102,15,56,220,208 -DB 102,15,56,220,216 -DB 102,15,56,220,224 -DB 102,15,56,220,232 - movups xmm0,XMMWORD[64+r11] - jmp NEAR $L$ocb_enc_loop4 - -ALIGN 32 -$L$ocb_enc_loop4: -DB 102,15,56,220,209 -DB 102,15,56,220,217 -DB 102,15,56,220,225 -DB 102,15,56,220,233 - movups xmm1,XMMWORD[rax*1+rcx] - add rax,32 - -DB 102,15,56,220,208 -DB 102,15,56,220,216 -DB 102,15,56,220,224 -DB 102,15,56,220,232 - movups xmm0,XMMWORD[((-16))+rax*1+rcx] - jnz NEAR $L$ocb_enc_loop4 - -DB 102,15,56,220,209 -DB 102,15,56,220,217 -DB 102,15,56,220,225 -DB 102,15,56,220,233 - movups xmm1,XMMWORD[16+r11] - mov rax,r10 - -DB 102,65,15,56,221,210 -DB 102,65,15,56,221,219 -DB 102,65,15,56,221,228 -DB 102,65,15,56,221,237 - DB 0F3h,0C3h ;repret - - - -ALIGN 32 -__ocb_encrypt1: - pxor xmm7,xmm15 - pxor xmm7,xmm9 - pxor xmm8,xmm2 - pxor xmm2,xmm7 - movups xmm0,XMMWORD[32+r11] - -DB 102,15,56,220,209 - movups xmm1,XMMWORD[48+r11] - pxor xmm7,xmm9 - -DB 102,15,56,220,208 - movups xmm0,XMMWORD[64+r11] - jmp NEAR $L$ocb_enc_loop1 - -ALIGN 32 -$L$ocb_enc_loop1: -DB 102,15,56,220,209 - movups xmm1,XMMWORD[rax*1+rcx] - add rax,32 - -DB 102,15,56,220,208 - movups xmm0,XMMWORD[((-16))+rax*1+rcx] - jnz NEAR $L$ocb_enc_loop1 - -DB 102,15,56,220,209 - movups xmm1,XMMWORD[16+r11] - mov rax,r10 - -DB 102,15,56,221,215 - DB 0F3h,0C3h ;repret - - -global aesni_ocb_decrypt - -ALIGN 32 
-aesni_ocb_decrypt: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_aesni_ocb_decrypt: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - mov r9,QWORD[48+rsp] - - - lea rax,[rsp] - push rbx - push rbp - push r12 - push r13 - push r14 - lea rsp,[((-160))+rsp] - movaps XMMWORD[rsp],xmm6 - movaps XMMWORD[16+rsp],xmm7 - movaps XMMWORD[32+rsp],xmm8 - movaps XMMWORD[48+rsp],xmm9 - movaps XMMWORD[64+rsp],xmm10 - movaps XMMWORD[80+rsp],xmm11 - movaps XMMWORD[96+rsp],xmm12 - movaps XMMWORD[112+rsp],xmm13 - movaps XMMWORD[128+rsp],xmm14 - movaps XMMWORD[144+rsp],xmm15 -$L$ocb_dec_body: - mov rbx,QWORD[56+rax] - mov rbp,QWORD[((56+8))+rax] - - mov r10d,DWORD[240+rcx] - mov r11,rcx - shl r10d,4 - movups xmm9,XMMWORD[rcx] - movups xmm1,XMMWORD[16+r10*1+rcx] - - movdqu xmm15,XMMWORD[r9] - pxor xmm9,xmm1 - pxor xmm15,xmm1 - - mov eax,16+32 - lea rcx,[32+r10*1+r11] - movups xmm1,XMMWORD[16+r11] - sub rax,r10 - mov r10,rax - - movdqu xmm10,XMMWORD[rbx] - movdqu xmm8,XMMWORD[rbp] - - test r8,1 - jnz NEAR $L$ocb_dec_odd - - bsf r12,r8 - add r8,1 - shl r12,4 - movdqu xmm7,XMMWORD[r12*1+rbx] - movdqu xmm2,XMMWORD[rdi] - lea rdi,[16+rdi] - - call __ocb_decrypt1 - - movdqa xmm15,xmm7 - movups XMMWORD[rsi],xmm2 - xorps xmm8,xmm2 - lea rsi,[16+rsi] - sub rdx,1 - jz NEAR $L$ocb_dec_done - -$L$ocb_dec_odd: - lea r12,[1+r8] - lea r13,[3+r8] - lea r14,[5+r8] - lea r8,[6+r8] - bsf r12,r12 - bsf r13,r13 - bsf r14,r14 - shl r12,4 - shl r13,4 - shl r14,4 - - sub rdx,6 - jc NEAR $L$ocb_dec_short - jmp NEAR $L$ocb_dec_grandloop - -ALIGN 32 -$L$ocb_dec_grandloop: - movdqu xmm2,XMMWORD[rdi] - movdqu xmm3,XMMWORD[16+rdi] - movdqu xmm4,XMMWORD[32+rdi] - movdqu xmm5,XMMWORD[48+rdi] - movdqu xmm6,XMMWORD[64+rdi] - movdqu xmm7,XMMWORD[80+rdi] - lea rdi,[96+rdi] - - call __ocb_decrypt6 - - movups XMMWORD[rsi],xmm2 - pxor xmm8,xmm2 - movups XMMWORD[16+rsi],xmm3 - pxor xmm8,xmm3 - movups XMMWORD[32+rsi],xmm4 - pxor xmm8,xmm4 - movups XMMWORD[48+rsi],xmm5 - pxor xmm8,xmm5 - movups XMMWORD[64+rsi],xmm6 - pxor xmm8,xmm6 - movups XMMWORD[80+rsi],xmm7 - pxor xmm8,xmm7 - lea rsi,[96+rsi] - sub rdx,6 - jnc NEAR $L$ocb_dec_grandloop - -$L$ocb_dec_short: - add rdx,6 - jz NEAR $L$ocb_dec_done - - movdqu xmm2,XMMWORD[rdi] - cmp rdx,2 - jb NEAR $L$ocb_dec_one - movdqu xmm3,XMMWORD[16+rdi] - je NEAR $L$ocb_dec_two - - movdqu xmm4,XMMWORD[32+rdi] - cmp rdx,4 - jb NEAR $L$ocb_dec_three - movdqu xmm5,XMMWORD[48+rdi] - je NEAR $L$ocb_dec_four - - movdqu xmm6,XMMWORD[64+rdi] - pxor xmm7,xmm7 - - call __ocb_decrypt6 - - movdqa xmm15,xmm14 - movups XMMWORD[rsi],xmm2 - pxor xmm8,xmm2 - movups XMMWORD[16+rsi],xmm3 - pxor xmm8,xmm3 - movups XMMWORD[32+rsi],xmm4 - pxor xmm8,xmm4 - movups XMMWORD[48+rsi],xmm5 - pxor xmm8,xmm5 - movups XMMWORD[64+rsi],xmm6 - pxor xmm8,xmm6 - - jmp NEAR $L$ocb_dec_done - -ALIGN 16 -$L$ocb_dec_one: - movdqa xmm7,xmm10 - - call __ocb_decrypt1 - - movdqa xmm15,xmm7 - movups XMMWORD[rsi],xmm2 - xorps xmm8,xmm2 - jmp NEAR $L$ocb_dec_done - -ALIGN 16 -$L$ocb_dec_two: - pxor xmm4,xmm4 - pxor xmm5,xmm5 - - call __ocb_decrypt4 - - movdqa xmm15,xmm11 - movups XMMWORD[rsi],xmm2 - xorps xmm8,xmm2 - movups XMMWORD[16+rsi],xmm3 - xorps xmm8,xmm3 - - jmp NEAR $L$ocb_dec_done - -ALIGN 16 -$L$ocb_dec_three: - pxor xmm5,xmm5 - - call __ocb_decrypt4 - - movdqa xmm15,xmm12 - movups XMMWORD[rsi],xmm2 - xorps xmm8,xmm2 - movups XMMWORD[16+rsi],xmm3 - xorps xmm8,xmm3 - movups XMMWORD[32+rsi],xmm4 - xorps xmm8,xmm4 - - jmp NEAR $L$ocb_dec_done - -ALIGN 16 -$L$ocb_dec_four: - call 
__ocb_decrypt4 - - movdqa xmm15,xmm13 - movups XMMWORD[rsi],xmm2 - pxor xmm8,xmm2 - movups XMMWORD[16+rsi],xmm3 - pxor xmm8,xmm3 - movups XMMWORD[32+rsi],xmm4 - pxor xmm8,xmm4 - movups XMMWORD[48+rsi],xmm5 - pxor xmm8,xmm5 - -$L$ocb_dec_done: - pxor xmm15,xmm0 - movdqu XMMWORD[rbp],xmm8 - movdqu XMMWORD[r9],xmm15 - - xorps xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - pxor xmm3,xmm3 - pxor xmm4,xmm4 - pxor xmm5,xmm5 - movaps xmm6,XMMWORD[rsp] - movaps XMMWORD[rsp],xmm0 - movaps xmm7,XMMWORD[16+rsp] - movaps XMMWORD[16+rsp],xmm0 - movaps xmm8,XMMWORD[32+rsp] - movaps XMMWORD[32+rsp],xmm0 - movaps xmm9,XMMWORD[48+rsp] - movaps XMMWORD[48+rsp],xmm0 - movaps xmm10,XMMWORD[64+rsp] - movaps XMMWORD[64+rsp],xmm0 - movaps xmm11,XMMWORD[80+rsp] - movaps XMMWORD[80+rsp],xmm0 - movaps xmm12,XMMWORD[96+rsp] - movaps XMMWORD[96+rsp],xmm0 - movaps xmm13,XMMWORD[112+rsp] - movaps XMMWORD[112+rsp],xmm0 - movaps xmm14,XMMWORD[128+rsp] - movaps XMMWORD[128+rsp],xmm0 - movaps xmm15,XMMWORD[144+rsp] - movaps XMMWORD[144+rsp],xmm0 - lea rax,[((160+40))+rsp] -$L$ocb_dec_pop: - mov r14,QWORD[((-40))+rax] - mov r13,QWORD[((-32))+rax] - mov r12,QWORD[((-24))+rax] - mov rbp,QWORD[((-16))+rax] - mov rbx,QWORD[((-8))+rax] - lea rsp,[rax] -$L$ocb_dec_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret -$L$SEH_end_aesni_ocb_decrypt: - - -ALIGN 32 -__ocb_decrypt6: - pxor xmm15,xmm9 - movdqu xmm11,XMMWORD[r12*1+rbx] - movdqa xmm12,xmm10 - movdqu xmm13,XMMWORD[r13*1+rbx] - movdqa xmm14,xmm10 - pxor xmm10,xmm15 - movdqu xmm15,XMMWORD[r14*1+rbx] - pxor xmm11,xmm10 - pxor xmm2,xmm10 - pxor xmm12,xmm11 - pxor xmm3,xmm11 - pxor xmm13,xmm12 - pxor xmm4,xmm12 - pxor xmm14,xmm13 - pxor xmm5,xmm13 - pxor xmm15,xmm14 - pxor xmm6,xmm14 - pxor xmm7,xmm15 - movups xmm0,XMMWORD[32+r11] - - lea r12,[1+r8] - lea r13,[3+r8] - lea r14,[5+r8] - add r8,6 - pxor xmm10,xmm9 - bsf r12,r12 - bsf r13,r13 - bsf r14,r14 - -DB 102,15,56,222,209 -DB 102,15,56,222,217 -DB 102,15,56,222,225 -DB 102,15,56,222,233 - pxor xmm11,xmm9 - pxor xmm12,xmm9 -DB 102,15,56,222,241 - pxor xmm13,xmm9 - pxor xmm14,xmm9 -DB 102,15,56,222,249 - movups xmm1,XMMWORD[48+r11] - pxor xmm15,xmm9 - -DB 102,15,56,222,208 -DB 102,15,56,222,216 -DB 102,15,56,222,224 -DB 102,15,56,222,232 -DB 102,15,56,222,240 -DB 102,15,56,222,248 - movups xmm0,XMMWORD[64+r11] - shl r12,4 - shl r13,4 - jmp NEAR $L$ocb_dec_loop6 - -ALIGN 32 -$L$ocb_dec_loop6: -DB 102,15,56,222,209 -DB 102,15,56,222,217 -DB 102,15,56,222,225 -DB 102,15,56,222,233 -DB 102,15,56,222,241 -DB 102,15,56,222,249 - movups xmm1,XMMWORD[rax*1+rcx] - add rax,32 - -DB 102,15,56,222,208 -DB 102,15,56,222,216 -DB 102,15,56,222,224 -DB 102,15,56,222,232 -DB 102,15,56,222,240 -DB 102,15,56,222,248 - movups xmm0,XMMWORD[((-16))+rax*1+rcx] - jnz NEAR $L$ocb_dec_loop6 - -DB 102,15,56,222,209 -DB 102,15,56,222,217 -DB 102,15,56,222,225 -DB 102,15,56,222,233 -DB 102,15,56,222,241 -DB 102,15,56,222,249 - movups xmm1,XMMWORD[16+r11] - shl r14,4 - -DB 102,65,15,56,223,210 - movdqu xmm10,XMMWORD[rbx] - mov rax,r10 -DB 102,65,15,56,223,219 -DB 102,65,15,56,223,228 -DB 102,65,15,56,223,237 -DB 102,65,15,56,223,246 -DB 102,65,15,56,223,255 - DB 0F3h,0C3h ;repret - - - -ALIGN 32 -__ocb_decrypt4: - pxor xmm15,xmm9 - movdqu xmm11,XMMWORD[r12*1+rbx] - movdqa xmm12,xmm10 - movdqu xmm13,XMMWORD[r13*1+rbx] - pxor xmm10,xmm15 - pxor xmm11,xmm10 - pxor xmm2,xmm10 - pxor xmm12,xmm11 - pxor xmm3,xmm11 - pxor xmm13,xmm12 - pxor xmm4,xmm12 - pxor xmm5,xmm13 - movups xmm0,XMMWORD[32+r11] - - pxor xmm10,xmm9 - 
pxor xmm11,xmm9 - pxor xmm12,xmm9 - pxor xmm13,xmm9 - -DB 102,15,56,222,209 -DB 102,15,56,222,217 -DB 102,15,56,222,225 -DB 102,15,56,222,233 - movups xmm1,XMMWORD[48+r11] - -DB 102,15,56,222,208 -DB 102,15,56,222,216 -DB 102,15,56,222,224 -DB 102,15,56,222,232 - movups xmm0,XMMWORD[64+r11] - jmp NEAR $L$ocb_dec_loop4 - -ALIGN 32 -$L$ocb_dec_loop4: -DB 102,15,56,222,209 -DB 102,15,56,222,217 -DB 102,15,56,222,225 -DB 102,15,56,222,233 - movups xmm1,XMMWORD[rax*1+rcx] - add rax,32 - -DB 102,15,56,222,208 -DB 102,15,56,222,216 -DB 102,15,56,222,224 -DB 102,15,56,222,232 - movups xmm0,XMMWORD[((-16))+rax*1+rcx] - jnz NEAR $L$ocb_dec_loop4 - -DB 102,15,56,222,209 -DB 102,15,56,222,217 -DB 102,15,56,222,225 -DB 102,15,56,222,233 - movups xmm1,XMMWORD[16+r11] - mov rax,r10 - -DB 102,65,15,56,223,210 -DB 102,65,15,56,223,219 -DB 102,65,15,56,223,228 -DB 102,65,15,56,223,237 - DB 0F3h,0C3h ;repret - - - -ALIGN 32 -__ocb_decrypt1: - pxor xmm7,xmm15 - pxor xmm7,xmm9 - pxor xmm2,xmm7 - movups xmm0,XMMWORD[32+r11] - -DB 102,15,56,222,209 - movups xmm1,XMMWORD[48+r11] - pxor xmm7,xmm9 - -DB 102,15,56,222,208 - movups xmm0,XMMWORD[64+r11] - jmp NEAR $L$ocb_dec_loop1 - -ALIGN 32 -$L$ocb_dec_loop1: -DB 102,15,56,222,209 - movups xmm1,XMMWORD[rax*1+rcx] - add rax,32 - -DB 102,15,56,222,208 - movups xmm0,XMMWORD[((-16))+rax*1+rcx] - jnz NEAR $L$ocb_dec_loop1 - -DB 102,15,56,222,209 - movups xmm1,XMMWORD[16+r11] - mov rax,r10 - -DB 102,15,56,223,215 - DB 0F3h,0C3h ;repret - -global aesni_cbc_encrypt - -ALIGN 16 -aesni_cbc_encrypt: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_aesni_cbc_encrypt: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - mov r9,QWORD[48+rsp] - test rdx,rdx jz NEAR $L$cbc_ret @@ -3655,12 +1559,12 @@ $L$cbc_enc_loop: xorps xmm3,xmm0 lea rcx,[32+rcx] xorps xmm2,xmm3 -$L$oop_enc1_15: +$L$oop_enc1_6: DB 102,15,56,220,209 dec eax movups xmm1,XMMWORD[rcx] lea rcx,[16+rcx] - jnz NEAR $L$oop_enc1_15 + jnz NEAR $L$oop_enc1_6 DB 102,15,56,221,209 mov eax,r10d mov rcx,r11 @@ -3706,12 +1610,12 @@ $L$cbc_decrypt: movups xmm1,XMMWORD[16+rcx] lea rcx,[32+rcx] xorps xmm2,xmm0 -$L$oop_dec1_16: +$L$oop_dec1_7: DB 102,15,56,222,209 dec r10d movups xmm1,XMMWORD[rcx] lea rcx,[16+rcx] - jnz NEAR $L$oop_dec1_16 + jnz NEAR $L$oop_dec1_7 DB 102,15,56,223,209 pxor xmm0,xmm0 pxor xmm1,xmm1 @@ -3724,7 +1628,9 @@ DB 102,15,56,223,209 ALIGN 16 $L$cbc_decrypt_bulk: lea r11,[rsp] + push rbp + sub rsp,176 and rsp,-16 movaps XMMWORD[16+rsp],xmm6 @@ -4133,12 +2039,12 @@ $L$cbc_dec_one: movups xmm1,XMMWORD[16+rcx] lea rcx,[32+rcx] xorps xmm2,xmm0 -$L$oop_dec1_17: +$L$oop_dec1_8: DB 102,15,56,222,209 dec eax movups xmm1,XMMWORD[rcx] lea rcx,[16+rcx] - jnz NEAR $L$oop_dec1_17 + jnz NEAR $L$oop_dec1_8 DB 102,15,56,223,209 xorps xmm2,xmm10 movaps xmm10,xmm11 @@ -4236,17 +2142,22 @@ $L$cbc_dec_ret: movaps xmm15,XMMWORD[160+rsp] movaps XMMWORD[160+rsp],xmm0 mov rbp,QWORD[((-8))+r11] + lea rsp,[r11] + $L$cbc_ret: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret -$L$SEH_end_aesni_cbc_encrypt: -global aesni_set_decrypt_key + +$L$SEH_end_aes_hw_cbc_encrypt: +global aes_hw_set_decrypt_key ALIGN 16 -aesni_set_decrypt_key: +aes_hw_set_decrypt_key: + DB 0x48,0x83,0xEC,0x08 + call __aesni_set_encrypt_key shl edx,4 test eax,eax @@ -4279,15 +2190,22 @@ DB 102,15,56,219,192 pxor xmm0,xmm0 $L$dec_key_ret: add rsp,8 + DB 0F3h,0C3h ;repret + $L$SEH_end_set_decrypt_key: -global aesni_set_encrypt_key +global aes_hw_set_encrypt_key ALIGN 16 
-aesni_set_encrypt_key: +aes_hw_set_encrypt_key: __aesni_set_encrypt_key: + +%ifdef BORINGSSL_DISPATCH_TEST + mov BYTE[((BORINGSSL_function_hit+3))],1 +%endif DB 0x48,0x83,0xEC,0x08 + mov rax,-1 test rcx,rcx jz NEAR $L$enc_key_ret @@ -4581,7 +2499,9 @@ $L$enc_key_ret: pxor xmm4,xmm4 pxor xmm5,xmm5 add rsp,8 + DB 0F3h,0C3h ;repret + $L$SEH_end_set_encrypt_key: ALIGN 16 @@ -4766,64 +2686,6 @@ ctr_xts_se_handler: -ALIGN 16 -ocb_se_handler: - push rsi - push rdi - push rbx - push rbp - push r12 - push r13 - push r14 - push r15 - pushfq - sub rsp,64 - - mov rax,QWORD[120+r8] - mov rbx,QWORD[248+r8] - - mov rsi,QWORD[8+r9] - mov r11,QWORD[56+r9] - - mov r10d,DWORD[r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - jb NEAR $L$common_seh_tail - - mov r10d,DWORD[4+r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - jae NEAR $L$common_seh_tail - - mov r10d,DWORD[8+r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - jae NEAR $L$ocb_no_xmm - - mov rax,QWORD[152+r8] - - lea rsi,[rax] - lea rdi,[512+r8] - mov ecx,20 - DD 0xa548f3fc - lea rax,[((160+40))+rax] - -$L$ocb_no_xmm: - mov rbx,QWORD[((-8))+rax] - mov rbp,QWORD[((-16))+rax] - mov r12,QWORD[((-24))+rax] - mov r13,QWORD[((-32))+rax] - mov r14,QWORD[((-40))+rax] - - mov QWORD[144+r8],rbx - mov QWORD[160+r8],rbp - mov QWORD[216+r8],r12 - mov QWORD[224+r8],r13 - mov QWORD[232+r8],r14 - - jmp NEAR $L$common_seh_tail - - ALIGN 16 cbc_se_handler: push rsi @@ -4908,46 +2770,22 @@ $L$common_seh_tail: section .pdata rdata align=4 ALIGN 4 - DD $L$SEH_begin_aesni_ecb_encrypt wrt ..imagebase - DD $L$SEH_end_aesni_ecb_encrypt wrt ..imagebase + DD $L$SEH_begin_aes_hw_ecb_encrypt wrt ..imagebase + DD $L$SEH_end_aes_hw_ecb_encrypt wrt ..imagebase DD $L$SEH_info_ecb wrt ..imagebase - DD $L$SEH_begin_aesni_ccm64_encrypt_blocks wrt ..imagebase - DD $L$SEH_end_aesni_ccm64_encrypt_blocks wrt ..imagebase - DD $L$SEH_info_ccm64_enc wrt ..imagebase - - DD $L$SEH_begin_aesni_ccm64_decrypt_blocks wrt ..imagebase - DD $L$SEH_end_aesni_ccm64_decrypt_blocks wrt ..imagebase - DD $L$SEH_info_ccm64_dec wrt ..imagebase - - DD $L$SEH_begin_aesni_ctr32_encrypt_blocks wrt ..imagebase - DD $L$SEH_end_aesni_ctr32_encrypt_blocks wrt ..imagebase + DD $L$SEH_begin_aes_hw_ctr32_encrypt_blocks wrt ..imagebase + DD $L$SEH_end_aes_hw_ctr32_encrypt_blocks wrt ..imagebase DD $L$SEH_info_ctr32 wrt ..imagebase - - DD $L$SEH_begin_aesni_xts_encrypt wrt ..imagebase - DD $L$SEH_end_aesni_xts_encrypt wrt ..imagebase - DD $L$SEH_info_xts_enc wrt ..imagebase - - DD $L$SEH_begin_aesni_xts_decrypt wrt ..imagebase - DD $L$SEH_end_aesni_xts_decrypt wrt ..imagebase - DD $L$SEH_info_xts_dec wrt ..imagebase - - DD $L$SEH_begin_aesni_ocb_encrypt wrt ..imagebase - DD $L$SEH_end_aesni_ocb_encrypt wrt ..imagebase - DD $L$SEH_info_ocb_enc wrt ..imagebase - - DD $L$SEH_begin_aesni_ocb_decrypt wrt ..imagebase - DD $L$SEH_end_aesni_ocb_decrypt wrt ..imagebase - DD $L$SEH_info_ocb_dec wrt ..imagebase - DD $L$SEH_begin_aesni_cbc_encrypt wrt ..imagebase - DD $L$SEH_end_aesni_cbc_encrypt wrt ..imagebase + DD $L$SEH_begin_aes_hw_cbc_encrypt wrt ..imagebase + DD $L$SEH_end_aes_hw_cbc_encrypt wrt ..imagebase DD $L$SEH_info_cbc wrt ..imagebase - DD aesni_set_decrypt_key wrt ..imagebase + DD aes_hw_set_decrypt_key wrt ..imagebase DD $L$SEH_end_set_decrypt_key wrt ..imagebase DD $L$SEH_info_key wrt ..imagebase - DD aesni_set_encrypt_key wrt ..imagebase + DD aes_hw_set_encrypt_key wrt ..imagebase DD $L$SEH_end_set_encrypt_key wrt ..imagebase DD $L$SEH_info_key wrt ..imagebase section .xdata rdata align=8 @@ -4956,38 +2794,10 @@ $L$SEH_info_ecb: DB 
9,0,0,0 DD ecb_ccm64_se_handler wrt ..imagebase DD $L$ecb_enc_body wrt ..imagebase,$L$ecb_enc_ret wrt ..imagebase -$L$SEH_info_ccm64_enc: -DB 9,0,0,0 - DD ecb_ccm64_se_handler wrt ..imagebase - DD $L$ccm64_enc_body wrt ..imagebase,$L$ccm64_enc_ret wrt ..imagebase -$L$SEH_info_ccm64_dec: -DB 9,0,0,0 - DD ecb_ccm64_se_handler wrt ..imagebase - DD $L$ccm64_dec_body wrt ..imagebase,$L$ccm64_dec_ret wrt ..imagebase $L$SEH_info_ctr32: DB 9,0,0,0 DD ctr_xts_se_handler wrt ..imagebase DD $L$ctr32_body wrt ..imagebase,$L$ctr32_epilogue wrt ..imagebase -$L$SEH_info_xts_enc: -DB 9,0,0,0 - DD ctr_xts_se_handler wrt ..imagebase - DD $L$xts_enc_body wrt ..imagebase,$L$xts_enc_epilogue wrt ..imagebase -$L$SEH_info_xts_dec: -DB 9,0,0,0 - DD ctr_xts_se_handler wrt ..imagebase - DD $L$xts_dec_body wrt ..imagebase,$L$xts_dec_epilogue wrt ..imagebase -$L$SEH_info_ocb_enc: -DB 9,0,0,0 - DD ocb_se_handler wrt ..imagebase - DD $L$ocb_enc_body wrt ..imagebase,$L$ocb_enc_epilogue wrt ..imagebase - DD $L$ocb_enc_pop wrt ..imagebase - DD 0 -$L$SEH_info_ocb_dec: -DB 9,0,0,0 - DD ocb_se_handler wrt ..imagebase - DD $L$ocb_dec_body wrt ..imagebase,$L$ocb_dec_epilogue wrt ..imagebase - DD $L$ocb_dec_pop wrt ..imagebase - DD 0 $L$SEH_info_cbc: DB 9,0,0,0 DD cbc_se_handler wrt ..imagebase diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm deleted file mode 100644 index 9c6d129369..0000000000 --- a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm +++ /dev/null @@ -1,2744 +0,0 @@ -default rel -%define XMMWORD -%define YMMWORD -%define ZMMWORD -section .text code align=64 - - -EXTERN asm_AES_encrypt -EXTERN asm_AES_decrypt - - -ALIGN 64 -_bsaes_encrypt8: - lea r11,[$L$BS0] - - movdqa xmm8,XMMWORD[rax] - lea rax,[16+rax] - movdqa xmm7,XMMWORD[80+r11] - pxor xmm15,xmm8 - pxor xmm0,xmm8 - pxor xmm1,xmm8 - pxor xmm2,xmm8 -DB 102,68,15,56,0,255 -DB 102,15,56,0,199 - pxor xmm3,xmm8 - pxor xmm4,xmm8 -DB 102,15,56,0,207 -DB 102,15,56,0,215 - pxor xmm5,xmm8 - pxor xmm6,xmm8 -DB 102,15,56,0,223 -DB 102,15,56,0,231 -DB 102,15,56,0,239 -DB 102,15,56,0,247 -_bsaes_encrypt8_bitslice: - movdqa xmm7,XMMWORD[r11] - movdqa xmm8,XMMWORD[16+r11] - movdqa xmm9,xmm5 - psrlq xmm5,1 - movdqa xmm10,xmm3 - psrlq xmm3,1 - pxor xmm5,xmm6 - pxor xmm3,xmm4 - pand xmm5,xmm7 - pand xmm3,xmm7 - pxor xmm6,xmm5 - psllq xmm5,1 - pxor xmm4,xmm3 - psllq xmm3,1 - pxor xmm5,xmm9 - pxor xmm3,xmm10 - movdqa xmm9,xmm1 - psrlq xmm1,1 - movdqa xmm10,xmm15 - psrlq xmm15,1 - pxor xmm1,xmm2 - pxor xmm15,xmm0 - pand xmm1,xmm7 - pand xmm15,xmm7 - pxor xmm2,xmm1 - psllq xmm1,1 - pxor xmm0,xmm15 - psllq xmm15,1 - pxor xmm1,xmm9 - pxor xmm15,xmm10 - movdqa xmm7,XMMWORD[32+r11] - movdqa xmm9,xmm4 - psrlq xmm4,2 - movdqa xmm10,xmm3 - psrlq xmm3,2 - pxor xmm4,xmm6 - pxor xmm3,xmm5 - pand xmm4,xmm8 - pand xmm3,xmm8 - pxor xmm6,xmm4 - psllq xmm4,2 - pxor xmm5,xmm3 - psllq xmm3,2 - pxor xmm4,xmm9 - pxor xmm3,xmm10 - movdqa xmm9,xmm0 - psrlq xmm0,2 - movdqa xmm10,xmm15 - psrlq xmm15,2 - pxor xmm0,xmm2 - pxor xmm15,xmm1 - pand xmm0,xmm8 - pand xmm15,xmm8 - pxor xmm2,xmm0 - psllq xmm0,2 - pxor xmm1,xmm15 - psllq xmm15,2 - pxor xmm0,xmm9 - pxor xmm15,xmm10 - movdqa xmm9,xmm2 - psrlq xmm2,4 - movdqa xmm10,xmm1 - psrlq xmm1,4 - pxor xmm2,xmm6 - pxor xmm1,xmm5 - pand xmm2,xmm7 - pand xmm1,xmm7 - pxor xmm6,xmm2 - psllq xmm2,4 - pxor xmm5,xmm1 - psllq xmm1,4 - pxor xmm2,xmm9 - pxor xmm1,xmm10 - movdqa xmm9,xmm0 - psrlq xmm0,4 - movdqa xmm10,xmm15 - 
psrlq xmm15,4 - pxor xmm0,xmm4 - pxor xmm15,xmm3 - pand xmm0,xmm7 - pand xmm15,xmm7 - pxor xmm4,xmm0 - psllq xmm0,4 - pxor xmm3,xmm15 - psllq xmm15,4 - pxor xmm0,xmm9 - pxor xmm15,xmm10 - dec r10d - jmp NEAR $L$enc_sbox -ALIGN 16 -$L$enc_loop: - pxor xmm15,XMMWORD[rax] - pxor xmm0,XMMWORD[16+rax] - pxor xmm1,XMMWORD[32+rax] - pxor xmm2,XMMWORD[48+rax] -DB 102,68,15,56,0,255 -DB 102,15,56,0,199 - pxor xmm3,XMMWORD[64+rax] - pxor xmm4,XMMWORD[80+rax] -DB 102,15,56,0,207 -DB 102,15,56,0,215 - pxor xmm5,XMMWORD[96+rax] - pxor xmm6,XMMWORD[112+rax] -DB 102,15,56,0,223 -DB 102,15,56,0,231 -DB 102,15,56,0,239 -DB 102,15,56,0,247 - lea rax,[128+rax] -$L$enc_sbox: - pxor xmm4,xmm5 - pxor xmm1,xmm0 - pxor xmm2,xmm15 - pxor xmm5,xmm1 - pxor xmm4,xmm15 - - pxor xmm5,xmm2 - pxor xmm2,xmm6 - pxor xmm6,xmm4 - pxor xmm2,xmm3 - pxor xmm3,xmm4 - pxor xmm2,xmm0 - - pxor xmm1,xmm6 - pxor xmm0,xmm4 - movdqa xmm10,xmm6 - movdqa xmm9,xmm0 - movdqa xmm8,xmm4 - movdqa xmm12,xmm1 - movdqa xmm11,xmm5 - - pxor xmm10,xmm3 - pxor xmm9,xmm1 - pxor xmm8,xmm2 - movdqa xmm13,xmm10 - pxor xmm12,xmm3 - movdqa xmm7,xmm9 - pxor xmm11,xmm15 - movdqa xmm14,xmm10 - - por xmm9,xmm8 - por xmm10,xmm11 - pxor xmm14,xmm7 - pand xmm13,xmm11 - pxor xmm11,xmm8 - pand xmm7,xmm8 - pand xmm14,xmm11 - movdqa xmm11,xmm2 - pxor xmm11,xmm15 - pand xmm12,xmm11 - pxor xmm10,xmm12 - pxor xmm9,xmm12 - movdqa xmm12,xmm6 - movdqa xmm11,xmm4 - pxor xmm12,xmm0 - pxor xmm11,xmm5 - movdqa xmm8,xmm12 - pand xmm12,xmm11 - por xmm8,xmm11 - pxor xmm7,xmm12 - pxor xmm10,xmm14 - pxor xmm9,xmm13 - pxor xmm8,xmm14 - movdqa xmm11,xmm1 - pxor xmm7,xmm13 - movdqa xmm12,xmm3 - pxor xmm8,xmm13 - movdqa xmm13,xmm0 - pand xmm11,xmm2 - movdqa xmm14,xmm6 - pand xmm12,xmm15 - pand xmm13,xmm4 - por xmm14,xmm5 - pxor xmm10,xmm11 - pxor xmm9,xmm12 - pxor xmm8,xmm13 - pxor xmm7,xmm14 - - - - - - movdqa xmm11,xmm10 - pand xmm10,xmm8 - pxor xmm11,xmm9 - - movdqa xmm13,xmm7 - movdqa xmm14,xmm11 - pxor xmm13,xmm10 - pand xmm14,xmm13 - - movdqa xmm12,xmm8 - pxor xmm14,xmm9 - pxor xmm12,xmm7 - - pxor xmm10,xmm9 - - pand xmm12,xmm10 - - movdqa xmm9,xmm13 - pxor xmm12,xmm7 - - pxor xmm9,xmm12 - pxor xmm8,xmm12 - - pand xmm9,xmm7 - - pxor xmm13,xmm9 - pxor xmm8,xmm9 - - pand xmm13,xmm14 - - pxor xmm13,xmm11 - movdqa xmm11,xmm5 - movdqa xmm7,xmm4 - movdqa xmm9,xmm14 - pxor xmm9,xmm13 - pand xmm9,xmm5 - pxor xmm5,xmm4 - pand xmm4,xmm14 - pand xmm5,xmm13 - pxor xmm5,xmm4 - pxor xmm4,xmm9 - pxor xmm11,xmm15 - pxor xmm7,xmm2 - pxor xmm14,xmm12 - pxor xmm13,xmm8 - movdqa xmm10,xmm14 - movdqa xmm9,xmm12 - pxor xmm10,xmm13 - pxor xmm9,xmm8 - pand xmm10,xmm11 - pand xmm9,xmm15 - pxor xmm11,xmm7 - pxor xmm15,xmm2 - pand xmm7,xmm14 - pand xmm2,xmm12 - pand xmm11,xmm13 - pand xmm15,xmm8 - pxor xmm7,xmm11 - pxor xmm15,xmm2 - pxor xmm11,xmm10 - pxor xmm2,xmm9 - pxor xmm5,xmm11 - pxor xmm15,xmm11 - pxor xmm4,xmm7 - pxor xmm2,xmm7 - - movdqa xmm11,xmm6 - movdqa xmm7,xmm0 - pxor xmm11,xmm3 - pxor xmm7,xmm1 - movdqa xmm10,xmm14 - movdqa xmm9,xmm12 - pxor xmm10,xmm13 - pxor xmm9,xmm8 - pand xmm10,xmm11 - pand xmm9,xmm3 - pxor xmm11,xmm7 - pxor xmm3,xmm1 - pand xmm7,xmm14 - pand xmm1,xmm12 - pand xmm11,xmm13 - pand xmm3,xmm8 - pxor xmm7,xmm11 - pxor xmm3,xmm1 - pxor xmm11,xmm10 - pxor xmm1,xmm9 - pxor xmm14,xmm12 - pxor xmm13,xmm8 - movdqa xmm10,xmm14 - pxor xmm10,xmm13 - pand xmm10,xmm6 - pxor xmm6,xmm0 - pand xmm0,xmm14 - pand xmm6,xmm13 - pxor xmm6,xmm0 - pxor xmm0,xmm10 - pxor xmm6,xmm11 - pxor xmm3,xmm11 - pxor xmm0,xmm7 - pxor xmm1,xmm7 - pxor xmm6,xmm15 - pxor xmm0,xmm5 - pxor xmm3,xmm6 - pxor 
xmm5,xmm15 - pxor xmm15,xmm0 - - pxor xmm0,xmm4 - pxor xmm4,xmm1 - pxor xmm1,xmm2 - pxor xmm2,xmm4 - pxor xmm3,xmm4 - - pxor xmm5,xmm2 - dec r10d - jl NEAR $L$enc_done - pshufd xmm7,xmm15,0x93 - pshufd xmm8,xmm0,0x93 - pxor xmm15,xmm7 - pshufd xmm9,xmm3,0x93 - pxor xmm0,xmm8 - pshufd xmm10,xmm5,0x93 - pxor xmm3,xmm9 - pshufd xmm11,xmm2,0x93 - pxor xmm5,xmm10 - pshufd xmm12,xmm6,0x93 - pxor xmm2,xmm11 - pshufd xmm13,xmm1,0x93 - pxor xmm6,xmm12 - pshufd xmm14,xmm4,0x93 - pxor xmm1,xmm13 - pxor xmm4,xmm14 - - pxor xmm8,xmm15 - pxor xmm7,xmm4 - pxor xmm8,xmm4 - pshufd xmm15,xmm15,0x4E - pxor xmm9,xmm0 - pshufd xmm0,xmm0,0x4E - pxor xmm12,xmm2 - pxor xmm15,xmm7 - pxor xmm13,xmm6 - pxor xmm0,xmm8 - pxor xmm11,xmm5 - pshufd xmm7,xmm2,0x4E - pxor xmm14,xmm1 - pshufd xmm8,xmm6,0x4E - pxor xmm10,xmm3 - pshufd xmm2,xmm5,0x4E - pxor xmm10,xmm4 - pshufd xmm6,xmm4,0x4E - pxor xmm11,xmm4 - pshufd xmm5,xmm1,0x4E - pxor xmm7,xmm11 - pshufd xmm1,xmm3,0x4E - pxor xmm8,xmm12 - pxor xmm2,xmm10 - pxor xmm6,xmm14 - pxor xmm5,xmm13 - movdqa xmm3,xmm7 - pxor xmm1,xmm9 - movdqa xmm4,xmm8 - movdqa xmm7,XMMWORD[48+r11] - jnz NEAR $L$enc_loop - movdqa xmm7,XMMWORD[64+r11] - jmp NEAR $L$enc_loop -ALIGN 16 -$L$enc_done: - movdqa xmm7,XMMWORD[r11] - movdqa xmm8,XMMWORD[16+r11] - movdqa xmm9,xmm1 - psrlq xmm1,1 - movdqa xmm10,xmm2 - psrlq xmm2,1 - pxor xmm1,xmm4 - pxor xmm2,xmm6 - pand xmm1,xmm7 - pand xmm2,xmm7 - pxor xmm4,xmm1 - psllq xmm1,1 - pxor xmm6,xmm2 - psllq xmm2,1 - pxor xmm1,xmm9 - pxor xmm2,xmm10 - movdqa xmm9,xmm3 - psrlq xmm3,1 - movdqa xmm10,xmm15 - psrlq xmm15,1 - pxor xmm3,xmm5 - pxor xmm15,xmm0 - pand xmm3,xmm7 - pand xmm15,xmm7 - pxor xmm5,xmm3 - psllq xmm3,1 - pxor xmm0,xmm15 - psllq xmm15,1 - pxor xmm3,xmm9 - pxor xmm15,xmm10 - movdqa xmm7,XMMWORD[32+r11] - movdqa xmm9,xmm6 - psrlq xmm6,2 - movdqa xmm10,xmm2 - psrlq xmm2,2 - pxor xmm6,xmm4 - pxor xmm2,xmm1 - pand xmm6,xmm8 - pand xmm2,xmm8 - pxor xmm4,xmm6 - psllq xmm6,2 - pxor xmm1,xmm2 - psllq xmm2,2 - pxor xmm6,xmm9 - pxor xmm2,xmm10 - movdqa xmm9,xmm0 - psrlq xmm0,2 - movdqa xmm10,xmm15 - psrlq xmm15,2 - pxor xmm0,xmm5 - pxor xmm15,xmm3 - pand xmm0,xmm8 - pand xmm15,xmm8 - pxor xmm5,xmm0 - psllq xmm0,2 - pxor xmm3,xmm15 - psllq xmm15,2 - pxor xmm0,xmm9 - pxor xmm15,xmm10 - movdqa xmm9,xmm5 - psrlq xmm5,4 - movdqa xmm10,xmm3 - psrlq xmm3,4 - pxor xmm5,xmm4 - pxor xmm3,xmm1 - pand xmm5,xmm7 - pand xmm3,xmm7 - pxor xmm4,xmm5 - psllq xmm5,4 - pxor xmm1,xmm3 - psllq xmm3,4 - pxor xmm5,xmm9 - pxor xmm3,xmm10 - movdqa xmm9,xmm0 - psrlq xmm0,4 - movdqa xmm10,xmm15 - psrlq xmm15,4 - pxor xmm0,xmm6 - pxor xmm15,xmm2 - pand xmm0,xmm7 - pand xmm15,xmm7 - pxor xmm6,xmm0 - psllq xmm0,4 - pxor xmm2,xmm15 - psllq xmm15,4 - pxor xmm0,xmm9 - pxor xmm15,xmm10 - movdqa xmm7,XMMWORD[rax] - pxor xmm3,xmm7 - pxor xmm5,xmm7 - pxor xmm2,xmm7 - pxor xmm6,xmm7 - pxor xmm1,xmm7 - pxor xmm4,xmm7 - pxor xmm15,xmm7 - pxor xmm0,xmm7 - DB 0F3h,0C3h ;repret - - - -ALIGN 64 -_bsaes_decrypt8: - lea r11,[$L$BS0] - - movdqa xmm8,XMMWORD[rax] - lea rax,[16+rax] - movdqa xmm7,XMMWORD[((-48))+r11] - pxor xmm15,xmm8 - pxor xmm0,xmm8 - pxor xmm1,xmm8 - pxor xmm2,xmm8 -DB 102,68,15,56,0,255 -DB 102,15,56,0,199 - pxor xmm3,xmm8 - pxor xmm4,xmm8 -DB 102,15,56,0,207 -DB 102,15,56,0,215 - pxor xmm5,xmm8 - pxor xmm6,xmm8 -DB 102,15,56,0,223 -DB 102,15,56,0,231 -DB 102,15,56,0,239 -DB 102,15,56,0,247 - movdqa xmm7,XMMWORD[r11] - movdqa xmm8,XMMWORD[16+r11] - movdqa xmm9,xmm5 - psrlq xmm5,1 - movdqa xmm10,xmm3 - psrlq xmm3,1 - pxor xmm5,xmm6 - pxor xmm3,xmm4 - pand xmm5,xmm7 - pand xmm3,xmm7 - 
pxor xmm6,xmm5 - psllq xmm5,1 - pxor xmm4,xmm3 - psllq xmm3,1 - pxor xmm5,xmm9 - pxor xmm3,xmm10 - movdqa xmm9,xmm1 - psrlq xmm1,1 - movdqa xmm10,xmm15 - psrlq xmm15,1 - pxor xmm1,xmm2 - pxor xmm15,xmm0 - pand xmm1,xmm7 - pand xmm15,xmm7 - pxor xmm2,xmm1 - psllq xmm1,1 - pxor xmm0,xmm15 - psllq xmm15,1 - pxor xmm1,xmm9 - pxor xmm15,xmm10 - movdqa xmm7,XMMWORD[32+r11] - movdqa xmm9,xmm4 - psrlq xmm4,2 - movdqa xmm10,xmm3 - psrlq xmm3,2 - pxor xmm4,xmm6 - pxor xmm3,xmm5 - pand xmm4,xmm8 - pand xmm3,xmm8 - pxor xmm6,xmm4 - psllq xmm4,2 - pxor xmm5,xmm3 - psllq xmm3,2 - pxor xmm4,xmm9 - pxor xmm3,xmm10 - movdqa xmm9,xmm0 - psrlq xmm0,2 - movdqa xmm10,xmm15 - psrlq xmm15,2 - pxor xmm0,xmm2 - pxor xmm15,xmm1 - pand xmm0,xmm8 - pand xmm15,xmm8 - pxor xmm2,xmm0 - psllq xmm0,2 - pxor xmm1,xmm15 - psllq xmm15,2 - pxor xmm0,xmm9 - pxor xmm15,xmm10 - movdqa xmm9,xmm2 - psrlq xmm2,4 - movdqa xmm10,xmm1 - psrlq xmm1,4 - pxor xmm2,xmm6 - pxor xmm1,xmm5 - pand xmm2,xmm7 - pand xmm1,xmm7 - pxor xmm6,xmm2 - psllq xmm2,4 - pxor xmm5,xmm1 - psllq xmm1,4 - pxor xmm2,xmm9 - pxor xmm1,xmm10 - movdqa xmm9,xmm0 - psrlq xmm0,4 - movdqa xmm10,xmm15 - psrlq xmm15,4 - pxor xmm0,xmm4 - pxor xmm15,xmm3 - pand xmm0,xmm7 - pand xmm15,xmm7 - pxor xmm4,xmm0 - psllq xmm0,4 - pxor xmm3,xmm15 - psllq xmm15,4 - pxor xmm0,xmm9 - pxor xmm15,xmm10 - dec r10d - jmp NEAR $L$dec_sbox -ALIGN 16 -$L$dec_loop: - pxor xmm15,XMMWORD[rax] - pxor xmm0,XMMWORD[16+rax] - pxor xmm1,XMMWORD[32+rax] - pxor xmm2,XMMWORD[48+rax] -DB 102,68,15,56,0,255 -DB 102,15,56,0,199 - pxor xmm3,XMMWORD[64+rax] - pxor xmm4,XMMWORD[80+rax] -DB 102,15,56,0,207 -DB 102,15,56,0,215 - pxor xmm5,XMMWORD[96+rax] - pxor xmm6,XMMWORD[112+rax] -DB 102,15,56,0,223 -DB 102,15,56,0,231 -DB 102,15,56,0,239 -DB 102,15,56,0,247 - lea rax,[128+rax] -$L$dec_sbox: - pxor xmm2,xmm3 - - pxor xmm3,xmm6 - pxor xmm1,xmm6 - pxor xmm5,xmm3 - pxor xmm6,xmm5 - pxor xmm0,xmm6 - - pxor xmm15,xmm0 - pxor xmm1,xmm4 - pxor xmm2,xmm15 - pxor xmm4,xmm15 - pxor xmm0,xmm2 - movdqa xmm10,xmm2 - movdqa xmm9,xmm6 - movdqa xmm8,xmm0 - movdqa xmm12,xmm3 - movdqa xmm11,xmm4 - - pxor xmm10,xmm15 - pxor xmm9,xmm3 - pxor xmm8,xmm5 - movdqa xmm13,xmm10 - pxor xmm12,xmm15 - movdqa xmm7,xmm9 - pxor xmm11,xmm1 - movdqa xmm14,xmm10 - - por xmm9,xmm8 - por xmm10,xmm11 - pxor xmm14,xmm7 - pand xmm13,xmm11 - pxor xmm11,xmm8 - pand xmm7,xmm8 - pand xmm14,xmm11 - movdqa xmm11,xmm5 - pxor xmm11,xmm1 - pand xmm12,xmm11 - pxor xmm10,xmm12 - pxor xmm9,xmm12 - movdqa xmm12,xmm2 - movdqa xmm11,xmm0 - pxor xmm12,xmm6 - pxor xmm11,xmm4 - movdqa xmm8,xmm12 - pand xmm12,xmm11 - por xmm8,xmm11 - pxor xmm7,xmm12 - pxor xmm10,xmm14 - pxor xmm9,xmm13 - pxor xmm8,xmm14 - movdqa xmm11,xmm3 - pxor xmm7,xmm13 - movdqa xmm12,xmm15 - pxor xmm8,xmm13 - movdqa xmm13,xmm6 - pand xmm11,xmm5 - movdqa xmm14,xmm2 - pand xmm12,xmm1 - pand xmm13,xmm0 - por xmm14,xmm4 - pxor xmm10,xmm11 - pxor xmm9,xmm12 - pxor xmm8,xmm13 - pxor xmm7,xmm14 - - - - - - movdqa xmm11,xmm10 - pand xmm10,xmm8 - pxor xmm11,xmm9 - - movdqa xmm13,xmm7 - movdqa xmm14,xmm11 - pxor xmm13,xmm10 - pand xmm14,xmm13 - - movdqa xmm12,xmm8 - pxor xmm14,xmm9 - pxor xmm12,xmm7 - - pxor xmm10,xmm9 - - pand xmm12,xmm10 - - movdqa xmm9,xmm13 - pxor xmm12,xmm7 - - pxor xmm9,xmm12 - pxor xmm8,xmm12 - - pand xmm9,xmm7 - - pxor xmm13,xmm9 - pxor xmm8,xmm9 - - pand xmm13,xmm14 - - pxor xmm13,xmm11 - movdqa xmm11,xmm4 - movdqa xmm7,xmm0 - movdqa xmm9,xmm14 - pxor xmm9,xmm13 - pand xmm9,xmm4 - pxor xmm4,xmm0 - pand xmm0,xmm14 - pand xmm4,xmm13 - pxor xmm4,xmm0 - pxor xmm0,xmm9 - pxor 
xmm11,xmm1 - pxor xmm7,xmm5 - pxor xmm14,xmm12 - pxor xmm13,xmm8 - movdqa xmm10,xmm14 - movdqa xmm9,xmm12 - pxor xmm10,xmm13 - pxor xmm9,xmm8 - pand xmm10,xmm11 - pand xmm9,xmm1 - pxor xmm11,xmm7 - pxor xmm1,xmm5 - pand xmm7,xmm14 - pand xmm5,xmm12 - pand xmm11,xmm13 - pand xmm1,xmm8 - pxor xmm7,xmm11 - pxor xmm1,xmm5 - pxor xmm11,xmm10 - pxor xmm5,xmm9 - pxor xmm4,xmm11 - pxor xmm1,xmm11 - pxor xmm0,xmm7 - pxor xmm5,xmm7 - - movdqa xmm11,xmm2 - movdqa xmm7,xmm6 - pxor xmm11,xmm15 - pxor xmm7,xmm3 - movdqa xmm10,xmm14 - movdqa xmm9,xmm12 - pxor xmm10,xmm13 - pxor xmm9,xmm8 - pand xmm10,xmm11 - pand xmm9,xmm15 - pxor xmm11,xmm7 - pxor xmm15,xmm3 - pand xmm7,xmm14 - pand xmm3,xmm12 - pand xmm11,xmm13 - pand xmm15,xmm8 - pxor xmm7,xmm11 - pxor xmm15,xmm3 - pxor xmm11,xmm10 - pxor xmm3,xmm9 - pxor xmm14,xmm12 - pxor xmm13,xmm8 - movdqa xmm10,xmm14 - pxor xmm10,xmm13 - pand xmm10,xmm2 - pxor xmm2,xmm6 - pand xmm6,xmm14 - pand xmm2,xmm13 - pxor xmm2,xmm6 - pxor xmm6,xmm10 - pxor xmm2,xmm11 - pxor xmm15,xmm11 - pxor xmm6,xmm7 - pxor xmm3,xmm7 - pxor xmm0,xmm6 - pxor xmm5,xmm4 - - pxor xmm3,xmm0 - pxor xmm1,xmm6 - pxor xmm4,xmm6 - pxor xmm3,xmm1 - pxor xmm6,xmm15 - pxor xmm3,xmm4 - pxor xmm2,xmm5 - pxor xmm5,xmm0 - pxor xmm2,xmm3 - - pxor xmm3,xmm15 - pxor xmm6,xmm2 - dec r10d - jl NEAR $L$dec_done - - pshufd xmm7,xmm15,0x4E - pshufd xmm13,xmm2,0x4E - pxor xmm7,xmm15 - pshufd xmm14,xmm4,0x4E - pxor xmm13,xmm2 - pshufd xmm8,xmm0,0x4E - pxor xmm14,xmm4 - pshufd xmm9,xmm5,0x4E - pxor xmm8,xmm0 - pshufd xmm10,xmm3,0x4E - pxor xmm9,xmm5 - pxor xmm15,xmm13 - pxor xmm0,xmm13 - pshufd xmm11,xmm1,0x4E - pxor xmm10,xmm3 - pxor xmm5,xmm7 - pxor xmm3,xmm8 - pshufd xmm12,xmm6,0x4E - pxor xmm11,xmm1 - pxor xmm0,xmm14 - pxor xmm1,xmm9 - pxor xmm12,xmm6 - - pxor xmm5,xmm14 - pxor xmm3,xmm13 - pxor xmm1,xmm13 - pxor xmm6,xmm10 - pxor xmm2,xmm11 - pxor xmm1,xmm14 - pxor xmm6,xmm14 - pxor xmm4,xmm12 - pshufd xmm7,xmm15,0x93 - pshufd xmm8,xmm0,0x93 - pxor xmm15,xmm7 - pshufd xmm9,xmm5,0x93 - pxor xmm0,xmm8 - pshufd xmm10,xmm3,0x93 - pxor xmm5,xmm9 - pshufd xmm11,xmm1,0x93 - pxor xmm3,xmm10 - pshufd xmm12,xmm6,0x93 - pxor xmm1,xmm11 - pshufd xmm13,xmm2,0x93 - pxor xmm6,xmm12 - pshufd xmm14,xmm4,0x93 - pxor xmm2,xmm13 - pxor xmm4,xmm14 - - pxor xmm8,xmm15 - pxor xmm7,xmm4 - pxor xmm8,xmm4 - pshufd xmm15,xmm15,0x4E - pxor xmm9,xmm0 - pshufd xmm0,xmm0,0x4E - pxor xmm12,xmm1 - pxor xmm15,xmm7 - pxor xmm13,xmm6 - pxor xmm0,xmm8 - pxor xmm11,xmm3 - pshufd xmm7,xmm1,0x4E - pxor xmm14,xmm2 - pshufd xmm8,xmm6,0x4E - pxor xmm10,xmm5 - pshufd xmm1,xmm3,0x4E - pxor xmm10,xmm4 - pshufd xmm6,xmm4,0x4E - pxor xmm11,xmm4 - pshufd xmm3,xmm2,0x4E - pxor xmm7,xmm11 - pshufd xmm2,xmm5,0x4E - pxor xmm8,xmm12 - pxor xmm10,xmm1 - pxor xmm6,xmm14 - pxor xmm13,xmm3 - movdqa xmm3,xmm7 - pxor xmm2,xmm9 - movdqa xmm5,xmm13 - movdqa xmm4,xmm8 - movdqa xmm1,xmm2 - movdqa xmm2,xmm10 - movdqa xmm7,XMMWORD[((-16))+r11] - jnz NEAR $L$dec_loop - movdqa xmm7,XMMWORD[((-32))+r11] - jmp NEAR $L$dec_loop -ALIGN 16 -$L$dec_done: - movdqa xmm7,XMMWORD[r11] - movdqa xmm8,XMMWORD[16+r11] - movdqa xmm9,xmm2 - psrlq xmm2,1 - movdqa xmm10,xmm1 - psrlq xmm1,1 - pxor xmm2,xmm4 - pxor xmm1,xmm6 - pand xmm2,xmm7 - pand xmm1,xmm7 - pxor xmm4,xmm2 - psllq xmm2,1 - pxor xmm6,xmm1 - psllq xmm1,1 - pxor xmm2,xmm9 - pxor xmm1,xmm10 - movdqa xmm9,xmm5 - psrlq xmm5,1 - movdqa xmm10,xmm15 - psrlq xmm15,1 - pxor xmm5,xmm3 - pxor xmm15,xmm0 - pand xmm5,xmm7 - pand xmm15,xmm7 - pxor xmm3,xmm5 - psllq xmm5,1 - pxor xmm0,xmm15 - psllq xmm15,1 - pxor xmm5,xmm9 - pxor xmm15,xmm10 - 
movdqa xmm7,XMMWORD[32+r11] - movdqa xmm9,xmm6 - psrlq xmm6,2 - movdqa xmm10,xmm1 - psrlq xmm1,2 - pxor xmm6,xmm4 - pxor xmm1,xmm2 - pand xmm6,xmm8 - pand xmm1,xmm8 - pxor xmm4,xmm6 - psllq xmm6,2 - pxor xmm2,xmm1 - psllq xmm1,2 - pxor xmm6,xmm9 - pxor xmm1,xmm10 - movdqa xmm9,xmm0 - psrlq xmm0,2 - movdqa xmm10,xmm15 - psrlq xmm15,2 - pxor xmm0,xmm3 - pxor xmm15,xmm5 - pand xmm0,xmm8 - pand xmm15,xmm8 - pxor xmm3,xmm0 - psllq xmm0,2 - pxor xmm5,xmm15 - psllq xmm15,2 - pxor xmm0,xmm9 - pxor xmm15,xmm10 - movdqa xmm9,xmm3 - psrlq xmm3,4 - movdqa xmm10,xmm5 - psrlq xmm5,4 - pxor xmm3,xmm4 - pxor xmm5,xmm2 - pand xmm3,xmm7 - pand xmm5,xmm7 - pxor xmm4,xmm3 - psllq xmm3,4 - pxor xmm2,xmm5 - psllq xmm5,4 - pxor xmm3,xmm9 - pxor xmm5,xmm10 - movdqa xmm9,xmm0 - psrlq xmm0,4 - movdqa xmm10,xmm15 - psrlq xmm15,4 - pxor xmm0,xmm6 - pxor xmm15,xmm1 - pand xmm0,xmm7 - pand xmm15,xmm7 - pxor xmm6,xmm0 - psllq xmm0,4 - pxor xmm1,xmm15 - psllq xmm15,4 - pxor xmm0,xmm9 - pxor xmm15,xmm10 - movdqa xmm7,XMMWORD[rax] - pxor xmm5,xmm7 - pxor xmm3,xmm7 - pxor xmm1,xmm7 - pxor xmm6,xmm7 - pxor xmm2,xmm7 - pxor xmm4,xmm7 - pxor xmm15,xmm7 - pxor xmm0,xmm7 - DB 0F3h,0C3h ;repret - - -ALIGN 16 -_bsaes_key_convert: - lea r11,[$L$masks] - movdqu xmm7,XMMWORD[rcx] - lea rcx,[16+rcx] - movdqa xmm0,XMMWORD[r11] - movdqa xmm1,XMMWORD[16+r11] - movdqa xmm2,XMMWORD[32+r11] - movdqa xmm3,XMMWORD[48+r11] - movdqa xmm4,XMMWORD[64+r11] - pcmpeqd xmm5,xmm5 - - movdqu xmm6,XMMWORD[rcx] - movdqa XMMWORD[rax],xmm7 - lea rax,[16+rax] - dec r10d - jmp NEAR $L$key_loop -ALIGN 16 -$L$key_loop: -DB 102,15,56,0,244 - - movdqa xmm8,xmm0 - movdqa xmm9,xmm1 - - pand xmm8,xmm6 - pand xmm9,xmm6 - movdqa xmm10,xmm2 - pcmpeqb xmm8,xmm0 - psllq xmm0,4 - movdqa xmm11,xmm3 - pcmpeqb xmm9,xmm1 - psllq xmm1,4 - - pand xmm10,xmm6 - pand xmm11,xmm6 - movdqa xmm12,xmm0 - pcmpeqb xmm10,xmm2 - psllq xmm2,4 - movdqa xmm13,xmm1 - pcmpeqb xmm11,xmm3 - psllq xmm3,4 - - movdqa xmm14,xmm2 - movdqa xmm15,xmm3 - pxor xmm8,xmm5 - pxor xmm9,xmm5 - - pand xmm12,xmm6 - pand xmm13,xmm6 - movdqa XMMWORD[rax],xmm8 - pcmpeqb xmm12,xmm0 - psrlq xmm0,4 - movdqa XMMWORD[16+rax],xmm9 - pcmpeqb xmm13,xmm1 - psrlq xmm1,4 - lea rcx,[16+rcx] - - pand xmm14,xmm6 - pand xmm15,xmm6 - movdqa XMMWORD[32+rax],xmm10 - pcmpeqb xmm14,xmm2 - psrlq xmm2,4 - movdqa XMMWORD[48+rax],xmm11 - pcmpeqb xmm15,xmm3 - psrlq xmm3,4 - movdqu xmm6,XMMWORD[rcx] - - pxor xmm13,xmm5 - pxor xmm14,xmm5 - movdqa XMMWORD[64+rax],xmm12 - movdqa XMMWORD[80+rax],xmm13 - movdqa XMMWORD[96+rax],xmm14 - movdqa XMMWORD[112+rax],xmm15 - lea rax,[128+rax] - dec r10d - jnz NEAR $L$key_loop - - movdqa xmm7,XMMWORD[80+r11] - - DB 0F3h,0C3h ;repret - -EXTERN asm_AES_cbc_encrypt -global bsaes_cbc_encrypt - -ALIGN 16 -bsaes_cbc_encrypt: - mov r11d,DWORD[48+rsp] - cmp r11d,0 - jne NEAR asm_AES_cbc_encrypt - cmp r8,128 - jb NEAR asm_AES_cbc_encrypt - - mov rax,rsp -$L$cbc_dec_prologue: - push rbp - push rbx - push r12 - push r13 - push r14 - push r15 - lea rsp,[((-72))+rsp] - mov r10,QWORD[160+rsp] - lea rsp,[((-160))+rsp] - movaps XMMWORD[64+rsp],xmm6 - movaps XMMWORD[80+rsp],xmm7 - movaps XMMWORD[96+rsp],xmm8 - movaps XMMWORD[112+rsp],xmm9 - movaps XMMWORD[128+rsp],xmm10 - movaps XMMWORD[144+rsp],xmm11 - movaps XMMWORD[160+rsp],xmm12 - movaps XMMWORD[176+rsp],xmm13 - movaps XMMWORD[192+rsp],xmm14 - movaps XMMWORD[208+rsp],xmm15 -$L$cbc_dec_body: - mov rbp,rsp - mov eax,DWORD[240+r9] - mov r12,rcx - mov r13,rdx - mov r14,r8 - mov r15,r9 - mov rbx,r10 - shr r14,4 - - mov edx,eax - shl rax,7 - sub rax,96 - sub rsp,rax - - 
mov rax,rsp - mov rcx,r15 - mov r10d,edx - call _bsaes_key_convert - pxor xmm7,XMMWORD[rsp] - movdqa XMMWORD[rax],xmm6 - movdqa XMMWORD[rsp],xmm7 - - movdqu xmm14,XMMWORD[rbx] - sub r14,8 -$L$cbc_dec_loop: - movdqu xmm15,XMMWORD[r12] - movdqu xmm0,XMMWORD[16+r12] - movdqu xmm1,XMMWORD[32+r12] - movdqu xmm2,XMMWORD[48+r12] - movdqu xmm3,XMMWORD[64+r12] - movdqu xmm4,XMMWORD[80+r12] - mov rax,rsp - movdqu xmm5,XMMWORD[96+r12] - mov r10d,edx - movdqu xmm6,XMMWORD[112+r12] - movdqa XMMWORD[32+rbp],xmm14 - - call _bsaes_decrypt8 - - pxor xmm15,XMMWORD[32+rbp] - movdqu xmm7,XMMWORD[r12] - movdqu xmm8,XMMWORD[16+r12] - pxor xmm0,xmm7 - movdqu xmm9,XMMWORD[32+r12] - pxor xmm5,xmm8 - movdqu xmm10,XMMWORD[48+r12] - pxor xmm3,xmm9 - movdqu xmm11,XMMWORD[64+r12] - pxor xmm1,xmm10 - movdqu xmm12,XMMWORD[80+r12] - pxor xmm6,xmm11 - movdqu xmm13,XMMWORD[96+r12] - pxor xmm2,xmm12 - movdqu xmm14,XMMWORD[112+r12] - pxor xmm4,xmm13 - movdqu XMMWORD[r13],xmm15 - lea r12,[128+r12] - movdqu XMMWORD[16+r13],xmm0 - movdqu XMMWORD[32+r13],xmm5 - movdqu XMMWORD[48+r13],xmm3 - movdqu XMMWORD[64+r13],xmm1 - movdqu XMMWORD[80+r13],xmm6 - movdqu XMMWORD[96+r13],xmm2 - movdqu XMMWORD[112+r13],xmm4 - lea r13,[128+r13] - sub r14,8 - jnc NEAR $L$cbc_dec_loop - - add r14,8 - jz NEAR $L$cbc_dec_done - - movdqu xmm15,XMMWORD[r12] - mov rax,rsp - mov r10d,edx - cmp r14,2 - jb NEAR $L$cbc_dec_one - movdqu xmm0,XMMWORD[16+r12] - je NEAR $L$cbc_dec_two - movdqu xmm1,XMMWORD[32+r12] - cmp r14,4 - jb NEAR $L$cbc_dec_three - movdqu xmm2,XMMWORD[48+r12] - je NEAR $L$cbc_dec_four - movdqu xmm3,XMMWORD[64+r12] - cmp r14,6 - jb NEAR $L$cbc_dec_five - movdqu xmm4,XMMWORD[80+r12] - je NEAR $L$cbc_dec_six - movdqu xmm5,XMMWORD[96+r12] - movdqa XMMWORD[32+rbp],xmm14 - call _bsaes_decrypt8 - pxor xmm15,XMMWORD[32+rbp] - movdqu xmm7,XMMWORD[r12] - movdqu xmm8,XMMWORD[16+r12] - pxor xmm0,xmm7 - movdqu xmm9,XMMWORD[32+r12] - pxor xmm5,xmm8 - movdqu xmm10,XMMWORD[48+r12] - pxor xmm3,xmm9 - movdqu xmm11,XMMWORD[64+r12] - pxor xmm1,xmm10 - movdqu xmm12,XMMWORD[80+r12] - pxor xmm6,xmm11 - movdqu xmm14,XMMWORD[96+r12] - pxor xmm2,xmm12 - movdqu XMMWORD[r13],xmm15 - movdqu XMMWORD[16+r13],xmm0 - movdqu XMMWORD[32+r13],xmm5 - movdqu XMMWORD[48+r13],xmm3 - movdqu XMMWORD[64+r13],xmm1 - movdqu XMMWORD[80+r13],xmm6 - movdqu XMMWORD[96+r13],xmm2 - jmp NEAR $L$cbc_dec_done -ALIGN 16 -$L$cbc_dec_six: - movdqa XMMWORD[32+rbp],xmm14 - call _bsaes_decrypt8 - pxor xmm15,XMMWORD[32+rbp] - movdqu xmm7,XMMWORD[r12] - movdqu xmm8,XMMWORD[16+r12] - pxor xmm0,xmm7 - movdqu xmm9,XMMWORD[32+r12] - pxor xmm5,xmm8 - movdqu xmm10,XMMWORD[48+r12] - pxor xmm3,xmm9 - movdqu xmm11,XMMWORD[64+r12] - pxor xmm1,xmm10 - movdqu xmm14,XMMWORD[80+r12] - pxor xmm6,xmm11 - movdqu XMMWORD[r13],xmm15 - movdqu XMMWORD[16+r13],xmm0 - movdqu XMMWORD[32+r13],xmm5 - movdqu XMMWORD[48+r13],xmm3 - movdqu XMMWORD[64+r13],xmm1 - movdqu XMMWORD[80+r13],xmm6 - jmp NEAR $L$cbc_dec_done -ALIGN 16 -$L$cbc_dec_five: - movdqa XMMWORD[32+rbp],xmm14 - call _bsaes_decrypt8 - pxor xmm15,XMMWORD[32+rbp] - movdqu xmm7,XMMWORD[r12] - movdqu xmm8,XMMWORD[16+r12] - pxor xmm0,xmm7 - movdqu xmm9,XMMWORD[32+r12] - pxor xmm5,xmm8 - movdqu xmm10,XMMWORD[48+r12] - pxor xmm3,xmm9 - movdqu xmm14,XMMWORD[64+r12] - pxor xmm1,xmm10 - movdqu XMMWORD[r13],xmm15 - movdqu XMMWORD[16+r13],xmm0 - movdqu XMMWORD[32+r13],xmm5 - movdqu XMMWORD[48+r13],xmm3 - movdqu XMMWORD[64+r13],xmm1 - jmp NEAR $L$cbc_dec_done -ALIGN 16 -$L$cbc_dec_four: - movdqa XMMWORD[32+rbp],xmm14 - call _bsaes_decrypt8 - pxor xmm15,XMMWORD[32+rbp] - movdqu 
xmm7,XMMWORD[r12] - movdqu xmm8,XMMWORD[16+r12] - pxor xmm0,xmm7 - movdqu xmm9,XMMWORD[32+r12] - pxor xmm5,xmm8 - movdqu xmm14,XMMWORD[48+r12] - pxor xmm3,xmm9 - movdqu XMMWORD[r13],xmm15 - movdqu XMMWORD[16+r13],xmm0 - movdqu XMMWORD[32+r13],xmm5 - movdqu XMMWORD[48+r13],xmm3 - jmp NEAR $L$cbc_dec_done -ALIGN 16 -$L$cbc_dec_three: - movdqa XMMWORD[32+rbp],xmm14 - call _bsaes_decrypt8 - pxor xmm15,XMMWORD[32+rbp] - movdqu xmm7,XMMWORD[r12] - movdqu xmm8,XMMWORD[16+r12] - pxor xmm0,xmm7 - movdqu xmm14,XMMWORD[32+r12] - pxor xmm5,xmm8 - movdqu XMMWORD[r13],xmm15 - movdqu XMMWORD[16+r13],xmm0 - movdqu XMMWORD[32+r13],xmm5 - jmp NEAR $L$cbc_dec_done -ALIGN 16 -$L$cbc_dec_two: - movdqa XMMWORD[32+rbp],xmm14 - call _bsaes_decrypt8 - pxor xmm15,XMMWORD[32+rbp] - movdqu xmm7,XMMWORD[r12] - movdqu xmm14,XMMWORD[16+r12] - pxor xmm0,xmm7 - movdqu XMMWORD[r13],xmm15 - movdqu XMMWORD[16+r13],xmm0 - jmp NEAR $L$cbc_dec_done -ALIGN 16 -$L$cbc_dec_one: - lea rcx,[r12] - lea rdx,[32+rbp] - lea r8,[r15] - call asm_AES_decrypt - pxor xmm14,XMMWORD[32+rbp] - movdqu XMMWORD[r13],xmm14 - movdqa xmm14,xmm15 - -$L$cbc_dec_done: - movdqu XMMWORD[rbx],xmm14 - lea rax,[rsp] - pxor xmm0,xmm0 -$L$cbc_dec_bzero: - movdqa XMMWORD[rax],xmm0 - movdqa XMMWORD[16+rax],xmm0 - lea rax,[32+rax] - cmp rbp,rax - ja NEAR $L$cbc_dec_bzero - - lea rax,[120+rbp] - movaps xmm6,XMMWORD[64+rbp] - movaps xmm7,XMMWORD[80+rbp] - movaps xmm8,XMMWORD[96+rbp] - movaps xmm9,XMMWORD[112+rbp] - movaps xmm10,XMMWORD[128+rbp] - movaps xmm11,XMMWORD[144+rbp] - movaps xmm12,XMMWORD[160+rbp] - movaps xmm13,XMMWORD[176+rbp] - movaps xmm14,XMMWORD[192+rbp] - movaps xmm15,XMMWORD[208+rbp] - lea rax,[160+rax] -$L$cbc_dec_tail: - mov r15,QWORD[((-48))+rax] - mov r14,QWORD[((-40))+rax] - mov r13,QWORD[((-32))+rax] - mov r12,QWORD[((-24))+rax] - mov rbx,QWORD[((-16))+rax] - mov rbp,QWORD[((-8))+rax] - lea rsp,[rax] -$L$cbc_dec_epilogue: - DB 0F3h,0C3h ;repret - - -global bsaes_ctr32_encrypt_blocks - -ALIGN 16 -bsaes_ctr32_encrypt_blocks: - mov rax,rsp -$L$ctr_enc_prologue: - push rbp - push rbx - push r12 - push r13 - push r14 - push r15 - lea rsp,[((-72))+rsp] - mov r10,QWORD[160+rsp] - lea rsp,[((-160))+rsp] - movaps XMMWORD[64+rsp],xmm6 - movaps XMMWORD[80+rsp],xmm7 - movaps XMMWORD[96+rsp],xmm8 - movaps XMMWORD[112+rsp],xmm9 - movaps XMMWORD[128+rsp],xmm10 - movaps XMMWORD[144+rsp],xmm11 - movaps XMMWORD[160+rsp],xmm12 - movaps XMMWORD[176+rsp],xmm13 - movaps XMMWORD[192+rsp],xmm14 - movaps XMMWORD[208+rsp],xmm15 -$L$ctr_enc_body: - mov rbp,rsp - movdqu xmm0,XMMWORD[r10] - mov eax,DWORD[240+r9] - mov r12,rcx - mov r13,rdx - mov r14,r8 - mov r15,r9 - movdqa XMMWORD[32+rbp],xmm0 - cmp r8,8 - jb NEAR $L$ctr_enc_short - - mov ebx,eax - shl rax,7 - sub rax,96 - sub rsp,rax - - mov rax,rsp - mov rcx,r15 - mov r10d,ebx - call _bsaes_key_convert - pxor xmm7,xmm6 - movdqa XMMWORD[rax],xmm7 - - movdqa xmm8,XMMWORD[rsp] - lea r11,[$L$ADD1] - movdqa xmm15,XMMWORD[32+rbp] - movdqa xmm7,XMMWORD[((-32))+r11] -DB 102,68,15,56,0,199 -DB 102,68,15,56,0,255 - movdqa XMMWORD[rsp],xmm8 - jmp NEAR $L$ctr_enc_loop -ALIGN 16 -$L$ctr_enc_loop: - movdqa XMMWORD[32+rbp],xmm15 - movdqa xmm0,xmm15 - movdqa xmm1,xmm15 - paddd xmm0,XMMWORD[r11] - movdqa xmm2,xmm15 - paddd xmm1,XMMWORD[16+r11] - movdqa xmm3,xmm15 - paddd xmm2,XMMWORD[32+r11] - movdqa xmm4,xmm15 - paddd xmm3,XMMWORD[48+r11] - movdqa xmm5,xmm15 - paddd xmm4,XMMWORD[64+r11] - movdqa xmm6,xmm15 - paddd xmm5,XMMWORD[80+r11] - paddd xmm6,XMMWORD[96+r11] - - - - movdqa xmm8,XMMWORD[rsp] - lea rax,[16+rsp] - movdqa 
xmm7,XMMWORD[((-16))+r11] - pxor xmm15,xmm8 - pxor xmm0,xmm8 - pxor xmm1,xmm8 - pxor xmm2,xmm8 -DB 102,68,15,56,0,255 -DB 102,15,56,0,199 - pxor xmm3,xmm8 - pxor xmm4,xmm8 -DB 102,15,56,0,207 -DB 102,15,56,0,215 - pxor xmm5,xmm8 - pxor xmm6,xmm8 -DB 102,15,56,0,223 -DB 102,15,56,0,231 -DB 102,15,56,0,239 -DB 102,15,56,0,247 - lea r11,[$L$BS0] - mov r10d,ebx - - call _bsaes_encrypt8_bitslice - - sub r14,8 - jc NEAR $L$ctr_enc_loop_done - - movdqu xmm7,XMMWORD[r12] - movdqu xmm8,XMMWORD[16+r12] - movdqu xmm9,XMMWORD[32+r12] - movdqu xmm10,XMMWORD[48+r12] - movdqu xmm11,XMMWORD[64+r12] - movdqu xmm12,XMMWORD[80+r12] - movdqu xmm13,XMMWORD[96+r12] - movdqu xmm14,XMMWORD[112+r12] - lea r12,[128+r12] - pxor xmm7,xmm15 - movdqa xmm15,XMMWORD[32+rbp] - pxor xmm0,xmm8 - movdqu XMMWORD[r13],xmm7 - pxor xmm3,xmm9 - movdqu XMMWORD[16+r13],xmm0 - pxor xmm5,xmm10 - movdqu XMMWORD[32+r13],xmm3 - pxor xmm2,xmm11 - movdqu XMMWORD[48+r13],xmm5 - pxor xmm6,xmm12 - movdqu XMMWORD[64+r13],xmm2 - pxor xmm1,xmm13 - movdqu XMMWORD[80+r13],xmm6 - pxor xmm4,xmm14 - movdqu XMMWORD[96+r13],xmm1 - lea r11,[$L$ADD1] - movdqu XMMWORD[112+r13],xmm4 - lea r13,[128+r13] - paddd xmm15,XMMWORD[112+r11] - jnz NEAR $L$ctr_enc_loop - - jmp NEAR $L$ctr_enc_done -ALIGN 16 -$L$ctr_enc_loop_done: - add r14,8 - movdqu xmm7,XMMWORD[r12] - pxor xmm15,xmm7 - movdqu XMMWORD[r13],xmm15 - cmp r14,2 - jb NEAR $L$ctr_enc_done - movdqu xmm8,XMMWORD[16+r12] - pxor xmm0,xmm8 - movdqu XMMWORD[16+r13],xmm0 - je NEAR $L$ctr_enc_done - movdqu xmm9,XMMWORD[32+r12] - pxor xmm3,xmm9 - movdqu XMMWORD[32+r13],xmm3 - cmp r14,4 - jb NEAR $L$ctr_enc_done - movdqu xmm10,XMMWORD[48+r12] - pxor xmm5,xmm10 - movdqu XMMWORD[48+r13],xmm5 - je NEAR $L$ctr_enc_done - movdqu xmm11,XMMWORD[64+r12] - pxor xmm2,xmm11 - movdqu XMMWORD[64+r13],xmm2 - cmp r14,6 - jb NEAR $L$ctr_enc_done - movdqu xmm12,XMMWORD[80+r12] - pxor xmm6,xmm12 - movdqu XMMWORD[80+r13],xmm6 - je NEAR $L$ctr_enc_done - movdqu xmm13,XMMWORD[96+r12] - pxor xmm1,xmm13 - movdqu XMMWORD[96+r13],xmm1 - jmp NEAR $L$ctr_enc_done - -ALIGN 16 -$L$ctr_enc_short: - lea rcx,[32+rbp] - lea rdx,[48+rbp] - lea r8,[r15] - call asm_AES_encrypt - movdqu xmm0,XMMWORD[r12] - lea r12,[16+r12] - mov eax,DWORD[44+rbp] - bswap eax - pxor xmm0,XMMWORD[48+rbp] - inc eax - movdqu XMMWORD[r13],xmm0 - bswap eax - lea r13,[16+r13] - mov DWORD[44+rsp],eax - dec r14 - jnz NEAR $L$ctr_enc_short - -$L$ctr_enc_done: - lea rax,[rsp] - pxor xmm0,xmm0 -$L$ctr_enc_bzero: - movdqa XMMWORD[rax],xmm0 - movdqa XMMWORD[16+rax],xmm0 - lea rax,[32+rax] - cmp rbp,rax - ja NEAR $L$ctr_enc_bzero - - lea rax,[120+rbp] - movaps xmm6,XMMWORD[64+rbp] - movaps xmm7,XMMWORD[80+rbp] - movaps xmm8,XMMWORD[96+rbp] - movaps xmm9,XMMWORD[112+rbp] - movaps xmm10,XMMWORD[128+rbp] - movaps xmm11,XMMWORD[144+rbp] - movaps xmm12,XMMWORD[160+rbp] - movaps xmm13,XMMWORD[176+rbp] - movaps xmm14,XMMWORD[192+rbp] - movaps xmm15,XMMWORD[208+rbp] - lea rax,[160+rax] -$L$ctr_enc_tail: - mov r15,QWORD[((-48))+rax] - mov r14,QWORD[((-40))+rax] - mov r13,QWORD[((-32))+rax] - mov r12,QWORD[((-24))+rax] - mov rbx,QWORD[((-16))+rax] - mov rbp,QWORD[((-8))+rax] - lea rsp,[rax] -$L$ctr_enc_epilogue: - DB 0F3h,0C3h ;repret - -global bsaes_xts_encrypt - -ALIGN 16 -bsaes_xts_encrypt: - mov rax,rsp -$L$xts_enc_prologue: - push rbp - push rbx - push r12 - push r13 - push r14 - push r15 - lea rsp,[((-72))+rsp] - mov r10,QWORD[160+rsp] - mov r11,QWORD[168+rsp] - lea rsp,[((-160))+rsp] - movaps XMMWORD[64+rsp],xmm6 - movaps XMMWORD[80+rsp],xmm7 - movaps XMMWORD[96+rsp],xmm8 - movaps 
XMMWORD[112+rsp],xmm9 - movaps XMMWORD[128+rsp],xmm10 - movaps XMMWORD[144+rsp],xmm11 - movaps XMMWORD[160+rsp],xmm12 - movaps XMMWORD[176+rsp],xmm13 - movaps XMMWORD[192+rsp],xmm14 - movaps XMMWORD[208+rsp],xmm15 -$L$xts_enc_body: - mov rbp,rsp - mov r12,rcx - mov r13,rdx - mov r14,r8 - mov r15,r9 - - lea rcx,[r11] - lea rdx,[32+rbp] - lea r8,[r10] - call asm_AES_encrypt - - mov eax,DWORD[240+r15] - mov rbx,r14 - - mov edx,eax - shl rax,7 - sub rax,96 - sub rsp,rax - - mov rax,rsp - mov rcx,r15 - mov r10d,edx - call _bsaes_key_convert - pxor xmm7,xmm6 - movdqa XMMWORD[rax],xmm7 - - and r14,-16 - sub rsp,0x80 - movdqa xmm6,XMMWORD[32+rbp] - - pxor xmm14,xmm14 - movdqa xmm12,XMMWORD[$L$xts_magic] - pcmpgtd xmm14,xmm6 - - sub r14,0x80 - jc NEAR $L$xts_enc_short - jmp NEAR $L$xts_enc_loop - -ALIGN 16 -$L$xts_enc_loop: - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm15,xmm6 - movdqa XMMWORD[rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm0,xmm6 - movdqa XMMWORD[16+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm7,XMMWORD[r12] - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm1,xmm6 - movdqa XMMWORD[32+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm8,XMMWORD[16+r12] - pxor xmm15,xmm7 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm2,xmm6 - movdqa XMMWORD[48+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm9,XMMWORD[32+r12] - pxor xmm0,xmm8 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm3,xmm6 - movdqa XMMWORD[64+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm10,XMMWORD[48+r12] - pxor xmm1,xmm9 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm4,xmm6 - movdqa XMMWORD[80+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm11,XMMWORD[64+r12] - pxor xmm2,xmm10 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm5,xmm6 - movdqa XMMWORD[96+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm12,XMMWORD[80+r12] - pxor xmm3,xmm11 - movdqu xmm13,XMMWORD[96+r12] - pxor xmm4,xmm12 - movdqu xmm14,XMMWORD[112+r12] - lea r12,[128+r12] - movdqa XMMWORD[112+rsp],xmm6 - pxor xmm5,xmm13 - lea rax,[128+rsp] - pxor xmm6,xmm14 - mov r10d,edx - - call _bsaes_encrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm3,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - pxor xmm5,XMMWORD[48+rsp] - movdqu XMMWORD[32+r13],xmm3 - pxor xmm2,XMMWORD[64+rsp] - movdqu XMMWORD[48+r13],xmm5 - pxor xmm6,XMMWORD[80+rsp] - movdqu XMMWORD[64+r13],xmm2 - pxor xmm1,XMMWORD[96+rsp] - movdqu XMMWORD[80+r13],xmm6 - pxor xmm4,XMMWORD[112+rsp] - movdqu XMMWORD[96+r13],xmm1 - movdqu XMMWORD[112+r13],xmm4 - lea r13,[128+r13] - - movdqa xmm6,XMMWORD[112+rsp] - pxor xmm14,xmm14 - movdqa xmm12,XMMWORD[$L$xts_magic] - pcmpgtd xmm14,xmm6 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - - sub r14,0x80 - jnc NEAR $L$xts_enc_loop - -$L$xts_enc_short: - add r14,0x80 - jz NEAR $L$xts_enc_done - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm15,xmm6 - movdqa XMMWORD[rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa 
xmm0,xmm6 - movdqa XMMWORD[16+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm7,XMMWORD[r12] - cmp r14,16 - je NEAR $L$xts_enc_1 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm1,xmm6 - movdqa XMMWORD[32+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm8,XMMWORD[16+r12] - cmp r14,32 - je NEAR $L$xts_enc_2 - pxor xmm15,xmm7 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm2,xmm6 - movdqa XMMWORD[48+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm9,XMMWORD[32+r12] - cmp r14,48 - je NEAR $L$xts_enc_3 - pxor xmm0,xmm8 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm3,xmm6 - movdqa XMMWORD[64+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm10,XMMWORD[48+r12] - cmp r14,64 - je NEAR $L$xts_enc_4 - pxor xmm1,xmm9 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm4,xmm6 - movdqa XMMWORD[80+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm11,XMMWORD[64+r12] - cmp r14,80 - je NEAR $L$xts_enc_5 - pxor xmm2,xmm10 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm5,xmm6 - movdqa XMMWORD[96+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm12,XMMWORD[80+r12] - cmp r14,96 - je NEAR $L$xts_enc_6 - pxor xmm3,xmm11 - movdqu xmm13,XMMWORD[96+r12] - pxor xmm4,xmm12 - movdqa XMMWORD[112+rsp],xmm6 - lea r12,[112+r12] - pxor xmm5,xmm13 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_encrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm3,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - pxor xmm5,XMMWORD[48+rsp] - movdqu XMMWORD[32+r13],xmm3 - pxor xmm2,XMMWORD[64+rsp] - movdqu XMMWORD[48+r13],xmm5 - pxor xmm6,XMMWORD[80+rsp] - movdqu XMMWORD[64+r13],xmm2 - pxor xmm1,XMMWORD[96+rsp] - movdqu XMMWORD[80+r13],xmm6 - movdqu XMMWORD[96+r13],xmm1 - lea r13,[112+r13] - - movdqa xmm6,XMMWORD[112+rsp] - jmp NEAR $L$xts_enc_done -ALIGN 16 -$L$xts_enc_6: - pxor xmm3,xmm11 - lea r12,[96+r12] - pxor xmm4,xmm12 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_encrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm3,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - pxor xmm5,XMMWORD[48+rsp] - movdqu XMMWORD[32+r13],xmm3 - pxor xmm2,XMMWORD[64+rsp] - movdqu XMMWORD[48+r13],xmm5 - pxor xmm6,XMMWORD[80+rsp] - movdqu XMMWORD[64+r13],xmm2 - movdqu XMMWORD[80+r13],xmm6 - lea r13,[96+r13] - - movdqa xmm6,XMMWORD[96+rsp] - jmp NEAR $L$xts_enc_done -ALIGN 16 -$L$xts_enc_5: - pxor xmm2,xmm10 - lea r12,[80+r12] - pxor xmm3,xmm11 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_encrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm3,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - pxor xmm5,XMMWORD[48+rsp] - movdqu XMMWORD[32+r13],xmm3 - pxor xmm2,XMMWORD[64+rsp] - movdqu XMMWORD[48+r13],xmm5 - movdqu XMMWORD[64+r13],xmm2 - lea r13,[80+r13] - - movdqa xmm6,XMMWORD[80+rsp] - jmp NEAR $L$xts_enc_done -ALIGN 16 -$L$xts_enc_4: - pxor xmm1,xmm9 - lea r12,[64+r12] - pxor xmm2,xmm10 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_encrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm3,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - pxor xmm5,XMMWORD[48+rsp] - movdqu XMMWORD[32+r13],xmm3 - movdqu XMMWORD[48+r13],xmm5 - 
lea r13,[64+r13] - - movdqa xmm6,XMMWORD[64+rsp] - jmp NEAR $L$xts_enc_done -ALIGN 16 -$L$xts_enc_3: - pxor xmm0,xmm8 - lea r12,[48+r12] - pxor xmm1,xmm9 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_encrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm3,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - movdqu XMMWORD[32+r13],xmm3 - lea r13,[48+r13] - - movdqa xmm6,XMMWORD[48+rsp] - jmp NEAR $L$xts_enc_done -ALIGN 16 -$L$xts_enc_2: - pxor xmm15,xmm7 - lea r12,[32+r12] - pxor xmm0,xmm8 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_encrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - movdqu XMMWORD[16+r13],xmm0 - lea r13,[32+r13] - - movdqa xmm6,XMMWORD[32+rsp] - jmp NEAR $L$xts_enc_done -ALIGN 16 -$L$xts_enc_1: - pxor xmm7,xmm15 - lea r12,[16+r12] - movdqa XMMWORD[32+rbp],xmm7 - lea rcx,[32+rbp] - lea rdx,[32+rbp] - lea r8,[r15] - call asm_AES_encrypt - pxor xmm15,XMMWORD[32+rbp] - - - - - - movdqu XMMWORD[r13],xmm15 - lea r13,[16+r13] - - movdqa xmm6,XMMWORD[16+rsp] - -$L$xts_enc_done: - and ebx,15 - jz NEAR $L$xts_enc_ret - mov rdx,r13 - -$L$xts_enc_steal: - movzx eax,BYTE[r12] - movzx ecx,BYTE[((-16))+rdx] - lea r12,[1+r12] - mov BYTE[((-16))+rdx],al - mov BYTE[rdx],cl - lea rdx,[1+rdx] - sub ebx,1 - jnz NEAR $L$xts_enc_steal - - movdqu xmm15,XMMWORD[((-16))+r13] - lea rcx,[32+rbp] - pxor xmm15,xmm6 - lea rdx,[32+rbp] - movdqa XMMWORD[32+rbp],xmm15 - lea r8,[r15] - call asm_AES_encrypt - pxor xmm6,XMMWORD[32+rbp] - movdqu XMMWORD[(-16)+r13],xmm6 - -$L$xts_enc_ret: - lea rax,[rsp] - pxor xmm0,xmm0 -$L$xts_enc_bzero: - movdqa XMMWORD[rax],xmm0 - movdqa XMMWORD[16+rax],xmm0 - lea rax,[32+rax] - cmp rbp,rax - ja NEAR $L$xts_enc_bzero - - lea rax,[120+rbp] - movaps xmm6,XMMWORD[64+rbp] - movaps xmm7,XMMWORD[80+rbp] - movaps xmm8,XMMWORD[96+rbp] - movaps xmm9,XMMWORD[112+rbp] - movaps xmm10,XMMWORD[128+rbp] - movaps xmm11,XMMWORD[144+rbp] - movaps xmm12,XMMWORD[160+rbp] - movaps xmm13,XMMWORD[176+rbp] - movaps xmm14,XMMWORD[192+rbp] - movaps xmm15,XMMWORD[208+rbp] - lea rax,[160+rax] -$L$xts_enc_tail: - mov r15,QWORD[((-48))+rax] - mov r14,QWORD[((-40))+rax] - mov r13,QWORD[((-32))+rax] - mov r12,QWORD[((-24))+rax] - mov rbx,QWORD[((-16))+rax] - mov rbp,QWORD[((-8))+rax] - lea rsp,[rax] -$L$xts_enc_epilogue: - DB 0F3h,0C3h ;repret - - -global bsaes_xts_decrypt - -ALIGN 16 -bsaes_xts_decrypt: - mov rax,rsp -$L$xts_dec_prologue: - push rbp - push rbx - push r12 - push r13 - push r14 - push r15 - lea rsp,[((-72))+rsp] - mov r10,QWORD[160+rsp] - mov r11,QWORD[168+rsp] - lea rsp,[((-160))+rsp] - movaps XMMWORD[64+rsp],xmm6 - movaps XMMWORD[80+rsp],xmm7 - movaps XMMWORD[96+rsp],xmm8 - movaps XMMWORD[112+rsp],xmm9 - movaps XMMWORD[128+rsp],xmm10 - movaps XMMWORD[144+rsp],xmm11 - movaps XMMWORD[160+rsp],xmm12 - movaps XMMWORD[176+rsp],xmm13 - movaps XMMWORD[192+rsp],xmm14 - movaps XMMWORD[208+rsp],xmm15 -$L$xts_dec_body: - mov rbp,rsp - mov r12,rcx - mov r13,rdx - mov r14,r8 - mov r15,r9 - - lea rcx,[r11] - lea rdx,[32+rbp] - lea r8,[r10] - call asm_AES_encrypt - - mov eax,DWORD[240+r15] - mov rbx,r14 - - mov edx,eax - shl rax,7 - sub rax,96 - sub rsp,rax - - mov rax,rsp - mov rcx,r15 - mov r10d,edx - call _bsaes_key_convert - pxor xmm7,XMMWORD[rsp] - movdqa XMMWORD[rax],xmm6 - movdqa XMMWORD[rsp],xmm7 - - xor eax,eax - and r14,-16 - test ebx,15 - setnz al - shl rax,4 - sub r14,rax - - sub rsp,0x80 - movdqa xmm6,XMMWORD[32+rbp] - - pxor xmm14,xmm14 - movdqa xmm12,XMMWORD[$L$xts_magic] - pcmpgtd 
xmm14,xmm6 - - sub r14,0x80 - jc NEAR $L$xts_dec_short - jmp NEAR $L$xts_dec_loop - -ALIGN 16 -$L$xts_dec_loop: - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm15,xmm6 - movdqa XMMWORD[rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm0,xmm6 - movdqa XMMWORD[16+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm7,XMMWORD[r12] - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm1,xmm6 - movdqa XMMWORD[32+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm8,XMMWORD[16+r12] - pxor xmm15,xmm7 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm2,xmm6 - movdqa XMMWORD[48+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm9,XMMWORD[32+r12] - pxor xmm0,xmm8 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm3,xmm6 - movdqa XMMWORD[64+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm10,XMMWORD[48+r12] - pxor xmm1,xmm9 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm4,xmm6 - movdqa XMMWORD[80+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm11,XMMWORD[64+r12] - pxor xmm2,xmm10 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm5,xmm6 - movdqa XMMWORD[96+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm12,XMMWORD[80+r12] - pxor xmm3,xmm11 - movdqu xmm13,XMMWORD[96+r12] - pxor xmm4,xmm12 - movdqu xmm14,XMMWORD[112+r12] - lea r12,[128+r12] - movdqa XMMWORD[112+rsp],xmm6 - pxor xmm5,xmm13 - lea rax,[128+rsp] - pxor xmm6,xmm14 - mov r10d,edx - - call _bsaes_decrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm5,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - pxor xmm3,XMMWORD[48+rsp] - movdqu XMMWORD[32+r13],xmm5 - pxor xmm1,XMMWORD[64+rsp] - movdqu XMMWORD[48+r13],xmm3 - pxor xmm6,XMMWORD[80+rsp] - movdqu XMMWORD[64+r13],xmm1 - pxor xmm2,XMMWORD[96+rsp] - movdqu XMMWORD[80+r13],xmm6 - pxor xmm4,XMMWORD[112+rsp] - movdqu XMMWORD[96+r13],xmm2 - movdqu XMMWORD[112+r13],xmm4 - lea r13,[128+r13] - - movdqa xmm6,XMMWORD[112+rsp] - pxor xmm14,xmm14 - movdqa xmm12,XMMWORD[$L$xts_magic] - pcmpgtd xmm14,xmm6 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - - sub r14,0x80 - jnc NEAR $L$xts_dec_loop - -$L$xts_dec_short: - add r14,0x80 - jz NEAR $L$xts_dec_done - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm15,xmm6 - movdqa XMMWORD[rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm0,xmm6 - movdqa XMMWORD[16+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm7,XMMWORD[r12] - cmp r14,16 - je NEAR $L$xts_dec_1 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm1,xmm6 - movdqa XMMWORD[32+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm8,XMMWORD[16+r12] - cmp r14,32 - je NEAR $L$xts_dec_2 - pxor xmm15,xmm7 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm2,xmm6 - movdqa XMMWORD[48+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm9,XMMWORD[32+r12] - cmp r14,48 - je NEAR $L$xts_dec_3 - pxor xmm0,xmm8 - pshufd xmm13,xmm14,0x13 - 
pxor xmm14,xmm14 - movdqa xmm3,xmm6 - movdqa XMMWORD[64+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm10,XMMWORD[48+r12] - cmp r14,64 - je NEAR $L$xts_dec_4 - pxor xmm1,xmm9 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm4,xmm6 - movdqa XMMWORD[80+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm11,XMMWORD[64+r12] - cmp r14,80 - je NEAR $L$xts_dec_5 - pxor xmm2,xmm10 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm5,xmm6 - movdqa XMMWORD[96+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm12,XMMWORD[80+r12] - cmp r14,96 - je NEAR $L$xts_dec_6 - pxor xmm3,xmm11 - movdqu xmm13,XMMWORD[96+r12] - pxor xmm4,xmm12 - movdqa XMMWORD[112+rsp],xmm6 - lea r12,[112+r12] - pxor xmm5,xmm13 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_decrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm5,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - pxor xmm3,XMMWORD[48+rsp] - movdqu XMMWORD[32+r13],xmm5 - pxor xmm1,XMMWORD[64+rsp] - movdqu XMMWORD[48+r13],xmm3 - pxor xmm6,XMMWORD[80+rsp] - movdqu XMMWORD[64+r13],xmm1 - pxor xmm2,XMMWORD[96+rsp] - movdqu XMMWORD[80+r13],xmm6 - movdqu XMMWORD[96+r13],xmm2 - lea r13,[112+r13] - - movdqa xmm6,XMMWORD[112+rsp] - jmp NEAR $L$xts_dec_done -ALIGN 16 -$L$xts_dec_6: - pxor xmm3,xmm11 - lea r12,[96+r12] - pxor xmm4,xmm12 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_decrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm5,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - pxor xmm3,XMMWORD[48+rsp] - movdqu XMMWORD[32+r13],xmm5 - pxor xmm1,XMMWORD[64+rsp] - movdqu XMMWORD[48+r13],xmm3 - pxor xmm6,XMMWORD[80+rsp] - movdqu XMMWORD[64+r13],xmm1 - movdqu XMMWORD[80+r13],xmm6 - lea r13,[96+r13] - - movdqa xmm6,XMMWORD[96+rsp] - jmp NEAR $L$xts_dec_done -ALIGN 16 -$L$xts_dec_5: - pxor xmm2,xmm10 - lea r12,[80+r12] - pxor xmm3,xmm11 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_decrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm5,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - pxor xmm3,XMMWORD[48+rsp] - movdqu XMMWORD[32+r13],xmm5 - pxor xmm1,XMMWORD[64+rsp] - movdqu XMMWORD[48+r13],xmm3 - movdqu XMMWORD[64+r13],xmm1 - lea r13,[80+r13] - - movdqa xmm6,XMMWORD[80+rsp] - jmp NEAR $L$xts_dec_done -ALIGN 16 -$L$xts_dec_4: - pxor xmm1,xmm9 - lea r12,[64+r12] - pxor xmm2,xmm10 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_decrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm5,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - pxor xmm3,XMMWORD[48+rsp] - movdqu XMMWORD[32+r13],xmm5 - movdqu XMMWORD[48+r13],xmm3 - lea r13,[64+r13] - - movdqa xmm6,XMMWORD[64+rsp] - jmp NEAR $L$xts_dec_done -ALIGN 16 -$L$xts_dec_3: - pxor xmm0,xmm8 - lea r12,[48+r12] - pxor xmm1,xmm9 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_decrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm5,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - movdqu XMMWORD[32+r13],xmm5 - lea r13,[48+r13] - - movdqa xmm6,XMMWORD[48+rsp] - jmp NEAR $L$xts_dec_done -ALIGN 16 -$L$xts_dec_2: - pxor xmm15,xmm7 - lea r12,[32+r12] - pxor xmm0,xmm8 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_decrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - movdqu 
XMMWORD[16+r13],xmm0 - lea r13,[32+r13] - - movdqa xmm6,XMMWORD[32+rsp] - jmp NEAR $L$xts_dec_done -ALIGN 16 -$L$xts_dec_1: - pxor xmm7,xmm15 - lea r12,[16+r12] - movdqa XMMWORD[32+rbp],xmm7 - lea rcx,[32+rbp] - lea rdx,[32+rbp] - lea r8,[r15] - call asm_AES_decrypt - pxor xmm15,XMMWORD[32+rbp] - - - - - - movdqu XMMWORD[r13],xmm15 - lea r13,[16+r13] - - movdqa xmm6,XMMWORD[16+rsp] - -$L$xts_dec_done: - and ebx,15 - jz NEAR $L$xts_dec_ret - - pxor xmm14,xmm14 - movdqa xmm12,XMMWORD[$L$xts_magic] - pcmpgtd xmm14,xmm6 - pshufd xmm13,xmm14,0x13 - movdqa xmm5,xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - movdqu xmm15,XMMWORD[r12] - pxor xmm6,xmm13 - - lea rcx,[32+rbp] - pxor xmm15,xmm6 - lea rdx,[32+rbp] - movdqa XMMWORD[32+rbp],xmm15 - lea r8,[r15] - call asm_AES_decrypt - pxor xmm6,XMMWORD[32+rbp] - mov rdx,r13 - movdqu XMMWORD[r13],xmm6 - -$L$xts_dec_steal: - movzx eax,BYTE[16+r12] - movzx ecx,BYTE[rdx] - lea r12,[1+r12] - mov BYTE[rdx],al - mov BYTE[16+rdx],cl - lea rdx,[1+rdx] - sub ebx,1 - jnz NEAR $L$xts_dec_steal - - movdqu xmm15,XMMWORD[r13] - lea rcx,[32+rbp] - pxor xmm15,xmm5 - lea rdx,[32+rbp] - movdqa XMMWORD[32+rbp],xmm15 - lea r8,[r15] - call asm_AES_decrypt - pxor xmm5,XMMWORD[32+rbp] - movdqu XMMWORD[r13],xmm5 - -$L$xts_dec_ret: - lea rax,[rsp] - pxor xmm0,xmm0 -$L$xts_dec_bzero: - movdqa XMMWORD[rax],xmm0 - movdqa XMMWORD[16+rax],xmm0 - lea rax,[32+rax] - cmp rbp,rax - ja NEAR $L$xts_dec_bzero - - lea rax,[120+rbp] - movaps xmm6,XMMWORD[64+rbp] - movaps xmm7,XMMWORD[80+rbp] - movaps xmm8,XMMWORD[96+rbp] - movaps xmm9,XMMWORD[112+rbp] - movaps xmm10,XMMWORD[128+rbp] - movaps xmm11,XMMWORD[144+rbp] - movaps xmm12,XMMWORD[160+rbp] - movaps xmm13,XMMWORD[176+rbp] - movaps xmm14,XMMWORD[192+rbp] - movaps xmm15,XMMWORD[208+rbp] - lea rax,[160+rax] -$L$xts_dec_tail: - mov r15,QWORD[((-48))+rax] - mov r14,QWORD[((-40))+rax] - mov r13,QWORD[((-32))+rax] - mov r12,QWORD[((-24))+rax] - mov rbx,QWORD[((-16))+rax] - mov rbp,QWORD[((-8))+rax] - lea rsp,[rax] -$L$xts_dec_epilogue: - DB 0F3h,0C3h ;repret - - -ALIGN 64 -_bsaes_const: -$L$M0ISR: - DQ 0x0a0e0206070b0f03,0x0004080c0d010509 -$L$ISRM0: - DQ 0x01040b0e0205080f,0x0306090c00070a0d -$L$ISR: - DQ 0x0504070602010003,0x0f0e0d0c080b0a09 -$L$BS0: - DQ 0x5555555555555555,0x5555555555555555 -$L$BS1: - DQ 0x3333333333333333,0x3333333333333333 -$L$BS2: - DQ 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f -$L$SR: - DQ 0x0504070600030201,0x0f0e0d0c0a09080b -$L$SRM0: - DQ 0x0304090e00050a0f,0x01060b0c0207080d -$L$M0SR: - DQ 0x0a0e02060f03070b,0x0004080c05090d01 -$L$SWPUP: - DQ 0x0706050403020100,0x0c0d0e0f0b0a0908 -$L$SWPUPM0SR: - DQ 0x0a0d02060c03070b,0x0004080f05090e01 -$L$ADD1: - DQ 0x0000000000000000,0x0000000100000000 -$L$ADD2: - DQ 0x0000000000000000,0x0000000200000000 -$L$ADD3: - DQ 0x0000000000000000,0x0000000300000000 -$L$ADD4: - DQ 0x0000000000000000,0x0000000400000000 -$L$ADD5: - DQ 0x0000000000000000,0x0000000500000000 -$L$ADD6: - DQ 0x0000000000000000,0x0000000600000000 -$L$ADD7: - DQ 0x0000000000000000,0x0000000700000000 -$L$ADD8: - DQ 0x0000000000000000,0x0000000800000000 -$L$xts_magic: - DD 0x87,0,1,0 -$L$masks: - DQ 0x0101010101010101,0x0101010101010101 - DQ 0x0202020202020202,0x0202020202020202 - DQ 0x0404040404040404,0x0404040404040404 - DQ 0x0808080808080808,0x0808080808080808 -$L$M0: - DQ 0x02060a0e03070b0f,0x0004080c0105090d -$L$63: - DQ 0x6363636363636363,0x6363636363636363 -DB 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102 -DB 111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44 -DB 
32,69,109,105,108,105,97,32,75,195,164,115,112,101,114,44 -DB 32,80,101,116,101,114,32,83,99,104,119,97,98,101,44,32 -DB 65,110,100,121,32,80,111,108,121,97,107,111,118,0 -ALIGN 64 - -EXTERN __imp_RtlVirtualUnwind - -ALIGN 16 -se_handler: - push rsi - push rdi - push rbx - push rbp - push r12 - push r13 - push r14 - push r15 - pushfq - sub rsp,64 - - mov rax,QWORD[120+r8] - mov rbx,QWORD[248+r8] - - mov rsi,QWORD[8+r9] - mov r11,QWORD[56+r9] - - mov r10d,DWORD[r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - jbe NEAR $L$in_prologue - - mov r10d,DWORD[4+r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - jae NEAR $L$in_prologue - - mov r10d,DWORD[8+r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - jae NEAR $L$in_tail - - mov rax,QWORD[160+r8] - - lea rsi,[64+rax] - lea rdi,[512+r8] - mov ecx,20 - DD 0xa548f3fc - lea rax,[((160+120))+rax] - -$L$in_tail: - mov rbp,QWORD[((-48))+rax] - mov rbx,QWORD[((-40))+rax] - mov r12,QWORD[((-32))+rax] - mov r13,QWORD[((-24))+rax] - mov r14,QWORD[((-16))+rax] - mov r15,QWORD[((-8))+rax] - mov QWORD[144+r8],rbx - mov QWORD[160+r8],rbp - mov QWORD[216+r8],r12 - mov QWORD[224+r8],r13 - mov QWORD[232+r8],r14 - mov QWORD[240+r8],r15 - -$L$in_prologue: - mov QWORD[152+r8],rax - - mov rdi,QWORD[40+r9] - mov rsi,r8 - mov ecx,154 - DD 0xa548f3fc - - mov rsi,r9 - xor rcx,rcx - mov rdx,QWORD[8+rsi] - mov r8,QWORD[rsi] - mov r9,QWORD[16+rsi] - mov r10,QWORD[40+rsi] - lea r11,[56+rsi] - lea r12,[24+rsi] - mov QWORD[32+rsp],r10 - mov QWORD[40+rsp],r11 - mov QWORD[48+rsp],r12 - mov QWORD[56+rsp],rcx - call QWORD[__imp_RtlVirtualUnwind] - - mov eax,1 - add rsp,64 - popfq - pop r15 - pop r14 - pop r13 - pop r12 - pop rbp - pop rbx - pop rdi - pop rsi - DB 0F3h,0C3h ;repret - - -section .pdata rdata align=4 -ALIGN 4 - DD $L$cbc_dec_prologue wrt ..imagebase - DD $L$cbc_dec_epilogue wrt ..imagebase - DD $L$cbc_dec_info wrt ..imagebase - - DD $L$ctr_enc_prologue wrt ..imagebase - DD $L$ctr_enc_epilogue wrt ..imagebase - DD $L$ctr_enc_info wrt ..imagebase - - DD $L$xts_enc_prologue wrt ..imagebase - DD $L$xts_enc_epilogue wrt ..imagebase - DD $L$xts_enc_info wrt ..imagebase - - DD $L$xts_dec_prologue wrt ..imagebase - DD $L$xts_dec_epilogue wrt ..imagebase - DD $L$xts_dec_info wrt ..imagebase - -section .xdata rdata align=8 -ALIGN 8 -$L$cbc_dec_info: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$cbc_dec_body wrt ..imagebase,$L$cbc_dec_epilogue wrt ..imagebase - DD $L$cbc_dec_tail wrt ..imagebase - DD 0 -$L$ctr_enc_info: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$ctr_enc_body wrt ..imagebase,$L$ctr_enc_epilogue wrt ..imagebase - DD $L$ctr_enc_tail wrt ..imagebase - DD 0 -$L$xts_enc_info: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$xts_enc_body wrt ..imagebase,$L$xts_enc_epilogue wrt ..imagebase - DD $L$xts_enc_tail wrt ..imagebase - DD 0 -$L$xts_dec_info: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$xts_dec_body wrt ..imagebase,$L$xts_dec_epilogue wrt ..imagebase - DD $L$xts_dec_tail wrt ..imagebase - DD 0 diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.asm new file mode 100644 index 0000000000..434ba10ed6 --- /dev/null +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.asm @@ -0,0 +1,495 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif +section .text code align=64 + + + + + + + +global gcm_gmult_ssse3 +ALIGN 16 +gcm_gmult_ssse3: + +$L$gmult_seh_begin: + sub rsp,40 +$L$gmult_seh_allocstack: + movdqa XMMWORD[rsp],xmm6 +$L$gmult_seh_save_xmm6: + movdqa XMMWORD[16+rsp],xmm10 +$L$gmult_seh_save_xmm10: +$L$gmult_seh_prolog_end: + movdqu xmm0,XMMWORD[rcx] + movdqa xmm10,XMMWORD[$L$reverse_bytes] + movdqa xmm2,XMMWORD[$L$low4_mask] + + +DB 102,65,15,56,0,194 + + + movdqa xmm1,xmm2 + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm2 + + + + + pxor xmm2,xmm2 + pxor xmm3,xmm3 + mov rax,5 +$L$oop_row_1: + movdqa xmm4,XMMWORD[rdx] + lea rdx,[16+rdx] + + + movdqa xmm6,xmm2 +DB 102,15,58,15,243,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + + + + + movdqa xmm5,xmm4 +DB 102,15,56,0,224 +DB 102,15,56,0,233 + + + pxor xmm2,xmm5 + + + + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + + + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + + sub rax,1 + jnz NEAR $L$oop_row_1 + + + + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + mov rax,5 +$L$oop_row_2: + movdqa xmm4,XMMWORD[rdx] + lea rdx,[16+rdx] + + + movdqa xmm6,xmm2 +DB 102,15,58,15,243,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + + + + + movdqa xmm5,xmm4 +DB 102,15,56,0,224 +DB 102,15,56,0,233 + + + pxor xmm2,xmm5 + + + + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + + + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + + sub rax,1 + jnz NEAR $L$oop_row_2 + + + + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + mov rax,6 +$L$oop_row_3: + movdqa xmm4,XMMWORD[rdx] + lea rdx,[16+rdx] + + + movdqa xmm6,xmm2 +DB 102,15,58,15,243,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + + + + + movdqa xmm5,xmm4 +DB 102,15,56,0,224 +DB 102,15,56,0,233 + + + pxor xmm2,xmm5 + + + + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + + + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + + sub rax,1 + jnz NEAR $L$oop_row_3 + + + + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + +DB 102,65,15,56,0,210 + movdqu XMMWORD[rcx],xmm2 + + + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + movdqa xmm6,XMMWORD[rsp] + movdqa xmm10,XMMWORD[16+rsp] + add rsp,40 + DB 0F3h,0C3h ;repret +$L$gmult_seh_end: + + + + + + + + +global gcm_ghash_ssse3 +ALIGN 16 +gcm_ghash_ssse3: +$L$ghash_seh_begin: + + sub rsp,56 +$L$ghash_seh_allocstack: + movdqa XMMWORD[rsp],xmm6 +$L$ghash_seh_save_xmm6: + movdqa XMMWORD[16+rsp],xmm10 +$L$ghash_seh_save_xmm10: + movdqa XMMWORD[32+rsp],xmm11 +$L$ghash_seh_save_xmm11: +$L$ghash_seh_prolog_end: + movdqu xmm0,XMMWORD[rcx] + movdqa xmm10,XMMWORD[$L$reverse_bytes] + movdqa xmm11,XMMWORD[$L$low4_mask] + + + and r9,-16 + + + +DB 102,65,15,56,0,194 + + + pxor xmm3,xmm3 +$L$oop_ghash: + + movdqu xmm1,XMMWORD[r8] +DB 102,65,15,56,0,202 + pxor xmm0,xmm1 + + + movdqa xmm1,xmm11 + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm11 + + + + + pxor xmm2,xmm2 + + mov rax,5 +$L$oop_row_4: + movdqa xmm4,XMMWORD[rdx] + lea rdx,[16+rdx] + + + movdqa xmm6,xmm2 +DB 102,15,58,15,243,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + + + + + 
movdqa xmm5,xmm4 +DB 102,15,56,0,224 +DB 102,15,56,0,233 + + + pxor xmm2,xmm5 + + + + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + + + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + + sub rax,1 + jnz NEAR $L$oop_row_4 + + + + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + mov rax,5 +$L$oop_row_5: + movdqa xmm4,XMMWORD[rdx] + lea rdx,[16+rdx] + + + movdqa xmm6,xmm2 +DB 102,15,58,15,243,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + + + + + movdqa xmm5,xmm4 +DB 102,15,56,0,224 +DB 102,15,56,0,233 + + + pxor xmm2,xmm5 + + + + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + + + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + + sub rax,1 + jnz NEAR $L$oop_row_5 + + + + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + mov rax,6 +$L$oop_row_6: + movdqa xmm4,XMMWORD[rdx] + lea rdx,[16+rdx] + + + movdqa xmm6,xmm2 +DB 102,15,58,15,243,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + + + + + movdqa xmm5,xmm4 +DB 102,15,56,0,224 +DB 102,15,56,0,233 + + + pxor xmm2,xmm5 + + + + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + + + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + + sub rax,1 + jnz NEAR $L$oop_row_6 + + + + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + movdqa xmm0,xmm2 + + + lea rdx,[((-256))+rdx] + + + lea r8,[16+r8] + sub r9,16 + jnz NEAR $L$oop_ghash + + +DB 102,65,15,56,0,194 + movdqu XMMWORD[rcx],xmm0 + + + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + movdqa xmm6,XMMWORD[rsp] + movdqa xmm10,XMMWORD[16+rsp] + movdqa xmm11,XMMWORD[32+rsp] + add rsp,56 + DB 0F3h,0C3h ;repret +$L$ghash_seh_end: + + + +ALIGN 16 + + +$L$reverse_bytes: +DB 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 + +$L$low4_mask: + DQ 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f +section .pdata rdata align=4 +ALIGN 4 + DD $L$gmult_seh_begin wrt ..imagebase + DD $L$gmult_seh_end wrt ..imagebase + DD $L$gmult_seh_info wrt ..imagebase + + DD $L$ghash_seh_begin wrt ..imagebase + DD $L$ghash_seh_end wrt ..imagebase + DD $L$ghash_seh_info wrt ..imagebase + +section .xdata rdata align=8 +ALIGN 8 +$L$gmult_seh_info: +DB 1 +DB $L$gmult_seh_prolog_end-$L$gmult_seh_begin +DB 5 +DB 0 + +DB $L$gmult_seh_save_xmm10-$L$gmult_seh_begin +DB 168 + DW 1 + +DB $L$gmult_seh_save_xmm6-$L$gmult_seh_begin +DB 104 + DW 0 + +DB $L$gmult_seh_allocstack-$L$gmult_seh_begin +DB 66 + +ALIGN 8 +$L$ghash_seh_info: +DB 1 +DB $L$ghash_seh_prolog_end-$L$ghash_seh_begin +DB 7 +DB 0 + +DB $L$ghash_seh_save_xmm11-$L$ghash_seh_begin +DB 184 + DW 2 + +DB $L$ghash_seh_save_xmm10-$L$ghash_seh_begin +DB 168 + DW 1 + +DB $L$ghash_seh_save_xmm6-$L$ghash_seh_begin +DB 104 + DW 0 + +DB $L$ghash_seh_allocstack-$L$ghash_seh_begin +DB 98 diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/ghash-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/ghash-x86_64.asm index 8ef16f513d..fdf914f284 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/ghash-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/ghash-x86_64.asm @@ -1,7 +1,14 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. 
Do not edit by hand. + default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif section .text code align=64 EXTERN OPENSSL_ia32cap_P @@ -18,13 +25,21 @@ $L$SEH_begin_gcm_gmult_4bit: mov rsi,rdx + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + sub rsp,280 + $L$gmult_prologue: movzx r8,BYTE[15+rdi] @@ -102,12 +117,16 @@ $L$break1: mov QWORD[rdi],r9 lea rsi,[((280+48))+rsp] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$gmult_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_gcm_gmult_4bit: global gcm_ghash_4bit @@ -123,13 +142,21 @@ $L$SEH_begin_gcm_ghash_4bit: mov rcx,r9 + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + sub rsp,280 + $L$ghash_prologue: mov r14,rdx mov r15,rcx @@ -675,22 +702,32 @@ $L$outer_loop: mov QWORD[rdi],r9 lea rsi,[((280+48))+rsp] + mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$ghash_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_gcm_ghash_4bit: global gcm_init_clmul ALIGN 16 gcm_init_clmul: + $L$_init_clmul: $L$SEH_begin_gcm_init_clmul: @@ -850,10 +887,12 @@ DB 102,15,58,15,227,8 $L$SEH_end_gcm_init_clmul: DB 0F3h,0C3h ;repret + global gcm_gmult_clmul ALIGN 16 gcm_gmult_clmul: + $L$_gmult_clmul: movdqu xmm0,XMMWORD[rcx] movdqa xmm5,XMMWORD[$L$bswap_mask] @@ -901,10 +940,12 @@ DB 102,15,56,0,197 movdqu XMMWORD[rcx],xmm0 DB 0F3h,0C3h ;repret + global gcm_ghash_clmul ALIGN 32 gcm_ghash_clmul: + $L$_ghash_clmul: lea rax,[((-136))+rsp] $L$SEH_begin_gcm_ghash_clmul: @@ -1311,10 +1352,12 @@ DB 102,65,15,56,0,194 $L$SEH_end_gcm_ghash_clmul: DB 0F3h,0C3h ;repret + global gcm_init_avx ALIGN 32 gcm_init_avx: + $L$SEH_begin_gcm_init_avx: DB 0x48,0x83,0xec,0x18 @@ -1425,16 +1468,20 @@ $L$init_start_avx: $L$SEH_end_gcm_init_avx: DB 0F3h,0C3h ;repret + global gcm_gmult_avx ALIGN 32 gcm_gmult_avx: + jmp NEAR $L$_gmult_clmul + global gcm_ghash_avx ALIGN 32 gcm_ghash_avx: + lea rax,[((-136))+rsp] $L$SEH_begin_gcm_ghash_avx: @@ -1833,6 +1880,7 @@ $L$tail_no_xor_avx: $L$SEH_end_gcm_ghash_avx: DB 0F3h,0C3h ;repret + ALIGN 64 $L$bswap_mask: DB 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/md5-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/md5-x86_64.asm index 0e9d2c604e..646201bb58 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/md5-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/md5-x86_64.asm @@ -1,7 +1,14 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif section .text code align=64 ALIGN 16 @@ -18,11 +25,17 @@ $L$SEH_begin_md5_block_asm_data_order: mov rdx,r8 + push rbp + push rbx + push r12 + push r14 + push r15 + $L$prologue: @@ -672,15 +685,22 @@ $L$end: mov DWORD[12+rbp],edx mov r15,QWORD[rsp] + mov r14,QWORD[8+rsp] + mov r12,QWORD[16+rsp] + mov rbx,QWORD[24+rsp] + mov rbp,QWORD[32+rsp] + add rsp,40 + $L$epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_md5_block_asm_data_order: EXTERN __imp_RtlVirtualUnwind diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/p256-x86_64-asm.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/p256-x86_64-asm.asm index 64db9d9518..215f5d2a49 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/p256-x86_64-asm.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/p256-x86_64-asm.asm @@ -1,7 +1,14 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif section .text code align=64 EXTERN OPENSSL_ia32cap_P @@ -21,6 +28,12 @@ $L$ONE_mont: DQ 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe +$L$ord: + DQ 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 +$L$ordK: + DQ 0xccd1c8aaee00bc4f + + global ecp_nistz256_neg @@ -34,9 +47,13 @@ $L$SEH_begin_ecp_nistz256_neg: mov rsi,rdx + push r12 + push r13 +$L$neg_body: + xor r8,r8 xor r9,r9 xor r10,r10 @@ -69,11 +86,17 @@ $L$SEH_begin_ecp_nistz256_neg: mov QWORD[16+rdi],r10 mov QWORD[24+rdi],r11 - pop r13 - pop r12 + mov r13,QWORD[rsp] + + mov r12,QWORD[8+rsp] + + lea rsp,[16+rsp] + +$L$neg_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ecp_nistz256_neg: @@ -81,6 +104,1117 @@ $L$SEH_end_ecp_nistz256_neg: +global ecp_nistz256_ord_mul_mont + +ALIGN 32 +ecp_nistz256_ord_mul_mont: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_ord_mul_mont: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + lea rcx,[OPENSSL_ia32cap_P] + mov rcx,QWORD[8+rcx] + and ecx,0x80100 + cmp ecx,0x80100 + je NEAR $L$ecp_nistz256_ord_mul_montx + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$ord_mul_body: + + mov rax,QWORD[rdx] + mov rbx,rdx + lea r14,[$L$ord] + mov r15,QWORD[$L$ordK] + + + mov rcx,rax + mul QWORD[rsi] + mov r8,rax + mov rax,rcx + mov r9,rdx + + mul QWORD[8+rsi] + add r9,rax + mov rax,rcx + adc rdx,0 + mov r10,rdx + + mul QWORD[16+rsi] + add r10,rax + mov rax,rcx + adc rdx,0 + + mov r13,r8 + imul r8,r15 + + mov r11,rdx + mul QWORD[24+rsi] + add r11,rax + mov rax,r8 + adc rdx,0 + mov r12,rdx + + + mul QWORD[r14] + mov rbp,r8 + add r13,rax + mov rax,r8 + adc rdx,0 + mov rcx,rdx + + sub r10,r8 + sbb r8,0 + + mul QWORD[8+r14] + add r9,rcx + adc rdx,0 + add r9,rax + mov rax,rbp + adc r10,rdx + mov rdx,rbp + adc r8,0 + + shl rax,32 + shr rdx,32 + sub r11,rax + mov rax,QWORD[8+rbx] + sbb rbp,rdx + + add r11,r8 + adc r12,rbp + adc r13,0 + + + mov rcx,rax + mul QWORD[rsi] + add r9,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[8+rsi] + add r10,rbp + adc rdx,0 + add r10,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul 
QWORD[16+rsi] + add r11,rbp + adc rdx,0 + add r11,rax + mov rax,rcx + adc rdx,0 + + mov rcx,r9 + imul r9,r15 + + mov rbp,rdx + mul QWORD[24+rsi] + add r12,rbp + adc rdx,0 + xor r8,r8 + add r12,rax + mov rax,r9 + adc r13,rdx + adc r8,0 + + + mul QWORD[r14] + mov rbp,r9 + add rcx,rax + mov rax,r9 + adc rcx,rdx + + sub r11,r9 + sbb r9,0 + + mul QWORD[8+r14] + add r10,rcx + adc rdx,0 + add r10,rax + mov rax,rbp + adc r11,rdx + mov rdx,rbp + adc r9,0 + + shl rax,32 + shr rdx,32 + sub r12,rax + mov rax,QWORD[16+rbx] + sbb rbp,rdx + + add r12,r9 + adc r13,rbp + adc r8,0 + + + mov rcx,rax + mul QWORD[rsi] + add r10,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[8+rsi] + add r11,rbp + adc rdx,0 + add r11,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[16+rsi] + add r12,rbp + adc rdx,0 + add r12,rax + mov rax,rcx + adc rdx,0 + + mov rcx,r10 + imul r10,r15 + + mov rbp,rdx + mul QWORD[24+rsi] + add r13,rbp + adc rdx,0 + xor r9,r9 + add r13,rax + mov rax,r10 + adc r8,rdx + adc r9,0 + + + mul QWORD[r14] + mov rbp,r10 + add rcx,rax + mov rax,r10 + adc rcx,rdx + + sub r12,r10 + sbb r10,0 + + mul QWORD[8+r14] + add r11,rcx + adc rdx,0 + add r11,rax + mov rax,rbp + adc r12,rdx + mov rdx,rbp + adc r10,0 + + shl rax,32 + shr rdx,32 + sub r13,rax + mov rax,QWORD[24+rbx] + sbb rbp,rdx + + add r13,r10 + adc r8,rbp + adc r9,0 + + + mov rcx,rax + mul QWORD[rsi] + add r11,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[8+rsi] + add r12,rbp + adc rdx,0 + add r12,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[16+rsi] + add r13,rbp + adc rdx,0 + add r13,rax + mov rax,rcx + adc rdx,0 + + mov rcx,r11 + imul r11,r15 + + mov rbp,rdx + mul QWORD[24+rsi] + add r8,rbp + adc rdx,0 + xor r10,r10 + add r8,rax + mov rax,r11 + adc r9,rdx + adc r10,0 + + + mul QWORD[r14] + mov rbp,r11 + add rcx,rax + mov rax,r11 + adc rcx,rdx + + sub r13,r11 + sbb r11,0 + + mul QWORD[8+r14] + add r12,rcx + adc rdx,0 + add r12,rax + mov rax,rbp + adc r13,rdx + mov rdx,rbp + adc r11,0 + + shl rax,32 + shr rdx,32 + sub r8,rax + sbb rbp,rdx + + add r8,r11 + adc r9,rbp + adc r10,0 + + + mov rsi,r12 + sub r12,QWORD[r14] + mov r11,r13 + sbb r13,QWORD[8+r14] + mov rcx,r8 + sbb r8,QWORD[16+r14] + mov rbp,r9 + sbb r9,QWORD[24+r14] + sbb r10,0 + + cmovc r12,rsi + cmovc r13,r11 + cmovc r8,rcx + cmovc r9,rbp + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$ord_mul_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ecp_nistz256_ord_mul_mont: + + + + + + + +global ecp_nistz256_ord_sqr_mont + +ALIGN 32 +ecp_nistz256_ord_sqr_mont: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_ord_sqr_mont: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + lea rcx,[OPENSSL_ia32cap_P] + mov rcx,QWORD[8+rcx] + and ecx,0x80100 + cmp ecx,0x80100 + je NEAR $L$ecp_nistz256_ord_sqr_montx + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$ord_sqr_body: + + mov r8,QWORD[rsi] + mov rax,QWORD[8+rsi] + mov r14,QWORD[16+rsi] + mov r15,QWORD[24+rsi] + lea rsi,[$L$ord] + mov rbx,rdx + jmp NEAR $L$oop_ord_sqr + +ALIGN 32 +$L$oop_ord_sqr: + + mov rbp,rax + mul r8 + mov r9,rax +DB 102,72,15,110,205 + mov rax,r14 + mov r10,rdx + + mul r8 + add r10,rax + mov rax,r15 +DB 102,73,15,110,214 + adc 
rdx,0 + mov r11,rdx + + mul r8 + add r11,rax + mov rax,r15 +DB 102,73,15,110,223 + adc rdx,0 + mov r12,rdx + + + mul r14 + mov r13,rax + mov rax,r14 + mov r14,rdx + + + mul rbp + add r11,rax + mov rax,r15 + adc rdx,0 + mov r15,rdx + + mul rbp + add r12,rax + adc rdx,0 + + add r12,r15 + adc r13,rdx + adc r14,0 + + + xor r15,r15 + mov rax,r8 + add r9,r9 + adc r10,r10 + adc r11,r11 + adc r12,r12 + adc r13,r13 + adc r14,r14 + adc r15,0 + + + mul rax + mov r8,rax +DB 102,72,15,126,200 + mov rbp,rdx + + mul rax + add r9,rbp + adc r10,rax +DB 102,72,15,126,208 + adc rdx,0 + mov rbp,rdx + + mul rax + add r11,rbp + adc r12,rax +DB 102,72,15,126,216 + adc rdx,0 + mov rbp,rdx + + mov rcx,r8 + imul r8,QWORD[32+rsi] + + mul rax + add r13,rbp + adc r14,rax + mov rax,QWORD[rsi] + adc r15,rdx + + + mul r8 + mov rbp,r8 + add rcx,rax + mov rax,QWORD[8+rsi] + adc rcx,rdx + + sub r10,r8 + sbb rbp,0 + + mul r8 + add r9,rcx + adc rdx,0 + add r9,rax + mov rax,r8 + adc r10,rdx + mov rdx,r8 + adc rbp,0 + + mov rcx,r9 + imul r9,QWORD[32+rsi] + + shl rax,32 + shr rdx,32 + sub r11,rax + mov rax,QWORD[rsi] + sbb r8,rdx + + add r11,rbp + adc r8,0 + + + mul r9 + mov rbp,r9 + add rcx,rax + mov rax,QWORD[8+rsi] + adc rcx,rdx + + sub r11,r9 + sbb rbp,0 + + mul r9 + add r10,rcx + adc rdx,0 + add r10,rax + mov rax,r9 + adc r11,rdx + mov rdx,r9 + adc rbp,0 + + mov rcx,r10 + imul r10,QWORD[32+rsi] + + shl rax,32 + shr rdx,32 + sub r8,rax + mov rax,QWORD[rsi] + sbb r9,rdx + + add r8,rbp + adc r9,0 + + + mul r10 + mov rbp,r10 + add rcx,rax + mov rax,QWORD[8+rsi] + adc rcx,rdx + + sub r8,r10 + sbb rbp,0 + + mul r10 + add r11,rcx + adc rdx,0 + add r11,rax + mov rax,r10 + adc r8,rdx + mov rdx,r10 + adc rbp,0 + + mov rcx,r11 + imul r11,QWORD[32+rsi] + + shl rax,32 + shr rdx,32 + sub r9,rax + mov rax,QWORD[rsi] + sbb r10,rdx + + add r9,rbp + adc r10,0 + + + mul r11 + mov rbp,r11 + add rcx,rax + mov rax,QWORD[8+rsi] + adc rcx,rdx + + sub r9,r11 + sbb rbp,0 + + mul r11 + add r8,rcx + adc rdx,0 + add r8,rax + mov rax,r11 + adc r9,rdx + mov rdx,r11 + adc rbp,0 + + shl rax,32 + shr rdx,32 + sub r10,rax + sbb r11,rdx + + add r10,rbp + adc r11,0 + + + xor rdx,rdx + add r8,r12 + adc r9,r13 + mov r12,r8 + adc r10,r14 + adc r11,r15 + mov rax,r9 + adc rdx,0 + + + sub r8,QWORD[rsi] + mov r14,r10 + sbb r9,QWORD[8+rsi] + sbb r10,QWORD[16+rsi] + mov r15,r11 + sbb r11,QWORD[24+rsi] + sbb rdx,0 + + cmovc r8,r12 + cmovnc rax,r9 + cmovnc r14,r10 + cmovnc r15,r11 + + dec rbx + jnz NEAR $L$oop_ord_sqr + + mov QWORD[rdi],r8 + mov QWORD[8+rdi],rax + pxor xmm1,xmm1 + mov QWORD[16+rdi],r14 + pxor xmm2,xmm2 + mov QWORD[24+rdi],r15 + pxor xmm3,xmm3 + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$ord_sqr_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ecp_nistz256_ord_sqr_mont: + + +ALIGN 32 +ecp_nistz256_ord_mul_montx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_ord_mul_montx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +$L$ecp_nistz256_ord_mul_montx: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$ord_mulx_body: + + mov rbx,rdx + mov rdx,QWORD[rdx] + mov r9,QWORD[rsi] + mov r10,QWORD[8+rsi] + mov r11,QWORD[16+rsi] + mov r12,QWORD[24+rsi] + lea rsi,[((-128))+rsi] + lea r14,[(($L$ord-128))] + mov r15,QWORD[$L$ordK] + + + mulx r9,r8,r9 + mulx r10,rcx,r10 + mulx r11,rbp,r11 + 
add r9,rcx + mulx r12,rcx,r12 + mov rdx,r8 + mulx rax,rdx,r15 + adc r10,rbp + adc r11,rcx + adc r12,0 + + + xor r13,r13 + mulx rbp,rcx,QWORD[((0+128))+r14] + adcx r8,rcx + adox r9,rbp + + mulx rbp,rcx,QWORD[((8+128))+r14] + adcx r9,rcx + adox r10,rbp + + mulx rbp,rcx,QWORD[((16+128))+r14] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((24+128))+r14] + mov rdx,QWORD[8+rbx] + adcx r11,rcx + adox r12,rbp + adcx r12,r8 + adox r13,r8 + adc r13,0 + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r9,rcx + adox r10,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r9 + mulx rax,rdx,r15 + adcx r12,rcx + adox r13,rbp + + adcx r13,r8 + adox r8,r8 + adc r8,0 + + + mulx rbp,rcx,QWORD[((0+128))+r14] + adcx r9,rcx + adox r10,rbp + + mulx rbp,rcx,QWORD[((8+128))+r14] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((16+128))+r14] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((24+128))+r14] + mov rdx,QWORD[16+rbx] + adcx r12,rcx + adox r13,rbp + adcx r13,r9 + adox r8,r9 + adc r8,0 + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r10 + mulx rax,rdx,r15 + adcx r13,rcx + adox r8,rbp + + adcx r8,r9 + adox r9,r9 + adc r9,0 + + + mulx rbp,rcx,QWORD[((0+128))+r14] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((8+128))+r14] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((16+128))+r14] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((24+128))+r14] + mov rdx,QWORD[24+rbx] + adcx r13,rcx + adox r8,rbp + adcx r8,r10 + adox r9,r10 + adc r9,0 + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r13,rcx + adox r8,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r11 + mulx rax,rdx,r15 + adcx r8,rcx + adox r9,rbp + + adcx r9,r10 + adox r10,r10 + adc r10,0 + + + mulx rbp,rcx,QWORD[((0+128))+r14] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((8+128))+r14] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((16+128))+r14] + adcx r13,rcx + adox r8,rbp + + mulx rbp,rcx,QWORD[((24+128))+r14] + lea r14,[128+r14] + mov rbx,r12 + adcx r8,rcx + adox r9,rbp + mov rdx,r13 + adcx r9,r11 + adox r10,r11 + adc r10,0 + + + + mov rcx,r8 + sub r12,QWORD[r14] + sbb r13,QWORD[8+r14] + sbb r8,QWORD[16+r14] + mov rbp,r9 + sbb r9,QWORD[24+r14] + sbb r10,0 + + cmovc r12,rbx + cmovc r13,rdx + cmovc r8,rcx + cmovc r9,rbp + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$ord_mulx_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ecp_nistz256_ord_mul_montx: + + +ALIGN 32 +ecp_nistz256_ord_sqr_montx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_ord_sqr_montx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +$L$ecp_nistz256_ord_sqr_montx: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$ord_sqrx_body: + + mov rbx,rdx + mov rdx,QWORD[rsi] + mov r14,QWORD[8+rsi] + mov 
r15,QWORD[16+rsi] + mov r8,QWORD[24+rsi] + lea rsi,[$L$ord] + jmp NEAR $L$oop_ord_sqrx + +ALIGN 32 +$L$oop_ord_sqrx: + mulx r10,r9,r14 + mulx r11,rcx,r15 + mov rax,rdx +DB 102,73,15,110,206 + mulx r12,rbp,r8 + mov rdx,r14 + add r10,rcx +DB 102,73,15,110,215 + adc r11,rbp + adc r12,0 + xor r13,r13 + + mulx rbp,rcx,r15 + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,r8 + mov rdx,r15 + adcx r12,rcx + adox r13,rbp + adc r13,0 + + mulx r14,rcx,r8 + mov rdx,rax +DB 102,73,15,110,216 + xor r15,r15 + adcx r9,r9 + adox r13,rcx + adcx r10,r10 + adox r14,r15 + + + mulx rbp,r8,rdx +DB 102,72,15,126,202 + adcx r11,r11 + adox r9,rbp + adcx r12,r12 + mulx rax,rcx,rdx +DB 102,72,15,126,210 + adcx r13,r13 + adox r10,rcx + adcx r14,r14 + mulx rbp,rcx,rdx +DB 0x67 +DB 102,72,15,126,218 + adox r11,rax + adcx r15,r15 + adox r12,rcx + adox r13,rbp + mulx rax,rcx,rdx + adox r14,rcx + adox r15,rax + + + mov rdx,r8 + mulx rcx,rdx,QWORD[32+rsi] + + xor rax,rax + mulx rbp,rcx,QWORD[rsi] + adcx r8,rcx + adox r9,rbp + mulx rbp,rcx,QWORD[8+rsi] + adcx r9,rcx + adox r10,rbp + mulx rbp,rcx,QWORD[16+rsi] + adcx r10,rcx + adox r11,rbp + mulx rbp,rcx,QWORD[24+rsi] + adcx r11,rcx + adox r8,rbp + adcx r8,rax + + + mov rdx,r9 + mulx rcx,rdx,QWORD[32+rsi] + + mulx rbp,rcx,QWORD[rsi] + adox r9,rcx + adcx r10,rbp + mulx rbp,rcx,QWORD[8+rsi] + adox r10,rcx + adcx r11,rbp + mulx rbp,rcx,QWORD[16+rsi] + adox r11,rcx + adcx r8,rbp + mulx rbp,rcx,QWORD[24+rsi] + adox r8,rcx + adcx r9,rbp + adox r9,rax + + + mov rdx,r10 + mulx rcx,rdx,QWORD[32+rsi] + + mulx rbp,rcx,QWORD[rsi] + adcx r10,rcx + adox r11,rbp + mulx rbp,rcx,QWORD[8+rsi] + adcx r11,rcx + adox r8,rbp + mulx rbp,rcx,QWORD[16+rsi] + adcx r8,rcx + adox r9,rbp + mulx rbp,rcx,QWORD[24+rsi] + adcx r9,rcx + adox r10,rbp + adcx r10,rax + + + mov rdx,r11 + mulx rcx,rdx,QWORD[32+rsi] + + mulx rbp,rcx,QWORD[rsi] + adox r11,rcx + adcx r8,rbp + mulx rbp,rcx,QWORD[8+rsi] + adox r8,rcx + adcx r9,rbp + mulx rbp,rcx,QWORD[16+rsi] + adox r9,rcx + adcx r10,rbp + mulx rbp,rcx,QWORD[24+rsi] + adox r10,rcx + adcx r11,rbp + adox r11,rax + + + add r12,r8 + adc r9,r13 + mov rdx,r12 + adc r10,r14 + adc r11,r15 + mov r14,r9 + adc rax,0 + + + sub r12,QWORD[rsi] + mov r15,r10 + sbb r9,QWORD[8+rsi] + sbb r10,QWORD[16+rsi] + mov r8,r11 + sbb r11,QWORD[24+rsi] + sbb rax,0 + + cmovnc rdx,r12 + cmovnc r14,r9 + cmovnc r15,r10 + cmovnc r8,r11 + + dec rbx + jnz NEAR $L$oop_ord_sqrx + + mov QWORD[rdi],rdx + mov QWORD[8+rdi],r14 + pxor xmm1,xmm1 + mov QWORD[16+rdi],r15 + pxor xmm2,xmm2 + mov QWORD[24+rdi],r8 + pxor xmm3,xmm3 + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$ord_sqrx_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ecp_nistz256_ord_sqr_montx: + + + + + + global ecp_nistz256_mul_mont ALIGN 32 @@ -94,13 +1228,26 @@ $L$SEH_begin_ecp_nistz256_mul_mont: mov rdx,r8 + + lea rcx,[OPENSSL_ia32cap_P] + mov rcx,QWORD[8+rcx] + and ecx,0x80100 $L$mul_mont: push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + +$L$mul_body: + cmp ecx,0x80100 + je NEAR $L$mul_montx mov rbx,rdx mov rax,QWORD[rdx] mov r9,QWORD[rsi] @@ -109,16 +1256,39 @@ $L$mul_mont: mov r12,QWORD[24+rsi] call __ecp_nistz256_mul_montq + jmp NEAR $L$mul_mont_done + +ALIGN 32 +$L$mul_montx: + mov rbx,rdx + mov rdx,QWORD[rdx] + mov r9,QWORD[rsi] + mov r10,QWORD[8+rsi] + mov r11,QWORD[16+rsi] + mov r12,QWORD[24+rsi] + lea rsi,[((-128))+rsi] + + call 
__ecp_nistz256_mul_montx $L$mul_mont_done: - pop r15 - pop r14 - pop r13 - pop r12 - pop rbx - pop rbp + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$mul_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ecp_nistz256_mul_mont: @@ -126,6 +1296,7 @@ ALIGN 32 __ecp_nistz256_mul_montq: + mov rbp,rax mul r9 mov r14,QWORD[(($L$poly+8))] @@ -344,6 +1515,7 @@ __ecp_nistz256_mul_montq: + global ecp_nistz256_sqr_mont ALIGN 32 @@ -356,33 +1528,68 @@ $L$SEH_begin_ecp_nistz256_sqr_mont: mov rsi,rdx + + lea rcx,[OPENSSL_ia32cap_P] + mov rcx,QWORD[8+rcx] + and ecx,0x80100 push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + +$L$sqr_body: + cmp ecx,0x80100 + je NEAR $L$sqr_montx mov rax,QWORD[rsi] mov r14,QWORD[8+rsi] mov r15,QWORD[16+rsi] mov r8,QWORD[24+rsi] call __ecp_nistz256_sqr_montq + jmp NEAR $L$sqr_mont_done + +ALIGN 32 +$L$sqr_montx: + mov rdx,QWORD[rsi] + mov r14,QWORD[8+rsi] + mov r15,QWORD[16+rsi] + mov r8,QWORD[24+rsi] + lea rsi,[((-128))+rsi] + + call __ecp_nistz256_sqr_montx $L$sqr_mont_done: - pop r15 - pop r14 - pop r13 - pop r12 - pop rbx - pop rbp + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$sqr_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ecp_nistz256_sqr_mont: ALIGN 32 __ecp_nistz256_sqr_montq: + mov r13,rax mul r14 mov r9,rax @@ -543,10 +1750,310 @@ __ecp_nistz256_sqr_montq: +ALIGN 32 +__ecp_nistz256_mul_montx: + + + + mulx r9,r8,r9 + mulx r10,rcx,r10 + mov r14,32 + xor r13,r13 + mulx r11,rbp,r11 + mov r15,QWORD[(($L$poly+24))] + adc r9,rcx + mulx r12,rcx,r12 + mov rdx,r8 + adc r10,rbp + shlx rbp,r8,r14 + adc r11,rcx + shrx rcx,r8,r14 + adc r12,0 + + + + add r9,rbp + adc r10,rcx + + mulx rbp,rcx,r15 + mov rdx,QWORD[8+rbx] + adc r11,rcx + adc r12,rbp + adc r13,0 + xor r8,r8 + + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r9,rcx + adox r10,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r9 + adcx r12,rcx + shlx rcx,r9,r14 + adox r13,rbp + shrx rbp,r9,r14 + + adcx r13,r8 + adox r8,r8 + adc r8,0 + + + + add r10,rcx + adc r11,rbp + + mulx rbp,rcx,r15 + mov rdx,QWORD[16+rbx] + adc r12,rcx + adc r13,rbp + adc r8,0 + xor r9,r9 + + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r10 + adcx r13,rcx + shlx rcx,r10,r14 + adox r8,rbp + shrx rbp,r10,r14 + + adcx r8,r9 + adox r9,r9 + adc r9,0 + + + + add r11,rcx + adc r12,rbp + + mulx rbp,rcx,r15 + mov rdx,QWORD[24+rbx] + adc r13,rcx + adc r8,rbp + adc r9,0 + xor r10,r10 + + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r13,rcx + adox r8,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r11 + adcx r8,rcx + shlx rcx,r11,r14 + adox r9,rbp + shrx rbp,r11,r14 + + adcx r9,r10 + adox r10,r10 + adc r10,0 + + + + add r12,rcx + adc r13,rbp + + mulx rbp,rcx,r15 + mov rbx,r12 + mov 
r14,QWORD[(($L$poly+8))] + adc r8,rcx + mov rdx,r13 + adc r9,rbp + adc r10,0 + + + + xor eax,eax + mov rcx,r8 + sbb r12,-1 + sbb r13,r14 + sbb r8,0 + mov rbp,r9 + sbb r9,r15 + sbb r10,0 + + cmovc r12,rbx + cmovc r13,rdx + mov QWORD[rdi],r12 + cmovc r8,rcx + mov QWORD[8+rdi],r13 + cmovc r9,rbp + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + DB 0F3h,0C3h ;repret + + + + +ALIGN 32 +__ecp_nistz256_sqr_montx: + + mulx r10,r9,r14 + mulx r11,rcx,r15 + xor eax,eax + adc r10,rcx + mulx r12,rbp,r8 + mov rdx,r14 + adc r11,rbp + adc r12,0 + xor r13,r13 + + + mulx rbp,rcx,r15 + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,r8 + mov rdx,r15 + adcx r12,rcx + adox r13,rbp + adc r13,0 + + + mulx r14,rcx,r8 + mov rdx,QWORD[((0+128))+rsi] + xor r15,r15 + adcx r9,r9 + adox r13,rcx + adcx r10,r10 + adox r14,r15 + + mulx rbp,r8,rdx + mov rdx,QWORD[((8+128))+rsi] + adcx r11,r11 + adox r9,rbp + adcx r12,r12 + mulx rax,rcx,rdx + mov rdx,QWORD[((16+128))+rsi] + adcx r13,r13 + adox r10,rcx + adcx r14,r14 +DB 0x67 + mulx rbp,rcx,rdx + mov rdx,QWORD[((24+128))+rsi] + adox r11,rax + adcx r15,r15 + adox r12,rcx + mov rsi,32 + adox r13,rbp +DB 0x67,0x67 + mulx rax,rcx,rdx + mov rdx,QWORD[(($L$poly+24))] + adox r14,rcx + shlx rcx,r8,rsi + adox r15,rax + shrx rax,r8,rsi + mov rbp,rdx + + + add r9,rcx + adc r10,rax + + mulx r8,rcx,r8 + adc r11,rcx + shlx rcx,r9,rsi + adc r8,0 + shrx rax,r9,rsi + + + add r10,rcx + adc r11,rax + + mulx r9,rcx,r9 + adc r8,rcx + shlx rcx,r10,rsi + adc r9,0 + shrx rax,r10,rsi + + + add r11,rcx + adc r8,rax + + mulx r10,rcx,r10 + adc r9,rcx + shlx rcx,r11,rsi + adc r10,0 + shrx rax,r11,rsi + + + add r8,rcx + adc r9,rax + + mulx r11,rcx,r11 + adc r10,rcx + adc r11,0 + + xor rdx,rdx + add r12,r8 + mov rsi,QWORD[(($L$poly+8))] + adc r13,r9 + mov r8,r12 + adc r14,r10 + adc r15,r11 + mov r9,r13 + adc rdx,0 + + sub r12,-1 + mov r10,r14 + sbb r13,rsi + sbb r14,0 + mov r11,r15 + sbb r15,rbp + sbb rdx,0 + + cmovc r12,r8 + cmovc r13,r9 + mov QWORD[rdi],r12 + cmovc r14,r10 + mov QWORD[8+rdi],r13 + cmovc r15,r11 + mov QWORD[16+rdi],r14 + mov QWORD[24+rdi],r15 + + DB 0F3h,0C3h ;repret + + + + global ecp_nistz256_select_w5 ALIGN 32 ecp_nistz256_select_w5: + lea rax,[OPENSSL_ia32cap_P] mov rax,QWORD[8+rax] test eax,32 @@ -625,9 +2132,10 @@ $L$select_loop_sse_w5: movaps xmm14,XMMWORD[128+rsp] movaps xmm15,XMMWORD[144+rsp] lea rsp,[168+rsp] -$L$SEH_end_ecp_nistz256_select_w5: DB 0F3h,0C3h ;repret +$L$SEH_end_ecp_nistz256_select_w5: + @@ -635,6 +2143,7 @@ global ecp_nistz256_select_w7 ALIGN 32 ecp_nistz256_select_w7: + lea rax,[OPENSSL_ia32cap_P] mov rax,QWORD[8+rax] test eax,32 @@ -702,17 +2211,20 @@ $L$select_loop_sse_w7: movaps xmm14,XMMWORD[128+rsp] movaps xmm15,XMMWORD[144+rsp] lea rsp,[168+rsp] -$L$SEH_end_ecp_nistz256_select_w7: DB 0F3h,0C3h ;repret +$L$SEH_end_ecp_nistz256_select_w7: + ALIGN 32 ecp_nistz256_avx2_select_w5: + $L$avx2_select_w5: vzeroupper lea rax,[((-136))+rsp] + mov r11,rsp $L$SEH_begin_ecp_nistz256_avx2_select_w5: DB 0x48,0x8d,0x60,0xe0 DB 0xc5,0xf8,0x29,0x70,0xe0 @@ -786,10 +2298,11 @@ $L$select_loop_avx2_w5: movaps xmm13,XMMWORD[112+rsp] movaps xmm14,XMMWORD[128+rsp] movaps xmm15,XMMWORD[144+rsp] - lea rsp,[168+rsp] -$L$SEH_end_ecp_nistz256_avx2_select_w5: + lea rsp,[r11] DB 0F3h,0C3h ;repret +$L$SEH_end_ecp_nistz256_avx2_select_w5: + @@ -797,8 +2310,10 @@ global ecp_nistz256_avx2_select_w7 ALIGN 32 ecp_nistz256_avx2_select_w7: + $L$avx2_select_w7: vzeroupper + mov r11,rsp lea rax,[((-136))+rsp] $L$SEH_begin_ecp_nistz256_avx2_select_w7: DB 0x48,0x8d,0x60,0xe0 @@ -888,13 +2403,15 @@ 
$L$select_loop_avx2_w7: movaps xmm13,XMMWORD[112+rsp] movaps xmm14,XMMWORD[128+rsp] movaps xmm15,XMMWORD[144+rsp] - lea rsp,[168+rsp] -$L$SEH_end_ecp_nistz256_avx2_select_w7: + lea rsp,[r11] DB 0F3h,0C3h ;repret +$L$SEH_end_ecp_nistz256_avx2_select_w7: + ALIGN 32 __ecp_nistz256_add_toq: + xor r11,r11 add r12,QWORD[rbx] adc r13,QWORD[8+rbx] @@ -925,8 +2442,10 @@ __ecp_nistz256_add_toq: + ALIGN 32 __ecp_nistz256_sub_fromq: + sub r12,QWORD[rbx] sbb r13,QWORD[8+rbx] mov rax,r12 @@ -956,8 +2475,10 @@ __ecp_nistz256_sub_fromq: + ALIGN 32 __ecp_nistz256_subq: + sub rax,r12 sbb rbp,r13 mov r12,rax @@ -983,8 +2504,10 @@ __ecp_nistz256_subq: + ALIGN 32 __ecp_nistz256_mul_by_2q: + xor r11,r11 add r12,r12 adc r13,r13 @@ -1013,6 +2536,7 @@ __ecp_nistz256_mul_by_2q: DB 0F3h,0C3h ;repret + global ecp_nistz256_point_double ALIGN 32 @@ -1025,14 +2549,28 @@ $L$SEH_begin_ecp_nistz256_point_double: mov rsi,rdx + + lea rcx,[OPENSSL_ia32cap_P] + mov rcx,QWORD[8+rcx] + and ecx,0x80100 + cmp ecx,0x80100 + je NEAR $L$point_doublex push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + sub rsp,32*5+8 +$L$point_doubleq_body: + $L$point_double_shortcutq: movdqu xmm0,XMMWORD[rsi] mov rbx,rsi @@ -1214,16 +2752,27 @@ DB 102,72,15,126,203 DB 102,72,15,126,207 call __ecp_nistz256_sub_fromq - add rsp,32*5+8 - pop r15 - pop r14 - pop r13 - pop r12 - pop rbx - pop rbp + lea rsi,[((160+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$point_doubleq_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ecp_nistz256_point_double: global ecp_nistz256_point_add @@ -1238,14 +2787,28 @@ $L$SEH_begin_ecp_nistz256_point_add: mov rdx,r8 + + lea rcx,[OPENSSL_ia32cap_P] + mov rcx,QWORD[8+rcx] + and ecx,0x80100 + cmp ecx,0x80100 + je NEAR $L$point_addx push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + sub rsp,32*18+8 +$L$point_addq_body: + movdqu xmm0,XMMWORD[rsi] movdqu xmm1,XMMWORD[16+rsi] movdqu xmm2,XMMWORD[32+rsi] @@ -1390,15 +2953,22 @@ DB 102,73,15,110,220 or r12,r8 or r12,r9 -DB 0x3e - jnz NEAR $L$add_proceedq DB 102,73,15,126,208 DB 102,73,15,126,217 - test r8,r8 + or r12,r8 +DB 0x3e jnz NEAR $L$add_proceedq + + + test r9,r9 jz NEAR $L$add_doubleq + + + + + DB 102,72,15,126,199 pxor xmm0,xmm0 movdqu XMMWORD[rdi],xmm0 @@ -1414,8 +2984,10 @@ $L$add_doubleq: DB 102,72,15,126,206 DB 102,72,15,126,199 add rsp,416 + jmp NEAR $L$point_double_shortcutq + ALIGN 32 $L$add_proceedq: mov rax,QWORD[((0+64))+rsp] @@ -1621,16 +3193,27 @@ DB 102,72,15,126,199 movdqu XMMWORD[48+rdi],xmm3 $L$add_doneq: - add rsp,32*18+8 - pop r15 - pop r14 - pop r13 - pop r12 - pop rbx - pop rbp + lea rsi,[((576+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$point_addq_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ecp_nistz256_point_add: global ecp_nistz256_point_add_affine @@ -1645,14 +3228,28 @@ $L$SEH_begin_ecp_nistz256_point_add_affine: mov rdx,r8 + + lea rcx,[OPENSSL_ia32cap_P] + mov rcx,QWORD[8+rcx] + and ecx,0x80100 + cmp ecx,0x80100 + je NEAR $L$point_add_affinex push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + sub rsp,32*15+8 +$L$add_affineq_body: + movdqu 
xmm0,XMMWORD[rsi] mov rbx,rdx movdqu xmm1,XMMWORD[16+rsi] @@ -1934,14 +3531,1454 @@ DB 102,72,15,126,199 movdqu XMMWORD[32+rdi],xmm2 movdqu XMMWORD[48+rdi],xmm3 - add rsp,32*15+8 + lea rsi,[((480+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$add_affineq_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ecp_nistz256_point_add_affine: + +ALIGN 32 +__ecp_nistz256_add_tox: + + xor r11,r11 + adc r12,QWORD[rbx] + adc r13,QWORD[8+rbx] + mov rax,r12 + adc r8,QWORD[16+rbx] + adc r9,QWORD[24+rbx] + mov rbp,r13 + adc r11,0 + + xor r10,r10 + sbb r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + cmovc r13,rbp + mov QWORD[rdi],r12 + cmovc r8,rcx + mov QWORD[8+rdi],r13 + cmovc r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + DB 0F3h,0C3h ;repret + + + + +ALIGN 32 +__ecp_nistz256_sub_fromx: + + xor r11,r11 + sbb r12,QWORD[rbx] + sbb r13,QWORD[8+rbx] + mov rax,r12 + sbb r8,QWORD[16+rbx] + sbb r9,QWORD[24+rbx] + mov rbp,r13 + sbb r11,0 + + xor r10,r10 + adc r12,-1 + mov rcx,r8 + adc r13,r14 + adc r8,0 + mov r10,r9 + adc r9,r15 + + bt r11,0 + cmovnc r12,rax + cmovnc r13,rbp + mov QWORD[rdi],r12 + cmovnc r8,rcx + mov QWORD[8+rdi],r13 + cmovnc r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + DB 0F3h,0C3h ;repret + + + + +ALIGN 32 +__ecp_nistz256_subx: + + xor r11,r11 + sbb rax,r12 + sbb rbp,r13 + mov r12,rax + sbb rcx,r8 + sbb r10,r9 + mov r13,rbp + sbb r11,0 + + xor r9,r9 + adc rax,-1 + mov r8,rcx + adc rbp,r14 + adc rcx,0 + mov r9,r10 + adc r10,r15 + + bt r11,0 + cmovc r12,rax + cmovc r13,rbp + cmovc r8,rcx + cmovc r9,r10 + + DB 0F3h,0C3h ;repret + + + + +ALIGN 32 +__ecp_nistz256_mul_by_2x: + + xor r11,r11 + adc r12,r12 + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + adc r11,0 + + xor r10,r10 + sbb r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + cmovc r13,rbp + mov QWORD[rdi],r12 + cmovc r8,rcx + mov QWORD[8+rdi],r13 + cmovc r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + DB 0F3h,0C3h ;repret + + + +ALIGN 32 +ecp_nistz256_point_doublex: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_point_doublex: + mov rdi,rcx + mov rsi,rdx + + + +$L$point_doublex: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,32*5+8 + +$L$point_doublex_body: + +$L$point_double_shortcutx: + movdqu xmm0,XMMWORD[rsi] + mov rbx,rsi + movdqu xmm1,XMMWORD[16+rsi] + mov r12,QWORD[((32+0))+rsi] + mov r13,QWORD[((32+8))+rsi] + mov r8,QWORD[((32+16))+rsi] + mov r9,QWORD[((32+24))+rsi] + mov r14,QWORD[(($L$poly+8))] + mov r15,QWORD[(($L$poly+24))] + movdqa XMMWORD[96+rsp],xmm0 + movdqa XMMWORD[(96+16)+rsp],xmm1 + lea r10,[32+rdi] + lea r11,[64+rdi] +DB 102,72,15,110,199 +DB 102,73,15,110,202 +DB 102,73,15,110,211 + + lea rdi,[rsp] + call __ecp_nistz256_mul_by_2x + + mov rdx,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + lea rsi,[((64-128))+rsi] + lea rdi,[64+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((-128+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[32+rbx] 
+ mov r9,QWORD[((64+0))+rbx] + mov r10,QWORD[((64+8))+rbx] + mov r11,QWORD[((64+16))+rbx] + mov r12,QWORD[((64+24))+rbx] + lea rsi,[((64-128))+rbx] + lea rbx,[32+rbx] +DB 102,72,15,126,215 + call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_by_2x + + mov r12,QWORD[((96+0))+rsp] + mov r13,QWORD[((96+8))+rsp] + lea rbx,[64+rsp] + mov r8,QWORD[((96+16))+rsp] + mov r9,QWORD[((96+24))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_add_tox + + mov r12,QWORD[((96+0))+rsp] + mov r13,QWORD[((96+8))+rsp] + lea rbx,[64+rsp] + mov r8,QWORD[((96+16))+rsp] + mov r9,QWORD[((96+24))+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromx + + mov rdx,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((-128+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] +DB 102,72,15,126,207 + call __ecp_nistz256_sqr_montx + xor r9,r9 + mov rax,r12 + add r12,-1 + mov r10,r13 + adc r13,rsi + mov rcx,r14 + adc r14,0 + mov r8,r15 + adc r15,rbp + adc r9,0 + xor rsi,rsi + test rax,1 + + cmovz r12,rax + cmovz r13,r10 + cmovz r14,rcx + cmovz r15,r8 + cmovz r9,rsi + + mov rax,r13 + shr r12,1 + shl rax,63 + mov r10,r14 + shr r13,1 + or r12,rax + shl r10,63 + mov rcx,r15 + shr r14,1 + or r13,r10 + shl rcx,63 + mov QWORD[rdi],r12 + shr r15,1 + mov QWORD[8+rdi],r13 + shl r9,63 + or r14,rcx + or r15,r9 + mov QWORD[16+rdi],r14 + mov QWORD[24+rdi],r15 + mov rdx,QWORD[64+rsp] + lea rbx,[64+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montx + + lea rdi,[128+rsp] + call __ecp_nistz256_mul_by_2x + + lea rbx,[32+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_add_tox + + mov rdx,QWORD[96+rsp] + lea rbx,[96+rsp] + mov r9,QWORD[((0+0))+rsp] + mov r10,QWORD[((8+0))+rsp] + lea rsi,[((-128+0))+rsp] + mov r11,QWORD[((16+0))+rsp] + mov r12,QWORD[((24+0))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_mul_montx + + lea rdi,[128+rsp] + call __ecp_nistz256_mul_by_2x + + mov rdx,QWORD[((0+32))+rsp] + mov r14,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r15,QWORD[((16+32))+rsp] + mov r8,QWORD[((24+32))+rsp] +DB 102,72,15,126,199 + call __ecp_nistz256_sqr_montx + + lea rbx,[128+rsp] + mov r8,r14 + mov r9,r15 + mov r14,rsi + mov r15,rbp + call __ecp_nistz256_sub_fromx + + mov rax,QWORD[((0+0))+rsp] + mov rbp,QWORD[((0+8))+rsp] + mov rcx,QWORD[((0+16))+rsp] + mov r10,QWORD[((0+24))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_subx + + mov rdx,QWORD[32+rsp] + lea rbx,[32+rsp] + mov r14,r12 + xor ecx,ecx + mov QWORD[((0+0))+rsp],r12 + mov r10,r13 + mov QWORD[((0+8))+rsp],r13 + cmovz r11,r8 + mov QWORD[((0+16))+rsp],r8 + lea rsi,[((0-128))+rsp] + cmovz r12,r9 + mov QWORD[((0+24))+rsp],r9 + mov r9,r14 + lea rdi,[rsp] + call __ecp_nistz256_mul_montx + +DB 102,72,15,126,203 +DB 102,72,15,126,207 + call __ecp_nistz256_sub_fromx + + lea rsi,[((160+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$point_doublex_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ecp_nistz256_point_doublex: + +ALIGN 32 +ecp_nistz256_point_addx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_point_addx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +$L$point_addx: + push rbp + + push rbx + + push r12 + + push r13 + + 
push r14 + + push r15 + + sub rsp,32*18+8 + +$L$point_addx_body: + + movdqu xmm0,XMMWORD[rsi] + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm3,XMMWORD[48+rsi] + movdqu xmm4,XMMWORD[64+rsi] + movdqu xmm5,XMMWORD[80+rsi] + mov rbx,rsi + mov rsi,rdx + movdqa XMMWORD[384+rsp],xmm0 + movdqa XMMWORD[(384+16)+rsp],xmm1 + movdqa XMMWORD[416+rsp],xmm2 + movdqa XMMWORD[(416+16)+rsp],xmm3 + movdqa XMMWORD[448+rsp],xmm4 + movdqa XMMWORD[(448+16)+rsp],xmm5 + por xmm5,xmm4 + + movdqu xmm0,XMMWORD[rsi] + pshufd xmm3,xmm5,0xb1 + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + por xmm5,xmm3 + movdqu xmm3,XMMWORD[48+rsi] + mov rdx,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + movdqa XMMWORD[480+rsp],xmm0 + pshufd xmm4,xmm5,0x1e + movdqa XMMWORD[(480+16)+rsp],xmm1 + movdqu xmm0,XMMWORD[64+rsi] + movdqu xmm1,XMMWORD[80+rsi] + movdqa XMMWORD[512+rsp],xmm2 + movdqa XMMWORD[(512+16)+rsp],xmm3 + por xmm5,xmm4 + pxor xmm4,xmm4 + por xmm1,xmm0 +DB 102,72,15,110,199 + + lea rsi,[((64-128))+rsi] + mov QWORD[((544+0))+rsp],rdx + mov QWORD[((544+8))+rsp],r14 + mov QWORD[((544+16))+rsp],r15 + mov QWORD[((544+24))+rsp],r8 + lea rdi,[96+rsp] + call __ecp_nistz256_sqr_montx + + pcmpeqd xmm5,xmm4 + pshufd xmm4,xmm1,0xb1 + por xmm4,xmm1 + pshufd xmm5,xmm5,0 + pshufd xmm3,xmm4,0x1e + por xmm4,xmm3 + pxor xmm3,xmm3 + pcmpeqd xmm4,xmm3 + pshufd xmm4,xmm4,0 + mov rdx,QWORD[((64+0))+rbx] + mov r14,QWORD[((64+8))+rbx] + mov r15,QWORD[((64+16))+rbx] + mov r8,QWORD[((64+24))+rbx] +DB 102,72,15,110,203 + + lea rsi,[((64-128))+rbx] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[544+rsp] + lea rbx,[544+rsp] + mov r9,QWORD[((0+96))+rsp] + mov r10,QWORD[((8+96))+rsp] + lea rsi,[((-128+96))+rsp] + mov r11,QWORD[((16+96))+rsp] + mov r12,QWORD[((24+96))+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[416+rsp] + lea rbx,[416+rsp] + mov r9,QWORD[((0+224))+rsp] + mov r10,QWORD[((8+224))+rsp] + lea rsi,[((-128+224))+rsp] + mov r11,QWORD[((16+224))+rsp] + mov r12,QWORD[((24+224))+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[512+rsp] + lea rbx,[512+rsp] + mov r9,QWORD[((0+256))+rsp] + mov r10,QWORD[((8+256))+rsp] + lea rsi,[((-128+256))+rsp] + mov r11,QWORD[((16+256))+rsp] + mov r12,QWORD[((24+256))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[224+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromx + + or r12,r13 + movdqa xmm2,xmm4 + or r12,r8 + or r12,r9 + por xmm2,xmm5 +DB 102,73,15,110,220 + + mov rdx,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+96))+rsp] + mov r10,QWORD[((8+96))+rsp] + lea rsi,[((-128+96))+rsp] + mov r11,QWORD[((16+96))+rsp] + mov r12,QWORD[((24+96))+rsp] + lea rdi,[160+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[480+rsp] + lea rbx,[480+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[160+rsp] + lea rdi,[rsp] + call __ecp_nistz256_sub_fromx + + or r12,r13 + or r12,r8 + or r12,r9 + +DB 102,73,15,126,208 +DB 102,73,15,126,217 + or r12,r8 +DB 0x3e + jnz NEAR $L$add_proceedx + + + + test 
r9,r9 + jz NEAR $L$add_doublex + + + + + + +DB 102,72,15,126,199 + pxor xmm0,xmm0 + movdqu XMMWORD[rdi],xmm0 + movdqu XMMWORD[16+rdi],xmm0 + movdqu XMMWORD[32+rdi],xmm0 + movdqu XMMWORD[48+rdi],xmm0 + movdqu XMMWORD[64+rdi],xmm0 + movdqu XMMWORD[80+rdi],xmm0 + jmp NEAR $L$add_donex + +ALIGN 32 +$L$add_doublex: +DB 102,72,15,126,206 +DB 102,72,15,126,199 + add rsp,416 + + jmp NEAR $L$point_double_shortcutx + + +ALIGN 32 +$L$add_proceedx: + mov rdx,QWORD[((0+64))+rsp] + mov r14,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r15,QWORD[((16+64))+rsp] + mov r8,QWORD[((24+64))+rsp] + lea rdi,[96+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+0))+rsp] + mov r10,QWORD[((8+0))+rsp] + lea rsi,[((-128+0))+rsp] + mov r11,QWORD[((16+0))+rsp] + mov r12,QWORD[((24+0))+rsp] + lea rdi,[352+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((-128+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[544+rsp] + lea rbx,[544+rsp] + mov r9,QWORD[((0+352))+rsp] + mov r10,QWORD[((8+352))+rsp] + lea rsi,[((-128+352))+rsp] + mov r11,QWORD[((16+352))+rsp] + mov r12,QWORD[((24+352))+rsp] + lea rdi,[352+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[rsp] + lea rbx,[rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[128+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[160+rsp] + lea rbx,[160+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_mul_montx + + + + + xor r11,r11 + add r12,r12 + lea rsi,[96+rsp] + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + adc r11,0 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + mov rax,QWORD[rsi] + cmovc r13,rbp + mov rbp,QWORD[8+rsi] + cmovc r8,rcx + mov rcx,QWORD[16+rsi] + cmovc r9,r10 + mov r10,QWORD[24+rsi] + + call __ecp_nistz256_subx + + lea rbx,[128+rsp] + lea rdi,[288+rsp] + call __ecp_nistz256_sub_fromx + + mov rax,QWORD[((192+0))+rsp] + mov rbp,QWORD[((192+8))+rsp] + mov rcx,QWORD[((192+16))+rsp] + mov r10,QWORD[((192+24))+rsp] + lea rdi,[320+rsp] + + call __ecp_nistz256_subx + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + mov rdx,QWORD[128+rsp] + lea rbx,[128+rsp] + mov r9,QWORD[((0+224))+rsp] + mov r10,QWORD[((8+224))+rsp] + lea rsi,[((-128+224))+rsp] + mov r11,QWORD[((16+224))+rsp] + mov r12,QWORD[((24+224))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[320+rsp] + lea rbx,[320+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[320+rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[256+rsp] + lea rdi,[320+rsp] + call __ecp_nistz256_sub_fromx + +DB 102,72,15,126,199 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[352+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((352+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[544+rsp] + pand xmm3,XMMWORD[((544+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand 
xmm2,XMMWORD[448+rsp] + pand xmm3,XMMWORD[((448+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[64+rdi],xmm2 + movdqu XMMWORD[80+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[288+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((288+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[480+rsp] + pand xmm3,XMMWORD[((480+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[384+rsp] + pand xmm3,XMMWORD[((384+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[rdi],xmm2 + movdqu XMMWORD[16+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[320+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((320+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[512+rsp] + pand xmm3,XMMWORD[((512+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[416+rsp] + pand xmm3,XMMWORD[((416+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm3 + +$L$add_donex: + lea rsi,[((576+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$point_addx_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ecp_nistz256_point_addx: + +ALIGN 32 +ecp_nistz256_point_add_affinex: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_point_add_affinex: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +$L$point_add_affinex: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,32*15+8 + +$L$add_affinex_body: + + movdqu xmm0,XMMWORD[rsi] + mov rbx,rdx + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm3,XMMWORD[48+rsi] + movdqu xmm4,XMMWORD[64+rsi] + movdqu xmm5,XMMWORD[80+rsi] + mov rdx,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + movdqa XMMWORD[320+rsp],xmm0 + movdqa XMMWORD[(320+16)+rsp],xmm1 + movdqa XMMWORD[352+rsp],xmm2 + movdqa XMMWORD[(352+16)+rsp],xmm3 + movdqa XMMWORD[384+rsp],xmm4 + movdqa XMMWORD[(384+16)+rsp],xmm5 + por xmm5,xmm4 + + movdqu xmm0,XMMWORD[rbx] + pshufd xmm3,xmm5,0xb1 + movdqu xmm1,XMMWORD[16+rbx] + movdqu xmm2,XMMWORD[32+rbx] + por xmm5,xmm3 + movdqu xmm3,XMMWORD[48+rbx] + movdqa XMMWORD[416+rsp],xmm0 + pshufd xmm4,xmm5,0x1e + movdqa XMMWORD[(416+16)+rsp],xmm1 + por xmm1,xmm0 +DB 102,72,15,110,199 + movdqa XMMWORD[448+rsp],xmm2 + movdqa XMMWORD[(448+16)+rsp],xmm3 + por xmm3,xmm2 + por xmm5,xmm4 + pxor xmm4,xmm4 + por xmm3,xmm1 + + lea rsi,[((64-128))+rsi] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montx + + pcmpeqd xmm5,xmm4 + pshufd xmm4,xmm3,0xb1 + mov rdx,QWORD[rbx] + + mov r9,r12 + por xmm4,xmm3 + pshufd xmm5,xmm5,0 + pshufd xmm3,xmm4,0x1e + mov r10,r13 + por xmm4,xmm3 + pxor xmm3,xmm3 + mov r11,r14 + pcmpeqd xmm4,xmm3 + pshufd xmm4,xmm4,0 + + lea rsi,[((32-128))+rsp] + mov r12,r15 + lea rdi,[rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[320+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromx + + mov rdx,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov 
r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[288+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[352+rsp] + lea rdi,[96+rsp] + call __ecp_nistz256_sub_fromx + + mov rdx,QWORD[((0+64))+rsp] + mov r14,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r15,QWORD[((16+64))+rsp] + mov r8,QWORD[((24+64))+rsp] + lea rdi,[128+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[((0+96))+rsp] + mov r14,QWORD[((8+96))+rsp] + lea rsi,[((-128+96))+rsp] + mov r15,QWORD[((16+96))+rsp] + mov r8,QWORD[((24+96))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[128+rsp] + lea rbx,[128+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[160+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[320+rsp] + lea rbx,[320+rsp] + mov r9,QWORD[((0+128))+rsp] + mov r10,QWORD[((8+128))+rsp] + lea rsi,[((-128+128))+rsp] + mov r11,QWORD[((16+128))+rsp] + mov r12,QWORD[((24+128))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_mul_montx + + + + + xor r11,r11 + add r12,r12 + lea rsi,[192+rsp] + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + adc r11,0 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + mov rax,QWORD[rsi] + cmovc r13,rbp + mov rbp,QWORD[8+rsi] + cmovc r8,rcx + mov rcx,QWORD[16+rsi] + cmovc r9,r10 + mov r10,QWORD[24+rsi] + + call __ecp_nistz256_subx + + lea rbx,[160+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_sub_fromx + + mov rax,QWORD[((0+0))+rsp] + mov rbp,QWORD[((0+8))+rsp] + mov rcx,QWORD[((0+16))+rsp] + mov r10,QWORD[((0+24))+rsp] + lea rdi,[64+rsp] + + call __ecp_nistz256_subx + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + mov rdx,QWORD[352+rsp] + lea rbx,[352+rsp] + mov r9,QWORD[((0+160))+rsp] + mov r10,QWORD[((8+160))+rsp] + lea rsi,[((-128+160))+rsp] + mov r11,QWORD[((16+160))+rsp] + mov r12,QWORD[((24+160))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[96+rsp] + lea rbx,[96+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[32+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_sub_fromx + +DB 102,72,15,126,199 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[288+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((288+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[$L$ONE_mont] + pand xmm3,XMMWORD[(($L$ONE_mont+16))] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[384+rsp] + pand xmm3,XMMWORD[((384+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[64+rdi],xmm2 + movdqu XMMWORD[80+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[224+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((224+16))+rsp] + movdqa xmm3,xmm5 + 
pand xmm2,XMMWORD[416+rsp] + pand xmm3,XMMWORD[((416+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[320+rsp] + pand xmm3,XMMWORD[((320+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[rdi],xmm2 + movdqu XMMWORD[16+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[256+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((256+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[448+rsp] + pand xmm3,XMMWORD[((448+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[352+rsp] + pand xmm3,XMMWORD[((352+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm3 + + lea rsi,[((480+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$add_affinex_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ecp_nistz256_point_add_affinex: +EXTERN __imp_RtlVirtualUnwind + + +ALIGN 16 +short_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea rax,[16+rax] + + mov r12,QWORD[((-8))+rax] + mov r13,QWORD[((-16))+rax] + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + + jmp NEAR $L$common_seh_tail + + + +ALIGN 16 +full_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + mov r10d,DWORD[8+r11] + lea rax,[r10*1+rax] + + mov rbp,QWORD[((-8))+rax] + mov rbx,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq pop r15 pop r14 pop r13 pop r12 - pop rbx pop rbp - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] + pop rbx + pop rdi + pop rsi DB 0F3h,0C3h ;repret -$L$SEH_end_ecp_nistz256_point_add_affine: + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_ecp_nistz256_neg wrt ..imagebase + 
DD $L$SEH_end_ecp_nistz256_neg wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_neg wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_ord_mul_mont wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_ord_mul_mont wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_ord_mul_mont wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_ord_sqr_mont wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_ord_sqr_mont wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_ord_sqr_mont wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_ord_mul_montx wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_ord_mul_montx wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_ord_mul_montx wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_ord_sqr_montx wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_ord_sqr_montx wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_ord_sqr_montx wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_mul_mont wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_mul_mont wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_mul_mont wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_sqr_mont wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_sqr_mont wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_sqr_mont wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_select_w5 wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_select_w5 wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_select_wX wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_select_w7 wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_select_w7 wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_select_wX wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_avx2_select_w5 wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_avx2_select_w5 wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_avx2_select_wX wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_avx2_select_w7 wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_avx2_select_w7 wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_avx2_select_wX wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_point_double wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_double wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_double wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_point_add wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_add wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_add wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_point_add_affine wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_add_affine wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_add_affine wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_point_doublex wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_doublex wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_doublex wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_point_addx wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_addx wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_addx wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_point_add_affinex wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_add_affinex wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_add_affinex wrt ..imagebase + +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_ecp_nistz256_neg: +DB 9,0,0,0 + DD short_handler wrt ..imagebase + DD $L$neg_body wrt ..imagebase,$L$neg_epilogue wrt ..imagebase +$L$SEH_info_ecp_nistz256_ord_mul_mont: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$ord_mul_body wrt ..imagebase,$L$ord_mul_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_ord_sqr_mont: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$ord_sqr_body wrt ..imagebase,$L$ord_sqr_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_ord_mul_montx: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + 
DD $L$ord_mulx_body wrt ..imagebase,$L$ord_mulx_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_ord_sqr_montx: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$ord_sqrx_body wrt ..imagebase,$L$ord_sqrx_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_mul_mont: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_sqr_mont: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$sqr_body wrt ..imagebase,$L$sqr_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_select_wX: +DB 0x01,0x33,0x16,0x00 +DB 0x33,0xf8,0x09,0x00 +DB 0x2e,0xe8,0x08,0x00 +DB 0x29,0xd8,0x07,0x00 +DB 0x24,0xc8,0x06,0x00 +DB 0x1f,0xb8,0x05,0x00 +DB 0x1a,0xa8,0x04,0x00 +DB 0x15,0x98,0x03,0x00 +DB 0x10,0x88,0x02,0x00 +DB 0x0c,0x78,0x01,0x00 +DB 0x08,0x68,0x00,0x00 +DB 0x04,0x01,0x15,0x00 +ALIGN 8 +$L$SEH_info_ecp_nistz256_avx2_select_wX: +DB 0x01,0x36,0x17,0x0b +DB 0x36,0xf8,0x09,0x00 +DB 0x31,0xe8,0x08,0x00 +DB 0x2c,0xd8,0x07,0x00 +DB 0x27,0xc8,0x06,0x00 +DB 0x22,0xb8,0x05,0x00 +DB 0x1d,0xa8,0x04,0x00 +DB 0x18,0x98,0x03,0x00 +DB 0x13,0x88,0x02,0x00 +DB 0x0e,0x78,0x01,0x00 +DB 0x09,0x68,0x00,0x00 +DB 0x04,0x01,0x15,0x00 +DB 0x00,0xb3,0x00,0x00 +ALIGN 8 +$L$SEH_info_ecp_nistz256_point_double: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$point_doubleq_body wrt ..imagebase,$L$point_doubleq_epilogue wrt ..imagebase + DD 32*5+56,0 +$L$SEH_info_ecp_nistz256_point_add: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$point_addq_body wrt ..imagebase,$L$point_addq_epilogue wrt ..imagebase + DD 32*18+56,0 +$L$SEH_info_ecp_nistz256_point_add_affine: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$add_affineq_body wrt ..imagebase,$L$add_affineq_epilogue wrt ..imagebase + DD 32*15+56,0 +ALIGN 8 +$L$SEH_info_ecp_nistz256_point_doublex: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$point_doublex_body wrt ..imagebase,$L$point_doublex_epilogue wrt ..imagebase + DD 32*5+56,0 +$L$SEH_info_ecp_nistz256_point_addx: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$point_addx_body wrt ..imagebase,$L$point_addx_epilogue wrt ..imagebase + DD 32*18+56,0 +$L$SEH_info_ecp_nistz256_point_add_affinex: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$add_affinex_body wrt ..imagebase,$L$add_affinex_epilogue wrt ..imagebase + DD 32*15+56,0 diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.asm new file mode 100644 index 0000000000..563699d59d --- /dev/null +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.asm @@ -0,0 +1,339 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif +section .text code align=64 + + + + +global beeu_mod_inverse_vartime +ALIGN 32 +beeu_mod_inverse_vartime: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_beeu_mod_inverse_vartime: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + push rbx + + push rsi + + + sub rsp,80 + + mov QWORD[rsp],rdi + + + mov r8,1 + xor r9,r9 + xor r10,r10 + xor r11,r11 + xor rdi,rdi + + xor r12,r12 + xor r13,r13 + xor r14,r14 + xor r15,r15 + xor rbp,rbp + + + vmovdqu xmm0,XMMWORD[rsi] + vmovdqu xmm1,XMMWORD[16+rsi] + vmovdqu XMMWORD[48+rsp],xmm0 + vmovdqu XMMWORD[64+rsp],xmm1 + + vmovdqu xmm0,XMMWORD[rdx] + vmovdqu xmm1,XMMWORD[16+rdx] + vmovdqu XMMWORD[16+rsp],xmm0 + vmovdqu XMMWORD[32+rsp],xmm1 + +$L$beeu_loop: + xor rbx,rbx + or rbx,QWORD[48+rsp] + or rbx,QWORD[56+rsp] + or rbx,QWORD[64+rsp] + or rbx,QWORD[72+rsp] + jz NEAR $L$beeu_loop_end + + + + + + + + + + + mov rcx,1 + + +$L$beeu_shift_loop_XB: + mov rbx,rcx + and rbx,QWORD[48+rsp] + jnz NEAR $L$beeu_shift_loop_end_XB + + + mov rbx,1 + and rbx,r8 + jz NEAR $L$shift1_0 + add r8,QWORD[rdx] + adc r9,QWORD[8+rdx] + adc r10,QWORD[16+rdx] + adc r11,QWORD[24+rdx] + adc rdi,0 + +$L$shift1_0: + shrd r8,r9,1 + shrd r9,r10,1 + shrd r10,r11,1 + shrd r11,rdi,1 + shr rdi,1 + + shl rcx,1 + + + + + + cmp rcx,0x8000000 + jne NEAR $L$beeu_shift_loop_XB + +$L$beeu_shift_loop_end_XB: + bsf rcx,rcx + test rcx,rcx + jz NEAR $L$beeu_no_shift_XB + + + + mov rax,QWORD[((8+48))+rsp] + mov rbx,QWORD[((16+48))+rsp] + mov rsi,QWORD[((24+48))+rsp] + + shrd QWORD[((0+48))+rsp],rax,cl + shrd QWORD[((8+48))+rsp],rbx,cl + shrd QWORD[((16+48))+rsp],rsi,cl + + shr rsi,cl + mov QWORD[((24+48))+rsp],rsi + + +$L$beeu_no_shift_XB: + + mov rcx,1 + + +$L$beeu_shift_loop_YA: + mov rbx,rcx + and rbx,QWORD[16+rsp] + jnz NEAR $L$beeu_shift_loop_end_YA + + + mov rbx,1 + and rbx,r12 + jz NEAR $L$shift1_1 + add r12,QWORD[rdx] + adc r13,QWORD[8+rdx] + adc r14,QWORD[16+rdx] + adc r15,QWORD[24+rdx] + adc rbp,0 + +$L$shift1_1: + shrd r12,r13,1 + shrd r13,r14,1 + shrd r14,r15,1 + shrd r15,rbp,1 + shr rbp,1 + + shl rcx,1 + + + + + + cmp rcx,0x8000000 + jne NEAR $L$beeu_shift_loop_YA + +$L$beeu_shift_loop_end_YA: + bsf rcx,rcx + test rcx,rcx + jz NEAR $L$beeu_no_shift_YA + + + + mov rax,QWORD[((8+16))+rsp] + mov rbx,QWORD[((16+16))+rsp] + mov rsi,QWORD[((24+16))+rsp] + + shrd QWORD[((0+16))+rsp],rax,cl + shrd QWORD[((8+16))+rsp],rbx,cl + shrd QWORD[((16+16))+rsp],rsi,cl + + shr rsi,cl + mov QWORD[((24+16))+rsp],rsi + + +$L$beeu_no_shift_YA: + + mov rax,QWORD[48+rsp] + mov rbx,QWORD[56+rsp] + mov rsi,QWORD[64+rsp] + mov rcx,QWORD[72+rsp] + sub rax,QWORD[16+rsp] + sbb rbx,QWORD[24+rsp] + sbb rsi,QWORD[32+rsp] + sbb rcx,QWORD[40+rsp] + jnc NEAR $L$beeu_B_bigger_than_A + + + mov rax,QWORD[16+rsp] + mov rbx,QWORD[24+rsp] + mov rsi,QWORD[32+rsp] + mov rcx,QWORD[40+rsp] + sub rax,QWORD[48+rsp] + sbb rbx,QWORD[56+rsp] + sbb rsi,QWORD[64+rsp] + sbb rcx,QWORD[72+rsp] + mov QWORD[16+rsp],rax + mov QWORD[24+rsp],rbx + mov QWORD[32+rsp],rsi + mov QWORD[40+rsp],rcx + + + add r12,r8 + adc r13,r9 + adc r14,r10 + adc r15,r11 + adc rbp,rdi + jmp NEAR $L$beeu_loop + +$L$beeu_B_bigger_than_A: + + mov QWORD[48+rsp],rax + mov QWORD[56+rsp],rbx + mov QWORD[64+rsp],rsi + mov QWORD[72+rsp],rcx + + + add r8,r12 + adc r9,r13 + 
adc r10,r14 + adc r11,r15 + adc rdi,rbp + + jmp NEAR $L$beeu_loop + +$L$beeu_loop_end: + + + + + mov rbx,QWORD[16+rsp] + sub rbx,1 + or rbx,QWORD[24+rsp] + or rbx,QWORD[32+rsp] + or rbx,QWORD[40+rsp] + + jnz NEAR $L$beeu_err + + + + + mov r8,QWORD[rdx] + mov r9,QWORD[8+rdx] + mov r10,QWORD[16+rdx] + mov r11,QWORD[24+rdx] + xor rdi,rdi + +$L$beeu_reduction_loop: + mov QWORD[16+rsp],r12 + mov QWORD[24+rsp],r13 + mov QWORD[32+rsp],r14 + mov QWORD[40+rsp],r15 + mov QWORD[48+rsp],rbp + + + sub r12,r8 + sbb r13,r9 + sbb r14,r10 + sbb r15,r11 + sbb rbp,0 + + + cmovc r12,QWORD[16+rsp] + cmovc r13,QWORD[24+rsp] + cmovc r14,QWORD[32+rsp] + cmovc r15,QWORD[40+rsp] + jnc NEAR $L$beeu_reduction_loop + + + sub r8,r12 + sbb r9,r13 + sbb r10,r14 + sbb r11,r15 + +$L$beeu_save: + + mov rdi,QWORD[rsp] + + mov QWORD[rdi],r8 + mov QWORD[8+rdi],r9 + mov QWORD[16+rdi],r10 + mov QWORD[24+rdi],r11 + + + mov rax,1 + jmp NEAR $L$beeu_finish + +$L$beeu_err: + + xor rax,rax + +$L$beeu_finish: + add rsp,80 + + pop rsi + + pop rbx + + pop r15 + + pop r14 + + pop r13 + + pop r12 + + pop rbp + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + + +$L$SEH_end_beeu_mod_inverse_vartime: diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/rdrand-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/rdrand-x86_64.asm index 4c03791b48..89b91de10d 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/rdrand-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/rdrand-x86_64.asm @@ -1,7 +1,14 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif section .text code align=64 @@ -12,59 +19,40 @@ global CRYPTO_rdrand ALIGN 16 CRYPTO_rdrand: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_CRYPTO_rdrand: - mov rdi,rcx - xor rax,rax - - -DB 0x48,0x0f,0xc7,0xf1 +DB 73,15,199,240 adc rax,rax - mov QWORD[rdi],rcx - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] + mov QWORD[rcx],r8 DB 0F3h,0C3h ;repret + + global CRYPTO_rdrand_multiple8_buf ALIGN 16 CRYPTO_rdrand_multiple8_buf: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_CRYPTO_rdrand_multiple8_buf: - mov rdi,rcx - mov rsi,rdx - - test rsi,rsi + test rdx,rdx jz NEAR $L$out - mov rdx,8 + mov r8,8 $L$loop: - - -DB 0x48,0x0f,0xc7,0xf1 +DB 73,15,199,241 jnc NEAR $L$err - mov QWORD[rdi],rcx - add rdi,rdx - sub rsi,rdx + mov QWORD[rcx],r9 + add rcx,r8 + sub rdx,r8 jnz NEAR $L$loop $L$out: mov rax,1 - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret $L$err: xor rax,rax - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + + diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/rsaz-avx2.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/rsaz-avx2.asm index a06e6f6cd6..74e2705cb9 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/rsaz-avx2.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/rsaz-avx2.asm @@ -1,7 +1,14 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif section .text code align=64 @@ -103,7 +110,7 @@ $L$sqr_1024_no_n_copy: vmovdqu ymm8,YMMWORD[((256-128))+rsi] lea rbx,[192+rsp] - vpbroadcastq ymm15,QWORD[$L$and_mask] + vmovdqu ymm15,YMMWORD[$L$and_mask] jmp NEAR $L$OOP_GRANDE_SQR_1024 ALIGN 32 @@ -891,10 +898,10 @@ $L$oop_mul_1024: vpmuludq ymm12,ymm11,YMMWORD[((192-128))+rcx] vpaddq ymm6,ymm6,ymm12 vpmuludq ymm13,ymm11,YMMWORD[((224-128))+rcx] - vpblendd ymm9,ymm9,ymm14,3 + vpblendd ymm12,ymm9,ymm14,3 vpaddq ymm7,ymm7,ymm13 vpmuludq ymm0,ymm11,YMMWORD[((256-128))+rcx] - vpaddq ymm3,ymm3,ymm9 + vpaddq ymm3,ymm3,ymm12 vpaddq ymm8,ymm8,ymm0 mov rax,rbx @@ -907,7 +914,9 @@ $L$oop_mul_1024: vmovdqu ymm13,YMMWORD[((-8+64-128))+rsi] mov rax,r10 + vpblendd ymm9,ymm9,ymm14,0xfc imul eax,r8d + vpaddq ymm4,ymm4,ymm9 and eax,0x1fffffff imul rbx,QWORD[((16-128))+rsi] @@ -1136,7 +1145,6 @@ $L$oop_mul_1024: dec r14d jnz NEAR $L$oop_mul_1024 - vpermq ymm15,ymm15,0 vpaddq ymm0,ymm12,YMMWORD[rsp] vpsrlq ymm12,ymm0,29 @@ -1289,6 +1297,7 @@ global rsaz_1024_red2norm_avx2 ALIGN 32 rsaz_1024_red2norm_avx2: + sub rdx,-128 xor rax,rax mov r8,QWORD[((-128))+rdx] @@ -1482,10 +1491,12 @@ rsaz_1024_red2norm_avx2: DB 0F3h,0C3h ;repret + global rsaz_1024_norm2red_avx2 ALIGN 32 rsaz_1024_norm2red_avx2: + sub rcx,-128 mov r8,QWORD[rdx] mov eax,0x1fffffff @@ -1639,10 +1650,12 @@ rsaz_1024_norm2red_avx2: mov QWORD[184+rcx],r8 DB 0F3h,0C3h ;repret + global rsaz_1024_scatter5_avx2 ALIGN 32 rsaz_1024_scatter5_avx2: + vzeroupper vmovdqu ymm5,YMMWORD[$L$scatter_permd] shl r8d,4 @@ -1664,6 +1677,7 @@ $L$oop_scatter_1024: DB 0F3h,0C3h ;repret + global rsaz_1024_gather5_avx2 ALIGN 32 @@ -1809,21 +1823,9 @@ $L$oop_gather_1024: $L$SEH_end_rsaz_1024_gather5: -EXTERN OPENSSL_ia32cap_P -global rsaz_avx2_eligible - -ALIGN 32 -rsaz_avx2_eligible: - lea rax,[OPENSSL_ia32cap_P] - mov eax,DWORD[8+rax] - and eax,32 - shr eax,5 - DB 0F3h,0C3h ;repret - - ALIGN 64 $L$and_mask: - DQ 0x1fffffff,0x1fffffff,0x1fffffff,-1 + DQ 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff $L$scatter_permd: DD 0,2,4,6,7,7,7,7 $L$gather_permd: diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/sha1-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/sha1-x86_64.asm index 65b040fb43..62dcc62c25 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/sha1-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/sha1-x86_64.asm @@ -1,7 +1,14 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif section .text code align=64 EXTERN OPENSSL_ia32cap_P @@ -19,6 +26,7 @@ $L$SEH_begin_sha1_block_data_order: mov rdx,r8 + lea r10,[OPENSSL_ia32cap_P] mov r9d,DWORD[r10] mov r8d,DWORD[4+r10] @@ -35,17 +43,24 @@ $L$SEH_begin_sha1_block_data_order: ALIGN 16 $L$ialu: mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + mov r8,rdi sub rsp,72 mov r9,rsi and rsp,-64 mov r10,rdx mov QWORD[64+rsp],rax + $L$prologue: mov esi,DWORD[r8] @@ -1240,16 +1255,24 @@ $L$loop: jnz NEAR $L$loop mov rsi,QWORD[64+rsp] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha1_block_data_order: ALIGN 16 @@ -1264,12 +1287,19 @@ $L$SEH_begin_sha1_block_data_order_ssse3: _ssse3_shortcut: + mov r11,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + lea rsp,[((-160))+rsp] movaps XMMWORD[(-40-96)+r11],xmm6 movaps XMMWORD[(-40-80)+r11],xmm7 @@ -2439,15 +2469,22 @@ $L$done_ssse3: movaps xmm10,XMMWORD[((-40-32))+r11] movaps xmm11,XMMWORD[((-40-16))+r11] mov r14,QWORD[((-40))+r11] + mov r13,QWORD[((-32))+r11] + mov r12,QWORD[((-24))+r11] + mov rbp,QWORD[((-16))+r11] + mov rbx,QWORD[((-8))+r11] + lea rsp,[r11] + $L$epilogue_ssse3: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha1_block_data_order_ssse3: ALIGN 16 @@ -2462,12 +2499,19 @@ $L$SEH_begin_sha1_block_data_order_avx: _avx_shortcut: + mov r11,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + lea rsp,[((-160))+rsp] vzeroupper vmovaps XMMWORD[(-40-96)+r11],xmm6 @@ -3577,15 +3621,22 @@ $L$done_avx: movaps xmm10,XMMWORD[((-40-32))+r11] movaps xmm11,XMMWORD[((-40-16))+r11] mov r14,QWORD[((-40))+r11] + mov r13,QWORD[((-32))+r11] + mov r12,QWORD[((-24))+r11] + mov rbp,QWORD[((-16))+r11] + mov rbx,QWORD[((-8))+r11] + lea rsp,[r11] + $L$epilogue_avx: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha1_block_data_order_avx: ALIGN 64 K_XX_XX: diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/sha256-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/sha256-x86_64.asm index 6e3d1541ac..68c74cc1b9 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/sha256-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/sha256-x86_64.asm @@ -1,7 +1,14 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif section .text code align=64 @@ -19,6 +26,7 @@ $L$SEH_begin_sha256_block_data_order: mov rdx,r8 + lea r11,[OPENSSL_ia32cap_P] mov r9d,DWORD[r11] mov r10d,DWORD[4+r11] @@ -31,12 +39,19 @@ $L$SEH_begin_sha256_block_data_order: test r10d,512 jnz NEAR $L$ssse3_shortcut mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + shl rdx,4 sub rsp,16*4+4*8 lea rdx,[rdx*4+rsi] @@ -44,7 +59,8 @@ $L$SEH_begin_sha256_block_data_order: mov QWORD[((64+0))+rsp],rdi mov QWORD[((64+8))+rsp],rsi mov QWORD[((64+16))+rsp],rdx - mov QWORD[((64+24))+rsp],rax + mov QWORD[88+rsp],rax + $L$prologue: mov eax,DWORD[rdi] @@ -1708,18 +1724,27 @@ $L$rounds_16_xx: mov DWORD[28+rdi],r11d jb NEAR $L$loop - mov rsi,QWORD[((64+24))+rsp] + mov rsi,QWORD[88+rsp] + mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha256_block_data_order: ALIGN 64 @@ -1780,14 +1805,22 @@ $L$SEH_begin_sha256_block_data_order_ssse3: mov rdx,r8 + $L$ssse3_shortcut: mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + shl rdx,4 sub rsp,160 lea rdx,[rdx*4+rsi] @@ -1795,7 +1828,8 @@ $L$ssse3_shortcut: mov QWORD[((64+0))+rsp],rdi mov QWORD[((64+8))+rsp],rsi mov QWORD[((64+16))+rsp],rdx - mov QWORD[((64+24))+rsp],rax + mov QWORD[88+rsp],rax + movaps XMMWORD[(64+32)+rsp],xmm6 movaps XMMWORD[(64+48)+rsp],xmm7 movaps XMMWORD[(64+64)+rsp],xmm8 @@ -2865,22 +2899,31 @@ DB 102,15,58,15,249,4 mov DWORD[28+rdi],r11d jb NEAR $L$loop_ssse3 - mov rsi,QWORD[((64+24))+rsp] + mov rsi,QWORD[88+rsp] + movaps xmm6,XMMWORD[((64+32))+rsp] movaps xmm7,XMMWORD[((64+48))+rsp] movaps xmm8,XMMWORD[((64+64))+rsp] movaps xmm9,XMMWORD[((64+80))+rsp] mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$epilogue_ssse3: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha256_block_data_order_ssse3: ALIGN 64 @@ -2894,14 +2937,22 @@ $L$SEH_begin_sha256_block_data_order_avx: mov rdx,r8 + $L$avx_shortcut: mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + shl rdx,4 sub rsp,160 lea rdx,[rdx*4+rsi] @@ -2909,7 +2960,8 @@ $L$avx_shortcut: mov QWORD[((64+0))+rsp],rdi mov QWORD[((64+8))+rsp],rsi mov QWORD[((64+16))+rsp],rdx - mov QWORD[((64+24))+rsp],rax + mov QWORD[88+rsp],rax + movaps XMMWORD[(64+32)+rsp],xmm6 movaps XMMWORD[(64+48)+rsp],xmm7 movaps XMMWORD[(64+64)+rsp],xmm8 @@ -3940,23 +3992,32 @@ $L$avx_00_47: mov DWORD[28+rdi],r11d jb NEAR $L$loop_avx - mov rsi,QWORD[((64+24))+rsp] + mov rsi,QWORD[88+rsp] + vzeroupper movaps xmm6,XMMWORD[((64+32))+rsp] movaps xmm7,XMMWORD[((64+48))+rsp] movaps xmm8,XMMWORD[((64+64))+rsp] movaps xmm9,XMMWORD[((64+80))+rsp] mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$epilogue_avx: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha256_block_data_order_avx: EXTERN __imp_RtlVirtualUnwind diff --git 
a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/sha512-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/sha512-x86_64.asm index d0d7a43fbe..33dc2c2ede 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/sha512-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/sha512-x86_64.asm @@ -1,7 +1,14 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif section .text code align=64 @@ -19,24 +26,30 @@ $L$SEH_begin_sha512_block_data_order: mov rdx,r8 + lea r11,[OPENSSL_ia32cap_P] mov r9d,DWORD[r11] mov r10d,DWORD[4+r11] mov r11d,DWORD[8+r11] - test r10d,2048 - jnz NEAR $L$xop_shortcut and r9d,1073741824 and r10d,268435968 or r10d,r9d cmp r10d,1342177792 je NEAR $L$avx_shortcut mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + shl rdx,4 sub rsp,16*8+4*8 lea rdx,[rdx*8+rsi] @@ -44,7 +57,8 @@ $L$SEH_begin_sha512_block_data_order: mov QWORD[((128+0))+rsp],rdi mov QWORD[((128+8))+rsp],rsi mov QWORD[((128+16))+rsp],rdx - mov QWORD[((128+24))+rsp],rax + mov QWORD[152+rsp],rax + $L$prologue: mov rax,QWORD[rdi] @@ -1708,18 +1722,27 @@ $L$rounds_16_xx: mov QWORD[56+rdi],r11 jb NEAR $L$loop - mov rsi,QWORD[((128+24))+rsp] + mov rsi,QWORD[152+rsp] + mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha512_block_data_order: ALIGN 64 @@ -1813,1112 +1836,6 @@ DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 DB 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 DB 111,114,103,62,0 -ALIGN 64 -sha512_block_data_order_xop: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_sha512_block_data_order_xop: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - - -$L$xop_shortcut: - mov rax,rsp - push rbx - push rbp - push r12 - push r13 - push r14 - push r15 - shl rdx,4 - sub rsp,256 - lea rdx,[rdx*8+rsi] - and rsp,-64 - mov QWORD[((128+0))+rsp],rdi - mov QWORD[((128+8))+rsp],rsi - mov QWORD[((128+16))+rsp],rdx - mov QWORD[((128+24))+rsp],rax - movaps XMMWORD[(128+32)+rsp],xmm6 - movaps XMMWORD[(128+48)+rsp],xmm7 - movaps XMMWORD[(128+64)+rsp],xmm8 - movaps XMMWORD[(128+80)+rsp],xmm9 - movaps XMMWORD[(128+96)+rsp],xmm10 - movaps XMMWORD[(128+112)+rsp],xmm11 -$L$prologue_xop: - - vzeroupper - mov rax,QWORD[rdi] - mov rbx,QWORD[8+rdi] - mov rcx,QWORD[16+rdi] - mov rdx,QWORD[24+rdi] - mov r8,QWORD[32+rdi] - mov r9,QWORD[40+rdi] - mov r10,QWORD[48+rdi] - mov r11,QWORD[56+rdi] - jmp NEAR $L$loop_xop -ALIGN 16 -$L$loop_xop: - vmovdqa xmm11,XMMWORD[((K512+1280))] - vmovdqu xmm0,XMMWORD[rsi] - lea rbp,[((K512+128))] - vmovdqu xmm1,XMMWORD[16+rsi] - vmovdqu xmm2,XMMWORD[32+rsi] - vpshufb xmm0,xmm0,xmm11 - vmovdqu xmm3,XMMWORD[48+rsi] - vpshufb xmm1,xmm1,xmm11 - vmovdqu xmm4,XMMWORD[64+rsi] - vpshufb xmm2,xmm2,xmm11 - vmovdqu xmm5,XMMWORD[80+rsi] - vpshufb xmm3,xmm3,xmm11 - vmovdqu xmm6,XMMWORD[96+rsi] - vpshufb xmm4,xmm4,xmm11 - vmovdqu xmm7,XMMWORD[112+rsi] - vpshufb xmm5,xmm5,xmm11 - vpaddq xmm8,xmm0,XMMWORD[((-128))+rbp] - vpshufb xmm6,xmm6,xmm11 - vpaddq xmm9,xmm1,XMMWORD[((-96))+rbp] - vpshufb xmm7,xmm7,xmm11 - vpaddq 
xmm10,xmm2,XMMWORD[((-64))+rbp] - vpaddq xmm11,xmm3,XMMWORD[((-32))+rbp] - vmovdqa XMMWORD[rsp],xmm8 - vpaddq xmm8,xmm4,XMMWORD[rbp] - vmovdqa XMMWORD[16+rsp],xmm9 - vpaddq xmm9,xmm5,XMMWORD[32+rbp] - vmovdqa XMMWORD[32+rsp],xmm10 - vpaddq xmm10,xmm6,XMMWORD[64+rbp] - vmovdqa XMMWORD[48+rsp],xmm11 - vpaddq xmm11,xmm7,XMMWORD[96+rbp] - vmovdqa XMMWORD[64+rsp],xmm8 - mov r14,rax - vmovdqa XMMWORD[80+rsp],xmm9 - mov rdi,rbx - vmovdqa XMMWORD[96+rsp],xmm10 - xor rdi,rcx - vmovdqa XMMWORD[112+rsp],xmm11 - mov r13,r8 - jmp NEAR $L$xop_00_47 - -ALIGN 16 -$L$xop_00_47: - add rbp,256 - vpalignr xmm8,xmm1,xmm0,8 - ror r13,23 - mov rax,r14 - vpalignr xmm11,xmm5,xmm4,8 - mov r12,r9 - ror r14,5 -DB 143,72,120,195,200,56 - xor r13,r8 - xor r12,r10 - vpsrlq xmm8,xmm8,7 - ror r13,4 - xor r14,rax - vpaddq xmm0,xmm0,xmm11 - and r12,r8 - xor r13,r8 - add r11,QWORD[rsp] - mov r15,rax -DB 143,72,120,195,209,7 - xor r12,r10 - ror r14,6 - vpxor xmm8,xmm8,xmm9 - xor r15,rbx - add r11,r12 - ror r13,14 - and rdi,r15 -DB 143,104,120,195,223,3 - xor r14,rax - add r11,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,rbx - ror r14,28 - vpsrlq xmm10,xmm7,6 - add rdx,r11 - add r11,rdi - vpaddq xmm0,xmm0,xmm8 - mov r13,rdx - add r14,r11 -DB 143,72,120,195,203,42 - ror r13,23 - mov r11,r14 - vpxor xmm11,xmm11,xmm10 - mov r12,r8 - ror r14,5 - xor r13,rdx - xor r12,r9 - vpxor xmm11,xmm11,xmm9 - ror r13,4 - xor r14,r11 - and r12,rdx - xor r13,rdx - vpaddq xmm0,xmm0,xmm11 - add r10,QWORD[8+rsp] - mov rdi,r11 - xor r12,r9 - ror r14,6 - vpaddq xmm10,xmm0,XMMWORD[((-128))+rbp] - xor rdi,rax - add r10,r12 - ror r13,14 - and r15,rdi - xor r14,r11 - add r10,r13 - xor r15,rax - ror r14,28 - add rcx,r10 - add r10,r15 - mov r13,rcx - add r14,r10 - vmovdqa XMMWORD[rsp],xmm10 - vpalignr xmm8,xmm2,xmm1,8 - ror r13,23 - mov r10,r14 - vpalignr xmm11,xmm6,xmm5,8 - mov r12,rdx - ror r14,5 -DB 143,72,120,195,200,56 - xor r13,rcx - xor r12,r8 - vpsrlq xmm8,xmm8,7 - ror r13,4 - xor r14,r10 - vpaddq xmm1,xmm1,xmm11 - and r12,rcx - xor r13,rcx - add r9,QWORD[16+rsp] - mov r15,r10 -DB 143,72,120,195,209,7 - xor r12,r8 - ror r14,6 - vpxor xmm8,xmm8,xmm9 - xor r15,r11 - add r9,r12 - ror r13,14 - and rdi,r15 -DB 143,104,120,195,216,3 - xor r14,r10 - add r9,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,r11 - ror r14,28 - vpsrlq xmm10,xmm0,6 - add rbx,r9 - add r9,rdi - vpaddq xmm1,xmm1,xmm8 - mov r13,rbx - add r14,r9 -DB 143,72,120,195,203,42 - ror r13,23 - mov r9,r14 - vpxor xmm11,xmm11,xmm10 - mov r12,rcx - ror r14,5 - xor r13,rbx - xor r12,rdx - vpxor xmm11,xmm11,xmm9 - ror r13,4 - xor r14,r9 - and r12,rbx - xor r13,rbx - vpaddq xmm1,xmm1,xmm11 - add r8,QWORD[24+rsp] - mov rdi,r9 - xor r12,rdx - ror r14,6 - vpaddq xmm10,xmm1,XMMWORD[((-96))+rbp] - xor rdi,r10 - add r8,r12 - ror r13,14 - and r15,rdi - xor r14,r9 - add r8,r13 - xor r15,r10 - ror r14,28 - add rax,r8 - add r8,r15 - mov r13,rax - add r14,r8 - vmovdqa XMMWORD[16+rsp],xmm10 - vpalignr xmm8,xmm3,xmm2,8 - ror r13,23 - mov r8,r14 - vpalignr xmm11,xmm7,xmm6,8 - mov r12,rbx - ror r14,5 -DB 143,72,120,195,200,56 - xor r13,rax - xor r12,rcx - vpsrlq xmm8,xmm8,7 - ror r13,4 - xor r14,r8 - vpaddq xmm2,xmm2,xmm11 - and r12,rax - xor r13,rax - add rdx,QWORD[32+rsp] - mov r15,r8 -DB 143,72,120,195,209,7 - xor r12,rcx - ror r14,6 - vpxor xmm8,xmm8,xmm9 - xor r15,r9 - add rdx,r12 - ror r13,14 - and rdi,r15 -DB 143,104,120,195,217,3 - xor r14,r8 - add rdx,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,r9 - ror r14,28 - vpsrlq xmm10,xmm1,6 - add r11,rdx - add rdx,rdi - vpaddq xmm2,xmm2,xmm8 - mov r13,r11 - add r14,rdx -DB 
143,72,120,195,203,42 - ror r13,23 - mov rdx,r14 - vpxor xmm11,xmm11,xmm10 - mov r12,rax - ror r14,5 - xor r13,r11 - xor r12,rbx - vpxor xmm11,xmm11,xmm9 - ror r13,4 - xor r14,rdx - and r12,r11 - xor r13,r11 - vpaddq xmm2,xmm2,xmm11 - add rcx,QWORD[40+rsp] - mov rdi,rdx - xor r12,rbx - ror r14,6 - vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp] - xor rdi,r8 - add rcx,r12 - ror r13,14 - and r15,rdi - xor r14,rdx - add rcx,r13 - xor r15,r8 - ror r14,28 - add r10,rcx - add rcx,r15 - mov r13,r10 - add r14,rcx - vmovdqa XMMWORD[32+rsp],xmm10 - vpalignr xmm8,xmm4,xmm3,8 - ror r13,23 - mov rcx,r14 - vpalignr xmm11,xmm0,xmm7,8 - mov r12,r11 - ror r14,5 -DB 143,72,120,195,200,56 - xor r13,r10 - xor r12,rax - vpsrlq xmm8,xmm8,7 - ror r13,4 - xor r14,rcx - vpaddq xmm3,xmm3,xmm11 - and r12,r10 - xor r13,r10 - add rbx,QWORD[48+rsp] - mov r15,rcx -DB 143,72,120,195,209,7 - xor r12,rax - ror r14,6 - vpxor xmm8,xmm8,xmm9 - xor r15,rdx - add rbx,r12 - ror r13,14 - and rdi,r15 -DB 143,104,120,195,218,3 - xor r14,rcx - add rbx,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,rdx - ror r14,28 - vpsrlq xmm10,xmm2,6 - add r9,rbx - add rbx,rdi - vpaddq xmm3,xmm3,xmm8 - mov r13,r9 - add r14,rbx -DB 143,72,120,195,203,42 - ror r13,23 - mov rbx,r14 - vpxor xmm11,xmm11,xmm10 - mov r12,r10 - ror r14,5 - xor r13,r9 - xor r12,r11 - vpxor xmm11,xmm11,xmm9 - ror r13,4 - xor r14,rbx - and r12,r9 - xor r13,r9 - vpaddq xmm3,xmm3,xmm11 - add rax,QWORD[56+rsp] - mov rdi,rbx - xor r12,r11 - ror r14,6 - vpaddq xmm10,xmm3,XMMWORD[((-32))+rbp] - xor rdi,rcx - add rax,r12 - ror r13,14 - and r15,rdi - xor r14,rbx - add rax,r13 - xor r15,rcx - ror r14,28 - add r8,rax - add rax,r15 - mov r13,r8 - add r14,rax - vmovdqa XMMWORD[48+rsp],xmm10 - vpalignr xmm8,xmm5,xmm4,8 - ror r13,23 - mov rax,r14 - vpalignr xmm11,xmm1,xmm0,8 - mov r12,r9 - ror r14,5 -DB 143,72,120,195,200,56 - xor r13,r8 - xor r12,r10 - vpsrlq xmm8,xmm8,7 - ror r13,4 - xor r14,rax - vpaddq xmm4,xmm4,xmm11 - and r12,r8 - xor r13,r8 - add r11,QWORD[64+rsp] - mov r15,rax -DB 143,72,120,195,209,7 - xor r12,r10 - ror r14,6 - vpxor xmm8,xmm8,xmm9 - xor r15,rbx - add r11,r12 - ror r13,14 - and rdi,r15 -DB 143,104,120,195,219,3 - xor r14,rax - add r11,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,rbx - ror r14,28 - vpsrlq xmm10,xmm3,6 - add rdx,r11 - add r11,rdi - vpaddq xmm4,xmm4,xmm8 - mov r13,rdx - add r14,r11 -DB 143,72,120,195,203,42 - ror r13,23 - mov r11,r14 - vpxor xmm11,xmm11,xmm10 - mov r12,r8 - ror r14,5 - xor r13,rdx - xor r12,r9 - vpxor xmm11,xmm11,xmm9 - ror r13,4 - xor r14,r11 - and r12,rdx - xor r13,rdx - vpaddq xmm4,xmm4,xmm11 - add r10,QWORD[72+rsp] - mov rdi,r11 - xor r12,r9 - ror r14,6 - vpaddq xmm10,xmm4,XMMWORD[rbp] - xor rdi,rax - add r10,r12 - ror r13,14 - and r15,rdi - xor r14,r11 - add r10,r13 - xor r15,rax - ror r14,28 - add rcx,r10 - add r10,r15 - mov r13,rcx - add r14,r10 - vmovdqa XMMWORD[64+rsp],xmm10 - vpalignr xmm8,xmm6,xmm5,8 - ror r13,23 - mov r10,r14 - vpalignr xmm11,xmm2,xmm1,8 - mov r12,rdx - ror r14,5 -DB 143,72,120,195,200,56 - xor r13,rcx - xor r12,r8 - vpsrlq xmm8,xmm8,7 - ror r13,4 - xor r14,r10 - vpaddq xmm5,xmm5,xmm11 - and r12,rcx - xor r13,rcx - add r9,QWORD[80+rsp] - mov r15,r10 -DB 143,72,120,195,209,7 - xor r12,r8 - ror r14,6 - vpxor xmm8,xmm8,xmm9 - xor r15,r11 - add r9,r12 - ror r13,14 - and rdi,r15 -DB 143,104,120,195,220,3 - xor r14,r10 - add r9,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,r11 - ror r14,28 - vpsrlq xmm10,xmm4,6 - add rbx,r9 - add r9,rdi - vpaddq xmm5,xmm5,xmm8 - mov r13,rbx - add r14,r9 -DB 143,72,120,195,203,42 - ror r13,23 - mov r9,r14 - 
vpxor xmm11,xmm11,xmm10 - mov r12,rcx - ror r14,5 - xor r13,rbx - xor r12,rdx - vpxor xmm11,xmm11,xmm9 - ror r13,4 - xor r14,r9 - and r12,rbx - xor r13,rbx - vpaddq xmm5,xmm5,xmm11 - add r8,QWORD[88+rsp] - mov rdi,r9 - xor r12,rdx - ror r14,6 - vpaddq xmm10,xmm5,XMMWORD[32+rbp] - xor rdi,r10 - add r8,r12 - ror r13,14 - and r15,rdi - xor r14,r9 - add r8,r13 - xor r15,r10 - ror r14,28 - add rax,r8 - add r8,r15 - mov r13,rax - add r14,r8 - vmovdqa XMMWORD[80+rsp],xmm10 - vpalignr xmm8,xmm7,xmm6,8 - ror r13,23 - mov r8,r14 - vpalignr xmm11,xmm3,xmm2,8 - mov r12,rbx - ror r14,5 -DB 143,72,120,195,200,56 - xor r13,rax - xor r12,rcx - vpsrlq xmm8,xmm8,7 - ror r13,4 - xor r14,r8 - vpaddq xmm6,xmm6,xmm11 - and r12,rax - xor r13,rax - add rdx,QWORD[96+rsp] - mov r15,r8 -DB 143,72,120,195,209,7 - xor r12,rcx - ror r14,6 - vpxor xmm8,xmm8,xmm9 - xor r15,r9 - add rdx,r12 - ror r13,14 - and rdi,r15 -DB 143,104,120,195,221,3 - xor r14,r8 - add rdx,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,r9 - ror r14,28 - vpsrlq xmm10,xmm5,6 - add r11,rdx - add rdx,rdi - vpaddq xmm6,xmm6,xmm8 - mov r13,r11 - add r14,rdx -DB 143,72,120,195,203,42 - ror r13,23 - mov rdx,r14 - vpxor xmm11,xmm11,xmm10 - mov r12,rax - ror r14,5 - xor r13,r11 - xor r12,rbx - vpxor xmm11,xmm11,xmm9 - ror r13,4 - xor r14,rdx - and r12,r11 - xor r13,r11 - vpaddq xmm6,xmm6,xmm11 - add rcx,QWORD[104+rsp] - mov rdi,rdx - xor r12,rbx - ror r14,6 - vpaddq xmm10,xmm6,XMMWORD[64+rbp] - xor rdi,r8 - add rcx,r12 - ror r13,14 - and r15,rdi - xor r14,rdx - add rcx,r13 - xor r15,r8 - ror r14,28 - add r10,rcx - add rcx,r15 - mov r13,r10 - add r14,rcx - vmovdqa XMMWORD[96+rsp],xmm10 - vpalignr xmm8,xmm0,xmm7,8 - ror r13,23 - mov rcx,r14 - vpalignr xmm11,xmm4,xmm3,8 - mov r12,r11 - ror r14,5 -DB 143,72,120,195,200,56 - xor r13,r10 - xor r12,rax - vpsrlq xmm8,xmm8,7 - ror r13,4 - xor r14,rcx - vpaddq xmm7,xmm7,xmm11 - and r12,r10 - xor r13,r10 - add rbx,QWORD[112+rsp] - mov r15,rcx -DB 143,72,120,195,209,7 - xor r12,rax - ror r14,6 - vpxor xmm8,xmm8,xmm9 - xor r15,rdx - add rbx,r12 - ror r13,14 - and rdi,r15 -DB 143,104,120,195,222,3 - xor r14,rcx - add rbx,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,rdx - ror r14,28 - vpsrlq xmm10,xmm6,6 - add r9,rbx - add rbx,rdi - vpaddq xmm7,xmm7,xmm8 - mov r13,r9 - add r14,rbx -DB 143,72,120,195,203,42 - ror r13,23 - mov rbx,r14 - vpxor xmm11,xmm11,xmm10 - mov r12,r10 - ror r14,5 - xor r13,r9 - xor r12,r11 - vpxor xmm11,xmm11,xmm9 - ror r13,4 - xor r14,rbx - and r12,r9 - xor r13,r9 - vpaddq xmm7,xmm7,xmm11 - add rax,QWORD[120+rsp] - mov rdi,rbx - xor r12,r11 - ror r14,6 - vpaddq xmm10,xmm7,XMMWORD[96+rbp] - xor rdi,rcx - add rax,r12 - ror r13,14 - and r15,rdi - xor r14,rbx - add rax,r13 - xor r15,rcx - ror r14,28 - add r8,rax - add rax,r15 - mov r13,r8 - add r14,rax - vmovdqa XMMWORD[112+rsp],xmm10 - cmp BYTE[135+rbp],0 - jne NEAR $L$xop_00_47 - ror r13,23 - mov rax,r14 - mov r12,r9 - ror r14,5 - xor r13,r8 - xor r12,r10 - ror r13,4 - xor r14,rax - and r12,r8 - xor r13,r8 - add r11,QWORD[rsp] - mov r15,rax - xor r12,r10 - ror r14,6 - xor r15,rbx - add r11,r12 - ror r13,14 - and rdi,r15 - xor r14,rax - add r11,r13 - xor rdi,rbx - ror r14,28 - add rdx,r11 - add r11,rdi - mov r13,rdx - add r14,r11 - ror r13,23 - mov r11,r14 - mov r12,r8 - ror r14,5 - xor r13,rdx - xor r12,r9 - ror r13,4 - xor r14,r11 - and r12,rdx - xor r13,rdx - add r10,QWORD[8+rsp] - mov rdi,r11 - xor r12,r9 - ror r14,6 - xor rdi,rax - add r10,r12 - ror r13,14 - and r15,rdi - xor r14,r11 - add r10,r13 - xor r15,rax - ror r14,28 - add rcx,r10 - add r10,r15 - mov 
r13,rcx - add r14,r10 - ror r13,23 - mov r10,r14 - mov r12,rdx - ror r14,5 - xor r13,rcx - xor r12,r8 - ror r13,4 - xor r14,r10 - and r12,rcx - xor r13,rcx - add r9,QWORD[16+rsp] - mov r15,r10 - xor r12,r8 - ror r14,6 - xor r15,r11 - add r9,r12 - ror r13,14 - and rdi,r15 - xor r14,r10 - add r9,r13 - xor rdi,r11 - ror r14,28 - add rbx,r9 - add r9,rdi - mov r13,rbx - add r14,r9 - ror r13,23 - mov r9,r14 - mov r12,rcx - ror r14,5 - xor r13,rbx - xor r12,rdx - ror r13,4 - xor r14,r9 - and r12,rbx - xor r13,rbx - add r8,QWORD[24+rsp] - mov rdi,r9 - xor r12,rdx - ror r14,6 - xor rdi,r10 - add r8,r12 - ror r13,14 - and r15,rdi - xor r14,r9 - add r8,r13 - xor r15,r10 - ror r14,28 - add rax,r8 - add r8,r15 - mov r13,rax - add r14,r8 - ror r13,23 - mov r8,r14 - mov r12,rbx - ror r14,5 - xor r13,rax - xor r12,rcx - ror r13,4 - xor r14,r8 - and r12,rax - xor r13,rax - add rdx,QWORD[32+rsp] - mov r15,r8 - xor r12,rcx - ror r14,6 - xor r15,r9 - add rdx,r12 - ror r13,14 - and rdi,r15 - xor r14,r8 - add rdx,r13 - xor rdi,r9 - ror r14,28 - add r11,rdx - add rdx,rdi - mov r13,r11 - add r14,rdx - ror r13,23 - mov rdx,r14 - mov r12,rax - ror r14,5 - xor r13,r11 - xor r12,rbx - ror r13,4 - xor r14,rdx - and r12,r11 - xor r13,r11 - add rcx,QWORD[40+rsp] - mov rdi,rdx - xor r12,rbx - ror r14,6 - xor rdi,r8 - add rcx,r12 - ror r13,14 - and r15,rdi - xor r14,rdx - add rcx,r13 - xor r15,r8 - ror r14,28 - add r10,rcx - add rcx,r15 - mov r13,r10 - add r14,rcx - ror r13,23 - mov rcx,r14 - mov r12,r11 - ror r14,5 - xor r13,r10 - xor r12,rax - ror r13,4 - xor r14,rcx - and r12,r10 - xor r13,r10 - add rbx,QWORD[48+rsp] - mov r15,rcx - xor r12,rax - ror r14,6 - xor r15,rdx - add rbx,r12 - ror r13,14 - and rdi,r15 - xor r14,rcx - add rbx,r13 - xor rdi,rdx - ror r14,28 - add r9,rbx - add rbx,rdi - mov r13,r9 - add r14,rbx - ror r13,23 - mov rbx,r14 - mov r12,r10 - ror r14,5 - xor r13,r9 - xor r12,r11 - ror r13,4 - xor r14,rbx - and r12,r9 - xor r13,r9 - add rax,QWORD[56+rsp] - mov rdi,rbx - xor r12,r11 - ror r14,6 - xor rdi,rcx - add rax,r12 - ror r13,14 - and r15,rdi - xor r14,rbx - add rax,r13 - xor r15,rcx - ror r14,28 - add r8,rax - add rax,r15 - mov r13,r8 - add r14,rax - ror r13,23 - mov rax,r14 - mov r12,r9 - ror r14,5 - xor r13,r8 - xor r12,r10 - ror r13,4 - xor r14,rax - and r12,r8 - xor r13,r8 - add r11,QWORD[64+rsp] - mov r15,rax - xor r12,r10 - ror r14,6 - xor r15,rbx - add r11,r12 - ror r13,14 - and rdi,r15 - xor r14,rax - add r11,r13 - xor rdi,rbx - ror r14,28 - add rdx,r11 - add r11,rdi - mov r13,rdx - add r14,r11 - ror r13,23 - mov r11,r14 - mov r12,r8 - ror r14,5 - xor r13,rdx - xor r12,r9 - ror r13,4 - xor r14,r11 - and r12,rdx - xor r13,rdx - add r10,QWORD[72+rsp] - mov rdi,r11 - xor r12,r9 - ror r14,6 - xor rdi,rax - add r10,r12 - ror r13,14 - and r15,rdi - xor r14,r11 - add r10,r13 - xor r15,rax - ror r14,28 - add rcx,r10 - add r10,r15 - mov r13,rcx - add r14,r10 - ror r13,23 - mov r10,r14 - mov r12,rdx - ror r14,5 - xor r13,rcx - xor r12,r8 - ror r13,4 - xor r14,r10 - and r12,rcx - xor r13,rcx - add r9,QWORD[80+rsp] - mov r15,r10 - xor r12,r8 - ror r14,6 - xor r15,r11 - add r9,r12 - ror r13,14 - and rdi,r15 - xor r14,r10 - add r9,r13 - xor rdi,r11 - ror r14,28 - add rbx,r9 - add r9,rdi - mov r13,rbx - add r14,r9 - ror r13,23 - mov r9,r14 - mov r12,rcx - ror r14,5 - xor r13,rbx - xor r12,rdx - ror r13,4 - xor r14,r9 - and r12,rbx - xor r13,rbx - add r8,QWORD[88+rsp] - mov rdi,r9 - xor r12,rdx - ror r14,6 - xor rdi,r10 - add r8,r12 - ror r13,14 - and r15,rdi - xor r14,r9 - add r8,r13 - xor r15,r10 - ror 
r14,28 - add rax,r8 - add r8,r15 - mov r13,rax - add r14,r8 - ror r13,23 - mov r8,r14 - mov r12,rbx - ror r14,5 - xor r13,rax - xor r12,rcx - ror r13,4 - xor r14,r8 - and r12,rax - xor r13,rax - add rdx,QWORD[96+rsp] - mov r15,r8 - xor r12,rcx - ror r14,6 - xor r15,r9 - add rdx,r12 - ror r13,14 - and rdi,r15 - xor r14,r8 - add rdx,r13 - xor rdi,r9 - ror r14,28 - add r11,rdx - add rdx,rdi - mov r13,r11 - add r14,rdx - ror r13,23 - mov rdx,r14 - mov r12,rax - ror r14,5 - xor r13,r11 - xor r12,rbx - ror r13,4 - xor r14,rdx - and r12,r11 - xor r13,r11 - add rcx,QWORD[104+rsp] - mov rdi,rdx - xor r12,rbx - ror r14,6 - xor rdi,r8 - add rcx,r12 - ror r13,14 - and r15,rdi - xor r14,rdx - add rcx,r13 - xor r15,r8 - ror r14,28 - add r10,rcx - add rcx,r15 - mov r13,r10 - add r14,rcx - ror r13,23 - mov rcx,r14 - mov r12,r11 - ror r14,5 - xor r13,r10 - xor r12,rax - ror r13,4 - xor r14,rcx - and r12,r10 - xor r13,r10 - add rbx,QWORD[112+rsp] - mov r15,rcx - xor r12,rax - ror r14,6 - xor r15,rdx - add rbx,r12 - ror r13,14 - and rdi,r15 - xor r14,rcx - add rbx,r13 - xor rdi,rdx - ror r14,28 - add r9,rbx - add rbx,rdi - mov r13,r9 - add r14,rbx - ror r13,23 - mov rbx,r14 - mov r12,r10 - ror r14,5 - xor r13,r9 - xor r12,r11 - ror r13,4 - xor r14,rbx - and r12,r9 - xor r13,r9 - add rax,QWORD[120+rsp] - mov rdi,rbx - xor r12,r11 - ror r14,6 - xor rdi,rcx - add rax,r12 - ror r13,14 - and r15,rdi - xor r14,rbx - add rax,r13 - xor r15,rcx - ror r14,28 - add r8,rax - add rax,r15 - mov r13,r8 - add r14,rax - mov rdi,QWORD[((128+0))+rsp] - mov rax,r14 - - add rax,QWORD[rdi] - lea rsi,[128+rsi] - add rbx,QWORD[8+rdi] - add rcx,QWORD[16+rdi] - add rdx,QWORD[24+rdi] - add r8,QWORD[32+rdi] - add r9,QWORD[40+rdi] - add r10,QWORD[48+rdi] - add r11,QWORD[56+rdi] - - cmp rsi,QWORD[((128+16))+rsp] - - mov QWORD[rdi],rax - mov QWORD[8+rdi],rbx - mov QWORD[16+rdi],rcx - mov QWORD[24+rdi],rdx - mov QWORD[32+rdi],r8 - mov QWORD[40+rdi],r9 - mov QWORD[48+rdi],r10 - mov QWORD[56+rdi],r11 - jb NEAR $L$loop_xop - - mov rsi,QWORD[((128+24))+rsp] - vzeroupper - movaps xmm6,XMMWORD[((128+32))+rsp] - movaps xmm7,XMMWORD[((128+48))+rsp] - movaps xmm8,XMMWORD[((128+64))+rsp] - movaps xmm9,XMMWORD[((128+80))+rsp] - movaps xmm10,XMMWORD[((128+96))+rsp] - movaps xmm11,XMMWORD[((128+112))+rsp] - mov r15,QWORD[((-48))+rsi] - mov r14,QWORD[((-40))+rsi] - mov r13,QWORD[((-32))+rsi] - mov r12,QWORD[((-24))+rsi] - mov rbp,QWORD[((-16))+rsi] - mov rbx,QWORD[((-8))+rsi] - lea rsp,[rsi] -$L$epilogue_xop: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret -$L$SEH_end_sha512_block_data_order_xop: - ALIGN 64 sha512_block_data_order_avx: mov QWORD[8+rsp],rdi ;WIN64 prologue @@ -2930,14 +1847,22 @@ $L$SEH_begin_sha512_block_data_order_avx: mov rdx,r8 + $L$avx_shortcut: mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + shl rdx,4 sub rsp,256 lea rdx,[rdx*8+rsi] @@ -2945,7 +1870,8 @@ $L$avx_shortcut: mov QWORD[((128+0))+rsp],rdi mov QWORD[((128+8))+rsp],rsi mov QWORD[((128+16))+rsp],rdx - mov QWORD[((128+24))+rsp],rax + mov QWORD[152+rsp],rax + movaps XMMWORD[(128+32)+rsp],xmm6 movaps XMMWORD[(128+48)+rsp],xmm7 movaps XMMWORD[(128+64)+rsp],xmm8 @@ -4068,7 +2994,8 @@ $L$avx_00_47: mov QWORD[56+rdi],r11 jb NEAR $L$loop_avx - mov rsi,QWORD[((128+24))+rsp] + mov rsi,QWORD[152+rsp] + vzeroupper movaps xmm6,XMMWORD[((128+32))+rsp] movaps xmm7,XMMWORD[((128+48))+rsp] @@ -4077,16 +3004,24 @@ $L$avx_00_47: movaps xmm10,XMMWORD[((128+96))+rsp] movaps xmm11,XMMWORD[((128+112))+rsp] mov 
r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$epilogue_avx: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha512_block_data_order_avx: EXTERN __imp_RtlVirtualUnwind @@ -4189,9 +3124,6 @@ ALIGN 4 DD $L$SEH_begin_sha512_block_data_order wrt ..imagebase DD $L$SEH_end_sha512_block_data_order wrt ..imagebase DD $L$SEH_info_sha512_block_data_order wrt ..imagebase - DD $L$SEH_begin_sha512_block_data_order_xop wrt ..imagebase - DD $L$SEH_end_sha512_block_data_order_xop wrt ..imagebase - DD $L$SEH_info_sha512_block_data_order_xop wrt ..imagebase DD $L$SEH_begin_sha512_block_data_order_avx wrt ..imagebase DD $L$SEH_end_sha512_block_data_order_avx wrt ..imagebase DD $L$SEH_info_sha512_block_data_order_avx wrt ..imagebase @@ -4201,10 +3133,6 @@ $L$SEH_info_sha512_block_data_order: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase -$L$SEH_info_sha512_block_data_order_xop: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$prologue_xop wrt ..imagebase,$L$epilogue_xop wrt ..imagebase $L$SEH_info_sha512_block_data_order_avx: DB 9,0,0,0 DD se_handler wrt ..imagebase diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/vpaes-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/vpaes-x86_64.asm index 3edde9fdbc..ccfc870a66 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/vpaes-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/vpaes-x86_64.asm @@ -1,7 +1,14 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif section .text code align=64 @@ -23,6 +30,7 @@ section .text code align=64 ALIGN 16 _vpaes_encrypt_core: + mov r9,rdx mov r11,16 mov eax,DWORD[240+rdx] @@ -111,8 +119,185 @@ DB 102,15,56,0,193 + + + + + + + + + + + + + + + + + + + + + + + + + +ALIGN 16 +_vpaes_encrypt_core_2x: + + mov r9,rdx + mov r11,16 + mov eax,DWORD[240+rdx] + movdqa xmm1,xmm9 + movdqa xmm7,xmm9 + movdqa xmm2,XMMWORD[$L$k_ipt] + movdqa xmm8,xmm2 + pandn xmm1,xmm0 + pandn xmm7,xmm6 + movdqu xmm5,XMMWORD[r9] + + psrld xmm1,4 + psrld xmm7,4 + pand xmm0,xmm9 + pand xmm6,xmm9 +DB 102,15,56,0,208 +DB 102,68,15,56,0,198 + movdqa xmm0,XMMWORD[(($L$k_ipt+16))] + movdqa xmm6,xmm0 +DB 102,15,56,0,193 +DB 102,15,56,0,247 + pxor xmm2,xmm5 + pxor xmm8,xmm5 + add r9,16 + pxor xmm0,xmm2 + pxor xmm6,xmm8 + lea r10,[$L$k_mc_backward] + jmp NEAR $L$enc2x_entry + +ALIGN 16 +$L$enc2x_loop: + + movdqa xmm4,XMMWORD[$L$k_sb1] + movdqa xmm0,XMMWORD[(($L$k_sb1+16))] + movdqa xmm12,xmm4 + movdqa xmm6,xmm0 +DB 102,15,56,0,226 +DB 102,69,15,56,0,224 +DB 102,15,56,0,195 +DB 102,65,15,56,0,243 + pxor xmm4,xmm5 + pxor xmm12,xmm5 + movdqa xmm5,XMMWORD[$L$k_sb2] + movdqa xmm13,xmm5 + pxor xmm0,xmm4 + pxor xmm6,xmm12 + movdqa xmm1,XMMWORD[((-64))+r10*1+r11] + +DB 102,15,56,0,234 +DB 102,69,15,56,0,232 + movdqa xmm4,XMMWORD[r10*1+r11] + + movdqa xmm2,XMMWORD[(($L$k_sb2+16))] + movdqa xmm8,xmm2 +DB 102,15,56,0,211 +DB 102,69,15,56,0,195 + movdqa xmm3,xmm0 + movdqa xmm11,xmm6 + pxor xmm2,xmm5 + pxor xmm8,xmm13 +DB 102,15,56,0,193 +DB 102,15,56,0,241 + add r9,16 + pxor xmm0,xmm2 + pxor xmm6,xmm8 +DB 102,15,56,0,220 +DB 102,68,15,56,0,220 + add r11,16 + pxor xmm3,xmm0 + pxor xmm11,xmm6 +DB 102,15,56,0,193 +DB 102,15,56,0,241 + and r11,0x30 + sub rax,1 + pxor xmm0,xmm3 + pxor xmm6,xmm11 + +$L$enc2x_entry: + + movdqa xmm1,xmm9 + movdqa xmm7,xmm9 + movdqa xmm5,XMMWORD[(($L$k_inv+16))] + movdqa xmm13,xmm5 + pandn xmm1,xmm0 + pandn xmm7,xmm6 + psrld xmm1,4 + psrld xmm7,4 + pand xmm0,xmm9 + pand xmm6,xmm9 +DB 102,15,56,0,232 +DB 102,68,15,56,0,238 + movdqa xmm3,xmm10 + movdqa xmm11,xmm10 + pxor xmm0,xmm1 + pxor xmm6,xmm7 +DB 102,15,56,0,217 +DB 102,68,15,56,0,223 + movdqa xmm4,xmm10 + movdqa xmm12,xmm10 + pxor xmm3,xmm5 + pxor xmm11,xmm13 +DB 102,15,56,0,224 +DB 102,68,15,56,0,230 + movdqa xmm2,xmm10 + movdqa xmm8,xmm10 + pxor xmm4,xmm5 + pxor xmm12,xmm13 +DB 102,15,56,0,211 +DB 102,69,15,56,0,195 + movdqa xmm3,xmm10 + movdqa xmm11,xmm10 + pxor xmm2,xmm0 + pxor xmm8,xmm6 +DB 102,15,56,0,220 +DB 102,69,15,56,0,220 + movdqu xmm5,XMMWORD[r9] + + pxor xmm3,xmm1 + pxor xmm11,xmm7 + jnz NEAR $L$enc2x_loop + + + movdqa xmm4,XMMWORD[((-96))+r10] + movdqa xmm0,XMMWORD[((-80))+r10] + movdqa xmm12,xmm4 + movdqa xmm6,xmm0 +DB 102,15,56,0,226 +DB 102,69,15,56,0,224 + pxor xmm4,xmm5 + pxor xmm12,xmm5 +DB 102,15,56,0,195 +DB 102,65,15,56,0,243 + movdqa xmm1,XMMWORD[64+r10*1+r11] + + pxor xmm0,xmm4 + pxor xmm6,xmm12 +DB 102,15,56,0,193 +DB 102,15,56,0,241 + DB 0F3h,0C3h ;repret + + + + + + + + + ALIGN 16 _vpaes_decrypt_core: + mov r9,rdx mov eax,DWORD[240+rdx] movdqa xmm1,xmm9 @@ -217,6 +402,7 @@ DB 102,15,56,0,194 + ALIGN 16 _vpaes_schedule_core: @@ -224,6 +410,7 @@ _vpaes_schedule_core: + call _vpaes_preheat movdqa xmm8,XMMWORD[$L$k_rcon] movdqu xmm0,XMMWORD[rdi] @@ -402,8 +589,10 @@ $L$schedule_mangle_last_dec: + ALIGN 16 _vpaes_schedule_192_smear: + pshufd xmm1,xmm6,0x80 pshufd xmm0,xmm7,0xFE pxor xmm6,xmm1 @@ -431,11 +620,13 @@ 
_vpaes_schedule_192_smear: + ALIGN 16 _vpaes_schedule_round: + pxor xmm1,xmm1 DB 102,65,15,58,15,200,15 DB 102,69,15,58,15,192,15 @@ -500,8 +691,10 @@ DB 102,15,56,0,195 + ALIGN 16 _vpaes_schedule_transform: + movdqa xmm1,xmm9 pandn xmm1,xmm0 psrld xmm1,4 @@ -536,10 +729,12 @@ DB 102,15,56,0,193 + ALIGN 16 _vpaes_schedule_mangle: + movdqa xmm4,xmm0 movdqa xmm5,XMMWORD[$L$k_mc_forward] test rcx,rcx @@ -609,6 +804,7 @@ DB 102,15,56,0,217 + global vpaes_set_encrypt_key ALIGN 16 @@ -622,6 +818,12 @@ $L$SEH_begin_vpaes_set_encrypt_key: mov rdx,r8 + +%ifdef BORINGSSL_DISPATCH_TEST +EXTERN BORINGSSL_function_hit + mov BYTE[((BORINGSSL_function_hit+5))],1 +%endif + lea rsp,[((-184))+rsp] movaps XMMWORD[16+rsp],xmm6 movaps XMMWORD[32+rsp],xmm7 @@ -658,6 +860,7 @@ $L$enc_key_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_vpaes_set_encrypt_key: global vpaes_set_decrypt_key @@ -673,6 +876,7 @@ $L$SEH_begin_vpaes_set_decrypt_key: mov rdx,r8 + lea rsp,[((-184))+rsp] movaps XMMWORD[16+rsp],xmm6 movaps XMMWORD[32+rsp],xmm7 @@ -714,6 +918,7 @@ $L$dec_key_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_vpaes_set_decrypt_key: global vpaes_encrypt @@ -729,6 +934,11 @@ $L$SEH_begin_vpaes_encrypt: mov rdx,r8 + +%ifdef BORINGSSL_DISPATCH_TEST +EXTERN BORINGSSL_function_hit + mov BYTE[((BORINGSSL_function_hit+4))],1 +%endif lea rsp,[((-184))+rsp] movaps XMMWORD[16+rsp],xmm6 movaps XMMWORD[32+rsp],xmm7 @@ -760,6 +970,7 @@ $L$enc_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_vpaes_encrypt: global vpaes_decrypt @@ -775,6 +986,7 @@ $L$SEH_begin_vpaes_decrypt: mov rdx,r8 + lea rsp,[((-184))+rsp] movaps XMMWORD[16+rsp],xmm6 movaps XMMWORD[32+rsp],xmm7 @@ -806,6 +1018,7 @@ $L$dec_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_vpaes_decrypt: global vpaes_cbc_encrypt @@ -823,6 +1036,7 @@ $L$SEH_begin_vpaes_cbc_encrypt: mov r9,QWORD[48+rsp] + xchg rdx,rcx sub rcx,16 jc NEAR $L$cbc_abort @@ -884,7 +1098,107 @@ $L$cbc_abort: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_vpaes_cbc_encrypt: +global vpaes_ctr32_encrypt_blocks + +ALIGN 16 +vpaes_ctr32_encrypt_blocks: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_vpaes_ctr32_encrypt_blocks: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + + + xchg rdx,rcx + test rcx,rcx + jz NEAR $L$ctr32_abort + lea rsp,[((-184))+rsp] + movaps XMMWORD[16+rsp],xmm6 + movaps XMMWORD[32+rsp],xmm7 + movaps XMMWORD[48+rsp],xmm8 + movaps XMMWORD[64+rsp],xmm9 + movaps XMMWORD[80+rsp],xmm10 + movaps XMMWORD[96+rsp],xmm11 + movaps XMMWORD[112+rsp],xmm12 + movaps XMMWORD[128+rsp],xmm13 + movaps XMMWORD[144+rsp],xmm14 + movaps XMMWORD[160+rsp],xmm15 +$L$ctr32_body: + movdqu xmm0,XMMWORD[r8] + movdqa xmm8,XMMWORD[$L$ctr_add_one] + sub rsi,rdi + call _vpaes_preheat + movdqa xmm6,xmm0 + pshufb xmm6,XMMWORD[$L$rev_ctr] + + test rcx,1 + jz NEAR $L$ctr32_prep_loop + + + + movdqu xmm7,XMMWORD[rdi] + call _vpaes_encrypt_core + pxor xmm0,xmm7 + paddd xmm6,xmm8 + movdqu XMMWORD[rdi*1+rsi],xmm0 + sub rcx,1 + lea rdi,[16+rdi] + jz NEAR $L$ctr32_done + +$L$ctr32_prep_loop: + + + movdqa xmm14,xmm6 + movdqa xmm15,xmm6 + paddd xmm15,xmm8 + +$L$ctr32_loop: + movdqa xmm1,XMMWORD[$L$rev_ctr] + movdqa xmm0,xmm14 + movdqa xmm6,xmm15 +DB 102,15,56,0,193 +DB 102,15,56,0,241 + call 
_vpaes_encrypt_core_2x + movdqu xmm1,XMMWORD[rdi] + movdqu xmm2,XMMWORD[16+rdi] + movdqa xmm3,XMMWORD[$L$ctr_add_two] + pxor xmm0,xmm1 + pxor xmm6,xmm2 + paddd xmm14,xmm3 + paddd xmm15,xmm3 + movdqu XMMWORD[rdi*1+rsi],xmm0 + movdqu XMMWORD[16+rdi*1+rsi],xmm6 + sub rcx,2 + lea rdi,[32+rdi] + jnz NEAR $L$ctr32_loop + +$L$ctr32_done: + movaps xmm6,XMMWORD[16+rsp] + movaps xmm7,XMMWORD[32+rsp] + movaps xmm8,XMMWORD[48+rsp] + movaps xmm9,XMMWORD[64+rsp] + movaps xmm10,XMMWORD[80+rsp] + movaps xmm11,XMMWORD[96+rsp] + movaps xmm12,XMMWORD[112+rsp] + movaps xmm13,XMMWORD[128+rsp] + movaps xmm14,XMMWORD[144+rsp] + movaps xmm15,XMMWORD[160+rsp] + lea rsp,[184+rsp] +$L$ctr32_epilogue: +$L$ctr32_abort: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_vpaes_ctr32_encrypt_blocks: @@ -894,6 +1208,7 @@ $L$SEH_end_vpaes_cbc_encrypt: ALIGN 16 _vpaes_preheat: + lea r10,[$L$k_s0F] movdqa xmm10,XMMWORD[((-32))+r10] movdqa xmm11,XMMWORD[((-16))+r10] @@ -910,6 +1225,7 @@ _vpaes_preheat: + ALIGN 64 _vpaes_consts: $L$k_inv: @@ -1005,6 +1321,17 @@ $L$k_dsbe: $L$k_dsbo: DQ 0x1387EA537EF94000,0xC7AA6DB9D4943E2D DQ 0x12D7560F93441D00,0xCA4B8159D8C58E9C + + +$L$rev_ctr: + DQ 0x0706050403020100,0x0c0d0e0f0b0a0908 + + +$L$ctr_add_one: + DQ 0x0000000000000000,0x0000000100000000 +$L$ctr_add_two: + DQ 0x0000000000000000,0x0000000200000000 + DB 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105 DB 111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54 DB 52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97 @@ -1113,6 +1440,10 @@ ALIGN 4 DD $L$SEH_end_vpaes_cbc_encrypt wrt ..imagebase DD $L$SEH_info_vpaes_cbc_encrypt wrt ..imagebase + DD $L$SEH_begin_vpaes_ctr32_encrypt_blocks wrt ..imagebase + DD $L$SEH_end_vpaes_ctr32_encrypt_blocks wrt ..imagebase + DD $L$SEH_info_vpaes_ctr32_encrypt_blocks wrt ..imagebase + section .xdata rdata align=8 ALIGN 8 $L$SEH_info_vpaes_set_encrypt_key: @@ -1135,3 +1466,7 @@ $L$SEH_info_vpaes_cbc_encrypt: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$cbc_body wrt ..imagebase,$L$cbc_epilogue wrt ..imagebase +$L$SEH_info_vpaes_ctr32_encrypt_blocks: +DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$ctr32_body wrt ..imagebase,$L$ctr32_epilogue wrt ..imagebase diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/x86_64-mont.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/x86_64-mont.asm index dd93341d8f..d6d8bdd6d4 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/x86_64-mont.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/x86_64-mont.asm @@ -1,7 +1,14 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif section .text code align=64 @@ -31,6 +38,8 @@ $L$SEH_begin_bn_mul_mont: jnz NEAR $L$mul_enter cmp r9d,8 jb NEAR $L$mul_enter + lea r11,[OPENSSL_ia32cap_P] + mov r11d,DWORD[8+r11] cmp rdx,rsi jne NEAR $L$mul4x_enter test r9d,7 @@ -222,31 +231,30 @@ $L$inner_enter: xor r14,r14 mov rax,QWORD[rsp] - lea rsi,[rsp] mov r15,r9 - jmp NEAR $L$sub + ALIGN 16 -$L$sub: - sbb rax,QWORD[r14*8+rcx] +$L$sub: sbb rax,QWORD[r14*8+rcx] mov QWORD[r14*8+rdi],rax - mov rax,QWORD[8+r14*8+rsi] + mov rax,QWORD[8+r14*8+rsp] lea r14,[1+r14] dec r15 jnz NEAR $L$sub sbb rax,0 + mov rbx,-1 + xor rbx,rax xor r14,r14 - and rsi,rax - not rax - mov rcx,rdi - and rcx,rax mov r15,r9 - or rsi,rcx -ALIGN 16 + $L$copy: - mov rax,QWORD[r14*8+rsi] - mov QWORD[r14*8+rsp],r14 - mov QWORD[r14*8+rdi],rax + mov rcx,QWORD[r14*8+rdi] + mov rdx,QWORD[r14*8+rsp] + and rcx,rbx + and rdx,rax + mov QWORD[r14*8+rsp],r9 + or rdx,rcx + mov QWORD[r14*8+rdi],rdx lea r14,[1+r14] sub r15,1 jnz NEAR $L$copy @@ -294,6 +302,9 @@ $L$SEH_begin_bn_mul4x_mont: mov rax,rsp $L$mul4x_enter: + and r11d,0x80100 + cmp r11d,0x80100 + je NEAR $L$mulx4x_enter push rbx push rbp @@ -631,7 +642,6 @@ $L$inner4x: mov rdi,QWORD[16+r9*8+rsp] lea r15,[((-4))+r9] mov rax,QWORD[rsp] - pxor xmm0,xmm0 mov rdx,QWORD[8+rsp] shr r15,2 lea rsi,[rsp] @@ -641,8 +651,7 @@ $L$inner4x: mov rbx,QWORD[16+rsi] mov rbp,QWORD[24+rsi] sbb rdx,QWORD[8+rcx] - jmp NEAR $L$sub4x -ALIGN 16 + $L$sub4x: mov QWORD[r14*8+rdi],rax mov QWORD[8+r14*8+rdi],rdx @@ -669,34 +678,35 @@ $L$sub4x: sbb rax,0 mov QWORD[24+r14*8+rdi],rbp - xor r14,r14 - and rsi,rax - not rax - mov rcx,rdi - and rcx,rax - lea r15,[((-4))+r9] - or rsi,rcx + pxor xmm0,xmm0 +DB 102,72,15,110,224 + pcmpeqd xmm5,xmm5 + pshufd xmm4,xmm4,0 + mov r15,r9 + pxor xmm5,xmm4 shr r15,2 + xor eax,eax - movdqu xmm1,XMMWORD[rsi] - movdqa XMMWORD[rsp],xmm0 - movdqu XMMWORD[rdi],xmm1 jmp NEAR $L$copy4x ALIGN 16 $L$copy4x: - movdqu xmm2,XMMWORD[16+r14*1+rsi] - movdqu xmm1,XMMWORD[32+r14*1+rsi] - movdqa XMMWORD[16+r14*1+rsp],xmm0 - movdqu XMMWORD[16+r14*1+rdi],xmm2 - movdqa XMMWORD[32+r14*1+rsp],xmm0 - movdqu XMMWORD[32+r14*1+rdi],xmm1 - lea r14,[32+r14] + movdqa xmm1,XMMWORD[rax*1+rsp] + movdqu xmm2,XMMWORD[rax*1+rdi] + pand xmm1,xmm4 + pand xmm2,xmm5 + movdqa xmm3,XMMWORD[16+rax*1+rsp] + movdqa XMMWORD[rax*1+rsp],xmm0 + por xmm1,xmm2 + movdqu xmm2,XMMWORD[16+rax*1+rdi] + movdqu XMMWORD[rax*1+rdi],xmm1 + pand xmm3,xmm4 + pand xmm2,xmm5 + movdqa XMMWORD[16+rax*1+rsp],xmm0 + por xmm3,xmm2 + movdqu XMMWORD[16+rax*1+rdi],xmm3 + lea rax,[32+rax] dec r15 jnz NEAR $L$copy4x - - movdqu xmm2,XMMWORD[16+r14*1+rsi] - movdqa XMMWORD[16+r14*1+rsp],xmm0 - movdqu XMMWORD[16+r14*1+rdi],xmm2 mov rsi,QWORD[8+r9*8+rsp] mov rax,1 @@ -720,6 +730,7 @@ $L$mul4x_epilogue: DB 0F3h,0C3h ;repret $L$SEH_end_bn_mul4x_mont: +EXTERN bn_sqrx8x_internal EXTERN bn_sqr8x_internal @@ -815,6 +826,26 @@ DB 102,72,15,110,209 pxor xmm0,xmm0 DB 102,72,15,110,207 DB 102,73,15,110,218 + lea rax,[OPENSSL_ia32cap_P] + mov eax,DWORD[8+rax] + and eax,0x80100 + cmp eax,0x80100 + jne NEAR $L$sqr8x_nox + + call bn_sqrx8x_internal + + + + + lea rbx,[rcx*1+r8] + mov r9,rcx + mov rdx,rcx +DB 102,72,15,126,207 + sar rcx,3+2 + jmp NEAR $L$sqr8x_sub + +ALIGN 32 +$L$sqr8x_nox: call bn_sqr8x_internal @@ -904,6 +935,376 @@ $L$sqr8x_epilogue: DB 0F3h,0C3h ;repret $L$SEH_end_bn_sqr8x_mont: + +ALIGN 32 +bn_mulx4x_mont: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov 
QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_mulx4x_mont: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + + mov rax,rsp + +$L$mulx4x_enter: + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + +$L$mulx4x_prologue: + + shl r9d,3 + xor r10,r10 + sub r10,r9 + mov r8,QWORD[r8] + lea rbp,[((-72))+r10*1+rsp] + and rbp,-128 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mulx4x_page_walk + jmp NEAR $L$mulx4x_page_walk_done + +ALIGN 16 +$L$mulx4x_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mulx4x_page_walk +$L$mulx4x_page_walk_done: + + lea r10,[r9*1+rdx] + + + + + + + + + + + + + mov QWORD[rsp],r9 + shr r9,5 + mov QWORD[16+rsp],r10 + sub r9,1 + mov QWORD[24+rsp],r8 + mov QWORD[32+rsp],rdi + mov QWORD[40+rsp],rax + + mov QWORD[48+rsp],r9 + jmp NEAR $L$mulx4x_body + +ALIGN 32 +$L$mulx4x_body: + lea rdi,[8+rdx] + mov rdx,QWORD[rdx] + lea rbx,[((64+32))+rsp] + mov r9,rdx + + mulx rax,r8,QWORD[rsi] + mulx r14,r11,QWORD[8+rsi] + add r11,rax + mov QWORD[8+rsp],rdi + mulx r13,r12,QWORD[16+rsi] + adc r12,r14 + adc r13,0 + + mov rdi,r8 + imul r8,QWORD[24+rsp] + xor rbp,rbp + + mulx r14,rax,QWORD[24+rsi] + mov rdx,r8 + lea rsi,[32+rsi] + adcx r13,rax + adcx r14,rbp + + mulx r10,rax,QWORD[rcx] + adcx rdi,rax + adox r10,r11 + mulx r11,rax,QWORD[8+rcx] + adcx r10,rax + adox r11,r12 +DB 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 + mov rdi,QWORD[48+rsp] + mov QWORD[((-32))+rbx],r10 + adcx r11,rax + adox r12,r13 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-24))+rbx],r11 + adcx r12,rax + adox r15,rbp + lea rcx,[32+rcx] + mov QWORD[((-16))+rbx],r12 + + jmp NEAR $L$mulx4x_1st + +ALIGN 32 +$L$mulx4x_1st: + adcx r15,rbp + mulx rax,r10,QWORD[rsi] + adcx r10,r14 + mulx r14,r11,QWORD[8+rsi] + adcx r11,rax + mulx rax,r12,QWORD[16+rsi] + adcx r12,r14 + mulx r14,r13,QWORD[24+rsi] +DB 0x67,0x67 + mov rdx,r8 + adcx r13,rax + adcx r14,rbp + lea rsi,[32+rsi] + lea rbx,[32+rbx] + + adox r10,r15 + mulx r15,rax,QWORD[rcx] + adcx r10,rax + adox r11,r15 + mulx r15,rax,QWORD[8+rcx] + adcx r11,rax + adox r12,r15 + mulx r15,rax,QWORD[16+rcx] + mov QWORD[((-40))+rbx],r10 + adcx r12,rax + mov QWORD[((-32))+rbx],r11 + adox r13,r15 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-24))+rbx],r12 + adcx r13,rax + adox r15,rbp + lea rcx,[32+rcx] + mov QWORD[((-16))+rbx],r13 + + dec rdi + jnz NEAR $L$mulx4x_1st + + mov rax,QWORD[rsp] + mov rdi,QWORD[8+rsp] + adc r15,rbp + add r14,r15 + sbb r15,r15 + mov QWORD[((-8))+rbx],r14 + jmp NEAR $L$mulx4x_outer + +ALIGN 32 +$L$mulx4x_outer: + mov rdx,QWORD[rdi] + lea rdi,[8+rdi] + sub rsi,rax + mov QWORD[rbx],r15 + lea rbx,[((64+32))+rsp] + sub rcx,rax + + mulx r11,r8,QWORD[rsi] + xor ebp,ebp + mov r9,rdx + mulx r12,r14,QWORD[8+rsi] + adox r8,QWORD[((-32))+rbx] + adcx r11,r14 + mulx r13,r15,QWORD[16+rsi] + adox r11,QWORD[((-24))+rbx] + adcx r12,r15 + adox r12,QWORD[((-16))+rbx] + adcx r13,rbp + adox r13,rbp + + mov QWORD[8+rsp],rdi + mov r15,r8 + imul r8,QWORD[24+rsp] + xor ebp,ebp + + mulx r14,rax,QWORD[24+rsi] + mov rdx,r8 + adcx r13,rax + adox r13,QWORD[((-8))+rbx] + adcx r14,rbp + lea rsi,[32+rsi] + adox r14,rbp + + mulx r10,rax,QWORD[rcx] + adcx r15,rax + adox r10,r11 + mulx r11,rax,QWORD[8+rcx] + adcx r10,rax + adox r11,r12 + mulx r12,rax,QWORD[16+rcx] + mov QWORD[((-32))+rbx],r10 + adcx r11,rax + adox r12,r13 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-24))+rbx],r11 + lea 
rcx,[32+rcx] + adcx r12,rax + adox r15,rbp + mov rdi,QWORD[48+rsp] + mov QWORD[((-16))+rbx],r12 + + jmp NEAR $L$mulx4x_inner + +ALIGN 32 +$L$mulx4x_inner: + mulx rax,r10,QWORD[rsi] + adcx r15,rbp + adox r10,r14 + mulx r14,r11,QWORD[8+rsi] + adcx r10,QWORD[rbx] + adox r11,rax + mulx rax,r12,QWORD[16+rsi] + adcx r11,QWORD[8+rbx] + adox r12,r14 + mulx r14,r13,QWORD[24+rsi] + mov rdx,r8 + adcx r12,QWORD[16+rbx] + adox r13,rax + adcx r13,QWORD[24+rbx] + adox r14,rbp + lea rsi,[32+rsi] + lea rbx,[32+rbx] + adcx r14,rbp + + adox r10,r15 + mulx r15,rax,QWORD[rcx] + adcx r10,rax + adox r11,r15 + mulx r15,rax,QWORD[8+rcx] + adcx r11,rax + adox r12,r15 + mulx r15,rax,QWORD[16+rcx] + mov QWORD[((-40))+rbx],r10 + adcx r12,rax + adox r13,r15 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-32))+rbx],r11 + mov QWORD[((-24))+rbx],r12 + adcx r13,rax + adox r15,rbp + lea rcx,[32+rcx] + mov QWORD[((-16))+rbx],r13 + + dec rdi + jnz NEAR $L$mulx4x_inner + + mov rax,QWORD[rsp] + mov rdi,QWORD[8+rsp] + adc r15,rbp + sub rbp,QWORD[rbx] + adc r14,r15 + sbb r15,r15 + mov QWORD[((-8))+rbx],r14 + + cmp rdi,QWORD[16+rsp] + jne NEAR $L$mulx4x_outer + + lea rbx,[64+rsp] + sub rcx,rax + neg r15 + mov rdx,rax + shr rax,3+2 + mov rdi,QWORD[32+rsp] + jmp NEAR $L$mulx4x_sub + +ALIGN 32 +$L$mulx4x_sub: + mov r11,QWORD[rbx] + mov r12,QWORD[8+rbx] + mov r13,QWORD[16+rbx] + mov r14,QWORD[24+rbx] + lea rbx,[32+rbx] + sbb r11,QWORD[rcx] + sbb r12,QWORD[8+rcx] + sbb r13,QWORD[16+rcx] + sbb r14,QWORD[24+rcx] + lea rcx,[32+rcx] + mov QWORD[rdi],r11 + mov QWORD[8+rdi],r12 + mov QWORD[16+rdi],r13 + mov QWORD[24+rdi],r14 + lea rdi,[32+rdi] + dec rax + jnz NEAR $L$mulx4x_sub + + sbb r15,0 + lea rbx,[64+rsp] + sub rdi,rdx + +DB 102,73,15,110,207 + pxor xmm0,xmm0 + pshufd xmm1,xmm1,0 + mov rsi,QWORD[40+rsp] + + jmp NEAR $L$mulx4x_cond_copy + +ALIGN 32 +$L$mulx4x_cond_copy: + movdqa xmm2,XMMWORD[rbx] + movdqa xmm3,XMMWORD[16+rbx] + lea rbx,[32+rbx] + movdqu xmm4,XMMWORD[rdi] + movdqu xmm5,XMMWORD[16+rdi] + lea rdi,[32+rdi] + movdqa XMMWORD[(-32)+rbx],xmm0 + movdqa XMMWORD[(-16)+rbx],xmm0 + pcmpeqd xmm0,xmm1 + pand xmm2,xmm1 + pand xmm3,xmm1 + pand xmm4,xmm0 + pand xmm5,xmm0 + pxor xmm0,xmm0 + por xmm4,xmm2 + por xmm5,xmm3 + movdqu XMMWORD[(-32)+rdi],xmm4 + movdqu XMMWORD[(-16)+rdi],xmm5 + sub rdx,32 + jnz NEAR $L$mulx4x_cond_copy + + mov QWORD[rbx],rdx + + mov rax,1 + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$mulx4x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_bn_mulx4x_mont: DB 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 DB 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 DB 54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83 @@ -1055,6 +1456,9 @@ ALIGN 4 DD $L$SEH_begin_bn_sqr8x_mont wrt ..imagebase DD $L$SEH_end_bn_sqr8x_mont wrt ..imagebase DD $L$SEH_info_bn_sqr8x_mont wrt ..imagebase + DD $L$SEH_begin_bn_mulx4x_mont wrt ..imagebase + DD $L$SEH_end_bn_mulx4x_mont wrt ..imagebase + DD $L$SEH_info_bn_mulx4x_mont wrt ..imagebase section .xdata rdata align=8 ALIGN 8 $L$SEH_info_bn_mul_mont: @@ -1070,3 +1474,8 @@ DB 9,0,0,0 DD sqr_handler wrt ..imagebase DD $L$sqr8x_prologue wrt ..imagebase,$L$sqr8x_body wrt ..imagebase,$L$sqr8x_epilogue wrt ..imagebase ALIGN 8 +$L$SEH_info_bn_mulx4x_mont: +DB 9,0,0,0 + DD sqr_handler wrt ..imagebase + DD $L$mulx4x_prologue wrt ..imagebase,$L$mulx4x_body 
wrt ..imagebase,$L$mulx4x_epilogue wrt ..imagebase +ALIGN 8 diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/x86_64-mont5.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/x86_64-mont5.asm index 1bcbc5d097..7a1d5dbd9c 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/x86_64-mont5.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/x86_64-mont5.asm @@ -1,7 +1,14 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif section .text code align=64 @@ -29,6 +36,8 @@ $L$SEH_begin_bn_mul_mont_gather5: test r9d,7 jnz NEAR $L$mul_enter + lea r11,[OPENSSL_ia32cap_P] + mov r11d,DWORD[8+r11] jmp NEAR $L$mul4x_enter ALIGN 16 @@ -410,8 +419,7 @@ $L$inner_enter: mov r15,r9 jmp NEAR $L$sub ALIGN 16 -$L$sub: - sbb rax,QWORD[r14*8+rcx] +$L$sub: sbb rax,QWORD[r14*8+rcx] mov QWORD[r14*8+rdi],rax mov rax,QWORD[8+r14*8+rsi] lea r14,[1+r14] @@ -419,18 +427,19 @@ $L$sub: jnz NEAR $L$sub sbb rax,0 + mov rbx,-1 + xor rbx,rax xor r14,r14 - and rsi,rax - not rax - mov rcx,rdi - and rcx,rax mov r15,r9 - or rsi,rcx -ALIGN 16 + $L$copy: - mov rax,QWORD[r14*8+rsi] + mov rcx,QWORD[r14*8+rdi] + mov rdx,QWORD[r14*8+rsp] + and rcx,rbx + and rdx,rax mov QWORD[r14*8+rsp],r14 - mov QWORD[r14*8+rdi],rax + or rdx,rcx + mov QWORD[r14*8+rdi],rdx lea r14,[1+r14] sub r15,1 jnz NEAR $L$copy @@ -479,6 +488,9 @@ DB 0x67 mov rax,rsp $L$mul4x_enter: + and r11d,0x80108 + cmp r11d,0x80108 + je NEAR $L$mulx4x_enter push rbx push rbp @@ -579,6 +591,7 @@ $L$SEH_end_bn_mul4x_mont_gather5: ALIGN 32 mul4x_internal: + shl r9,5 movd xmm5,DWORD[56+rax] lea rax,[$L$inc] @@ -1101,6 +1114,7 @@ $L$inner4x: mov r15,QWORD[24+rbp] jmp NEAR $L$sqr4x_sub_entry + global bn_power5 ALIGN 32 @@ -1120,6 +1134,11 @@ $L$SEH_begin_bn_power5: mov rax,rsp + lea r11,[OPENSSL_ia32cap_P] + mov r11d,DWORD[8+r11] + and r11d,0x80108 + cmp r11d,0x80108 + je NEAR $L$powerx5_enter push rbx push rbp @@ -1323,6 +1342,7 @@ __bn_sqr8x_internal: + lea rbp,[32+r10] @@ -2028,8 +2048,10 @@ DB 102,73,15,126,217 DB 0F3h,0C3h ;repret + ALIGN 32 __bn_post4x_internal: + mov r12,QWORD[rbp] lea rbx,[r9*1+rdi] mov rcx,r9 @@ -2081,10 +2103,12 @@ $L$sqr4x_sub_entry: neg r9 DB 0F3h,0C3h ;repret + global bn_from_montgomery ALIGN 32 bn_from_montgomery: + test DWORD[48+rsp],7 jz NEAR bn_from_mont8x xor eax,eax @@ -2092,6 +2116,7 @@ bn_from_montgomery: + ALIGN 32 bn_from_mont8x: mov QWORD[8+rsp],rdi ;WIN64 prologue @@ -2217,6 +2242,22 @@ DB 102,72,15,110,209 DB 0x67 mov rbp,rcx DB 102,73,15,110,218 + lea r11,[OPENSSL_ia32cap_P] + mov r11d,DWORD[8+r11] + and r11d,0x80108 + cmp r11d,0x80108 + jne NEAR $L$from_mont_nox + + lea rdi,[r9*1+rax] + call __bn_sqrx8x_reduction + call __bn_postx4x_internal + + pxor xmm0,xmm0 + lea rax,[48+rsp] + jmp NEAR $L$from_mont_zero + +ALIGN 32 +$L$from_mont_nox: call __bn_sqr8x_reduction call __bn_post4x_internal @@ -2257,10 +2298,1382 @@ $L$from_epilogue: DB 0F3h,0C3h ;repret $L$SEH_end_bn_from_mont8x: + +ALIGN 32 +bn_mulx4x_mont_gather5: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_mulx4x_mont_gather5: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + + mov rax,rsp + +$L$mulx4x_enter: + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + +$L$mulx4x_prologue: + 
+ shl r9d,3 + lea r10,[r9*2+r9] + neg r9 + mov r8,QWORD[r8] + + + + + + + + + + + lea r11,[((-320))+r9*2+rsp] + mov rbp,rsp + sub r11,rdi + and r11,4095 + cmp r10,r11 + jb NEAR $L$mulx4xsp_alt + sub rbp,r11 + lea rbp,[((-320))+r9*2+rbp] + jmp NEAR $L$mulx4xsp_done + +$L$mulx4xsp_alt: + lea r10,[((4096-320))+r9*2] + lea rbp,[((-320))+r9*2+rbp] + sub r11,r10 + mov r10,0 + cmovc r11,r10 + sub rbp,r11 +$L$mulx4xsp_done: + and rbp,-64 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mulx4x_page_walk + jmp NEAR $L$mulx4x_page_walk_done + +$L$mulx4x_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mulx4x_page_walk +$L$mulx4x_page_walk_done: + + + + + + + + + + + + + + mov QWORD[32+rsp],r8 + mov QWORD[40+rsp],rax + +$L$mulx4x_body: + call mulx4x_internal + + mov rsi,QWORD[40+rsp] + + mov rax,1 + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$mulx4x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_bn_mulx4x_mont_gather5: + + +ALIGN 32 +mulx4x_internal: + + mov QWORD[8+rsp],r9 + mov r10,r9 + neg r9 + shl r9,5 + neg r10 + lea r13,[128+r9*1+rdx] + shr r9,5+5 + movd xmm5,DWORD[56+rax] + sub r9,1 + lea rax,[$L$inc] + mov QWORD[((16+8))+rsp],r13 + mov QWORD[((24+8))+rsp],r9 + mov QWORD[((56+8))+rsp],rdi + movdqa xmm0,XMMWORD[rax] + movdqa xmm1,XMMWORD[16+rax] + lea r10,[((88-112))+r10*1+rsp] + lea rdi,[128+rdx] + + pshufd xmm5,xmm5,0 + movdqa xmm4,xmm1 +DB 0x67 + movdqa xmm2,xmm1 +DB 0x67 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[112+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[128+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[144+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[160+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[176+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[192+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[208+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[224+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[240+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[256+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[272+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[288+r10],xmm3 + movdqa xmm3,xmm4 +DB 0x67 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[304+r10],xmm0 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[320+r10],xmm1 + + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[336+r10],xmm2 + + pand xmm0,XMMWORD[64+rdi] + pand xmm1,XMMWORD[80+rdi] + pand xmm2,XMMWORD[96+rdi] + movdqa XMMWORD[352+r10],xmm3 + pand xmm3,XMMWORD[112+rdi] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[((-128))+rdi] + movdqa xmm5,XMMWORD[((-112))+rdi] + movdqa xmm2,XMMWORD[((-96))+rdi] + pand xmm4,XMMWORD[112+r10] + movdqa xmm3,XMMWORD[((-80))+rdi] + pand xmm5,XMMWORD[128+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[144+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[160+r10] + por xmm0,xmm2 + por xmm1,xmm3 + 
movdqa xmm4,XMMWORD[((-64))+rdi] + movdqa xmm5,XMMWORD[((-48))+rdi] + movdqa xmm2,XMMWORD[((-32))+rdi] + pand xmm4,XMMWORD[176+r10] + movdqa xmm3,XMMWORD[((-16))+rdi] + pand xmm5,XMMWORD[192+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[208+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[224+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[rdi] + movdqa xmm5,XMMWORD[16+rdi] + movdqa xmm2,XMMWORD[32+rdi] + pand xmm4,XMMWORD[240+r10] + movdqa xmm3,XMMWORD[48+rdi] + pand xmm5,XMMWORD[256+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[272+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[288+r10] + por xmm0,xmm2 + por xmm1,xmm3 + pxor xmm0,xmm1 + pshufd xmm1,xmm0,0x4e + por xmm0,xmm1 + lea rdi,[256+rdi] +DB 102,72,15,126,194 + lea rbx,[((64+32+8))+rsp] + + mov r9,rdx + mulx rax,r8,QWORD[rsi] + mulx r12,r11,QWORD[8+rsi] + add r11,rax + mulx r13,rax,QWORD[16+rsi] + adc r12,rax + adc r13,0 + mulx r14,rax,QWORD[24+rsi] + + mov r15,r8 + imul r8,QWORD[((32+8))+rsp] + xor rbp,rbp + mov rdx,r8 + + mov QWORD[((8+8))+rsp],rdi + + lea rsi,[32+rsi] + adcx r13,rax + adcx r14,rbp + + mulx r10,rax,QWORD[rcx] + adcx r15,rax + adox r10,r11 + mulx r11,rax,QWORD[8+rcx] + adcx r10,rax + adox r11,r12 + mulx r12,rax,QWORD[16+rcx] + mov rdi,QWORD[((24+8))+rsp] + mov QWORD[((-32))+rbx],r10 + adcx r11,rax + adox r12,r13 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-24))+rbx],r11 + adcx r12,rax + adox r15,rbp + lea rcx,[32+rcx] + mov QWORD[((-16))+rbx],r12 + jmp NEAR $L$mulx4x_1st + +ALIGN 32 +$L$mulx4x_1st: + adcx r15,rbp + mulx rax,r10,QWORD[rsi] + adcx r10,r14 + mulx r14,r11,QWORD[8+rsi] + adcx r11,rax + mulx rax,r12,QWORD[16+rsi] + adcx r12,r14 + mulx r14,r13,QWORD[24+rsi] +DB 0x67,0x67 + mov rdx,r8 + adcx r13,rax + adcx r14,rbp + lea rsi,[32+rsi] + lea rbx,[32+rbx] + + adox r10,r15 + mulx r15,rax,QWORD[rcx] + adcx r10,rax + adox r11,r15 + mulx r15,rax,QWORD[8+rcx] + adcx r11,rax + adox r12,r15 + mulx r15,rax,QWORD[16+rcx] + mov QWORD[((-40))+rbx],r10 + adcx r12,rax + mov QWORD[((-32))+rbx],r11 + adox r13,r15 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-24))+rbx],r12 + adcx r13,rax + adox r15,rbp + lea rcx,[32+rcx] + mov QWORD[((-16))+rbx],r13 + + dec rdi + jnz NEAR $L$mulx4x_1st + + mov rax,QWORD[8+rsp] + adc r15,rbp + lea rsi,[rax*1+rsi] + add r14,r15 + mov rdi,QWORD[((8+8))+rsp] + adc rbp,rbp + mov QWORD[((-8))+rbx],r14 + jmp NEAR $L$mulx4x_outer + +ALIGN 32 +$L$mulx4x_outer: + lea r10,[((16-256))+rbx] + pxor xmm4,xmm4 +DB 0x67,0x67 + pxor xmm5,xmm5 + movdqa xmm0,XMMWORD[((-128))+rdi] + movdqa xmm1,XMMWORD[((-112))+rdi] + movdqa xmm2,XMMWORD[((-96))+rdi] + pand xmm0,XMMWORD[256+r10] + movdqa xmm3,XMMWORD[((-80))+rdi] + pand xmm1,XMMWORD[272+r10] + por xmm4,xmm0 + pand xmm2,XMMWORD[288+r10] + por xmm5,xmm1 + pand xmm3,XMMWORD[304+r10] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[((-64))+rdi] + movdqa xmm1,XMMWORD[((-48))+rdi] + movdqa xmm2,XMMWORD[((-32))+rdi] + pand xmm0,XMMWORD[320+r10] + movdqa xmm3,XMMWORD[((-16))+rdi] + pand xmm1,XMMWORD[336+r10] + por xmm4,xmm0 + pand xmm2,XMMWORD[352+r10] + por xmm5,xmm1 + pand xmm3,XMMWORD[368+r10] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[rdi] + movdqa xmm1,XMMWORD[16+rdi] + movdqa xmm2,XMMWORD[32+rdi] + pand xmm0,XMMWORD[384+r10] + movdqa xmm3,XMMWORD[48+rdi] + pand xmm1,XMMWORD[400+r10] + por xmm4,xmm0 + pand xmm2,XMMWORD[416+r10] + por xmm5,xmm1 + pand xmm3,XMMWORD[432+r10] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[64+rdi] + movdqa xmm1,XMMWORD[80+rdi] + movdqa xmm2,XMMWORD[96+rdi] + pand xmm0,XMMWORD[448+r10] + movdqa 
xmm3,XMMWORD[112+rdi] + pand xmm1,XMMWORD[464+r10] + por xmm4,xmm0 + pand xmm2,XMMWORD[480+r10] + por xmm5,xmm1 + pand xmm3,XMMWORD[496+r10] + por xmm4,xmm2 + por xmm5,xmm3 + por xmm4,xmm5 + pshufd xmm0,xmm4,0x4e + por xmm0,xmm4 + lea rdi,[256+rdi] +DB 102,72,15,126,194 + + mov QWORD[rbx],rbp + lea rbx,[32+rax*1+rbx] + mulx r11,r8,QWORD[rsi] + xor rbp,rbp + mov r9,rdx + mulx r12,r14,QWORD[8+rsi] + adox r8,QWORD[((-32))+rbx] + adcx r11,r14 + mulx r13,r15,QWORD[16+rsi] + adox r11,QWORD[((-24))+rbx] + adcx r12,r15 + mulx r14,rdx,QWORD[24+rsi] + adox r12,QWORD[((-16))+rbx] + adcx r13,rdx + lea rcx,[rax*1+rcx] + lea rsi,[32+rsi] + adox r13,QWORD[((-8))+rbx] + adcx r14,rbp + adox r14,rbp + + mov r15,r8 + imul r8,QWORD[((32+8))+rsp] + + mov rdx,r8 + xor rbp,rbp + mov QWORD[((8+8))+rsp],rdi + + mulx r10,rax,QWORD[rcx] + adcx r15,rax + adox r10,r11 + mulx r11,rax,QWORD[8+rcx] + adcx r10,rax + adox r11,r12 + mulx r12,rax,QWORD[16+rcx] + adcx r11,rax + adox r12,r13 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov rdi,QWORD[((24+8))+rsp] + mov QWORD[((-32))+rbx],r10 + adcx r12,rax + mov QWORD[((-24))+rbx],r11 + adox r15,rbp + mov QWORD[((-16))+rbx],r12 + lea rcx,[32+rcx] + jmp NEAR $L$mulx4x_inner + +ALIGN 32 +$L$mulx4x_inner: + mulx rax,r10,QWORD[rsi] + adcx r15,rbp + adox r10,r14 + mulx r14,r11,QWORD[8+rsi] + adcx r10,QWORD[rbx] + adox r11,rax + mulx rax,r12,QWORD[16+rsi] + adcx r11,QWORD[8+rbx] + adox r12,r14 + mulx r14,r13,QWORD[24+rsi] + mov rdx,r8 + adcx r12,QWORD[16+rbx] + adox r13,rax + adcx r13,QWORD[24+rbx] + adox r14,rbp + lea rsi,[32+rsi] + lea rbx,[32+rbx] + adcx r14,rbp + + adox r10,r15 + mulx r15,rax,QWORD[rcx] + adcx r10,rax + adox r11,r15 + mulx r15,rax,QWORD[8+rcx] + adcx r11,rax + adox r12,r15 + mulx r15,rax,QWORD[16+rcx] + mov QWORD[((-40))+rbx],r10 + adcx r12,rax + adox r13,r15 + mov QWORD[((-32))+rbx],r11 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + lea rcx,[32+rcx] + mov QWORD[((-24))+rbx],r12 + adcx r13,rax + adox r15,rbp + mov QWORD[((-16))+rbx],r13 + + dec rdi + jnz NEAR $L$mulx4x_inner + + mov rax,QWORD[((0+8))+rsp] + adc r15,rbp + sub rdi,QWORD[rbx] + mov rdi,QWORD[((8+8))+rsp] + mov r10,QWORD[((16+8))+rsp] + adc r14,r15 + lea rsi,[rax*1+rsi] + adc rbp,rbp + mov QWORD[((-8))+rbx],r14 + + cmp rdi,r10 + jb NEAR $L$mulx4x_outer + + mov r10,QWORD[((-8))+rcx] + mov r8,rbp + mov r12,QWORD[rax*1+rcx] + lea rbp,[rax*1+rcx] + mov rcx,rax + lea rdi,[rax*1+rbx] + xor eax,eax + xor r15,r15 + sub r10,r14 + adc r15,r15 + or r8,r15 + sar rcx,3+2 + sub rax,r8 + mov rdx,QWORD[((56+8))+rsp] + dec r12 + mov r13,QWORD[8+rbp] + xor r8,r8 + mov r14,QWORD[16+rbp] + mov r15,QWORD[24+rbp] + jmp NEAR $L$sqrx4x_sub_entry + + + +ALIGN 32 +bn_powerx5: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_powerx5: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + + mov rax,rsp + +$L$powerx5_enter: + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + +$L$powerx5_prologue: + + shl r9d,3 + lea r10,[r9*2+r9] + neg r9 + mov r8,QWORD[r8] + + + + + + + + + lea r11,[((-320))+r9*2+rsp] + mov rbp,rsp + sub r11,rdi + and r11,4095 + cmp r10,r11 + jb NEAR $L$pwrx_sp_alt + sub rbp,r11 + lea rbp,[((-320))+r9*2+rbp] + jmp NEAR $L$pwrx_sp_done + +ALIGN 32 +$L$pwrx_sp_alt: + lea r10,[((4096-320))+r9*2] + lea rbp,[((-320))+r9*2+rbp] + sub r11,r10 + mov r10,0 + cmovc r11,r10 + sub rbp,r11 +$L$pwrx_sp_done: + and rbp,-64 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] 
+ cmp rsp,rbp + ja NEAR $L$pwrx_page_walk + jmp NEAR $L$pwrx_page_walk_done + +$L$pwrx_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$pwrx_page_walk +$L$pwrx_page_walk_done: + + mov r10,r9 + neg r9 + + + + + + + + + + + + + pxor xmm0,xmm0 +DB 102,72,15,110,207 +DB 102,72,15,110,209 +DB 102,73,15,110,218 +DB 102,72,15,110,226 + mov QWORD[32+rsp],r8 + mov QWORD[40+rsp],rax + +$L$powerx5_body: + + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + + mov r9,r10 + mov rdi,rsi +DB 102,72,15,126,209 +DB 102,72,15,126,226 + mov rax,QWORD[40+rsp] + + call mulx4x_internal + + mov rsi,QWORD[40+rsp] + + mov rax,1 + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$powerx5_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_bn_powerx5: + +global bn_sqrx8x_internal + + +ALIGN 32 +bn_sqrx8x_internal: +__bn_sqrx8x_internal: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + lea rdi,[((48+8))+rsp] + lea rbp,[r9*1+rsi] + mov QWORD[((0+8))+rsp],r9 + mov QWORD[((8+8))+rsp],rbp + jmp NEAR $L$sqr8x_zero_start + +ALIGN 32 +DB 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 +$L$sqrx8x_zero: +DB 0x3e + movdqa XMMWORD[rdi],xmm0 + movdqa XMMWORD[16+rdi],xmm0 + movdqa XMMWORD[32+rdi],xmm0 + movdqa XMMWORD[48+rdi],xmm0 +$L$sqr8x_zero_start: + movdqa XMMWORD[64+rdi],xmm0 + movdqa XMMWORD[80+rdi],xmm0 + movdqa XMMWORD[96+rdi],xmm0 + movdqa XMMWORD[112+rdi],xmm0 + lea rdi,[128+rdi] + sub r9,64 + jnz NEAR $L$sqrx8x_zero + + mov rdx,QWORD[rsi] + + xor r10,r10 + xor r11,r11 + xor r12,r12 + xor r13,r13 + xor r14,r14 + xor r15,r15 + lea rdi,[((48+8))+rsp] + xor rbp,rbp + jmp NEAR $L$sqrx8x_outer_loop + +ALIGN 32 +$L$sqrx8x_outer_loop: + mulx rax,r8,QWORD[8+rsi] + adcx r8,r9 + adox r10,rax + mulx rax,r9,QWORD[16+rsi] + adcx r9,r10 + adox r11,rax +DB 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 + adcx r10,r11 + adox r12,rax +DB 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 + adcx r11,r12 + adox r13,rax + mulx rax,r12,QWORD[40+rsi] + adcx r12,r13 + adox r14,rax + mulx rax,r13,QWORD[48+rsi] + adcx r13,r14 + adox rax,r15 + mulx r15,r14,QWORD[56+rsi] + mov rdx,QWORD[8+rsi] + adcx r14,rax + adox r15,rbp + adc r15,QWORD[64+rdi] + mov QWORD[8+rdi],r8 + mov QWORD[16+rdi],r9 + sbb rcx,rcx + xor rbp,rbp + + + mulx rbx,r8,QWORD[16+rsi] + mulx rax,r9,QWORD[24+rsi] + adcx r8,r10 + adox r9,rbx + mulx rbx,r10,QWORD[32+rsi] + adcx r9,r11 + adox r10,rax +DB 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 + adcx r10,r12 + adox r11,rbx +DB 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 + adcx r11,r13 + adox r12,r14 +DB 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 + mov rdx,QWORD[16+rsi] + adcx r12,rax + adox r13,rbx + adcx r13,r15 + adox r14,rbp + adcx r14,rbp + + mov QWORD[24+rdi],r8 + mov QWORD[32+rdi],r9 + + mulx rbx,r8,QWORD[24+rsi] + mulx rax,r9,QWORD[32+rsi] + adcx r8,r10 + adox r9,rbx + mulx rbx,r10,QWORD[40+rsi] + adcx r9,r11 + adox r10,rax +DB 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 + adcx r10,r12 + adox r11,r13 +DB 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 +DB 0x3e + mov rdx,QWORD[24+rsi] + adcx r11,rbx + adox 
r12,rax + adcx r12,r14 + mov QWORD[40+rdi],r8 + mov QWORD[48+rdi],r9 + mulx rax,r8,QWORD[32+rsi] + adox r13,rbp + adcx r13,rbp + + mulx rbx,r9,QWORD[40+rsi] + adcx r8,r10 + adox r9,rax + mulx rax,r10,QWORD[48+rsi] + adcx r9,r11 + adox r10,r12 + mulx r12,r11,QWORD[56+rsi] + mov rdx,QWORD[32+rsi] + mov r14,QWORD[40+rsi] + adcx r10,rbx + adox r11,rax + mov r15,QWORD[48+rsi] + adcx r11,r13 + adox r12,rbp + adcx r12,rbp + + mov QWORD[56+rdi],r8 + mov QWORD[64+rdi],r9 + + mulx rax,r9,r14 + mov r8,QWORD[56+rsi] + adcx r9,r10 + mulx rbx,r10,r15 + adox r10,rax + adcx r10,r11 + mulx rax,r11,r8 + mov rdx,r14 + adox r11,rbx + adcx r11,r12 + + adcx rax,rbp + + mulx rbx,r14,r15 + mulx r13,r12,r8 + mov rdx,r15 + lea rsi,[64+rsi] + adcx r11,r14 + adox r12,rbx + adcx r12,rax + adox r13,rbp + +DB 0x67,0x67 + mulx r14,r8,r8 + adcx r13,r8 + adcx r14,rbp + + cmp rsi,QWORD[((8+8))+rsp] + je NEAR $L$sqrx8x_outer_break + + neg rcx + mov rcx,-8 + mov r15,rbp + mov r8,QWORD[64+rdi] + adcx r9,QWORD[72+rdi] + adcx r10,QWORD[80+rdi] + adcx r11,QWORD[88+rdi] + adc r12,QWORD[96+rdi] + adc r13,QWORD[104+rdi] + adc r14,QWORD[112+rdi] + adc r15,QWORD[120+rdi] + lea rbp,[rsi] + lea rdi,[128+rdi] + sbb rax,rax + + mov rdx,QWORD[((-64))+rsi] + mov QWORD[((16+8))+rsp],rax + mov QWORD[((24+8))+rsp],rdi + + + xor eax,eax + jmp NEAR $L$sqrx8x_loop + +ALIGN 32 +$L$sqrx8x_loop: + mov rbx,r8 + mulx r8,rax,QWORD[rbp] + adcx rbx,rax + adox r8,r9 + + mulx r9,rax,QWORD[8+rbp] + adcx r8,rax + adox r9,r10 + + mulx r10,rax,QWORD[16+rbp] + adcx r9,rax + adox r10,r11 + + mulx r11,rax,QWORD[24+rbp] + adcx r10,rax + adox r11,r12 + +DB 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcx r11,rax + adox r12,r13 + + mulx r13,rax,QWORD[40+rbp] + adcx r12,rax + adox r13,r14 + + mulx r14,rax,QWORD[48+rbp] + mov QWORD[rcx*8+rdi],rbx + mov ebx,0 + adcx r13,rax + adox r14,r15 + +DB 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 + mov rdx,QWORD[8+rcx*8+rsi] + adcx r14,rax + adox r15,rbx + adcx r15,rbx + +DB 0x67 + inc rcx + jnz NEAR $L$sqrx8x_loop + + lea rbp,[64+rbp] + mov rcx,-8 + cmp rbp,QWORD[((8+8))+rsp] + je NEAR $L$sqrx8x_break + + sub rbx,QWORD[((16+8))+rsp] +DB 0x66 + mov rdx,QWORD[((-64))+rsi] + adcx r8,QWORD[rdi] + adcx r9,QWORD[8+rdi] + adc r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + lea rdi,[64+rdi] +DB 0x67 + sbb rax,rax + xor ebx,ebx + mov QWORD[((16+8))+rsp],rax + jmp NEAR $L$sqrx8x_loop + +ALIGN 32 +$L$sqrx8x_break: + xor rbp,rbp + sub rbx,QWORD[((16+8))+rsp] + adcx r8,rbp + mov rcx,QWORD[((24+8))+rsp] + adcx r9,rbp + mov rdx,QWORD[rsi] + adc r10,0 + mov QWORD[rdi],r8 + adc r11,0 + adc r12,0 + adc r13,0 + adc r14,0 + adc r15,0 + cmp rdi,rcx + je NEAR $L$sqrx8x_outer_loop + + mov QWORD[8+rdi],r9 + mov r9,QWORD[8+rcx] + mov QWORD[16+rdi],r10 + mov r10,QWORD[16+rcx] + mov QWORD[24+rdi],r11 + mov r11,QWORD[24+rcx] + mov QWORD[32+rdi],r12 + mov r12,QWORD[32+rcx] + mov QWORD[40+rdi],r13 + mov r13,QWORD[40+rcx] + mov QWORD[48+rdi],r14 + mov r14,QWORD[48+rcx] + mov QWORD[56+rdi],r15 + mov r15,QWORD[56+rcx] + mov rdi,rcx + jmp NEAR $L$sqrx8x_outer_loop + +ALIGN 32 +$L$sqrx8x_outer_break: + mov QWORD[72+rdi],r9 +DB 102,72,15,126,217 + mov QWORD[80+rdi],r10 + mov QWORD[88+rdi],r11 + mov QWORD[96+rdi],r12 + mov QWORD[104+rdi],r13 + mov QWORD[112+rdi],r14 + lea rdi,[((48+8))+rsp] + mov rdx,QWORD[rcx*1+rsi] + + mov r11,QWORD[8+rdi] + xor r10,r10 + mov r9,QWORD[((0+8))+rsp] + adox r11,r11 + mov r12,QWORD[16+rdi] + mov r13,QWORD[24+rdi] + + +ALIGN 32 
+$L$sqrx4x_shift_n_add: + mulx rbx,rax,rdx + adox r12,r12 + adcx rax,r10 +DB 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 +DB 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 + adox r13,r13 + adcx rbx,r11 + mov r11,QWORD[40+rdi] + mov QWORD[rdi],rax + mov QWORD[8+rdi],rbx + + mulx rbx,rax,rdx + adox r10,r10 + adcx rax,r12 + mov rdx,QWORD[16+rcx*1+rsi] + mov r12,QWORD[48+rdi] + adox r11,r11 + adcx rbx,r13 + mov r13,QWORD[56+rdi] + mov QWORD[16+rdi],rax + mov QWORD[24+rdi],rbx + + mulx rbx,rax,rdx + adox r12,r12 + adcx rax,r10 + mov rdx,QWORD[24+rcx*1+rsi] + lea rcx,[32+rcx] + mov r10,QWORD[64+rdi] + adox r13,r13 + adcx rbx,r11 + mov r11,QWORD[72+rdi] + mov QWORD[32+rdi],rax + mov QWORD[40+rdi],rbx + + mulx rbx,rax,rdx + adox r10,r10 + adcx rax,r12 + jrcxz $L$sqrx4x_shift_n_add_break +DB 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 + adox r11,r11 + adcx rbx,r13 + mov r12,QWORD[80+rdi] + mov r13,QWORD[88+rdi] + mov QWORD[48+rdi],rax + mov QWORD[56+rdi],rbx + lea rdi,[64+rdi] + nop + jmp NEAR $L$sqrx4x_shift_n_add + +ALIGN 32 +$L$sqrx4x_shift_n_add_break: + adcx rbx,r13 + mov QWORD[48+rdi],rax + mov QWORD[56+rdi],rbx + lea rdi,[64+rdi] +DB 102,72,15,126,213 +__bn_sqrx8x_reduction: + xor eax,eax + mov rbx,QWORD[((32+8))+rsp] + mov rdx,QWORD[((48+8))+rsp] + lea rcx,[((-64))+r9*1+rbp] + + mov QWORD[((0+8))+rsp],rcx + mov QWORD[((8+8))+rsp],rdi + + lea rdi,[((48+8))+rsp] + jmp NEAR $L$sqrx8x_reduction_loop + +ALIGN 32 +$L$sqrx8x_reduction_loop: + mov r9,QWORD[8+rdi] + mov r10,QWORD[16+rdi] + mov r11,QWORD[24+rdi] + mov r12,QWORD[32+rdi] + mov r8,rdx + imul rdx,rbx + mov r13,QWORD[40+rdi] + mov r14,QWORD[48+rdi] + mov r15,QWORD[56+rdi] + mov QWORD[((24+8))+rsp],rax + + lea rdi,[64+rdi] + xor rsi,rsi + mov rcx,-8 + jmp NEAR $L$sqrx8x_reduce + +ALIGN 32 +$L$sqrx8x_reduce: + mov rbx,r8 + mulx r8,rax,QWORD[rbp] + adcx rax,rbx + adox r8,r9 + + mulx r9,rbx,QWORD[8+rbp] + adcx r8,rbx + adox r9,r10 + + mulx r10,rbx,QWORD[16+rbp] + adcx r9,rbx + adox r10,r11 + + mulx r11,rbx,QWORD[24+rbp] + adcx r10,rbx + adox r11,r12 + +DB 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 + mov rax,rdx + mov rdx,r8 + adcx r11,rbx + adox r12,r13 + + mulx rdx,rbx,QWORD[((32+8))+rsp] + mov rdx,rax + mov QWORD[((64+48+8))+rcx*8+rsp],rax + + mulx r13,rax,QWORD[40+rbp] + adcx r12,rax + adox r13,r14 + + mulx r14,rax,QWORD[48+rbp] + adcx r13,rax + adox r14,r15 + + mulx r15,rax,QWORD[56+rbp] + mov rdx,rbx + adcx r14,rax + adox r15,rsi + adcx r15,rsi + +DB 0x67,0x67,0x67 + inc rcx + jnz NEAR $L$sqrx8x_reduce + + mov rax,rsi + cmp rbp,QWORD[((0+8))+rsp] + jae NEAR $L$sqrx8x_no_tail + + mov rdx,QWORD[((48+8))+rsp] + add r8,QWORD[rdi] + lea rbp,[64+rbp] + mov rcx,-8 + adcx r9,QWORD[8+rdi] + adcx r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + lea rdi,[64+rdi] + sbb rax,rax + + xor rsi,rsi + mov QWORD[((16+8))+rsp],rax + jmp NEAR $L$sqrx8x_tail + +ALIGN 32 +$L$sqrx8x_tail: + mov rbx,r8 + mulx r8,rax,QWORD[rbp] + adcx rbx,rax + adox r8,r9 + + mulx r9,rax,QWORD[8+rbp] + adcx r8,rax + adox r9,r10 + + mulx r10,rax,QWORD[16+rbp] + adcx r9,rax + adox r10,r11 + + mulx r11,rax,QWORD[24+rbp] + adcx r10,rax + adox r11,r12 + +DB 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcx r11,rax + adox r12,r13 + + mulx r13,rax,QWORD[40+rbp] + adcx r12,rax + adox r13,r14 + + mulx r14,rax,QWORD[48+rbp] + adcx r13,rax + adox r14,r15 + + mulx r15,rax,QWORD[56+rbp] + mov rdx,QWORD[((72+48+8))+rcx*8+rsp] + adcx r14,rax + adox r15,rsi + mov QWORD[rcx*8+rdi],rbx + mov rbx,r8 + adcx r15,rsi + + inc rcx 
+ jnz NEAR $L$sqrx8x_tail + + cmp rbp,QWORD[((0+8))+rsp] + jae NEAR $L$sqrx8x_tail_done + + sub rsi,QWORD[((16+8))+rsp] + mov rdx,QWORD[((48+8))+rsp] + lea rbp,[64+rbp] + adc r8,QWORD[rdi] + adc r9,QWORD[8+rdi] + adc r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + lea rdi,[64+rdi] + sbb rax,rax + sub rcx,8 + + xor rsi,rsi + mov QWORD[((16+8))+rsp],rax + jmp NEAR $L$sqrx8x_tail + +ALIGN 32 +$L$sqrx8x_tail_done: + xor rax,rax + add r8,QWORD[((24+8))+rsp] + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + adc r14,0 + adc r15,0 + adc rax,0 + + sub rsi,QWORD[((16+8))+rsp] +$L$sqrx8x_no_tail: + adc r8,QWORD[rdi] +DB 102,72,15,126,217 + adc r9,QWORD[8+rdi] + mov rsi,QWORD[56+rbp] +DB 102,72,15,126,213 + adc r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + adc rax,0 + + mov rbx,QWORD[((32+8))+rsp] + mov rdx,QWORD[64+rcx*1+rdi] + + mov QWORD[rdi],r8 + lea r8,[64+rdi] + mov QWORD[8+rdi],r9 + mov QWORD[16+rdi],r10 + mov QWORD[24+rdi],r11 + mov QWORD[32+rdi],r12 + mov QWORD[40+rdi],r13 + mov QWORD[48+rdi],r14 + mov QWORD[56+rdi],r15 + + lea rdi,[64+rcx*1+rdi] + cmp r8,QWORD[((8+8))+rsp] + jb NEAR $L$sqrx8x_reduction_loop + DB 0F3h,0C3h ;repret + + +ALIGN 32 + +__bn_postx4x_internal: + + mov r12,QWORD[rbp] + mov r10,rcx + mov r9,rcx + neg rax + sar rcx,3+2 + +DB 102,72,15,126,202 +DB 102,72,15,126,206 + dec r12 + mov r13,QWORD[8+rbp] + xor r8,r8 + mov r14,QWORD[16+rbp] + mov r15,QWORD[24+rbp] + jmp NEAR $L$sqrx4x_sub_entry + +ALIGN 16 +$L$sqrx4x_sub: + mov r12,QWORD[rbp] + mov r13,QWORD[8+rbp] + mov r14,QWORD[16+rbp] + mov r15,QWORD[24+rbp] +$L$sqrx4x_sub_entry: + andn r12,r12,rax + lea rbp,[32+rbp] + andn r13,r13,rax + andn r14,r14,rax + andn r15,r15,rax + + neg r8 + adc r12,QWORD[rdi] + adc r13,QWORD[8+rdi] + adc r14,QWORD[16+rdi] + adc r15,QWORD[24+rdi] + mov QWORD[rdx],r12 + lea rdi,[32+rdi] + mov QWORD[8+rdx],r13 + sbb r8,r8 + mov QWORD[16+rdx],r14 + mov QWORD[24+rdx],r15 + lea rdx,[32+rdx] + + inc rcx + jnz NEAR $L$sqrx4x_sub + + neg r9 + + DB 0F3h,0C3h ;repret + + global bn_scatter5 ALIGN 16 bn_scatter5: + cmp edx,0 jz NEAR $L$scatter_epilogue lea r8,[r9*8+r8] @@ -2275,13 +3688,16 @@ $L$scatter_epilogue: DB 0F3h,0C3h ;repret + global bn_gather5 ALIGN 32 bn_gather5: + $L$SEH_begin_bn_gather5: DB 0x4c,0x8d,0x14,0x24 + DB 0x48,0x81,0xec,0x08,0x01,0x00,0x00 lea rax,[$L$inc] and rsp,-16 @@ -2435,9 +3851,11 @@ $L$gather: jnz NEAR $L$gather lea rsp,[r10] + DB 0F3h,0C3h ;repret $L$SEH_end_bn_gather5: + ALIGN 64 $L$inc: DD 0,0,1,1 @@ -2568,6 +3986,13 @@ ALIGN 4 DD $L$SEH_begin_bn_from_mont8x wrt ..imagebase DD $L$SEH_end_bn_from_mont8x wrt ..imagebase DD $L$SEH_info_bn_from_mont8x wrt ..imagebase + DD $L$SEH_begin_bn_mulx4x_mont_gather5 wrt ..imagebase + DD $L$SEH_end_bn_mulx4x_mont_gather5 wrt ..imagebase + DD $L$SEH_info_bn_mulx4x_mont_gather5 wrt ..imagebase + + DD $L$SEH_begin_bn_powerx5 wrt ..imagebase + DD $L$SEH_end_bn_powerx5 wrt ..imagebase + DD $L$SEH_info_bn_powerx5 wrt ..imagebase DD $L$SEH_begin_bn_gather5 wrt ..imagebase DD $L$SEH_end_bn_gather5 wrt ..imagebase DD $L$SEH_info_bn_gather5 wrt ..imagebase @@ -2594,6 +4019,16 @@ DB 9,0,0,0 DD mul_handler wrt ..imagebase DD $L$from_prologue wrt ..imagebase,$L$from_body wrt ..imagebase,$L$from_epilogue wrt ..imagebase ALIGN 8 +$L$SEH_info_bn_mulx4x_mont_gather5: +DB 9,0,0,0 + DD mul_handler wrt ..imagebase + DD $L$mulx4x_prologue wrt 
..imagebase,$L$mulx4x_body wrt ..imagebase,$L$mulx4x_epilogue wrt ..imagebase +ALIGN 8 +$L$SEH_info_bn_powerx5: +DB 9,0,0,0 + DD mul_handler wrt ..imagebase + DD $L$powerx5_prologue wrt ..imagebase,$L$powerx5_body wrt ..imagebase,$L$powerx5_epilogue wrt ..imagebase +ALIGN 8 $L$SEH_info_bn_gather5: DB 0x01,0x0b,0x03,0x0a DB 0x0b,0x01,0x21,0x00 diff --git a/packager/third_party/boringssl/win-x86_64/crypto/test/trampoline-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/test/trampoline-x86_64.asm new file mode 100644 index 0000000000..99006695ad --- /dev/null +++ b/packager/third_party/boringssl/win-x86_64/crypto/test/trampoline-x86_64.asm @@ -0,0 +1,682 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif +section .text code align=64 + + + + + + + + + + +global abi_test_trampoline +ALIGN 16 +abi_test_trampoline: +$L$abi_test_trampoline_seh_begin: + + + + + + + + + + + sub rsp,344 + +$L$abi_test_trampoline_seh_prolog_alloc: + mov QWORD[112+rsp],rbx + +$L$abi_test_trampoline_seh_prolog_rbx: + mov QWORD[120+rsp],rbp + +$L$abi_test_trampoline_seh_prolog_rbp: + mov QWORD[128+rsp],rdi + +$L$abi_test_trampoline_seh_prolog_rdi: + mov QWORD[136+rsp],rsi + +$L$abi_test_trampoline_seh_prolog_rsi: + mov QWORD[144+rsp],r12 + +$L$abi_test_trampoline_seh_prolog_r12: + mov QWORD[152+rsp],r13 + +$L$abi_test_trampoline_seh_prolog_r13: + mov QWORD[160+rsp],r14 + +$L$abi_test_trampoline_seh_prolog_r14: + mov QWORD[168+rsp],r15 + +$L$abi_test_trampoline_seh_prolog_r15: + movdqa XMMWORD[176+rsp],xmm6 + +$L$abi_test_trampoline_seh_prolog_xmm6: + movdqa XMMWORD[192+rsp],xmm7 + +$L$abi_test_trampoline_seh_prolog_xmm7: + movdqa XMMWORD[208+rsp],xmm8 + +$L$abi_test_trampoline_seh_prolog_xmm8: + movdqa XMMWORD[224+rsp],xmm9 + +$L$abi_test_trampoline_seh_prolog_xmm9: + movdqa XMMWORD[240+rsp],xmm10 + +$L$abi_test_trampoline_seh_prolog_xmm10: + movdqa XMMWORD[256+rsp],xmm11 + +$L$abi_test_trampoline_seh_prolog_xmm11: + movdqa XMMWORD[272+rsp],xmm12 + +$L$abi_test_trampoline_seh_prolog_xmm12: + movdqa XMMWORD[288+rsp],xmm13 + +$L$abi_test_trampoline_seh_prolog_xmm13: + movdqa XMMWORD[304+rsp],xmm14 + +$L$abi_test_trampoline_seh_prolog_xmm14: + movdqa XMMWORD[320+rsp],xmm15 + +$L$abi_test_trampoline_seh_prolog_xmm15: +$L$abi_test_trampoline_seh_prolog_end: + mov rbx,QWORD[rdx] + mov rbp,QWORD[8+rdx] + mov rdi,QWORD[16+rdx] + mov rsi,QWORD[24+rdx] + mov r12,QWORD[32+rdx] + mov r13,QWORD[40+rdx] + mov r14,QWORD[48+rdx] + mov r15,QWORD[56+rdx] + movdqa xmm6,XMMWORD[64+rdx] + movdqa xmm7,XMMWORD[80+rdx] + movdqa xmm8,XMMWORD[96+rdx] + movdqa xmm9,XMMWORD[112+rdx] + movdqa xmm10,XMMWORD[128+rdx] + movdqa xmm11,XMMWORD[144+rdx] + movdqa xmm12,XMMWORD[160+rdx] + movdqa xmm13,XMMWORD[176+rdx] + movdqa xmm14,XMMWORD[192+rdx] + movdqa xmm15,XMMWORD[208+rdx] + + mov QWORD[88+rsp],rcx + mov QWORD[96+rsp],rdx + + + + + mov r10,r8 + mov r11,r9 + dec r11 + js NEAR $L$args_done + mov rcx,QWORD[r10] + add r10,8 + dec r11 + js NEAR $L$args_done + mov rdx,QWORD[r10] + add r10,8 + dec r11 + js NEAR $L$args_done + mov r8,QWORD[r10] + add r10,8 + dec r11 + js NEAR $L$args_done + mov r9,QWORD[r10] + add r10,8 + lea rax,[32+rsp] +$L$args_loop: + dec r11 + js NEAR $L$args_done + + + + + + + mov QWORD[104+rsp],r11 + mov r11,QWORD[r10] + mov QWORD[rax],r11 + mov r11,QWORD[104+rsp] + + add r10,8 + add rax,8 + jmp NEAR 
$L$args_loop + +$L$args_done: + mov rax,QWORD[88+rsp] + mov r10,QWORD[384+rsp] + test r10,r10 + jz NEAR $L$no_unwind + + + pushfq + or QWORD[rsp],0x100 + popfq + + + + nop +global abi_test_unwind_start +abi_test_unwind_start: + + call rax +global abi_test_unwind_return +abi_test_unwind_return: + + + + + pushfq + and QWORD[rsp],-0x101 + popfq +global abi_test_unwind_stop +abi_test_unwind_stop: + + jmp NEAR $L$call_done + +$L$no_unwind: + call rax + +$L$call_done: + + mov rdx,QWORD[96+rsp] + mov QWORD[rdx],rbx + mov QWORD[8+rdx],rbp + mov QWORD[16+rdx],rdi + mov QWORD[24+rdx],rsi + mov QWORD[32+rdx],r12 + mov QWORD[40+rdx],r13 + mov QWORD[48+rdx],r14 + mov QWORD[56+rdx],r15 + movdqa XMMWORD[64+rdx],xmm6 + movdqa XMMWORD[80+rdx],xmm7 + movdqa XMMWORD[96+rdx],xmm8 + movdqa XMMWORD[112+rdx],xmm9 + movdqa XMMWORD[128+rdx],xmm10 + movdqa XMMWORD[144+rdx],xmm11 + movdqa XMMWORD[160+rdx],xmm12 + movdqa XMMWORD[176+rdx],xmm13 + movdqa XMMWORD[192+rdx],xmm14 + movdqa XMMWORD[208+rdx],xmm15 + mov rbx,QWORD[112+rsp] + + mov rbp,QWORD[120+rsp] + + mov rdi,QWORD[128+rsp] + + mov rsi,QWORD[136+rsp] + + mov r12,QWORD[144+rsp] + + mov r13,QWORD[152+rsp] + + mov r14,QWORD[160+rsp] + + mov r15,QWORD[168+rsp] + + movdqa xmm6,XMMWORD[176+rsp] + + movdqa xmm7,XMMWORD[192+rsp] + + movdqa xmm8,XMMWORD[208+rsp] + + movdqa xmm9,XMMWORD[224+rsp] + + movdqa xmm10,XMMWORD[240+rsp] + + movdqa xmm11,XMMWORD[256+rsp] + + movdqa xmm12,XMMWORD[272+rsp] + + movdqa xmm13,XMMWORD[288+rsp] + + movdqa xmm14,XMMWORD[304+rsp] + + movdqa xmm15,XMMWORD[320+rsp] + + add rsp,344 + + + + DB 0F3h,0C3h ;repret + +$L$abi_test_trampoline_seh_end: + + +global abi_test_clobber_rax +ALIGN 16 +abi_test_clobber_rax: + xor rax,rax + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_rbx +ALIGN 16 +abi_test_clobber_rbx: + xor rbx,rbx + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_rcx +ALIGN 16 +abi_test_clobber_rcx: + xor rcx,rcx + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_rdx +ALIGN 16 +abi_test_clobber_rdx: + xor rdx,rdx + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_rdi +ALIGN 16 +abi_test_clobber_rdi: + xor rdi,rdi + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_rsi +ALIGN 16 +abi_test_clobber_rsi: + xor rsi,rsi + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_rbp +ALIGN 16 +abi_test_clobber_rbp: + xor rbp,rbp + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_r8 +ALIGN 16 +abi_test_clobber_r8: + xor r8,r8 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_r9 +ALIGN 16 +abi_test_clobber_r9: + xor r9,r9 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_r10 +ALIGN 16 +abi_test_clobber_r10: + xor r10,r10 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_r11 +ALIGN 16 +abi_test_clobber_r11: + xor r11,r11 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_r12 +ALIGN 16 +abi_test_clobber_r12: + xor r12,r12 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_r13 +ALIGN 16 +abi_test_clobber_r13: + xor r13,r13 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_r14 +ALIGN 16 +abi_test_clobber_r14: + xor r14,r14 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_r15 +ALIGN 16 +abi_test_clobber_r15: + xor r15,r15 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_xmm0 +ALIGN 16 +abi_test_clobber_xmm0: + pxor xmm0,xmm0 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_xmm1 +ALIGN 16 +abi_test_clobber_xmm1: + pxor xmm1,xmm1 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_xmm2 +ALIGN 16 +abi_test_clobber_xmm2: + pxor xmm2,xmm2 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_xmm3 +ALIGN 16 +abi_test_clobber_xmm3: + 
pxor xmm3,xmm3 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_xmm4 +ALIGN 16 +abi_test_clobber_xmm4: + pxor xmm4,xmm4 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_xmm5 +ALIGN 16 +abi_test_clobber_xmm5: + pxor xmm5,xmm5 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_xmm6 +ALIGN 16 +abi_test_clobber_xmm6: + pxor xmm6,xmm6 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_xmm7 +ALIGN 16 +abi_test_clobber_xmm7: + pxor xmm7,xmm7 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_xmm8 +ALIGN 16 +abi_test_clobber_xmm8: + pxor xmm8,xmm8 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_xmm9 +ALIGN 16 +abi_test_clobber_xmm9: + pxor xmm9,xmm9 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_xmm10 +ALIGN 16 +abi_test_clobber_xmm10: + pxor xmm10,xmm10 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_xmm11 +ALIGN 16 +abi_test_clobber_xmm11: + pxor xmm11,xmm11 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_xmm12 +ALIGN 16 +abi_test_clobber_xmm12: + pxor xmm12,xmm12 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_xmm13 +ALIGN 16 +abi_test_clobber_xmm13: + pxor xmm13,xmm13 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_xmm14 +ALIGN 16 +abi_test_clobber_xmm14: + pxor xmm14,xmm14 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_xmm15 +ALIGN 16 +abi_test_clobber_xmm15: + pxor xmm15,xmm15 + DB 0F3h,0C3h ;repret + + + + + +global abi_test_bad_unwind_wrong_register +ALIGN 16 +abi_test_bad_unwind_wrong_register: + +$L$abi_test_bad_unwind_wrong_register_seh_begin: + push r12 + +$L$abi_test_bad_unwind_wrong_register_seh_push_r13: + + + + nop + pop r12 + + DB 0F3h,0C3h ;repret +$L$abi_test_bad_unwind_wrong_register_seh_end: + + + + + + + +global abi_test_bad_unwind_temporary +ALIGN 16 +abi_test_bad_unwind_temporary: + +$L$abi_test_bad_unwind_temporary_seh_begin: + push r12 + +$L$abi_test_bad_unwind_temporary_seh_push_r12: + + mov rax,r12 + inc rax + mov QWORD[rsp],rax + + + + mov QWORD[rsp],r12 + + + pop r12 + + DB 0F3h,0C3h ;repret +$L$abi_test_bad_unwind_temporary_seh_end: + + + + + + + +global abi_test_get_and_clear_direction_flag +abi_test_get_and_clear_direction_flag: + pushfq + pop rax + and rax,0x400 + shr rax,10 + cld + DB 0F3h,0C3h ;repret + + + + + +global abi_test_set_direction_flag +abi_test_set_direction_flag: + std + DB 0F3h,0C3h ;repret + + + + + + +global abi_test_bad_unwind_epilog +ALIGN 16 +abi_test_bad_unwind_epilog: +$L$abi_test_bad_unwind_epilog_seh_begin: + push r12 +$L$abi_test_bad_unwind_epilog_seh_push_r12: + + nop + + + pop r12 + nop + DB 0F3h,0C3h ;repret +$L$abi_test_bad_unwind_epilog_seh_end: + +section .pdata rdata align=4 +ALIGN 4 + + DD $L$abi_test_trampoline_seh_begin wrt ..imagebase + DD $L$abi_test_trampoline_seh_end wrt ..imagebase + DD $L$abi_test_trampoline_seh_info wrt ..imagebase + + DD $L$abi_test_bad_unwind_wrong_register_seh_begin wrt ..imagebase + DD $L$abi_test_bad_unwind_wrong_register_seh_end wrt ..imagebase + DD $L$abi_test_bad_unwind_wrong_register_seh_info wrt ..imagebase + + DD $L$abi_test_bad_unwind_temporary_seh_begin wrt ..imagebase + DD $L$abi_test_bad_unwind_temporary_seh_end wrt ..imagebase + DD $L$abi_test_bad_unwind_temporary_seh_info wrt ..imagebase + + DD $L$abi_test_bad_unwind_epilog_seh_begin wrt ..imagebase + DD $L$abi_test_bad_unwind_epilog_seh_end wrt ..imagebase + DD $L$abi_test_bad_unwind_epilog_seh_info wrt ..imagebase + +section .xdata rdata align=8 +ALIGN 8 +$L$abi_test_trampoline_seh_info: + +DB 1 +DB $L$abi_test_trampoline_seh_prolog_end-$L$abi_test_trampoline_seh_begin +DB 38 +DB 0 +DB 
$L$abi_test_trampoline_seh_prolog_xmm15-$L$abi_test_trampoline_seh_begin +DB 248 + DW 20 +DB $L$abi_test_trampoline_seh_prolog_xmm14-$L$abi_test_trampoline_seh_begin +DB 232 + DW 19 +DB $L$abi_test_trampoline_seh_prolog_xmm13-$L$abi_test_trampoline_seh_begin +DB 216 + DW 18 +DB $L$abi_test_trampoline_seh_prolog_xmm12-$L$abi_test_trampoline_seh_begin +DB 200 + DW 17 +DB $L$abi_test_trampoline_seh_prolog_xmm11-$L$abi_test_trampoline_seh_begin +DB 184 + DW 16 +DB $L$abi_test_trampoline_seh_prolog_xmm10-$L$abi_test_trampoline_seh_begin +DB 168 + DW 15 +DB $L$abi_test_trampoline_seh_prolog_xmm9-$L$abi_test_trampoline_seh_begin +DB 152 + DW 14 +DB $L$abi_test_trampoline_seh_prolog_xmm8-$L$abi_test_trampoline_seh_begin +DB 136 + DW 13 +DB $L$abi_test_trampoline_seh_prolog_xmm7-$L$abi_test_trampoline_seh_begin +DB 120 + DW 12 +DB $L$abi_test_trampoline_seh_prolog_xmm6-$L$abi_test_trampoline_seh_begin +DB 104 + DW 11 +DB $L$abi_test_trampoline_seh_prolog_r15-$L$abi_test_trampoline_seh_begin +DB 244 + DW 21 +DB $L$abi_test_trampoline_seh_prolog_r14-$L$abi_test_trampoline_seh_begin +DB 228 + DW 20 +DB $L$abi_test_trampoline_seh_prolog_r13-$L$abi_test_trampoline_seh_begin +DB 212 + DW 19 +DB $L$abi_test_trampoline_seh_prolog_r12-$L$abi_test_trampoline_seh_begin +DB 196 + DW 18 +DB $L$abi_test_trampoline_seh_prolog_rsi-$L$abi_test_trampoline_seh_begin +DB 100 + DW 17 +DB $L$abi_test_trampoline_seh_prolog_rdi-$L$abi_test_trampoline_seh_begin +DB 116 + DW 16 +DB $L$abi_test_trampoline_seh_prolog_rbp-$L$abi_test_trampoline_seh_begin +DB 84 + DW 15 +DB $L$abi_test_trampoline_seh_prolog_rbx-$L$abi_test_trampoline_seh_begin +DB 52 + DW 14 +DB $L$abi_test_trampoline_seh_prolog_alloc-$L$abi_test_trampoline_seh_begin +DB 1 + DW 43 + + +ALIGN 8 +$L$abi_test_bad_unwind_wrong_register_seh_info: +DB 1 +DB $L$abi_test_bad_unwind_wrong_register_seh_push_r13-$L$abi_test_bad_unwind_wrong_register_seh_begin +DB 1 +DB 0 + +DB $L$abi_test_bad_unwind_wrong_register_seh_push_r13-$L$abi_test_bad_unwind_wrong_register_seh_begin +DB 208 + +ALIGN 8 +$L$abi_test_bad_unwind_temporary_seh_info: +DB 1 +DB $L$abi_test_bad_unwind_temporary_seh_push_r12-$L$abi_test_bad_unwind_temporary_seh_begin +DB 1 +DB 0 + +DB $L$abi_test_bad_unwind_temporary_seh_push_r12-$L$abi_test_bad_unwind_temporary_seh_begin +DB 192 + +ALIGN 8 +$L$abi_test_bad_unwind_epilog_seh_info: +DB 1 +DB $L$abi_test_bad_unwind_epilog_seh_push_r12-$L$abi_test_bad_unwind_epilog_seh_begin +DB 1 +DB 0 + +DB $L$abi_test_bad_unwind_epilog_seh_push_r12-$L$abi_test_bad_unwind_epilog_seh_begin +DB 192 diff --git a/packager/third_party/boringssl/win-x86_64/crypto/third_party/sike/asm/fp-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/third_party/sike/asm/fp-x86_64.asm new file mode 100644 index 0000000000..fbfef1be13 --- /dev/null +++ b/packager/third_party/boringssl/win-x86_64/crypto/third_party/sike/asm/fp-x86_64.asm @@ -0,0 +1,1951 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif +section .text code align=64 + + + +$L$p434x2: + DQ 0xFFFFFFFFFFFFFFFE + DQ 0xFFFFFFFFFFFFFFFF + DQ 0xFB82ECF5C5FFFFFF + DQ 0xF78CB8F062B15D47 + DQ 0xD9F8BFAD038A40AC + DQ 0x0004683E4E2EE688 + + +$L$p434p1: + DQ 0xFDC1767AE3000000 + DQ 0x7BC65C783158AEA3 + DQ 0x6CFC5FD681C52056 + DQ 0x0002341F27177344 + +EXTERN OPENSSL_ia32cap_P + +global sike_fpadd + +sike_fpadd: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sike_fpadd: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push r12 + + + push r13 + + + push r14 + + + + xor rax,rax + + mov r8,QWORD[rdi] + add r8,QWORD[rsi] + mov r9,QWORD[8+rdi] + adc r9,QWORD[8+rsi] + mov r10,QWORD[16+rdi] + adc r10,QWORD[16+rsi] + mov r11,QWORD[24+rdi] + adc r11,QWORD[24+rsi] + mov r12,QWORD[32+rdi] + adc r12,QWORD[32+rsi] + mov r13,QWORD[40+rdi] + adc r13,QWORD[40+rsi] + mov r14,QWORD[48+rdi] + adc r14,QWORD[48+rsi] + + mov rcx,QWORD[$L$p434x2] + sub r8,rcx + mov rcx,QWORD[((8+$L$p434x2))] + sbb r9,rcx + sbb r10,rcx + mov rcx,QWORD[((16+$L$p434x2))] + sbb r11,rcx + mov rcx,QWORD[((24+$L$p434x2))] + sbb r12,rcx + mov rcx,QWORD[((32+$L$p434x2))] + sbb r13,rcx + mov rcx,QWORD[((40+$L$p434x2))] + sbb r14,rcx + + sbb rax,0 + + mov rdi,QWORD[$L$p434x2] + and rdi,rax + mov rsi,QWORD[((8+$L$p434x2))] + and rsi,rax + mov rcx,QWORD[((16+$L$p434x2))] + and rcx,rax + + add r8,rdi + mov QWORD[rdx],r8 + adc r9,rsi + mov QWORD[8+rdx],r9 + adc r10,rsi + mov QWORD[16+rdx],r10 + adc r11,rcx + mov QWORD[24+rdx],r11 + + setc cl + mov r8,QWORD[((24+$L$p434x2))] + and r8,rax + mov r9,QWORD[((32+$L$p434x2))] + and r9,rax + mov r10,QWORD[((40+$L$p434x2))] + and r10,rax + bt rcx,0 + + adc r12,r8 + mov QWORD[32+rdx],r12 + adc r13,r9 + mov QWORD[40+rdx],r13 + adc r14,r10 + mov QWORD[48+rdx],r14 + + pop r14 + + pop r13 + + pop r12 + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +global sike_cswap_asm + +sike_cswap_asm: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sike_cswap_asm: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + + movq xmm3,rdx + + + + + + pshufd xmm3,xmm3,68 + + movdqu xmm0,XMMWORD[rdi] + movdqu xmm1,XMMWORD[rsi] + movdqa xmm2,xmm1 + pxor xmm2,xmm0 + pand xmm2,xmm3 + pxor xmm0,xmm2 + pxor xmm1,xmm2 + movdqu XMMWORD[rdi],xmm0 + movdqu XMMWORD[rsi],xmm1 + + movdqu xmm0,XMMWORD[16+rdi] + movdqu xmm1,XMMWORD[16+rsi] + movdqa xmm2,xmm1 + pxor xmm2,xmm0 + pand xmm2,xmm3 + pxor xmm0,xmm2 + pxor xmm1,xmm2 + movdqu XMMWORD[16+rdi],xmm0 + movdqu XMMWORD[16+rsi],xmm1 + + movdqu xmm0,XMMWORD[32+rdi] + movdqu xmm1,XMMWORD[32+rsi] + movdqa xmm2,xmm1 + pxor xmm2,xmm0 + pand xmm2,xmm3 + pxor xmm0,xmm2 + pxor xmm1,xmm2 + movdqu XMMWORD[32+rdi],xmm0 + movdqu XMMWORD[32+rsi],xmm1 + + movdqu xmm0,XMMWORD[48+rdi] + movdqu xmm1,XMMWORD[48+rsi] + movdqa xmm2,xmm1 + pxor xmm2,xmm0 + pand xmm2,xmm3 + pxor xmm0,xmm2 + pxor xmm1,xmm2 + movdqu XMMWORD[48+rdi],xmm0 + movdqu XMMWORD[48+rsi],xmm1 + + movdqu xmm0,XMMWORD[64+rdi] + movdqu xmm1,XMMWORD[64+rsi] + movdqa xmm2,xmm1 + pxor xmm2,xmm0 + pand xmm2,xmm3 + pxor xmm0,xmm2 + pxor xmm1,xmm2 + movdqu XMMWORD[64+rdi],xmm0 + movdqu XMMWORD[64+rsi],xmm1 + + movdqu xmm0,XMMWORD[80+rdi] + movdqu xmm1,XMMWORD[80+rsi] + movdqa xmm2,xmm1 + pxor xmm2,xmm0 + pand xmm2,xmm3 + pxor xmm0,xmm2 + pxor xmm1,xmm2 + movdqu XMMWORD[80+rdi],xmm0 + movdqu XMMWORD[80+rsi],xmm1 + + movdqu 
xmm0,XMMWORD[96+rdi] + movdqu xmm1,XMMWORD[96+rsi] + movdqa xmm2,xmm1 + pxor xmm2,xmm0 + pand xmm2,xmm3 + pxor xmm0,xmm2 + pxor xmm1,xmm2 + movdqu XMMWORD[96+rdi],xmm0 + movdqu XMMWORD[96+rsi],xmm1 + + movdqu xmm0,XMMWORD[112+rdi] + movdqu xmm1,XMMWORD[112+rsi] + movdqa xmm2,xmm1 + pxor xmm2,xmm0 + pand xmm2,xmm3 + pxor xmm0,xmm2 + pxor xmm1,xmm2 + movdqu XMMWORD[112+rdi],xmm0 + movdqu XMMWORD[112+rsi],xmm1 + + movdqu xmm0,XMMWORD[128+rdi] + movdqu xmm1,XMMWORD[128+rsi] + movdqa xmm2,xmm1 + pxor xmm2,xmm0 + pand xmm2,xmm3 + pxor xmm0,xmm2 + pxor xmm1,xmm2 + movdqu XMMWORD[128+rdi],xmm0 + movdqu XMMWORD[128+rsi],xmm1 + + movdqu xmm0,XMMWORD[144+rdi] + movdqu xmm1,XMMWORD[144+rsi] + movdqa xmm2,xmm1 + pxor xmm2,xmm0 + pand xmm2,xmm3 + pxor xmm0,xmm2 + pxor xmm1,xmm2 + movdqu XMMWORD[144+rdi],xmm0 + movdqu XMMWORD[144+rsi],xmm1 + + movdqu xmm0,XMMWORD[160+rdi] + movdqu xmm1,XMMWORD[160+rsi] + movdqa xmm2,xmm1 + pxor xmm2,xmm0 + pand xmm2,xmm3 + pxor xmm0,xmm2 + pxor xmm1,xmm2 + movdqu XMMWORD[160+rdi],xmm0 + movdqu XMMWORD[160+rsi],xmm1 + + movdqu xmm0,XMMWORD[176+rdi] + movdqu xmm1,XMMWORD[176+rsi] + movdqa xmm2,xmm1 + pxor xmm2,xmm0 + pand xmm2,xmm3 + pxor xmm0,xmm2 + pxor xmm1,xmm2 + movdqu XMMWORD[176+rdi],xmm0 + movdqu XMMWORD[176+rsi],xmm1 + + movdqu xmm0,XMMWORD[192+rdi] + movdqu xmm1,XMMWORD[192+rsi] + movdqa xmm2,xmm1 + pxor xmm2,xmm0 + pand xmm2,xmm3 + pxor xmm0,xmm2 + pxor xmm1,xmm2 + movdqu XMMWORD[192+rdi],xmm0 + movdqu XMMWORD[192+rsi],xmm1 + + movdqu xmm0,XMMWORD[208+rdi] + movdqu xmm1,XMMWORD[208+rsi] + movdqa xmm2,xmm1 + pxor xmm2,xmm0 + pand xmm2,xmm3 + pxor xmm0,xmm2 + pxor xmm1,xmm2 + movdqu XMMWORD[208+rdi],xmm0 + movdqu XMMWORD[208+rsi],xmm1 + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +global sike_fpsub + +sike_fpsub: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sike_fpsub: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push r12 + + + push r13 + + + push r14 + + + + xor rax,rax + + mov r8,QWORD[rdi] + sub r8,QWORD[rsi] + mov r9,QWORD[8+rdi] + sbb r9,QWORD[8+rsi] + mov r10,QWORD[16+rdi] + sbb r10,QWORD[16+rsi] + mov r11,QWORD[24+rdi] + sbb r11,QWORD[24+rsi] + mov r12,QWORD[32+rdi] + sbb r12,QWORD[32+rsi] + mov r13,QWORD[40+rdi] + sbb r13,QWORD[40+rsi] + mov r14,QWORD[48+rdi] + sbb r14,QWORD[48+rsi] + + sbb rax,0x0 + + mov rdi,QWORD[$L$p434x2] + and rdi,rax + mov rsi,QWORD[((8+$L$p434x2))] + and rsi,rax + mov rcx,QWORD[((16+$L$p434x2))] + and rcx,rax + + add r8,rdi + mov QWORD[rdx],r8 + adc r9,rsi + mov QWORD[8+rdx],r9 + adc r10,rsi + mov QWORD[16+rdx],r10 + adc r11,rcx + mov QWORD[24+rdx],r11 + + setc cl + mov r8,QWORD[((24+$L$p434x2))] + and r8,rax + mov r9,QWORD[((32+$L$p434x2))] + and r9,rax + mov r10,QWORD[((40+$L$p434x2))] + and r10,rax + bt rcx,0x0 + + adc r12,r8 + adc r13,r9 + adc r14,r10 + mov QWORD[32+rdx],r12 + mov QWORD[40+rdx],r13 + mov QWORD[48+rdx],r14 + + pop r14 + + pop r13 + + pop r12 + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +global sike_mpadd_asm + +sike_mpadd_asm: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sike_mpadd_asm: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + mov r8,QWORD[rdi]; + mov r9,QWORD[8+rdi] + mov r10,QWORD[16+rdi] + mov r11,QWORD[24+rdi] + mov rcx,QWORD[32+rdi] + add r8,QWORD[rsi] + adc r9,QWORD[8+rsi] + adc r10,QWORD[16+rsi] + adc r11,QWORD[24+rsi] + adc rcx,QWORD[32+rsi] + mov QWORD[rdx],r8 + mov QWORD[8+rdx],r9 + mov QWORD[16+rdx],r10 + mov 
QWORD[24+rdx],r11 + mov QWORD[32+rdx],rcx + + mov r8,QWORD[40+rdi] + mov r9,QWORD[48+rdi] + adc r8,QWORD[40+rsi] + adc r9,QWORD[48+rsi] + mov QWORD[40+rdx],r8 + mov QWORD[48+rdx],r9 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +global sike_mpsubx2_asm + +sike_mpsubx2_asm: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sike_mpsubx2_asm: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + xor rax,rax + + mov r8,QWORD[rdi] + mov r9,QWORD[8+rdi] + mov r10,QWORD[16+rdi] + mov r11,QWORD[24+rdi] + mov rcx,QWORD[32+rdi] + sub r8,QWORD[rsi] + sbb r9,QWORD[8+rsi] + sbb r10,QWORD[16+rsi] + sbb r11,QWORD[24+rsi] + sbb rcx,QWORD[32+rsi] + mov QWORD[rdx],r8 + mov QWORD[8+rdx],r9 + mov QWORD[16+rdx],r10 + mov QWORD[24+rdx],r11 + mov QWORD[32+rdx],rcx + + mov r8,QWORD[40+rdi] + mov r9,QWORD[48+rdi] + mov r10,QWORD[56+rdi] + mov r11,QWORD[64+rdi] + mov rcx,QWORD[72+rdi] + sbb r8,QWORD[40+rsi] + sbb r9,QWORD[48+rsi] + sbb r10,QWORD[56+rsi] + sbb r11,QWORD[64+rsi] + sbb rcx,QWORD[72+rsi] + mov QWORD[40+rdx],r8 + mov QWORD[48+rdx],r9 + mov QWORD[56+rdx],r10 + mov QWORD[64+rdx],r11 + mov QWORD[72+rdx],rcx + + mov r8,QWORD[80+rdi] + mov r9,QWORD[88+rdi] + mov r10,QWORD[96+rdi] + mov r11,QWORD[104+rdi] + sbb r8,QWORD[80+rsi] + sbb r9,QWORD[88+rsi] + sbb r10,QWORD[96+rsi] + sbb r11,QWORD[104+rsi] + sbb rax,0x0 + mov QWORD[80+rdx],r8 + mov QWORD[88+rdx],r9 + mov QWORD[96+rdx],r10 + mov QWORD[104+rdx],r11 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +global sike_mpdblsubx2_asm + +sike_mpdblsubx2_asm: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sike_mpdblsubx2_asm: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push r12 + + + push r13 + + + + xor rax,rax + + + mov r8,QWORD[rdx] + mov r9,QWORD[8+rdx] + mov r10,QWORD[16+rdx] + mov r11,QWORD[24+rdx] + mov r12,QWORD[32+rdx] + mov r13,QWORD[40+rdx] + mov rcx,QWORD[48+rdx] + sub r8,QWORD[rdi] + sbb r9,QWORD[8+rdi] + sbb r10,QWORD[16+rdi] + sbb r11,QWORD[24+rdi] + sbb r12,QWORD[32+rdi] + sbb r13,QWORD[40+rdi] + sbb rcx,QWORD[48+rdi] + adc rax,0x0 + + + sub r8,QWORD[rsi] + sbb r9,QWORD[8+rsi] + sbb r10,QWORD[16+rsi] + sbb r11,QWORD[24+rsi] + sbb r12,QWORD[32+rsi] + sbb r13,QWORD[40+rsi] + sbb rcx,QWORD[48+rsi] + adc rax,0x0 + + + mov QWORD[rdx],r8 + mov QWORD[8+rdx],r9 + mov QWORD[16+rdx],r10 + mov QWORD[24+rdx],r11 + mov QWORD[32+rdx],r12 + mov QWORD[40+rdx],r13 + mov QWORD[48+rdx],rcx + + + mov r8,QWORD[56+rdx] + mov r9,QWORD[64+rdx] + mov r10,QWORD[72+rdx] + mov r11,QWORD[80+rdx] + mov r12,QWORD[88+rdx] + mov r13,QWORD[96+rdx] + mov rcx,QWORD[104+rdx] + + sub r8,rax + sbb r8,QWORD[56+rdi] + sbb r9,QWORD[64+rdi] + sbb r10,QWORD[72+rdi] + sbb r11,QWORD[80+rdi] + sbb r12,QWORD[88+rdi] + sbb r13,QWORD[96+rdi] + sbb rcx,QWORD[104+rdi] + + + sub r8,QWORD[56+rsi] + sbb r9,QWORD[64+rsi] + sbb r10,QWORD[72+rsi] + sbb r11,QWORD[80+rsi] + sbb r12,QWORD[88+rsi] + sbb r13,QWORD[96+rsi] + sbb rcx,QWORD[104+rsi] + + + mov QWORD[56+rdx],r8 + mov QWORD[64+rdx],r9 + mov QWORD[72+rdx],r10 + mov QWORD[80+rdx],r11 + mov QWORD[88+rdx],r12 + mov QWORD[96+rdx],r13 + mov QWORD[104+rdx],rcx + + pop r13 + + pop r12 + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + + +$L$rdc_bdw: + + + + + + + + + xor rax,rax + mov rdx,QWORD[((0+0))+rdi] + mulx r9,r8,QWORD[((0+$L$p434p1))] + mulx r10,r12,QWORD[((8+$L$p434p1))] + mulx r11,r13,QWORD[((16+$L$p434p1))] + + adox r9,r12 + adox r10,r13 + + 
mulx r12,r13,QWORD[((24+$L$p434p1))] + adox r11,r13 + adox r12,rax + + xor rax,rax + mov rdx,QWORD[((0+8))+rdi] + mulx rcx,r13,QWORD[((0+$L$p434p1))] + adcx r9,r13 + adcx r10,rcx + + mulx r13,rcx,QWORD[((8+$L$p434p1))] + adcx r11,r13 + adox r10,rcx + + mulx r13,rcx,QWORD[((16+$L$p434p1))] + adcx r12,r13 + adox r11,rcx + + mulx r13,rcx,QWORD[((24+$L$p434p1))] + adcx r13,rax + adox r12,rcx + adox r13,rax + + xor rcx,rcx + add r8,QWORD[24+rdi] + adc r9,QWORD[32+rdi] + adc r10,QWORD[40+rdi] + adc r11,QWORD[48+rdi] + adc r12,QWORD[56+rdi] + adc r13,QWORD[64+rdi] + adc rcx,QWORD[72+rdi] + mov QWORD[24+rdi],r8 + mov QWORD[32+rdi],r9 + mov QWORD[40+rdi],r10 + mov QWORD[48+rdi],r11 + mov QWORD[56+rdi],r12 + mov QWORD[64+rdi],r13 + mov QWORD[72+rdi],rcx + mov r8,QWORD[80+rdi] + mov r9,QWORD[88+rdi] + mov r10,QWORD[96+rdi] + mov r11,QWORD[104+rdi] + adc r8,0x0 + adc r9,0x0 + adc r10,0x0 + adc r11,0x0 + mov QWORD[80+rdi],r8 + mov QWORD[88+rdi],r9 + mov QWORD[96+rdi],r10 + mov QWORD[104+rdi],r11 + + xor rax,rax + mov rdx,QWORD[((16+0))+rdi] + mulx r9,r8,QWORD[((0+$L$p434p1))] + mulx r10,r12,QWORD[((8+$L$p434p1))] + mulx r11,r13,QWORD[((16+$L$p434p1))] + + adox r9,r12 + adox r10,r13 + + mulx r12,r13,QWORD[((24+$L$p434p1))] + adox r11,r13 + adox r12,rax + + xor rax,rax + mov rdx,QWORD[((16+8))+rdi] + mulx rcx,r13,QWORD[((0+$L$p434p1))] + adcx r9,r13 + adcx r10,rcx + + mulx r13,rcx,QWORD[((8+$L$p434p1))] + adcx r11,r13 + adox r10,rcx + + mulx r13,rcx,QWORD[((16+$L$p434p1))] + adcx r12,r13 + adox r11,rcx + + mulx r13,rcx,QWORD[((24+$L$p434p1))] + adcx r13,rax + adox r12,rcx + adox r13,rax + + xor rcx,rcx + add r8,QWORD[40+rdi] + adc r9,QWORD[48+rdi] + adc r10,QWORD[56+rdi] + adc r11,QWORD[64+rdi] + adc r12,QWORD[72+rdi] + adc r13,QWORD[80+rdi] + adc rcx,QWORD[88+rdi] + mov QWORD[40+rdi],r8 + mov QWORD[48+rdi],r9 + mov QWORD[56+rdi],r10 + mov QWORD[64+rdi],r11 + mov QWORD[72+rdi],r12 + mov QWORD[80+rdi],r13 + mov QWORD[88+rdi],rcx + mov r8,QWORD[96+rdi] + mov r9,QWORD[104+rdi] + adc r8,0x0 + adc r9,0x0 + mov QWORD[96+rdi],r8 + mov QWORD[104+rdi],r9 + + xor rax,rax + mov rdx,QWORD[((32+0))+rdi] + mulx r9,r8,QWORD[((0+$L$p434p1))] + mulx r10,r12,QWORD[((8+$L$p434p1))] + mulx r11,r13,QWORD[((16+$L$p434p1))] + + adox r9,r12 + adox r10,r13 + + mulx r12,r13,QWORD[((24+$L$p434p1))] + adox r11,r13 + adox r12,rax + + xor rax,rax + mov rdx,QWORD[((32+8))+rdi] + mulx rcx,r13,QWORD[((0+$L$p434p1))] + adcx r9,r13 + adcx r10,rcx + + mulx r13,rcx,QWORD[((8+$L$p434p1))] + adcx r11,r13 + adox r10,rcx + + mulx r13,rcx,QWORD[((16+$L$p434p1))] + adcx r12,r13 + adox r11,rcx + + mulx r13,rcx,QWORD[((24+$L$p434p1))] + adcx r13,rax + adox r12,rcx + adox r13,rax + + xor rcx,rcx + add r8,QWORD[56+rdi] + adc r9,QWORD[64+rdi] + adc r10,QWORD[72+rdi] + adc r11,QWORD[80+rdi] + adc r12,QWORD[88+rdi] + adc r13,QWORD[96+rdi] + adc rcx,QWORD[104+rdi] + mov QWORD[rsi],r8 + mov QWORD[8+rsi],r9 + mov QWORD[72+rdi],r10 + mov QWORD[80+rdi],r11 + mov QWORD[88+rdi],r12 + mov QWORD[96+rdi],r13 + mov QWORD[104+rdi],rcx + + xor rax,rax + mov rdx,QWORD[48+rdi] + mulx r9,r8,QWORD[((0+$L$p434p1))] + mulx r10,r12,QWORD[((8+$L$p434p1))] + mulx r11,r13,QWORD[((16+$L$p434p1))] + + adox r9,r12 + adox r10,r13 + + mulx r12,r13,QWORD[((24+$L$p434p1))] + adox r11,r13 + adox r12,rax + + add r8,QWORD[72+rdi] + adc r9,QWORD[80+rdi] + adc r10,QWORD[88+rdi] + adc r11,QWORD[96+rdi] + adc r12,QWORD[104+rdi] + mov QWORD[16+rsi],r8 + mov QWORD[24+rsi],r9 + mov QWORD[32+rsi],r10 + mov QWORD[40+rsi],r11 + mov QWORD[48+rsi],r12 + + + pop r15 + + + pop r14 + + + pop r13 + + 
+ pop r12 + + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +global sike_fprdc + +sike_fprdc: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sike_fprdc: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push r12 + + + push r13 + + + push r14 + + + push r15 + + + + + + lea rcx,[OPENSSL_ia32cap_P] + mov rcx,QWORD[8+rcx] + and ecx,0x80100 + cmp ecx,0x80100 + je NEAR $L$rdc_bdw + + + + + mov r14,QWORD[((0+0))+rdi] + mov rax,QWORD[((0+$L$p434p1))] + mul r14 + xor r10,r10 + mov r8,rax + mov r9,rdx + + + mov rax,QWORD[((8+$L$p434p1))] + mul r14 + xor r11,r11 + add r9,rax + adc r10,rdx + + + mov rcx,QWORD[((0+8))+rdi] + mov rax,QWORD[((0+$L$p434p1))] + mul rcx + add r9,rax + adc r10,rdx + adc r11,0x0 + + + xor r12,r12 + mov rax,QWORD[((16+$L$p434p1))] + mul r14 + add r10,rax + adc r11,rdx + adc r12,0x0 + + + mov rax,QWORD[((8+$L$p434p1))] + mul rcx + add r10,rax + adc r11,rdx + adc r12,0x0 + + + mov rax,QWORD[((24+$L$p434p1))] + mul r14 + xor r13,r13 + add r11,rax + adc r12,rdx + adc r13,0x0 + + + mov rax,QWORD[((16+$L$p434p1))] + mul rcx + add r11,rax + adc r12,rdx + adc r13,0x0 + + + mov rax,QWORD[((24+$L$p434p1))] + mul rcx + add r12,rax + adc r13,rdx + + + xor rcx,rcx + add r8,QWORD[24+rdi] + adc r9,QWORD[32+rdi] + adc r10,QWORD[40+rdi] + adc r11,QWORD[48+rdi] + adc r12,QWORD[56+rdi] + adc r13,QWORD[64+rdi] + adc rcx,QWORD[72+rdi] + mov QWORD[24+rdi],r8 + mov QWORD[32+rdi],r9 + mov QWORD[40+rdi],r10 + mov QWORD[48+rdi],r11 + mov QWORD[56+rdi],r12 + mov QWORD[64+rdi],r13 + mov QWORD[72+rdi],rcx + mov r8,QWORD[80+rdi] + mov r9,QWORD[88+rdi] + mov r10,QWORD[96+rdi] + mov r11,QWORD[104+rdi] + adc r8,0x0 + adc r9,0x0 + adc r10,0x0 + adc r11,0x0 + mov QWORD[80+rdi],r8 + mov QWORD[88+rdi],r9 + mov QWORD[96+rdi],r10 + mov QWORD[104+rdi],r11 + + + mov r14,QWORD[((16+0))+rdi] + mov rax,QWORD[((0+$L$p434p1))] + mul r14 + xor r10,r10 + mov r8,rax + mov r9,rdx + + + mov rax,QWORD[((8+$L$p434p1))] + mul r14 + xor r11,r11 + add r9,rax + adc r10,rdx + + + mov rcx,QWORD[((16+8))+rdi] + mov rax,QWORD[((0+$L$p434p1))] + mul rcx + add r9,rax + adc r10,rdx + adc r11,0x0 + + + xor r12,r12 + mov rax,QWORD[((16+$L$p434p1))] + mul r14 + add r10,rax + adc r11,rdx + adc r12,0x0 + + + mov rax,QWORD[((8+$L$p434p1))] + mul rcx + add r10,rax + adc r11,rdx + adc r12,0x0 + + + mov rax,QWORD[((24+$L$p434p1))] + mul r14 + xor r13,r13 + add r11,rax + adc r12,rdx + adc r13,0x0 + + + mov rax,QWORD[((16+$L$p434p1))] + mul rcx + add r11,rax + adc r12,rdx + adc r13,0x0 + + + mov rax,QWORD[((24+$L$p434p1))] + mul rcx + add r12,rax + adc r13,rdx + + + xor rcx,rcx + add r8,QWORD[40+rdi] + adc r9,QWORD[48+rdi] + adc r10,QWORD[56+rdi] + adc r11,QWORD[64+rdi] + adc r12,QWORD[72+rdi] + adc r13,QWORD[80+rdi] + adc rcx,QWORD[88+rdi] + mov QWORD[40+rdi],r8 + mov QWORD[48+rdi],r9 + mov QWORD[56+rdi],r10 + mov QWORD[64+rdi],r11 + mov QWORD[72+rdi],r12 + mov QWORD[80+rdi],r13 + mov QWORD[88+rdi],rcx + mov r8,QWORD[96+rdi] + mov r9,QWORD[104+rdi] + adc r8,0x0 + adc r9,0x0 + mov QWORD[96+rdi],r8 + mov QWORD[104+rdi],r9 + + + mov r14,QWORD[((32+0))+rdi] + mov rax,QWORD[((0+$L$p434p1))] + mul r14 + xor r10,r10 + mov r8,rax + mov r9,rdx + + + mov rax,QWORD[((8+$L$p434p1))] + mul r14 + xor r11,r11 + add r9,rax + adc r10,rdx + + + mov rcx,QWORD[((32+8))+rdi] + mov rax,QWORD[((0+$L$p434p1))] + mul rcx + add r9,rax + adc r10,rdx + adc r11,0x0 + + + xor r12,r12 + mov rax,QWORD[((16+$L$p434p1))] + mul r14 + add r10,rax + adc r11,rdx + adc r12,0x0 + + + mov 
rax,QWORD[((8+$L$p434p1))] + mul rcx + add r10,rax + adc r11,rdx + adc r12,0x0 + + + mov rax,QWORD[((24+$L$p434p1))] + mul r14 + xor r13,r13 + add r11,rax + adc r12,rdx + adc r13,0x0 + + + mov rax,QWORD[((16+$L$p434p1))] + mul rcx + add r11,rax + adc r12,rdx + adc r13,0x0 + + + mov rax,QWORD[((24+$L$p434p1))] + mul rcx + add r12,rax + adc r13,rdx + + + xor rcx,rcx + add r8,QWORD[56+rdi] + adc r9,QWORD[64+rdi] + adc r10,QWORD[72+rdi] + adc r11,QWORD[80+rdi] + adc r12,QWORD[88+rdi] + adc r13,QWORD[96+rdi] + adc rcx,QWORD[104+rdi] + mov QWORD[rsi],r8 + mov QWORD[8+rsi],r9 + mov QWORD[72+rdi],r10 + mov QWORD[80+rdi],r11 + mov QWORD[88+rdi],r12 + mov QWORD[96+rdi],r13 + mov QWORD[104+rdi],rcx + + mov r13,QWORD[48+rdi] + + xor r10,r10 + mov rax,QWORD[((0+$L$p434p1))] + mul r13 + mov r8,rax + mov r9,rdx + + xor r11,r11 + mov rax,QWORD[((8+$L$p434p1))] + mul r13 + add r9,rax + adc r10,rdx + + xor r12,r12 + mov rax,QWORD[((16+$L$p434p1))] + mul r13 + add r10,rax + adc r11,rdx + + mov rax,QWORD[((24+$L$p434p1))] + mul r13 + add r11,rax + adc r12,rdx + + add r8,QWORD[72+rdi] + adc r9,QWORD[80+rdi] + adc r10,QWORD[88+rdi] + adc r11,QWORD[96+rdi] + adc r12,QWORD[104+rdi] + mov QWORD[16+rsi],r8 + mov QWORD[24+rsi],r9 + mov QWORD[32+rsi],r10 + mov QWORD[40+rsi],r11 + mov QWORD[48+rsi],r12 + + + pop r15 + + pop r14 + + pop r13 + + pop r12 + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$mul_bdw: + + + + + + + + + + mov rcx,rdx + xor rax,rax + + + mov r8,QWORD[rdi] + mov r9,QWORD[8+rdi] + mov r10,QWORD[16+rdi] + mov r11,QWORD[24+rdi] + + push rbx + + + push rbp + + + sub rsp,96 + + + add r8,QWORD[32+rdi] + adc r9,QWORD[40+rdi] + adc r10,QWORD[48+rdi] + adc r11,0x0 + sbb rax,0x0 + mov QWORD[rsp],r8 + mov QWORD[8+rsp],r9 + mov QWORD[16+rsp],r10 + mov QWORD[24+rsp],r11 + + + xor rbx,rbx + mov r12,QWORD[rsi] + mov r13,QWORD[8+rsi] + mov r14,QWORD[16+rsi] + mov r15,QWORD[24+rsi] + add r12,QWORD[32+rsi] + adc r13,QWORD[40+rsi] + adc r14,QWORD[48+rsi] + adc r15,0x0 + sbb rbx,0x0 + mov QWORD[32+rsp],r12 + mov QWORD[40+rsp],r13 + mov QWORD[48+rsp],r14 + mov QWORD[56+rsp],r15 + + + and r12,rax + and r13,rax + and r14,rax + and r15,rax + + + and r8,rbx + and r9,rbx + and r10,rbx + and r11,rbx + + + add r8,r12 + adc r9,r13 + adc r10,r14 + adc r11,r15 + mov QWORD[64+rsp],r8 + mov QWORD[72+rsp],r9 + mov QWORD[80+rsp],r10 + mov QWORD[88+rsp],r11 + + + mov rdx,QWORD[((0+0))+rsp] + mulx r8,r9,QWORD[((32+0))+rsp] + mov QWORD[((0+0))+rsp],r9 + mulx r9,r10,QWORD[((32+8))+rsp] + xor rax,rax + adox r8,r10 + mulx r10,r11,QWORD[((32+16))+rsp] + adox r9,r11 + mulx r11,r12,QWORD[((32+24))+rsp] + adox r10,r12 + + mov rdx,QWORD[((0+8))+rsp] + mulx r13,r12,QWORD[((32+0))+rsp] + adox r11,rax + xor rax,rax + mulx r14,r15,QWORD[((32+8))+rsp] + adox r12,r8 + mov QWORD[((0+8))+rsp],r12 + adcx r13,r15 + mulx r15,rbx,QWORD[((32+16))+rsp] + adcx r14,rbx + adox r13,r9 + mulx rbx,rbp,QWORD[((32+24))+rsp] + adcx r15,rbp + adcx rbx,rax + adox r14,r10 + + mov rdx,QWORD[((0+16))+rsp] + mulx r9,r8,QWORD[((32+0))+rsp] + adox r15,r11 + adox rbx,rax + xor rax,rax + mulx r10,r11,QWORD[((32+8))+rsp] + adox r8,r13 + mov QWORD[((0+16))+rsp],r8 + adcx r9,r11 + mulx r11,r12,QWORD[((32+16))+rsp] + adcx r10,r12 + adox r9,r14 + mulx r12,rbp,QWORD[((32+24))+rsp] + adcx r11,rbp + adcx r12,rax + + adox r10,r15 + adox r11,rbx + adox r12,rax + + mov rdx,QWORD[((0+24))+rsp] + mulx r13,r8,QWORD[((32+0))+rsp] + xor rax,rax + mulx r14,r15,QWORD[((32+8))+rsp] + adcx r13,r15 + adox r9,r8 + mulx r15,rbx,QWORD[((32+16))+rsp] + 
adcx r14,rbx + adox r10,r13 + mulx rbx,rbp,QWORD[((32+24))+rsp] + adcx r15,rbp + adcx rbx,rax + adox r11,r14 + adox r12,r15 + adox rbx,rax + mov QWORD[((0+24))+rsp],r9 + mov QWORD[((0+32))+rsp],r10 + mov QWORD[((0+40))+rsp],r11 + mov QWORD[((0+48))+rsp],r12 + mov QWORD[((0+56))+rsp],rbx + + + + mov rdx,QWORD[((0+0))+rdi] + mulx r8,r9,QWORD[((0+0))+rsi] + mov QWORD[((0+0))+rcx],r9 + mulx r9,r10,QWORD[((0+8))+rsi] + xor rax,rax + adox r8,r10 + mulx r10,r11,QWORD[((0+16))+rsi] + adox r9,r11 + mulx r11,r12,QWORD[((0+24))+rsi] + adox r10,r12 + + mov rdx,QWORD[((0+8))+rdi] + mulx r13,r12,QWORD[((0+0))+rsi] + adox r11,rax + xor rax,rax + mulx r14,r15,QWORD[((0+8))+rsi] + adox r12,r8 + mov QWORD[((0+8))+rcx],r12 + adcx r13,r15 + mulx r15,rbx,QWORD[((0+16))+rsi] + adcx r14,rbx + adox r13,r9 + mulx rbx,rbp,QWORD[((0+24))+rsi] + adcx r15,rbp + adcx rbx,rax + adox r14,r10 + + mov rdx,QWORD[((0+16))+rdi] + mulx r9,r8,QWORD[((0+0))+rsi] + adox r15,r11 + adox rbx,rax + xor rax,rax + mulx r10,r11,QWORD[((0+8))+rsi] + adox r8,r13 + mov QWORD[((0+16))+rcx],r8 + adcx r9,r11 + mulx r11,r12,QWORD[((0+16))+rsi] + adcx r10,r12 + adox r9,r14 + mulx r12,rbp,QWORD[((0+24))+rsi] + adcx r11,rbp + adcx r12,rax + + adox r10,r15 + adox r11,rbx + adox r12,rax + + mov rdx,QWORD[((0+24))+rdi] + mulx r13,r8,QWORD[((0+0))+rsi] + xor rax,rax + mulx r14,r15,QWORD[((0+8))+rsi] + adcx r13,r15 + adox r9,r8 + mulx r15,rbx,QWORD[((0+16))+rsi] + adcx r14,rbx + adox r10,r13 + mulx rbx,rbp,QWORD[((0+24))+rsi] + adcx r15,rbp + adcx rbx,rax + adox r11,r14 + adox r12,r15 + adox rbx,rax + mov QWORD[((0+24))+rcx],r9 + mov QWORD[((0+32))+rcx],r10 + mov QWORD[((0+40))+rcx],r11 + mov QWORD[((0+48))+rcx],r12 + mov QWORD[((0+56))+rcx],rbx + + + + mov rdx,QWORD[((32+0))+rdi] + mulx r8,r9,QWORD[((32+0))+rsi] + mov QWORD[((64+0))+rcx],r9 + mulx r9,r10,QWORD[((32+8))+rsi] + xor rax,rax + adox r8,r10 + mulx r10,r11,QWORD[((32+16))+rsi] + adox r9,r11 + + mov rdx,QWORD[((32+8))+rdi] + mulx r11,r12,QWORD[((32+0))+rsi] + adox r10,rax + xor rax,rax + + mulx r13,r14,QWORD[((32+8))+rsi] + adox r12,r8 + mov QWORD[((64+8))+rcx],r12 + adcx r11,r14 + + mulx r14,r8,QWORD[((32+16))+rsi] + adox r11,r9 + adcx r13,r8 + adcx r14,rax + adox r13,r10 + + mov rdx,QWORD[((32+16))+rdi] + mulx r9,r8,QWORD[((32+0))+rsi] + adox r14,rax + xor rax,rax + + mulx r12,r10,QWORD[((32+8))+rsi] + adox r8,r11 + mov QWORD[((64+16))+rcx],r8 + adcx r9,r13 + + mulx r8,r11,QWORD[((32+16))+rsi] + adcx r12,r14 + adcx r8,rax + adox r9,r10 + adox r11,r12 + adox r8,rax + mov QWORD[((64+24))+rcx],r9 + mov QWORD[((64+32))+rcx],r11 + mov QWORD[((64+40))+rcx],r8 + + + + + mov r8,QWORD[64+rsp] + mov r9,QWORD[72+rsp] + mov r10,QWORD[80+rsp] + mov r11,QWORD[88+rsp] + + mov rax,QWORD[32+rsp] + add r8,rax + mov rax,QWORD[40+rsp] + adc r9,rax + mov rax,QWORD[48+rsp] + adc r10,rax + mov rax,QWORD[56+rsp] + adc r11,rax + + + mov r12,QWORD[rsp] + mov r13,QWORD[8+rsp] + mov r14,QWORD[16+rsp] + mov r15,QWORD[24+rsp] + sub r12,QWORD[rcx] + sbb r13,QWORD[8+rcx] + sbb r14,QWORD[16+rcx] + sbb r15,QWORD[24+rcx] + sbb r8,QWORD[32+rcx] + sbb r9,QWORD[40+rcx] + sbb r10,QWORD[48+rcx] + sbb r11,QWORD[56+rcx] + + + sub r12,QWORD[64+rcx] + sbb r13,QWORD[72+rcx] + sbb r14,QWORD[80+rcx] + sbb r15,QWORD[88+rcx] + sbb r8,QWORD[96+rcx] + sbb r9,QWORD[104+rcx] + sbb r10,0x0 + sbb r11,0x0 + + add r12,QWORD[32+rcx] + mov QWORD[32+rcx],r12 + adc r13,QWORD[40+rcx] + mov QWORD[40+rcx],r13 + adc r14,QWORD[48+rcx] + mov QWORD[48+rcx],r14 + adc r15,QWORD[56+rcx] + mov QWORD[56+rcx],r15 + adc r8,QWORD[64+rcx] + mov QWORD[64+rcx],r8 + 
adc r9,QWORD[72+rcx] + mov QWORD[72+rcx],r9 + adc r10,QWORD[80+rcx] + mov QWORD[80+rcx],r10 + adc r11,QWORD[88+rcx] + mov QWORD[88+rcx],r11 + mov r12,QWORD[96+rcx] + adc r12,0x0 + mov QWORD[96+rcx],r12 + mov r13,QWORD[104+rcx] + adc r13,0x0 + mov QWORD[104+rcx],r13 + + add rsp,96 + + pop rbp + + + pop rbx + + + + + pop r15 + + + pop r14 + + + pop r13 + + + pop r12 + + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + + +global sike_mpmul + +sike_mpmul: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sike_mpmul: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push r12 + + + push r13 + + + push r14 + + + push r15 + + + + + + lea rcx,[OPENSSL_ia32cap_P] + mov rcx,QWORD[8+rcx] + and ecx,0x80100 + cmp ecx,0x80100 + je NEAR $L$mul_bdw + + + + mov rcx,rdx + + sub rsp,112 + + + + xor rax,rax + mov r8,QWORD[32+rdi] + mov r9,QWORD[40+rdi] + mov r10,QWORD[48+rdi] + xor r11,r11 + add r8,QWORD[rdi] + adc r9,QWORD[8+rdi] + adc r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + + sbb rax,0 + mov QWORD[64+rsp],rax + + mov QWORD[rcx],r8 + mov QWORD[8+rcx],r9 + mov QWORD[16+rcx],r10 + mov QWORD[24+rcx],r11 + + + xor rdx,rdx + mov r12,QWORD[32+rsi] + mov r13,QWORD[40+rsi] + mov r14,QWORD[48+rsi] + xor r15,r15 + add r12,QWORD[rsi] + adc r13,QWORD[8+rsi] + adc r14,QWORD[16+rsi] + adc r15,QWORD[24+rsi] + sbb rdx,0x0 + + mov QWORD[72+rsp],rdx + + + mov rax,QWORD[rcx] + mul r12 + mov QWORD[rsp],rax + mov r8,rdx + + xor r9,r9 + mov rax,QWORD[rcx] + mul r13 + add r8,rax + adc r9,rdx + + xor r10,r10 + mov rax,QWORD[8+rcx] + mul r12 + add r8,rax + mov QWORD[8+rsp],r8 + adc r9,rdx + adc r10,0x0 + + xor r8,r8 + mov rax,QWORD[rcx] + mul r14 + add r9,rax + adc r10,rdx + adc r8,0x0 + + mov rax,QWORD[16+rcx] + mul r12 + add r9,rax + adc r10,rdx + adc r8,0x0 + + mov rax,QWORD[8+rcx] + mul r13 + add r9,rax + mov QWORD[16+rsp],r9 + adc r10,rdx + adc r8,0x0 + + xor r9,r9 + mov rax,QWORD[rcx] + mul r15 + add r10,rax + adc r8,rdx + adc r9,0x0 + + mov rax,QWORD[24+rcx] + mul r12 + add r10,rax + adc r8,rdx + adc r9,0x0 + + mov rax,QWORD[8+rcx] + mul r14 + add r10,rax + adc r8,rdx + adc r9,0x0 + + mov rax,QWORD[16+rcx] + mul r13 + add r10,rax + mov QWORD[24+rsp],r10 + adc r8,rdx + adc r9,0x0 + + xor r10,r10 + mov rax,QWORD[8+rcx] + mul r15 + add r8,rax + adc r9,rdx + adc r10,0x0 + + mov rax,QWORD[24+rcx] + mul r13 + add r8,rax + adc r9,rdx + adc r10,0x0 + + mov rax,QWORD[16+rcx] + mul r14 + add r8,rax + mov QWORD[32+rsp],r8 + adc r9,rdx + adc r10,0x0 + + xor r11,r11 + mov rax,QWORD[16+rcx] + mul r15 + add r9,rax + adc r10,rdx + adc r11,0x0 + + mov rax,QWORD[24+rcx] + mul r14 + add r9,rax + mov QWORD[40+rsp],r9 + adc r10,rdx + adc r11,0x0 + + mov rax,QWORD[24+rcx] + mul r15 + add r10,rax + mov QWORD[48+rsp],r10 + adc r11,rdx + mov QWORD[56+rsp],r11 + + + mov rax,QWORD[64+rsp] + and r12,rax + and r13,rax + and r14,rax + and r15,rax + + + mov rax,QWORD[72+rsp] + mov r8,QWORD[rcx] + and r8,rax + mov r9,QWORD[8+rcx] + and r9,rax + mov r10,QWORD[16+rcx] + and r10,rax + mov r11,QWORD[24+rcx] + and r11,rax + + + add r12,r8 + adc r13,r9 + adc r14,r10 + adc r15,r11 + + + mov rax,QWORD[32+rsp] + add r12,rax + mov rax,QWORD[40+rsp] + adc r13,rax + mov rax,QWORD[48+rsp] + adc r14,rax + mov rax,QWORD[56+rsp] + adc r15,rax + mov QWORD[80+rsp],r12 + mov QWORD[88+rsp],r13 + mov QWORD[96+rsp],r14 + mov QWORD[104+rsp],r15 + + + mov r11,QWORD[rdi] + mov rax,QWORD[rsi] + mul r11 + xor r9,r9 + mov QWORD[rcx],rax + mov r8,rdx + + mov r14,QWORD[16+rdi] + mov rax,QWORD[8+rsi] + mul 
r11 + xor r10,r10 + add r8,rax + adc r9,rdx + + mov r12,QWORD[8+rdi] + mov rax,QWORD[rsi] + mul r12 + add r8,rax + mov QWORD[8+rcx],r8 + adc r9,rdx + adc r10,0x0 + + xor r8,r8 + mov rax,QWORD[16+rsi] + mul r11 + add r9,rax + adc r10,rdx + adc r8,0x0 + + mov r13,QWORD[rsi] + mov rax,r14 + mul r13 + add r9,rax + adc r10,rdx + adc r8,0x0 + + mov rax,QWORD[8+rsi] + mul r12 + add r9,rax + mov QWORD[16+rcx],r9 + adc r10,rdx + adc r8,0x0 + + xor r9,r9 + mov rax,QWORD[24+rsi] + mul r11 + mov r15,QWORD[24+rdi] + add r10,rax + adc r8,rdx + adc r9,0x0 + + mov rax,r15 + mul r13 + add r10,rax + adc r8,rdx + adc r9,0x0 + + mov rax,QWORD[16+rsi] + mul r12 + add r10,rax + adc r8,rdx + adc r9,0x0 + + mov rax,QWORD[8+rsi] + mul r14 + add r10,rax + mov QWORD[24+rcx],r10 + adc r8,rdx + adc r9,0x0 + + xor r10,r10 + mov rax,QWORD[24+rsi] + mul r12 + add r8,rax + adc r9,rdx + adc r10,0x0 + + mov rax,QWORD[8+rsi] + mul r15 + add r8,rax + adc r9,rdx + adc r10,0x0 + + mov rax,QWORD[16+rsi] + mul r14 + add r8,rax + mov QWORD[32+rcx],r8 + adc r9,rdx + adc r10,0x0 + + xor r8,r8 + mov rax,QWORD[24+rsi] + mul r14 + add r9,rax + adc r10,rdx + adc r8,0x0 + + mov rax,QWORD[16+rsi] + mul r15 + add r9,rax + mov QWORD[40+rcx],r9 + adc r10,rdx + adc r8,0x0 + + mov rax,QWORD[24+rsi] + mul r15 + add r10,rax + mov QWORD[48+rcx],r10 + adc r8,rdx + mov QWORD[56+rcx],r8 + + + + mov r11,QWORD[32+rdi] + mov rax,QWORD[32+rsi] + mul r11 + xor r9,r9 + mov QWORD[64+rcx],rax + mov r8,rdx + + mov r14,QWORD[48+rdi] + mov rax,QWORD[40+rsi] + mul r11 + xor r10,r10 + add r8,rax + adc r9,rdx + + mov r12,QWORD[40+rdi] + mov rax,QWORD[32+rsi] + mul r12 + add r8,rax + mov QWORD[72+rcx],r8 + adc r9,rdx + adc r10,0x0 + + xor r8,r8 + mov rax,QWORD[48+rsi] + mul r11 + add r9,rax + adc r10,rdx + adc r8,0x0 + + mov r13,QWORD[32+rsi] + mov rax,r14 + mul r13 + add r9,rax + adc r10,rdx + adc r8,0x0 + + mov rax,QWORD[40+rsi] + mul r12 + add r9,rax + mov QWORD[80+rcx],r9 + adc r10,rdx + adc r8,0x0 + + mov rax,QWORD[48+rsi] + mul r12 + xor r12,r12 + add r10,rax + adc r8,rdx + adc r12,0x0 + + mov rax,QWORD[40+rsi] + mul r14 + add r10,rax + adc r8,rdx + adc r12,0x0 + mov QWORD[88+rcx],r10 + + mov rax,QWORD[48+rsi] + mul r14 + add r8,rax + adc r12,0x0 + mov QWORD[96+rcx],r8 + + add rdx,r12 + + + mov r8,QWORD[rsp] + sub r8,QWORD[rcx] + mov r9,QWORD[8+rsp] + sbb r9,QWORD[8+rcx] + mov r10,QWORD[16+rsp] + sbb r10,QWORD[16+rcx] + mov r11,QWORD[24+rsp] + sbb r11,QWORD[24+rcx] + mov r12,QWORD[80+rsp] + sbb r12,QWORD[32+rcx] + mov r13,QWORD[88+rsp] + sbb r13,QWORD[40+rcx] + mov r14,QWORD[96+rsp] + sbb r14,QWORD[48+rcx] + mov r15,QWORD[104+rsp] + sbb r15,QWORD[56+rcx] + + + mov rax,QWORD[64+rcx] + sub r8,rax + mov rax,QWORD[72+rcx] + sbb r9,rax + mov rax,QWORD[80+rcx] + sbb r10,rax + mov rax,QWORD[88+rcx] + sbb r11,rax + mov rax,QWORD[96+rcx] + sbb r12,rax + sbb r13,rdx + sbb r14,0x0 + sbb r15,0x0 + + + add r8,QWORD[32+rcx] + mov QWORD[32+rcx],r8 + adc r9,QWORD[40+rcx] + mov QWORD[40+rcx],r9 + adc r10,QWORD[48+rcx] + mov QWORD[48+rcx],r10 + adc r11,QWORD[56+rcx] + mov QWORD[56+rcx],r11 + adc r12,QWORD[64+rcx] + mov QWORD[64+rcx],r12 + adc r13,QWORD[72+rcx] + mov QWORD[72+rcx],r13 + adc r14,QWORD[80+rcx] + mov QWORD[80+rcx],r14 + adc r15,QWORD[88+rcx] + mov QWORD[88+rcx],r15 + mov r12,QWORD[96+rcx] + adc r12,0x0 + mov QWORD[96+rcx],r12 + adc rdx,0x0 + mov QWORD[104+rcx],rdx + + add rsp,112 + + + + pop r15 + + pop r14 + + pop r13 + + pop r12 + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +
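
Note for reviewers of the generated assembly above: this is the NASM (win-x86_64) translation of BoringSSL's SIKE p434 field arithmetic -- constant-time add/sub (sike_fpadd, sike_fpsub), a masked 224-byte conditional swap (sike_cswap_asm), multi-precision helpers (sike_mpadd_asm, sike_mpsubx2_asm, sike_mpdblsubx2_asm), the Montgomery-style reduction (sike_fprdc, driven by the $L$p434p1 table), and a 448x448-bit multiply (sike_mpmul). The last two check OPENSSL_ia32cap_P and jump to the MULX/ADX fast paths $L$rdc_bdw and $L$mul_bdw when BMI2 and ADX are reported. Two constant-time idioms recur throughout: an XOR/AND masked swap, and a branch-free reduction that subtracts 2*p434 ($L$p434x2) and then adds it back under a mask derived from the borrow. The sketch below is illustrative only and not part of the patch; the names cswap_sketch, fpadd_sketch, NLIMBS, and the explicit twop parameter (standing in for the baked-in $L$p434x2 constant) are assumptions made for the example.

/* sketch.c -- illustrative only; not part of the generated assembly above. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define NLIMBS 7 /* a p434 element occupies 7 x 64-bit limbs */

/* Swap a[0..n) and b[0..n) when mask == ~0, leave them unchanged when
 * mask == 0; the instruction and memory-access pattern is identical in
 * both cases, which is the property the SSE2 version above preserves. */
void cswap_sketch(uint64_t *a, uint64_t *b, size_t n, uint64_t mask) {
  for (size_t i = 0; i < n; i++) {
    uint64_t t = (a[i] ^ b[i]) & mask;
    a[i] ^= t;
    b[i] ^= t;
  }
}

/* out = a + b kept in [0, 2p): add, subtract 2p, then add 2p back masked by
 * the final borrow.  twop stands in for the $L$p434x2 table; for inputs
 * below 2p the 7-limb sum cannot overflow, so no carry-out is tracked. */
void fpadd_sketch(const uint64_t a[NLIMBS], const uint64_t b[NLIMBS],
                  const uint64_t twop[NLIMBS], uint64_t out[NLIMBS]) {
  uint64_t tmp[NLIMBS], carry = 0, borrow = 0;

  for (int i = 0; i < NLIMBS; i++) { /* tmp = a + b (ADD/ADC chain) */
    uint64_t s = a[i] + carry;
    uint64_t c = s < carry;
    tmp[i] = s + b[i];
    carry = c | (tmp[i] < s);
  }
  for (int i = 0; i < NLIMBS; i++) { /* out = tmp - 2p (SUB/SBB chain) */
    uint64_t d = tmp[i] - twop[i];
    uint64_t b1 = tmp[i] < twop[i];
    out[i] = d - borrow;
    borrow = b1 | (d < borrow);
  }
  uint64_t mask = 0 - borrow; /* all ones iff the subtraction went negative */
  carry = 0;
  for (int i = 0; i < NLIMBS; i++) { /* conditionally add 2p back */
    uint64_t s = out[i] + carry;
    uint64_t c = s < carry;
    out[i] = s + (twop[i] & mask);
    carry = c | (out[i] < s);
  }
}

int main(void) {
  uint64_t x[NLIMBS] = {1, 2, 3, 4, 5, 6, 7};
  uint64_t y[NLIMBS] = {7, 6, 5, 4, 3, 2, 1};
  cswap_sketch(x, y, NLIMBS, ~UINT64_C(0)); /* all-ones mask: swap */
  printf("after swap: x[0]=%llu, y[0]=%llu\n",
         (unsigned long long)x[0], (unsigned long long)y[0]);
  return 0;
}

Compiling and running the sketch (cc sketch.c && ./a.out) just exercises the swap; the point of both helpers is that the same instructions execute whether or not the secret mask (or borrow) selects the swap or the add-back, which is what the pand/pxor and SBB/AND sequences above achieve directly in assembly, without depending on compiler behavior.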