From c80f053ba2df2d3106ae77920baa37056f12815c Mon Sep 17 00:00:00 2001
From: KongQun Yang
Date: Tue, 22 Oct 2019 11:06:32 -0700
Subject: [PATCH] Roll boringssl/src fc9c67599..76918d016

The roll_boringssl.py script is also adjusted to work with the latest
boringssl.

Disable ASM compilation on Windows x64, where it fails to compile. This
also means there is no HW AES on Windows x64.

Issue #198.

Change-Id: Ib7e8ff506f014c8c733f1882eeeddbe34fa28511
---
 DEPS | 2 +-
 .../third_party/boringssl/BUILD.generated.gni | 100 +-
 .../boringssl/BUILD.generated_tests.gni | 114 +-
 packager/third_party/boringssl/boringssl.gyp | 11 +-
 packager/third_party/boringssl/boringssl.gypi | 80 +-
 packager/third_party/boringssl/err_data.c | 892 +---
 .../ios-aarch64/crypto/chacha/chacha-armv8.S | 1982 +++++++++
 .../crypto/fipsmodule/aesv8-armx64.S | 772 ++++
 .../crypto/fipsmodule/armv8-mont.S | 1420 +++++++
 .../crypto/fipsmodule/ghash-neon-armv8.S | 338 ++
 .../crypto/fipsmodule/ghashv8-armx64.S | 246 ++
 .../crypto/fipsmodule/sha1-armv8.S | 1232 ++++++
 .../crypto/fipsmodule/sha256-armv8.S | 1210 ++++++
 .../crypto/fipsmodule/sha512-armv8.S | 1082 +++++
 .../crypto/fipsmodule/vpaes-armv8.S | 1213 ++++++
 .../crypto/test/trampoline-armv8.S | 685 +++
 .../crypto/third_party/sike/asm/fp-armv8.S | 996 +++++
 .../ios-arm/crypto/chacha/chacha-armv4.S | 1498 +++++++
 .../ios-arm/crypto/fipsmodule/aes-armv4.S | 1233 ++++++
 .../ios-arm/crypto/fipsmodule/aesv8-armx32.S | 790 ++++
 .../ios-arm/crypto/fipsmodule/armv4-mont.S | 982 +++++
 .../ios-arm/crypto/fipsmodule/bsaes-armv7.S | 1536 +++++++
 .../ios-arm/crypto/fipsmodule/ghash-armv4.S | 600 +++
 .../crypto/fipsmodule/ghashv8-armx32.S | 256 ++
 .../crypto/fipsmodule/sha1-armv4-large.S | 1518 +++++++
 .../ios-arm/crypto/fipsmodule/sha256-armv4.S | 2846 +++++++++++++
 .../ios-arm/crypto/fipsmodule/sha512-armv4.S | 1899 +++++++++
 .../ios-arm/crypto/fipsmodule/vpaes-armv7.S | 1265 ++++++
 .../ios-arm/crypto/test/trampoline-armv4.S | 377 ++
 .../crypto/chacha/chacha-armv8.S | 48 +-
 .../crypto/fipsmodule/aesv8-armx64.S | 24 +-
 .../crypto/fipsmodule/armv8-mont.S | 16 +
 .../crypto/fipsmodule/ghash-neon-armv8.S | 341 ++
 .../crypto/fipsmodule/ghashv8-armx64.S | 22 +-
 .../crypto/fipsmodule/sha1-armv8.S | 37 +-
 .../crypto/fipsmodule/sha256-armv8.S | 48 +-
 .../crypto/fipsmodule/sha512-armv8.S | 30 +-
 .../crypto/fipsmodule/vpaes-armv8.S | 1216 ++++++
 .../crypto/test/trampoline-armv8.S | 688 +++
 .../crypto/third_party/sike/asm/fp-armv8.S | 999 +++++
 .../linux-arm/crypto/chacha/chacha-armv4.S | 20 +
 .../linux-arm/crypto/fipsmodule/aes-armv4.S | 87 +-
 .../crypto/fipsmodule/aesv8-armx32.S | 18 +
 .../linux-arm/crypto/fipsmodule/armv4-mont.S | 35 +-
 .../linux-arm/crypto/fipsmodule/bsaes-armv7.S | 1114 +----
 .../linux-arm/crypto/fipsmodule/ghash-armv4.S | 28 +-
 .../crypto/fipsmodule/ghashv8-armx32.S | 20 +-
 .../crypto/fipsmodule/sha1-armv4-large.S | 16 +
 .../crypto/fipsmodule/sha256-armv4.S | 21 +
 .../crypto/fipsmodule/sha512-armv4.S | 20 +
 .../linux-arm/crypto/fipsmodule/vpaes-armv7.S | 1236 ++++++
 .../linux-arm/crypto/test/trampoline-armv4.S | 380 ++
 .../crypto/fipsmodule/aesp8-ppc.S | 3670 +++++++++++++++++
 .../crypto/fipsmodule/ghashp8-ppc.S | 587 +++
 .../linux-x86/crypto/chacha/chacha-x86.S | 7 +
 .../linux-x86/crypto/fipsmodule/aes-586.S | 67 +-
 .../linux-x86/crypto/fipsmodule/aesni-x86.S | 773 ++--
 .../linux-x86/crypto/fipsmodule/bn-586.S | 7 +
 .../linux-x86/crypto/fipsmodule/co-586.S | 7 +
 .../crypto/fipsmodule/ghash-ssse3-x86.S | 294 ++
 .../linux-x86/crypto/fipsmodule/ghash-x86.S | 7 +
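
For context on the OPENSSL_NO_ASM define that the boringssl.gyp change below adds for Windows x64: here is a minimal illustrative sketch (not BoringSSL source; the aes_backend() helper is hypothetical) of the kind of compile-time gating such a define produces. With the macro set, the perl-generated assembly files are left out of the build and only the portable C implementations are compiled, which is why HW AES is unavailable on Windows x64 after this roll.

/* sketch.c - illustrative only, not part of this patch or of BoringSSL.
 * Shows how an OPENSSL_NO_ASM-style define selects between an
 * assembly-backed fast path and a portable C fallback at compile time. */
#include <stdio.h>

#if !defined(OPENSSL_NO_ASM)
/* In a real build this branch would be backed by the generated .asm/.S
 * routines (e.g. AES-NI); here it is only a marker string. */
static const char *aes_backend(void) { return "assembly (HW AES possible)"; }
#else
/* With OPENSSL_NO_ASM defined (the new Windows x64 setting), only the
 * generic C implementations are compiled in. */
static const char *aes_backend(void) { return "portable C (no HW AES)"; }
#endif

int main(void) {
  printf("AES backend: %s\n", aes_backend());
  return 0;
}

Compiling the sketch with -DOPENSSL_NO_ASM (or /DOPENSSL_NO_ASM under MSVC) selects the portable-C branch, mirroring what the 'defines': [ 'OPENSSL_NO_ASM' ] entry in boringssl.gyp does for targets that depend on it.
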
.../linux-x86/crypto/fipsmodule/md5-586.S | 7 + .../linux-x86/crypto/fipsmodule/sha1-586.S | 7 + .../linux-x86/crypto/fipsmodule/sha256-586.S | 7 + .../linux-x86/crypto/fipsmodule/sha512-586.S | 7 + .../linux-x86/crypto/fipsmodule/vpaes-x86.S | 73 +- .../linux-x86/crypto/fipsmodule/x86-mont.S | 23 +- .../linux-x86/crypto/test/trampoline-x86.S | 206 + .../crypto/chacha/chacha-x86_64.S | 47 + .../crypto/cipher_extra/aes128gcmsiv-x86_64.S | 13 + .../cipher_extra/chacha20_poly1305_x86_64.S | 13 + .../crypto/fipsmodule/aes-x86_64.S | 185 +- .../crypto/fipsmodule/aesni-gcm-x86_64.S | 18 + .../crypto/fipsmodule/aesni-x86_64.S | 2101 +--------- .../crypto/fipsmodule/bsaes-x86_64.S | 2503 ----------- .../crypto/fipsmodule/ghash-ssse3-x86_64.S | 427 ++ .../crypto/fipsmodule/ghash-x86_64.S | 66 + .../crypto/fipsmodule/md5-x86_64.S | 31 + .../crypto/fipsmodule/p256-x86_64-asm.S | 2828 ++++++++++++- .../crypto/fipsmodule/p256_beeu-x86_64-asm.S | 343 ++ .../crypto/fipsmodule/rdrand-x86_64.S | 29 +- .../crypto/fipsmodule/rsaz-avx2.S | 44 +- .../crypto/fipsmodule/sha1-x86_64.S | 57 + .../crypto/fipsmodule/sha256-x86_64.S | 79 +- .../crypto/fipsmodule/sha512-x86_64.S | 1142 +---- .../crypto/fipsmodule/vpaes-x86_64.S | 299 ++ .../crypto/fipsmodule/x86_64-mont.S | 470 ++- .../crypto/fipsmodule/x86_64-mont5.S | 1417 ++++++- .../crypto/test/trampoline-x86_64.S | 518 +++ .../crypto/third_party/sike/asm/fp-x86_64.S | 1871 +++++++++ .../mac-x86/crypto/chacha/chacha-x86.S | 6 + .../mac-x86/crypto/fipsmodule/aes-586.S | 46 +- .../mac-x86/crypto/fipsmodule/aesni-x86.S | 728 ++-- .../mac-x86/crypto/fipsmodule/bn-586.S | 6 + .../mac-x86/crypto/fipsmodule/co-586.S | 6 + .../crypto/fipsmodule/ghash-ssse3-x86.S | 289 ++ .../mac-x86/crypto/fipsmodule/ghash-x86.S | 6 + .../mac-x86/crypto/fipsmodule/md5-586.S | 6 + .../mac-x86/crypto/fipsmodule/sha1-586.S | 6 + .../mac-x86/crypto/fipsmodule/sha256-586.S | 6 + .../mac-x86/crypto/fipsmodule/sha512-586.S | 6 + .../mac-x86/crypto/fipsmodule/vpaes-x86.S | 72 +- .../mac-x86/crypto/fipsmodule/x86-mont.S | 22 +- .../mac-x86/crypto/test/trampoline-x86.S | 169 + .../mac-x86_64/crypto/chacha/chacha-x86_64.S | 40 + .../crypto/cipher_extra/aes128gcmsiv-x86_64.S | 12 + .../cipher_extra/chacha20_poly1305_x86_64.S | 12 + .../mac-x86_64/crypto/fipsmodule/aes-x86_64.S | 154 +- .../crypto/fipsmodule/aesni-gcm-x86_64.S | 16 + .../crypto/fipsmodule/aesni-x86_64.S | 2067 +--------- .../crypto/fipsmodule/bsaes-x86_64.S | 2500 ----------- .../crypto/fipsmodule/ghash-ssse3-x86_64.S | 426 ++ .../crypto/fipsmodule/ghash-x86_64.S | 53 + .../mac-x86_64/crypto/fipsmodule/md5-x86_64.S | 25 + .../crypto/fipsmodule/p256-x86_64-asm.S | 2753 ++++++++++++- .../crypto/fipsmodule/p256_beeu-x86_64-asm.S | 328 ++ .../crypto/fipsmodule/rdrand-x86_64.S | 28 +- .../mac-x86_64/crypto/fipsmodule/rsaz-avx2.S | 42 +- .../crypto/fipsmodule/sha1-x86_64.S | 56 + .../crypto/fipsmodule/sha256-x86_64.S | 78 +- .../crypto/fipsmodule/sha512-x86_64.S | 1141 +---- .../crypto/fipsmodule/vpaes-x86_64.S | 296 ++ .../crypto/fipsmodule/x86_64-mont.S | 468 ++- .../crypto/fipsmodule/x86_64-mont5.S | 1416 ++++++- .../crypto/test/trampoline-x86_64.S | 513 +++ .../crypto/third_party/sike/asm/fp-x86_64.S | 1869 +++++++++ .../third_party/boringssl/roll_boringssl.py | 7 +- .../win-x86/crypto/chacha/chacha-x86.asm | 6 + .../win-x86/crypto/fipsmodule/aes-586.asm | 36 +- .../win-x86/crypto/fipsmodule/aesni-x86.asm | 707 ++-- .../win-x86/crypto/fipsmodule/bn-586.asm | 6 + .../win-x86/crypto/fipsmodule/co-586.asm | 6 + 
.../crypto/fipsmodule/ghash-ssse3-x86.asm | 300 ++ .../win-x86/crypto/fipsmodule/ghash-x86.asm | 6 + .../win-x86/crypto/fipsmodule/md5-586.asm | 6 + .../win-x86/crypto/fipsmodule/sha1-586.asm | 6 + .../win-x86/crypto/fipsmodule/sha256-586.asm | 6 + .../win-x86/crypto/fipsmodule/sha512-586.asm | 6 + .../win-x86/crypto/fipsmodule/vpaes-x86.asm | 73 +- .../win-x86/crypto/fipsmodule/x86-mont.asm | 22 +- .../win-x86/crypto/test/trampoline-x86.asm | 164 + .../crypto/chacha/chacha-x86_64.asm | 35 + .../cipher_extra/aes128gcmsiv-x86_64.asm | 7 + .../cipher_extra/chacha20_poly1305_x86_64.asm | 9 + .../crypto/fipsmodule/aes-x86_64.asm | 189 +- .../crypto/fipsmodule/aesni-gcm-x86_64.asm | 11 + .../crypto/fipsmodule/aesni-x86_64.asm | 2386 +---------- .../crypto/fipsmodule/bsaes-x86_64.asm | 2744 ------------ .../crypto/fipsmodule/ghash-ssse3-x86_64.asm | 495 +++ .../crypto/fipsmodule/ghash-x86_64.asm | 48 + .../crypto/fipsmodule/md5-x86_64.asm | 20 + .../crypto/fipsmodule/p256-x86_64-asm.asm | 3121 +++++++++++++- .../fipsmodule/p256_beeu-x86_64-asm.asm | 339 ++ .../crypto/fipsmodule/rdrand-x86_64.asm | 50 +- .../crypto/fipsmodule/rsaz-avx2.asm | 36 +- .../crypto/fipsmodule/sha1-x86_64.asm | 51 + .../crypto/fipsmodule/sha256-x86_64.asm | 73 +- .../crypto/fipsmodule/sha512-x86_64.asm | 1166 +----- .../crypto/fipsmodule/vpaes-x86_64.asm | 335 ++ .../crypto/fipsmodule/x86_64-mont.asm | 485 ++- .../crypto/fipsmodule/x86_64-mont5.asm | 1455 ++++++- .../crypto/test/trampoline-x86_64.asm | 682 +++ .../crypto/third_party/sike/asm/fp-x86_64.asm | 1951 +++++++++ 163 files changed, 65715 insertions(+), 20886 deletions(-) create mode 100644 packager/third_party/boringssl/ios-aarch64/crypto/chacha/chacha-armv8.S create mode 100644 packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/aesv8-armx64.S create mode 100644 packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/armv8-mont.S create mode 100644 packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/ghash-neon-armv8.S create mode 100644 packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/ghashv8-armx64.S create mode 100644 packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/sha1-armv8.S create mode 100644 packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/sha256-armv8.S create mode 100644 packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/sha512-armv8.S create mode 100644 packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/vpaes-armv8.S create mode 100644 packager/third_party/boringssl/ios-aarch64/crypto/test/trampoline-armv8.S create mode 100644 packager/third_party/boringssl/ios-aarch64/crypto/third_party/sike/asm/fp-armv8.S create mode 100644 packager/third_party/boringssl/ios-arm/crypto/chacha/chacha-armv4.S create mode 100644 packager/third_party/boringssl/ios-arm/crypto/fipsmodule/aes-armv4.S create mode 100644 packager/third_party/boringssl/ios-arm/crypto/fipsmodule/aesv8-armx32.S create mode 100644 packager/third_party/boringssl/ios-arm/crypto/fipsmodule/armv4-mont.S create mode 100644 packager/third_party/boringssl/ios-arm/crypto/fipsmodule/bsaes-armv7.S create mode 100644 packager/third_party/boringssl/ios-arm/crypto/fipsmodule/ghash-armv4.S create mode 100644 packager/third_party/boringssl/ios-arm/crypto/fipsmodule/ghashv8-armx32.S create mode 100644 packager/third_party/boringssl/ios-arm/crypto/fipsmodule/sha1-armv4-large.S create mode 100644 packager/third_party/boringssl/ios-arm/crypto/fipsmodule/sha256-armv4.S create mode 100644 
packager/third_party/boringssl/ios-arm/crypto/fipsmodule/sha512-armv4.S create mode 100644 packager/third_party/boringssl/ios-arm/crypto/fipsmodule/vpaes-armv7.S create mode 100644 packager/third_party/boringssl/ios-arm/crypto/test/trampoline-armv4.S create mode 100644 packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S create mode 100644 packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/vpaes-armv8.S create mode 100644 packager/third_party/boringssl/linux-aarch64/crypto/test/trampoline-armv8.S create mode 100644 packager/third_party/boringssl/linux-aarch64/crypto/third_party/sike/asm/fp-armv8.S create mode 100644 packager/third_party/boringssl/linux-arm/crypto/fipsmodule/vpaes-armv7.S create mode 100644 packager/third_party/boringssl/linux-arm/crypto/test/trampoline-armv4.S create mode 100644 packager/third_party/boringssl/linux-ppc64le/crypto/fipsmodule/aesp8-ppc.S create mode 100644 packager/third_party/boringssl/linux-ppc64le/crypto/fipsmodule/ghashp8-ppc.S create mode 100644 packager/third_party/boringssl/linux-x86/crypto/fipsmodule/ghash-ssse3-x86.S create mode 100644 packager/third_party/boringssl/linux-x86/crypto/test/trampoline-x86.S delete mode 100644 packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S create mode 100644 packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S create mode 100644 packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S create mode 100644 packager/third_party/boringssl/linux-x86_64/crypto/test/trampoline-x86_64.S create mode 100644 packager/third_party/boringssl/linux-x86_64/crypto/third_party/sike/asm/fp-x86_64.S create mode 100644 packager/third_party/boringssl/mac-x86/crypto/fipsmodule/ghash-ssse3-x86.S create mode 100644 packager/third_party/boringssl/mac-x86/crypto/test/trampoline-x86.S delete mode 100644 packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S create mode 100644 packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S create mode 100644 packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S create mode 100644 packager/third_party/boringssl/mac-x86_64/crypto/test/trampoline-x86_64.S create mode 100644 packager/third_party/boringssl/mac-x86_64/crypto/third_party/sike/asm/fp-x86_64.S create mode 100644 packager/third_party/boringssl/win-x86/crypto/fipsmodule/ghash-ssse3-x86.asm create mode 100644 packager/third_party/boringssl/win-x86/crypto/test/trampoline-x86.asm delete mode 100644 packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm create mode 100644 packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.asm create mode 100644 packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.asm create mode 100644 packager/third_party/boringssl/win-x86_64/crypto/test/trampoline-x86_64.asm create mode 100644 packager/third_party/boringssl/win-x86_64/crypto/third_party/sike/asm/fp-x86_64.asm diff --git a/DEPS b/DEPS index 21025f25e0..dcf924f9fc 100644 --- a/DEPS +++ b/DEPS @@ -43,7 +43,7 @@ deps = { # Make sure the version matches the one in # src/packager/third_party/boringssl, which contains perl generated files. 
"src/packager/third_party/boringssl/src": - Var("github") + "/google/boringssl@fc9c67599d9bdeb2e0467085133b81a8e28f77a4", + Var("github") + "/google/boringssl@76918d016414bf1d71a86d28239566fbcf8aacf0", "src/packager/third_party/curl/source": Var("github") + "/curl/curl@62c07b5743490ce373910f469abc8cdc759bec2b", #7.57.0 diff --git a/packager/third_party/boringssl/BUILD.generated.gni b/packager/third_party/boringssl/BUILD.generated.gni index f59afac5c3..56bc2574ea 100644 --- a/packager/third_party/boringssl/BUILD.generated.gni +++ b/packager/third_party/boringssl/BUILD.generated.gni @@ -57,16 +57,18 @@ crypto_sources = [ "src/crypto/bytestring/cbb.c", "src/crypto/bytestring/cbs.c", "src/crypto/bytestring/internal.h", + "src/crypto/bytestring/unicode.c", "src/crypto/chacha/chacha.c", + "src/crypto/chacha/internal.h", "src/crypto/cipher_extra/cipher_extra.c", "src/crypto/cipher_extra/derive_key.c", + "src/crypto/cipher_extra/e_aesccm.c", "src/crypto/cipher_extra/e_aesctrhmac.c", "src/crypto/cipher_extra/e_aesgcmsiv.c", "src/crypto/cipher_extra/e_chacha20poly1305.c", "src/crypto/cipher_extra/e_null.c", "src/crypto/cipher_extra/e_rc2.c", "src/crypto/cipher_extra/e_rc4.c", - "src/crypto/cipher_extra/e_ssl3.c", "src/crypto/cipher_extra/e_tls.c", "src/crypto/cipher_extra/internal.h", "src/crypto/cipher_extra/tls_cbc.c", @@ -74,14 +76,15 @@ crypto_sources = [ "src/crypto/conf/conf.c", "src/crypto/conf/conf_def.h", "src/crypto/conf/internal.h", + "src/crypto/cpu-aarch64-fuchsia.c", "src/crypto/cpu-aarch64-linux.c", "src/crypto/cpu-arm-linux.c", + "src/crypto/cpu-arm-linux.h", "src/crypto/cpu-arm.c", "src/crypto/cpu-intel.c", "src/crypto/cpu-ppc64le.c", "src/crypto/crypto.c", "src/crypto/curve25519/spake25519.c", - "src/crypto/curve25519/x25519-x86_64.c", "src/crypto/dh/check.c", "src/crypto/dh/dh.c", "src/crypto/dh/dh_asn1.c", @@ -90,7 +93,8 @@ crypto_sources = [ "src/crypto/dsa/dsa.c", "src/crypto/dsa/dsa_asn1.c", "src/crypto/ec_extra/ec_asn1.c", - "src/crypto/ecdh/ecdh.c", + "src/crypto/ec_extra/ec_derive.c", + "src/crypto/ecdh_extra/ecdh_extra.c", "src/crypto/ecdsa_extra/ecdsa_asn1.c", "src/crypto/engine/engine.c", "src/crypto/err/err.c", @@ -107,6 +111,8 @@ crypto_sources = [ "src/crypto/evp/p_ed25519_asn1.c", "src/crypto/evp/p_rsa.c", "src/crypto/evp/p_rsa_asn1.c", + "src/crypto/evp/p_x25519.c", + "src/crypto/evp/p_x25519_asn1.c", "src/crypto/evp/pbkdf.c", "src/crypto/evp/print.c", "src/crypto/evp/scrypt.c", @@ -124,11 +130,17 @@ crypto_sources = [ "src/crypto/fipsmodule/ec/internal.h", "src/crypto/fipsmodule/ec/p256-x86_64-table.h", "src/crypto/fipsmodule/ec/p256-x86_64.h", + "src/crypto/fipsmodule/fips_shared_support.c", "src/crypto/fipsmodule/is_fips.c", + "src/crypto/fipsmodule/md5/internal.h", "src/crypto/fipsmodule/modes/internal.h", "src/crypto/fipsmodule/rand/internal.h", "src/crypto/fipsmodule/rsa/internal.h", + "src/crypto/fipsmodule/sha/internal.h", + "src/crypto/fipsmodule/tls/internal.h", "src/crypto/hkdf/hkdf.c", + "src/crypto/hrss/hrss.c", + "src/crypto/hrss/internal.h", "src/crypto/internal.h", "src/crypto/lhash/lhash.c", "src/crypto/mem.c", @@ -165,6 +177,8 @@ crypto_sources = [ "src/crypto/refcount_c11.c", "src/crypto/refcount_lock.c", "src/crypto/rsa_extra/rsa_asn1.c", + "src/crypto/rsa_extra/rsa_print.c", + "src/crypto/siphash/siphash.c", "src/crypto/stack/stack.c", "src/crypto/thread.c", "src/crypto/thread_none.c", @@ -223,6 +237,7 @@ crypto_sources = [ "src/crypto/x509/x_x509.c", "src/crypto/x509/x_x509a.c", "src/crypto/x509v3/ext_dat.h", + "src/crypto/x509v3/internal.h", 
"src/crypto/x509v3/pcy_cache.c", "src/crypto/x509v3/pcy_data.c", "src/crypto/x509v3/pcy_int.h", @@ -246,6 +261,7 @@ crypto_sources = [ "src/crypto/x509v3/v3_int.c", "src/crypto/x509v3/v3_lib.c", "src/crypto/x509v3/v3_ncons.c", + "src/crypto/x509v3/v3_ocsp.c", "src/crypto/x509v3/v3_pci.c", "src/crypto/x509v3/v3_pcia.c", "src/crypto/x509v3/v3_pcons.c", @@ -256,6 +272,25 @@ crypto_sources = [ "src/crypto/x509v3/v3_skey.c", "src/crypto/x509v3/v3_sxnet.c", "src/crypto/x509v3/v3_utl.c", + "src/third_party/fiat/curve25519.c", + "src/third_party/fiat/curve25519_32.h", + "src/third_party/fiat/curve25519_64.h", + "src/third_party/fiat/curve25519_tables.h", + "src/third_party/fiat/internal.h", + "src/third_party/fiat/p256_32.h", + "src/third_party/fiat/p256_64.h", + "src/third_party/sike/asm/fp_generic.c", + "src/third_party/sike/curve_params.c", + "src/third_party/sike/fpx.c", + "src/third_party/sike/fpx.h", + "src/third_party/sike/isogeny.c", + "src/third_party/sike/isogeny.h", + "src/third_party/sike/sike.c", + "src/third_party/sike/sike.h", + "src/third_party/sike/utils.h", +] + +crypto_headers = [ "src/include/openssl/aead.h", "src/include/openssl/aes.h", "src/include/openssl/arm_arch.h", @@ -282,6 +317,7 @@ crypto_sources = [ "src/include/openssl/dh.h", "src/include/openssl/digest.h", "src/include/openssl/dsa.h", + "src/include/openssl/e_os2.h", "src/include/openssl/ec.h", "src/include/openssl/ec_key.h", "src/include/openssl/ecdh.h", @@ -292,9 +328,9 @@ crypto_sources = [ "src/include/openssl/ex_data.h", "src/include/openssl/hkdf.h", "src/include/openssl/hmac.h", + "src/include/openssl/hrss.h", "src/include/openssl/is_boringssl.h", "src/include/openssl/lhash.h", - "src/include/openssl/lhash_macros.h", "src/include/openssl/md4.h", "src/include/openssl/md5.h", "src/include/openssl/mem.h", @@ -317,31 +353,25 @@ crypto_sources = [ "src/include/openssl/rsa.h", "src/include/openssl/safestack.h", "src/include/openssl/sha.h", + "src/include/openssl/siphash.h", "src/include/openssl/span.h", - "src/include/openssl/srtp.h", "src/include/openssl/stack.h", "src/include/openssl/thread.h", "src/include/openssl/type_check.h", "src/include/openssl/x509.h", "src/include/openssl/x509_vfy.h", "src/include/openssl/x509v3.h", - "src/third_party/fiat/curve25519.c", - "src/third_party/fiat/internal.h", ] ssl_sources = [ - "src/include/openssl/dtls1.h", - "src/include/openssl/ssl.h", - "src/include/openssl/ssl3.h", - "src/include/openssl/tls1.h", "src/ssl/bio_ssl.cc", - "src/ssl/custom_extensions.cc", "src/ssl/d1_both.cc", "src/ssl/d1_lib.cc", "src/ssl/d1_pkt.cc", "src/ssl/d1_srtp.cc", "src/ssl/dtls_method.cc", "src/ssl/dtls_record.cc", + "src/ssl/handoff.cc", "src/ssl/handshake.cc", "src/ssl/handshake_client.cc", "src/ssl/handshake_server.cc", @@ -373,14 +403,26 @@ ssl_sources = [ "src/ssl/tls_record.cc", ] +ssl_headers = [ + "src/include/openssl/dtls1.h", + "src/include/openssl/srtp.h", + "src/include/openssl/ssl.h", + "src/include/openssl/ssl3.h", + "src/include/openssl/tls1.h", +] + crypto_sources_ios_aarch64 = [ "ios-aarch64/crypto/chacha/chacha-armv8.S", "ios-aarch64/crypto/fipsmodule/aesv8-armx64.S", "ios-aarch64/crypto/fipsmodule/armv8-mont.S", + "ios-aarch64/crypto/fipsmodule/ghash-neon-armv8.S", "ios-aarch64/crypto/fipsmodule/ghashv8-armx64.S", "ios-aarch64/crypto/fipsmodule/sha1-armv8.S", "ios-aarch64/crypto/fipsmodule/sha256-armv8.S", "ios-aarch64/crypto/fipsmodule/sha512-armv8.S", + "ios-aarch64/crypto/fipsmodule/vpaes-armv8.S", + "ios-aarch64/crypto/test/trampoline-armv8.S", + 
"ios-aarch64/crypto/third_party/sike/asm/fp-armv8.S", ] crypto_sources_ios_arm = [ @@ -394,16 +436,22 @@ crypto_sources_ios_arm = [ "ios-arm/crypto/fipsmodule/sha1-armv4-large.S", "ios-arm/crypto/fipsmodule/sha256-armv4.S", "ios-arm/crypto/fipsmodule/sha512-armv4.S", + "ios-arm/crypto/fipsmodule/vpaes-armv7.S", + "ios-arm/crypto/test/trampoline-armv4.S", ] crypto_sources_linux_aarch64 = [ "linux-aarch64/crypto/chacha/chacha-armv8.S", "linux-aarch64/crypto/fipsmodule/aesv8-armx64.S", "linux-aarch64/crypto/fipsmodule/armv8-mont.S", + "linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S", "linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S", "linux-aarch64/crypto/fipsmodule/sha1-armv8.S", "linux-aarch64/crypto/fipsmodule/sha256-armv8.S", "linux-aarch64/crypto/fipsmodule/sha512-armv8.S", + "linux-aarch64/crypto/fipsmodule/vpaes-armv8.S", + "linux-aarch64/crypto/test/trampoline-armv8.S", + "linux-aarch64/crypto/third_party/sike/asm/fp-armv8.S", ] crypto_sources_linux_arm = [ @@ -417,6 +465,8 @@ crypto_sources_linux_arm = [ "linux-arm/crypto/fipsmodule/sha1-armv4-large.S", "linux-arm/crypto/fipsmodule/sha256-armv4.S", "linux-arm/crypto/fipsmodule/sha512-armv4.S", + "linux-arm/crypto/fipsmodule/vpaes-armv7.S", + "linux-arm/crypto/test/trampoline-armv4.S", "src/crypto/curve25519/asm/x25519-asm-arm.S", "src/crypto/poly1305/poly1305_arm_asm.S", ] @@ -432,6 +482,7 @@ crypto_sources_linux_x86 = [ "linux-x86/crypto/fipsmodule/aesni-x86.S", "linux-x86/crypto/fipsmodule/bn-586.S", "linux-x86/crypto/fipsmodule/co-586.S", + "linux-x86/crypto/fipsmodule/ghash-ssse3-x86.S", "linux-x86/crypto/fipsmodule/ghash-x86.S", "linux-x86/crypto/fipsmodule/md5-586.S", "linux-x86/crypto/fipsmodule/sha1-586.S", @@ -439,6 +490,7 @@ crypto_sources_linux_x86 = [ "linux-x86/crypto/fipsmodule/sha512-586.S", "linux-x86/crypto/fipsmodule/vpaes-x86.S", "linux-x86/crypto/fipsmodule/x86-mont.S", + "linux-x86/crypto/test/trampoline-x86.S", ] crypto_sources_linux_x86_64 = [ @@ -448,10 +500,11 @@ crypto_sources_linux_x86_64 = [ "linux-x86_64/crypto/fipsmodule/aes-x86_64.S", "linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S", "linux-x86_64/crypto/fipsmodule/aesni-x86_64.S", - "linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S", + "linux-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S", "linux-x86_64/crypto/fipsmodule/ghash-x86_64.S", "linux-x86_64/crypto/fipsmodule/md5-x86_64.S", "linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S", + "linux-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S", "linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S", "linux-x86_64/crypto/fipsmodule/rsaz-avx2.S", "linux-x86_64/crypto/fipsmodule/sha1-x86_64.S", @@ -460,7 +513,9 @@ crypto_sources_linux_x86_64 = [ "linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S", "linux-x86_64/crypto/fipsmodule/x86_64-mont.S", "linux-x86_64/crypto/fipsmodule/x86_64-mont5.S", - "src/crypto/curve25519/asm/x25519-asm-x86_64.S", + "linux-x86_64/crypto/test/trampoline-x86_64.S", + "linux-x86_64/crypto/third_party/sike/asm/fp-x86_64.S", + "src/crypto/hrss/asm/poly_rq_mul.S", ] crypto_sources_mac_x86 = [ @@ -469,6 +524,7 @@ crypto_sources_mac_x86 = [ "mac-x86/crypto/fipsmodule/aesni-x86.S", "mac-x86/crypto/fipsmodule/bn-586.S", "mac-x86/crypto/fipsmodule/co-586.S", + "mac-x86/crypto/fipsmodule/ghash-ssse3-x86.S", "mac-x86/crypto/fipsmodule/ghash-x86.S", "mac-x86/crypto/fipsmodule/md5-586.S", "mac-x86/crypto/fipsmodule/sha1-586.S", @@ -476,6 +532,7 @@ crypto_sources_mac_x86 = [ "mac-x86/crypto/fipsmodule/sha512-586.S", "mac-x86/crypto/fipsmodule/vpaes-x86.S", "mac-x86/crypto/fipsmodule/x86-mont.S", + 
"mac-x86/crypto/test/trampoline-x86.S", ] crypto_sources_mac_x86_64 = [ @@ -485,10 +542,11 @@ crypto_sources_mac_x86_64 = [ "mac-x86_64/crypto/fipsmodule/aes-x86_64.S", "mac-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S", "mac-x86_64/crypto/fipsmodule/aesni-x86_64.S", - "mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S", + "mac-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S", "mac-x86_64/crypto/fipsmodule/ghash-x86_64.S", "mac-x86_64/crypto/fipsmodule/md5-x86_64.S", "mac-x86_64/crypto/fipsmodule/p256-x86_64-asm.S", + "mac-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S", "mac-x86_64/crypto/fipsmodule/rdrand-x86_64.S", "mac-x86_64/crypto/fipsmodule/rsaz-avx2.S", "mac-x86_64/crypto/fipsmodule/sha1-x86_64.S", @@ -497,7 +555,8 @@ crypto_sources_mac_x86_64 = [ "mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S", "mac-x86_64/crypto/fipsmodule/x86_64-mont.S", "mac-x86_64/crypto/fipsmodule/x86_64-mont5.S", - "src/crypto/curve25519/asm/x25519-asm-x86_64.S", + "mac-x86_64/crypto/test/trampoline-x86_64.S", + "mac-x86_64/crypto/third_party/sike/asm/fp-x86_64.S", ] crypto_sources_win_x86 = [ @@ -506,6 +565,7 @@ crypto_sources_win_x86 = [ "win-x86/crypto/fipsmodule/aesni-x86.asm", "win-x86/crypto/fipsmodule/bn-586.asm", "win-x86/crypto/fipsmodule/co-586.asm", + "win-x86/crypto/fipsmodule/ghash-ssse3-x86.asm", "win-x86/crypto/fipsmodule/ghash-x86.asm", "win-x86/crypto/fipsmodule/md5-586.asm", "win-x86/crypto/fipsmodule/sha1-586.asm", @@ -513,6 +573,7 @@ crypto_sources_win_x86 = [ "win-x86/crypto/fipsmodule/sha512-586.asm", "win-x86/crypto/fipsmodule/vpaes-x86.asm", "win-x86/crypto/fipsmodule/x86-mont.asm", + "win-x86/crypto/test/trampoline-x86.asm", ] crypto_sources_win_x86_64 = [ @@ -522,10 +583,11 @@ crypto_sources_win_x86_64 = [ "win-x86_64/crypto/fipsmodule/aes-x86_64.asm", "win-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.asm", "win-x86_64/crypto/fipsmodule/aesni-x86_64.asm", - "win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm", + "win-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.asm", "win-x86_64/crypto/fipsmodule/ghash-x86_64.asm", "win-x86_64/crypto/fipsmodule/md5-x86_64.asm", "win-x86_64/crypto/fipsmodule/p256-x86_64-asm.asm", + "win-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.asm", "win-x86_64/crypto/fipsmodule/rdrand-x86_64.asm", "win-x86_64/crypto/fipsmodule/rsaz-avx2.asm", "win-x86_64/crypto/fipsmodule/sha1-x86_64.asm", @@ -534,15 +596,19 @@ crypto_sources_win_x86_64 = [ "win-x86_64/crypto/fipsmodule/vpaes-x86_64.asm", "win-x86_64/crypto/fipsmodule/x86_64-mont.asm", "win-x86_64/crypto/fipsmodule/x86_64-mont5.asm", + "win-x86_64/crypto/test/trampoline-x86_64.asm", + "win-x86_64/crypto/third_party/sike/asm/fp-x86_64.asm", ] fuzzers = [ + "arm_cpuinfo", "bn_div", "bn_mod_exp", "cert", "client", "dtls_client", "dtls_server", + "pkcs12", "pkcs8", "privkey", "read_pem", diff --git a/packager/third_party/boringssl/BUILD.generated_tests.gni b/packager/third_party/boringssl/BUILD.generated_tests.gni index 44d653d281..c6d2db8a6b 100644 --- a/packager/third_party/boringssl/BUILD.generated_tests.gni +++ b/packager/third_party/boringssl/BUILD.generated_tests.gni @@ -5,21 +5,28 @@ # This file is created by generate_build_files.py. Do not edit manually. 
test_support_sources = [ + "src/crypto/test/abi_test.h", "src/crypto/test/file_test.cc", "src/crypto/test/file_test.h", "src/crypto/test/gtest_main.h", "src/crypto/test/malloc.cc", "src/crypto/test/test_util.cc", "src/crypto/test/test_util.h", + "src/crypto/test/wycheproof_util.cc", + "src/crypto/test/wycheproof_util.h", "src/ssl/test/async_bio.h", "src/ssl/test/fuzzer.h", "src/ssl/test/fuzzer_tags.h", + "src/ssl/test/handshake_util.h", "src/ssl/test/packeted_bio.h", + "src/ssl/test/settings_writer.h", "src/ssl/test/test_config.h", + "src/ssl/test/test_state.h", ] crypto_test_sources = [ "crypto_test_data.cc", + "src/crypto/abi_self_test.cc", "src/crypto/asn1/asn1_test.cc", "src/crypto/base64/base64_test.cc", "src/crypto/bio/bio_test.cc", @@ -31,13 +38,14 @@ crypto_test_sources = [ "src/crypto/cmac/cmac_test.cc", "src/crypto/compiler_test.cc", "src/crypto/constant_time_test.cc", + "src/crypto/cpu-arm-linux_test.cc", "src/crypto/curve25519/ed25519_test.cc", "src/crypto/curve25519/spake25519_test.cc", "src/crypto/curve25519/x25519_test.cc", "src/crypto/dh/dh_test.cc", "src/crypto/digest_extra/digest_test.cc", "src/crypto/dsa/dsa_test.cc", - "src/crypto/ecdh/ecdh_test.cc", + "src/crypto/ecdh_extra/ecdh_test.cc", "src/crypto/err/err_test.cc", "src/crypto/evp/evp_extra_test.cc", "src/crypto/evp/evp_test.cc", @@ -48,29 +56,133 @@ crypto_test_sources = [ "src/crypto/fipsmodule/ec/ec_test.cc", "src/crypto/fipsmodule/ec/p256-x86_64_test.cc", "src/crypto/fipsmodule/ecdsa/ecdsa_test.cc", + "src/crypto/fipsmodule/md5/md5_test.cc", "src/crypto/fipsmodule/modes/gcm_test.cc", "src/crypto/fipsmodule/rand/ctrdrbg_test.cc", + "src/crypto/fipsmodule/sha/sha_test.cc", "src/crypto/hkdf/hkdf_test.cc", "src/crypto/hmac_extra/hmac_test.cc", + "src/crypto/hrss/hrss_test.cc", + "src/crypto/impl_dispatch_test.cc", "src/crypto/lhash/lhash_test.cc", "src/crypto/obj/obj_test.cc", + "src/crypto/pem/pem_test.cc", "src/crypto/pkcs7/pkcs7_test.cc", "src/crypto/pkcs8/pkcs12_test.cc", "src/crypto/pkcs8/pkcs8_test.cc", "src/crypto/poly1305/poly1305_test.cc", "src/crypto/pool/pool_test.cc", + "src/crypto/rand_extra/rand_test.cc", "src/crypto/refcount_test.cc", "src/crypto/rsa_extra/rsa_test.cc", + "src/crypto/self_test.cc", + "src/crypto/siphash/siphash_test.cc", + "src/crypto/stack/stack_test.cc", + "src/crypto/test/abi_test.cc", "src/crypto/test/file_test_gtest.cc", "src/crypto/test/gtest_main.cc", "src/crypto/thread_test.cc", "src/crypto/x509/x509_test.cc", + "src/crypto/x509/x509_time_test.cc", "src/crypto/x509v3/tab_test.cc", "src/crypto/x509v3/v3name_test.cc", ] +crypto_test_data = [ + "src/crypto/cipher_extra/test/aes_128_cbc_sha1_tls_implicit_iv_tests.txt", + "src/crypto/cipher_extra/test/aes_128_cbc_sha1_tls_tests.txt", + "src/crypto/cipher_extra/test/aes_128_cbc_sha256_tls_tests.txt", + "src/crypto/cipher_extra/test/aes_128_ccm_bluetooth_8_tests.txt", + "src/crypto/cipher_extra/test/aes_128_ccm_bluetooth_tests.txt", + "src/crypto/cipher_extra/test/aes_128_ctr_hmac_sha256.txt", + "src/crypto/cipher_extra/test/aes_128_gcm_siv_tests.txt", + "src/crypto/cipher_extra/test/aes_128_gcm_tests.txt", + "src/crypto/cipher_extra/test/aes_192_gcm_tests.txt", + "src/crypto/cipher_extra/test/aes_256_cbc_sha1_tls_implicit_iv_tests.txt", + "src/crypto/cipher_extra/test/aes_256_cbc_sha1_tls_tests.txt", + "src/crypto/cipher_extra/test/aes_256_cbc_sha256_tls_tests.txt", + "src/crypto/cipher_extra/test/aes_256_cbc_sha384_tls_tests.txt", + "src/crypto/cipher_extra/test/aes_256_ctr_hmac_sha256.txt", + 
"src/crypto/cipher_extra/test/aes_256_gcm_siv_tests.txt", + "src/crypto/cipher_extra/test/aes_256_gcm_tests.txt", + "src/crypto/cipher_extra/test/chacha20_poly1305_tests.txt", + "src/crypto/cipher_extra/test/cipher_tests.txt", + "src/crypto/cipher_extra/test/des_ede3_cbc_sha1_tls_implicit_iv_tests.txt", + "src/crypto/cipher_extra/test/des_ede3_cbc_sha1_tls_tests.txt", + "src/crypto/cipher_extra/test/nist_cavp/aes_128_cbc.txt", + "src/crypto/cipher_extra/test/nist_cavp/aes_128_ctr.txt", + "src/crypto/cipher_extra/test/nist_cavp/aes_128_gcm.txt", + "src/crypto/cipher_extra/test/nist_cavp/aes_192_cbc.txt", + "src/crypto/cipher_extra/test/nist_cavp/aes_192_ctr.txt", + "src/crypto/cipher_extra/test/nist_cavp/aes_256_cbc.txt", + "src/crypto/cipher_extra/test/nist_cavp/aes_256_ctr.txt", + "src/crypto/cipher_extra/test/nist_cavp/aes_256_gcm.txt", + "src/crypto/cipher_extra/test/nist_cavp/tdes_cbc.txt", + "src/crypto/cipher_extra/test/nist_cavp/tdes_ecb.txt", + "src/crypto/cipher_extra/test/xchacha20_poly1305_tests.txt", + "src/crypto/cmac/cavp_3des_cmac_tests.txt", + "src/crypto/cmac/cavp_aes128_cmac_tests.txt", + "src/crypto/cmac/cavp_aes192_cmac_tests.txt", + "src/crypto/cmac/cavp_aes256_cmac_tests.txt", + "src/crypto/curve25519/ed25519_tests.txt", + "src/crypto/ecdh_extra/ecdh_tests.txt", + "src/crypto/evp/evp_tests.txt", + "src/crypto/evp/scrypt_tests.txt", + "src/crypto/fipsmodule/aes/aes_tests.txt", + "src/crypto/fipsmodule/bn/bn_tests.txt", + "src/crypto/fipsmodule/bn/miller_rabin_tests.txt", + "src/crypto/fipsmodule/ec/ec_scalar_base_mult_tests.txt", + "src/crypto/fipsmodule/ec/p256-x86_64_tests.txt", + "src/crypto/fipsmodule/ecdsa/ecdsa_sign_tests.txt", + "src/crypto/fipsmodule/ecdsa/ecdsa_verify_tests.txt", + "src/crypto/fipsmodule/modes/gcm_tests.txt", + "src/crypto/fipsmodule/rand/ctrdrbg_vectors.txt", + "src/crypto/hmac_extra/hmac_tests.txt", + "src/crypto/poly1305/poly1305_tests.txt", + "src/crypto/siphash/siphash_tests.txt", + "src/crypto/x509/many_constraints.pem", + "src/crypto/x509/many_names1.pem", + "src/crypto/x509/many_names2.pem", + "src/crypto/x509/many_names3.pem", + "src/crypto/x509/some_names1.pem", + "src/crypto/x509/some_names2.pem", + "src/crypto/x509/some_names3.pem", + "src/third_party/wycheproof_testvectors/aes_cbc_pkcs5_test.txt", + "src/third_party/wycheproof_testvectors/aes_cmac_test.txt", + "src/third_party/wycheproof_testvectors/aes_gcm_siv_test.txt", + "src/third_party/wycheproof_testvectors/aes_gcm_test.txt", + "src/third_party/wycheproof_testvectors/chacha20_poly1305_test.txt", + "src/third_party/wycheproof_testvectors/dsa_test.txt", + "src/third_party/wycheproof_testvectors/ecdh_secp224r1_test.txt", + "src/third_party/wycheproof_testvectors/ecdh_secp256r1_test.txt", + "src/third_party/wycheproof_testvectors/ecdh_secp384r1_test.txt", + "src/third_party/wycheproof_testvectors/ecdh_secp521r1_test.txt", + "src/third_party/wycheproof_testvectors/ecdsa_secp224r1_sha224_test.txt", + "src/third_party/wycheproof_testvectors/ecdsa_secp224r1_sha256_test.txt", + "src/third_party/wycheproof_testvectors/ecdsa_secp224r1_sha512_test.txt", + "src/third_party/wycheproof_testvectors/ecdsa_secp256r1_sha256_test.txt", + "src/third_party/wycheproof_testvectors/ecdsa_secp256r1_sha512_test.txt", + "src/third_party/wycheproof_testvectors/ecdsa_secp384r1_sha384_test.txt", + "src/third_party/wycheproof_testvectors/ecdsa_secp384r1_sha512_test.txt", + "src/third_party/wycheproof_testvectors/ecdsa_secp521r1_sha512_test.txt", + "src/third_party/wycheproof_testvectors/eddsa_test.txt", + 
"src/third_party/wycheproof_testvectors/kw_test.txt", + "src/third_party/wycheproof_testvectors/kwp_test.txt", + "src/third_party/wycheproof_testvectors/rsa_pss_2048_sha1_mgf1_20_test.txt", + "src/third_party/wycheproof_testvectors/rsa_pss_2048_sha256_mgf1_0_test.txt", + "src/third_party/wycheproof_testvectors/rsa_pss_2048_sha256_mgf1_32_test.txt", + "src/third_party/wycheproof_testvectors/rsa_pss_3072_sha256_mgf1_32_test.txt", + "src/third_party/wycheproof_testvectors/rsa_pss_4096_sha256_mgf1_32_test.txt", + "src/third_party/wycheproof_testvectors/rsa_pss_4096_sha512_mgf1_32_test.txt", + "src/third_party/wycheproof_testvectors/rsa_pss_misc_test.txt", + "src/third_party/wycheproof_testvectors/rsa_signature_test.txt", + "src/third_party/wycheproof_testvectors/x25519_test.txt", +] + ssl_test_sources = [ + "src/crypto/test/abi_test.cc", "src/crypto/test/gtest_main.cc", "src/ssl/span_test.cc", + "src/ssl/ssl_c_test.c", "src/ssl/ssl_test.cc", ] diff --git a/packager/third_party/boringssl/boringssl.gyp b/packager/third_party/boringssl/boringssl.gyp index 75d0829685..cd5d88c834 100644 --- a/packager/third_party/boringssl/boringssl.gyp +++ b/packager/third_party/boringssl/boringssl.gyp @@ -175,15 +175,10 @@ 'sources': [ '<@(boringssl_linux_x86_64_sources)' ], }], ['OS == "win"', { - 'sources': [ '<@(boringssl_win_x86_64_sources)' ], - # Windows' assembly is built with Yasm. The other platforms use - # the platform assembler. - 'variables': { - 'yasm_output_path': '<(SHARED_INTERMEDIATE_DIR)/third_party/boringssl', + # NOTES(kqyang): Somehow ASM fails to compile. Disable ASM. + 'direct_dependent_settings': { + 'defines': [ 'OPENSSL_NO_ASM' ], }, - 'includes': [ - '../yasm/yasm_compile.gypi', - ], }], ['OS != "mac" and OS != "linux" and OS != "win" and OS != "android"', { 'direct_dependent_settings': { diff --git a/packager/third_party/boringssl/boringssl.gypi b/packager/third_party/boringssl/boringssl.gypi index 17f7a6c5ab..e7cb8f0ba2 100644 --- a/packager/third_party/boringssl/boringssl.gypi +++ b/packager/third_party/boringssl/boringssl.gypi @@ -8,17 +8,18 @@ 'variables': { 'boringssl_ssl_sources': [ 'src/include/openssl/dtls1.h', + 'src/include/openssl/srtp.h', 'src/include/openssl/ssl.h', 'src/include/openssl/ssl3.h', 'src/include/openssl/tls1.h', 'src/ssl/bio_ssl.cc', - 'src/ssl/custom_extensions.cc', 'src/ssl/d1_both.cc', 'src/ssl/d1_lib.cc', 'src/ssl/d1_pkt.cc', 'src/ssl/d1_srtp.cc', 'src/ssl/dtls_method.cc', 'src/ssl/dtls_record.cc', + 'src/ssl/handoff.cc', 'src/ssl/handshake.cc', 'src/ssl/handshake_client.cc', 'src/ssl/handshake_server.cc', @@ -102,16 +103,18 @@ 'src/crypto/bytestring/cbb.c', 'src/crypto/bytestring/cbs.c', 'src/crypto/bytestring/internal.h', + 'src/crypto/bytestring/unicode.c', 'src/crypto/chacha/chacha.c', + 'src/crypto/chacha/internal.h', 'src/crypto/cipher_extra/cipher_extra.c', 'src/crypto/cipher_extra/derive_key.c', + 'src/crypto/cipher_extra/e_aesccm.c', 'src/crypto/cipher_extra/e_aesctrhmac.c', 'src/crypto/cipher_extra/e_aesgcmsiv.c', 'src/crypto/cipher_extra/e_chacha20poly1305.c', 'src/crypto/cipher_extra/e_null.c', 'src/crypto/cipher_extra/e_rc2.c', 'src/crypto/cipher_extra/e_rc4.c', - 'src/crypto/cipher_extra/e_ssl3.c', 'src/crypto/cipher_extra/e_tls.c', 'src/crypto/cipher_extra/internal.h', 'src/crypto/cipher_extra/tls_cbc.c', @@ -119,14 +122,15 @@ 'src/crypto/conf/conf.c', 'src/crypto/conf/conf_def.h', 'src/crypto/conf/internal.h', + 'src/crypto/cpu-aarch64-fuchsia.c', 'src/crypto/cpu-aarch64-linux.c', 'src/crypto/cpu-arm-linux.c', + 
'src/crypto/cpu-arm-linux.h', 'src/crypto/cpu-arm.c', 'src/crypto/cpu-intel.c', 'src/crypto/cpu-ppc64le.c', 'src/crypto/crypto.c', 'src/crypto/curve25519/spake25519.c', - 'src/crypto/curve25519/x25519-x86_64.c', 'src/crypto/dh/check.c', 'src/crypto/dh/dh.c', 'src/crypto/dh/dh_asn1.c', @@ -135,7 +139,8 @@ 'src/crypto/dsa/dsa.c', 'src/crypto/dsa/dsa_asn1.c', 'src/crypto/ec_extra/ec_asn1.c', - 'src/crypto/ecdh/ecdh.c', + 'src/crypto/ec_extra/ec_derive.c', + 'src/crypto/ecdh_extra/ecdh_extra.c', 'src/crypto/ecdsa_extra/ecdsa_asn1.c', 'src/crypto/engine/engine.c', 'src/crypto/err/err.c', @@ -152,6 +157,8 @@ 'src/crypto/evp/p_ed25519_asn1.c', 'src/crypto/evp/p_rsa.c', 'src/crypto/evp/p_rsa_asn1.c', + 'src/crypto/evp/p_x25519.c', + 'src/crypto/evp/p_x25519_asn1.c', 'src/crypto/evp/pbkdf.c', 'src/crypto/evp/print.c', 'src/crypto/evp/scrypt.c', @@ -169,11 +176,17 @@ 'src/crypto/fipsmodule/ec/internal.h', 'src/crypto/fipsmodule/ec/p256-x86_64-table.h', 'src/crypto/fipsmodule/ec/p256-x86_64.h', + 'src/crypto/fipsmodule/fips_shared_support.c', 'src/crypto/fipsmodule/is_fips.c', + 'src/crypto/fipsmodule/md5/internal.h', 'src/crypto/fipsmodule/modes/internal.h', 'src/crypto/fipsmodule/rand/internal.h', 'src/crypto/fipsmodule/rsa/internal.h', + 'src/crypto/fipsmodule/sha/internal.h', + 'src/crypto/fipsmodule/tls/internal.h', 'src/crypto/hkdf/hkdf.c', + 'src/crypto/hrss/hrss.c', + 'src/crypto/hrss/internal.h', 'src/crypto/internal.h', 'src/crypto/lhash/lhash.c', 'src/crypto/mem.c', @@ -210,6 +223,8 @@ 'src/crypto/refcount_c11.c', 'src/crypto/refcount_lock.c', 'src/crypto/rsa_extra/rsa_asn1.c', + 'src/crypto/rsa_extra/rsa_print.c', + 'src/crypto/siphash/siphash.c', 'src/crypto/stack/stack.c', 'src/crypto/thread.c', 'src/crypto/thread_none.c', @@ -268,6 +283,7 @@ 'src/crypto/x509/x_x509.c', 'src/crypto/x509/x_x509a.c', 'src/crypto/x509v3/ext_dat.h', + 'src/crypto/x509v3/internal.h', 'src/crypto/x509v3/pcy_cache.c', 'src/crypto/x509v3/pcy_data.c', 'src/crypto/x509v3/pcy_int.h', @@ -291,6 +307,7 @@ 'src/crypto/x509v3/v3_int.c', 'src/crypto/x509v3/v3_lib.c', 'src/crypto/x509v3/v3_ncons.c', + 'src/crypto/x509v3/v3_ocsp.c', 'src/crypto/x509v3/v3_pci.c', 'src/crypto/x509v3/v3_pcia.c', 'src/crypto/x509v3/v3_pcons.c', @@ -327,6 +344,7 @@ 'src/include/openssl/dh.h', 'src/include/openssl/digest.h', 'src/include/openssl/dsa.h', + 'src/include/openssl/e_os2.h', 'src/include/openssl/ec.h', 'src/include/openssl/ec_key.h', 'src/include/openssl/ecdh.h', @@ -337,9 +355,9 @@ 'src/include/openssl/ex_data.h', 'src/include/openssl/hkdf.h', 'src/include/openssl/hmac.h', + 'src/include/openssl/hrss.h', 'src/include/openssl/is_boringssl.h', 'src/include/openssl/lhash.h', - 'src/include/openssl/lhash_macros.h', 'src/include/openssl/md4.h', 'src/include/openssl/md5.h', 'src/include/openssl/mem.h', @@ -362,8 +380,8 @@ 'src/include/openssl/rsa.h', 'src/include/openssl/safestack.h', 'src/include/openssl/sha.h', + 'src/include/openssl/siphash.h', 'src/include/openssl/span.h', - 'src/include/openssl/srtp.h', 'src/include/openssl/stack.h', 'src/include/openssl/thread.h', 'src/include/openssl/type_check.h', @@ -371,16 +389,34 @@ 'src/include/openssl/x509_vfy.h', 'src/include/openssl/x509v3.h', 'src/third_party/fiat/curve25519.c', + 'src/third_party/fiat/curve25519_32.h', + 'src/third_party/fiat/curve25519_64.h', + 'src/third_party/fiat/curve25519_tables.h', 'src/third_party/fiat/internal.h', + 'src/third_party/fiat/p256_32.h', + 'src/third_party/fiat/p256_64.h', + 'src/third_party/sike/asm/fp_generic.c', + 
'src/third_party/sike/curve_params.c', + 'src/third_party/sike/fpx.c', + 'src/third_party/sike/fpx.h', + 'src/third_party/sike/isogeny.c', + 'src/third_party/sike/isogeny.h', + 'src/third_party/sike/sike.c', + 'src/third_party/sike/sike.h', + 'src/third_party/sike/utils.h', ], 'boringssl_ios_aarch64_sources': [ 'ios-aarch64/crypto/chacha/chacha-armv8.S', 'ios-aarch64/crypto/fipsmodule/aesv8-armx64.S', 'ios-aarch64/crypto/fipsmodule/armv8-mont.S', + 'ios-aarch64/crypto/fipsmodule/ghash-neon-armv8.S', 'ios-aarch64/crypto/fipsmodule/ghashv8-armx64.S', 'ios-aarch64/crypto/fipsmodule/sha1-armv8.S', 'ios-aarch64/crypto/fipsmodule/sha256-armv8.S', 'ios-aarch64/crypto/fipsmodule/sha512-armv8.S', + 'ios-aarch64/crypto/fipsmodule/vpaes-armv8.S', + 'ios-aarch64/crypto/test/trampoline-armv8.S', + 'ios-aarch64/crypto/third_party/sike/asm/fp-armv8.S', ], 'boringssl_ios_arm_sources': [ 'ios-arm/crypto/chacha/chacha-armv4.S', @@ -393,15 +429,21 @@ 'ios-arm/crypto/fipsmodule/sha1-armv4-large.S', 'ios-arm/crypto/fipsmodule/sha256-armv4.S', 'ios-arm/crypto/fipsmodule/sha512-armv4.S', + 'ios-arm/crypto/fipsmodule/vpaes-armv7.S', + 'ios-arm/crypto/test/trampoline-armv4.S', ], 'boringssl_linux_aarch64_sources': [ 'linux-aarch64/crypto/chacha/chacha-armv8.S', 'linux-aarch64/crypto/fipsmodule/aesv8-armx64.S', 'linux-aarch64/crypto/fipsmodule/armv8-mont.S', + 'linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S', 'linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S', 'linux-aarch64/crypto/fipsmodule/sha1-armv8.S', 'linux-aarch64/crypto/fipsmodule/sha256-armv8.S', 'linux-aarch64/crypto/fipsmodule/sha512-armv8.S', + 'linux-aarch64/crypto/fipsmodule/vpaes-armv8.S', + 'linux-aarch64/crypto/test/trampoline-armv8.S', + 'linux-aarch64/crypto/third_party/sike/asm/fp-armv8.S', ], 'boringssl_linux_arm_sources': [ 'linux-arm/crypto/chacha/chacha-armv4.S', @@ -414,6 +456,8 @@ 'linux-arm/crypto/fipsmodule/sha1-armv4-large.S', 'linux-arm/crypto/fipsmodule/sha256-armv4.S', 'linux-arm/crypto/fipsmodule/sha512-armv4.S', + 'linux-arm/crypto/fipsmodule/vpaes-armv7.S', + 'linux-arm/crypto/test/trampoline-armv4.S', 'src/crypto/curve25519/asm/x25519-asm-arm.S', 'src/crypto/poly1305/poly1305_arm_asm.S', ], @@ -427,6 +471,7 @@ 'linux-x86/crypto/fipsmodule/aesni-x86.S', 'linux-x86/crypto/fipsmodule/bn-586.S', 'linux-x86/crypto/fipsmodule/co-586.S', + 'linux-x86/crypto/fipsmodule/ghash-ssse3-x86.S', 'linux-x86/crypto/fipsmodule/ghash-x86.S', 'linux-x86/crypto/fipsmodule/md5-586.S', 'linux-x86/crypto/fipsmodule/sha1-586.S', @@ -434,6 +479,7 @@ 'linux-x86/crypto/fipsmodule/sha512-586.S', 'linux-x86/crypto/fipsmodule/vpaes-x86.S', 'linux-x86/crypto/fipsmodule/x86-mont.S', + 'linux-x86/crypto/test/trampoline-x86.S', ], 'boringssl_linux_x86_64_sources': [ 'linux-x86_64/crypto/chacha/chacha-x86_64.S', @@ -442,10 +488,11 @@ 'linux-x86_64/crypto/fipsmodule/aes-x86_64.S', 'linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S', 'linux-x86_64/crypto/fipsmodule/aesni-x86_64.S', - 'linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S', + 'linux-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S', 'linux-x86_64/crypto/fipsmodule/ghash-x86_64.S', 'linux-x86_64/crypto/fipsmodule/md5-x86_64.S', 'linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S', + 'linux-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S', 'linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S', 'linux-x86_64/crypto/fipsmodule/rsaz-avx2.S', 'linux-x86_64/crypto/fipsmodule/sha1-x86_64.S', @@ -454,7 +501,9 @@ 'linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S', 'linux-x86_64/crypto/fipsmodule/x86_64-mont.S', 
'linux-x86_64/crypto/fipsmodule/x86_64-mont5.S', - 'src/crypto/curve25519/asm/x25519-asm-x86_64.S', + 'linux-x86_64/crypto/test/trampoline-x86_64.S', + 'linux-x86_64/crypto/third_party/sike/asm/fp-x86_64.S', + 'src/crypto/hrss/asm/poly_rq_mul.S', ], 'boringssl_mac_x86_sources': [ 'mac-x86/crypto/chacha/chacha-x86.S', @@ -462,6 +511,7 @@ 'mac-x86/crypto/fipsmodule/aesni-x86.S', 'mac-x86/crypto/fipsmodule/bn-586.S', 'mac-x86/crypto/fipsmodule/co-586.S', + 'mac-x86/crypto/fipsmodule/ghash-ssse3-x86.S', 'mac-x86/crypto/fipsmodule/ghash-x86.S', 'mac-x86/crypto/fipsmodule/md5-586.S', 'mac-x86/crypto/fipsmodule/sha1-586.S', @@ -469,6 +519,7 @@ 'mac-x86/crypto/fipsmodule/sha512-586.S', 'mac-x86/crypto/fipsmodule/vpaes-x86.S', 'mac-x86/crypto/fipsmodule/x86-mont.S', + 'mac-x86/crypto/test/trampoline-x86.S', ], 'boringssl_mac_x86_64_sources': [ 'mac-x86_64/crypto/chacha/chacha-x86_64.S', @@ -477,10 +528,11 @@ 'mac-x86_64/crypto/fipsmodule/aes-x86_64.S', 'mac-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S', 'mac-x86_64/crypto/fipsmodule/aesni-x86_64.S', - 'mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S', + 'mac-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S', 'mac-x86_64/crypto/fipsmodule/ghash-x86_64.S', 'mac-x86_64/crypto/fipsmodule/md5-x86_64.S', 'mac-x86_64/crypto/fipsmodule/p256-x86_64-asm.S', + 'mac-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S', 'mac-x86_64/crypto/fipsmodule/rdrand-x86_64.S', 'mac-x86_64/crypto/fipsmodule/rsaz-avx2.S', 'mac-x86_64/crypto/fipsmodule/sha1-x86_64.S', @@ -489,7 +541,8 @@ 'mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S', 'mac-x86_64/crypto/fipsmodule/x86_64-mont.S', 'mac-x86_64/crypto/fipsmodule/x86_64-mont5.S', - 'src/crypto/curve25519/asm/x25519-asm-x86_64.S', + 'mac-x86_64/crypto/test/trampoline-x86_64.S', + 'mac-x86_64/crypto/third_party/sike/asm/fp-x86_64.S', ], 'boringssl_win_x86_sources': [ 'win-x86/crypto/chacha/chacha-x86.asm', @@ -497,6 +550,7 @@ 'win-x86/crypto/fipsmodule/aesni-x86.asm', 'win-x86/crypto/fipsmodule/bn-586.asm', 'win-x86/crypto/fipsmodule/co-586.asm', + 'win-x86/crypto/fipsmodule/ghash-ssse3-x86.asm', 'win-x86/crypto/fipsmodule/ghash-x86.asm', 'win-x86/crypto/fipsmodule/md5-586.asm', 'win-x86/crypto/fipsmodule/sha1-586.asm', @@ -504,6 +558,7 @@ 'win-x86/crypto/fipsmodule/sha512-586.asm', 'win-x86/crypto/fipsmodule/vpaes-x86.asm', 'win-x86/crypto/fipsmodule/x86-mont.asm', + 'win-x86/crypto/test/trampoline-x86.asm', ], 'boringssl_win_x86_64_sources': [ 'win-x86_64/crypto/chacha/chacha-x86_64.asm', @@ -512,10 +567,11 @@ 'win-x86_64/crypto/fipsmodule/aes-x86_64.asm', 'win-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.asm', 'win-x86_64/crypto/fipsmodule/aesni-x86_64.asm', - 'win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm', + 'win-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.asm', 'win-x86_64/crypto/fipsmodule/ghash-x86_64.asm', 'win-x86_64/crypto/fipsmodule/md5-x86_64.asm', 'win-x86_64/crypto/fipsmodule/p256-x86_64-asm.asm', + 'win-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.asm', 'win-x86_64/crypto/fipsmodule/rdrand-x86_64.asm', 'win-x86_64/crypto/fipsmodule/rsaz-avx2.asm', 'win-x86_64/crypto/fipsmodule/sha1-x86_64.asm', @@ -524,6 +580,8 @@ 'win-x86_64/crypto/fipsmodule/vpaes-x86_64.asm', 'win-x86_64/crypto/fipsmodule/x86_64-mont.asm', 'win-x86_64/crypto/fipsmodule/x86_64-mont5.asm', + 'win-x86_64/crypto/test/trampoline-x86_64.asm', + 'win-x86_64/crypto/third_party/sike/asm/fp-x86_64.asm', ], } } diff --git a/packager/third_party/boringssl/err_data.c b/packager/third_party/boringssl/err_data.c index 931a44b643..7fcbc56ac0 100644 --- 
a/packager/third_party/boringssl/err_data.c +++ b/packager/third_party/boringssl/err_data.c @@ -18,716 +18,159 @@ #include #include +OPENSSL_STATIC_ASSERT(ERR_LIB_NONE == 1, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_SYS == 2, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_BN == 3, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_RSA == 4, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_DH == 5, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_EVP == 6, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_BUF == 7, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_OBJ == 8, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_PEM == 9, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_DSA == 10, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_X509 == 11, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_ASN1 == 12, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_CONF == 13, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_CRYPTO == 14, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_EC == 15, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_SSL == 16, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_BIO == 17, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_PKCS7 == 18, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_PKCS8 == 19, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_X509V3 == 20, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_RAND == 21, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_ENGINE == 22, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_OCSP == 23, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_UI == 24, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_COMP == 25, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_ECDSA == 26, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_ECDH == 27, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_HMAC == 28, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_DIGEST == 29, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_CIPHER == 30, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_HKDF == 31, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_LIB_USER == 32, "library value changed"); +OPENSSL_STATIC_ASSERT(ERR_NUM_LIBS == 33, "number of libraries changed"); -OPENSSL_COMPILE_ASSERT(ERR_LIB_NONE == 1, library_values_changed_1); -OPENSSL_COMPILE_ASSERT(ERR_LIB_SYS == 2, library_values_changed_2); -OPENSSL_COMPILE_ASSERT(ERR_LIB_BN == 3, library_values_changed_3); -OPENSSL_COMPILE_ASSERT(ERR_LIB_RSA == 4, library_values_changed_4); -OPENSSL_COMPILE_ASSERT(ERR_LIB_DH == 5, library_values_changed_5); -OPENSSL_COMPILE_ASSERT(ERR_LIB_EVP == 6, library_values_changed_6); -OPENSSL_COMPILE_ASSERT(ERR_LIB_BUF == 7, library_values_changed_7); -OPENSSL_COMPILE_ASSERT(ERR_LIB_OBJ == 8, library_values_changed_8); -OPENSSL_COMPILE_ASSERT(ERR_LIB_PEM == 9, library_values_changed_9); -OPENSSL_COMPILE_ASSERT(ERR_LIB_DSA == 10, library_values_changed_10); -OPENSSL_COMPILE_ASSERT(ERR_LIB_X509 == 11, library_values_changed_11); -OPENSSL_COMPILE_ASSERT(ERR_LIB_ASN1 == 12, library_values_changed_12); -OPENSSL_COMPILE_ASSERT(ERR_LIB_CONF == 13, library_values_changed_13); -OPENSSL_COMPILE_ASSERT(ERR_LIB_CRYPTO == 14, library_values_changed_14); -OPENSSL_COMPILE_ASSERT(ERR_LIB_EC == 15, library_values_changed_15); -OPENSSL_COMPILE_ASSERT(ERR_LIB_SSL == 16, library_values_changed_16); 
-OPENSSL_COMPILE_ASSERT(ERR_LIB_BIO == 17, library_values_changed_17); -OPENSSL_COMPILE_ASSERT(ERR_LIB_PKCS7 == 18, library_values_changed_18); -OPENSSL_COMPILE_ASSERT(ERR_LIB_PKCS8 == 19, library_values_changed_19); -OPENSSL_COMPILE_ASSERT(ERR_LIB_X509V3 == 20, library_values_changed_20); -OPENSSL_COMPILE_ASSERT(ERR_LIB_RAND == 21, library_values_changed_21); -OPENSSL_COMPILE_ASSERT(ERR_LIB_ENGINE == 22, library_values_changed_22); -OPENSSL_COMPILE_ASSERT(ERR_LIB_OCSP == 23, library_values_changed_23); -OPENSSL_COMPILE_ASSERT(ERR_LIB_UI == 24, library_values_changed_24); -OPENSSL_COMPILE_ASSERT(ERR_LIB_COMP == 25, library_values_changed_25); -OPENSSL_COMPILE_ASSERT(ERR_LIB_ECDSA == 26, library_values_changed_26); -OPENSSL_COMPILE_ASSERT(ERR_LIB_ECDH == 27, library_values_changed_27); -OPENSSL_COMPILE_ASSERT(ERR_LIB_HMAC == 28, library_values_changed_28); -OPENSSL_COMPILE_ASSERT(ERR_LIB_DIGEST == 29, library_values_changed_29); -OPENSSL_COMPILE_ASSERT(ERR_LIB_CIPHER == 30, library_values_changed_30); -OPENSSL_COMPILE_ASSERT(ERR_LIB_HKDF == 31, library_values_changed_31); -OPENSSL_COMPILE_ASSERT(ERR_LIB_USER == 32, library_values_changed_32); -OPENSSL_COMPILE_ASSERT(ERR_NUM_LIBS == 33, library_values_changed_num); - -// clang-format off const uint32_t kOpenSSLReasonValues[] = { - 0xc320838, - 0xc328852, - 0xc330861, - 0xc338871, - 0xc340880, - 0xc348899, - 0xc3508a5, - 0xc3588c2, - 0xc3608e2, - 0xc3688f0, - 0xc370900, - 0xc37890d, - 0xc38091d, - 0xc388928, - 0xc39093e, - 0xc39894d, - 0xc3a0961, - 0xc3a8845, - 0xc3b00ea, - 0xc3b88d4, - 0x10320845, - 0x10329535, - 0x10331541, - 0x1033955a, - 0x1034156d, - 0x10348efc, - 0x10350c5e, - 0x10359580, - 0x10361595, - 0x103695a8, - 0x103715c7, - 0x103795e0, - 0x103815f5, - 0x10389613, - 0x10391622, - 0x1039963e, - 0x103a1659, - 0x103a9668, - 0x103b1684, - 0x103b969f, - 0x103c16b6, - 0x103c80ea, - 0x103d16c7, - 0x103d96db, - 0x103e16fa, - 0x103e9709, - 0x103f1720, - 0x103f9733, - 0x10400c22, - 0x10409746, - 0x10411764, - 0x10419777, - 0x10421791, - 0x104297a1, - 0x104317b5, - 0x104397cb, - 0x104417e3, - 0x104497f8, - 0x1045180c, - 0x1045981e, - 0x104605fb, - 0x1046894d, - 0x10471833, - 0x1047984a, - 0x1048185f, - 0x1048986d, - 0x10490e5e, - 0x14320c05, - 0x14328c13, - 0x14330c22, - 0x14338c34, - 0x143400ac, - 0x143480ea, - 0x18320083, - 0x18328f52, - 0x183300ac, - 0x18338f68, - 0x18340f7c, - 0x183480ea, - 0x18350f91, - 0x18358fa9, - 0x18360fbe, - 0x18368fd2, - 0x18370ff6, - 0x1837900c, - 0x18381020, - 0x18389030, - 0x18390a73, - 0x18399040, - 0x183a1068, - 0x183a908e, - 0x183b0c6a, - 0x183b90c3, - 0x183c10d5, - 0x183c90e0, - 0x183d10f0, - 0x183d9101, - 0x183e1112, - 0x183e9124, - 0x183f114d, - 0x183f9166, - 0x1840117e, - 0x184086d3, - 0x184110b1, - 0x1841907c, - 0x1842109b, - 0x18429055, - 0x203211b8, - 0x203291a5, - 0x243211c4, - 0x24328993, - 0x243311d6, - 0x243391e3, - 0x243411f0, - 0x24349202, - 0x24351211, - 0x2435922e, - 0x2436123b, - 0x24369249, - 0x24371257, - 0x24379265, - 0x2438126e, - 0x2438927b, - 0x2439128e, - 0x28320c52, - 0x28328c6a, - 0x28330c22, - 0x28338c7d, - 0x28340c5e, - 0x283480ac, - 0x283500ea, - 0x2c322c6c, - 0x2c3292a5, - 0x2c332c7a, - 0x2c33ac8c, - 0x2c342ca0, - 0x2c34acb2, - 0x2c352ccd, - 0x2c35acdf, - 0x2c362cf2, - 0x2c36832d, - 0x2c372cff, - 0x2c37ad11, - 0x2c382d36, - 0x2c38ad4d, - 0x2c392d5b, - 0x2c39ad6b, - 0x2c3a2d7d, - 0x2c3aad91, - 0x2c3b2da2, - 0x2c3badc1, - 0x2c3c12b7, - 0x2c3c92cd, - 0x2c3d2dd5, - 0x2c3d92e6, - 0x2c3e2df2, - 0x2c3eae00, - 0x2c3f2e18, - 0x2c3fae30, - 0x2c402e3d, - 0x2c4091b8, - 0x2c412e4e, - 
0x2c41ae61, - 0x2c42117e, - 0x2c42ae72, - 0x2c430720, - 0x2c43adb3, - 0x2c442d24, - 0x30320000, - 0x30328015, - 0x3033001f, - 0x30338038, - 0x3034004a, - 0x30348064, - 0x3035006b, - 0x30358083, - 0x30360094, - 0x303680ac, - 0x303700b9, - 0x303780c8, - 0x303800ea, - 0x303880f7, - 0x3039010a, - 0x30398125, - 0x303a013a, - 0x303a814e, - 0x303b0162, - 0x303b8173, - 0x303c018c, - 0x303c81a9, - 0x303d01b7, - 0x303d81cb, - 0x303e01db, - 0x303e81f4, - 0x303f0204, - 0x303f8217, - 0x30400226, - 0x30408232, - 0x30410247, - 0x30418257, - 0x3042026e, - 0x3042827b, - 0x3043028e, - 0x3043829d, - 0x304402b2, - 0x304482d3, - 0x304502e6, - 0x304582f9, - 0x30460312, - 0x3046832d, - 0x3047034a, - 0x30478363, - 0x30480371, - 0x30488382, - 0x30490391, - 0x304983a9, - 0x304a03bb, - 0x304a83cf, - 0x304b03ee, - 0x304b8401, - 0x304c040c, - 0x304c841d, - 0x304d0429, - 0x304d843f, - 0x304e044d, - 0x304e8463, - 0x304f0475, - 0x304f8487, - 0x3050049a, - 0x305084ad, - 0x305104be, - 0x305184ce, - 0x305204e6, - 0x305284fb, - 0x30530513, - 0x30538527, - 0x3054053f, - 0x30548558, - 0x30550571, - 0x3055858e, - 0x30560599, - 0x305685b1, - 0x305705c1, - 0x305785d2, - 0x305805e5, - 0x305885fb, - 0x30590604, - 0x30598619, - 0x305a062c, - 0x305a863b, - 0x305b065b, - 0x305b866a, - 0x305c068b, - 0x305c86a7, - 0x305d06b3, - 0x305d86d3, - 0x305e06ef, - 0x305e8700, - 0x305f0716, - 0x305f8720, - 0x34320b63, - 0x34328b77, - 0x34330b94, - 0x34338ba7, - 0x34340bb6, - 0x34348bef, - 0x34350bd3, - 0x3c320083, - 0x3c328ca7, - 0x3c330cc0, - 0x3c338cdb, - 0x3c340cf8, - 0x3c348d22, - 0x3c350d3d, - 0x3c358d63, - 0x3c360d7c, - 0x3c368d94, - 0x3c370da5, - 0x3c378db3, - 0x3c380dc0, - 0x3c388dd4, - 0x3c390c6a, - 0x3c398df7, - 0x3c3a0e0b, - 0x3c3a890d, - 0x3c3b0e1b, - 0x3c3b8e36, - 0x3c3c0e48, - 0x3c3c8e7b, - 0x3c3d0e85, - 0x3c3d8e99, - 0x3c3e0ea7, - 0x3c3e8ecc, - 0x3c3f0c93, - 0x3c3f8eb5, - 0x3c4000ac, - 0x3c4080ea, - 0x3c410d13, - 0x3c418d52, - 0x3c420e5e, - 0x3c428de8, - 0x403218c6, - 0x403298dc, - 0x4033190a, - 0x40339914, - 0x4034192b, - 0x40349949, - 0x40351959, - 0x4035996b, - 0x40361978, - 0x40369984, - 0x40371999, - 0x403799ab, - 0x403819b6, - 0x403899c8, - 0x40390efc, - 0x403999d8, - 0x403a19eb, - 0x403a9a0c, - 0x403b1a1d, - 0x403b9a2d, - 0x403c0064, - 0x403c8083, - 0x403d1ab1, - 0x403d9ac7, - 0x403e1ad6, - 0x403e9b0e, - 0x403f1b28, - 0x403f9b36, - 0x40401b4b, - 0x40409b5f, - 0x40411b7c, - 0x40419b97, - 0x40421bb0, - 0x40429bc3, - 0x40431bd7, - 0x40439bef, - 0x40441c06, - 0x404480ac, - 0x40451c1b, - 0x40459c2d, - 0x40461c51, - 0x40469c71, - 0x40471c7f, - 0x40479ca6, - 0x40481ce3, - 0x40489d16, - 0x40491d2d, - 0x40499d47, - 0x404a1d5e, - 0x404a9d7c, - 0x404b1d94, - 0x404b9dab, - 0x404c1dc1, - 0x404c9dd3, - 0x404d1df4, - 0x404d9e16, - 0x404e1e2a, - 0x404e9e37, - 0x404f1e64, - 0x404f9e8d, - 0x40501ec8, - 0x40509edc, - 0x40511ef7, - 0x40521f07, - 0x40529f2b, - 0x40531f43, - 0x40539f56, - 0x40541f6b, - 0x40549f8e, - 0x40551f9c, - 0x40559fb9, - 0x40561fc6, - 0x40569fdf, - 0x40571ff7, - 0x4057a00a, - 0x4058201f, - 0x4058a046, - 0x40592075, - 0x4059a0a2, - 0x405a20b6, - 0x405aa0c6, - 0x405b20de, - 0x405ba0ef, - 0x405c2102, - 0x405ca141, - 0x405d214e, - 0x405da165, - 0x405e21a3, - 0x405e8ab1, - 0x405f21c4, - 0x405fa1d1, - 0x406021df, - 0x4060a201, - 0x40612245, - 0x4061a27d, - 0x40622294, - 0x4062a2a5, - 0x406322b6, - 0x4063a2cb, - 0x406422e2, - 0x4064a30e, - 0x40652329, - 0x4065a340, - 0x40662358, - 0x4066a382, - 0x406723ad, - 0x4067a3ce, - 0x406823f5, - 0x4068a416, - 0x40692448, - 0x4069a476, - 0x406a2497, - 0x406aa4b7, - 0x406b263f, - 0x406ba662, 
- 0x406c2678, - 0x406ca8f3, - 0x406d2922, - 0x406da94a, - 0x406e2978, - 0x406ea9c5, - 0x406f29e4, - 0x406faa1c, - 0x40702a2f, - 0x4070aa4c, - 0x40710800, - 0x4071aa5e, - 0x40722a71, - 0x4072aa8a, - 0x40732aa2, - 0x407394a4, - 0x40742ab6, - 0x4074aad0, - 0x40752ae1, - 0x4075aaf5, - 0x40762b03, - 0x4076927b, - 0x40772b28, - 0x4077ab4a, - 0x40782b65, - 0x4078ab9e, - 0x40792bb5, - 0x4079abcb, - 0x407a2bd7, - 0x407aabea, - 0x407b2bff, - 0x407bac11, - 0x407c2c42, - 0x407cac4b, - 0x407d2431, - 0x407d9e9d, - 0x407e2b7a, - 0x407ea056, - 0x407f1c93, - 0x407f9a53, - 0x40801e74, - 0x40809cbb, - 0x40811f19, - 0x40819e4e, - 0x40822963, - 0x40829a39, - 0x40832031, - 0x4083a2f3, - 0x40841ccf, - 0x4084a08e, - 0x40852113, - 0x4085a229, - 0x40862185, - 0x40869eb7, - 0x408729a9, - 0x4087a25a, - 0x40881a9a, - 0x4088a3e1, - 0x40891ae9, - 0x40899a76, - 0x408a2698, - 0x408a9884, - 0x408b2c26, - 0x408ba9f9, - 0x408c2123, - 0x408c98a0, - 0x408d1cfc, - 0x41f4256a, - 0x41f925fc, - 0x41fe24ef, - 0x41fea6e4, - 0x41ff27d5, - 0x42032583, - 0x420825a5, - 0x4208a5e1, - 0x420924d3, - 0x4209a61b, - 0x420a252a, - 0x420aa50a, - 0x420b254a, - 0x420ba5c3, - 0x420c27f1, - 0x420ca6b1, - 0x420d26cb, - 0x420da702, - 0x4212271c, - 0x421727b8, - 0x4217a75e, - 0x421c2780, - 0x421f273b, - 0x42212808, - 0x4226279b, - 0x422b28d7, - 0x422ba885, - 0x422c28bf, - 0x422ca844, - 0x422d2823, - 0x422da8a4, - 0x422e286a, - 0x422ea990, - 0x4432072b, - 0x4432873a, - 0x44330746, - 0x44338754, - 0x44340767, - 0x44348778, - 0x4435077f, - 0x44358789, - 0x4436079c, - 0x443687b2, - 0x443707c4, - 0x443787d1, - 0x443807e0, - 0x443887e8, - 0x44390800, - 0x4439880e, - 0x443a0821, - 0x483212a5, - 0x483292b7, - 0x483312cd, - 0x483392e6, - 0x4c32130b, - 0x4c32931b, - 0x4c33132e, - 0x4c33934e, - 0x4c3400ac, - 0x4c3480ea, - 0x4c35135a, - 0x4c359368, - 0x4c361384, - 0x4c369397, - 0x4c3713a6, - 0x4c3793b4, - 0x4c3813c9, - 0x4c3893d5, - 0x4c3913f5, - 0x4c39941f, - 0x4c3a1438, - 0x4c3a9451, - 0x4c3b05fb, - 0x4c3b946a, - 0x4c3c147c, - 0x4c3c948b, - 0x4c3d14a4, - 0x4c3d8c45, - 0x4c3e14fd, - 0x4c3e94b3, - 0x4c3f151f, - 0x4c3f927b, - 0x4c4014c9, - 0x4c4092f7, - 0x4c4114ed, - 0x50322e84, - 0x5032ae93, - 0x50332e9e, - 0x5033aeae, - 0x50342ec7, - 0x5034aee1, - 0x50352eef, - 0x5035af05, - 0x50362f17, - 0x5036af2d, - 0x50372f46, - 0x5037af59, - 0x50382f71, - 0x5038af82, - 0x50392f97, - 0x5039afab, - 0x503a2fcb, - 0x503aafe1, - 0x503b2ff9, - 0x503bb00b, - 0x503c3027, - 0x503cb03e, - 0x503d3057, - 0x503db06d, - 0x503e307a, - 0x503eb090, - 0x503f30a2, - 0x503f8382, - 0x504030b5, - 0x5040b0c5, - 0x504130df, - 0x5041b0ee, - 0x50423108, - 0x5042b125, - 0x50433135, - 0x5043b145, - 0x50443154, - 0x5044843f, - 0x50453168, - 0x5045b186, - 0x50463199, - 0x5046b1af, - 0x504731c1, - 0x5047b1d6, - 0x504831fc, - 0x5048b20a, - 0x5049321d, - 0x5049b232, - 0x504a3248, - 0x504ab258, - 0x504b3278, - 0x504bb28b, - 0x504c32ae, - 0x504cb2dc, - 0x504d32ee, - 0x504db30b, - 0x504e3326, - 0x504eb342, - 0x504f3354, - 0x504fb36b, - 0x5050337a, - 0x505086ef, - 0x5051338d, - 0x58320f3a, - 0x68320efc, - 0x68328c6a, - 0x68330c7d, - 0x68338f0a, - 0x68340f1a, - 0x683480ea, - 0x6c320ed8, - 0x6c328c34, - 0x6c330ee3, - 0x74320a19, - 0x743280ac, - 0x74330c45, - 0x7832097e, - 0x78328993, - 0x7833099f, - 0x78338083, - 0x783409ae, - 0x783489c3, - 0x783509e2, - 0x78358a04, - 0x78360a19, - 0x78368a2f, - 0x78370a3f, - 0x78378a60, - 0x78380a73, - 0x78388a85, - 0x78390a92, - 0x78398ab1, - 0x783a0ac6, - 0x783a8ad4, - 0x783b0ade, - 0x783b8af2, - 0x783c0b09, - 0x783c8b1e, - 0x783d0b35, - 0x783d8b4a, - 0x783e0aa0, - 
0x783e8a52, - 0x7c321194, + 0xc32083a, 0xc328854, 0xc330863, 0xc338873, 0xc340882, 0xc34889b, + 0xc3508a7, 0xc3588c4, 0xc3608e4, 0xc3688f2, 0xc370902, 0xc37890f, + 0xc38091f, 0xc38892a, 0xc390940, 0xc39894f, 0xc3a0963, 0xc3a8847, + 0xc3b00ea, 0xc3b88d6, 0x10320847, 0x1032959f, 0x103315ab, 0x103395c4, + 0x103415d7, 0x10348f27, 0x10350c60, 0x103595ea, 0x10361614, 0x10369627, + 0x10371646, 0x1037965f, 0x10381674, 0x10389692, 0x103916a1, 0x103996bd, + 0x103a16d8, 0x103a96e7, 0x103b1703, 0x103b971e, 0x103c1744, 0x103c80ea, + 0x103d1755, 0x103d9769, 0x103e1788, 0x103e9797, 0x103f17ae, 0x103f97c1, + 0x10400c24, 0x104097d4, 0x104117f2, 0x10419805, 0x1042181f, 0x1042982f, + 0x10431843, 0x10439859, 0x10441871, 0x10449886, 0x1045189a, 0x104598ac, + 0x104605fd, 0x1046894f, 0x104718c1, 0x104798d8, 0x104818ed, 0x104898fb, + 0x10490e73, 0x10499735, 0x104a15ff, 0x14320c07, 0x14328c15, 0x14330c24, + 0x14338c36, 0x143400ac, 0x143480ea, 0x18320083, 0x18328f7d, 0x183300ac, + 0x18338f93, 0x18340fa7, 0x183480ea, 0x18350fbc, 0x18358fd4, 0x18360fe9, + 0x18368ffd, 0x18371021, 0x18379037, 0x1838104b, 0x1838905b, 0x18390a75, + 0x1839906b, 0x183a1091, 0x183a90b7, 0x183b0c7f, 0x183b9106, 0x183c1118, + 0x183c9123, 0x183d1133, 0x183d9144, 0x183e1155, 0x183e9167, 0x183f1190, + 0x183f91a9, 0x184011c1, 0x184086d5, 0x184110da, 0x184190a5, 0x184210c4, + 0x18428c6c, 0x18431080, 0x184390ec, 0x203211fb, 0x203291e8, 0x24321207, + 0x24328995, 0x24331219, 0x24339226, 0x24341233, 0x24349245, 0x24351254, + 0x24359271, 0x2436127e, 0x2436928c, 0x2437129a, 0x243792a8, 0x243812b1, + 0x243892be, 0x243912d1, 0x28320c54, 0x28328c7f, 0x28330c24, 0x28338c92, + 0x28340c60, 0x283480ac, 0x283500ea, 0x28358c6c, 0x2c322f0c, 0x2c3292e8, + 0x2c332f1a, 0x2c33af2c, 0x2c342f40, 0x2c34af52, 0x2c352f6d, 0x2c35af7f, + 0x2c362f92, 0x2c36832d, 0x2c372f9f, 0x2c37afb1, 0x2c382fd6, 0x2c38afed, + 0x2c392ffb, 0x2c39b00b, 0x2c3a301d, 0x2c3ab031, 0x2c3b3042, 0x2c3bb061, + 0x2c3c12fa, 0x2c3c9310, 0x2c3d3075, 0x2c3d9329, 0x2c3e3092, 0x2c3eb0a0, + 0x2c3f30b8, 0x2c3fb0d0, 0x2c4030fa, 0x2c4091fb, 0x2c41310b, 0x2c41b11e, + 0x2c4211c1, 0x2c42b12f, 0x2c430722, 0x2c43b053, 0x2c442fc4, 0x2c44b0dd, + 0x30320000, 0x30328015, 0x3033001f, 0x30338038, 0x3034004a, 0x30348064, + 0x3035006b, 0x30358083, 0x30360094, 0x303680ac, 0x303700b9, 0x303780c8, + 0x303800ea, 0x303880f7, 0x3039010a, 0x30398125, 0x303a013a, 0x303a814e, + 0x303b0162, 0x303b8173, 0x303c018c, 0x303c81a9, 0x303d01b7, 0x303d81cb, + 0x303e01db, 0x303e81f4, 0x303f0204, 0x303f8217, 0x30400226, 0x30408232, + 0x30410247, 0x30418257, 0x3042026e, 0x3042827b, 0x3043028e, 0x3043829d, + 0x304402b2, 0x304482d3, 0x304502e6, 0x304582f9, 0x30460312, 0x3046832d, + 0x3047034a, 0x3047835c, 0x3048036a, 0x3048837b, 0x3049038a, 0x304983a2, + 0x304a03b4, 0x304a83c8, 0x304b03e0, 0x304b83f3, 0x304c03fe, 0x304c840f, + 0x304d041b, 0x304d8431, 0x304e043f, 0x304e8455, 0x304f0467, 0x304f8479, + 0x3050049c, 0x305084af, 0x305104c0, 0x305184d0, 0x305204e8, 0x305284fd, + 0x30530515, 0x30538529, 0x30540541, 0x3054855a, 0x30550573, 0x30558590, + 0x3056059b, 0x305685b3, 0x305705c3, 0x305785d4, 0x305805e7, 0x305885fd, + 0x30590606, 0x3059861b, 0x305a062e, 0x305a863d, 0x305b065d, 0x305b866c, + 0x305c068d, 0x305c86a9, 0x305d06b5, 0x305d86d5, 0x305e06f1, 0x305e8702, + 0x305f0718, 0x305f8722, 0x3060048c, 0x34320b65, 0x34328b79, 0x34330b96, + 0x34338ba9, 0x34340bb8, 0x34348bf1, 0x34350bd5, 0x3c320083, 0x3c328cbc, + 0x3c330cd5, 0x3c338cf0, 0x3c340d0d, 0x3c348d37, 0x3c350d52, 0x3c358d78, + 0x3c360d91, 0x3c368da9, 0x3c370dba, 0x3c378dc8, 0x3c380dd5, 
0x3c388de9, + 0x3c390c7f, 0x3c398e0c, 0x3c3a0e20, 0x3c3a890f, 0x3c3b0e30, 0x3c3b8e4b, + 0x3c3c0e5d, 0x3c3c8e90, 0x3c3d0e9a, 0x3c3d8eae, 0x3c3e0ebc, 0x3c3e8ee1, + 0x3c3f0ca8, 0x3c3f8eca, 0x3c4000ac, 0x3c4080ea, 0x3c410d28, 0x3c418d67, + 0x3c420e73, 0x3c428dfd, 0x40321971, 0x40329987, 0x403319b5, 0x403399bf, + 0x403419d6, 0x403499f4, 0x40351a04, 0x40359a16, 0x40361a23, 0x40369a2f, + 0x40371a44, 0x40379a56, 0x40381a61, 0x40389a73, 0x40390f27, 0x40399a83, + 0x403a1a96, 0x403a9ab7, 0x403b1ac8, 0x403b9ad8, 0x403c0064, 0x403c8083, + 0x403d1b5c, 0x403d9b72, 0x403e1b81, 0x403e9bb9, 0x403f1bd3, 0x403f9bfb, + 0x40401c10, 0x40409c24, 0x40411c41, 0x40419c5c, 0x40421c75, 0x40429c88, + 0x40431c9c, 0x40439cb4, 0x40441ccb, 0x404480ac, 0x40451ce0, 0x40459cf2, + 0x40461d16, 0x40469d36, 0x40471d44, 0x40479d6b, 0x40481ddc, 0x40489e0f, + 0x40491e26, 0x40499e40, 0x404a1e57, 0x404a9e75, 0x404b1e8d, 0x404b9ea4, + 0x404c1eba, 0x404c9ecc, 0x404d1eed, 0x404d9f26, 0x404e1f3a, 0x404e9f47, + 0x404f1f8e, 0x404f9fd4, 0x4050202b, 0x4050a03f, 0x40512072, 0x40522082, + 0x4052a0a6, 0x405320be, 0x4053a0d1, 0x405420e6, 0x4054a109, 0x40552117, + 0x4055a154, 0x40562161, 0x4056a17a, 0x40572192, 0x4057a1a5, 0x405821ba, + 0x4058a1e1, 0x40592210, 0x4059a23d, 0x405a2251, 0x405aa261, 0x405b2279, + 0x405ba28a, 0x405c229d, 0x405ca2dc, 0x405d22e9, 0x405da30e, 0x405e234c, + 0x405e8ab3, 0x405f236d, 0x405fa37a, 0x40602388, 0x4060a3aa, 0x4061240b, + 0x4061a443, 0x4062245a, 0x4062a46b, 0x40632490, 0x4063a4a5, 0x406424bc, + 0x4064a4e8, 0x40652503, 0x4065a51a, 0x40662532, 0x4066a55c, 0x40672587, + 0x4067a5cc, 0x40682614, 0x4068a635, 0x40692667, 0x4069a695, 0x406a26b6, + 0x406aa6d6, 0x406b285e, 0x406ba881, 0x406c2897, 0x406cab3a, 0x406d2b69, + 0x406dab91, 0x406e2bbf, 0x406eac0c, 0x406f2c47, 0x406fac7f, 0x40702c92, + 0x4070acaf, 0x40710802, 0x4071acc1, 0x40722cd4, 0x4072ad0a, 0x40732d22, + 0x407394fa, 0x40742d36, 0x4074ad50, 0x40752d61, 0x4075ad75, 0x40762d83, + 0x407692be, 0x40772da8, 0x4077adca, 0x40782de5, 0x4078ae1e, 0x40792e35, + 0x4079ae4b, 0x407a2e77, 0x407aae8a, 0x407b2e9f, 0x407baeb1, 0x407c2ee2, + 0x407caeeb, 0x407d2650, 0x407d9fe4, 0x407e2dfa, 0x407ea1f1, 0x407f1d58, + 0x407f9afe, 0x40801f9e, 0x40809d80, 0x40812094, 0x40819f78, 0x40822baa, + 0x40829ae4, 0x408321cc, 0x4083a4cd, 0x40841d94, 0x4084a229, 0x408522ae, + 0x4085a3d2, 0x4086232e, 0x40869ffe, 0x40872bf0, 0x4087a420, 0x40881b45, + 0x4088a5df, 0x40891b94, 0x40899b21, 0x408a28cf, 0x408a9912, 0x408b2ec6, + 0x408bac5c, 0x408c22be, 0x408c992e, 0x408d1df5, 0x408d9dc6, 0x408e1f0f, + 0x408ea134, 0x408f25f3, 0x408fa3ee, 0x409025a8, 0x4090a300, 0x409128b7, + 0x40919954, 0x40921be1, 0x4092ac2b, 0x40932ced, 0x4093a00f, 0x40941da8, + 0x4094a8e8, 0x4095247c, 0x4095ae57, 0x40962bd7, 0x40969fb7, 0x4097205a, + 0x40979f5e, 0x41f42789, 0x41f9281b, 0x41fe270e, 0x41fea92b, 0x41ff2a1c, + 0x420327a2, 0x420827c4, 0x4208a800, 0x420926f2, 0x4209a83a, 0x420a2749, + 0x420aa729, 0x420b2769, 0x420ba7e2, 0x420c2a38, 0x420ca8f8, 0x420d2912, + 0x420da949, 0x42122963, 0x421729ff, 0x4217a9a5, 0x421c29c7, 0x421f2982, + 0x42212a4f, 0x422629e2, 0x422b2b1e, 0x422baacc, 0x422c2b06, 0x422caa8b, + 0x422d2a6a, 0x422daaeb, 0x422e2ab1, 0x4432072d, 0x4432873c, 0x44330748, + 0x44338756, 0x44340769, 0x4434877a, 0x44350781, 0x4435878b, 0x4436079e, + 0x443687b4, 0x443707c6, 0x443787d3, 0x443807e2, 0x443887ea, 0x44390802, + 0x44398810, 0x443a0823, 0x483212e8, 0x483292fa, 0x48331310, 0x48339329, + 0x4c32134e, 0x4c32935e, 0x4c331371, 0x4c339391, 0x4c3400ac, 0x4c3480ea, + 0x4c35139d, 0x4c3593ab, 0x4c3613c7, 0x4c3693ed, 0x4c3713fc, 
0x4c37940a, + 0x4c38141f, 0x4c38942b, 0x4c39144b, 0x4c399475, 0x4c3a148e, 0x4c3a94a7, + 0x4c3b05fd, 0x4c3b94c0, 0x4c3c14d2, 0x4c3c94e1, 0x4c3d14fa, 0x4c3d8c47, + 0x4c3e1567, 0x4c3e9509, 0x4c3f1589, 0x4c3f92be, 0x4c40151f, 0x4c40933a, + 0x4c411557, 0x4c4193da, 0x4c421543, 0x50323141, 0x5032b150, 0x5033315b, + 0x5033b16b, 0x50343184, 0x5034b19e, 0x503531ac, 0x5035b1c2, 0x503631d4, + 0x5036b1ea, 0x50373203, 0x5037b216, 0x5038322e, 0x5038b23f, 0x50393254, + 0x5039b268, 0x503a3288, 0x503ab29e, 0x503b32b6, 0x503bb2c8, 0x503c32e4, + 0x503cb2fb, 0x503d3314, 0x503db32a, 0x503e3337, 0x503eb34d, 0x503f335f, + 0x503f837b, 0x50403372, 0x5040b382, 0x5041339c, 0x5041b3ab, 0x504233c5, + 0x5042b3e2, 0x504333f2, 0x5043b402, 0x50443411, 0x50448431, 0x50453425, + 0x5045b443, 0x50463456, 0x5046b46c, 0x5047347e, 0x5047b493, 0x504834b9, + 0x5048b4c7, 0x504934da, 0x5049b4ef, 0x504a3505, 0x504ab515, 0x504b3535, + 0x504bb548, 0x504c356b, 0x504cb599, 0x504d35ab, 0x504db5c8, 0x504e35e3, + 0x504eb5ff, 0x504f3611, 0x504fb628, 0x50503637, 0x505086f1, 0x5051364a, + 0x58320f65, 0x68320f27, 0x68328c7f, 0x68330c92, 0x68338f35, 0x68340f45, + 0x683480ea, 0x6c320eed, 0x6c328c36, 0x6c330ef8, 0x6c338f11, 0x74320a1b, + 0x743280ac, 0x74330c47, 0x78320980, 0x78328995, 0x783309a1, 0x78338083, + 0x783409b0, 0x783489c5, 0x783509e4, 0x78358a06, 0x78360a1b, 0x78368a31, + 0x78370a41, 0x78378a62, 0x78380a75, 0x78388a87, 0x78390a94, 0x78398ab3, + 0x783a0ac8, 0x783a8ad6, 0x783b0ae0, 0x783b8af4, 0x783c0b0b, 0x783c8b20, + 0x783d0b37, 0x783d8b4c, 0x783e0aa2, 0x783e8a54, 0x7c3211d7, }; -// clang-format on const size_t kOpenSSLReasonValuesLen = sizeof(kOpenSSLReasonValues) / sizeof(kOpenSSLReasonValues[0]); @@ -774,14 +217,14 @@ const char kOpenSSLReasonStringData[] = "INTEGER_NOT_ASCII_FORMAT\0" "INTEGER_TOO_LARGE_FOR_LONG\0" "INVALID_BIT_STRING_BITS_LEFT\0" - "INVALID_BMPSTRING_LENGTH\0" + "INVALID_BMPSTRING\0" "INVALID_DIGIT\0" "INVALID_MODIFIER\0" "INVALID_NUMBER\0" "INVALID_OBJECT_ENCODING\0" "INVALID_SEPARATOR\0" "INVALID_TIME_FORMAT\0" - "INVALID_UNIVERSALSTRING_LENGTH\0" + "INVALID_UNIVERSALSTRING\0" "INVALID_UTF8STRING\0" "LIST_ERROR\0" "MISSING_ASN1_EOS\0" @@ -792,6 +235,7 @@ const char kOpenSSLReasonStringData[] = "MSTRING_WRONG_TAG\0" "NESTED_ASN1_ERROR\0" "NESTED_ASN1_STRING\0" + "NESTED_TOO_DEEP\0" "NON_HEX_CHARACTERS\0" "NOT_ASCII_FORMAT\0" "NOT_ENOUGH_DATA\0" @@ -899,6 +343,7 @@ const char kOpenSSLReasonStringData[] = "UNKNOWN_HASH\0" "BAD_Q_VALUE\0" "BAD_VERSION\0" + "INVALID_PARAMETERS\0" "MISSING_PARAMETERS\0" "NEED_NEW_SETUP_VALUES\0" "BIGNUM_OUT_OF_RANGE\0" @@ -932,6 +377,7 @@ const char kOpenSSLReasonStringData[] = "WRONG_ORDER\0" "KDF_FAILED\0" "POINT_ARITHMETIC_FAILURE\0" + "UNKNOWN_DIGEST_LENGTH\0" "BAD_SIGNATURE\0" "NOT_IMPLEMENTED\0" "RANDOM_NUMBER_GENERATION_FAILED\0" @@ -948,12 +394,13 @@ const char kOpenSSLReasonStringData[] = "INVALID_KEYBITS\0" "INVALID_MGF1_MD\0" "INVALID_PADDING_MODE\0" - "INVALID_PARAMETERS\0" + "INVALID_PEER_KEY\0" "INVALID_PSS_SALTLEN\0" "INVALID_SIGNATURE\0" "KEYS_NOT_SET\0" "MEMORY_LIMIT_EXCEEDED\0" "NOT_A_PRIVATE_KEY\0" + "NOT_XOF_OR_INVALID_LENGTH\0" "NO_DEFAULT_DIGEST\0" "NO_KEY_SET\0" "NO_MDC2_SUPPORT\0" @@ -993,6 +440,7 @@ const char kOpenSSLReasonStringData[] = "ENCRYPT_ERROR\0" "ERROR_SETTING_CIPHER_PARAMS\0" "INCORRECT_PASSWORD\0" + "INVALID_CHARACTERS\0" "KEYGEN_FAILURE\0" "KEY_GEN_ERROR\0" "METHOD_NOT_SUPPORTED\0" @@ -1008,6 +456,7 @@ const char kOpenSSLReasonStringData[] = "UNKNOWN_DIGEST\0" "UNSUPPORTED_KEYLENGTH\0" "UNSUPPORTED_KEY_DERIVATION_FUNCTION\0" + 
"UNSUPPORTED_OPTIONS\0" "UNSUPPORTED_PRF\0" "UNSUPPORTED_PRIVATE_KEY_ALGORITHM\0" "UNSUPPORTED_SALT_TYPE\0" @@ -1016,6 +465,7 @@ const char kOpenSSLReasonStringData[] = "BAD_PAD_BYTE_COUNT\0" "BAD_RSA_PARAMETERS\0" "BLOCK_TYPE_IS_NOT_01\0" + "BLOCK_TYPE_IS_NOT_02\0" "BN_NOT_INITIALIZED\0" "CANNOT_RECOVER_MULTI_PRIME_KEY\0" "CRT_PARAMS_ALREADY_GIVEN\0" @@ -1028,6 +478,7 @@ const char kOpenSSLReasonStringData[] = "DATA_TOO_SMALL_FOR_KEY_SIZE\0" "DIGEST_TOO_BIG_FOR_RSA_KEY\0" "D_E_NOT_CONGRUENT_TO_1\0" + "D_OUT_OF_RANGE\0" "EMPTY_PUBLIC_KEY\0" "FIRST_OCTET_INVALID\0" "INCONSISTENT_SET_OF_CRT_VALUES\0" @@ -1052,6 +503,7 @@ const char kOpenSSLReasonStringData[] = "WRONG_SIGNATURE_LENGTH\0" "ALPN_MISMATCH_ON_EARLY_DATA\0" "APPLICATION_DATA_INSTEAD_OF_HANDSHAKE\0" + "APPLICATION_DATA_ON_SHUTDOWN\0" "APP_DATA_IN_HANDSHAKE\0" "ATTEMPT_TO_REUSE_SESSION_IN_DIFFERENT_CONTEXT\0" "BAD_ALERT\0" @@ -1081,6 +533,7 @@ const char kOpenSSLReasonStringData[] = "CERTIFICATE_AND_PRIVATE_KEY_MISMATCH\0" "CERTIFICATE_VERIFY_FAILED\0" "CERT_CB_ERROR\0" + "CERT_DECOMPRESSION_FAILED\0" "CERT_LENGTH_MISMATCH\0" "CHANNEL_ID_NOT_P256\0" "CHANNEL_ID_SIGNATURE_INVALID\0" @@ -1100,6 +553,8 @@ const char kOpenSSLReasonStringData[] = "DTLS_MESSAGE_TOO_BIG\0" "DUPLICATE_EXTENSION\0" "DUPLICATE_KEY_SHARE\0" + "DUPLICATE_SIGNATURE_ALGORITHM\0" + "EARLY_DATA_NOT_IN_USE\0" "ECC_CERT_NOT_FOR_SIGNING\0" "EMPTY_HELLO_RETRY_REQUEST\0" "EMS_STATE_INCONSISTENT\0" @@ -1112,17 +567,22 @@ const char kOpenSSLReasonStringData[] = "FRAGMENT_MISMATCH\0" "GOT_NEXT_PROTO_WITHOUT_EXTENSION\0" "HANDSHAKE_FAILURE_ON_CLIENT_HELLO\0" + "HANDSHAKE_NOT_COMPLETE\0" "HTTPS_PROXY_REQUEST\0" "HTTP_REQUEST\0" "INAPPROPRIATE_FALLBACK\0" + "INCONSISTENT_CLIENT_HELLO\0" "INVALID_ALPN_PROTOCOL\0" "INVALID_COMMAND\0" "INVALID_COMPRESSION_LIST\0" + "INVALID_DELEGATED_CREDENTIAL\0" "INVALID_MESSAGE\0" "INVALID_OUTER_RECORD_TYPE\0" "INVALID_SCT_LIST\0" + "INVALID_SIGNATURE_ALGORITHM\0" "INVALID_SSL_SESSION\0" "INVALID_TICKET_KEYS_LENGTH\0" + "KEY_USAGE_BIT_INCORRECT\0" "LENGTH_MISMATCH\0" "MISSING_EXTENSION\0" "MISSING_KEY_SHARE\0" @@ -1132,6 +592,7 @@ const char kOpenSSLReasonStringData[] = "MIXED_SPECIAL_OPERATOR_WITH_GROUPS\0" "MTU_TOO_SMALL\0" "NEGOTIATED_BOTH_NPN_AND_ALPN\0" + "NEGOTIATED_TB_WITHOUT_EMS_OR_RI\0" "NESTED_GROUP\0" "NO_CERTIFICATES_RETURNED\0" "NO_CERTIFICATE_ASSIGNED\0" @@ -1153,6 +614,7 @@ const char kOpenSSLReasonStringData[] = "NO_SUPPORTED_VERSIONS_ENABLED\0" "NULL_SSL_CTX\0" "NULL_SSL_METHOD_PASSED\0" + "OCSP_CB_ERROR\0" "OLD_SESSION_CIPHER_NOT_RETURNED\0" "OLD_SESSION_PRF_HASH_MISMATCH\0" "OLD_SESSION_VERSION_NOT_RETURNED\0" @@ -1161,11 +623,13 @@ const char kOpenSSLReasonStringData[] = "PEER_DID_NOT_RETURN_A_CERTIFICATE\0" "PEER_ERROR_UNSUPPORTED_CERTIFICATE_TYPE\0" "PRE_SHARED_KEY_MUST_BE_LAST\0" + "PRIVATE_KEY_OPERATION_FAILED\0" "PROTOCOL_IS_SHUTDOWN\0" "PSK_IDENTITY_BINDER_COUNT_MISMATCH\0" "PSK_IDENTITY_NOT_FOUND\0" "PSK_NO_CLIENT_CB\0" "PSK_NO_SERVER_CB\0" + "QUIC_INTERNAL_ERROR\0" "READ_TIMEOUT_EXPIRED\0" "RECORD_LENGTH_MISMATCH\0" "RECORD_TOO_LARGE\0" @@ -1176,8 +640,10 @@ const char kOpenSSLReasonStringData[] = "RESUMED_EMS_SESSION_WITHOUT_EMS_EXTENSION\0" "RESUMED_NON_EMS_SESSION_WITH_EMS_EXTENSION\0" "SCSV_RECEIVED_WHEN_RENEGOTIATING\0" + "SECOND_SERVERHELLO_VERSION_MISMATCH\0" "SERVERHELLO_TLSEXT\0" "SERVER_CERT_CHANGED\0" + "SERVER_ECHOED_INVALID_SESSION_ID\0" "SESSION_ID_CONTEXT_UNINITIALIZED\0" "SESSION_MAY_NOT_BE_CREATED\0" "SHUTDOWN_WHILE_IN_INIT\0" @@ -1200,7 +666,9 @@ const char kOpenSSLReasonStringData[] = 
"SSL_CTX_HAS_NO_DEFAULT_SSL_VERSION\0" "SSL_HANDSHAKE_FAILURE\0" "SSL_SESSION_ID_CONTEXT_TOO_LONG\0" + "SSL_SESSION_ID_TOO_LONG\0" "TICKET_ENCRYPTION_FAILED\0" + "TLS13_DOWNGRADE\0" "TLSV1_ALERT_ACCESS_DENIED\0" "TLSV1_ALERT_DECODE_ERROR\0" "TLSV1_ALERT_DECRYPTION_FAILED\0" @@ -1229,6 +697,7 @@ const char kOpenSSLReasonStringData[] = "TOO_MUCH_READ_EARLY_DATA\0" "TOO_MUCH_SKIPPED_EARLY_DATA\0" "UNABLE_TO_FIND_ECDH_PARAMETERS\0" + "UNCOMPRESSED_CERT_TOO_LARGE\0" "UNEXPECTED_EXTENSION\0" "UNEXPECTED_EXTENSION_ON_EARLY_DATA\0" "UNEXPECTED_MESSAGE\0" @@ -1236,6 +705,7 @@ const char kOpenSSLReasonStringData[] = "UNEXPECTED_RECORD\0" "UNKNOWN_ALERT_TYPE\0" "UNKNOWN_CERTIFICATE_TYPE\0" + "UNKNOWN_CERT_COMPRESSION_ALG\0" "UNKNOWN_CIPHER_RETURNED\0" "UNKNOWN_CIPHER_TYPE\0" "UNKNOWN_KEY_EXCHANGE_TYPE\0" @@ -1250,6 +720,7 @@ const char kOpenSSLReasonStringData[] = "WRONG_CERTIFICATE_TYPE\0" "WRONG_CIPHER_RETURNED\0" "WRONG_CURVE\0" + "WRONG_ENCRYPTION_LEVEL_RECEIVED\0" "WRONG_MESSAGE_TYPE\0" "WRONG_SIGNATURE_TYPE\0" "WRONG_SSL_VERSION\0" @@ -1282,6 +753,7 @@ const char kOpenSSLReasonStringData[] = "PUBLIC_KEY_DECODE_ERROR\0" "PUBLIC_KEY_ENCODE_ERROR\0" "SHOULD_RETRY\0" + "SIGNATURE_ALGORITHM_MISMATCH\0" "UNKNOWN_KEY_TYPE\0" "UNKNOWN_PURPOSE_ID\0" "UNKNOWN_TRUST_ID\0" diff --git a/packager/third_party/boringssl/ios-aarch64/crypto/chacha/chacha-armv8.S b/packager/third_party/boringssl/ios-aarch64/crypto/chacha/chacha-armv8.S new file mode 100644 index 0000000000..b14466ddd7 --- /dev/null +++ b/packager/third_party/boringssl/ios-aarch64/crypto/chacha/chacha-armv8.S @@ -0,0 +1,1982 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include + + + +.section __TEXT,__const + +.align 5 +Lsigma: +.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral +Lone: +.long 1,0,0,0 +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 + +.text + +.globl _ChaCha20_ctr32 +.private_extern _ChaCha20_ctr32 + +.align 5 +_ChaCha20_ctr32: + cbz x2,Labort +#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10 + adrp x5,:pg_hi21_nc:_OPENSSL_armcap_P +#else + adrp x5,_OPENSSL_armcap_P@PAGE +#endif + cmp x2,#192 + b.lo Lshort + ldr w17,[x5,_OPENSSL_armcap_P@PAGEOFF] + tst w17,#ARMV7_NEON + b.ne ChaCha20_neon + +Lshort: + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + + adrp x5,Lsigma@PAGE + add x5,x5,Lsigma@PAGEOFF + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#64 + + ldp x22,x23,[x5] // load sigma + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ldp x28,x30,[x4] // load counter +#ifdef __ARMEB__ + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + +Loop_outer: + mov w5,w22 // unpack key block + lsr x6,x22,#32 + mov w7,w23 + lsr x8,x23,#32 + mov w9,w24 + lsr x10,x24,#32 + mov w11,w25 + lsr x12,x25,#32 + mov w13,w26 + lsr x14,x26,#32 + mov w15,w27 + lsr x16,x27,#32 + mov w17,w28 + lsr x19,x28,#32 + mov w20,w30 + lsr x21,x30,#32 + + mov x4,#10 + subs x2,x2,#64 +Loop: + sub x4,x4,#1 + add w5,w5,w9 + add w6,w6,w10 + add w7,w7,w11 + add w8,w8,w12 + eor w17,w17,w5 + eor w19,w19,w6 + eor w20,w20,w7 + eor w21,w21,w8 + ror w17,w17,#16 + ror w19,w19,#16 + ror w20,w20,#16 + ror w21,w21,#16 + add w13,w13,w17 + add w14,w14,w19 + add w15,w15,w20 + add w16,w16,w21 + eor w9,w9,w13 + eor w10,w10,w14 + eor w11,w11,w15 + eor w12,w12,w16 + ror w9,w9,#20 + ror w10,w10,#20 + ror w11,w11,#20 + ror w12,w12,#20 + add w5,w5,w9 + add w6,w6,w10 + add w7,w7,w11 + add w8,w8,w12 + eor w17,w17,w5 + eor w19,w19,w6 + eor w20,w20,w7 + eor w21,w21,w8 + ror w17,w17,#24 + ror w19,w19,#24 + ror w20,w20,#24 + ror w21,w21,#24 + add w13,w13,w17 + add w14,w14,w19 + add w15,w15,w20 + add w16,w16,w21 + eor w9,w9,w13 + eor w10,w10,w14 + eor w11,w11,w15 + eor w12,w12,w16 + ror w9,w9,#25 + ror w10,w10,#25 + ror w11,w11,#25 + ror w12,w12,#25 + add w5,w5,w10 + add w6,w6,w11 + add w7,w7,w12 + add w8,w8,w9 + eor w21,w21,w5 + eor w17,w17,w6 + eor w19,w19,w7 + eor w20,w20,w8 + ror w21,w21,#16 + ror w17,w17,#16 + ror w19,w19,#16 + ror w20,w20,#16 + add w15,w15,w21 + add w16,w16,w17 + add w13,w13,w19 + add w14,w14,w20 + eor w10,w10,w15 + eor w11,w11,w16 + eor w12,w12,w13 + eor w9,w9,w14 + ror w10,w10,#20 + ror w11,w11,#20 + ror w12,w12,#20 + ror w9,w9,#20 + add w5,w5,w10 + add w6,w6,w11 + add w7,w7,w12 + add w8,w8,w9 + eor w21,w21,w5 + eor w17,w17,w6 + eor w19,w19,w7 + eor w20,w20,w8 + ror w21,w21,#24 + ror w17,w17,#24 + ror w19,w19,#24 + ror w20,w20,#24 + add w15,w15,w21 + add w16,w16,w17 + add w13,w13,w19 + add w14,w14,w20 + eor w10,w10,w15 + eor w11,w11,w16 + eor w12,w12,w13 + eor w9,w9,w14 + ror w10,w10,#25 + ror w11,w11,#25 + ror w12,w12,#25 + ror w9,w9,#25 + cbnz x4,Loop + + add w5,w5,w22 // accumulate key block + add x6,x6,x22,lsr#32 + add w7,w7,w23 + add x8,x8,x23,lsr#32 + add w9,w9,w24 + add x10,x10,x24,lsr#32 + add w11,w11,w25 + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add x21,x21,x30,lsr#32 + + b.lo Ltail + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __ARMEB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#1 // increment counter + stp x9,x11,[x0,#16] + stp 
x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + + b.hi Loop_outer + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 +Labort: + ret + +.align 4 +Ltail: + add x2,x2,#64 +Less_than_64: + sub x0,x0,#1 + add x1,x1,x2 + add x0,x0,x2 + add x4,sp,x2 + neg x2,x2 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 +#ifdef __ARMEB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + stp x5,x7,[sp,#0] + stp x9,x11,[sp,#16] + stp x13,x15,[sp,#32] + stp x17,x20,[sp,#48] + +Loop_tail: + ldrb w10,[x1,x2] + ldrb w11,[x4,x2] + add x2,x2,#1 + eor w10,w10,w11 + strb w10,[x0,x2] + cbnz x2,Loop_tail + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + ret + + + +.align 5 +ChaCha20_neon: + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adrp x5,Lsigma@PAGE + add x5,x5,Lsigma@PAGEOFF + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + cmp x2,#512 + b.hs L512_or_more_neon + + sub sp,sp,#64 + + ldp x22,x23,[x5] // load sigma + ld1 {v24.4s},[x5],#16 + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ld1 {v25.4s,v26.4s},[x3] + ldp x28,x30,[x4] // load counter + ld1 {v27.4s},[x4] + ld1 {v31.4s},[x5] +#ifdef __ARMEB__ + rev64 v24.4s,v24.4s + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + add v27.4s,v27.4s,v31.4s // += 1 + add v28.4s,v27.4s,v31.4s + add v29.4s,v28.4s,v31.4s + shl v31.4s,v31.4s,#2 // 1 -> 4 + +Loop_outer_neon: + mov w5,w22 // unpack key block + lsr x6,x22,#32 + mov v0.16b,v24.16b + mov w7,w23 + lsr x8,x23,#32 + mov v4.16b,v24.16b + mov w9,w24 + lsr x10,x24,#32 + mov v16.16b,v24.16b + mov w11,w25 + mov v1.16b,v25.16b + lsr x12,x25,#32 + mov v5.16b,v25.16b + mov w13,w26 + mov v17.16b,v25.16b + lsr x14,x26,#32 + mov v3.16b,v27.16b + mov w15,w27 + mov v7.16b,v28.16b + lsr x16,x27,#32 + mov v19.16b,v29.16b + mov w17,w28 + mov v2.16b,v26.16b + lsr x19,x28,#32 + mov v6.16b,v26.16b + mov w20,w30 + mov v18.16b,v26.16b + lsr x21,x30,#32 + + mov x4,#10 + subs x2,x2,#256 +Loop_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v16.4s,v16.4s,v17.4s + add w7,w7,w11 + eor v3.16b,v3.16b,v0.16b + add w8,w8,w12 + eor v7.16b,v7.16b,v4.16b + eor w17,w17,w5 + eor v19.16b,v19.16b,v16.16b + eor w19,w19,w6 + rev32 v3.8h,v3.8h + eor w20,w20,w7 + rev32 v7.8h,v7.8h + eor w21,w21,w8 + rev32 v19.8h,v19.8h + ror w17,w17,#16 + add v2.4s,v2.4s,v3.4s + ror w19,w19,#16 + add v6.4s,v6.4s,v7.4s + ror w20,w20,#16 + add v18.4s,v18.4s,v19.4s + ror w21,w21,#16 + eor v20.16b,v1.16b,v2.16b + add w13,w13,w17 + eor v21.16b,v5.16b,v6.16b + add w14,w14,w19 + eor v22.16b,v17.16b,v18.16b + add w15,w15,w20 + ushr v1.4s,v20.4s,#20 + add w16,w16,w21 + ushr v5.4s,v21.4s,#20 + eor w9,w9,w13 + ushr v17.4s,v22.4s,#20 + eor w10,w10,w14 + sli v1.4s,v20.4s,#12 + eor w11,w11,w15 + sli v5.4s,v21.4s,#12 + eor w12,w12,w16 + sli v17.4s,v22.4s,#12 + ror w9,w9,#20 + add v0.4s,v0.4s,v1.4s + ror w10,w10,#20 + add v4.4s,v4.4s,v5.4s + ror w11,w11,#20 + add 
v16.4s,v16.4s,v17.4s + ror w12,w12,#20 + eor v20.16b,v3.16b,v0.16b + add w5,w5,w9 + eor v21.16b,v7.16b,v4.16b + add w6,w6,w10 + eor v22.16b,v19.16b,v16.16b + add w7,w7,w11 + ushr v3.4s,v20.4s,#24 + add w8,w8,w12 + ushr v7.4s,v21.4s,#24 + eor w17,w17,w5 + ushr v19.4s,v22.4s,#24 + eor w19,w19,w6 + sli v3.4s,v20.4s,#8 + eor w20,w20,w7 + sli v7.4s,v21.4s,#8 + eor w21,w21,w8 + sli v19.4s,v22.4s,#8 + ror w17,w17,#24 + add v2.4s,v2.4s,v3.4s + ror w19,w19,#24 + add v6.4s,v6.4s,v7.4s + ror w20,w20,#24 + add v18.4s,v18.4s,v19.4s + ror w21,w21,#24 + eor v20.16b,v1.16b,v2.16b + add w13,w13,w17 + eor v21.16b,v5.16b,v6.16b + add w14,w14,w19 + eor v22.16b,v17.16b,v18.16b + add w15,w15,w20 + ushr v1.4s,v20.4s,#25 + add w16,w16,w21 + ushr v5.4s,v21.4s,#25 + eor w9,w9,w13 + ushr v17.4s,v22.4s,#25 + eor w10,w10,w14 + sli v1.4s,v20.4s,#7 + eor w11,w11,w15 + sli v5.4s,v21.4s,#7 + eor w12,w12,w16 + sli v17.4s,v22.4s,#7 + ror w9,w9,#25 + ext v2.16b,v2.16b,v2.16b,#8 + ror w10,w10,#25 + ext v6.16b,v6.16b,v6.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w10 + add v4.4s,v4.4s,v5.4s + add w6,w6,w11 + add v16.4s,v16.4s,v17.4s + add w7,w7,w12 + eor v3.16b,v3.16b,v0.16b + add w8,w8,w9 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w5 + eor v19.16b,v19.16b,v16.16b + eor w17,w17,w6 + rev32 v3.8h,v3.8h + eor w19,w19,w7 + rev32 v7.8h,v7.8h + eor w20,w20,w8 + rev32 v19.8h,v19.8h + ror w21,w21,#16 + add v2.4s,v2.4s,v3.4s + ror w17,w17,#16 + add v6.4s,v6.4s,v7.4s + ror w19,w19,#16 + add v18.4s,v18.4s,v19.4s + ror w20,w20,#16 + eor v20.16b,v1.16b,v2.16b + add w15,w15,w21 + eor v21.16b,v5.16b,v6.16b + add w16,w16,w17 + eor v22.16b,v17.16b,v18.16b + add w13,w13,w19 + ushr v1.4s,v20.4s,#20 + add w14,w14,w20 + ushr v5.4s,v21.4s,#20 + eor w10,w10,w15 + ushr v17.4s,v22.4s,#20 + eor w11,w11,w16 + sli v1.4s,v20.4s,#12 + eor w12,w12,w13 + sli v5.4s,v21.4s,#12 + eor w9,w9,w14 + sli v17.4s,v22.4s,#12 + ror w10,w10,#20 + add v0.4s,v0.4s,v1.4s + ror w11,w11,#20 + add v4.4s,v4.4s,v5.4s + ror w12,w12,#20 + add v16.4s,v16.4s,v17.4s + ror w9,w9,#20 + eor v20.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v21.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v22.16b,v19.16b,v16.16b + add w7,w7,w12 + ushr v3.4s,v20.4s,#24 + add w8,w8,w9 + ushr v7.4s,v21.4s,#24 + eor w21,w21,w5 + ushr v19.4s,v22.4s,#24 + eor w17,w17,w6 + sli v3.4s,v20.4s,#8 + eor w19,w19,w7 + sli v7.4s,v21.4s,#8 + eor w20,w20,w8 + sli v19.4s,v22.4s,#8 + ror w21,w21,#24 + add v2.4s,v2.4s,v3.4s + ror w17,w17,#24 + add v6.4s,v6.4s,v7.4s + ror w19,w19,#24 + add v18.4s,v18.4s,v19.4s + ror w20,w20,#24 + eor v20.16b,v1.16b,v2.16b + add w15,w15,w21 + eor v21.16b,v5.16b,v6.16b + add w16,w16,w17 + eor v22.16b,v17.16b,v18.16b + add w13,w13,w19 + ushr v1.4s,v20.4s,#25 + add w14,w14,w20 + ushr v5.4s,v21.4s,#25 + eor w10,w10,w15 + ushr v17.4s,v22.4s,#25 + eor w11,w11,w16 + sli v1.4s,v20.4s,#7 + eor w12,w12,w13 + sli v5.4s,v21.4s,#7 + eor w9,w9,w14 + sli v17.4s,v22.4s,#7 + ror w10,w10,#25 + ext v2.16b,v2.16b,v2.16b,#8 + ror w11,w11,#25 + ext v6.16b,v6.16b,v6.16b,#8 + ror w12,w12,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + cbnz x4,Loop_neon + + add w5,w5,w22 // accumulate 
key block + add v0.4s,v0.4s,v24.4s + add x6,x6,x22,lsr#32 + add v4.4s,v4.4s,v24.4s + add w7,w7,w23 + add v16.4s,v16.4s,v24.4s + add x8,x8,x23,lsr#32 + add v2.4s,v2.4s,v26.4s + add w9,w9,w24 + add v6.4s,v6.4s,v26.4s + add x10,x10,x24,lsr#32 + add v18.4s,v18.4s,v26.4s + add w11,w11,w25 + add v3.4s,v3.4s,v27.4s + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add v7.4s,v7.4s,v28.4s + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add v19.4s,v19.4s,v29.4s + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add v1.4s,v1.4s,v25.4s + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add v5.4s,v5.4s,v25.4s + add x21,x21,x30,lsr#32 + add v17.4s,v17.4s,v25.4s + + b.lo Ltail_neon + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __ARMEB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor v0.16b,v0.16b,v20.16b + eor x15,x15,x16 + eor v1.16b,v1.16b,v21.16b + eor x17,x17,x19 + eor v2.16b,v2.16b,v22.16b + eor x20,x20,x21 + eor v3.16b,v3.16b,v23.16b + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#4 // increment counter + stp x9,x11,[x0,#16] + add v27.4s,v27.4s,v31.4s // += 4 + stp x13,x15,[x0,#32] + add v28.4s,v28.4s,v31.4s + stp x17,x20,[x0,#48] + add v29.4s,v29.4s,v31.4s + add x0,x0,#64 + + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + + eor v4.16b,v4.16b,v20.16b + eor v5.16b,v5.16b,v21.16b + eor v6.16b,v6.16b,v22.16b + eor v7.16b,v7.16b,v23.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + + eor v16.16b,v16.16b,v0.16b + eor v17.16b,v17.16b,v1.16b + eor v18.16b,v18.16b,v2.16b + eor v19.16b,v19.16b,v3.16b + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + + b.hi Loop_outer_neon + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + ret + +Ltail_neon: + add x2,x2,#256 + cmp x2,#64 + b.lo Less_than_64 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __ARMEB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#4 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + b.eq Ldone_neon + sub x2,x2,#64 + cmp x2,#64 + b.lo Less_than_128 + + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v0.16b,v0.16b,v20.16b + eor v1.16b,v1.16b,v21.16b + eor v2.16b,v2.16b,v22.16b + eor v3.16b,v3.16b,v23.16b + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + b.eq Ldone_neon + sub x2,x2,#64 + cmp x2,#64 + b.lo Less_than_192 + + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v4.16b,v4.16b,v20.16b + 
eor v5.16b,v5.16b,v21.16b + eor v6.16b,v6.16b,v22.16b + eor v7.16b,v7.16b,v23.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + b.eq Ldone_neon + sub x2,x2,#64 + + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] + b Last_neon + +Less_than_128: + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] + b Last_neon +Less_than_192: + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] + b Last_neon + +.align 4 +Last_neon: + sub x0,x0,#1 + add x1,x1,x2 + add x0,x0,x2 + add x4,sp,x2 + neg x2,x2 + +Loop_tail_neon: + ldrb w10,[x1,x2] + ldrb w11,[x4,x2] + add x2,x2,#1 + eor w10,w10,w11 + strb w10,[x0,x2] + cbnz x2,Loop_tail_neon + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + +Ldone_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + ret + + +.align 5 +ChaCha20_512_neon: + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adrp x5,Lsigma@PAGE + add x5,x5,Lsigma@PAGEOFF + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + +L512_or_more_neon: + sub sp,sp,#128+64 + + ldp x22,x23,[x5] // load sigma + ld1 {v24.4s},[x5],#16 + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ld1 {v25.4s,v26.4s},[x3] + ldp x28,x30,[x4] // load counter + ld1 {v27.4s},[x4] + ld1 {v31.4s},[x5] +#ifdef __ARMEB__ + rev64 v24.4s,v24.4s + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + add v27.4s,v27.4s,v31.4s // += 1 + stp q24,q25,[sp,#0] // off-load key block, invariant part + add v27.4s,v27.4s,v31.4s // not typo + str q26,[sp,#32] + add v28.4s,v27.4s,v31.4s + add v29.4s,v28.4s,v31.4s + add v30.4s,v29.4s,v31.4s + shl v31.4s,v31.4s,#2 // 1 -> 4 + + stp d8,d9,[sp,#128+0] // meet ABI requirements + stp d10,d11,[sp,#128+16] + stp d12,d13,[sp,#128+32] + stp d14,d15,[sp,#128+48] + + sub x2,x2,#512 // not typo + +Loop_outer_512_neon: + mov v0.16b,v24.16b + mov v4.16b,v24.16b + mov v8.16b,v24.16b + mov v12.16b,v24.16b + mov v16.16b,v24.16b + mov v20.16b,v24.16b + mov v1.16b,v25.16b + mov w5,w22 // unpack key block + mov v5.16b,v25.16b + lsr x6,x22,#32 + mov v9.16b,v25.16b + mov w7,w23 + mov v13.16b,v25.16b + lsr x8,x23,#32 + mov v17.16b,v25.16b + mov w9,w24 + mov v21.16b,v25.16b + lsr x10,x24,#32 + mov v3.16b,v27.16b + mov w11,w25 + mov v7.16b,v28.16b + lsr x12,x25,#32 + mov v11.16b,v29.16b + mov w13,w26 + mov v15.16b,v30.16b + lsr x14,x26,#32 + mov v2.16b,v26.16b + mov w15,w27 + mov v6.16b,v26.16b + lsr x16,x27,#32 + add v19.4s,v3.4s,v31.4s // +4 + mov w17,w28 + add v23.4s,v7.4s,v31.4s // +4 + lsr x19,x28,#32 + mov v10.16b,v26.16b + mov w20,w30 + mov v14.16b,v26.16b + lsr x21,x30,#32 + mov v18.16b,v26.16b + stp q27,q28,[sp,#48] // off-load key block, variable part + mov v22.16b,v26.16b + str q29,[sp,#80] + + mov x4,#5 + subs x2,x2,#512 +Loop_upper_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 
v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext 
v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v11.16b,v11.16b,v11.16b,#12 + ext v15.16b,v15.16b,v15.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v23.16b,v23.16b,v23.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v9.16b,v9.16b,v9.16b,#4 + ext v13.16b,v13.16b,v13.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + ext v21.16b,v21.16b,v21.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 
+ eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v11.16b,v11.16b,v11.16b,#4 + ext v15.16b,v15.16b,v15.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v23.16b,v23.16b,v23.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v9.16b,v9.16b,v9.16b,#12 + ext v13.16b,v13.16b,v13.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + ext v21.16b,v21.16b,v21.16b,#12 + cbnz x4,Loop_upper_neon + + add w5,w5,w22 // accumulate key block + add x6,x6,x22,lsr#32 + add w7,w7,w23 + add x8,x8,x23,lsr#32 + add w9,w9,w24 + add x10,x10,x24,lsr#32 + add w11,w11,w25 + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add x21,x21,x30,lsr#32 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __ARMEB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#1 // increment counter + mov w5,w22 // unpack key block + lsr x6,x22,#32 + stp x9,x11,[x0,#16] + mov w7,w23 + lsr x8,x23,#32 + stp x13,x15,[x0,#32] + mov w9,w24 + lsr x10,x24,#32 + stp x17,x20,[x0,#48] + add x0,x0,#64 + mov w11,w25 + lsr x12,x25,#32 + mov w13,w26 + lsr x14,x26,#32 + mov w15,w27 + lsr x16,x27,#32 + mov w17,w28 + lsr x19,x28,#32 + mov w20,w30 + lsr x21,x30,#32 + + mov x4,#5 +Loop_lower_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 
v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v11.16b,v11.16b,v11.16b,#12 + ext v15.16b,v15.16b,v15.16b,#12 + ext 
v19.16b,v19.16b,v19.16b,#12 + ext v23.16b,v23.16b,v23.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v9.16b,v9.16b,v9.16b,#4 + ext v13.16b,v13.16b,v13.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + ext v21.16b,v21.16b,v21.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor 
w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v11.16b,v11.16b,v11.16b,#4 + ext v15.16b,v15.16b,v15.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v23.16b,v23.16b,v23.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v9.16b,v9.16b,v9.16b,#12 + ext v13.16b,v13.16b,v13.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + ext v21.16b,v21.16b,v21.16b,#12 + cbnz x4,Loop_lower_neon + + add w5,w5,w22 // accumulate key block + ldp q24,q25,[sp,#0] + add x6,x6,x22,lsr#32 + ldp q26,q27,[sp,#32] + add w7,w7,w23 + ldp q28,q29,[sp,#64] + add x8,x8,x23,lsr#32 + add v0.4s,v0.4s,v24.4s + add w9,w9,w24 + add v4.4s,v4.4s,v24.4s + add x10,x10,x24,lsr#32 + add v8.4s,v8.4s,v24.4s + add w11,w11,w25 + add v12.4s,v12.4s,v24.4s + add x12,x12,x25,lsr#32 + add v16.4s,v16.4s,v24.4s + add w13,w13,w26 + add v20.4s,v20.4s,v24.4s + add x14,x14,x26,lsr#32 + add v2.4s,v2.4s,v26.4s + add w15,w15,w27 + add v6.4s,v6.4s,v26.4s + add x16,x16,x27,lsr#32 + add v10.4s,v10.4s,v26.4s + add w17,w17,w28 + add v14.4s,v14.4s,v26.4s + add x19,x19,x28,lsr#32 + add v18.4s,v18.4s,v26.4s + add w20,w20,w30 + add v22.4s,v22.4s,v26.4s + add x21,x21,x30,lsr#32 + add v19.4s,v19.4s,v31.4s // +4 + add x5,x5,x6,lsl#32 // pack + add v23.4s,v23.4s,v31.4s // +4 + add x7,x7,x8,lsl#32 + add v3.4s,v3.4s,v27.4s + ldp x6,x8,[x1,#0] // load input + add v7.4s,v7.4s,v28.4s + add x9,x9,x10,lsl#32 + add v11.4s,v11.4s,v29.4s + add x11,x11,x12,lsl#32 + add v15.4s,v15.4s,v30.4s + ldp x10,x12,[x1,#16] + add v19.4s,v19.4s,v27.4s + add x13,x13,x14,lsl#32 + add v23.4s,v23.4s,v28.4s + add x15,x15,x16,lsl#32 + add v1.4s,v1.4s,v25.4s + ldp x14,x16,[x1,#32] + add v5.4s,v5.4s,v25.4s + add x17,x17,x19,lsl#32 + add v9.4s,v9.4s,v25.4s + add x20,x20,x21,lsl#32 + add v13.4s,v13.4s,v25.4s + ldp x19,x21,[x1,#48] + add v17.4s,v17.4s,v25.4s + add x1,x1,#64 + add v21.4s,v21.4s,v25.4s + +#ifdef __ARMEB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor v0.16b,v0.16b,v24.16b + eor x15,x15,x16 + eor v1.16b,v1.16b,v25.16b + eor x17,x17,x19 + eor v2.16b,v2.16b,v26.16b + eor x20,x20,x21 + eor v3.16b,v3.16b,v27.16b + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#7 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + eor v4.16b,v4.16b,v24.16b + eor v5.16b,v5.16b,v25.16b + eor v6.16b,v6.16b,v26.16b + eor v7.16b,v7.16b,v27.16b + st1 
{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + eor v8.16b,v8.16b,v0.16b + ldp q24,q25,[sp,#0] + eor v9.16b,v9.16b,v1.16b + ldp q26,q27,[sp,#32] + eor v10.16b,v10.16b,v2.16b + eor v11.16b,v11.16b,v3.16b + st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 + + ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 + eor v12.16b,v12.16b,v4.16b + eor v13.16b,v13.16b,v5.16b + eor v14.16b,v14.16b,v6.16b + eor v15.16b,v15.16b,v7.16b + st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 + + ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 + eor v16.16b,v16.16b,v8.16b + eor v17.16b,v17.16b,v9.16b + eor v18.16b,v18.16b,v10.16b + eor v19.16b,v19.16b,v11.16b + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + + shl v0.4s,v31.4s,#1 // 4 -> 8 + eor v20.16b,v20.16b,v12.16b + eor v21.16b,v21.16b,v13.16b + eor v22.16b,v22.16b,v14.16b + eor v23.16b,v23.16b,v15.16b + st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 + + add v27.4s,v27.4s,v0.4s // += 8 + add v28.4s,v28.4s,v0.4s + add v29.4s,v29.4s,v0.4s + add v30.4s,v30.4s,v0.4s + + b.hs Loop_outer_512_neon + + adds x2,x2,#512 + ushr v0.4s,v31.4s,#2 // 4 -> 1 + + ldp d8,d9,[sp,#128+0] // meet ABI requirements + ldp d10,d11,[sp,#128+16] + ldp d12,d13,[sp,#128+32] + ldp d14,d15,[sp,#128+48] + + stp q24,q31,[sp,#0] // wipe off-load area + stp q24,q31,[sp,#32] + stp q24,q31,[sp,#64] + + b.eq Ldone_512_neon + + cmp x2,#192 + sub v27.4s,v27.4s,v0.4s // -= 1 + sub v28.4s,v28.4s,v0.4s + sub v29.4s,v29.4s,v0.4s + add sp,sp,#128 + b.hs Loop_outer_neon + + eor v25.16b,v25.16b,v25.16b + eor v26.16b,v26.16b,v26.16b + eor v27.16b,v27.16b,v27.16b + eor v28.16b,v28.16b,v28.16b + eor v29.16b,v29.16b,v29.16b + eor v30.16b,v30.16b,v30.16b + b Loop_outer + +Ldone_512_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#128+64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + ret + +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/aesv8-armx64.S b/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/aesv8-armx64.S new file mode 100644 index 0000000000..dc2d6e432c --- /dev/null +++ b/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/aesv8-armx64.S @@ -0,0 +1,772 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include + +#if __ARM_MAX_ARCH__>=7 +.text + +.section __TEXT,__const +.align 5 +Lrcon: +.long 0x01,0x01,0x01,0x01 +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat +.long 0x1b,0x1b,0x1b,0x1b + +.text + +.globl _aes_hw_set_encrypt_key +.private_extern _aes_hw_set_encrypt_key + +.align 5 +_aes_hw_set_encrypt_key: +Lenc_key: + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + mov x3,#-1 + cmp x0,#0 + b.eq Lenc_key_abort + cmp x2,#0 + b.eq Lenc_key_abort + mov x3,#-2 + cmp w1,#128 + b.lt Lenc_key_abort + cmp w1,#256 + b.gt Lenc_key_abort + tst w1,#0x3f + b.ne Lenc_key_abort + + adrp x3,Lrcon@PAGE + add x3,x3,Lrcon@PAGEOFF + cmp w1,#192 + + eor v0.16b,v0.16b,v0.16b + ld1 {v3.16b},[x0],#16 + mov w1,#8 // reuse w1 + ld1 {v1.4s,v2.4s},[x3],#32 + + b.lt Loop128 + b.eq L192 + b L256 + +.align 4 +Loop128: + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + b.ne Loop128 + + ld1 {v1.4s},[x3] + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2] + add x2,x2,#0x50 + + mov w12,#10 + b Ldone + +.align 4 +L192: + ld1 {v4.8b},[x0],#8 + movi v6.16b,#8 // borrow v6.16b + st1 {v3.4s},[x2],#16 + sub v2.16b,v2.16b,v6.16b // adjust the mask + +Loop192: + tbl v6.16b,{v4.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v4.8b},[x2],#8 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + + dup v5.4s,v3.s[3] + eor v5.16b,v5.16b,v4.16b + eor v6.16b,v6.16b,v1.16b + ext v4.16b,v0.16b,v4.16b,#12 + shl v1.16b,v1.16b,#1 + eor v4.16b,v4.16b,v5.16b + eor v3.16b,v3.16b,v6.16b + eor v4.16b,v4.16b,v6.16b + st1 {v3.4s},[x2],#16 + b.ne Loop192 + + mov w12,#12 + add x2,x2,#0x20 + b Ldone + +.align 4 +L256: + ld1 {v4.16b},[x0] + mov w1,#7 + mov w12,#14 + st1 {v3.4s},[x2],#16 + +Loop256: + tbl v6.16b,{v4.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v4.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2],#16 + b.eq Ldone + + dup v6.4s,v3.s[3] // just splat + ext v5.16b,v0.16b,v4.16b,#12 + aese v6.16b,v0.16b + + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + + eor v4.16b,v4.16b,v6.16b + b Loop256 + +Ldone: + str w12,[x2] + mov x3,#0 + +Lenc_key_abort: + mov x0,x3 // return value + ldr x29,[sp],#16 + ret + + +.globl _aes_hw_set_decrypt_key +.private_extern _aes_hw_set_decrypt_key + +.align 5 +_aes_hw_set_decrypt_key: + stp x29,x30,[sp,#-16]! 
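
_aes_hw_set_decrypt_key, which starts just above, first expands the encryption schedule (the bl Lenc_key below) and then converts it in place for use with AESD/AESIMC: the round keys are reversed and every inner key is run through AESIMC (InvMixColumns), the standard "equal inverse cipher" transform. For reference, an illustrative C sketch of that conversion with hypothetical names (gf_mul/inv_mix_columns/enc_to_dec_schedule are not BoringSSL functions; a bit-loop GF(2^8) multiply stands in for the AESIMC instruction):

#include <stdint.h>
#include <string.h>

/* GF(2^8) multiply with the AES polynomial x^8+x^4+x^3+x+1. */
static uint8_t gf_mul(uint8_t x, uint8_t y) {
    uint8_t r = 0;
    for (int i = 0; i < 8; i++) {
        if (y & 1) r ^= x;
        y >>= 1;
        x = (uint8_t)((x << 1) ^ ((x >> 7) ? 0x1b : 0));
    }
    return r;
}

/* InvMixColumns on one 16-byte round key, column by column (FIPS-197). */
static void inv_mix_columns(uint8_t k[16]) {
    for (int c = 0; c < 16; c += 4) {
        uint8_t a0 = k[c], a1 = k[c + 1], a2 = k[c + 2], a3 = k[c + 3];
        k[c]     = gf_mul(a0, 14) ^ gf_mul(a1, 11) ^ gf_mul(a2, 13) ^ gf_mul(a3,  9);
        k[c + 1] = gf_mul(a0,  9) ^ gf_mul(a1, 14) ^ gf_mul(a2, 11) ^ gf_mul(a3, 13);
        k[c + 2] = gf_mul(a0, 13) ^ gf_mul(a1,  9) ^ gf_mul(a2, 14) ^ gf_mul(a3, 11);
        k[c + 3] = gf_mul(a0, 11) ^ gf_mul(a1, 13) ^ gf_mul(a2,  9) ^ gf_mul(a3, 14);
    }
}

/* Encryption schedule -> decryption schedule: reverse the round-key order
 * (the swap loop) and InvMixColumns the inner keys (what Loop_imc does
 * with aesimc).  rk has rounds+1 entries.                                 */
static void enc_to_dec_schedule(uint8_t rk[][16], int rounds) {
    for (int i = 0, j = rounds; i < j; i++, j--) {
        uint8_t tmp[16];
        memcpy(tmp, rk[i], 16);
        memcpy(rk[i], rk[j], 16);
        memcpy(rk[j], tmp, 16);
    }
    for (int i = 1; i < rounds; i++)
        inv_mix_columns(rk[i]);
}
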
+ add x29,sp,#0 + bl Lenc_key + + cmp x0,#0 + b.ne Ldec_key_abort + + sub x2,x2,#240 // restore original x2 + mov x4,#-16 + add x0,x2,x12,lsl#4 // end of key schedule + + ld1 {v0.4s},[x2] + ld1 {v1.4s},[x0] + st1 {v0.4s},[x0],x4 + st1 {v1.4s},[x2],#16 + +Loop_imc: + ld1 {v0.4s},[x2] + ld1 {v1.4s},[x0] + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + st1 {v0.4s},[x0],x4 + st1 {v1.4s},[x2],#16 + cmp x0,x2 + b.hi Loop_imc + + ld1 {v0.4s},[x2] + aesimc v0.16b,v0.16b + st1 {v0.4s},[x0] + + eor x0,x0,x0 // return value +Ldec_key_abort: + ldp x29,x30,[sp],#16 + ret + +.globl _aes_hw_encrypt +.private_extern _aes_hw_encrypt + +.align 5 +_aes_hw_encrypt: + ldr w3,[x2,#240] + ld1 {v0.4s},[x2],#16 + ld1 {v2.16b},[x0] + sub w3,w3,#2 + ld1 {v1.4s},[x2],#16 + +Loop_enc: + aese v2.16b,v0.16b + aesmc v2.16b,v2.16b + ld1 {v0.4s},[x2],#16 + subs w3,w3,#2 + aese v2.16b,v1.16b + aesmc v2.16b,v2.16b + ld1 {v1.4s},[x2],#16 + b.gt Loop_enc + + aese v2.16b,v0.16b + aesmc v2.16b,v2.16b + ld1 {v0.4s},[x2] + aese v2.16b,v1.16b + eor v2.16b,v2.16b,v0.16b + + st1 {v2.16b},[x1] + ret + +.globl _aes_hw_decrypt +.private_extern _aes_hw_decrypt + +.align 5 +_aes_hw_decrypt: + ldr w3,[x2,#240] + ld1 {v0.4s},[x2],#16 + ld1 {v2.16b},[x0] + sub w3,w3,#2 + ld1 {v1.4s},[x2],#16 + +Loop_dec: + aesd v2.16b,v0.16b + aesimc v2.16b,v2.16b + ld1 {v0.4s},[x2],#16 + subs w3,w3,#2 + aesd v2.16b,v1.16b + aesimc v2.16b,v2.16b + ld1 {v1.4s},[x2],#16 + b.gt Loop_dec + + aesd v2.16b,v0.16b + aesimc v2.16b,v2.16b + ld1 {v0.4s},[x2] + aesd v2.16b,v1.16b + eor v2.16b,v2.16b,v0.16b + + st1 {v2.16b},[x1] + ret + +.globl _aes_hw_cbc_encrypt +.private_extern _aes_hw_cbc_encrypt + +.align 5 +_aes_hw_cbc_encrypt: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + subs x2,x2,#16 + mov x8,#16 + b.lo Lcbc_abort + csel x8,xzr,x8,eq + + cmp w5,#0 // en- or decrypting? + ldr w5,[x3,#240] + and x2,x2,#-16 + ld1 {v6.16b},[x4] + ld1 {v0.16b},[x0],x8 + + ld1 {v16.4s,v17.4s},[x3] // load key schedule... 
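
_aes_hw_cbc_encrypt, whose prologue and key-schedule load appear just above, keeps the running chain value in v6 and writes it back through x4 at the end; the mode it implements is plain CBC. A generic C sketch of that dataflow (block128_f/block_encrypt are placeholders for one AES block encryption with the expanded key, not the BoringSSL interface):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

typedef void (*block128_f)(const void *key, const uint8_t in[16], uint8_t out[16]);

/* CBC encryption: C_i = E_K(P_i XOR C_{i-1}), with C_0 = IV.  The IV buffer
 * is updated with the last ciphertext block, as st1 {v6.16b},[x4] does.    */
static void cbc_encrypt_sketch(const void *key, block128_f block_encrypt,
                               const uint8_t *in, uint8_t *out, size_t len,
                               uint8_t ivec[16]) {
    uint8_t chain[16];
    memcpy(chain, ivec, 16);
    for (size_t off = 0; off + 16 <= len; off += 16) {
        for (int i = 0; i < 16; i++)
            chain[i] ^= in[off + i];          /* P_i XOR previous ciphertext */
        block_encrypt(key, chain, chain);     /* one AES block encryption    */
        memcpy(out + off, chain, 16);
    }
    memcpy(ivec, chain, 16);
}
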
+ sub w5,w5,#6 + add x7,x3,x5,lsl#4 // pointer to last 7 round keys + sub w5,w5,#2 + ld1 {v18.4s,v19.4s},[x7],#32 + ld1 {v20.4s,v21.4s},[x7],#32 + ld1 {v22.4s,v23.4s},[x7],#32 + ld1 {v7.4s},[x7] + + add x7,x3,#32 + mov w6,w5 + b.eq Lcbc_dec + + cmp w5,#2 + eor v0.16b,v0.16b,v6.16b + eor v5.16b,v16.16b,v7.16b + b.eq Lcbc_enc128 + + ld1 {v2.4s,v3.4s},[x7] + add x7,x3,#16 + add x6,x3,#16*4 + add x12,x3,#16*5 + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + add x14,x3,#16*6 + add x3,x3,#16*7 + b Lenter_cbc_enc + +.align 4 +Loop_cbc_enc: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + st1 {v6.16b},[x1],#16 +Lenter_cbc_enc: + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v0.16b,v2.16b + aesmc v0.16b,v0.16b + ld1 {v16.4s},[x6] + cmp w5,#4 + aese v0.16b,v3.16b + aesmc v0.16b,v0.16b + ld1 {v17.4s},[x12] + b.eq Lcbc_enc192 + + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + ld1 {v16.4s},[x14] + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + ld1 {v17.4s},[x3] + nop + +Lcbc_enc192: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + subs x2,x2,#16 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + csel x8,xzr,x8,eq + aese v0.16b,v18.16b + aesmc v0.16b,v0.16b + aese v0.16b,v19.16b + aesmc v0.16b,v0.16b + ld1 {v16.16b},[x0],x8 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + eor v16.16b,v16.16b,v5.16b + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + ld1 {v17.4s},[x7] // re-pre-load rndkey[1] + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + aese v0.16b,v23.16b + eor v6.16b,v0.16b,v7.16b + b.hs Loop_cbc_enc + + st1 {v6.16b},[x1],#16 + b Lcbc_done + +.align 5 +Lcbc_enc128: + ld1 {v2.4s,v3.4s},[x7] + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + b Lenter_cbc_enc128 +Loop_cbc_enc128: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + st1 {v6.16b},[x1],#16 +Lenter_cbc_enc128: + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + subs x2,x2,#16 + aese v0.16b,v2.16b + aesmc v0.16b,v0.16b + csel x8,xzr,x8,eq + aese v0.16b,v3.16b + aesmc v0.16b,v0.16b + aese v0.16b,v18.16b + aesmc v0.16b,v0.16b + aese v0.16b,v19.16b + aesmc v0.16b,v0.16b + ld1 {v16.16b},[x0],x8 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + eor v16.16b,v16.16b,v5.16b + aese v0.16b,v23.16b + eor v6.16b,v0.16b,v7.16b + b.hs Loop_cbc_enc128 + + st1 {v6.16b},[x1],#16 + b Lcbc_done +.align 5 +Lcbc_dec: + ld1 {v18.16b},[x0],#16 + subs x2,x2,#32 // bias + add w6,w5,#2 + orr v3.16b,v0.16b,v0.16b + orr v1.16b,v0.16b,v0.16b + orr v19.16b,v18.16b,v18.16b + b.lo Lcbc_dec_tail + + orr v1.16b,v18.16b,v18.16b + ld1 {v18.16b},[x0],#16 + orr v2.16b,v0.16b,v0.16b + orr v3.16b,v1.16b,v1.16b + orr v19.16b,v18.16b,v18.16b + +Loop3x_cbc_dec: + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b + aesimc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aesd v0.16b,v17.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b + aesimc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 + b.gt Loop3x_cbc_dec + + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b + aesimc v18.16b,v18.16b + eor v4.16b,v6.16b,v7.16b + subs x2,x2,#0x30 + eor v5.16b,v2.16b,v7.16b + csel x6,x2,x6,lo // x6, w6, is zero at this point + aesd v0.16b,v17.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b + aesimc v18.16b,v18.16b + eor v17.16b,v3.16b,v7.16b + add x0,x0,x6 // x0 is adjusted in such way that + // at exit from the loop v1.16b-v18.16b + // are loaded 
with last "words" + orr v6.16b,v19.16b,v19.16b + mov x7,x3 + aesd v0.16b,v20.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v20.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v20.16b + aesimc v18.16b,v18.16b + ld1 {v2.16b},[x0],#16 + aesd v0.16b,v21.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v21.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v21.16b + aesimc v18.16b,v18.16b + ld1 {v3.16b},[x0],#16 + aesd v0.16b,v22.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v22.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v22.16b + aesimc v18.16b,v18.16b + ld1 {v19.16b},[x0],#16 + aesd v0.16b,v23.16b + aesd v1.16b,v23.16b + aesd v18.16b,v23.16b + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + add w6,w5,#2 + eor v4.16b,v4.16b,v0.16b + eor v5.16b,v5.16b,v1.16b + eor v18.16b,v18.16b,v17.16b + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + st1 {v4.16b},[x1],#16 + orr v0.16b,v2.16b,v2.16b + st1 {v5.16b},[x1],#16 + orr v1.16b,v3.16b,v3.16b + st1 {v18.16b},[x1],#16 + orr v18.16b,v19.16b,v19.16b + b.hs Loop3x_cbc_dec + + cmn x2,#0x30 + b.eq Lcbc_done + nop + +Lcbc_dec_tail: + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b + aesimc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b + aesimc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 + b.gt Lcbc_dec_tail + + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b + aesimc v18.16b,v18.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b + aesimc v18.16b,v18.16b + aesd v1.16b,v20.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v20.16b + aesimc v18.16b,v18.16b + cmn x2,#0x20 + aesd v1.16b,v21.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v21.16b + aesimc v18.16b,v18.16b + eor v5.16b,v6.16b,v7.16b + aesd v1.16b,v22.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v22.16b + aesimc v18.16b,v18.16b + eor v17.16b,v3.16b,v7.16b + aesd v1.16b,v23.16b + aesd v18.16b,v23.16b + b.eq Lcbc_dec_one + eor v5.16b,v5.16b,v1.16b + eor v17.16b,v17.16b,v18.16b + orr v6.16b,v19.16b,v19.16b + st1 {v5.16b},[x1],#16 + st1 {v17.16b},[x1],#16 + b Lcbc_done + +Lcbc_dec_one: + eor v5.16b,v5.16b,v18.16b + orr v6.16b,v19.16b,v19.16b + st1 {v5.16b},[x1],#16 + +Lcbc_done: + st1 {v6.16b},[x4] +Lcbc_abort: + ldr x29,[sp],#16 + ret + +.globl _aes_hw_ctr32_encrypt_blocks +.private_extern _aes_hw_ctr32_encrypt_blocks + +.align 5 +_aes_hw_ctr32_encrypt_blocks: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + ldr w5,[x3,#240] + + ldr w8, [x4, #12] + ld1 {v0.4s},[x4] + + ld1 {v16.4s,v17.4s},[x3] // load key schedule... 
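
_aes_hw_ctr32_encrypt_blocks, which begins just above, pulls the 32-bit big-endian counter out of the last word of the IV (ldr w8,[x4,#12]), keeps three counter blocks in flight, and increments only those low 32 bits per block. A scalar C sketch of the same mode (block128_f/block_encrypt are again placeholders, not the BoringSSL API):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

typedef void (*block128_f)(const void *key, const uint8_t in[16], uint8_t out[16]);

/* CTR mode with a 32-bit big-endian counter in ivec[12..15]; only those four
 * bytes are incremented, the first 12 bytes of the IV stay fixed.           */
static void ctr32_encrypt_blocks_sketch(const void *key, block128_f block_encrypt,
                                        const uint8_t *in, uint8_t *out,
                                        size_t blocks, const uint8_t ivec[16]) {
    uint8_t ctr_block[16], keystream[16];
    memcpy(ctr_block, ivec, 16);
    uint32_t ctr = ((uint32_t)ctr_block[12] << 24) | ((uint32_t)ctr_block[13] << 16) |
                   ((uint32_t)ctr_block[14] << 8)  |  (uint32_t)ctr_block[15];
    while (blocks--) {
        block_encrypt(key, ctr_block, keystream);
        for (int i = 0; i < 16; i++)
            out[i] = in[i] ^ keystream[i];
        in += 16;
        out += 16;
        ctr++;                                    /* wraps mod 2^32 by design */
        ctr_block[12] = (uint8_t)(ctr >> 24);
        ctr_block[13] = (uint8_t)(ctr >> 16);
        ctr_block[14] = (uint8_t)(ctr >> 8);
        ctr_block[15] = (uint8_t)ctr;
    }
}
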
+ sub w5,w5,#4 + mov x12,#16 + cmp x2,#2 + add x7,x3,x5,lsl#4 // pointer to last 5 round keys + sub w5,w5,#2 + ld1 {v20.4s,v21.4s},[x7],#32 + ld1 {v22.4s,v23.4s},[x7],#32 + ld1 {v7.4s},[x7] + add x7,x3,#32 + mov w6,w5 + csel x12,xzr,x12,lo +#ifndef __ARMEB__ + rev w8, w8 +#endif + orr v1.16b,v0.16b,v0.16b + add w10, w8, #1 + orr v18.16b,v0.16b,v0.16b + add w8, w8, #2 + orr v6.16b,v0.16b,v0.16b + rev w10, w10 + mov v1.s[3],w10 + b.ls Lctr32_tail + rev w12, w8 + sub x2,x2,#3 // bias + mov v18.s[3],w12 + b Loop3x_ctr32 + +.align 4 +Loop3x_ctr32: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + aese v18.16b,v17.16b + aesmc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 + b.gt Loop3x_ctr32 + + aese v0.16b,v16.16b + aesmc v4.16b,v0.16b + aese v1.16b,v16.16b + aesmc v5.16b,v1.16b + ld1 {v2.16b},[x0],#16 + orr v0.16b,v6.16b,v6.16b + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + ld1 {v3.16b},[x0],#16 + orr v1.16b,v6.16b,v6.16b + aese v4.16b,v17.16b + aesmc v4.16b,v4.16b + aese v5.16b,v17.16b + aesmc v5.16b,v5.16b + ld1 {v19.16b},[x0],#16 + mov x7,x3 + aese v18.16b,v17.16b + aesmc v17.16b,v18.16b + orr v18.16b,v6.16b,v6.16b + add w9,w8,#1 + aese v4.16b,v20.16b + aesmc v4.16b,v4.16b + aese v5.16b,v20.16b + aesmc v5.16b,v5.16b + eor v2.16b,v2.16b,v7.16b + add w10,w8,#2 + aese v17.16b,v20.16b + aesmc v17.16b,v17.16b + eor v3.16b,v3.16b,v7.16b + add w8,w8,#3 + aese v4.16b,v21.16b + aesmc v4.16b,v4.16b + aese v5.16b,v21.16b + aesmc v5.16b,v5.16b + eor v19.16b,v19.16b,v7.16b + rev w9,w9 + aese v17.16b,v21.16b + aesmc v17.16b,v17.16b + mov v0.s[3], w9 + rev w10,w10 + aese v4.16b,v22.16b + aesmc v4.16b,v4.16b + aese v5.16b,v22.16b + aesmc v5.16b,v5.16b + mov v1.s[3], w10 + rev w12,w8 + aese v17.16b,v22.16b + aesmc v17.16b,v17.16b + mov v18.s[3], w12 + subs x2,x2,#3 + aese v4.16b,v23.16b + aese v5.16b,v23.16b + aese v17.16b,v23.16b + + eor v2.16b,v2.16b,v4.16b + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + st1 {v2.16b},[x1],#16 + eor v3.16b,v3.16b,v5.16b + mov w6,w5 + st1 {v3.16b},[x1],#16 + eor v19.16b,v19.16b,v17.16b + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + st1 {v19.16b},[x1],#16 + b.hs Loop3x_ctr32 + + adds x2,x2,#3 + b.eq Lctr32_done + cmp x2,#1 + mov x12,#16 + csel x12,xzr,x12,eq + +Lctr32_tail: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + ld1 {v17.4s},[x7],#16 + b.gt Lctr32_tail + + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + ld1 {v2.16b},[x0],x12 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + aese v1.16b,v20.16b + aesmc v1.16b,v1.16b + ld1 {v3.16b},[x0] + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + aese v1.16b,v21.16b + aesmc v1.16b,v1.16b + eor v2.16b,v2.16b,v7.16b + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + aese v1.16b,v22.16b + aesmc v1.16b,v1.16b + eor v3.16b,v3.16b,v7.16b + aese v0.16b,v23.16b + aese v1.16b,v23.16b + + cmp x2,#1 + eor v2.16b,v2.16b,v0.16b + eor v3.16b,v3.16b,v1.16b + st1 {v2.16b},[x1],#16 + b.eq Lctr32_done + st1 {v3.16b},[x1] + +Lctr32_done: + ldr x29,[sp],#16 + ret + +#endif +#endif // !OPENSSL_NO_ASM diff --git 
a/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/armv8-mont.S b/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/armv8-mont.S new file mode 100644 index 0000000000..3d83f4d8d6 --- /dev/null +++ b/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/armv8-mont.S @@ -0,0 +1,1420 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text + +.globl _bn_mul_mont +.private_extern _bn_mul_mont + +.align 5 +_bn_mul_mont: + tst x5,#7 + b.eq __bn_sqr8x_mont + tst x5,#3 + b.eq __bn_mul4x_mont +Lmul_mont: + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldr x9,[x2],#8 // bp[0] + sub x22,sp,x5,lsl#3 + ldp x7,x8,[x1],#16 // ap[0..1] + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + and x22,x22,#-16 // ABI says so + ldp x13,x14,[x3],#16 // np[0..1] + + mul x6,x7,x9 // ap[0]*bp[0] + sub x21,x5,#16 // j=num-2 + umulh x7,x7,x9 + mul x10,x8,x9 // ap[1]*bp[0] + umulh x11,x8,x9 + + mul x15,x6,x4 // "tp[0]"*n0 + mov sp,x22 // alloca + + // (*) mul x12,x13,x15 // np[0]*m1 + umulh x13,x13,x15 + mul x16,x14,x15 // np[1]*m1 + // (*) adds x12,x12,x6 // discarded + // (*) As for removal of first multiplication and addition + // instructions. The outcome of first addition is + // guaranteed to be zero, which leaves two computationally + // significant outcomes: it either carries or not. Then + // question is when does it carry? Is there alternative + // way to deduce it? If you follow operations, you can + // observe that condition for carry is quite simple: + // x6 being non-zero. So that carry can be calculated + // by adding -1 to x6. That's what next instruction does. 
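
The "(*)" comment above is the key observation of the reduction step: with m = t[0]*n0 and n0 = -n[0]^-1 mod 2^64, the low word of t[0] + m*n[0] is zero by construction, so only its carry matters, and that carry is 1 exactly when t[0] is non-zero, which is what the subs xzr,x6,#1 below recreates in the carry flag. A self-contained C check of that property (illustrative only; arbitrary odd modulus word, unsigned __int128 assumed):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* -n^{-1} mod 2^64 for odd n via Newton iteration: the 3 correct low bits of
 * the start value double each step (3 -> 6 -> 12 -> 24 -> 48 -> 96 >= 64).  */
static uint64_t neg_inverse64(uint64_t n) {
    uint64_t inv = n;                        /* n == n^{-1} (mod 8) for odd n */
    for (int i = 0; i < 5; i++)
        inv *= 2 - n * inv;
    return (uint64_t)0 - inv;
}

int main(void) {
    uint64_t n0_word = 0xfedcba9876543211u;  /* any odd modulus word          */
    uint64_t n0 = neg_inverse64(n0_word);
    for (uint64_t t0 = 0; t0 < 1000000; t0 += 977) {
        uint64_t m = t0 * n0;                /* "tp[0]"*n0                    */
        unsigned __int128 prod = (unsigned __int128)m * n0_word;
        unsigned __int128 sum  = prod + t0;
        assert((uint64_t)sum == 0);          /* the discarded low word        */
        assert((uint64_t)(sum >> 64) ==      /* carry out == (t0 != 0)        */
               (uint64_t)(prod >> 64) + (t0 != 0));
    }
    puts("carry trick verified");
    return 0;
}
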
+ subs xzr,x6,#1 // (*) + umulh x17,x14,x15 + adc x13,x13,xzr + cbz x21,L1st_skip + +L1st: + ldr x8,[x1],#8 + adds x6,x10,x7 + sub x21,x21,#8 // j-- + adc x7,x11,xzr + + ldr x14,[x3],#8 + adds x12,x16,x13 + mul x10,x8,x9 // ap[j]*bp[0] + adc x13,x17,xzr + umulh x11,x8,x9 + + adds x12,x12,x6 + mul x16,x14,x15 // np[j]*m1 + adc x13,x13,xzr + umulh x17,x14,x15 + str x12,[x22],#8 // tp[j-1] + cbnz x21,L1st + +L1st_skip: + adds x6,x10,x7 + sub x1,x1,x5 // rewind x1 + adc x7,x11,xzr + + adds x12,x16,x13 + sub x3,x3,x5 // rewind x3 + adc x13,x17,xzr + + adds x12,x12,x6 + sub x20,x5,#8 // i=num-1 + adcs x13,x13,x7 + + adc x19,xzr,xzr // upmost overflow bit + stp x12,x13,[x22] + +Louter: + ldr x9,[x2],#8 // bp[i] + ldp x7,x8,[x1],#16 + ldr x23,[sp] // tp[0] + add x22,sp,#8 + + mul x6,x7,x9 // ap[0]*bp[i] + sub x21,x5,#16 // j=num-2 + umulh x7,x7,x9 + ldp x13,x14,[x3],#16 + mul x10,x8,x9 // ap[1]*bp[i] + adds x6,x6,x23 + umulh x11,x8,x9 + adc x7,x7,xzr + + mul x15,x6,x4 + sub x20,x20,#8 // i-- + + // (*) mul x12,x13,x15 // np[0]*m1 + umulh x13,x13,x15 + mul x16,x14,x15 // np[1]*m1 + // (*) adds x12,x12,x6 + subs xzr,x6,#1 // (*) + umulh x17,x14,x15 + cbz x21,Linner_skip + +Linner: + ldr x8,[x1],#8 + adc x13,x13,xzr + ldr x23,[x22],#8 // tp[j] + adds x6,x10,x7 + sub x21,x21,#8 // j-- + adc x7,x11,xzr + + adds x12,x16,x13 + ldr x14,[x3],#8 + adc x13,x17,xzr + + mul x10,x8,x9 // ap[j]*bp[i] + adds x6,x6,x23 + umulh x11,x8,x9 + adc x7,x7,xzr + + mul x16,x14,x15 // np[j]*m1 + adds x12,x12,x6 + umulh x17,x14,x15 + str x12,[x22,#-16] // tp[j-1] + cbnz x21,Linner + +Linner_skip: + ldr x23,[x22],#8 // tp[j] + adc x13,x13,xzr + adds x6,x10,x7 + sub x1,x1,x5 // rewind x1 + adc x7,x11,xzr + + adds x12,x16,x13 + sub x3,x3,x5 // rewind x3 + adcs x13,x17,x19 + adc x19,xzr,xzr + + adds x6,x6,x23 + adc x7,x7,xzr + + adds x12,x12,x6 + adcs x13,x13,x7 + adc x19,x19,xzr // upmost overflow bit + stp x12,x13,[x22,#-16] + + cbnz x20,Louter + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + ldr x23,[sp] // tp[0] + add x22,sp,#8 + ldr x14,[x3],#8 // np[0] + subs x21,x5,#8 // j=num-1 and clear borrow + mov x1,x0 +Lsub: + sbcs x8,x23,x14 // tp[j]-np[j] + ldr x23,[x22],#8 + sub x21,x21,#8 // j-- + ldr x14,[x3],#8 + str x8,[x1],#8 // rp[j]=tp[j]-np[j] + cbnz x21,Lsub + + sbcs x8,x23,x14 + sbcs x19,x19,xzr // did it borrow? + str x8,[x1],#8 // rp[num-1] + + ldr x23,[sp] // tp[0] + add x22,sp,#8 + ldr x8,[x0],#8 // rp[0] + sub x5,x5,#8 // num-- + nop +Lcond_copy: + sub x5,x5,#8 // num-- + csel x14,x23,x8,lo // did it borrow? + ldr x23,[x22],#8 + ldr x8,[x0],#8 + str xzr,[x22,#-16] // wipe tp + str x14,[x0,#-16] + cbnz x5,Lcond_copy + + csel x14,x23,x8,lo + str xzr,[x22,#-8] // wipe tp + str x14,[x0,#-8] + + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + ret + + +.align 5 +__bn_sqr8x_mont: + cmp x1,x2 + b.ne __bn_mul4x_mont +Lsqr8x_mont: + stp x29,x30,[sp,#-128]! 
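
The scalar Lmul_mont path that ends just above is the textbook word-serial Montgomery multiplication: each outer iteration adds a[]*b[i] into the running window, then adds m*n[] with m = t[0]*n0 and shifts the window down one word; a single conditional subtraction at the end brings the result below the modulus. An illustrative C sketch of the same dataflow (mont_mul_words is a hypothetical helper, not the bn_mul_mont interface; unsigned __int128 assumed; num is capped only to keep the scratch buffer on the stack):

#include <stdint.h>
#include <string.h>

static void mont_mul_words(uint64_t *rp, const uint64_t *ap, const uint64_t *bp,
                           const uint64_t *np, uint64_t n0, size_t num) {
    uint64_t t[64 + 2] = {0};                   /* num + 2 accumulator words  */
    for (size_t i = 0; i < num; i++) {
        unsigned __int128 acc = 0;              /* t += a[] * b[i]            */
        for (size_t j = 0; j < num; j++) {
            acc += (unsigned __int128)ap[j] * bp[i] + t[j];
            t[j] = (uint64_t)acc;
            acc >>= 64;
        }
        acc += t[num];
        t[num]     = (uint64_t)acc;
        t[num + 1] = (uint64_t)(acc >> 64);

        uint64_t m = t[0] * n0;                 /* "tp[0]"*n0                 */
        acc = ((unsigned __int128)m * np[0] + t[0]) >> 64;   /* low word is 0 */
        for (size_t j = 1; j < num; j++) {      /* t = (t + m*n[]) >> 64      */
            acc += (unsigned __int128)m * np[j] + t[j];
            t[j - 1] = (uint64_t)acc;
            acc >>= 64;
        }
        acc += t[num];
        t[num - 1] = (uint64_t)acc;
        t[num]     = t[num + 1] + (uint64_t)(acc >> 64);
    }
    /* Lsub/Lcond_copy: subtract n once, keep whichever value did not borrow. */
    uint64_t diff[64], borrow = 0;
    for (size_t j = 0; j < num; j++) {
        unsigned __int128 d = (unsigned __int128)t[j] - np[j] - borrow;
        diff[j] = (uint64_t)d;
        borrow  = (uint64_t)(d >> 64) & 1;
    }
    borrow = (t[num] < borrow);                 /* borrowed past the top word */
    for (size_t j = 0; j < num; j++)
        rp[j] = borrow ? t[j] : diff[j];
}
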
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x0,x3,[sp,#96] // offload rp and np + + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + ldp x10,x11,[x1,#8*4] + ldp x12,x13,[x1,#8*6] + + sub x2,sp,x5,lsl#4 + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + mov sp,x2 // alloca + sub x27,x5,#8*8 + b Lsqr8x_zero_start + +Lsqr8x_zero: + sub x27,x27,#8*8 + stp xzr,xzr,[x2,#8*0] + stp xzr,xzr,[x2,#8*2] + stp xzr,xzr,[x2,#8*4] + stp xzr,xzr,[x2,#8*6] +Lsqr8x_zero_start: + stp xzr,xzr,[x2,#8*8] + stp xzr,xzr,[x2,#8*10] + stp xzr,xzr,[x2,#8*12] + stp xzr,xzr,[x2,#8*14] + add x2,x2,#8*16 + cbnz x27,Lsqr8x_zero + + add x3,x1,x5 + add x1,x1,#8*8 + mov x19,xzr + mov x20,xzr + mov x21,xzr + mov x22,xzr + mov x23,xzr + mov x24,xzr + mov x25,xzr + mov x26,xzr + mov x2,sp + str x4,[x29,#112] // offload n0 + + // Multiply everything but a[i]*a[i] +.align 4 +Lsqr8x_outer_loop: + // a[1]a[0] (i) + // a[2]a[0] + // a[3]a[0] + // a[4]a[0] + // a[5]a[0] + // a[6]a[0] + // a[7]a[0] + // a[2]a[1] (ii) + // a[3]a[1] + // a[4]a[1] + // a[5]a[1] + // a[6]a[1] + // a[7]a[1] + // a[3]a[2] (iii) + // a[4]a[2] + // a[5]a[2] + // a[6]a[2] + // a[7]a[2] + // a[4]a[3] (iv) + // a[5]a[3] + // a[6]a[3] + // a[7]a[3] + // a[5]a[4] (v) + // a[6]a[4] + // a[7]a[4] + // a[6]a[5] (vi) + // a[7]a[5] + // a[7]a[6] (vii) + + mul x14,x7,x6 // lo(a[1..7]*a[0]) (i) + mul x15,x8,x6 + mul x16,x9,x6 + mul x17,x10,x6 + adds x20,x20,x14 // t[1]+lo(a[1]*a[0]) + mul x14,x11,x6 + adcs x21,x21,x15 + mul x15,x12,x6 + adcs x22,x22,x16 + mul x16,x13,x6 + adcs x23,x23,x17 + umulh x17,x7,x6 // hi(a[1..7]*a[0]) + adcs x24,x24,x14 + umulh x14,x8,x6 + adcs x25,x25,x15 + umulh x15,x9,x6 + adcs x26,x26,x16 + umulh x16,x10,x6 + stp x19,x20,[x2],#8*2 // t[0..1] + adc x19,xzr,xzr // t[8] + adds x21,x21,x17 // t[2]+lo(a[1]*a[0]) + umulh x17,x11,x6 + adcs x22,x22,x14 + umulh x14,x12,x6 + adcs x23,x23,x15 + umulh x15,x13,x6 + adcs x24,x24,x16 + mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii) + adcs x25,x25,x17 + mul x17,x9,x7 + adcs x26,x26,x14 + mul x14,x10,x7 + adc x19,x19,x15 + + mul x15,x11,x7 + adds x22,x22,x16 + mul x16,x12,x7 + adcs x23,x23,x17 + mul x17,x13,x7 + adcs x24,x24,x14 + umulh x14,x8,x7 // hi(a[2..7]*a[1]) + adcs x25,x25,x15 + umulh x15,x9,x7 + adcs x26,x26,x16 + umulh x16,x10,x7 + adcs x19,x19,x17 + umulh x17,x11,x7 + stp x21,x22,[x2],#8*2 // t[2..3] + adc x20,xzr,xzr // t[9] + adds x23,x23,x14 + umulh x14,x12,x7 + adcs x24,x24,x15 + umulh x15,x13,x7 + adcs x25,x25,x16 + mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii) + adcs x26,x26,x17 + mul x17,x10,x8 + adcs x19,x19,x14 + mul x14,x11,x8 + adc x20,x20,x15 + + mul x15,x12,x8 + adds x24,x24,x16 + mul x16,x13,x8 + adcs x25,x25,x17 + umulh x17,x9,x8 // hi(a[3..7]*a[2]) + adcs x26,x26,x14 + umulh x14,x10,x8 + adcs x19,x19,x15 + umulh x15,x11,x8 + adcs x20,x20,x16 + umulh x16,x12,x8 + stp x23,x24,[x2],#8*2 // t[4..5] + adc x21,xzr,xzr // t[10] + adds x25,x25,x17 + umulh x17,x13,x8 + adcs x26,x26,x14 + mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv) + adcs x19,x19,x15 + mul x15,x11,x9 + adcs x20,x20,x16 + mul x16,x12,x9 + adc x21,x21,x17 + + mul x17,x13,x9 + adds x26,x26,x14 + umulh x14,x10,x9 // hi(a[4..7]*a[3]) + adcs x19,x19,x15 + umulh x15,x11,x9 + adcs x20,x20,x16 + umulh x16,x12,x9 + adcs x21,x21,x17 + umulh x17,x13,x9 + stp x25,x26,[x2],#8*2 // t[6..7] + adc x22,xzr,xzr // t[11] + adds x19,x19,x14 + mul x14,x11,x10 // lo(a[5..7]*a[4]) (v) + adcs x20,x20,x15 + mul x15,x12,x10 + adcs x21,x21,x16 + mul x16,x13,x10 + adc x22,x22,x17 + + umulh 
x17,x11,x10 // hi(a[5..7]*a[4]) + adds x20,x20,x14 + umulh x14,x12,x10 + adcs x21,x21,x15 + umulh x15,x13,x10 + adcs x22,x22,x16 + mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi) + adc x23,xzr,xzr // t[12] + adds x21,x21,x17 + mul x17,x13,x11 + adcs x22,x22,x14 + umulh x14,x12,x11 // hi(a[6..7]*a[5]) + adc x23,x23,x15 + + umulh x15,x13,x11 + adds x22,x22,x16 + mul x16,x13,x12 // lo(a[7]*a[6]) (vii) + adcs x23,x23,x17 + umulh x17,x13,x12 // hi(a[7]*a[6]) + adc x24,xzr,xzr // t[13] + adds x23,x23,x14 + sub x27,x3,x1 // done yet? + adc x24,x24,x15 + + adds x24,x24,x16 + sub x14,x3,x5 // rewinded ap + adc x25,xzr,xzr // t[14] + add x25,x25,x17 + + cbz x27,Lsqr8x_outer_break + + mov x4,x6 + ldp x6,x7,[x2,#8*0] + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + adds x19,x19,x6 + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x0,x1 + adcs x26,xzr,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved below + mov x27,#-8*8 + + // a[8]a[0] + // a[9]a[0] + // a[a]a[0] + // a[b]a[0] + // a[c]a[0] + // a[d]a[0] + // a[e]a[0] + // a[f]a[0] + // a[8]a[1] + // a[f]a[1]........................ + // a[8]a[2] + // a[f]a[2]........................ + // a[8]a[3] + // a[f]a[3]........................ + // a[8]a[4] + // a[f]a[4]........................ + // a[8]a[5] + // a[f]a[5]........................ + // a[8]a[6] + // a[f]a[6]........................ + // a[8]a[7] + // a[f]a[7]........................ +Lsqr8x_mul: + mul x14,x6,x4 + adc x28,xzr,xzr // carry bit, modulo-scheduled + mul x15,x7,x4 + add x27,x27,#8 + mul x16,x8,x4 + mul x17,x9,x4 + adds x19,x19,x14 + mul x14,x10,x4 + adcs x20,x20,x15 + mul x15,x11,x4 + adcs x21,x21,x16 + mul x16,x12,x4 + adcs x22,x22,x17 + mul x17,x13,x4 + adcs x23,x23,x14 + umulh x14,x6,x4 + adcs x24,x24,x15 + umulh x15,x7,x4 + adcs x25,x25,x16 + umulh x16,x8,x4 + adcs x26,x26,x17 + umulh x17,x9,x4 + adc x28,x28,xzr + str x19,[x2],#8 + adds x19,x20,x14 + umulh x14,x10,x4 + adcs x20,x21,x15 + umulh x15,x11,x4 + adcs x21,x22,x16 + umulh x16,x12,x4 + adcs x22,x23,x17 + umulh x17,x13,x4 + ldr x4,[x0,x27] + adcs x23,x24,x14 + adcs x24,x25,x15 + adcs x25,x26,x16 + adcs x26,x28,x17 + //adc x28,xzr,xzr // moved above + cbnz x27,Lsqr8x_mul + // note that carry flag is guaranteed + // to be zero at this point + cmp x1,x3 // done yet? + b.eq Lsqr8x_break + + ldp x6,x7,[x2,#8*0] + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + adds x19,x19,x6 + ldr x4,[x0,#-8*8] + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x27,#-8*8 + adcs x26,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved above + b Lsqr8x_mul + +.align 4 +Lsqr8x_break: + ldp x6,x7,[x0,#8*0] + add x1,x0,#8*8 + ldp x8,x9,[x0,#8*2] + sub x14,x3,x1 // is it last iteration? 
+ ldp x10,x11,[x0,#8*4] + sub x15,x2,x14 + ldp x12,x13,[x0,#8*6] + cbz x14,Lsqr8x_outer_loop + + stp x19,x20,[x2,#8*0] + ldp x19,x20,[x15,#8*0] + stp x21,x22,[x2,#8*2] + ldp x21,x22,[x15,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[x15,#8*4] + stp x25,x26,[x2,#8*6] + mov x2,x15 + ldp x25,x26,[x15,#8*6] + b Lsqr8x_outer_loop + +.align 4 +Lsqr8x_outer_break: + // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] + ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0] + ldp x15,x16,[sp,#8*1] + ldp x11,x13,[x14,#8*2] + add x1,x14,#8*4 + ldp x17,x14,[sp,#8*3] + + stp x19,x20,[x2,#8*0] + mul x19,x7,x7 + stp x21,x22,[x2,#8*2] + umulh x7,x7,x7 + stp x23,x24,[x2,#8*4] + mul x8,x9,x9 + stp x25,x26,[x2,#8*6] + mov x2,sp + umulh x9,x9,x9 + adds x20,x7,x15,lsl#1 + extr x15,x16,x15,#63 + sub x27,x5,#8*4 + +Lsqr4x_shift_n_add: + adcs x21,x8,x15 + extr x16,x17,x16,#63 + sub x27,x27,#8*4 + adcs x22,x9,x16 + ldp x15,x16,[x2,#8*5] + mul x10,x11,x11 + ldp x7,x9,[x1],#8*2 + umulh x11,x11,x11 + mul x12,x13,x13 + umulh x13,x13,x13 + extr x17,x14,x17,#63 + stp x19,x20,[x2,#8*0] + adcs x23,x10,x17 + extr x14,x15,x14,#63 + stp x21,x22,[x2,#8*2] + adcs x24,x11,x14 + ldp x17,x14,[x2,#8*7] + extr x15,x16,x15,#63 + adcs x25,x12,x15 + extr x16,x17,x16,#63 + adcs x26,x13,x16 + ldp x15,x16,[x2,#8*9] + mul x6,x7,x7 + ldp x11,x13,[x1],#8*2 + umulh x7,x7,x7 + mul x8,x9,x9 + umulh x9,x9,x9 + stp x23,x24,[x2,#8*4] + extr x17,x14,x17,#63 + stp x25,x26,[x2,#8*6] + add x2,x2,#8*8 + adcs x19,x6,x17 + extr x14,x15,x14,#63 + adcs x20,x7,x14 + ldp x17,x14,[x2,#8*3] + extr x15,x16,x15,#63 + cbnz x27,Lsqr4x_shift_n_add + ldp x1,x4,[x29,#104] // pull np and n0 + + adcs x21,x8,x15 + extr x16,x17,x16,#63 + adcs x22,x9,x16 + ldp x15,x16,[x2,#8*5] + mul x10,x11,x11 + umulh x11,x11,x11 + stp x19,x20,[x2,#8*0] + mul x12,x13,x13 + umulh x13,x13,x13 + stp x21,x22,[x2,#8*2] + extr x17,x14,x17,#63 + adcs x23,x10,x17 + extr x14,x15,x14,#63 + ldp x19,x20,[sp,#8*0] + adcs x24,x11,x14 + extr x15,x16,x15,#63 + ldp x6,x7,[x1,#8*0] + adcs x25,x12,x15 + extr x16,xzr,x16,#63 + ldp x8,x9,[x1,#8*2] + adc x26,x13,x16 + ldp x10,x11,[x1,#8*4] + + // Reduce by 512 bits per iteration + mul x28,x4,x19 // t[0]*n0 + ldp x12,x13,[x1,#8*6] + add x3,x1,x5 + ldp x21,x22,[sp,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[sp,#8*4] + stp x25,x26,[x2,#8*6] + ldp x25,x26,[sp,#8*6] + add x1,x1,#8*8 + mov x30,xzr // initial top-most carry + mov x2,sp + mov x27,#8 + +Lsqr8x_reduction: + // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0) + mul x15,x7,x28 + sub x27,x27,#1 + mul x16,x8,x28 + str x28,[x2],#8 // put aside t[0]*n0 for tail processing + mul x17,x9,x28 + // (*) adds xzr,x19,x14 + subs xzr,x19,#1 // (*) + mul x14,x10,x28 + adcs x19,x20,x15 + mul x15,x11,x28 + adcs x20,x21,x16 + mul x16,x12,x28 + adcs x21,x22,x17 + mul x17,x13,x28 + adcs x22,x23,x14 + umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0) + adcs x23,x24,x15 + umulh x15,x7,x28 + adcs x24,x25,x16 + umulh x16,x8,x28 + adcs x25,x26,x17 + umulh x17,x9,x28 + adc x26,xzr,xzr + adds x19,x19,x14 + umulh x14,x10,x28 + adcs x20,x20,x15 + umulh x15,x11,x28 + adcs x21,x21,x16 + umulh x16,x12,x28 + adcs x22,x22,x17 + umulh x17,x13,x28 + mul x28,x4,x19 // next t[0]*n0 + adcs x23,x23,x14 + adcs x24,x24,x15 + adcs x25,x25,x16 + adc x26,x26,x17 + cbnz x27,Lsqr8x_reduction + + ldp x14,x15,[x2,#8*0] + ldp x16,x17,[x2,#8*2] + mov x0,x2 + sub x27,x3,x1 // done yet? 
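
The Lsqr4x_shift_n_add sequence above implements the squaring identity the earlier comment block lays out: accumulate every cross product a[i]*a[j] with i<j once, double the whole accumulator (the extr-based shift), and then add the diagonal a[i]^2 terms. A plain C sketch of that decomposition on 64-bit limbs (illustrative only; no Montgomery reduction here, unsigned __int128 assumed):

#include <stdint.h>
#include <string.h>

/* r (2*n limbs) = a^2 as "cross products, doubled, plus diagonals". */
static void sqr_limbs(uint64_t *r, const uint64_t *a, size_t n) {
    memset(r, 0, 2 * n * sizeof(uint64_t));
    for (size_t i = 0; i < n; i++) {            /* cross products, i < j      */
        uint64_t carry = 0;
        for (size_t j = i + 1; j < n; j++) {
            unsigned __int128 t = (unsigned __int128)a[i] * a[j] + r[i + j] + carry;
            r[i + j] = (uint64_t)t;
            carry    = (uint64_t)(t >> 64);
        }
        r[i + n] = carry;
    }
    uint64_t top = 0;                           /* double: r <<= 1            */
    for (size_t k = 0; k < 2 * n; k++) {
        uint64_t next = r[k] >> 63;
        r[k] = (r[k] << 1) | top;
        top = next;
    }
    uint64_t carry = 0;                         /* add the a[i]^2 diagonal    */
    for (size_t i = 0; i < n; i++) {
        unsigned __int128 lo = (unsigned __int128)a[i] * a[i] + r[2 * i] + carry;
        r[2 * i] = (uint64_t)lo;
        unsigned __int128 hi = (unsigned __int128)r[2 * i + 1] + (uint64_t)(lo >> 64);
        r[2 * i + 1] = (uint64_t)hi;
        carry = (uint64_t)(hi >> 64);
    }
}
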
+ adds x19,x19,x14 + adcs x20,x20,x15 + ldp x14,x15,[x2,#8*4] + adcs x21,x21,x16 + adcs x22,x22,x17 + ldp x16,x17,[x2,#8*6] + adcs x23,x23,x14 + adcs x24,x24,x15 + adcs x25,x25,x16 + adcs x26,x26,x17 + //adc x28,xzr,xzr // moved below + cbz x27,Lsqr8x8_post_condition + + ldr x4,[x2,#-8*8] + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + ldp x10,x11,[x1,#8*4] + mov x27,#-8*8 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + +Lsqr8x_tail: + mul x14,x6,x4 + adc x28,xzr,xzr // carry bit, modulo-scheduled + mul x15,x7,x4 + add x27,x27,#8 + mul x16,x8,x4 + mul x17,x9,x4 + adds x19,x19,x14 + mul x14,x10,x4 + adcs x20,x20,x15 + mul x15,x11,x4 + adcs x21,x21,x16 + mul x16,x12,x4 + adcs x22,x22,x17 + mul x17,x13,x4 + adcs x23,x23,x14 + umulh x14,x6,x4 + adcs x24,x24,x15 + umulh x15,x7,x4 + adcs x25,x25,x16 + umulh x16,x8,x4 + adcs x26,x26,x17 + umulh x17,x9,x4 + adc x28,x28,xzr + str x19,[x2],#8 + adds x19,x20,x14 + umulh x14,x10,x4 + adcs x20,x21,x15 + umulh x15,x11,x4 + adcs x21,x22,x16 + umulh x16,x12,x4 + adcs x22,x23,x17 + umulh x17,x13,x4 + ldr x4,[x0,x27] + adcs x23,x24,x14 + adcs x24,x25,x15 + adcs x25,x26,x16 + adcs x26,x28,x17 + //adc x28,xzr,xzr // moved above + cbnz x27,Lsqr8x_tail + // note that carry flag is guaranteed + // to be zero at this point + ldp x6,x7,[x2,#8*0] + sub x27,x3,x1 // done yet? + sub x16,x3,x5 // rewinded np + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + cbz x27,Lsqr8x_tail_break + + ldr x4,[x0,#-8*8] + adds x19,x19,x6 + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x27,#-8*8 + adcs x26,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved above + b Lsqr8x_tail + +.align 4 +Lsqr8x_tail_break: + ldr x4,[x29,#112] // pull n0 + add x27,x2,#8*8 // end of current t[num] window + + subs xzr,x30,#1 // "move" top-most carry to carry bit + adcs x14,x19,x6 + adcs x15,x20,x7 + ldp x19,x20,[x0,#8*0] + adcs x21,x21,x8 + ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0] + adcs x22,x22,x9 + ldp x8,x9,[x16,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x16,#8*4] + adcs x25,x25,x12 + adcs x26,x26,x13 + ldp x12,x13,[x16,#8*6] + add x1,x16,#8*8 + adc x30,xzr,xzr // top-most carry + mul x28,x4,x19 + stp x14,x15,[x2,#8*0] + stp x21,x22,[x2,#8*2] + ldp x21,x22,[x0,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[x0,#8*4] + cmp x27,x29 // did we hit the bottom? + stp x25,x26,[x2,#8*6] + mov x2,x0 // slide the window + ldp x25,x26,[x0,#8*6] + mov x27,#8 + b.ne Lsqr8x_reduction + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. 
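
The "Final step" comment above describes the usual constant-time tail of Montgomery reduction: subtract the modulus unconditionally, then use the final borrow (together with the top-most carry word) to decide, without branching, whether to keep the difference or the original value, which is what the sbcs/csel chains in Lsqr8x_sub and Lsqr4x_cond_copy do. A masked-select C sketch of that step (hypothetical helper names; unsigned __int128 assumed), as opposed to the branchy select in the earlier sketch:

#include <stddef.h>
#include <stdint.h>

/* Constant-time finish: d = t - n with a borrow chain, then select t when
 * the subtraction borrowed past the top carry, and d otherwise.            */
static void mont_cond_sub(uint64_t *rp, const uint64_t *tp, const uint64_t *np,
                          size_t num, uint64_t top_carry) {
    uint64_t borrow = 0;
    for (size_t i = 0; i < num; i++) {
        unsigned __int128 d = (unsigned __int128)tp[i] - np[i] - borrow;
        rp[i]  = (uint64_t)d;
        borrow = (uint64_t)(d >> 64) & 1;
    }
    /* Did the whole subtraction borrow?  top_carry is 0 or 1. */
    uint64_t keep_t = (uint64_t)0 - (uint64_t)(top_carry < borrow);  /* mask */
    for (size_t i = 0; i < num; i++)
        rp[i] = (tp[i] & keep_t) | (rp[i] & ~keep_t);
}
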
+ ldr x0,[x29,#96] // pull rp + add x2,x2,#8*8 + subs x14,x19,x6 + sbcs x15,x20,x7 + sub x27,x5,#8*8 + mov x3,x0 // x0 copy + +Lsqr8x_sub: + sbcs x16,x21,x8 + ldp x6,x7,[x1,#8*0] + sbcs x17,x22,x9 + stp x14,x15,[x0,#8*0] + sbcs x14,x23,x10 + ldp x8,x9,[x1,#8*2] + sbcs x15,x24,x11 + stp x16,x17,[x0,#8*2] + sbcs x16,x25,x12 + ldp x10,x11,[x1,#8*4] + sbcs x17,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + ldp x19,x20,[x2,#8*0] + sub x27,x27,#8*8 + ldp x21,x22,[x2,#8*2] + ldp x23,x24,[x2,#8*4] + ldp x25,x26,[x2,#8*6] + add x2,x2,#8*8 + stp x14,x15,[x0,#8*4] + sbcs x14,x19,x6 + stp x16,x17,[x0,#8*6] + add x0,x0,#8*8 + sbcs x15,x20,x7 + cbnz x27,Lsqr8x_sub + + sbcs x16,x21,x8 + mov x2,sp + add x1,sp,x5 + ldp x6,x7,[x3,#8*0] + sbcs x17,x22,x9 + stp x14,x15,[x0,#8*0] + sbcs x14,x23,x10 + ldp x8,x9,[x3,#8*2] + sbcs x15,x24,x11 + stp x16,x17,[x0,#8*2] + sbcs x16,x25,x12 + ldp x19,x20,[x1,#8*0] + sbcs x17,x26,x13 + ldp x21,x22,[x1,#8*2] + sbcs xzr,x30,xzr // did it borrow? + ldr x30,[x29,#8] // pull return address + stp x14,x15,[x0,#8*4] + stp x16,x17,[x0,#8*6] + + sub x27,x5,#8*4 +Lsqr4x_cond_copy: + sub x27,x27,#8*4 + csel x14,x19,x6,lo + stp xzr,xzr,[x2,#8*0] + csel x15,x20,x7,lo + ldp x6,x7,[x3,#8*4] + ldp x19,x20,[x1,#8*4] + csel x16,x21,x8,lo + stp xzr,xzr,[x2,#8*2] + add x2,x2,#8*4 + csel x17,x22,x9,lo + ldp x8,x9,[x3,#8*6] + ldp x21,x22,[x1,#8*6] + add x1,x1,#8*4 + stp x14,x15,[x3,#8*0] + stp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + stp xzr,xzr,[x1,#8*0] + stp xzr,xzr,[x1,#8*2] + cbnz x27,Lsqr4x_cond_copy + + csel x14,x19,x6,lo + stp xzr,xzr,[x2,#8*0] + csel x15,x20,x7,lo + stp xzr,xzr,[x2,#8*2] + csel x16,x21,x8,lo + csel x17,x22,x9,lo + stp x14,x15,[x3,#8*0] + stp x16,x17,[x3,#8*2] + + b Lsqr8x_done + +.align 4 +Lsqr8x8_post_condition: + adc x28,xzr,xzr + ldr x30,[x29,#8] // pull return address + // x19-7,x28 hold result, x6-7 hold modulus + subs x6,x19,x6 + ldr x1,[x29,#96] // pull rp + sbcs x7,x20,x7 + stp xzr,xzr,[sp,#8*0] + sbcs x8,x21,x8 + stp xzr,xzr,[sp,#8*2] + sbcs x9,x22,x9 + stp xzr,xzr,[sp,#8*4] + sbcs x10,x23,x10 + stp xzr,xzr,[sp,#8*6] + sbcs x11,x24,x11 + stp xzr,xzr,[sp,#8*8] + sbcs x12,x25,x12 + stp xzr,xzr,[sp,#8*10] + sbcs x13,x26,x13 + stp xzr,xzr,[sp,#8*12] + sbcs x28,x28,xzr // did it borrow? + stp xzr,xzr,[sp,#8*14] + + // x6-7 hold result-modulus + csel x6,x19,x6,lo + csel x7,x20,x7,lo + csel x8,x21,x8,lo + csel x9,x22,x9,lo + stp x6,x7,[x1,#8*0] + csel x10,x23,x10,lo + csel x11,x24,x11,lo + stp x8,x9,[x1,#8*2] + csel x12,x25,x12,lo + csel x13,x26,x13,lo + stp x10,x11,[x1,#8*4] + stp x12,x13,[x1,#8*6] + +Lsqr8x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + ret + + +.align 5 +__bn_mul4x_mont: + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + sub x26,sp,x5,lsl#3 + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + sub sp,x26,#8*4 // alloca + + add x10,x2,x5 + add x27,x1,x5 + stp x0,x10,[x29,#96] // offload rp and &b[num] + + ldr x24,[x2,#8*0] // b[0] + ldp x6,x7,[x1,#8*0] // a[0..3] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + mov x19,xzr + mov x20,xzr + mov x21,xzr + mov x22,xzr + ldp x14,x15,[x3,#8*0] // n[0..3] + ldp x16,x17,[x3,#8*2] + adds x3,x3,#8*4 // clear carry bit + mov x0,xzr + mov x28,#0 + mov x26,sp + +Loop_mul4x_1st_reduction: + mul x10,x6,x24 // lo(a[0..3]*b[0]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[0..3]*b[0]) + adcs x20,x20,x11 + mul x25,x19,x4 // t[0]*n0 + adcs x21,x21,x12 + umulh x11,x7,x24 + adcs x22,x22,x13 + umulh x12,x8,x24 + adc x23,xzr,xzr + umulh x13,x9,x24 + ldr x24,[x2,x28] // next b[i] (or b[0]) + adds x20,x20,x10 + // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0) + str x25,[x26],#8 // put aside t[0]*n0 for tail processing + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + // (*) adds xzr,x19,x10 + subs xzr,x19,#1 // (*) + umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0) + adcs x19,x20,x11 + umulh x11,x15,x25 + adcs x20,x21,x12 + umulh x12,x16,x25 + adcs x21,x22,x13 + umulh x13,x17,x25 + adcs x22,x23,x0 + adc x0,xzr,xzr + adds x19,x19,x10 + sub x10,x27,x1 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_1st_reduction + + cbz x10,Lmul4x4_post_condition + + ldp x6,x7,[x1,#8*0] // a[4..7] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + ldr x25,[sp] // a[0]*n0 + ldp x14,x15,[x3,#8*0] // n[4..7] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + +Loop_mul4x_1st_tail: + mul x10,x6,x24 // lo(a[4..7]*b[i]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[4..7]*b[i]) + adcs x20,x20,x11 + umulh x11,x7,x24 + adcs x21,x21,x12 + umulh x12,x8,x24 + adcs x22,x22,x13 + umulh x13,x9,x24 + adc x23,xzr,xzr + ldr x24,[x2,x28] // next b[i] (or b[0]) + adds x20,x20,x10 + mul x10,x14,x25 // lo(n[4..7]*a[0]*n0) + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + adds x19,x19,x10 + umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0) + adcs x20,x20,x11 + umulh x11,x15,x25 + adcs x21,x21,x12 + umulh x12,x16,x25 + adcs x22,x22,x13 + adcs x23,x23,x0 + umulh x13,x17,x25 + adc x0,xzr,xzr + ldr x25,[sp,x28] // next t[0]*n0 + str x19,[x26],#8 // result!!! + adds x19,x20,x10 + sub x10,x27,x1 // done yet? + adcs x20,x21,x11 + adcs x21,x22,x12 + adcs x22,x23,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_1st_tail + + sub x11,x27,x5 // rewinded x1 + cbz x10,Lmul4x_proceed + + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + ldp x14,x15,[x3,#8*0] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + b Loop_mul4x_1st_tail + +.align 5 +Lmul4x_proceed: + ldr x24,[x2,#8*4]! // *++b + adc x30,x0,xzr + ldp x6,x7,[x11,#8*0] // a[0..3] + sub x3,x3,x5 // rewind np + ldp x8,x9,[x11,#8*2] + add x1,x11,#8*4 + + stp x19,x20,[x26,#8*0] // result!!! + ldp x19,x20,[sp,#8*4] // t[0..3] + stp x21,x22,[x26,#8*2] // result!!! 
+ ldp x21,x22,[sp,#8*6] + + ldp x14,x15,[x3,#8*0] // n[0..3] + mov x26,sp + ldp x16,x17,[x3,#8*2] + adds x3,x3,#8*4 // clear carry bit + mov x0,xzr + +.align 4 +Loop_mul4x_reduction: + mul x10,x6,x24 // lo(a[0..3]*b[4]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[0..3]*b[4]) + adcs x20,x20,x11 + mul x25,x19,x4 // t[0]*n0 + adcs x21,x21,x12 + umulh x11,x7,x24 + adcs x22,x22,x13 + umulh x12,x8,x24 + adc x23,xzr,xzr + umulh x13,x9,x24 + ldr x24,[x2,x28] // next b[i] + adds x20,x20,x10 + // (*) mul x10,x14,x25 + str x25,[x26],#8 // put aside t[0]*n0 for tail processing + adcs x21,x21,x11 + mul x11,x15,x25 // lo(n[0..3]*t[0]*n0 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + // (*) adds xzr,x19,x10 + subs xzr,x19,#1 // (*) + umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0 + adcs x19,x20,x11 + umulh x11,x15,x25 + adcs x20,x21,x12 + umulh x12,x16,x25 + adcs x21,x22,x13 + umulh x13,x17,x25 + adcs x22,x23,x0 + adc x0,xzr,xzr + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_reduction + + adc x0,x0,xzr + ldp x10,x11,[x26,#8*4] // t[4..7] + ldp x12,x13,[x26,#8*6] + ldp x6,x7,[x1,#8*0] // a[4..7] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + + ldr x25,[sp] // t[0]*n0 + ldp x14,x15,[x3,#8*0] // n[4..7] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + +.align 4 +Loop_mul4x_tail: + mul x10,x6,x24 // lo(a[4..7]*b[4]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[4..7]*b[4]) + adcs x20,x20,x11 + umulh x11,x7,x24 + adcs x21,x21,x12 + umulh x12,x8,x24 + adcs x22,x22,x13 + umulh x13,x9,x24 + adc x23,xzr,xzr + ldr x24,[x2,x28] // next b[i] + adds x20,x20,x10 + mul x10,x14,x25 // lo(n[4..7]*t[0]*n0) + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + adds x19,x19,x10 + umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0) + adcs x20,x20,x11 + umulh x11,x15,x25 + adcs x21,x21,x12 + umulh x12,x16,x25 + adcs x22,x22,x13 + umulh x13,x17,x25 + adcs x23,x23,x0 + ldr x25,[sp,x28] // next a[0]*n0 + adc x0,xzr,xzr + str x19,[x26],#8 // result!!! + adds x19,x20,x10 + sub x10,x27,x1 // done yet? + adcs x20,x21,x11 + adcs x21,x22,x12 + adcs x22,x23,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_tail + + sub x11,x3,x5 // rewinded np? + adc x0,x0,xzr + cbz x10,Loop_mul4x_break + + ldp x10,x11,[x26,#8*4] + ldp x12,x13,[x26,#8*6] + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + ldp x14,x15,[x3,#8*0] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + b Loop_mul4x_tail + +.align 4 +Loop_mul4x_break: + ldp x12,x13,[x29,#96] // pull rp and &b[num] + adds x19,x19,x30 + add x2,x2,#8*4 // bp++ + adcs x20,x20,xzr + sub x1,x1,x5 // rewind ap + adcs x21,x21,xzr + stp x19,x20,[x26,#8*0] // result!!! + adcs x22,x22,xzr + ldp x19,x20,[sp,#8*4] // t[0..3] + adc x30,x0,xzr + stp x21,x22,[x26,#8*2] // result!!! + cmp x2,x13 // done yet? 
+ ldp x21,x22,[sp,#8*6] + ldp x14,x15,[x11,#8*0] // n[0..3] + ldp x16,x17,[x11,#8*2] + add x3,x11,#8*4 + b.eq Lmul4x_post + + ldr x24,[x2] + ldp x6,x7,[x1,#8*0] // a[0..3] + ldp x8,x9,[x1,#8*2] + adds x1,x1,#8*4 // clear carry bit + mov x0,xzr + mov x26,sp + b Loop_mul4x_reduction + +.align 4 +Lmul4x_post: + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + mov x0,x12 + mov x27,x12 // x0 copy + subs x10,x19,x14 + add x26,sp,#8*8 + sbcs x11,x20,x15 + sub x28,x5,#8*4 + +Lmul4x_sub: + sbcs x12,x21,x16 + ldp x14,x15,[x3,#8*0] + sub x28,x28,#8*4 + ldp x19,x20,[x26,#8*0] + sbcs x13,x22,x17 + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + ldp x21,x22,[x26,#8*2] + add x26,x26,#8*4 + stp x10,x11,[x0,#8*0] + sbcs x10,x19,x14 + stp x12,x13,[x0,#8*2] + add x0,x0,#8*4 + sbcs x11,x20,x15 + cbnz x28,Lmul4x_sub + + sbcs x12,x21,x16 + mov x26,sp + add x1,sp,#8*4 + ldp x6,x7,[x27,#8*0] + sbcs x13,x22,x17 + stp x10,x11,[x0,#8*0] + ldp x8,x9,[x27,#8*2] + stp x12,x13,[x0,#8*2] + ldp x19,x20,[x1,#8*0] + ldp x21,x22,[x1,#8*2] + sbcs xzr,x30,xzr // did it borrow? + ldr x30,[x29,#8] // pull return address + + sub x28,x5,#8*4 +Lmul4x_cond_copy: + sub x28,x28,#8*4 + csel x10,x19,x6,lo + stp xzr,xzr,[x26,#8*0] + csel x11,x20,x7,lo + ldp x6,x7,[x27,#8*4] + ldp x19,x20,[x1,#8*4] + csel x12,x21,x8,lo + stp xzr,xzr,[x26,#8*2] + add x26,x26,#8*4 + csel x13,x22,x9,lo + ldp x8,x9,[x27,#8*6] + ldp x21,x22,[x1,#8*6] + add x1,x1,#8*4 + stp x10,x11,[x27,#8*0] + stp x12,x13,[x27,#8*2] + add x27,x27,#8*4 + cbnz x28,Lmul4x_cond_copy + + csel x10,x19,x6,lo + stp xzr,xzr,[x26,#8*0] + csel x11,x20,x7,lo + stp xzr,xzr,[x26,#8*2] + csel x12,x21,x8,lo + stp xzr,xzr,[x26,#8*3] + csel x13,x22,x9,lo + stp xzr,xzr,[x26,#8*4] + stp x10,x11,[x27,#8*0] + stp x12,x13,[x27,#8*2] + + b Lmul4x_done + +.align 4 +Lmul4x4_post_condition: + adc x0,x0,xzr + ldr x1,[x29,#96] // pull rp + // x19-3,x0 hold result, x14-7 hold modulus + subs x6,x19,x14 + ldr x30,[x29,#8] // pull return address + sbcs x7,x20,x15 + stp xzr,xzr,[sp,#8*0] + sbcs x8,x21,x16 + stp xzr,xzr,[sp,#8*2] + sbcs x9,x22,x17 + stp xzr,xzr,[sp,#8*4] + sbcs xzr,x0,xzr // did it borrow? + stp xzr,xzr,[sp,#8*6] + + // x6-3 hold result-modulus + csel x6,x19,x6,lo + csel x7,x20,x7,lo + csel x8,x21,x8,lo + csel x9,x22,x9,lo + stp x6,x7,[x1,#8*0] + stp x8,x9,[x1,#8*2] + +Lmul4x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + ret + +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 4 +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/ghash-neon-armv8.S b/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/ghash-neon-armv8.S new file mode 100644 index 0000000000..60bff31018 --- /dev/null +++ b/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/ghash-neon-armv8.S @@ -0,0 +1,338 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text + +.globl _gcm_init_neon +.private_extern _gcm_init_neon + +.align 4 +_gcm_init_neon: + // This function is adapted from gcm_init_v8. xC2 is t3. + ld1 {v17.2d}, [x1] // load H + movi v19.16b, #0xe1 + shl v19.2d, v19.2d, #57 // 0xc2.0 + ext v3.16b, v17.16b, v17.16b, #8 + ushr v18.2d, v19.2d, #63 + dup v17.4s, v17.s[1] + ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01 + ushr v18.2d, v3.2d, #63 + sshr v17.4s, v17.4s, #31 // broadcast carry bit + and v18.16b, v18.16b, v16.16b + shl v3.2d, v3.2d, #1 + ext v18.16b, v18.16b, v18.16b, #8 + and v16.16b, v16.16b, v17.16b + orr v3.16b, v3.16b, v18.16b // H<<<=1 + eor v5.16b, v3.16b, v16.16b // twisted H + st1 {v5.2d}, [x0] // store Htable[0] + ret + + +.globl _gcm_gmult_neon +.private_extern _gcm_gmult_neon + +.align 4 +_gcm_gmult_neon: + ld1 {v3.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, Lmasks@PAGE // load constants + add x9, x9, Lmasks@PAGEOFF + ld1 {v24.2d, v25.2d}, [x9] + rev64 v3.16b, v3.16b // byteswap Xi + ext v3.16b, v3.16b, v3.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + + mov x3, #16 + b Lgmult_neon + + +.globl _gcm_ghash_neon +.private_extern _gcm_ghash_neon + +.align 4 +_gcm_ghash_neon: + ld1 {v0.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, Lmasks@PAGE // load constants + add x9, x9, Lmasks@PAGEOFF + ld1 {v24.2d, v25.2d}, [x9] + rev64 v0.16b, v0.16b // byteswap Xi + ext v0.16b, v0.16b, v0.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + +Loop_neon: + ld1 {v3.16b}, [x2], #16 // load inp + rev64 v3.16b, v3.16b // byteswap inp + ext v3.16b, v3.16b, v3.16b, #8 + eor v3.16b, v3.16b, v0.16b // inp ^= Xi + +Lgmult_neon: + // Split the input into v3 and v4. (The upper halves are unused, + // so it is okay to leave them alone.) + ins v4.d[0], v3.d[1] + ext v16.8b, v5.8b, v5.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v0.8b, v3.8b, v3.8b, #1 // B1 + pmull v0.8h, v5.8b, v0.8b // E = A*B1 + ext v17.8b, v5.8b, v5.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v5.8b, v19.8b // G = A*B2 + ext v18.8b, v5.8b, v5.8b, #3 // A3 + eor v16.16b, v16.16b, v0.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v0.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v0.8h, v5.8b, v0.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. 
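
The comments in Lgmult_neon below spell out how this path builds one 64x64 carry-less product out of 8-bit PMULLs; the surrounding structure is plain Karatsuba over GF(2)[x]: three 64x64 products (low half, high half, and the product of the XOR-folded halves) instead of four. A C sketch of that outer structure, with a scalar bit-loop standing in for PMULL (illustrative only; the 256-bit product still has to be reduced modulo the GHASH polynomial, which the 1st/2nd-phase code further down does in the bit-reflected representation):

#include <stdint.h>

/* 64x64 -> 128 carry-less multiply, a scalar stand-in for PMULL. */
static void clmul64(uint64_t a, uint64_t b, uint64_t out[2]) {
    uint64_t lo = 0, hi = 0;
    for (int i = 0; i < 64; i++) {
        if ((b >> i) & 1) {
            lo ^= a << i;
            if (i) hi ^= a >> (64 - i);
        }
    }
    out[0] = lo;
    out[1] = hi;
}

/* Karatsuba over GF(2)[x]: h and x are 128-bit values as {lo, hi} pairs,
 * r is the 256-bit product.  Additions are XORs, so there are no carries. */
static void clmul128_karatsuba(const uint64_t h[2], const uint64_t x[2],
                               uint64_t r[4]) {
    uint64_t xl[2], xh[2], xm[2];
    clmul64(h[0], x[0], xl);                      /* H.lo * X.lo              */
    clmul64(h[1], x[1], xh);                      /* H.hi * X.hi              */
    clmul64(h[0] ^ h[1], x[0] ^ x[1], xm);        /* (H.lo+H.hi)*(X.lo+X.hi)  */
    xm[0] ^= xl[0] ^ xh[0];                       /* Karatsuba                */
    xm[1] ^= xl[1] ^ xh[1];                       /* post-processing          */
    r[0] = xl[0];
    r[1] = xl[1] ^ xm[0];
    r[2] = xh[0] ^ xm[1];
    r[3] = xh[1];
}
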
+ + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v0.16b // N = I + J + pmull v19.8h, v5.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. + zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v0.8h, v5.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v0.16b, v0.16b, v16.16b + eor v0.16b, v0.16b, v18.16b + eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing + ext v16.8b, v7.8b, v7.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v1.8b, v3.8b, v3.8b, #1 // B1 + pmull v1.8h, v7.8b, v1.8b // E = A*B1 + ext v17.8b, v7.8b, v7.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v7.8b, v19.8b // G = A*B2 + ext v18.8b, v7.8b, v7.8b, #3 // A3 + eor v16.16b, v16.16b, v1.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v1.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v1.8h, v7.8b, v1.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v1.16b // N = I + J + pmull v19.8h, v7.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. 
+ zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v1.8h, v7.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v1.16b, v1.16b, v16.16b + eor v1.16b, v1.16b, v18.16b + ext v16.8b, v6.8b, v6.8b, #1 // A1 + pmull v16.8h, v16.8b, v4.8b // F = A1*B + ext v2.8b, v4.8b, v4.8b, #1 // B1 + pmull v2.8h, v6.8b, v2.8b // E = A*B1 + ext v17.8b, v6.8b, v6.8b, #2 // A2 + pmull v17.8h, v17.8b, v4.8b // H = A2*B + ext v19.8b, v4.8b, v4.8b, #2 // B2 + pmull v19.8h, v6.8b, v19.8b // G = A*B2 + ext v18.8b, v6.8b, v6.8b, #3 // A3 + eor v16.16b, v16.16b, v2.16b // L = E + F + pmull v18.8h, v18.8b, v4.8b // J = A3*B + ext v2.8b, v4.8b, v4.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v2.8h, v6.8b, v2.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v4.8b, v4.8b, #4 // B4 + eor v18.16b, v18.16b, v2.16b // N = I + J + pmull v19.8h, v6.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. + zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v2.8h, v6.8b, v4.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v2.16b, v2.16b, v16.16b + eor v2.16b, v2.16b, v18.16b + ext v16.16b, v0.16b, v2.16b, #8 + eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing + eor v1.16b, v1.16b, v2.16b + eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi + ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result + // This is a no-op due to the ins instruction below. 
+ // ins v2.d[0], v1.d[1] + + // equivalent of reduction_avx from ghash-x86_64.pl + shl v17.2d, v0.2d, #57 // 1st phase + shl v18.2d, v0.2d, #62 + eor v18.16b, v18.16b, v17.16b // + shl v17.2d, v0.2d, #63 + eor v18.16b, v18.16b, v17.16b // + // Note Xm contains {Xl.d[1], Xh.d[0]}. + eor v18.16b, v18.16b, v1.16b + ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0] + ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1] + + ushr v18.2d, v0.2d, #1 // 2nd phase + eor v2.16b, v2.16b,v0.16b + eor v0.16b, v0.16b,v18.16b // + ushr v18.2d, v18.2d, #6 + ushr v0.2d, v0.2d, #1 // + eor v0.16b, v0.16b, v2.16b // + eor v0.16b, v0.16b, v18.16b // + + subs x3, x3, #16 + bne Loop_neon + + rev64 v0.16b, v0.16b // byteswap Xi and write + ext v0.16b, v0.16b, v0.16b, #8 + st1 {v0.16b}, [x0] + + ret + + +.section __TEXT,__const +.align 4 +Lmasks: +.quad 0x0000ffffffffffff // k48 +.quad 0x00000000ffffffff // k32 +.quad 0x000000000000ffff // k16 +.quad 0x0000000000000000 // k0 +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/ghashv8-armx64.S b/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/ghashv8-armx64.S new file mode 100644 index 0000000000..be0e283c36 --- /dev/null +++ b/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/ghashv8-armx64.S @@ -0,0 +1,246 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include + +.text + +.globl _gcm_init_v8 +.private_extern _gcm_init_v8 + +.align 4 +_gcm_init_v8: + ld1 {v17.2d},[x1] //load input H + movi v19.16b,#0xe1 + shl v19.2d,v19.2d,#57 //0xc2.0 + ext v3.16b,v17.16b,v17.16b,#8 + ushr v18.2d,v19.2d,#63 + dup v17.4s,v17.s[1] + ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01 + ushr v18.2d,v3.2d,#63 + sshr v17.4s,v17.4s,#31 //broadcast carry bit + and v18.16b,v18.16b,v16.16b + shl v3.2d,v3.2d,#1 + ext v18.16b,v18.16b,v18.16b,#8 + and v16.16b,v16.16b,v17.16b + orr v3.16b,v3.16b,v18.16b //H<<<=1 + eor v20.16b,v3.16b,v16.16b //twisted H + st1 {v20.2d},[x0],#16 //store Htable[0] + + //calculate H^2 + ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing + pmull v0.1q,v20.1d,v20.1d + eor v16.16b,v16.16b,v20.16b + pmull2 v2.1q,v20.2d,v20.2d + pmull v1.1q,v16.1d,v16.1d + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v22.16b,v0.16b,v18.16b + + ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing + eor v17.16b,v17.16b,v22.16b + ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v21.2d,v22.2d},[x0] //store Htable[1..2] + + ret + +.globl _gcm_gmult_v8 +.private_extern _gcm_gmult_v8 + +.align 4 +_gcm_gmult_v8: + ld1 {v17.2d},[x0] //load Xi + movi v19.16b,#0xe1 + ld1 {v20.2d,v21.2d},[x1] //load twisted H, ... 
+ shl v19.2d,v19.2d,#57 +#ifndef __ARMEB__ + rev64 v17.16b,v17.16b +#endif + ext v3.16b,v17.16b,v17.16b,#8 + + pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo + eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi + pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + +#ifndef __ARMEB__ + rev64 v0.16b,v0.16b +#endif + ext v0.16b,v0.16b,v0.16b,#8 + st1 {v0.2d},[x0] //write out Xi + + ret + +.globl _gcm_ghash_v8 +.private_extern _gcm_ghash_v8 + +.align 4 +_gcm_ghash_v8: + ld1 {v0.2d},[x0] //load [rotated] Xi + //"[rotated]" means that + //loaded value would have + //to be rotated in order to + //make it appear as in + //algorithm specification + subs x3,x3,#32 //see if x3 is 32 or larger + mov x12,#16 //x12 is used as post- + //increment for input pointer; + //as loop is modulo-scheduled + //x12 is zeroed just in time + //to preclude overstepping + //inp[len], which means that + //last block[s] are actually + //loaded twice, but last + //copy is not processed + ld1 {v20.2d,v21.2d},[x1],#32 //load twisted H, ..., H^2 + movi v19.16b,#0xe1 + ld1 {v22.2d},[x1] + csel x12,xzr,x12,eq //is it time to zero x12? + ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi + ld1 {v16.2d},[x2],#16 //load [rotated] I[0] + shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant +#ifndef __ARMEB__ + rev64 v16.16b,v16.16b + rev64 v0.16b,v0.16b +#endif + ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0] + b.lo Lodd_tail_v8 //x3 was less than 32 + ld1 {v17.2d},[x2],x12 //load [rotated] I[1] +#ifndef __ARMEB__ + rev64 v17.16b,v17.16b +#endif + ext v7.16b,v17.16b,v17.16b,#8 + eor v3.16b,v3.16b,v0.16b //I[i]^=Xi + pmull v4.1q,v20.1d,v7.1d //H·Ii+1 + eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing + pmull2 v6.1q,v20.2d,v7.2d + b Loop_mod2x_v8 + +.align 4 +Loop_mod2x_v8: + ext v18.16b,v3.16b,v3.16b,#8 + subs x3,x3,#32 //is there more data? + pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo + csel x12,xzr,x12,lo //is it time to zero x12? + + pmull v5.1q,v21.1d,v17.1d + eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi + eor v0.16b,v0.16b,v4.16b //accumulate + pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) + ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2] + + eor v2.16b,v2.16b,v6.16b + csel x12,xzr,x12,eq //is it time to zero x12? 
+ eor v1.16b,v1.16b,v5.16b + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3] +#ifndef __ARMEB__ + rev64 v16.16b,v16.16b +#endif + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + +#ifndef __ARMEB__ + rev64 v17.16b,v17.16b +#endif + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + ext v7.16b,v17.16b,v17.16b,#8 + ext v3.16b,v16.16b,v16.16b,#8 + eor v0.16b,v1.16b,v18.16b + pmull v4.1q,v20.1d,v7.1d //H·Ii+1 + eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v3.16b,v3.16b,v18.16b + eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing + eor v3.16b,v3.16b,v0.16b + pmull2 v6.1q,v20.2d,v7.2d + b.hs Loop_mod2x_v8 //there was at least 32 more bytes + + eor v2.16b,v2.16b,v18.16b + ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b + adds x3,x3,#32 //re-construct x3 + eor v0.16b,v0.16b,v2.16b //re-construct v0.16b + b.eq Ldone_v8 //is x3 zero? +Lodd_tail_v8: + ext v18.16b,v0.16b,v0.16b,#8 + eor v3.16b,v3.16b,v0.16b //inp^=Xi + eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi + + pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo + eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi + pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + +Ldone_v8: +#ifndef __ARMEB__ + rev64 v0.16b,v0.16b +#endif + ext v0.16b,v0.16b,v0.16b,#8 + st1 {v0.2d},[x0] //write out Xi + + ret + +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/sha1-armv8.S b/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/sha1-armv8.S new file mode 100644 index 0000000000..379107efbf --- /dev/null +++ b/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/sha1-armv8.S @@ -0,0 +1,1232 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include + +.text + + +.globl _sha1_block_data_order +.private_extern _sha1_block_data_order + +.align 6 +_sha1_block_data_order: +#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10 + adrp x16,:pg_hi21_nc:_OPENSSL_armcap_P +#else + adrp x16,_OPENSSL_armcap_P@PAGE +#endif + ldr w16,[x16,_OPENSSL_armcap_P@PAGEOFF] + tst w16,#ARMV8_SHA1 + b.ne Lv8_entry + + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp w20,w21,[x0] + ldp w22,w23,[x0,#8] + ldr w24,[x0,#16] + +Loop: + ldr x3,[x1],#64 + movz w28,#0x7999 + sub x2,x2,#1 + movk w28,#0x5a82,lsl#16 +#ifdef __ARMEB__ + ror x3,x3,#32 +#else + rev32 x3,x3 +#endif + add w24,w24,w28 // warm it up + add w24,w24,w3 + lsr x4,x3,#32 + ldr x5,[x1,#-56] + bic w25,w23,w21 + and w26,w22,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + orr w25,w25,w26 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + add w23,w23,w4 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) +#ifdef __ARMEB__ + ror x5,x5,#32 +#else + rev32 x5,x5 +#endif + bic w25,w22,w20 + and w26,w21,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + orr w25,w25,w26 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + add w22,w22,w5 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + lsr x6,x5,#32 + ldr x7,[x1,#-48] + bic w25,w21,w24 + and w26,w20,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + orr w25,w25,w26 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + add w21,w21,w6 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) +#ifdef __ARMEB__ + ror x7,x7,#32 +#else + rev32 x7,x7 +#endif + bic w25,w20,w23 + and w26,w24,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + orr w25,w25,w26 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + add w20,w20,w7 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + lsr x8,x7,#32 + ldr x9,[x1,#-40] + bic w25,w24,w22 + and w26,w23,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + orr w25,w25,w26 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + add w24,w24,w8 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) +#ifdef __ARMEB__ + ror x9,x9,#32 +#else + rev32 x9,x9 +#endif + bic w25,w23,w21 + and w26,w22,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + orr w25,w25,w26 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + add w23,w23,w9 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + lsr x10,x9,#32 + ldr x11,[x1,#-32] + bic w25,w22,w20 + and w26,w21,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + orr w25,w25,w26 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + add w22,w22,w10 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) +#ifdef __ARMEB__ + ror x11,x11,#32 +#else + rev32 x11,x11 +#endif + bic w25,w21,w24 + and w26,w20,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + orr w25,w25,w26 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + add w21,w21,w11 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + lsr x12,x11,#32 + ldr x13,[x1,#-24] + bic w25,w20,w23 + and w26,w24,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + orr w25,w25,w26 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + add w20,w20,w12 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) +#ifdef __ARMEB__ + ror x13,x13,#32 +#else + rev32 x13,x13 +#endif + bic w25,w24,w22 + and w26,w23,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + orr w25,w25,w26 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + add w24,w24,w13 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + lsr x14,x13,#32 + ldr x15,[x1,#-16] + bic w25,w23,w21 + and w26,w22,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + orr w25,w25,w26 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + add w23,w23,w14 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) +#ifdef __ARMEB__ + ror x15,x15,#32 +#else + rev32 x15,x15 +#endif + bic w25,w22,w20 + and w26,w21,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + orr 
w25,w25,w26 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + add w22,w22,w15 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + lsr x16,x15,#32 + ldr x17,[x1,#-8] + bic w25,w21,w24 + and w26,w20,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + orr w25,w25,w26 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + add w21,w21,w16 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) +#ifdef __ARMEB__ + ror x17,x17,#32 +#else + rev32 x17,x17 +#endif + bic w25,w20,w23 + and w26,w24,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + orr w25,w25,w26 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + add w20,w20,w17 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + lsr x19,x17,#32 + eor w3,w3,w5 + bic w25,w24,w22 + and w26,w23,w22 + ror w27,w21,#27 + eor w3,w3,w11 + add w24,w24,w28 // future e+=K + orr w25,w25,w26 + add w20,w20,w27 // e+=rot(a,5) + eor w3,w3,w16 + ror w22,w22,#2 + add w24,w24,w19 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w3,w3,#31 + eor w4,w4,w6 + bic w25,w23,w21 + and w26,w22,w21 + ror w27,w20,#27 + eor w4,w4,w12 + add w23,w23,w28 // future e+=K + orr w25,w25,w26 + add w24,w24,w27 // e+=rot(a,5) + eor w4,w4,w17 + ror w21,w21,#2 + add w23,w23,w3 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w4,w4,#31 + eor w5,w5,w7 + bic w25,w22,w20 + and w26,w21,w20 + ror w27,w24,#27 + eor w5,w5,w13 + add w22,w22,w28 // future e+=K + orr w25,w25,w26 + add w23,w23,w27 // e+=rot(a,5) + eor w5,w5,w19 + ror w20,w20,#2 + add w22,w22,w4 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w5,w5,#31 + eor w6,w6,w8 + bic w25,w21,w24 + and w26,w20,w24 + ror w27,w23,#27 + eor w6,w6,w14 + add w21,w21,w28 // future e+=K + orr w25,w25,w26 + add w22,w22,w27 // e+=rot(a,5) + eor w6,w6,w3 + ror w24,w24,#2 + add w21,w21,w5 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w6,w6,#31 + eor w7,w7,w9 + bic w25,w20,w23 + and w26,w24,w23 + ror w27,w22,#27 + eor w7,w7,w15 + add w20,w20,w28 // future e+=K + orr w25,w25,w26 + add w21,w21,w27 // e+=rot(a,5) + eor w7,w7,w4 + ror w23,w23,#2 + add w20,w20,w6 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w7,w7,#31 + movz w28,#0xeba1 + movk w28,#0x6ed9,lsl#16 + eor w8,w8,w10 + bic w25,w24,w22 + and w26,w23,w22 + ror w27,w21,#27 + eor w8,w8,w16 + add w24,w24,w28 // future e+=K + orr w25,w25,w26 + add w20,w20,w27 // e+=rot(a,5) + eor w8,w8,w5 + ror w22,w22,#2 + add w24,w24,w7 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w8,w8,#31 + eor w9,w9,w11 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w9,w9,w17 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w9,w9,w6 + add w23,w23,w8 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w9,w9,#31 + eor w10,w10,w12 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w10,w10,w19 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w10,w10,w7 + add w22,w22,w9 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w10,w10,#31 + eor w11,w11,w13 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w11,w11,w3 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w11,w11,w8 + add w21,w21,w10 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w11,w11,#31 + eor w12,w12,w14 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w12,w12,w4 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w12,w12,w9 + add w20,w20,w11 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror 
w12,w12,#31 + eor w13,w13,w15 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w13,w13,w5 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w13,w13,w10 + add w24,w24,w12 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w13,w13,#31 + eor w14,w14,w16 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w14,w14,w6 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w14,w14,w11 + add w23,w23,w13 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w14,w14,#31 + eor w15,w15,w17 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w15,w15,w7 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w15,w15,w12 + add w22,w22,w14 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w15,w15,#31 + eor w16,w16,w19 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w16,w16,w8 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w16,w16,w13 + add w21,w21,w15 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w16,w16,#31 + eor w17,w17,w3 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w17,w17,w9 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w17,w17,w14 + add w20,w20,w16 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w17,w17,#31 + eor w19,w19,w4 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w19,w19,w10 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w19,w19,w15 + add w24,w24,w17 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w19,w19,#31 + eor w3,w3,w5 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w3,w3,w11 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w3,w3,w16 + add w23,w23,w19 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w3,w3,#31 + eor w4,w4,w6 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w4,w4,w12 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w4,w4,w17 + add w22,w22,w3 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w4,w4,#31 + eor w5,w5,w7 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w5,w5,w13 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w5,w5,w19 + add w21,w21,w4 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w5,w5,#31 + eor w6,w6,w8 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w6,w6,w14 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w6,w6,w3 + add w20,w20,w5 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w6,w6,#31 + eor w7,w7,w9 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w7,w7,w15 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w7,w7,w4 + add w24,w24,w6 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w7,w7,#31 + eor w8,w8,w10 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w8,w8,w16 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w8,w8,w5 + add w23,w23,w7 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w8,w8,#31 + eor w9,w9,w11 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w9,w9,w17 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w9,w9,w6 + add w22,w22,w8 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w9,w9,#31 + eor 
w10,w10,w12 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w10,w10,w19 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w10,w10,w7 + add w21,w21,w9 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w10,w10,#31 + eor w11,w11,w13 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w11,w11,w3 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w11,w11,w8 + add w20,w20,w10 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w11,w11,#31 + movz w28,#0xbcdc + movk w28,#0x8f1b,lsl#16 + eor w12,w12,w14 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w12,w12,w4 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w12,w12,w9 + add w24,w24,w11 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w12,w12,#31 + orr w25,w21,w22 + and w26,w21,w22 + eor w13,w13,w15 + ror w27,w20,#27 + and w25,w25,w23 + add w23,w23,w28 // future e+=K + eor w13,w13,w5 + add w24,w24,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w21,w21,#2 + eor w13,w13,w10 + add w23,w23,w12 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w13,w13,#31 + orr w25,w20,w21 + and w26,w20,w21 + eor w14,w14,w16 + ror w27,w24,#27 + and w25,w25,w22 + add w22,w22,w28 // future e+=K + eor w14,w14,w6 + add w23,w23,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w20,w20,#2 + eor w14,w14,w11 + add w22,w22,w13 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w14,w14,#31 + orr w25,w24,w20 + and w26,w24,w20 + eor w15,w15,w17 + ror w27,w23,#27 + and w25,w25,w21 + add w21,w21,w28 // future e+=K + eor w15,w15,w7 + add w22,w22,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w24,w24,#2 + eor w15,w15,w12 + add w21,w21,w14 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w15,w15,#31 + orr w25,w23,w24 + and w26,w23,w24 + eor w16,w16,w19 + ror w27,w22,#27 + and w25,w25,w20 + add w20,w20,w28 // future e+=K + eor w16,w16,w8 + add w21,w21,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w23,w23,#2 + eor w16,w16,w13 + add w20,w20,w15 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w16,w16,#31 + orr w25,w22,w23 + and w26,w22,w23 + eor w17,w17,w3 + ror w27,w21,#27 + and w25,w25,w24 + add w24,w24,w28 // future e+=K + eor w17,w17,w9 + add w20,w20,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w22,w22,#2 + eor w17,w17,w14 + add w24,w24,w16 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w17,w17,#31 + orr w25,w21,w22 + and w26,w21,w22 + eor w19,w19,w4 + ror w27,w20,#27 + and w25,w25,w23 + add w23,w23,w28 // future e+=K + eor w19,w19,w10 + add w24,w24,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w21,w21,#2 + eor w19,w19,w15 + add w23,w23,w17 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w19,w19,#31 + orr w25,w20,w21 + and w26,w20,w21 + eor w3,w3,w5 + ror w27,w24,#27 + and w25,w25,w22 + add w22,w22,w28 // future e+=K + eor w3,w3,w11 + add w23,w23,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w20,w20,#2 + eor w3,w3,w16 + add w22,w22,w19 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w3,w3,#31 + orr w25,w24,w20 + and w26,w24,w20 + eor w4,w4,w6 + ror w27,w23,#27 + and w25,w25,w21 + add w21,w21,w28 // future e+=K + eor w4,w4,w12 + add w22,w22,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w24,w24,#2 + eor w4,w4,w17 + add w21,w21,w3 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w4,w4,#31 + orr w25,w23,w24 + and w26,w23,w24 + eor w5,w5,w7 + ror w27,w22,#27 + and w25,w25,w20 + add w20,w20,w28 // future e+=K + eor w5,w5,w13 + add w21,w21,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror 
w23,w23,#2 + eor w5,w5,w19 + add w20,w20,w4 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w5,w5,#31 + orr w25,w22,w23 + and w26,w22,w23 + eor w6,w6,w8 + ror w27,w21,#27 + and w25,w25,w24 + add w24,w24,w28 // future e+=K + eor w6,w6,w14 + add w20,w20,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w22,w22,#2 + eor w6,w6,w3 + add w24,w24,w5 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w6,w6,#31 + orr w25,w21,w22 + and w26,w21,w22 + eor w7,w7,w9 + ror w27,w20,#27 + and w25,w25,w23 + add w23,w23,w28 // future e+=K + eor w7,w7,w15 + add w24,w24,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w21,w21,#2 + eor w7,w7,w4 + add w23,w23,w6 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w7,w7,#31 + orr w25,w20,w21 + and w26,w20,w21 + eor w8,w8,w10 + ror w27,w24,#27 + and w25,w25,w22 + add w22,w22,w28 // future e+=K + eor w8,w8,w16 + add w23,w23,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w20,w20,#2 + eor w8,w8,w5 + add w22,w22,w7 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w8,w8,#31 + orr w25,w24,w20 + and w26,w24,w20 + eor w9,w9,w11 + ror w27,w23,#27 + and w25,w25,w21 + add w21,w21,w28 // future e+=K + eor w9,w9,w17 + add w22,w22,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w24,w24,#2 + eor w9,w9,w6 + add w21,w21,w8 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w9,w9,#31 + orr w25,w23,w24 + and w26,w23,w24 + eor w10,w10,w12 + ror w27,w22,#27 + and w25,w25,w20 + add w20,w20,w28 // future e+=K + eor w10,w10,w19 + add w21,w21,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w23,w23,#2 + eor w10,w10,w7 + add w20,w20,w9 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w10,w10,#31 + orr w25,w22,w23 + and w26,w22,w23 + eor w11,w11,w13 + ror w27,w21,#27 + and w25,w25,w24 + add w24,w24,w28 // future e+=K + eor w11,w11,w3 + add w20,w20,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w22,w22,#2 + eor w11,w11,w8 + add w24,w24,w10 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w11,w11,#31 + orr w25,w21,w22 + and w26,w21,w22 + eor w12,w12,w14 + ror w27,w20,#27 + and w25,w25,w23 + add w23,w23,w28 // future e+=K + eor w12,w12,w4 + add w24,w24,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w21,w21,#2 + eor w12,w12,w9 + add w23,w23,w11 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w12,w12,#31 + orr w25,w20,w21 + and w26,w20,w21 + eor w13,w13,w15 + ror w27,w24,#27 + and w25,w25,w22 + add w22,w22,w28 // future e+=K + eor w13,w13,w5 + add w23,w23,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w20,w20,#2 + eor w13,w13,w10 + add w22,w22,w12 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w13,w13,#31 + orr w25,w24,w20 + and w26,w24,w20 + eor w14,w14,w16 + ror w27,w23,#27 + and w25,w25,w21 + add w21,w21,w28 // future e+=K + eor w14,w14,w6 + add w22,w22,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w24,w24,#2 + eor w14,w14,w11 + add w21,w21,w13 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w14,w14,#31 + orr w25,w23,w24 + and w26,w23,w24 + eor w15,w15,w17 + ror w27,w22,#27 + and w25,w25,w20 + add w20,w20,w28 // future e+=K + eor w15,w15,w7 + add w21,w21,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w23,w23,#2 + eor w15,w15,w12 + add w20,w20,w14 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w15,w15,#31 + movz w28,#0xc1d6 + movk w28,#0xca62,lsl#16 + orr w25,w22,w23 + and w26,w22,w23 + eor w16,w16,w19 + ror w27,w21,#27 + and w25,w25,w24 + add w24,w24,w28 // future e+=K + eor w16,w16,w8 + add w20,w20,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w22,w22,#2 + eor w16,w16,w13 + add w24,w24,w15 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror 
w16,w16,#31 + eor w17,w17,w3 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w17,w17,w9 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w17,w17,w14 + add w23,w23,w16 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w17,w17,#31 + eor w19,w19,w4 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w19,w19,w10 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w19,w19,w15 + add w22,w22,w17 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w19,w19,#31 + eor w3,w3,w5 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w3,w3,w11 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w3,w3,w16 + add w21,w21,w19 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w3,w3,#31 + eor w4,w4,w6 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w4,w4,w12 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w4,w4,w17 + add w20,w20,w3 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w4,w4,#31 + eor w5,w5,w7 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w5,w5,w13 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w5,w5,w19 + add w24,w24,w4 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w5,w5,#31 + eor w6,w6,w8 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w6,w6,w14 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w6,w6,w3 + add w23,w23,w5 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w6,w6,#31 + eor w7,w7,w9 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w7,w7,w15 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w7,w7,w4 + add w22,w22,w6 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w7,w7,#31 + eor w8,w8,w10 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w8,w8,w16 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w8,w8,w5 + add w21,w21,w7 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w8,w8,#31 + eor w9,w9,w11 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w9,w9,w17 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w9,w9,w6 + add w20,w20,w8 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w9,w9,#31 + eor w10,w10,w12 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w10,w10,w19 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w10,w10,w7 + add w24,w24,w9 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w10,w10,#31 + eor w11,w11,w13 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w11,w11,w3 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w11,w11,w8 + add w23,w23,w10 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w11,w11,#31 + eor w12,w12,w14 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w12,w12,w4 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w12,w12,w9 + add w22,w22,w11 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w12,w12,#31 + eor w13,w13,w15 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w13,w13,w5 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w13,w13,w10 + add w21,w21,w12 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w13,w13,#31 + eor 
w14,w14,w16 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w14,w14,w6 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w14,w14,w11 + add w20,w20,w13 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w14,w14,#31 + eor w15,w15,w17 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w15,w15,w7 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w15,w15,w12 + add w24,w24,w14 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w15,w15,#31 + eor w16,w16,w19 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w16,w16,w8 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w16,w16,w13 + add w23,w23,w15 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w16,w16,#31 + eor w17,w17,w3 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w17,w17,w9 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w17,w17,w14 + add w22,w22,w16 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w17,w17,#31 + eor w19,w19,w4 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w19,w19,w10 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w19,w19,w15 + add w21,w21,w17 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w19,w19,#31 + ldp w4,w5,[x0] + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + add w20,w20,w19 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ldp w6,w7,[x0,#8] + eor w25,w24,w22 + ror w27,w21,#27 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + ldr w8,[x0,#16] + add w20,w20,w25 // e+=F(b,c,d) + add w21,w21,w5 + add w22,w22,w6 + add w20,w20,w4 + add w23,w23,w7 + add w24,w24,w8 + stp w20,w21,[x0] + stp w22,w23,[x0,#8] + str w24,[x0,#16] + cbnz x2,Loop + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldp x25,x26,[sp,#64] + ldp x27,x28,[sp,#80] + ldr x29,[sp],#96 + ret + + +.align 6 +sha1_block_armv8: +Lv8_entry: + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + adrp x4,Lconst@PAGE + add x4,x4,Lconst@PAGEOFF + eor v1.16b,v1.16b,v1.16b + ld1 {v0.4s},[x0],#16 + ld1 {v1.s}[0],[x0] + sub x0,x0,#16 + ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x4] + +Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + + add v20.4s,v16.4s,v4.4s + rev32 v6.16b,v6.16b + orr v22.16b,v0.16b,v0.16b // offload + + add v21.4s,v16.4s,v5.4s + rev32 v7.16b,v7.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b +.long 0x5e140020 //sha1c v0.16b,v1.16b,v20.4s // 0 + add v20.4s,v16.4s,v6.4s +.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 1 +.long 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s + add v21.4s,v16.4s,v7.4s +.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b +.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 2 +.long 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s + add v20.4s,v16.4s,v4.4s +.long 0x5e281885 //sha1su1 v5.16b,v4.16b +.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 3 +.long 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s + add v21.4s,v17.4s,v5.4s +.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b +.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 4 +.long 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s + add v20.4s,v17.4s,v6.4s +.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b +.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 5 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v17.4s,v7.4s +.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b +.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 6 +.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s + add v20.4s,v17.4s,v4.4s +.long 0x5e281885 //sha1su1 v5.16b,v4.16b +.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 7 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v17.4s,v5.4s +.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b +.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 8 +.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s + add v20.4s,v18.4s,v6.4s +.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b +.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 9 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v18.4s,v7.4s +.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b +.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 10 +.long 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s + add v20.4s,v18.4s,v4.4s +.long 0x5e281885 //sha1su1 v5.16b,v4.16b +.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 11 +.long 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s + add v21.4s,v18.4s,v5.4s +.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b +.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 12 +.long 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s + add v20.4s,v18.4s,v6.4s +.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b +.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 13 +.long 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s + add v21.4s,v19.4s,v7.4s +.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b +.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 14 +.long 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s + add v20.4s,v19.4s,v4.4s +.long 0x5e281885 //sha1su1 v5.16b,v4.16b +.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b +.long 
0x5e280802 //sha1h v2.16b,v0.16b // 15 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v19.4s,v5.4s +.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b +.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 16 +.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s + add v20.4s,v19.4s,v6.4s +.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 17 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v19.4s,v7.4s + +.long 0x5e280803 //sha1h v3.16b,v0.16b // 18 +.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s + +.long 0x5e280802 //sha1h v2.16b,v0.16b // 19 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + + add v1.4s,v1.4s,v2.4s + add v0.4s,v0.4s,v22.4s + + cbnz x2,Loop_hw + + st1 {v0.4s},[x0],#16 + st1 {v1.s}[0],[x0] + + ldr x29,[sp],#16 + ret + +.section __TEXT,__const +.align 6 +Lconst: +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39 +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59 +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79 +.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +.comm _OPENSSL_armcap_P,4,4 +.private_extern _OPENSSL_armcap_P +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/sha256-armv8.S b/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/sha256-armv8.S new file mode 100644 index 0000000000..d6fa5a930d --- /dev/null +++ b/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/sha256-armv8.S @@ -0,0 +1,1210 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the OpenSSL license (the "License"). You may not use +// this file except in compliance with the License. You can obtain a copy +// in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html + +// ==================================================================== +// Written by Andy Polyakov for the OpenSSL +// project. The module is, however, dual licensed under OpenSSL and +// CRYPTOGAMS licenses depending on where you obtain it. For further +// details see http://www.openssl.org/~appro/cryptogams/. +// +// Permission to use under GPLv2 terms is granted. +// ==================================================================== +// +// SHA256/512 for ARMv8. +// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. 
+// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significanty faster +// and the gap is only 40-90%. + +#ifndef __KERNEL__ +# include +#endif + +.text + + +.globl _sha256_block_data_order +.private_extern _sha256_block_data_order + +.align 6 +_sha256_block_data_order: +#ifndef __KERNEL__ +#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10 + adrp x16,:pg_hi21_nc:_OPENSSL_armcap_P +#else + adrp x16,_OPENSSL_armcap_P@PAGE +#endif + ldr w16,[x16,_OPENSSL_armcap_P@PAGEOFF] + tst w16,#ARMV8_SHA256 + b.ne Lv8_entry +#endif + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*4 + + ldp w20,w21,[x0] // load context + ldp w22,w23,[x0,#2*4] + ldp w24,w25,[x0,#4*4] + add x2,x1,x2,lsl#6 // end of input + ldp w26,w27,[x0,#6*4] + adrp x30,LK256@PAGE + add x30,x30,LK256@PAGEOFF + stp x0,x2,[x29,#96] + +Loop: + ldp w3,w4,[x1],#2*4 + ldr w19,[x30],#4 // *K++ + eor w28,w21,w22 // magic seed + str x1,[x29,#112] +#ifndef __ARMEB__ + rev w3,w3 // 0 +#endif + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w6,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w3 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w4,w4 // 1 +#endif + ldp w5,w6,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor w7,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w4 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) + eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w7,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w26,w26,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w5,w5 // 2 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w8,w22,w22,ror#14 + and w17,w23,w22 + bic w19,w24,w22 + add w25,w25,w5 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // Sigma0(a) + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w6,w6 // 3 +#endif + ldp w7,w8,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w9,w21,w21,ror#14 + and w17,w22,w21 + bic 
w28,w23,w21 + add w24,w24,w6 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w7,w7 // 4 +#endif + add w24,w24,w17 // h+=Sigma0(a) + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + eor w10,w20,w20,ror#14 + and w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w7 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w10,ror#11 // Sigma1(e) + ror w10,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w10,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w8,w8 // 5 +#endif + ldp w9,w10,[x1],#2*4 + add w23,w23,w17 // h+=Sigma0(a) + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w11,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w8 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w11,ror#11 // Sigma1(e) + ror w11,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w11,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w9,w9 // 6 +#endif + add w22,w22,w17 // h+=Sigma0(a) + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + eor w12,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w9 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w12,ror#11 // Sigma1(e) + ror w12,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w12,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w10,w10 // 7 +#endif + ldp w11,w12,[x1],#2*4 + add w21,w21,w17 // h+=Sigma0(a) + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + eor w13,w25,w25,ror#14 + and w17,w26,w25 + bic w28,w27,w25 + add w20,w20,w10 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w13,ror#11 // Sigma1(e) + ror w13,w21,#2 + add w20,w20,w17 // h+=Ch(e,f,g) + eor w17,w21,w21,ror#9 + add w20,w20,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w24,w24,w20 // d+=h + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w13,w17,ror#13 // Sigma0(a) + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w20,w20,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w11,w11 // 8 +#endif + add w20,w20,w17 // h+=Sigma0(a) + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w14,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w11 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // 
a^b, b^c in next round + eor w16,w16,w14,ror#11 // Sigma1(e) + ror w14,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w14,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w12,w12 // 9 +#endif + ldp w13,w14,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor w15,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w12 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w15,ror#11 // Sigma1(e) + ror w15,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) + eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w15,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w26,w26,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w13,w13 // 10 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w0,w22,w22,ror#14 + and w17,w23,w22 + bic w19,w24,w22 + add w25,w25,w13 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w0,ror#11 // Sigma1(e) + ror w0,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w0,w17,ror#13 // Sigma0(a) + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w14,w14 // 11 +#endif + ldp w15,w0,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w6,w21,w21,ror#14 + and w17,w22,w21 + bic w28,w23,w21 + add w24,w24,w14 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w15,w15 // 12 +#endif + add w24,w24,w17 // h+=Sigma0(a) + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + eor w7,w20,w20,ror#14 + and w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w15 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w7,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w0,w0 // 13 +#endif + ldp w1,w2,[x1] + add w23,w23,w17 // h+=Sigma0(a) + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w8,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w0 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + 
eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w1,w1 // 14 +#endif + ldr w6,[sp,#12] + add w22,w22,w17 // h+=Sigma0(a) + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + eor w9,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w1 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev w2,w2 // 15 +#endif + ldr w7,[sp,#0] + add w21,w21,w17 // h+=Sigma0(a) + str w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 +Loop_16_xx: + ldr w8,[sp,#4] + str w11,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + ror w10,w5,#7 + and w17,w25,w24 + ror w9,w2,#17 + bic w19,w26,w24 + ror w11,w20,#2 + add w27,w27,w3 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w10,w10,w5,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w11,w11,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w9,w9,w2,ror#19 + eor w10,w10,w5,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w11,w20,ror#22 // Sigma0(a) + eor w9,w9,w2,lsr#10 // sigma1(X[i+14]) + add w4,w4,w13 + add w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w4,w4,w10 + add w27,w27,w17 // h+=Sigma0(a) + add w4,w4,w9 + ldr w9,[sp,#8] + str w12,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w11,w6,#7 + and w17,w24,w23 + ror w10,w3,#17 + bic w28,w25,w23 + ror w12,w27,#2 + add w26,w26,w4 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w11,w11,w6,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w12,w12,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w10,w10,w3,ror#19 + eor w11,w11,w6,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w12,w27,ror#22 // Sigma0(a) + eor w10,w10,w3,lsr#10 // sigma1(X[i+14]) + add 
w5,w5,w14 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w5,w5,w11 + add w26,w26,w17 // h+=Sigma0(a) + add w5,w5,w10 + ldr w10,[sp,#12] + str w13,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w12,w7,#7 + and w17,w23,w22 + ror w11,w4,#17 + bic w19,w24,w22 + ror w13,w26,#2 + add w25,w25,w5 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w12,w12,w7,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor w13,w13,w26,ror#13 + add w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w11,w11,w4,ror#19 + eor w12,w12,w7,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w13,w26,ror#22 // Sigma0(a) + eor w11,w11,w4,lsr#10 // sigma1(X[i+14]) + add w6,w6,w15 + add w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w6,w6,w12 + add w25,w25,w17 // h+=Sigma0(a) + add w6,w6,w11 + ldr w11,[sp,#0] + str w14,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + ror w13,w8,#7 + and w17,w22,w21 + ror w12,w5,#17 + bic w28,w23,w21 + ror w14,w25,#2 + add w24,w24,w6 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w13,w13,w8,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w14,w14,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w12,w12,w5,ror#19 + eor w13,w13,w8,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w14,w25,ror#22 // Sigma0(a) + eor w12,w12,w5,lsr#10 // sigma1(X[i+14]) + add w7,w7,w0 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w7,w7,w13 + add w24,w24,w17 // h+=Sigma0(a) + add w7,w7,w12 + ldr w12,[sp,#4] + str w15,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + ror w14,w9,#7 + and w17,w21,w20 + ror w13,w6,#17 + bic w19,w22,w20 + ror w15,w24,#2 + add w23,w23,w7 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w14,w14,w9,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w15,w15,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w13,w13,w6,ror#19 + eor w14,w14,w9,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w15,w24,ror#22 // Sigma0(a) + eor w13,w13,w6,lsr#10 // sigma1(X[i+14]) + add w8,w8,w1 + add w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w8,w8,w14 + add w23,w23,w17 // h+=Sigma0(a) + add w8,w8,w13 + ldr w13,[sp,#8] + str w0,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w15,w10,#7 + and w17,w20,w27 + ror w14,w7,#17 + bic w28,w21,w27 + ror w0,w23,#2 + add w22,w22,w8 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w15,w15,w10,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w0,w0,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w14,w14,w7,ror#19 + eor w15,w15,w10,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w0,w23,ror#22 // Sigma0(a) + eor w14,w14,w7,lsr#10 // sigma1(X[i+14]) + add w9,w9,w2 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w9,w9,w15 + add 
w22,w22,w17 // h+=Sigma0(a) + add w9,w9,w14 + ldr w14,[sp,#12] + str w1,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w0,w11,#7 + and w17,w27,w26 + ror w15,w8,#17 + bic w19,w20,w26 + ror w1,w22,#2 + add w21,w21,w9 // h+=X[i] + eor w16,w16,w26,ror#11 + eor w0,w0,w11,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w1,w1,w22,ror#13 + add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w15,w15,w8,ror#19 + eor w0,w0,w11,lsr#3 // sigma0(X[i+1]) + add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w1,w22,ror#22 // Sigma0(a) + eor w15,w15,w8,lsr#10 // sigma1(X[i+14]) + add w10,w10,w3 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w10,w10,w0 + add w21,w21,w17 // h+=Sigma0(a) + add w10,w10,w15 + ldr w15,[sp,#0] + str w2,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w1,w12,#7 + and w17,w26,w25 + ror w0,w9,#17 + bic w28,w27,w25 + ror w2,w21,#2 + add w20,w20,w10 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w1,w1,w12,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w2,w2,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w0,w0,w9,ror#19 + eor w1,w1,w12,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w2,w21,ror#22 // Sigma0(a) + eor w0,w0,w9,lsr#10 // sigma1(X[i+14]) + add w11,w11,w4 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w11,w11,w1 + add w20,w20,w17 // h+=Sigma0(a) + add w11,w11,w0 + ldr w0,[sp,#4] + str w3,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + ror w2,w13,#7 + and w17,w25,w24 + ror w1,w10,#17 + bic w19,w26,w24 + ror w3,w20,#2 + add w27,w27,w11 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w2,w2,w13,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w3,w3,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w1,w1,w10,ror#19 + eor w2,w2,w13,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w3,w20,ror#22 // Sigma0(a) + eor w1,w1,w10,lsr#10 // sigma1(X[i+14]) + add w12,w12,w5 + add w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w12,w12,w2 + add w27,w27,w17 // h+=Sigma0(a) + add w12,w12,w1 + ldr w1,[sp,#8] + str w4,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w3,w14,#7 + and w17,w24,w23 + ror w2,w11,#17 + bic w28,w25,w23 + ror w4,w27,#2 + add w26,w26,w12 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w3,w3,w14,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w4,w4,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w2,w2,w11,ror#19 + eor w3,w3,w14,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w4,w27,ror#22 // Sigma0(a) + eor w2,w2,w11,lsr#10 // sigma1(X[i+14]) + add w13,w13,w6 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w13,w13,w3 + add w26,w26,w17 // h+=Sigma0(a) + add w13,w13,w2 + ldr w2,[sp,#12] + str w5,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w4,w15,#7 + and w17,w23,w22 + ror 
w3,w12,#17 + bic w19,w24,w22 + ror w5,w26,#2 + add w25,w25,w13 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w4,w4,w15,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor w5,w5,w26,ror#13 + add w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w3,w3,w12,ror#19 + eor w4,w4,w15,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w5,w26,ror#22 // Sigma0(a) + eor w3,w3,w12,lsr#10 // sigma1(X[i+14]) + add w14,w14,w7 + add w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w14,w14,w4 + add w25,w25,w17 // h+=Sigma0(a) + add w14,w14,w3 + ldr w3,[sp,#0] + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + ror w5,w0,#7 + and w17,w22,w21 + ror w4,w13,#17 + bic w28,w23,w21 + ror w6,w25,#2 + add w24,w24,w14 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w5,w5,w0,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w6,w6,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w4,w4,w13,ror#19 + eor w5,w5,w0,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w25,ror#22 // Sigma0(a) + eor w4,w4,w13,lsr#10 // sigma1(X[i+14]) + add w15,w15,w8 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w15,w15,w5 + add w24,w24,w17 // h+=Sigma0(a) + add w15,w15,w4 + ldr w4,[sp,#4] + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + ror w6,w1,#7 + and w17,w21,w20 + ror w5,w14,#17 + bic w19,w22,w20 + ror w7,w24,#2 + add w23,w23,w15 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w6,w6,w1,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w7,w7,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w5,w5,w14,ror#19 + eor w6,w6,w1,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w7,w24,ror#22 // Sigma0(a) + eor w5,w5,w14,lsr#10 // sigma1(X[i+14]) + add w0,w0,w9 + add w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w0,w0,w6 + add w23,w23,w17 // h+=Sigma0(a) + add w0,w0,w5 + ldr w5,[sp,#8] + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w7,w2,#7 + and w17,w20,w27 + ror w6,w15,#17 + bic w28,w21,w27 + ror w8,w23,#2 + add w22,w22,w0 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w7,w7,w2,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w8,w8,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w6,w6,w15,ror#19 + eor w7,w7,w2,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w23,ror#22 // Sigma0(a) + eor w6,w6,w15,lsr#10 // sigma1(X[i+14]) + add w1,w1,w10 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w1,w1,w7 + add w22,w22,w17 // h+=Sigma0(a) + add w1,w1,w6 + ldr w6,[sp,#12] + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w8,w3,#7 + and w17,w27,w26 + ror w7,w0,#17 + bic w19,w20,w26 + ror w9,w22,#2 + add w21,w21,w1 // h+=X[i] + eor w16,w16,w26,ror#11 + eor w8,w8,w3,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next 
round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w9,w9,w22,ror#13 + add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w7,w7,w0,ror#19 + eor w8,w8,w3,lsr#3 // sigma0(X[i+1]) + add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w22,ror#22 // Sigma0(a) + eor w7,w7,w0,lsr#10 // sigma1(X[i+14]) + add w2,w2,w11 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w2,w2,w8 + add w21,w21,w17 // h+=Sigma0(a) + add w2,w2,w7 + ldr w7,[sp,#0] + str w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 + cbnz w19,Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#260 // rewind + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#2*4] + add x1,x1,#14*4 // advance input pointer + ldp w7,w8,[x0,#4*4] + add w20,w20,w3 + ldp w9,w10,[x0,#6*4] + add w21,w21,w4 + add w22,w22,w5 + add w23,w23,w6 + stp w20,w21,[x0] + add w24,w24,w7 + add w25,w25,w8 + stp w22,w23,[x0,#2*4] + add w26,w26,w9 + add w27,w27,w10 + cmp x1,x2 + stp w24,w25,[x0,#4*4] + stp w26,w27,[x0,#6*4] + b.ne Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*4 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + ret + + +.section __TEXT,__const +.align 6 + +LK256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0 //terminator + +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +.text +#ifndef __KERNEL__ + +.align 6 +sha256_block_armv8: +Lv8_entry: + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adrp x3,LK256@PAGE + add x3,x3,LK256@PAGEOFF + +Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 
0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,Loop_hw + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret + +#endif +#ifndef __KERNEL__ +.comm _OPENSSL_armcap_P,4,4 +.private_extern _OPENSSL_armcap_P +#endif +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/sha512-armv8.S b/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/sha512-armv8.S new file mode 100644 index 0000000000..29e122b180 --- /dev/null +++ b/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/sha512-armv8.S @@ -0,0 +1,1082 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the OpenSSL license (the "License"). You may not use +// this file except in compliance with the License. You can obtain a copy +// in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html + +// ==================================================================== +// Written by Andy Polyakov for the OpenSSL +// project. The module is, however, dual licensed under OpenSSL and +// CRYPTOGAMS licenses depending on where you obtain it. For further +// details see http://www.openssl.org/~appro/cryptogams/. +// +// Permission to use under GPLv2 terms is granted. +// ==================================================================== +// +// SHA256/512 for ARMv8. +// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. +// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significanty faster +// and the gap is only 40-90%. 
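For orientation, the scheduled scalar rounds above and below compute exactly the primitives the inline comments name: Ch(e,f,g), Maj(a,b,c), Sigma0/Sigma1 on the working variables and sigma0/sigma1 on the message schedule. A minimal, un-scheduled C restatement of one SHA-512 round (per FIPS 180-4; the function and variable names here are illustrative, not taken from the generated source) would look roughly like:

#include <stdint.h>

static inline uint64_t rotr64(uint64_t x, unsigned n) { return (x >> n) | (x << (64 - n)); }

/* Big Sigma functions applied to working variables a and e (ror #28/#34/#39 and #14/#18/#41 above). */
#define Sigma0(x) (rotr64(x, 28) ^ rotr64(x, 34) ^ rotr64(x, 39))
#define Sigma1(x) (rotr64(x, 14) ^ rotr64(x, 18) ^ rotr64(x, 41))
/* Small sigma functions used by the message schedule (ror #1/#8, lsr #7 and ror #19/#61, lsr #6). */
#define sigma0(x) (rotr64(x, 1) ^ rotr64(x, 8) ^ (x >> 7))
#define sigma1(x) (rotr64(x, 19) ^ rotr64(x, 61) ^ (x >> 6))

/* One compression round: S[0..7] = {a,b,c,d,e,f,g,h}, K is the LK512 constant, W the schedule word. */
static void sha512_round(uint64_t S[8], uint64_t K, uint64_t W) {
  uint64_t a = S[0], b = S[1], c = S[2], d = S[3];
  uint64_t e = S[4], f = S[5], g = S[6], h = S[7];
  uint64_t T1 = h + Sigma1(e) + ((e & f) ^ (~e & g)) + K + W;  /* Ch(e,f,g) */
  uint64_t T2 = Sigma0(a) + ((a & b) ^ (a & c) ^ (b & c));     /* Maj(a,b,c) */
  S[7] = g; S[6] = f; S[5] = e;
  S[4] = d + T1;                                               /* "d+=h" in the comments */
  S[3] = c; S[2] = b; S[1] = a;
  S[0] = T1 + T2;                                              /* "h+=Sigma1(e)+Ch+Sigma0(a)+Maj" */
}

/* Schedule extension for i >= 16: W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]. */

The assembly folds the copy-down of the working variables into register renaming across rounds, which is why each successive round reads and writes a rotated assignment of x20-x27 (or w20-w27 in the SHA-256 path) instead of moving registers.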
+ +#ifndef __KERNEL__ +# include +#endif + +.text + + +.globl _sha512_block_data_order +.private_extern _sha512_block_data_order + +.align 6 +_sha512_block_data_order: + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*8 + + ldp x20,x21,[x0] // load context + ldp x22,x23,[x0,#2*8] + ldp x24,x25,[x0,#4*8] + add x2,x1,x2,lsl#7 // end of input + ldp x26,x27,[x0,#6*8] + adrp x30,LK512@PAGE + add x30,x30,LK512@PAGEOFF + stp x0,x2,[x29,#96] + +Loop: + ldp x3,x4,[x1],#2*8 + ldr x19,[x30],#8 // *K++ + eor x28,x21,x22 // magic seed + str x1,[x29,#112] +#ifndef __ARMEB__ + rev x3,x3 // 0 +#endif + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x6,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x3 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x4,x4 // 1 +#endif + ldp x5,x6,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor x7,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x4 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // Sigma1(e) + ror x7,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x5,x5 // 2 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x8,x22,x22,ror#23 + and x17,x23,x22 + bic x19,x24,x22 + add x25,x25,x5 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 + add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x6,x6 // 3 +#endif + ldp x7,x8,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x9,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x6 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x7,x7 // 4 +#endif + add x24,x24,x17 // h+=Sigma0(a) + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + eor x10,x20,x20,ror#23 + 
and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x7 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x10,ror#18 // Sigma1(e) + ror x10,x24,#28 + add x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x10,x17,ror#34 // Sigma0(a) + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x8,x8 // 5 +#endif + ldp x9,x10,[x1],#2*8 + add x23,x23,x17 // h+=Sigma0(a) + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + eor x11,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x8 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x11,ror#18 // Sigma1(e) + ror x11,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + eor x17,x23,x23,ror#5 + add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x11,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x9,x9 // 6 +#endif + add x22,x22,x17 // h+=Sigma0(a) + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + eor x12,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x9 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x12,ror#18 // Sigma1(e) + ror x12,x22,#28 + add x21,x21,x17 // h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x12,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x10,x10 // 7 +#endif + ldp x11,x12,[x1],#2*8 + add x21,x21,x17 // h+=Sigma0(a) + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + eor x13,x25,x25,ror#23 + and x17,x26,x25 + bic x28,x27,x25 + add x20,x20,x10 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x13,ror#18 // Sigma1(e) + ror x13,x21,#28 + add x20,x20,x17 // h+=Ch(e,f,g) + eor x17,x21,x21,ror#5 + add x20,x20,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x24,x24,x20 // d+=h + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x13,x17,ror#34 // Sigma0(a) + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x20,x20,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x11,x11 // 8 +#endif + add x20,x20,x17 // h+=Sigma0(a) + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x14,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x11 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x14,ror#18 // Sigma1(e) + ror x14,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x14,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x12,x12 // 9 +#endif + ldp x13,x14,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor x15,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x12 // 
h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x15,ror#18 // Sigma1(e) + ror x15,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x15,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x13,x13 // 10 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x0,x22,x22,ror#23 + and x17,x23,x22 + bic x19,x24,x22 + add x25,x25,x13 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x0,ror#18 // Sigma1(e) + ror x0,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 + add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x0,x17,ror#34 // Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x14,x14 // 11 +#endif + ldp x15,x0,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + str x6,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x6,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x14 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x15,x15 // 12 +#endif + add x24,x24,x17 // h+=Sigma0(a) + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + eor x7,x20,x20,ror#23 + and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x15 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // Sigma1(e) + ror x7,x24,#28 + add x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x0,x0 // 13 +#endif + ldp x1,x2,[x1] + add x23,x23,x17 // h+=Sigma0(a) + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + eor x8,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x0 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + eor x17,x23,x23,ror#5 + add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x1,x1 // 14 +#endif + ldr x6,[sp,#24] + add x22,x22,x17 // h+=Sigma0(a) + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + eor x9,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x1 // h+=X[i] + 
orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x22,#28 + add x21,x21,x17 // h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __ARMEB__ + rev x2,x2 // 15 +#endif + ldr x7,[sp,#0] + add x21,x21,x17 // h+=Sigma0(a) + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 +Loop_16_xx: + ldr x8,[sp,#8] + str x11,[sp,#0] + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + ror x10,x5,#1 + and x17,x25,x24 + ror x9,x2,#19 + bic x19,x26,x24 + ror x11,x20,#28 + add x27,x27,x3 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x10,x10,x5,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x11,x11,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x9,x9,x2,ror#61 + eor x10,x10,x5,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x11,x20,ror#39 // Sigma0(a) + eor x9,x9,x2,lsr#6 // sigma1(X[i+14]) + add x4,x4,x13 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x4,x4,x10 + add x27,x27,x17 // h+=Sigma0(a) + add x4,x4,x9 + ldr x9,[sp,#16] + str x12,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x11,x6,#1 + and x17,x24,x23 + ror x10,x3,#19 + bic x28,x25,x23 + ror x12,x27,#28 + add x26,x26,x4 // h+=X[i] + eor x16,x16,x23,ror#18 + eor x11,x11,x6,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor x12,x12,x27,ror#34 + add x26,x26,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x10,x10,x3,ror#61 + eor x11,x11,x6,lsr#7 // sigma0(X[i+1]) + add x26,x26,x16 // h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x12,x27,ror#39 // Sigma0(a) + eor x10,x10,x3,lsr#6 // sigma1(X[i+14]) + add x5,x5,x14 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x5,x5,x11 + add x26,x26,x17 // h+=Sigma0(a) + add x5,x5,x10 + ldr x10,[sp,#24] + str x13,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x12,x7,#1 + and x17,x23,x22 + ror x11,x4,#19 + bic x19,x24,x22 + ror x13,x26,#28 + add x25,x25,x5 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x12,x12,x7,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x22,ror#41 // Sigma1(e) + eor x13,x13,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor 
x11,x11,x4,ror#61 + eor x12,x12,x7,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x13,x26,ror#39 // Sigma0(a) + eor x11,x11,x4,lsr#6 // sigma1(X[i+14]) + add x6,x6,x15 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x6,x6,x12 + add x25,x25,x17 // h+=Sigma0(a) + add x6,x6,x11 + ldr x11,[sp,#0] + str x14,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + ror x13,x8,#1 + and x17,x22,x21 + ror x12,x5,#19 + bic x28,x23,x21 + ror x14,x25,#28 + add x24,x24,x6 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x13,x13,x8,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x14,x14,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x12,x12,x5,ror#61 + eor x13,x13,x8,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x14,x25,ror#39 // Sigma0(a) + eor x12,x12,x5,lsr#6 // sigma1(X[i+14]) + add x7,x7,x0 + add x20,x20,x24 // d+=h + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x7,x7,x13 + add x24,x24,x17 // h+=Sigma0(a) + add x7,x7,x12 + ldr x12,[sp,#8] + str x15,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x14,x9,#1 + and x17,x21,x20 + ror x13,x6,#19 + bic x19,x22,x20 + ror x15,x24,#28 + add x23,x23,x7 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x14,x14,x9,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x15,x15,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x13,x13,x6,ror#61 + eor x14,x14,x9,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x15,x24,ror#39 // Sigma0(a) + eor x13,x13,x6,lsr#6 // sigma1(X[i+14]) + add x8,x8,x1 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x8,x8,x14 + add x23,x23,x17 // h+=Sigma0(a) + add x8,x8,x13 + ldr x13,[sp,#16] + str x0,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x15,x10,#1 + and x17,x20,x27 + ror x14,x7,#19 + bic x28,x21,x27 + ror x0,x23,#28 + add x22,x22,x8 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x15,x15,x10,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor x0,x0,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x14,x14,x7,ror#61 + eor x15,x15,x10,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x0,x23,ror#39 // Sigma0(a) + eor x14,x14,x7,lsr#6 // sigma1(X[i+14]) + add x9,x9,x2 + add x26,x26,x22 // d+=h + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x9,x9,x15 + add x22,x22,x17 // h+=Sigma0(a) + add x9,x9,x14 + ldr x14,[sp,#24] + str x1,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x0,x11,#1 + and x17,x27,x26 + ror x15,x8,#19 + bic x19,x20,x26 + ror x1,x22,#28 + add x21,x21,x9 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x0,x0,x11,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x1,x1,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x15,x15,x8,ror#61 + eor x0,x0,x11,lsr#7 // sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x1,x22,ror#39 // 
Sigma0(a) + eor x15,x15,x8,lsr#6 // sigma1(X[i+14]) + add x10,x10,x3 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x10,x10,x0 + add x21,x21,x17 // h+=Sigma0(a) + add x10,x10,x15 + ldr x15,[sp,#0] + str x2,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x1,x12,#1 + and x17,x26,x25 + ror x0,x9,#19 + bic x28,x27,x25 + ror x2,x21,#28 + add x20,x20,x10 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x1,x1,x12,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x2,x2,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x0,x0,x9,ror#61 + eor x1,x1,x12,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x2,x21,ror#39 // Sigma0(a) + eor x0,x0,x9,lsr#6 // sigma1(X[i+14]) + add x11,x11,x4 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x11,x11,x1 + add x20,x20,x17 // h+=Sigma0(a) + add x11,x11,x0 + ldr x0,[sp,#8] + str x3,[sp,#0] + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + ror x2,x13,#1 + and x17,x25,x24 + ror x1,x10,#19 + bic x19,x26,x24 + ror x3,x20,#28 + add x27,x27,x11 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x2,x2,x13,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x3,x3,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x1,x1,x10,ror#61 + eor x2,x2,x13,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x3,x20,ror#39 // Sigma0(a) + eor x1,x1,x10,lsr#6 // sigma1(X[i+14]) + add x12,x12,x5 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x12,x12,x2 + add x27,x27,x17 // h+=Sigma0(a) + add x12,x12,x1 + ldr x1,[sp,#16] + str x4,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x3,x14,#1 + and x17,x24,x23 + ror x2,x11,#19 + bic x28,x25,x23 + ror x4,x27,#28 + add x26,x26,x12 // h+=X[i] + eor x16,x16,x23,ror#18 + eor x3,x3,x14,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor x4,x4,x27,ror#34 + add x26,x26,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x2,x2,x11,ror#61 + eor x3,x3,x14,lsr#7 // sigma0(X[i+1]) + add x26,x26,x16 // h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x4,x27,ror#39 // Sigma0(a) + eor x2,x2,x11,lsr#6 // sigma1(X[i+14]) + add x13,x13,x6 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x13,x13,x3 + add x26,x26,x17 // h+=Sigma0(a) + add x13,x13,x2 + ldr x2,[sp,#24] + str x5,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x4,x15,#1 + and x17,x23,x22 + ror x3,x12,#19 + bic x19,x24,x22 + ror x5,x26,#28 + add x25,x25,x13 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x4,x4,x15,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x22,ror#41 // Sigma1(e) + eor x5,x5,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x3,x3,x12,ror#61 + eor x4,x4,x15,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x5,x26,ror#39 // Sigma0(a) + eor x3,x3,x12,lsr#6 // sigma1(X[i+14]) + add x14,x14,x7 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in 
next round + add x14,x14,x4 + add x25,x25,x17 // h+=Sigma0(a) + add x14,x14,x3 + ldr x3,[sp,#0] + str x6,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + ror x5,x0,#1 + and x17,x22,x21 + ror x4,x13,#19 + bic x28,x23,x21 + ror x6,x25,#28 + add x24,x24,x14 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x5,x5,x0,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x6,x6,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x4,x4,x13,ror#61 + eor x5,x5,x0,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x25,ror#39 // Sigma0(a) + eor x4,x4,x13,lsr#6 // sigma1(X[i+14]) + add x15,x15,x8 + add x20,x20,x24 // d+=h + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x15,x15,x5 + add x24,x24,x17 // h+=Sigma0(a) + add x15,x15,x4 + ldr x4,[sp,#8] + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x6,x1,#1 + and x17,x21,x20 + ror x5,x14,#19 + bic x19,x22,x20 + ror x7,x24,#28 + add x23,x23,x15 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x6,x6,x1,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x7,x7,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x5,x5,x14,ror#61 + eor x6,x6,x1,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x24,ror#39 // Sigma0(a) + eor x5,x5,x14,lsr#6 // sigma1(X[i+14]) + add x0,x0,x9 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x0,x0,x6 + add x23,x23,x17 // h+=Sigma0(a) + add x0,x0,x5 + ldr x5,[sp,#16] + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x7,x2,#1 + and x17,x20,x27 + ror x6,x15,#19 + bic x28,x21,x27 + ror x8,x23,#28 + add x22,x22,x0 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x7,x7,x2,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor x8,x8,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x6,x6,x15,ror#61 + eor x7,x7,x2,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x8,x23,ror#39 // Sigma0(a) + eor x6,x6,x15,lsr#6 // sigma1(X[i+14]) + add x1,x1,x10 + add x26,x26,x22 // d+=h + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x1,x1,x7 + add x22,x22,x17 // h+=Sigma0(a) + add x1,x1,x6 + ldr x6,[sp,#24] + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x8,x3,#1 + and x17,x27,x26 + ror x7,x0,#19 + bic x19,x20,x26 + ror x9,x22,#28 + add x21,x21,x1 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x8,x8,x3,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x9,x9,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x7,x7,x0,ror#61 + eor x8,x8,x3,lsr#7 // sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x22,ror#39 // Sigma0(a) + eor x7,x7,x0,lsr#6 // sigma1(X[i+14]) + add x2,x2,x11 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x2,x2,x8 + add x21,x21,x17 // h+=Sigma0(a) + add x2,x2,x7 + ldr x7,[sp,#0] + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + 
ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 + cbnz x19,Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#648 // rewind + + ldp x3,x4,[x0] + ldp x5,x6,[x0,#2*8] + add x1,x1,#14*8 // advance input pointer + ldp x7,x8,[x0,#4*8] + add x20,x20,x3 + ldp x9,x10,[x0,#6*8] + add x21,x21,x4 + add x22,x22,x5 + add x23,x23,x6 + stp x20,x21,[x0] + add x24,x24,x7 + add x25,x25,x8 + stp x22,x23,[x0,#2*8] + add x26,x26,x9 + add x27,x27,x10 + cmp x1,x2 + stp x24,x25,[x0,#4*8] + stp x26,x27,[x0,#6*8] + b.ne Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*8 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + ret + + +.section __TEXT,__const +.align 6 + +LK512: +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +.quad 0 // terminator + +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#ifndef __KERNEL__ +.comm _OPENSSL_armcap_P,4,4 
+.private_extern _OPENSSL_armcap_P +#endif +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/vpaes-armv8.S b/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/vpaes-armv8.S new file mode 100644 index 0000000000..0f5cbeadaf --- /dev/null +++ b/packager/third_party/boringssl/ios-aarch64/crypto/fipsmodule/vpaes-armv8.S @@ -0,0 +1,1213 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.section __TEXT,__const + + +.align 7 // totally strategic alignment +_vpaes_consts: +Lk_mc_forward: // mc_forward +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 +Lk_mc_backward: // mc_backward +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F +Lk_sr: // sr +.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +// +// "Hot" constants +// +Lk_inv: // inv, inva +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 +Lk_ipt: // input transform (lo, hi) +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 +Lk_sbo: // sbou, sbot +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA +Lk_sb1: // sb1u, sb1t +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +Lk_sb2: // sb2u, sb2t +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + +// +// Decryption stuff +// +Lk_dipt: // decryption input transform +.quad 0x0F505B040B545F00, 0x154A411E114E451A +.quad 0x86E383E660056500, 0x12771772F491F194 +Lk_dsbo: // decryption sbox final output +.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D +.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C +Lk_dsb9: // decryption sbox output *9*u, *9*t +.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 +.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 +Lk_dsbd: // decryption sbox output *D*u, *D*t +.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 +.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 +Lk_dsbb: // decryption sbox output *B*u, *B*t +.quad 0xD022649296B44200, 0x602646F6B0F2D404 +.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B +Lk_dsbe: // decryption sbox output *E*u, *E*t +.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 +.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 + +// +// Key schedule constants +// +Lk_dksd: // decryption key schedule: invskew x*D +.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 +.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +Lk_dksb: // decryption key schedule: invskew x*B +.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 +.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +Lk_dkse: // decryption key schedule: invskew x*E + 0x63 +.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 +.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +Lk_dks9: // decryption key schedule: invskew x*9 +.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC +.quad 0x4576516227143300, 
0x8BB89FACE9DAFDCE + +Lk_rcon: // rcon +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +Lk_opt: // output transform +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 +Lk_deskew: // deskew tables: inverts the sbox's "skew" +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.align 2 + +.align 6 + +.text +## +## _aes_preheat +## +## Fills register %r10 -> .aes_consts (so you can -fPIC) +## and %xmm9-%xmm15 as specified below. +## + +.align 4 +_vpaes_encrypt_preheat: + adrp x10, Lk_inv@PAGE + add x10, x10, Lk_inv@PAGEOFF + movi v17.16b, #0x0f + ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv + ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // Lk_ipt, Lk_sbo + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // Lk_sb1, Lk_sb2 + ret + + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. +## +## Inputs: +## %xmm0 = input +## %xmm9-%xmm15 as in _vpaes_preheat +## (%rdx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax +## Preserves %xmm6 - %xmm8 so you get some local vectors +## +## + +.align 4 +_vpaes_encrypt_core: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adrp x11, Lk_mc_forward@PAGE+16 + add x11, x11, Lk_mc_forward@PAGEOFF+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b Lenc_entry + +.align 4 +Lenc_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... 
mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + sub w8, w8, #1 // nr-- + +Lenc_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, Lenc_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + ret + + +.globl _vpaes_encrypt +.private_extern _vpaes_encrypt + +.align 4 +_vpaes_encrypt: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v7.16b}, [x0] + bl _vpaes_encrypt_preheat + bl _vpaes_encrypt_core + st1 {v0.16b}, [x1] + + ldp x29,x30,[sp],#16 + ret + + + +.align 4 +_vpaes_encrypt_2x: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adrp x11, Lk_mc_forward@PAGE+16 + add x11, x11, Lk_mc_forward@PAGEOFF+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + tbl v9.16b, {v20.16b}, v9.16b + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + tbl v10.16b, {v21.16b}, v8.16b + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v8.16b, v9.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b Lenc_2x_entry + +.align 4 +Lenc_2x_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + tbl v12.16b, {v25.16b}, v10.16b + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + tbl v8.16b, {v24.16b}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + tbl v13.16b, {v27.16b}, v10.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + tbl v10.16b, {v26.16b}, 
v11.16b + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + tbl v11.16b, {v8.16b}, v1.16b + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + eor v10.16b, v10.16b, v13.16b + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + tbl v8.16b, {v8.16b}, v4.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + eor v11.16b, v11.16b, v10.16b + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + tbl v12.16b, {v11.16b},v1.16b + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + eor v8.16b, v8.16b, v11.16b + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + eor v8.16b, v8.16b, v12.16b + sub w8, w8, #1 // nr-- + +Lenc_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + tbl v13.16b, {v19.16b},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {v18.16b},v8.16b + tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {v18.16b},v9.16b + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v13.16b + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v13.16b + tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {v18.16b},v11.16b + tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {v18.16b},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, Lenc_2x_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {v22.16b}, v10.16b + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + tbl v8.16b, {v23.16b}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v1.16b + ret + + + +.align 4 +_vpaes_decrypt_preheat: + adrp x10, Lk_inv@PAGE + add x10, x10, Lk_inv@PAGEOFF + movi v17.16b, #0x0f + adrp x11, Lk_dipt@PAGE + add x11, x11, Lk_dipt@PAGEOFF + ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv + ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64 // Lk_dipt, Lk_dsbo + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64 // Lk_dsb9, Lk_dsbd + ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x11] // Lk_dsbb, Lk_dsbe + ret + + +## +## Decryption core +## +## Same API as encryption core. 
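Both the encryption and decryption cores rely on the same trick: there are no secret-dependent memory loads. Every S-box-style step splits each byte into its low and high nibble and feeds both through 16-entry tables with tbl/eor, as the vpand/vpsrlb/vpshufb comments carried over from the x86 version indicate. A rough scalar C illustration of that access pattern (the table names and contents below are placeholders, not the actual vpaes constants) is:

#include <stdint.h>

/* One vpaes-style lookup step over a 16-byte block:
 *   lo = b & 0x0f, hi = b >> 4, out = T_lo[lo] ^ T_hi[hi]
 * In the assembly this is the "and ... v17.16b" (v17 = 0x0f), "ushr ... #4" and a pair of
 * "tbl" lookups combined with "eor"; T_lo/T_hi stand in for table pairs such as Lk_ipt or Lk_sb1. */
static void nibble_lookup(uint8_t out[16], const uint8_t in[16],
                          const uint8_t T_lo[16], const uint8_t T_hi[16]) {
  for (int i = 0; i < 16; i++) {
    uint8_t lo = in[i] & 0x0f;    /* and  v1.16b, v7.16b, v17.16b */
    uint8_t hi = in[i] >> 4;      /* ushr v0.16b, v7.16b, #4      */
    out[i] = T_lo[lo] ^ T_hi[hi]; /* tbl + tbl + eor              */
  }
}

Because tbl permutes register contents rather than indexing memory, each lookup takes the same time regardless of the data, which is the point of the vector-permutation AES approach credited to Mike Hamburg in the constants above.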
+## + +.align 4 +_vpaes_decrypt_core: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + + // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo + lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 + eor x11, x11, #0x30 // xor $0x30, %r11 + adrp x10, Lk_sr@PAGE + add x10, x10, Lk_sr@PAGEOFF + and x11, x11, #0x30 // and $0x30, %r11 + add x11, x11, x10 + adrp x10, Lk_mc_forward@PAGE+48 + add x10, x10, Lk_mc_forward@PAGEOFF+48 + + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + ld1 {v5.2d}, [x10] // vmovdqa Lk_mc_forward+48(%rip), %xmm5 + // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi + tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b Ldec_entry + +.align 4 +Ldec_loop: +// +// Inverse mix columns +// + // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u + tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 + // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt + + tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt + + tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet + + tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5 + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + sub w8, w8, #1 // sub $1,%rax # nr-- + +Ldec_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + 
eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 + cbnz w8, Ldec_loop + + // middle of last round + // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160 + tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0 + ret + + +.globl _vpaes_decrypt +.private_extern _vpaes_decrypt + +.align 4 +_vpaes_decrypt: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v7.16b}, [x0] + bl _vpaes_decrypt_preheat + bl _vpaes_decrypt_core + st1 {v0.16b}, [x1] + + ldp x29,x30,[sp],#16 + ret + + +// v14-v15 input, v0-v1 output + +.align 4 +_vpaes_decrypt_2x: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + + // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo + lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 + eor x11, x11, #0x30 // xor $0x30, %r11 + adrp x10, Lk_sr@PAGE + add x10, x10, Lk_sr@PAGEOFF + and x11, x11, #0x30 // and $0x30, %r11 + add x11, x11, x10 + adrp x10, Lk_mc_forward@PAGE+48 + add x10, x10, Lk_mc_forward@PAGEOFF+48 + + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + tbl v10.16b, {v20.16b},v9.16b + ld1 {v5.2d}, [x10] // vmovdqa Lk_mc_forward+48(%rip), %xmm5 + // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi + tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + tbl v8.16b, {v21.16b},v8.16b + eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 + eor v10.16b, v10.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b Ldec_2x_entry + +.align 4 +Ldec_2x_loop: +// +// Inverse mix columns +// + // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u + tbl v12.16b, {v24.16b}, v10.16b + tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + tbl v9.16b, {v25.16b}, v11.16b + eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 + eor v8.16b, v12.16b, v16.16b + // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt + + tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + tbl v12.16b, {v26.16b}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + tbl v9.16b, {v27.16b}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt + + tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + tbl v12.16b, {v28.16b}, v10.16b + tbl v0.16b, 
{v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + tbl v9.16b, {v29.16b}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet + + tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + tbl v12.16b, {v30.16b}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + tbl v9.16b, {v31.16b}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5 + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + sub w8, w8, #1 // sub $1,%rax # nr-- + +Ldec_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + tbl v10.16b, {v19.16b},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {v18.16b},v8.16b + tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {v18.16b},v9.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v10.16b + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v10.16b + tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {v18.16b},v11.16b + tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {v18.16b},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 + cbnz w8, Ldec_2x_loop + + // middle of last round + // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {v22.16b}, v10.16b + // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + tbl v9.16b, {v23.16b}, v11.16b + ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160 + eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A + eor v8.16b, v9.16b, v12.16b + tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v2.16b + ret + +######################################################## +## ## +## AES key schedule ## +## ## +######################################################## + +.align 4 +_vpaes_key_preheat: + adrp x10, Lk_inv@PAGE + add x10, x10, Lk_inv@PAGEOFF + movi v16.16b, #0x5b // Lk_s63 + adrp x11, Lk_sb1@PAGE + add x11, x11, Lk_sb1@PAGEOFF + movi v17.16b, #0x0f // Lk_s0F + ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // Lk_inv, Lk_ipt + 
adrp x10, Lk_dksd@PAGE + add x10, x10, Lk_dksd@PAGEOFF + ld1 {v22.2d,v23.2d}, [x11] // Lk_sb1 + adrp x11, Lk_mc_forward@PAGE + add x11, x11, Lk_mc_forward@PAGEOFF + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // Lk_dksd, Lk_dksb + ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // Lk_dkse, Lk_dks9 + ld1 {v8.2d}, [x10] // Lk_rcon + ld1 {v9.2d}, [x11] // Lk_mc_forward[0] + ret + + + +.align 4 +_vpaes_schedule_core: + stp x29, x30, [sp,#-16]! + add x29,sp,#0 + + bl _vpaes_key_preheat // load the tables + + ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) + + // input transform + mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3 + bl _vpaes_schedule_transform + mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 + + adrp x10, Lk_sr@PAGE // lea Lk_sr(%rip),%r10 + add x10, x10, Lk_sr@PAGEOFF + + add x8, x8, x10 + cbnz w3, Lschedule_am_decrypting + + // encrypting, output zeroth round key after transform + st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) + b Lschedule_go + +Lschedule_am_decrypting: + // decrypting, output zeroth round key after shiftrows + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) + eor x8, x8, #0x30 // xor $0x30, %r8 + +Lschedule_go: + cmp w1, #192 // cmp $192, %esi + b.hi Lschedule_256 + b.eq Lschedule_192 + // 128: fall though + +## +## .schedule_128 +## +## 128-bit specific part of key schedule. +## +## This schedule is really simple, because all its parts +## are accomplished by the subroutines. +## +Lschedule_128: + mov x0, #10 // mov $10, %esi + +Loop_schedule_128: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_round + cbz x0, Lschedule_mangle_last + bl _vpaes_schedule_mangle // write output + b Loop_schedule_128 + +## +## .aes_schedule_192 +## +## 192-bit specific part of key schedule. +## +## The main body of this schedule is the same as the 128-bit +## schedule, but with more smearing. The long, high side is +## stored in %xmm7 as before, and the short, low side is in +## the high bits of %xmm6. +## +## This schedule is somewhat nastier, however, because each +## round produces 192 bits of key material, or 1.5 round keys. +## Therefore, on each cycle we do 2 rounds and produce 3 round +## keys. +## +.align 4 +Lschedule_192: + sub x0, x0, #8 + ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) + bl _vpaes_schedule_transform // input transform + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 + ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros + mov x0, #4 // mov $4, %esi + +Loop_schedule_192: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_round + ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0 + bl _vpaes_schedule_mangle // save key n + bl _vpaes_schedule_192_smear + bl _vpaes_schedule_mangle // save key n+1 + bl _vpaes_schedule_round + cbz x0, Lschedule_mangle_last + bl _vpaes_schedule_mangle // save key n+2 + bl _vpaes_schedule_192_smear + b Loop_schedule_192 + +## +## .aes_schedule_256 +## +## 256-bit specific part of key schedule. +## +## The structure here is very similar to the 128-bit +## schedule, but with an additional "low side" in +## %xmm6. The low side's rounds are the same as the +## high side's, except no rcon and no rotation. 
+## +.align 4 +Lschedule_256: + ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + bl _vpaes_schedule_transform // input transform + mov x0, #7 // mov $7, %esi + +Loop_schedule_256: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_mangle // output low result + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + // high round + bl _vpaes_schedule_round + cbz x0, Lschedule_mangle_last + bl _vpaes_schedule_mangle + + // low round. swap xmm7 and xmm6 + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + movi v4.16b, #0 + mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 + mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 + bl _vpaes_schedule_low_round + mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 + + b Loop_schedule_256 + +## +## .aes_schedule_mangle_last +## +## Mangler for last round of key schedule +## Mangles %xmm0 +## when encrypting, outputs out(%xmm0) ^ 63 +## when decrypting, outputs unskew(%xmm0) +## +## Always called right before return... jumps to cleanup and exits +## +.align 4 +Lschedule_mangle_last: + // schedule last round key from xmm0 + adrp x11, Lk_deskew@PAGE // lea Lk_deskew(%rip),%r11 # prepare to deskew + add x11, x11, Lk_deskew@PAGEOFF + + cbnz w3, Lschedule_mangle_last_dec + + // encrypting + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 + adrp x11, Lk_opt@PAGE // lea Lk_opt(%rip), %r11 # prepare to output transform + add x11, x11, Lk_opt@PAGEOFF + add x2, x2, #32 // add $32, %rdx + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute + +Lschedule_mangle_last_dec: + ld1 {v20.2d,v21.2d}, [x11] // reload constants + sub x2, x2, #16 // add $-16, %rdx + eor v0.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm0 + bl _vpaes_schedule_transform // output transform + st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key + + // cleanup + eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2 + eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 + eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 + eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 + eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 + ldp x29, x30, [sp],#16 + ret + + +## +## .aes_schedule_192_smear +## +## Smear the short, low side in the 192-bit key schedule. +## +## Inputs: +## %xmm7: high side, b a x y +## %xmm6: low side, d c 0 0 +## %xmm13: 0 +## +## Outputs: +## %xmm6: b+c+d b+c 0 0 +## %xmm0: b+c+d b+c b a +## + +.align 4 +_vpaes_schedule_192_smear: + movi v1.16b, #0 + dup v0.4s, v7.s[3] + ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 + ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a + eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a + mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 + ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros + ret + + +## +## .aes_schedule_round +## +## Runs one main round of the key schedule on %xmm0, %xmm7 +## +## Specifically, runs subbytes on the high dword of %xmm0 +## then rotates it by one byte and xors into the low dword of +## %xmm7. +## +## Adds rcon from low byte of %xmm8, then rotates %xmm8 for +## next rcon. 
+## +## Smears the dwords of %xmm7 by xoring the low into the +## second low, result into third, result into highest. +## +## Returns results in %xmm7 = %xmm0. +## Clobbers %xmm1-%xmm4, %r11. +## + +.align 4 +_vpaes_schedule_round: + // extract rcon from xmm8 + movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 + ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1 + ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + + // rotate + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0 + + // fall through... + + // low round: same as high round, but no rotation and no rcon. +_vpaes_schedule_low_round: + // smear xmm7 + ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4 + + // subbytes + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 + tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v7.16b, v7.16b, v16.16b // vpxor Lk_s63(%rip), %xmm7, %xmm7 + tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak + eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io + eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo + tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou + tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t + eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output + + // add in smeared stuff + eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 + eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 + ret + + +## +## .aes_schedule_transform +## +## Linear-transform %xmm0 according to tables at (%r11) +## +## Requires that %xmm9 = 0x0F0F... as in preheat +## Output in %xmm0 +## Clobbers %xmm1, %xmm2 +## + +.align 4 +_vpaes_schedule_transform: + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + // vmovdqa (%r11), %xmm2 # lo + tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + // vmovdqa 16(%r11), %xmm1 # hi + tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + ret + + +## +## .aes_schedule_mangle +## +## Mangle xmm0 from (basis-transformed) standard version +## to our version. 
+## +## On encrypt, +## xor with 0x63 +## multiply by circulant 0,1,1,1 +## apply shiftrows transform +## +## On decrypt, +## xor with 0x63 +## multiply by "inverse mixcolumns" circulant E,B,D,9 +## deskew +## apply shiftrows transform +## +## +## Writes out to (%rdx), and increments or decrements it +## Keeps track of round number mod 4 in %r8 +## Preserves xmm0 +## Clobbers xmm1-xmm5 +## + +.align 4 +_vpaes_schedule_mangle: + mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later + // vmovdqa .Lk_mc_forward(%rip),%xmm5 + cbnz w3, Lschedule_mangle_dec + + // encrypting + eor v4.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm4 + add x2, x2, #16 // add $16, %rdx + tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 + tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 + tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 + eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 + + b Lschedule_mangle_both +.align 4 +Lschedule_mangle_dec: + // inverse mix columns + // lea .Lk_dksd(%rip),%r11 + ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi + and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo + + // vmovdqa 0x00(%r11), %xmm2 + tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + // vmovdqa 0x10(%r11), %xmm3 + tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + + // vmovdqa 0x20(%r11), %xmm2 + tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + // vmovdqa 0x30(%r11), %xmm3 + tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + + // vmovdqa 0x40(%r11), %xmm2 + tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + // vmovdqa 0x50(%r11), %xmm3 + tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + + // vmovdqa 0x60(%r11), %xmm2 + tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + // vmovdqa 0x70(%r11), %xmm4 + tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4 + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3 + + sub x2, x2, #16 // add $-16, %rdx + +Lschedule_mangle_both: + tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + add x8, x8, #64-16 // add $-16, %r8 + and x8, x8, #~(1<<6) // and $0x30, %r8 + st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) + ret + + +.globl _vpaes_set_encrypt_key +.private_extern _vpaes_set_encrypt_key + +.align 4 +_vpaes_set_encrypt_key: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! 
// ABI spec says so + + lsr w9, w1, #5 // shr $5,%eax + add w9, w9, #5 // $5,%eax + str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov w3, #0 // mov $0,%ecx + mov x8, #0x30 // mov $0x30,%r8d + bl _vpaes_schedule_core + eor x0, x0, x0 + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + ret + + +.globl _vpaes_set_decrypt_key +.private_extern _vpaes_set_decrypt_key + +.align 4 +_vpaes_set_decrypt_key: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + + lsr w9, w1, #5 // shr $5,%eax + add w9, w9, #5 // $5,%eax + str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + lsl w9, w9, #4 // shl $4,%eax + add x2, x2, #16 // lea 16(%rdx,%rax),%rdx + add x2, x2, x9 + + mov w3, #1 // mov $1,%ecx + lsr w8, w1, #1 // shr $1,%r8d + and x8, x8, #32 // and $32,%r8d + eor x8, x8, #32 // xor $32,%r8d # nbits==192?0:32 + bl _vpaes_schedule_core + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + ret + +.globl _vpaes_cbc_encrypt +.private_extern _vpaes_cbc_encrypt + +.align 4 +_vpaes_cbc_encrypt: + cbz x2, Lcbc_abort + cmp w5, #0 // check direction + b.eq vpaes_cbc_decrypt + + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov x17, x2 // reassign + mov x2, x3 // reassign + + ld1 {v0.16b}, [x4] // load ivec + bl _vpaes_encrypt_preheat + b Lcbc_enc_loop + +.align 4 +Lcbc_enc_loop: + ld1 {v7.16b}, [x0],#16 // load input + eor v7.16b, v7.16b, v0.16b // xor with ivec + bl _vpaes_encrypt_core + st1 {v0.16b}, [x1],#16 // save output + subs x17, x17, #16 + b.hi Lcbc_enc_loop + + st1 {v0.16b}, [x4] // write ivec + + ldp x29,x30,[sp],#16 +Lcbc_abort: + ret + + + +.align 4 +vpaes_cbc_decrypt: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! + + mov x17, x2 // reassign + mov x2, x3 // reassign + ld1 {v6.16b}, [x4] // load ivec + bl _vpaes_decrypt_preheat + tst x17, #16 + b.eq Lcbc_dec_loop2x + + ld1 {v7.16b}, [x0], #16 // load input + bl _vpaes_decrypt_core + eor v0.16b, v0.16b, v6.16b // xor with ivec + orr v6.16b, v7.16b, v7.16b // next ivec value + st1 {v0.16b}, [x1], #16 + subs x17, x17, #16 + b.ls Lcbc_dec_done + +.align 4 +Lcbc_dec_loop2x: + ld1 {v14.16b,v15.16b}, [x0], #32 + bl _vpaes_decrypt_2x + eor v0.16b, v0.16b, v6.16b // xor with ivec + eor v1.16b, v1.16b, v14.16b + orr v6.16b, v15.16b, v15.16b + st1 {v0.16b,v1.16b}, [x1], #32 + subs x17, x17, #32 + b.hi Lcbc_dec_loop2x + +Lcbc_dec_done: + st1 {v6.16b}, [x4] + + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + ret + +.globl _vpaes_ctr32_encrypt_blocks +.private_extern _vpaes_ctr32_encrypt_blocks + +.align 4 +_vpaes_ctr32_encrypt_blocks: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! + + cbz x2, Lctr32_done + + // Note, unlike the other functions, x2 here is measured in blocks, + // not bytes. + mov x17, x2 + mov x2, x3 + + // Load the IV and counter portion. + ldr w6, [x4, #12] + ld1 {v7.16b}, [x4] + + bl _vpaes_encrypt_preheat + tst x17, #1 + rev w6, w6 // The counter is big-endian. + b.eq Lctr32_prep_loop + + // Handle one block so the remaining block count is even for + // _vpaes_encrypt_2x. + ld1 {v6.16b}, [x0], #16 // Load input ahead of time + bl _vpaes_encrypt_core + eor v0.16b, v0.16b, v6.16b // XOR input and result + st1 {v0.16b}, [x1], #16 + subs x17, x17, #1 + // Update the counter. 
+ add w6, w6, #1 + rev w7, w6 + mov v7.s[3], w7 + b.ls Lctr32_done + +Lctr32_prep_loop: + // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x + // uses v14 and v15. + mov v15.16b, v7.16b + mov v14.16b, v7.16b + add w6, w6, #1 + rev w7, w6 + mov v15.s[3], w7 + +Lctr32_loop: + ld1 {v6.16b,v7.16b}, [x0], #32 // Load input ahead of time + bl _vpaes_encrypt_2x + eor v0.16b, v0.16b, v6.16b // XOR input and result + eor v1.16b, v1.16b, v7.16b // XOR input and result (#2) + st1 {v0.16b,v1.16b}, [x1], #32 + subs x17, x17, #2 + // Update the counter. + add w7, w6, #1 + add w6, w6, #2 + rev w7, w7 + mov v14.s[3], w7 + rev w7, w6 + mov v15.s[3], w7 + b.hi Lctr32_loop + +Lctr32_done: + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + ret + +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-aarch64/crypto/test/trampoline-armv8.S b/packager/third_party/boringssl/ios-aarch64/crypto/test/trampoline-armv8.S new file mode 100644 index 0000000000..438e9298c0 --- /dev/null +++ b/packager/third_party/boringssl/ios-aarch64/crypto/test/trampoline-armv8.S @@ -0,0 +1,685 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text + +// abi_test_trampoline loads callee-saved registers from |state|, calls |func| +// with |argv|, then saves the callee-saved registers into |state|. It returns +// the result of |func|. The |unwind| argument is unused. +// uint64_t abi_test_trampoline(void (*func)(...), CallerState *state, +// const uint64_t *argv, size_t argc, +// uint64_t unwind); + +.globl _abi_test_trampoline +.private_extern _abi_test_trampoline +.align 4 +_abi_test_trampoline: +Labi_test_trampoline_begin: + // Stack layout (low to high addresses) + // x29,x30 (16 bytes) + // d8-d15 (64 bytes) + // x19-x28 (80 bytes) + // x1 (8 bytes) + // padding (8 bytes) + stp x29, x30, [sp, #-176]! + mov x29, sp + + // Saved callee-saved registers and |state|. + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] + stp x19, x20, [sp, #80] + stp x21, x22, [sp, #96] + stp x23, x24, [sp, #112] + stp x25, x26, [sp, #128] + stp x27, x28, [sp, #144] + str x1, [sp, #160] + + // Load registers from |state|, with the exception of x29. x29 is the + // frame pointer and also callee-saved, but AAPCS64 allows platforms to + // mandate that x29 always point to a frame. iOS64 does so, which means + // we cannot fill x29 with entropy without violating ABI rules + // ourselves. x29 is tested separately below. + ldp d8, d9, [x1], #16 + ldp d10, d11, [x1], #16 + ldp d12, d13, [x1], #16 + ldp d14, d15, [x1], #16 + ldp x19, x20, [x1], #16 + ldp x21, x22, [x1], #16 + ldp x23, x24, [x1], #16 + ldp x25, x26, [x1], #16 + ldp x27, x28, [x1], #16 + + // Move parameters into temporary registers. + mov x9, x0 + mov x10, x2 + mov x11, x3 + + // Load parameters into registers. 
+ cbz x11, Largs_done + ldr x0, [x10], #8 + subs x11, x11, #1 + b.eq Largs_done + ldr x1, [x10], #8 + subs x11, x11, #1 + b.eq Largs_done + ldr x2, [x10], #8 + subs x11, x11, #1 + b.eq Largs_done + ldr x3, [x10], #8 + subs x11, x11, #1 + b.eq Largs_done + ldr x4, [x10], #8 + subs x11, x11, #1 + b.eq Largs_done + ldr x5, [x10], #8 + subs x11, x11, #1 + b.eq Largs_done + ldr x6, [x10], #8 + subs x11, x11, #1 + b.eq Largs_done + ldr x7, [x10], #8 + +Largs_done: + blr x9 + + // Reload |state| and store registers. + ldr x1, [sp, #160] + stp d8, d9, [x1], #16 + stp d10, d11, [x1], #16 + stp d12, d13, [x1], #16 + stp d14, d15, [x1], #16 + stp x19, x20, [x1], #16 + stp x21, x22, [x1], #16 + stp x23, x24, [x1], #16 + stp x25, x26, [x1], #16 + stp x27, x28, [x1], #16 + + // |func| is required to preserve x29, the frame pointer. We cannot load + // random values into x29 (see comment above), so compare it against the + // expected value and zero the field of |state| if corrupted. + mov x9, sp + cmp x29, x9 + b.eq Lx29_ok + str xzr, [x1] + +Lx29_ok: + // Restore callee-saved registers. + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] + ldp x19, x20, [sp, #80] + ldp x21, x22, [sp, #96] + ldp x23, x24, [sp, #112] + ldp x25, x26, [sp, #128] + ldp x27, x28, [sp, #144] + + ldp x29, x30, [sp], #176 + ret + + +.globl _abi_test_clobber_x0 +.private_extern _abi_test_clobber_x0 +.align 4 +_abi_test_clobber_x0: + mov x0, xzr + ret + + +.globl _abi_test_clobber_x1 +.private_extern _abi_test_clobber_x1 +.align 4 +_abi_test_clobber_x1: + mov x1, xzr + ret + + +.globl _abi_test_clobber_x2 +.private_extern _abi_test_clobber_x2 +.align 4 +_abi_test_clobber_x2: + mov x2, xzr + ret + + +.globl _abi_test_clobber_x3 +.private_extern _abi_test_clobber_x3 +.align 4 +_abi_test_clobber_x3: + mov x3, xzr + ret + + +.globl _abi_test_clobber_x4 +.private_extern _abi_test_clobber_x4 +.align 4 +_abi_test_clobber_x4: + mov x4, xzr + ret + + +.globl _abi_test_clobber_x5 +.private_extern _abi_test_clobber_x5 +.align 4 +_abi_test_clobber_x5: + mov x5, xzr + ret + + +.globl _abi_test_clobber_x6 +.private_extern _abi_test_clobber_x6 +.align 4 +_abi_test_clobber_x6: + mov x6, xzr + ret + + +.globl _abi_test_clobber_x7 +.private_extern _abi_test_clobber_x7 +.align 4 +_abi_test_clobber_x7: + mov x7, xzr + ret + + +.globl _abi_test_clobber_x8 +.private_extern _abi_test_clobber_x8 +.align 4 +_abi_test_clobber_x8: + mov x8, xzr + ret + + +.globl _abi_test_clobber_x9 +.private_extern _abi_test_clobber_x9 +.align 4 +_abi_test_clobber_x9: + mov x9, xzr + ret + + +.globl _abi_test_clobber_x10 +.private_extern _abi_test_clobber_x10 +.align 4 +_abi_test_clobber_x10: + mov x10, xzr + ret + + +.globl _abi_test_clobber_x11 +.private_extern _abi_test_clobber_x11 +.align 4 +_abi_test_clobber_x11: + mov x11, xzr + ret + + +.globl _abi_test_clobber_x12 +.private_extern _abi_test_clobber_x12 +.align 4 +_abi_test_clobber_x12: + mov x12, xzr + ret + + +.globl _abi_test_clobber_x13 +.private_extern _abi_test_clobber_x13 +.align 4 +_abi_test_clobber_x13: + mov x13, xzr + ret + + +.globl _abi_test_clobber_x14 +.private_extern _abi_test_clobber_x14 +.align 4 +_abi_test_clobber_x14: + mov x14, xzr + ret + + +.globl _abi_test_clobber_x15 +.private_extern _abi_test_clobber_x15 +.align 4 +_abi_test_clobber_x15: + mov x15, xzr + ret + + +.globl _abi_test_clobber_x16 +.private_extern _abi_test_clobber_x16 +.align 4 +_abi_test_clobber_x16: + mov x16, xzr + ret + + +.globl _abi_test_clobber_x17 +.private_extern 
_abi_test_clobber_x17 +.align 4 +_abi_test_clobber_x17: + mov x17, xzr + ret + + +.globl _abi_test_clobber_x19 +.private_extern _abi_test_clobber_x19 +.align 4 +_abi_test_clobber_x19: + mov x19, xzr + ret + + +.globl _abi_test_clobber_x20 +.private_extern _abi_test_clobber_x20 +.align 4 +_abi_test_clobber_x20: + mov x20, xzr + ret + + +.globl _abi_test_clobber_x21 +.private_extern _abi_test_clobber_x21 +.align 4 +_abi_test_clobber_x21: + mov x21, xzr + ret + + +.globl _abi_test_clobber_x22 +.private_extern _abi_test_clobber_x22 +.align 4 +_abi_test_clobber_x22: + mov x22, xzr + ret + + +.globl _abi_test_clobber_x23 +.private_extern _abi_test_clobber_x23 +.align 4 +_abi_test_clobber_x23: + mov x23, xzr + ret + + +.globl _abi_test_clobber_x24 +.private_extern _abi_test_clobber_x24 +.align 4 +_abi_test_clobber_x24: + mov x24, xzr + ret + + +.globl _abi_test_clobber_x25 +.private_extern _abi_test_clobber_x25 +.align 4 +_abi_test_clobber_x25: + mov x25, xzr + ret + + +.globl _abi_test_clobber_x26 +.private_extern _abi_test_clobber_x26 +.align 4 +_abi_test_clobber_x26: + mov x26, xzr + ret + + +.globl _abi_test_clobber_x27 +.private_extern _abi_test_clobber_x27 +.align 4 +_abi_test_clobber_x27: + mov x27, xzr + ret + + +.globl _abi_test_clobber_x28 +.private_extern _abi_test_clobber_x28 +.align 4 +_abi_test_clobber_x28: + mov x28, xzr + ret + + +.globl _abi_test_clobber_x29 +.private_extern _abi_test_clobber_x29 +.align 4 +_abi_test_clobber_x29: + mov x29, xzr + ret + + +.globl _abi_test_clobber_d0 +.private_extern _abi_test_clobber_d0 +.align 4 +_abi_test_clobber_d0: + fmov d0, xzr + ret + + +.globl _abi_test_clobber_d1 +.private_extern _abi_test_clobber_d1 +.align 4 +_abi_test_clobber_d1: + fmov d1, xzr + ret + + +.globl _abi_test_clobber_d2 +.private_extern _abi_test_clobber_d2 +.align 4 +_abi_test_clobber_d2: + fmov d2, xzr + ret + + +.globl _abi_test_clobber_d3 +.private_extern _abi_test_clobber_d3 +.align 4 +_abi_test_clobber_d3: + fmov d3, xzr + ret + + +.globl _abi_test_clobber_d4 +.private_extern _abi_test_clobber_d4 +.align 4 +_abi_test_clobber_d4: + fmov d4, xzr + ret + + +.globl _abi_test_clobber_d5 +.private_extern _abi_test_clobber_d5 +.align 4 +_abi_test_clobber_d5: + fmov d5, xzr + ret + + +.globl _abi_test_clobber_d6 +.private_extern _abi_test_clobber_d6 +.align 4 +_abi_test_clobber_d6: + fmov d6, xzr + ret + + +.globl _abi_test_clobber_d7 +.private_extern _abi_test_clobber_d7 +.align 4 +_abi_test_clobber_d7: + fmov d7, xzr + ret + + +.globl _abi_test_clobber_d8 +.private_extern _abi_test_clobber_d8 +.align 4 +_abi_test_clobber_d8: + fmov d8, xzr + ret + + +.globl _abi_test_clobber_d9 +.private_extern _abi_test_clobber_d9 +.align 4 +_abi_test_clobber_d9: + fmov d9, xzr + ret + + +.globl _abi_test_clobber_d10 +.private_extern _abi_test_clobber_d10 +.align 4 +_abi_test_clobber_d10: + fmov d10, xzr + ret + + +.globl _abi_test_clobber_d11 +.private_extern _abi_test_clobber_d11 +.align 4 +_abi_test_clobber_d11: + fmov d11, xzr + ret + + +.globl _abi_test_clobber_d12 +.private_extern _abi_test_clobber_d12 +.align 4 +_abi_test_clobber_d12: + fmov d12, xzr + ret + + +.globl _abi_test_clobber_d13 +.private_extern _abi_test_clobber_d13 +.align 4 +_abi_test_clobber_d13: + fmov d13, xzr + ret + + +.globl _abi_test_clobber_d14 +.private_extern _abi_test_clobber_d14 +.align 4 +_abi_test_clobber_d14: + fmov d14, xzr + ret + + +.globl _abi_test_clobber_d15 +.private_extern _abi_test_clobber_d15 +.align 4 +_abi_test_clobber_d15: + fmov d15, xzr + ret + + +.globl _abi_test_clobber_d16 
+.private_extern _abi_test_clobber_d16 +.align 4 +_abi_test_clobber_d16: + fmov d16, xzr + ret + + +.globl _abi_test_clobber_d17 +.private_extern _abi_test_clobber_d17 +.align 4 +_abi_test_clobber_d17: + fmov d17, xzr + ret + + +.globl _abi_test_clobber_d18 +.private_extern _abi_test_clobber_d18 +.align 4 +_abi_test_clobber_d18: + fmov d18, xzr + ret + + +.globl _abi_test_clobber_d19 +.private_extern _abi_test_clobber_d19 +.align 4 +_abi_test_clobber_d19: + fmov d19, xzr + ret + + +.globl _abi_test_clobber_d20 +.private_extern _abi_test_clobber_d20 +.align 4 +_abi_test_clobber_d20: + fmov d20, xzr + ret + + +.globl _abi_test_clobber_d21 +.private_extern _abi_test_clobber_d21 +.align 4 +_abi_test_clobber_d21: + fmov d21, xzr + ret + + +.globl _abi_test_clobber_d22 +.private_extern _abi_test_clobber_d22 +.align 4 +_abi_test_clobber_d22: + fmov d22, xzr + ret + + +.globl _abi_test_clobber_d23 +.private_extern _abi_test_clobber_d23 +.align 4 +_abi_test_clobber_d23: + fmov d23, xzr + ret + + +.globl _abi_test_clobber_d24 +.private_extern _abi_test_clobber_d24 +.align 4 +_abi_test_clobber_d24: + fmov d24, xzr + ret + + +.globl _abi_test_clobber_d25 +.private_extern _abi_test_clobber_d25 +.align 4 +_abi_test_clobber_d25: + fmov d25, xzr + ret + + +.globl _abi_test_clobber_d26 +.private_extern _abi_test_clobber_d26 +.align 4 +_abi_test_clobber_d26: + fmov d26, xzr + ret + + +.globl _abi_test_clobber_d27 +.private_extern _abi_test_clobber_d27 +.align 4 +_abi_test_clobber_d27: + fmov d27, xzr + ret + + +.globl _abi_test_clobber_d28 +.private_extern _abi_test_clobber_d28 +.align 4 +_abi_test_clobber_d28: + fmov d28, xzr + ret + + +.globl _abi_test_clobber_d29 +.private_extern _abi_test_clobber_d29 +.align 4 +_abi_test_clobber_d29: + fmov d29, xzr + ret + + +.globl _abi_test_clobber_d30 +.private_extern _abi_test_clobber_d30 +.align 4 +_abi_test_clobber_d30: + fmov d30, xzr + ret + + +.globl _abi_test_clobber_d31 +.private_extern _abi_test_clobber_d31 +.align 4 +_abi_test_clobber_d31: + fmov d31, xzr + ret + + +.globl _abi_test_clobber_v8_upper +.private_extern _abi_test_clobber_v8_upper +.align 4 +_abi_test_clobber_v8_upper: + fmov v8.d[1], xzr + ret + + +.globl _abi_test_clobber_v9_upper +.private_extern _abi_test_clobber_v9_upper +.align 4 +_abi_test_clobber_v9_upper: + fmov v9.d[1], xzr + ret + + +.globl _abi_test_clobber_v10_upper +.private_extern _abi_test_clobber_v10_upper +.align 4 +_abi_test_clobber_v10_upper: + fmov v10.d[1], xzr + ret + + +.globl _abi_test_clobber_v11_upper +.private_extern _abi_test_clobber_v11_upper +.align 4 +_abi_test_clobber_v11_upper: + fmov v11.d[1], xzr + ret + + +.globl _abi_test_clobber_v12_upper +.private_extern _abi_test_clobber_v12_upper +.align 4 +_abi_test_clobber_v12_upper: + fmov v12.d[1], xzr + ret + + +.globl _abi_test_clobber_v13_upper +.private_extern _abi_test_clobber_v13_upper +.align 4 +_abi_test_clobber_v13_upper: + fmov v13.d[1], xzr + ret + + +.globl _abi_test_clobber_v14_upper +.private_extern _abi_test_clobber_v14_upper +.align 4 +_abi_test_clobber_v14_upper: + fmov v14.d[1], xzr + ret + + +.globl _abi_test_clobber_v15_upper +.private_extern _abi_test_clobber_v15_upper +.align 4 +_abi_test_clobber_v15_upper: + fmov v15.d[1], xzr + ret + +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-aarch64/crypto/third_party/sike/asm/fp-armv8.S b/packager/third_party/boringssl/ios-aarch64/crypto/third_party/sike/asm/fp-armv8.S new file mode 100644 index 0000000000..c48863f65f --- /dev/null +++ 
b/packager/third_party/boringssl/ios-aarch64/crypto/third_party/sike/asm/fp-armv8.S @@ -0,0 +1,996 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.section __TEXT,__const + +# p434 x 2 +Lp434x2: +.quad 0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF +.quad 0xFB82ECF5C5FFFFFF, 0xF78CB8F062B15D47 +.quad 0xD9F8BFAD038A40AC, 0x0004683E4E2EE688 + +# p434 + 1 +Lp434p1: +.quad 0xFDC1767AE3000000, 0x7BC65C783158AEA3 +.quad 0x6CFC5FD681C52056, 0x0002341F27177344 + +.text +.globl _sike_mpmul +.private_extern _sike_mpmul +.align 4 +_sike_mpmul: + stp x29, x30, [sp,#-96]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + + ldp x3, x4, [x0] + ldp x5, x6, [x0,#16] + ldp x7, x8, [x0,#32] + ldr x9, [x0,#48] + ldp x10, x11, [x1,#0] + ldp x12, x13, [x1,#16] + ldp x14, x15, [x1,#32] + ldr x16, [x1,#48] + + // x3-x7 <- AH + AL, x7 <- carry + adds x3, x3, x7 + adcs x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, xzr + adc x7, xzr, xzr + + // x10-x13 <- BH + BL, x8 <- carry + adds x10, x10, x14 + adcs x11, x11, x15 + adcs x12, x12, x16 + adcs x13, x13, xzr + adc x8, xzr, xzr + + // x9 <- combined carry + and x9, x7, x8 + // x7-x8 <- mask + sub x7, xzr, x7 + sub x8, xzr, x8 + + // x15-x19 <- masked (BH + BL) + and x14, x10, x7 + and x15, x11, x7 + and x16, x12, x7 + and x17, x13, x7 + + // x20-x23 <- masked (AH + AL) + and x20, x3, x8 + and x21, x4, x8 + and x22, x5, x8 + and x23, x6, x8 + + // x15-x19, x7 <- masked (AH+AL) + masked (BH+BL), step 1 + adds x14, x14, x20 + adcs x15, x15, x21 + adcs x16, x16, x22 + adcs x17, x17, x23 + adc x7, x9, xzr + + // x8-x9,x19,x20-x24 <- (AH+AL) x (BH+BL), low part + stp x3, x4, [x2,#0] + // A0-A1 <- AH + AL, T0 <- mask + adds x3, x3, x5 + adcs x4, x4, x6 + adc x25, xzr, xzr + + // C6, T1 <- BH + BL, C7 <- mask + adds x23, x10, x12 + adcs x26, x11, x13 + adc x24, xzr, xzr + + // C0-C1 <- masked (BH + BL) + sub x19, xzr, x25 + sub x20, xzr, x24 + and x8, x23, x19 + and x9, x26, x19 + + // C4-C5 <- masked (AH + AL), T0 <- combined carry + and x21, x3, x20 + and x22, x4, x20 + mul x19, x3, x23 + mul x20, x3, x26 + and x25, x25, x24 + + // C0-C1, T0 <- (AH+AL) x (BH+BL), part 1 + adds x8, x21, x8 + umulh x21, x3, x26 + adcs x9, x22, x9 + umulh x22, x3, x23 + adc x25, x25, xzr + + // C2-C5 <- (AH+AL) x (BH+BL), low part + mul x3, x4, x23 + umulh x23, x4, x23 + adds x20, x20, x22 + adc x21, x21, xzr + + mul x24, x4, x26 + umulh x26, x4, x26 + adds x20, x20, x3 + adcs x21, x21, x23 + adc x22, xzr, xzr + + adds x21, x21, x24 + adc x22, x22, x26 + + ldp x3, x4, [x2,#0] + + // C2-C5, T0 <- (AH+AL) x (BH+BL), final part + adds x21, x8, x21 + umulh x24, x3, x10 + umulh x26, x3, x11 + adcs x22, x9, x22 + mul x8, x3, x10 + mul x9, x3, x11 + adc x25, x25, xzr + + // C0-C1, T1, C7 <- AL x BL + mul x3, x4, x10 + umulh x10, x4, x10 + adds x9, x9, x24 + adc x26, x26, xzr + + mul x23, x4, x11 + umulh x11, x4, x11 + adds x9, x9, x3 + adcs x26, x26, x10 + adc x24, xzr, xzr + + adds x26, x26, x23 + adc x24, x24, x11 + + + // C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL + mul x3, x5, x12 + umulh x10, x5, x12 + subs x19, x19, x8 + sbcs x20, x20, x9 + sbcs x21, x21, x26 + mul x4, x5, x13 + umulh x23, x5, x13 + 
sbcs x22, x22, x24 + sbc x25, x25, xzr + + // A0, A1, C6, B0 <- AH x BH + mul x5, x6, x12 + umulh x12, x6, x12 + adds x4, x4, x10 + adc x23, x23, xzr + + mul x11, x6, x13 + umulh x13, x6, x13 + adds x4, x4, x5 + adcs x23, x23, x12 + adc x10, xzr, xzr + + adds x23, x23, x11 + adc x10, x10, x13 + + + // C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH + subs x19, x19, x3 + sbcs x20, x20, x4 + sbcs x21, x21, x23 + sbcs x22, x22, x10 + sbc x25, x25, xzr + + adds x19, x19, x26 + adcs x20, x20, x24 + adcs x21, x21, x3 + adcs x22, x22, x4 + adcs x23, x25, x23 + adc x24, x10, xzr + + + // x15-x19, x7 <- (AH+AL) x (BH+BL), final step + adds x14, x14, x21 + adcs x15, x15, x22 + adcs x16, x16, x23 + adcs x17, x17, x24 + adc x7, x7, xzr + + // Load AL + ldp x3, x4, [x0] + ldp x5, x6, [x0,#16] + // Load BL + ldp x10, x11, [x1,#0] + ldp x12, x13, [x1,#16] + + // Temporarily store x8 in x2 + stp x8, x9, [x2,#0] + // x21-x28 <- AL x BL + // A0-A1 <- AH + AL, T0 <- mask + adds x3, x3, x5 + adcs x4, x4, x6 + adc x8, xzr, xzr + + // C6, T1 <- BH + BL, C7 <- mask + adds x27, x10, x12 + adcs x9, x11, x13 + adc x28, xzr, xzr + + // C0-C1 <- masked (BH + BL) + sub x23, xzr, x8 + sub x24, xzr, x28 + and x21, x27, x23 + and x22, x9, x23 + + // C4-C5 <- masked (AH + AL), T0 <- combined carry + and x25, x3, x24 + and x26, x4, x24 + mul x23, x3, x27 + mul x24, x3, x9 + and x8, x8, x28 + + // C0-C1, T0 <- (AH+AL) x (BH+BL), part 1 + adds x21, x25, x21 + umulh x25, x3, x9 + adcs x22, x26, x22 + umulh x26, x3, x27 + adc x8, x8, xzr + + // C2-C5 <- (AH+AL) x (BH+BL), low part + mul x3, x4, x27 + umulh x27, x4, x27 + adds x24, x24, x26 + adc x25, x25, xzr + + mul x28, x4, x9 + umulh x9, x4, x9 + adds x24, x24, x3 + adcs x25, x25, x27 + adc x26, xzr, xzr + + adds x25, x25, x28 + adc x26, x26, x9 + + ldp x3, x4, [x0,#0] + + // C2-C5, T0 <- (AH+AL) x (BH+BL), final part + adds x25, x21, x25 + umulh x28, x3, x10 + umulh x9, x3, x11 + adcs x26, x22, x26 + mul x21, x3, x10 + mul x22, x3, x11 + adc x8, x8, xzr + + // C0-C1, T1, C7 <- AL x BL + mul x3, x4, x10 + umulh x10, x4, x10 + adds x22, x22, x28 + adc x9, x9, xzr + + mul x27, x4, x11 + umulh x11, x4, x11 + adds x22, x22, x3 + adcs x9, x9, x10 + adc x28, xzr, xzr + + adds x9, x9, x27 + adc x28, x28, x11 + + + // C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL + mul x3, x5, x12 + umulh x10, x5, x12 + subs x23, x23, x21 + sbcs x24, x24, x22 + sbcs x25, x25, x9 + mul x4, x5, x13 + umulh x27, x5, x13 + sbcs x26, x26, x28 + sbc x8, x8, xzr + + // A0, A1, C6, B0 <- AH x BH + mul x5, x6, x12 + umulh x12, x6, x12 + adds x4, x4, x10 + adc x27, x27, xzr + + mul x11, x6, x13 + umulh x13, x6, x13 + adds x4, x4, x5 + adcs x27, x27, x12 + adc x10, xzr, xzr + + adds x27, x27, x11 + adc x10, x10, x13 + + + // C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH + subs x23, x23, x3 + sbcs x24, x24, x4 + sbcs x25, x25, x27 + sbcs x26, x26, x10 + sbc x8, x8, xzr + + adds x23, x23, x9 + adcs x24, x24, x28 + adcs x25, x25, x3 + adcs x26, x26, x4 + adcs x27, x8, x27 + adc x28, x10, xzr + + // Restore x8 + ldp x8, x9, [x2,#0] + + // x8-x10,x20,x15-x17,x19 <- maskd (AH+AL) x (BH+BL) - ALxBL + subs x8, x8, x21 + sbcs x9, x9, x22 + sbcs x19, x19, x23 + sbcs x20, x20, x24 + sbcs x14, x14, x25 + sbcs x15, x15, x26 + sbcs x16, x16, x27 + sbcs x17, x17, x28 + sbc x7, x7, xzr + + // Store ALxBL, low + stp x21, x22, [x2] + stp x23, x24, [x2,#16] + + // Load AH + ldp x3, x4, [x0,#32] + ldr x5, [x0,#48] + // Load BH + ldp x10, x11, [x1,#32] + ldr x12, [x1,#48] + + adds x8, x8, x25 + adcs x9, x9, x26 + adcs x19, x19, x27 + 
adcs x20, x20, x28 + adc x1, xzr, xzr + + add x0, x0, #32 + // Temporarily store x8,x9 in x2 + stp x8,x9, [x2,#32] + // x21-x28 <- AH x BH + + // A0 * B0 + mul x21, x3, x10 // C0 + umulh x24, x3, x10 + + // A0 * B1 + mul x22, x3, x11 + umulh x23, x3, x11 + + // A1 * B0 + mul x8, x4, x10 + umulh x9, x4, x10 + adds x22, x22, x24 + adc x23, x23, xzr + + // A0 * B2 + mul x27, x3, x12 + umulh x28, x3, x12 + adds x22, x22, x8 // C1 + adcs x23, x23, x9 + adc x24, xzr, xzr + + // A2 * B0 + mul x8, x5, x10 + umulh x25, x5, x10 + adds x23, x23, x27 + adcs x24, x24, x25 + adc x25, xzr, xzr + + // A1 * B1 + mul x27, x4, x11 + umulh x9, x4, x11 + adds x23, x23, x8 + adcs x24, x24, x28 + adc x25, x25, xzr + + // A1 * B2 + mul x8, x4, x12 + umulh x28, x4, x12 + adds x23, x23, x27 // C2 + adcs x24, x24, x9 + adc x25, x25, xzr + + // A2 * B1 + mul x27, x5, x11 + umulh x9, x5, x11 + adds x24, x24, x8 + adcs x25, x25, x28 + adc x26, xzr, xzr + + // A2 * B2 + mul x8, x5, x12 + umulh x28, x5, x12 + adds x24, x24, x27 // C3 + adcs x25, x25, x9 + adc x26, x26, xzr + + adds x25, x25, x8 // C4 + adc x26, x26, x28 // C5 + + // Restore x8,x9 + ldp x8,x9, [x2,#32] + + neg x1, x1 + + // x8-x9,x19,x20,x14-x17 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH + subs x8, x8, x21 + sbcs x9, x9, x22 + sbcs x19, x19, x23 + sbcs x20, x20, x24 + sbcs x14, x14, x25 + sbcs x15, x15, x26 + sbcs x16, x16, xzr + sbcs x17, x17, xzr + sbc x7, x7, xzr + + // Store (AH+AL) x (BH+BL) - ALxBL - AHxBH, low + stp x8, x9, [x2,#32] + stp x19, x20, [x2,#48] + + adds x1, x1, #1 + adcs x14, x14, x21 + adcs x15, x15, x22 + adcs x16, x16, x23 + adcs x17, x17, x24 + adcs x25, x7, x25 + adc x26, x26, xzr + + stp x14, x15, [x2,#64] + stp x16, x17, [x2,#80] + stp x25, x26, [x2,#96] + + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldp x29, x30, [sp],#96 + ret +.globl _sike_fprdc +.private_extern _sike_fprdc +.align 4 +_sike_fprdc: + stp x29, x30, [sp, #-96]! 
+ add x29, sp, xzr + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + + ldp x2, x3, [x0,#0] // a[0-1] + + // Load the prime constant + adrp x26, Lp434p1@PAGE + add x26, x26, Lp434p1@PAGEOFF + ldp x23, x24, [x26, #0x0] + ldp x25, x26, [x26,#0x10] + + // a[0-1] * p434+1 + mul x4, x2, x23 // C0 + umulh x7, x2, x23 + + mul x5, x2, x24 + umulh x6, x2, x24 + + mul x10, x3, x23 + umulh x11, x3, x23 + adds x5, x5, x7 + adc x6, x6, xzr + + mul x27, x2, x25 + umulh x28, x2, x25 + adds x5, x5, x10 // C1 + adcs x6, x6, x11 + adc x7, xzr, xzr + + mul x10, x3, x24 + umulh x11, x3, x24 + adds x6, x6, x27 + adcs x7, x7, x28 + adc x8, xzr, xzr + + mul x27, x2, x26 + umulh x28, x2, x26 + adds x6, x6, x10 // C2 + adcs x7, x7, x11 + adc x8, x8, xzr + + mul x10, x3, x25 + umulh x11, x3, x25 + adds x7, x7, x27 + adcs x8, x8, x28 + adc x9, xzr, xzr + + mul x27, x3, x26 + umulh x28, x3, x26 + adds x7, x7, x10 // C3 + adcs x8, x8, x11 + adc x9, x9, xzr + adds x8, x8, x27 // C4 + adc x9, x9, x28 // C5 + + + + ldp x10, x11, [x0, #0x18] + ldp x12, x13, [x0, #0x28] + ldp x14, x15, [x0, #0x38] + ldp x16, x17, [x0, #0x48] + ldp x19, x20, [x0, #0x58] + ldr x21, [x0, #0x68] + + adds x10, x10, x4 + adcs x11, x11, x5 + adcs x12, x12, x6 + adcs x13, x13, x7 + adcs x14, x14, x8 + adcs x15, x15, x9 + adcs x22, x16, xzr + adcs x17, x17, xzr + adcs x19, x19, xzr + adcs x20, x20, xzr + adc x21, x21, xzr + + ldr x2, [x0,#0x10] // a[2] + // a[2-3] * p434+1 + mul x4, x2, x23 // C0 + umulh x7, x2, x23 + + mul x5, x2, x24 + umulh x6, x2, x24 + + mul x0, x10, x23 + umulh x3, x10, x23 + adds x5, x5, x7 + adc x6, x6, xzr + + mul x27, x2, x25 + umulh x28, x2, x25 + adds x5, x5, x0 // C1 + adcs x6, x6, x3 + adc x7, xzr, xzr + + mul x0, x10, x24 + umulh x3, x10, x24 + adds x6, x6, x27 + adcs x7, x7, x28 + adc x8, xzr, xzr + + mul x27, x2, x26 + umulh x28, x2, x26 + adds x6, x6, x0 // C2 + adcs x7, x7, x3 + adc x8, x8, xzr + + mul x0, x10, x25 + umulh x3, x10, x25 + adds x7, x7, x27 + adcs x8, x8, x28 + adc x9, xzr, xzr + + mul x27, x10, x26 + umulh x28, x10, x26 + adds x7, x7, x0 // C3 + adcs x8, x8, x3 + adc x9, x9, xzr + adds x8, x8, x27 // C4 + adc x9, x9, x28 // C5 + + + + adds x12, x12, x4 + adcs x13, x13, x5 + adcs x14, x14, x6 + adcs x15, x15, x7 + adcs x16, x22, x8 + adcs x17, x17, x9 + adcs x22, x19, xzr + adcs x20, x20, xzr + adc x21, x21, xzr + + mul x4, x11, x23 // C0 + umulh x7, x11, x23 + + mul x5, x11, x24 + umulh x6, x11, x24 + + mul x10, x12, x23 + umulh x3, x12, x23 + adds x5, x5, x7 + adc x6, x6, xzr + + mul x27, x11, x25 + umulh x28, x11, x25 + adds x5, x5, x10 // C1 + adcs x6, x6, x3 + adc x7, xzr, xzr + + mul x10, x12, x24 + umulh x3, x12, x24 + adds x6, x6, x27 + adcs x7, x7, x28 + adc x8, xzr, xzr + + mul x27, x11, x26 + umulh x28, x11, x26 + adds x6, x6, x10 // C2 + adcs x7, x7, x3 + adc x8, x8, xzr + + mul x10, x12, x25 + umulh x3, x12, x25 + adds x7, x7, x27 + adcs x8, x8, x28 + adc x9, xzr, xzr + + mul x27, x12, x26 + umulh x28, x12, x26 + adds x7, x7, x10 // C3 + adcs x8, x8, x3 + adc x9, x9, xzr + adds x8, x8, x27 // C4 + adc x9, x9, x28 // C5 + + + adds x14, x14, x4 + adcs x15, x15, x5 + adcs x16, x16, x6 + adcs x17, x17, x7 + adcs x19, x22, x8 + adcs x20, x20, x9 + adc x22, x21, xzr + + stp x14, x15, [x1, #0x0] // C0, C1 + + mul x4, x13, x23 // C0 + umulh x10, x13, x23 + + mul x5, x13, x24 + umulh x27, x13, x24 + adds x5, x5, x10 // C1 + adc x10, xzr, xzr + + mul x6, x13, x25 + umulh x28, x13, x25 + adds x27, x10, x27 + adcs x6, x6, 
x27 // C2 + adc x10, xzr, xzr + + mul x7, x13, x26 + umulh x8, x13, x26 + adds x28, x10, x28 + adcs x7, x7, x28 // C3 + adc x8, x8, xzr // C4 + + adds x16, x16, x4 + adcs x17, x17, x5 + adcs x19, x19, x6 + adcs x20, x20, x7 + adc x21, x22, x8 + + str x16, [x1, #0x10] + stp x17, x19, [x1, #0x18] + stp x20, x21, [x1, #0x28] + + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldp x29, x30, [sp],#96 + ret +.globl _sike_fpadd +.private_extern _sike_fpadd +.align 4 +_sike_fpadd: + stp x29,x30, [sp,#-16]! + add x29, sp, #0 + + ldp x3, x4, [x0,#0] + ldp x5, x6, [x0,#16] + ldp x7, x8, [x0,#32] + ldr x9, [x0,#48] + ldp x11, x12, [x1,#0] + ldp x13, x14, [x1,#16] + ldp x15, x16, [x1,#32] + ldr x17, [x1,#48] + + // Add a + b + adds x3, x3, x11 + adcs x4, x4, x12 + adcs x5, x5, x13 + adcs x6, x6, x14 + adcs x7, x7, x15 + adcs x8, x8, x16 + adc x9, x9, x17 + + // Subtract 2xp434 + adrp x17, Lp434x2@PAGE + add x17, x17, Lp434x2@PAGEOFF + ldp x11, x12, [x17, #0] + ldp x13, x14, [x17, #16] + ldp x15, x16, [x17, #32] + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x12 + sbcs x6, x6, x13 + sbcs x7, x7, x14 + sbcs x8, x8, x15 + sbcs x9, x9, x16 + sbc x0, xzr, xzr // x0 can be reused now + + // Add 2xp434 anded with the mask in x0 + and x11, x11, x0 + and x12, x12, x0 + and x13, x13, x0 + and x14, x14, x0 + and x15, x15, x0 + and x16, x16, x0 + + adds x3, x3, x11 + adcs x4, x4, x12 + adcs x5, x5, x12 + adcs x6, x6, x13 + adcs x7, x7, x14 + adcs x8, x8, x15 + adc x9, x9, x16 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + str x9, [x2,#48] + + ldp x29, x30, [sp],#16 + ret +.globl _sike_fpsub +.private_extern _sike_fpsub +.align 4 +_sike_fpsub: + stp x29, x30, [sp,#-16]! + add x29, sp, #0 + + ldp x3, x4, [x0,#0] + ldp x5, x6, [x0,#16] + ldp x7, x8, [x0,#32] + ldr x9, [x0,#48] + ldp x11, x12, [x1,#0] + ldp x13, x14, [x1,#16] + ldp x15, x16, [x1,#32] + ldr x17, [x1,#48] + + // Subtract a - b + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + sbcs x7, x7, x15 + sbcs x8, x8, x16 + sbcs x9, x9, x17 + sbc x0, xzr, xzr + + // Add 2xp434 anded with the mask in x0 + adrp x17, Lp434x2@PAGE + add x17, x17, Lp434x2@PAGEOFF + + // First half + ldp x11, x12, [x17, #0] + ldp x13, x14, [x17, #16] + ldp x15, x16, [x17, #32] + + // Add 2xp434 anded with the mask in x0 + and x11, x11, x0 + and x12, x12, x0 + and x13, x13, x0 + and x14, x14, x0 + and x15, x15, x0 + and x16, x16, x0 + + adds x3, x3, x11 + adcs x4, x4, x12 + adcs x5, x5, x12 + adcs x6, x6, x13 + adcs x7, x7, x14 + adcs x8, x8, x15 + adc x9, x9, x16 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + str x9, [x2,#48] + + ldp x29, x30, [sp],#16 + ret +.globl _sike_mpadd_asm +.private_extern _sike_mpadd_asm +.align 4 +_sike_mpadd_asm: + stp x29, x30, [sp,#-16]! + add x29, sp, #0 + + ldp x3, x4, [x0,#0] + ldp x5, x6, [x0,#16] + ldp x7, x8, [x0,#32] + ldr x9, [x0,#48] + ldp x11, x12, [x1,#0] + ldp x13, x14, [x1,#16] + ldp x15, x16, [x1,#32] + ldr x17, [x1,#48] + + adds x3, x3, x11 + adcs x4, x4, x12 + adcs x5, x5, x13 + adcs x6, x6, x14 + adcs x7, x7, x15 + adcs x8, x8, x16 + adc x9, x9, x17 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + str x9, [x2,#48] + + ldp x29, x30, [sp],#16 + ret +.globl _sike_mpsubx2_asm +.private_extern _sike_mpsubx2_asm +.align 4 +_sike_mpsubx2_asm: + stp x29, x30, [sp,#-16]! 
+ add x29, sp, #0 + + ldp x3, x4, [x0,#0] + ldp x5, x6, [x0,#16] + ldp x11, x12, [x1,#0] + ldp x13, x14, [x1,#16] + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + ldp x7, x8, [x0,#32] + ldp x9, x10, [x0,#48] + ldp x11, x12, [x1,#32] + ldp x13, x14, [x1,#48] + sbcs x7, x7, x11 + sbcs x8, x8, x12 + sbcs x9, x9, x13 + sbcs x10, x10, x14 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + stp x9, x10, [x2,#48] + + ldp x3, x4, [x0,#64] + ldp x5, x6, [x0,#80] + ldp x11, x12, [x1,#64] + ldp x13, x14, [x1,#80] + sbcs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + ldp x7, x8, [x0,#96] + ldp x11, x12, [x1,#96] + sbcs x7, x7, x11 + sbcs x8, x8, x12 + sbc x0, xzr, xzr + + stp x3, x4, [x2,#64] + stp x5, x6, [x2,#80] + stp x7, x8, [x2,#96] + + ldp x29, x30, [sp],#16 + ret +.globl _sike_mpdblsubx2_asm +.private_extern _sike_mpdblsubx2_asm +.align 4 +_sike_mpdblsubx2_asm: + stp x29, x30, [sp, #-16]! + add x29, sp, #0 + + ldp x3, x4, [x2, #0] + ldp x5, x6, [x2,#16] + ldp x7, x8, [x2,#32] + + ldp x11, x12, [x0, #0] + ldp x13, x14, [x0,#16] + ldp x15, x16, [x0,#32] + + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + sbcs x7, x7, x15 + sbcs x8, x8, x16 + + // x9 stores carry + adc x9, xzr, xzr + + ldp x11, x12, [x1, #0] + ldp x13, x14, [x1,#16] + ldp x15, x16, [x1,#32] + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + sbcs x7, x7, x15 + sbcs x8, x8, x16 + adc x9, x9, xzr + + stp x3, x4, [x2, #0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + + ldp x3, x4, [x2,#48] + ldp x5, x6, [x2,#64] + ldp x7, x8, [x2,#80] + + ldp x11, x12, [x0,#48] + ldp x13, x14, [x0,#64] + ldp x15, x16, [x0,#80] + + // x9 = 2 - x9 + neg x9, x9 + add x9, x9, #2 + + subs x3, x3, x9 + sbcs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + sbcs x7, x7, x15 + sbcs x8, x8, x16 + adc x9, xzr, xzr + + ldp x11, x12, [x1,#48] + ldp x13, x14, [x1,#64] + ldp x15, x16, [x1,#80] + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + sbcs x7, x7, x15 + sbcs x8, x8, x16 + adc x9, x9, xzr + + stp x3, x4, [x2,#48] + stp x5, x6, [x2,#64] + stp x7, x8, [x2,#80] + + ldp x3, x4, [x2,#96] + ldp x11, x12, [x0,#96] + ldp x13, x14, [x1,#96] + + // x9 = 2 - x9 + neg x9, x9 + add x9, x9, #2 + + subs x3, x3, x9 + sbcs x3, x3, x11 + sbcs x4, x4, x12 + subs x3, x3, x13 + sbc x4, x4, x14 + stp x3, x4, [x2,#96] + + ldp x29, x30, [sp],#16 + ret +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-arm/crypto/chacha/chacha-armv4.S b/packager/third_party/boringssl/ios-arm/crypto/chacha/chacha-armv4.S new file mode 100644 index 0000000000..cadf2b623b --- /dev/null +++ b/packager/third_party/boringssl/ios-arm/crypto/chacha/chacha-armv4.S @@ -0,0 +1,1498 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include + +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. 
+ + +.text +#if defined(__thumb2__) || defined(__clang__) +.syntax unified +#endif +#if defined(__thumb2__) +.thumb +#else +.code 32 +#endif + +#if defined(__thumb2__) || defined(__clang__) +#define ldrhsb ldrbhs +#endif + +.align 5 +Lsigma: +.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral +Lone: +.long 1,0,0,0 +#if __ARM_MAX_ARCH__>=7 +LOPENSSL_armcap: +.word OPENSSL_armcap_P-LChaCha20_ctr32 +#else +.word -1 +#endif + +.globl _ChaCha20_ctr32 +.private_extern _ChaCha20_ctr32 +#ifdef __thumb2__ +.thumb_func _ChaCha20_ctr32 +#endif +.align 5 +_ChaCha20_ctr32: +LChaCha20_ctr32: + ldr r12,[sp,#0] @ pull pointer to counter and nonce + stmdb sp!,{r0,r1,r2,r4-r11,lr} +#if __ARM_ARCH__<7 && !defined(__thumb2__) + sub r14,pc,#16 @ _ChaCha20_ctr32 +#else + adr r14,LChaCha20_ctr32 +#endif + cmp r2,#0 @ len==0? +#ifdef __thumb2__ + itt eq +#endif + addeq sp,sp,#4*3 + beq Lno_data +#if __ARM_MAX_ARCH__>=7 + cmp r2,#192 @ test len + bls Lshort + ldr r4,[r14,#-32] + ldr r4,[r14,r4] +# ifdef __APPLE__ + ldr r4,[r4] +# endif + tst r4,#ARMV7_NEON + bne LChaCha20_neon +Lshort: +#endif + ldmia r12,{r4,r5,r6,r7} @ load counter and nonce + sub sp,sp,#4*(16) @ off-load area + sub r14,r14,#64 @ Lsigma + stmdb sp!,{r4,r5,r6,r7} @ copy counter and nonce + ldmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} @ load key + ldmia r14,{r0,r1,r2,r3} @ load sigma + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} @ copy key + stmdb sp!,{r0,r1,r2,r3} @ copy sigma + str r10,[sp,#4*(16+10)] @ off-load "rx" + str r11,[sp,#4*(16+11)] @ off-load "rx" + b Loop_outer_enter + +.align 4 +Loop_outer: + ldmia sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ load key material + str r11,[sp,#4*(32+2)] @ save len + str r12, [sp,#4*(32+1)] @ save inp + str r14, [sp,#4*(32+0)] @ save out +Loop_outer_enter: + ldr r11, [sp,#4*(15)] + ldr r12,[sp,#4*(12)] @ modulo-scheduled load + ldr r10, [sp,#4*(13)] + ldr r14,[sp,#4*(14)] + str r11, [sp,#4*(16+15)] + mov r11,#10 + b Loop + +.align 4 +Loop: + subs r11,r11,#1 + add r0,r0,r4 + mov r12,r12,ror#16 + add r1,r1,r5 + mov r10,r10,ror#16 + eor r12,r12,r0,ror#16 + eor r10,r10,r1,ror#16 + add r8,r8,r12 + mov r4,r4,ror#20 + add r9,r9,r10 + mov r5,r5,ror#20 + eor r4,r4,r8,ror#20 + eor r5,r5,r9,ror#20 + add r0,r0,r4 + mov r12,r12,ror#24 + add r1,r1,r5 + mov r10,r10,ror#24 + eor r12,r12,r0,ror#24 + eor r10,r10,r1,ror#24 + add r8,r8,r12 + mov r4,r4,ror#25 + add r9,r9,r10 + mov r5,r5,ror#25 + str r10,[sp,#4*(16+13)] + ldr r10,[sp,#4*(16+15)] + eor r4,r4,r8,ror#25 + eor r5,r5,r9,ror#25 + str r8,[sp,#4*(16+8)] + ldr r8,[sp,#4*(16+10)] + add r2,r2,r6 + mov r14,r14,ror#16 + str r9,[sp,#4*(16+9)] + ldr r9,[sp,#4*(16+11)] + add r3,r3,r7 + mov r10,r10,ror#16 + eor r14,r14,r2,ror#16 + eor r10,r10,r3,ror#16 + add r8,r8,r14 + mov r6,r6,ror#20 + add r9,r9,r10 + mov r7,r7,ror#20 + eor r6,r6,r8,ror#20 + eor r7,r7,r9,ror#20 + add r2,r2,r6 + mov r14,r14,ror#24 + add r3,r3,r7 + mov r10,r10,ror#24 + eor r14,r14,r2,ror#24 + eor r10,r10,r3,ror#24 + add r8,r8,r14 + mov r6,r6,ror#25 + add r9,r9,r10 + mov r7,r7,ror#25 + eor r6,r6,r8,ror#25 + eor r7,r7,r9,ror#25 + add r0,r0,r5 + mov r10,r10,ror#16 + add r1,r1,r6 + mov r12,r12,ror#16 + eor r10,r10,r0,ror#16 + eor r12,r12,r1,ror#16 + add r8,r8,r10 + mov r5,r5,ror#20 + add r9,r9,r12 + mov r6,r6,ror#20 + eor r5,r5,r8,ror#20 + eor r6,r6,r9,ror#20 + add r0,r0,r5 + mov r10,r10,ror#24 + add r1,r1,r6 + mov r12,r12,ror#24 + eor r10,r10,r0,ror#24 + eor r12,r12,r1,ror#24 + add r8,r8,r10 + mov r5,r5,ror#25 + str r10,[sp,#4*(16+15)] + ldr r10,[sp,#4*(16+13)] + add r9,r9,r12 + mov r6,r6,ror#25 + eor 
r5,r5,r8,ror#25 + eor r6,r6,r9,ror#25 + str r8,[sp,#4*(16+10)] + ldr r8,[sp,#4*(16+8)] + add r2,r2,r7 + mov r10,r10,ror#16 + str r9,[sp,#4*(16+11)] + ldr r9,[sp,#4*(16+9)] + add r3,r3,r4 + mov r14,r14,ror#16 + eor r10,r10,r2,ror#16 + eor r14,r14,r3,ror#16 + add r8,r8,r10 + mov r7,r7,ror#20 + add r9,r9,r14 + mov r4,r4,ror#20 + eor r7,r7,r8,ror#20 + eor r4,r4,r9,ror#20 + add r2,r2,r7 + mov r10,r10,ror#24 + add r3,r3,r4 + mov r14,r14,ror#24 + eor r10,r10,r2,ror#24 + eor r14,r14,r3,ror#24 + add r8,r8,r10 + mov r7,r7,ror#25 + add r9,r9,r14 + mov r4,r4,ror#25 + eor r7,r7,r8,ror#25 + eor r4,r4,r9,ror#25 + bne Loop + + ldr r11,[sp,#4*(32+2)] @ load len + + str r8, [sp,#4*(16+8)] @ modulo-scheduled store + str r9, [sp,#4*(16+9)] + str r12,[sp,#4*(16+12)] + str r10, [sp,#4*(16+13)] + str r14,[sp,#4*(16+14)] + + @ at this point we have first half of 512-bit result in + @ rx and second half at sp+4*(16+8) + + cmp r11,#64 @ done yet? +#ifdef __thumb2__ + itete lo +#endif + addlo r12,sp,#4*(0) @ shortcut or ... + ldrhs r12,[sp,#4*(32+1)] @ ... load inp + addlo r14,sp,#4*(0) @ shortcut or ... + ldrhs r14,[sp,#4*(32+0)] @ ... load out + + ldr r8,[sp,#4*(0)] @ load key material + ldr r9,[sp,#4*(1)] + +#if __ARM_ARCH__>=6 || !defined(__ARMEB__) +# if __ARM_ARCH__<7 + orr r10,r12,r14 + tst r10,#3 @ are input and output aligned? + ldr r10,[sp,#4*(2)] + bne Lunaligned + cmp r11,#64 @ restore flags +# else + ldr r10,[sp,#4*(2)] +# endif + ldr r11,[sp,#4*(3)] + + add r0,r0,r8 @ accumulate key material + add r1,r1,r9 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r8,[r12],#16 @ load input + ldrhs r9,[r12,#-12] + + add r2,r2,r10 + add r3,r3,r11 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r10,[r12,#-8] + ldrhs r11,[r12,#-4] +# if __ARM_ARCH__>=6 && defined(__ARMEB__) + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs r0,r0,r8 @ xor with input + eorhs r1,r1,r9 + add r8,sp,#4*(4) + str r0,[r14],#16 @ store output +# ifdef __thumb2__ + itt hs +# endif + eorhs r2,r2,r10 + eorhs r3,r3,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + str r1,[r14,#-12] + str r2,[r14,#-8] + str r3,[r14,#-4] + + add r4,r4,r8 @ accumulate key material + add r5,r5,r9 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r8,[r12],#16 @ load input + ldrhs r9,[r12,#-12] + add r6,r6,r10 + add r7,r7,r11 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r10,[r12,#-8] + ldrhs r11,[r12,#-4] +# if __ARM_ARCH__>=6 && defined(__ARMEB__) + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs r4,r4,r8 + eorhs r5,r5,r9 + add r8,sp,#4*(8) + str r4,[r14],#16 @ store output +# ifdef __thumb2__ + itt hs +# endif + eorhs r6,r6,r10 + eorhs r7,r7,r11 + str r5,[r14,#-12] + ldmia r8,{r8,r9,r10,r11} @ load key material + str r6,[r14,#-8] + add r0,sp,#4*(16+8) + str r7,[r14,#-4] + + ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half + + add r0,r0,r8 @ accumulate key material + add r1,r1,r9 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r8,[r12],#16 @ load input + ldrhs r9,[r12,#-12] +# ifdef __thumb2__ + itt hi +# endif + strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it + strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it + add r2,r2,r10 + add r3,r3,r11 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r10,[r12,#-8] + ldrhs r11,[r12,#-4] +# if __ARM_ARCH__>=6 && defined(__ARMEB__) + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs r0,r0,r8 + eorhs r1,r1,r9 + add r8,sp,#4*(12) + str r0,[r14],#16 @ store output +# ifdef 
__thumb2__ + itt hs +# endif + eorhs r2,r2,r10 + eorhs r3,r3,r11 + str r1,[r14,#-12] + ldmia r8,{r8,r9,r10,r11} @ load key material + str r2,[r14,#-8] + str r3,[r14,#-4] + + add r4,r4,r8 @ accumulate key material + add r5,r5,r9 +# ifdef __thumb2__ + itt hi +# endif + addhi r8,r8,#1 @ next counter value + strhi r8,[sp,#4*(12)] @ save next counter value +# ifdef __thumb2__ + itt hs +# endif + ldrhs r8,[r12],#16 @ load input + ldrhs r9,[r12,#-12] + add r6,r6,r10 + add r7,r7,r11 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r10,[r12,#-8] + ldrhs r11,[r12,#-4] +# if __ARM_ARCH__>=6 && defined(__ARMEB__) + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs r4,r4,r8 + eorhs r5,r5,r9 +# ifdef __thumb2__ + it ne +# endif + ldrne r8,[sp,#4*(32+2)] @ re-load len +# ifdef __thumb2__ + itt hs +# endif + eorhs r6,r6,r10 + eorhs r7,r7,r11 + str r4,[r14],#16 @ store output + str r5,[r14,#-12] +# ifdef __thumb2__ + it hs +# endif + subhs r11,r8,#64 @ len-=64 + str r6,[r14,#-8] + str r7,[r14,#-4] + bhi Loop_outer + + beq Ldone +# if __ARM_ARCH__<7 + b Ltail + +.align 4 +Lunaligned:@ unaligned endian-neutral path + cmp r11,#64 @ restore flags +# endif +#endif +#if __ARM_ARCH__<7 + ldr r11,[sp,#4*(3)] + add r0,r0,r8 @ accumulate key material + add r1,r1,r9 + add r2,r2,r10 +# ifdef __thumb2__ + itete lo +# endif + eorlo r8,r8,r8 @ zero or ... + ldrhsb r8,[r12],#16 @ ... load input + eorlo r9,r9,r9 + ldrhsb r9,[r12,#-12] + + add r3,r3,r11 +# ifdef __thumb2__ + itete lo +# endif + eorlo r10,r10,r10 + ldrhsb r10,[r12,#-8] + eorlo r11,r11,r11 + ldrhsb r11,[r12,#-4] + + eor r0,r8,r0 @ xor with input (or zero) + eor r1,r9,r1 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-15] @ load more input + ldrhsb r9,[r12,#-11] + eor r2,r10,r2 + strb r0,[r14],#16 @ store output + eor r3,r11,r3 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-7] + ldrhsb r11,[r12,#-3] + strb r1,[r14,#-12] + eor r0,r8,r0,lsr#8 + strb r2,[r14,#-8] + eor r1,r9,r1,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-14] @ load more input + ldrhsb r9,[r12,#-10] + strb r3,[r14,#-4] + eor r2,r10,r2,lsr#8 + strb r0,[r14,#-15] + eor r3,r11,r3,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-6] + ldrhsb r11,[r12,#-2] + strb r1,[r14,#-11] + eor r0,r8,r0,lsr#8 + strb r2,[r14,#-7] + eor r1,r9,r1,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-13] @ load more input + ldrhsb r9,[r12,#-9] + strb r3,[r14,#-3] + eor r2,r10,r2,lsr#8 + strb r0,[r14,#-14] + eor r3,r11,r3,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-5] + ldrhsb r11,[r12,#-1] + strb r1,[r14,#-10] + strb r2,[r14,#-6] + eor r0,r8,r0,lsr#8 + strb r3,[r14,#-2] + eor r1,r9,r1,lsr#8 + strb r0,[r14,#-13] + eor r2,r10,r2,lsr#8 + strb r1,[r14,#-9] + eor r3,r11,r3,lsr#8 + strb r2,[r14,#-5] + strb r3,[r14,#-1] + add r8,sp,#4*(4+0) + ldmia r8,{r8,r9,r10,r11} @ load key material + add r0,sp,#4*(16+8) + add r4,r4,r8 @ accumulate key material + add r5,r5,r9 + add r6,r6,r10 +# ifdef __thumb2__ + itete lo +# endif + eorlo r8,r8,r8 @ zero or ... + ldrhsb r8,[r12],#16 @ ... 
load input + eorlo r9,r9,r9 + ldrhsb r9,[r12,#-12] + + add r7,r7,r11 +# ifdef __thumb2__ + itete lo +# endif + eorlo r10,r10,r10 + ldrhsb r10,[r12,#-8] + eorlo r11,r11,r11 + ldrhsb r11,[r12,#-4] + + eor r4,r8,r4 @ xor with input (or zero) + eor r5,r9,r5 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-15] @ load more input + ldrhsb r9,[r12,#-11] + eor r6,r10,r6 + strb r4,[r14],#16 @ store output + eor r7,r11,r7 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-7] + ldrhsb r11,[r12,#-3] + strb r5,[r14,#-12] + eor r4,r8,r4,lsr#8 + strb r6,[r14,#-8] + eor r5,r9,r5,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-14] @ load more input + ldrhsb r9,[r12,#-10] + strb r7,[r14,#-4] + eor r6,r10,r6,lsr#8 + strb r4,[r14,#-15] + eor r7,r11,r7,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-6] + ldrhsb r11,[r12,#-2] + strb r5,[r14,#-11] + eor r4,r8,r4,lsr#8 + strb r6,[r14,#-7] + eor r5,r9,r5,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-13] @ load more input + ldrhsb r9,[r12,#-9] + strb r7,[r14,#-3] + eor r6,r10,r6,lsr#8 + strb r4,[r14,#-14] + eor r7,r11,r7,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-5] + ldrhsb r11,[r12,#-1] + strb r5,[r14,#-10] + strb r6,[r14,#-6] + eor r4,r8,r4,lsr#8 + strb r7,[r14,#-2] + eor r5,r9,r5,lsr#8 + strb r4,[r14,#-13] + eor r6,r10,r6,lsr#8 + strb r5,[r14,#-9] + eor r7,r11,r7,lsr#8 + strb r6,[r14,#-5] + strb r7,[r14,#-1] + add r8,sp,#4*(4+4) + ldmia r8,{r8,r9,r10,r11} @ load key material + ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half +# ifdef __thumb2__ + itt hi +# endif + strhi r10,[sp,#4*(16+10)] @ copy "rx" + strhi r11,[sp,#4*(16+11)] @ copy "rx" + add r0,r0,r8 @ accumulate key material + add r1,r1,r9 + add r2,r2,r10 +# ifdef __thumb2__ + itete lo +# endif + eorlo r8,r8,r8 @ zero or ... + ldrhsb r8,[r12],#16 @ ... 
load input + eorlo r9,r9,r9 + ldrhsb r9,[r12,#-12] + + add r3,r3,r11 +# ifdef __thumb2__ + itete lo +# endif + eorlo r10,r10,r10 + ldrhsb r10,[r12,#-8] + eorlo r11,r11,r11 + ldrhsb r11,[r12,#-4] + + eor r0,r8,r0 @ xor with input (or zero) + eor r1,r9,r1 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-15] @ load more input + ldrhsb r9,[r12,#-11] + eor r2,r10,r2 + strb r0,[r14],#16 @ store output + eor r3,r11,r3 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-7] + ldrhsb r11,[r12,#-3] + strb r1,[r14,#-12] + eor r0,r8,r0,lsr#8 + strb r2,[r14,#-8] + eor r1,r9,r1,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-14] @ load more input + ldrhsb r9,[r12,#-10] + strb r3,[r14,#-4] + eor r2,r10,r2,lsr#8 + strb r0,[r14,#-15] + eor r3,r11,r3,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-6] + ldrhsb r11,[r12,#-2] + strb r1,[r14,#-11] + eor r0,r8,r0,lsr#8 + strb r2,[r14,#-7] + eor r1,r9,r1,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-13] @ load more input + ldrhsb r9,[r12,#-9] + strb r3,[r14,#-3] + eor r2,r10,r2,lsr#8 + strb r0,[r14,#-14] + eor r3,r11,r3,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-5] + ldrhsb r11,[r12,#-1] + strb r1,[r14,#-10] + strb r2,[r14,#-6] + eor r0,r8,r0,lsr#8 + strb r3,[r14,#-2] + eor r1,r9,r1,lsr#8 + strb r0,[r14,#-13] + eor r2,r10,r2,lsr#8 + strb r1,[r14,#-9] + eor r3,r11,r3,lsr#8 + strb r2,[r14,#-5] + strb r3,[r14,#-1] + add r8,sp,#4*(4+8) + ldmia r8,{r8,r9,r10,r11} @ load key material + add r4,r4,r8 @ accumulate key material +# ifdef __thumb2__ + itt hi +# endif + addhi r8,r8,#1 @ next counter value + strhi r8,[sp,#4*(12)] @ save next counter value + add r5,r5,r9 + add r6,r6,r10 +# ifdef __thumb2__ + itete lo +# endif + eorlo r8,r8,r8 @ zero or ... + ldrhsb r8,[r12],#16 @ ... 
load input + eorlo r9,r9,r9 + ldrhsb r9,[r12,#-12] + + add r7,r7,r11 +# ifdef __thumb2__ + itete lo +# endif + eorlo r10,r10,r10 + ldrhsb r10,[r12,#-8] + eorlo r11,r11,r11 + ldrhsb r11,[r12,#-4] + + eor r4,r8,r4 @ xor with input (or zero) + eor r5,r9,r5 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-15] @ load more input + ldrhsb r9,[r12,#-11] + eor r6,r10,r6 + strb r4,[r14],#16 @ store output + eor r7,r11,r7 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-7] + ldrhsb r11,[r12,#-3] + strb r5,[r14,#-12] + eor r4,r8,r4,lsr#8 + strb r6,[r14,#-8] + eor r5,r9,r5,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-14] @ load more input + ldrhsb r9,[r12,#-10] + strb r7,[r14,#-4] + eor r6,r10,r6,lsr#8 + strb r4,[r14,#-15] + eor r7,r11,r7,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-6] + ldrhsb r11,[r12,#-2] + strb r5,[r14,#-11] + eor r4,r8,r4,lsr#8 + strb r6,[r14,#-7] + eor r5,r9,r5,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-13] @ load more input + ldrhsb r9,[r12,#-9] + strb r7,[r14,#-3] + eor r6,r10,r6,lsr#8 + strb r4,[r14,#-14] + eor r7,r11,r7,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-5] + ldrhsb r11,[r12,#-1] + strb r5,[r14,#-10] + strb r6,[r14,#-6] + eor r4,r8,r4,lsr#8 + strb r7,[r14,#-2] + eor r5,r9,r5,lsr#8 + strb r4,[r14,#-13] + eor r6,r10,r6,lsr#8 + strb r5,[r14,#-9] + eor r7,r11,r7,lsr#8 + strb r6,[r14,#-5] + strb r7,[r14,#-1] +# ifdef __thumb2__ + it ne +# endif + ldrne r8,[sp,#4*(32+2)] @ re-load len +# ifdef __thumb2__ + it hs +# endif + subhs r11,r8,#64 @ len-=64 + bhi Loop_outer + + beq Ldone +#endif + +Ltail: + ldr r12,[sp,#4*(32+1)] @ load inp + add r9,sp,#4*(0) + ldr r14,[sp,#4*(32+0)] @ load out + +Loop_tail: + ldrb r10,[r9],#1 @ read buffer on stack + ldrb r11,[r12],#1 @ read input + subs r8,r8,#1 + eor r11,r11,r10 + strb r11,[r14],#1 @ store output + bne Loop_tail + +Ldone: + add sp,sp,#4*(32+3) +Lno_data: + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} + +#if __ARM_MAX_ARCH__>=7 + + + +#ifdef __thumb2__ +.thumb_func ChaCha20_neon +#endif +.align 5 +ChaCha20_neon: + ldr r12,[sp,#0] @ pull pointer to counter and nonce + stmdb sp!,{r0,r1,r2,r4-r11,lr} +LChaCha20_neon: + adr r14,Lsigma + vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI spec says so + stmdb sp!,{r0,r1,r2,r3} + + vld1.32 {q1,q2},[r3] @ load key + ldmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} @ load key + + sub sp,sp,#4*(16+16) + vld1.32 {q3},[r12] @ load counter and nonce + add r12,sp,#4*8 + ldmia r14,{r0,r1,r2,r3} @ load sigma + vld1.32 {q0},[r14]! 
@ load sigma + vld1.32 {q12},[r14] @ one + vst1.32 {q2,q3},[r12] @ copy 1/2key|counter|nonce + vst1.32 {q0,q1},[sp] @ copy sigma|1/2key + + str r10,[sp,#4*(16+10)] @ off-load "rx" + str r11,[sp,#4*(16+11)] @ off-load "rx" + vshl.i32 d26,d24,#1 @ two + vstr d24,[sp,#4*(16+0)] + vshl.i32 d28,d24,#2 @ four + vstr d26,[sp,#4*(16+2)] + vmov q4,q0 + vstr d28,[sp,#4*(16+4)] + vmov q8,q0 + vmov q5,q1 + vmov q9,q1 + b Loop_neon_enter + +.align 4 +Loop_neon_outer: + ldmia sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ load key material + cmp r11,#64*2 @ if len<=64*2 + bls Lbreak_neon @ switch to integer-only + vmov q4,q0 + str r11,[sp,#4*(32+2)] @ save len + vmov q8,q0 + str r12, [sp,#4*(32+1)] @ save inp + vmov q5,q1 + str r14, [sp,#4*(32+0)] @ save out + vmov q9,q1 +Loop_neon_enter: + ldr r11, [sp,#4*(15)] + vadd.i32 q7,q3,q12 @ counter+1 + ldr r12,[sp,#4*(12)] @ modulo-scheduled load + vmov q6,q2 + ldr r10, [sp,#4*(13)] + vmov q10,q2 + ldr r14,[sp,#4*(14)] + vadd.i32 q11,q7,q12 @ counter+2 + str r11, [sp,#4*(16+15)] + mov r11,#10 + add r12,r12,#3 @ counter+3 + b Loop_neon + +.align 4 +Loop_neon: + subs r11,r11,#1 + vadd.i32 q0,q0,q1 + add r0,r0,r4 + vadd.i32 q4,q4,q5 + mov r12,r12,ror#16 + vadd.i32 q8,q8,q9 + add r1,r1,r5 + veor q3,q3,q0 + mov r10,r10,ror#16 + veor q7,q7,q4 + eor r12,r12,r0,ror#16 + veor q11,q11,q8 + eor r10,r10,r1,ror#16 + vrev32.16 q3,q3 + add r8,r8,r12 + vrev32.16 q7,q7 + mov r4,r4,ror#20 + vrev32.16 q11,q11 + add r9,r9,r10 + vadd.i32 q2,q2,q3 + mov r5,r5,ror#20 + vadd.i32 q6,q6,q7 + eor r4,r4,r8,ror#20 + vadd.i32 q10,q10,q11 + eor r5,r5,r9,ror#20 + veor q12,q1,q2 + add r0,r0,r4 + veor q13,q5,q6 + mov r12,r12,ror#24 + veor q14,q9,q10 + add r1,r1,r5 + vshr.u32 q1,q12,#20 + mov r10,r10,ror#24 + vshr.u32 q5,q13,#20 + eor r12,r12,r0,ror#24 + vshr.u32 q9,q14,#20 + eor r10,r10,r1,ror#24 + vsli.32 q1,q12,#12 + add r8,r8,r12 + vsli.32 q5,q13,#12 + mov r4,r4,ror#25 + vsli.32 q9,q14,#12 + add r9,r9,r10 + vadd.i32 q0,q0,q1 + mov r5,r5,ror#25 + vadd.i32 q4,q4,q5 + str r10,[sp,#4*(16+13)] + vadd.i32 q8,q8,q9 + ldr r10,[sp,#4*(16+15)] + veor q12,q3,q0 + eor r4,r4,r8,ror#25 + veor q13,q7,q4 + eor r5,r5,r9,ror#25 + veor q14,q11,q8 + str r8,[sp,#4*(16+8)] + vshr.u32 q3,q12,#24 + ldr r8,[sp,#4*(16+10)] + vshr.u32 q7,q13,#24 + add r2,r2,r6 + vshr.u32 q11,q14,#24 + mov r14,r14,ror#16 + vsli.32 q3,q12,#8 + str r9,[sp,#4*(16+9)] + vsli.32 q7,q13,#8 + ldr r9,[sp,#4*(16+11)] + vsli.32 q11,q14,#8 + add r3,r3,r7 + vadd.i32 q2,q2,q3 + mov r10,r10,ror#16 + vadd.i32 q6,q6,q7 + eor r14,r14,r2,ror#16 + vadd.i32 q10,q10,q11 + eor r10,r10,r3,ror#16 + veor q12,q1,q2 + add r8,r8,r14 + veor q13,q5,q6 + mov r6,r6,ror#20 + veor q14,q9,q10 + add r9,r9,r10 + vshr.u32 q1,q12,#25 + mov r7,r7,ror#20 + vshr.u32 q5,q13,#25 + eor r6,r6,r8,ror#20 + vshr.u32 q9,q14,#25 + eor r7,r7,r9,ror#20 + vsli.32 q1,q12,#7 + add r2,r2,r6 + vsli.32 q5,q13,#7 + mov r14,r14,ror#24 + vsli.32 q9,q14,#7 + add r3,r3,r7 + vext.8 q2,q2,q2,#8 + mov r10,r10,ror#24 + vext.8 q6,q6,q6,#8 + eor r14,r14,r2,ror#24 + vext.8 q10,q10,q10,#8 + eor r10,r10,r3,ror#24 + vext.8 q1,q1,q1,#4 + add r8,r8,r14 + vext.8 q5,q5,q5,#4 + mov r6,r6,ror#25 + vext.8 q9,q9,q9,#4 + add r9,r9,r10 + vext.8 q3,q3,q3,#12 + mov r7,r7,ror#25 + vext.8 q7,q7,q7,#12 + eor r6,r6,r8,ror#25 + vext.8 q11,q11,q11,#12 + eor r7,r7,r9,ror#25 + vadd.i32 q0,q0,q1 + add r0,r0,r5 + vadd.i32 q4,q4,q5 + mov r10,r10,ror#16 + vadd.i32 q8,q8,q9 + add r1,r1,r6 + veor q3,q3,q0 + mov r12,r12,ror#16 + veor q7,q7,q4 + eor r10,r10,r0,ror#16 + veor q11,q11,q8 + eor r12,r12,r1,ror#16 + vrev32.16 q3,q3 + add r8,r8,r10 + 
vrev32.16 q7,q7 + mov r5,r5,ror#20 + vrev32.16 q11,q11 + add r9,r9,r12 + vadd.i32 q2,q2,q3 + mov r6,r6,ror#20 + vadd.i32 q6,q6,q7 + eor r5,r5,r8,ror#20 + vadd.i32 q10,q10,q11 + eor r6,r6,r9,ror#20 + veor q12,q1,q2 + add r0,r0,r5 + veor q13,q5,q6 + mov r10,r10,ror#24 + veor q14,q9,q10 + add r1,r1,r6 + vshr.u32 q1,q12,#20 + mov r12,r12,ror#24 + vshr.u32 q5,q13,#20 + eor r10,r10,r0,ror#24 + vshr.u32 q9,q14,#20 + eor r12,r12,r1,ror#24 + vsli.32 q1,q12,#12 + add r8,r8,r10 + vsli.32 q5,q13,#12 + mov r5,r5,ror#25 + vsli.32 q9,q14,#12 + str r10,[sp,#4*(16+15)] + vadd.i32 q0,q0,q1 + ldr r10,[sp,#4*(16+13)] + vadd.i32 q4,q4,q5 + add r9,r9,r12 + vadd.i32 q8,q8,q9 + mov r6,r6,ror#25 + veor q12,q3,q0 + eor r5,r5,r8,ror#25 + veor q13,q7,q4 + eor r6,r6,r9,ror#25 + veor q14,q11,q8 + str r8,[sp,#4*(16+10)] + vshr.u32 q3,q12,#24 + ldr r8,[sp,#4*(16+8)] + vshr.u32 q7,q13,#24 + add r2,r2,r7 + vshr.u32 q11,q14,#24 + mov r10,r10,ror#16 + vsli.32 q3,q12,#8 + str r9,[sp,#4*(16+11)] + vsli.32 q7,q13,#8 + ldr r9,[sp,#4*(16+9)] + vsli.32 q11,q14,#8 + add r3,r3,r4 + vadd.i32 q2,q2,q3 + mov r14,r14,ror#16 + vadd.i32 q6,q6,q7 + eor r10,r10,r2,ror#16 + vadd.i32 q10,q10,q11 + eor r14,r14,r3,ror#16 + veor q12,q1,q2 + add r8,r8,r10 + veor q13,q5,q6 + mov r7,r7,ror#20 + veor q14,q9,q10 + add r9,r9,r14 + vshr.u32 q1,q12,#25 + mov r4,r4,ror#20 + vshr.u32 q5,q13,#25 + eor r7,r7,r8,ror#20 + vshr.u32 q9,q14,#25 + eor r4,r4,r9,ror#20 + vsli.32 q1,q12,#7 + add r2,r2,r7 + vsli.32 q5,q13,#7 + mov r10,r10,ror#24 + vsli.32 q9,q14,#7 + add r3,r3,r4 + vext.8 q2,q2,q2,#8 + mov r14,r14,ror#24 + vext.8 q6,q6,q6,#8 + eor r10,r10,r2,ror#24 + vext.8 q10,q10,q10,#8 + eor r14,r14,r3,ror#24 + vext.8 q1,q1,q1,#12 + add r8,r8,r10 + vext.8 q5,q5,q5,#12 + mov r7,r7,ror#25 + vext.8 q9,q9,q9,#12 + add r9,r9,r14 + vext.8 q3,q3,q3,#4 + mov r4,r4,ror#25 + vext.8 q7,q7,q7,#4 + eor r7,r7,r8,ror#25 + vext.8 q11,q11,q11,#4 + eor r4,r4,r9,ror#25 + bne Loop_neon + + add r11,sp,#32 + vld1.32 {q12,q13},[sp] @ load key material + vld1.32 {q14,q15},[r11] + + ldr r11,[sp,#4*(32+2)] @ load len + + str r8, [sp,#4*(16+8)] @ modulo-scheduled store + str r9, [sp,#4*(16+9)] + str r12,[sp,#4*(16+12)] + str r10, [sp,#4*(16+13)] + str r14,[sp,#4*(16+14)] + + @ at this point we have first half of 512-bit result in + @ rx and second half at sp+4*(16+8) + + ldr r12,[sp,#4*(32+1)] @ load inp + ldr r14,[sp,#4*(32+0)] @ load out + + vadd.i32 q0,q0,q12 @ accumulate key material + vadd.i32 q4,q4,q12 + vadd.i32 q8,q8,q12 + vldr d24,[sp,#4*(16+0)] @ one + + vadd.i32 q1,q1,q13 + vadd.i32 q5,q5,q13 + vadd.i32 q9,q9,q13 + vldr d26,[sp,#4*(16+2)] @ two + + vadd.i32 q2,q2,q14 + vadd.i32 q6,q6,q14 + vadd.i32 q10,q10,q14 + vadd.i32 d14,d14,d24 @ counter+1 + vadd.i32 d22,d22,d26 @ counter+2 + + vadd.i32 q3,q3,q15 + vadd.i32 q7,q7,q15 + vadd.i32 q11,q11,q15 + + cmp r11,#64*4 + blo Ltail_neon + + vld1.8 {q12,q13},[r12]! @ load input + mov r11,sp + vld1.8 {q14,q15},[r12]! + veor q0,q0,q12 @ xor with input + veor q1,q1,q13 + vld1.8 {q12,q13},[r12]! + veor q2,q2,q14 + veor q3,q3,q15 + vld1.8 {q14,q15},[r12]! + + veor q4,q4,q12 + vst1.8 {q0,q1},[r14]! @ store output + veor q5,q5,q13 + vld1.8 {q12,q13},[r12]! + veor q6,q6,q14 + vst1.8 {q2,q3},[r14]! + veor q7,q7,q15 + vld1.8 {q14,q15},[r12]! + + veor q8,q8,q12 + vld1.32 {q0,q1},[r11]! @ load for next iteration + veor d25,d25,d25 + vldr d24,[sp,#4*(16+4)] @ four + veor q9,q9,q13 + vld1.32 {q2,q3},[r11] + veor q10,q10,q14 + vst1.8 {q4,q5},[r14]! + veor q11,q11,q15 + vst1.8 {q6,q7},[r14]! 
+ + vadd.i32 d6,d6,d24 @ next counter value + vldr d24,[sp,#4*(16+0)] @ one + + ldmia sp,{r8,r9,r10,r11} @ load key material + add r0,r0,r8 @ accumulate key material + ldr r8,[r12],#16 @ load input + vst1.8 {q8,q9},[r14]! + add r1,r1,r9 + ldr r9,[r12,#-12] + vst1.8 {q10,q11},[r14]! + add r2,r2,r10 + ldr r10,[r12,#-8] + add r3,r3,r11 + ldr r11,[r12,#-4] +# ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +# endif + eor r0,r0,r8 @ xor with input + add r8,sp,#4*(4) + eor r1,r1,r9 + str r0,[r14],#16 @ store output + eor r2,r2,r10 + str r1,[r14,#-12] + eor r3,r3,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + str r2,[r14,#-8] + str r3,[r14,#-4] + + add r4,r4,r8 @ accumulate key material + ldr r8,[r12],#16 @ load input + add r5,r5,r9 + ldr r9,[r12,#-12] + add r6,r6,r10 + ldr r10,[r12,#-8] + add r7,r7,r11 + ldr r11,[r12,#-4] +# ifdef __ARMEB__ + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif + eor r4,r4,r8 + add r8,sp,#4*(8) + eor r5,r5,r9 + str r4,[r14],#16 @ store output + eor r6,r6,r10 + str r5,[r14,#-12] + eor r7,r7,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + str r6,[r14,#-8] + add r0,sp,#4*(16+8) + str r7,[r14,#-4] + + ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half + + add r0,r0,r8 @ accumulate key material + ldr r8,[r12],#16 @ load input + add r1,r1,r9 + ldr r9,[r12,#-12] +# ifdef __thumb2__ + it hi +# endif + strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it + add r2,r2,r10 + ldr r10,[r12,#-8] +# ifdef __thumb2__ + it hi +# endif + strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it + add r3,r3,r11 + ldr r11,[r12,#-4] +# ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +# endif + eor r0,r0,r8 + add r8,sp,#4*(12) + eor r1,r1,r9 + str r0,[r14],#16 @ store output + eor r2,r2,r10 + str r1,[r14,#-12] + eor r3,r3,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + str r2,[r14,#-8] + str r3,[r14,#-4] + + add r4,r4,r8 @ accumulate key material + add r8,r8,#4 @ next counter value + add r5,r5,r9 + str r8,[sp,#4*(12)] @ save next counter value + ldr r8,[r12],#16 @ load input + add r6,r6,r10 + add r4,r4,#3 @ counter+3 + ldr r9,[r12,#-12] + add r7,r7,r11 + ldr r10,[r12,#-8] + ldr r11,[r12,#-4] +# ifdef __ARMEB__ + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif + eor r4,r4,r8 +# ifdef __thumb2__ + it hi +# endif + ldrhi r8,[sp,#4*(32+2)] @ re-load len + eor r5,r5,r9 + eor r6,r6,r10 + str r4,[r14],#16 @ store output + eor r7,r7,r11 + str r5,[r14,#-12] + sub r11,r8,#64*4 @ len-=64*4 + str r6,[r14,#-8] + str r7,[r14,#-4] + bhi Loop_neon_outer + + b Ldone_neon + +.align 4 +Lbreak_neon: + @ harmonize NEON and integer-only stack frames: load data + @ from NEON frame, but save to integer-only one; distance + @ between the two is 4*(32+4+16-32)=4*(20). + + str r11, [sp,#4*(20+32+2)] @ save len + add r11,sp,#4*(32+4) + str r12, [sp,#4*(20+32+1)] @ save inp + str r14, [sp,#4*(20+32+0)] @ save out + + ldr r12,[sp,#4*(16+10)] + ldr r14,[sp,#4*(16+11)] + vldmia r11,{d8,d9,d10,d11,d12,d13,d14,d15} @ fulfill ABI requirement + str r12,[sp,#4*(20+16+10)] @ copy "rx" + str r14,[sp,#4*(20+16+11)] @ copy "rx" + + ldr r11, [sp,#4*(15)] + ldr r12,[sp,#4*(12)] @ modulo-scheduled load + ldr r10, [sp,#4*(13)] + ldr r14,[sp,#4*(14)] + str r11, [sp,#4*(20+16+15)] + add r11,sp,#4*(20) + vst1.32 {q0,q1},[r11]! 
@ copy key + add sp,sp,#4*(20) @ switch frame + vst1.32 {q2,q3},[r11] + mov r11,#10 + b Loop @ go integer-only + +.align 4 +Ltail_neon: + cmp r11,#64*3 + bhs L192_or_more_neon + cmp r11,#64*2 + bhs L128_or_more_neon + cmp r11,#64*1 + bhs L64_or_more_neon + + add r8,sp,#4*(8) + vst1.8 {q0,q1},[sp] + add r10,sp,#4*(0) + vst1.8 {q2,q3},[r8] + b Loop_tail_neon + +.align 4 +L64_or_more_neon: + vld1.8 {q12,q13},[r12]! + vld1.8 {q14,q15},[r12]! + veor q0,q0,q12 + veor q1,q1,q13 + veor q2,q2,q14 + veor q3,q3,q15 + vst1.8 {q0,q1},[r14]! + vst1.8 {q2,q3},[r14]! + + beq Ldone_neon + + add r8,sp,#4*(8) + vst1.8 {q4,q5},[sp] + add r10,sp,#4*(0) + vst1.8 {q6,q7},[r8] + sub r11,r11,#64*1 @ len-=64*1 + b Loop_tail_neon + +.align 4 +L128_or_more_neon: + vld1.8 {q12,q13},[r12]! + vld1.8 {q14,q15},[r12]! + veor q0,q0,q12 + veor q1,q1,q13 + vld1.8 {q12,q13},[r12]! + veor q2,q2,q14 + veor q3,q3,q15 + vld1.8 {q14,q15},[r12]! + + veor q4,q4,q12 + veor q5,q5,q13 + vst1.8 {q0,q1},[r14]! + veor q6,q6,q14 + vst1.8 {q2,q3},[r14]! + veor q7,q7,q15 + vst1.8 {q4,q5},[r14]! + vst1.8 {q6,q7},[r14]! + + beq Ldone_neon + + add r8,sp,#4*(8) + vst1.8 {q8,q9},[sp] + add r10,sp,#4*(0) + vst1.8 {q10,q11},[r8] + sub r11,r11,#64*2 @ len-=64*2 + b Loop_tail_neon + +.align 4 +L192_or_more_neon: + vld1.8 {q12,q13},[r12]! + vld1.8 {q14,q15},[r12]! + veor q0,q0,q12 + veor q1,q1,q13 + vld1.8 {q12,q13},[r12]! + veor q2,q2,q14 + veor q3,q3,q15 + vld1.8 {q14,q15},[r12]! + + veor q4,q4,q12 + veor q5,q5,q13 + vld1.8 {q12,q13},[r12]! + veor q6,q6,q14 + vst1.8 {q0,q1},[r14]! + veor q7,q7,q15 + vld1.8 {q14,q15},[r12]! + + veor q8,q8,q12 + vst1.8 {q2,q3},[r14]! + veor q9,q9,q13 + vst1.8 {q4,q5},[r14]! + veor q10,q10,q14 + vst1.8 {q6,q7},[r14]! + veor q11,q11,q15 + vst1.8 {q8,q9},[r14]! + vst1.8 {q10,q11},[r14]! 
+ + beq Ldone_neon + + ldmia sp,{r8,r9,r10,r11} @ load key material + add r0,r0,r8 @ accumulate key material + add r8,sp,#4*(4) + add r1,r1,r9 + add r2,r2,r10 + add r3,r3,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + + add r4,r4,r8 @ accumulate key material + add r8,sp,#4*(8) + add r5,r5,r9 + add r6,r6,r10 + add r7,r7,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material +# ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif + stmia sp,{r0,r1,r2,r3,r4,r5,r6,r7} + add r0,sp,#4*(16+8) + + ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half + + add r0,r0,r8 @ accumulate key material + add r8,sp,#4*(12) + add r1,r1,r9 + add r2,r2,r10 + add r3,r3,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + + add r4,r4,r8 @ accumulate key material + add r8,sp,#4*(8) + add r5,r5,r9 + add r4,r4,#3 @ counter+3 + add r6,r6,r10 + add r7,r7,r11 + ldr r11,[sp,#4*(32+2)] @ re-load len +# ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif + stmia r8,{r0,r1,r2,r3,r4,r5,r6,r7} + add r10,sp,#4*(0) + sub r11,r11,#64*3 @ len-=64*3 + +Loop_tail_neon: + ldrb r8,[r10],#1 @ read buffer on stack + ldrb r9,[r12],#1 @ read input + subs r11,r11,#1 + eor r8,r8,r9 + strb r8,[r14],#1 @ store output + bne Loop_tail_neon + +Ldone_neon: + add sp,sp,#4*(32+4) + vldmia sp,{d8,d9,d10,d11,d12,d13,d14,d15} + add sp,sp,#4*(16+3) + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} + +.comm _OPENSSL_armcap_P,4 +.non_lazy_symbol_pointer +OPENSSL_armcap_P: +.indirect_symbol _OPENSSL_armcap_P +.long 0 +#endif +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/aes-armv4.S b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/aes-armv4.S new file mode 100644 index 0000000000..63e2ec7163 --- /dev/null +++ b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/aes-armv4.S @@ -0,0 +1,1233 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +@ +@ Licensed under the OpenSSL license (the "License"). You may not use +@ this file except in compliance with the License. You can obtain a copy +@ in the file LICENSE in the source distribution or at +@ https://www.openssl.org/source/license.html + + +@ ==================================================================== +@ Written by Andy Polyakov for the OpenSSL +@ project. The module is, however, dual licensed under OpenSSL and +@ CRYPTOGAMS licenses depending on where you obtain it. For further +@ details see http://www.openssl.org/~appro/cryptogams/. +@ ==================================================================== + +@ AES for ARMv4 + +@ January 2007. +@ +@ Code uses single 1K S-box and is >2 times faster than code generated +@ by gcc-3.4.1. This is thanks to unique feature of ARMv4 ISA, which +@ allows to merge logical or arithmetic operation with shift or rotate +@ in one instruction and emit combined result every cycle. The module +@ is endian-neutral. The performance is ~42 cycles/byte for 128-bit +@ key [on single-issue Xscale PXA250 core]. + +@ May 2007. +@ +@ AES_set_[en|de]crypt_key is added. + +@ July 2010. 
+@ +@ Rescheduling for dual-issue pipeline resulted in 12% improvement on +@ Cortex A8 core and ~25 cycles per byte processed with 128-bit key. + +@ February 2011. +@ +@ Profiler-assisted and platform-specific optimization resulted in 16% +@ improvement on Cortex A8 core and ~21.5 cycles per byte. + +#ifndef __KERNEL__ +# include +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +#endif + +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 AES +@ instructions are in aesv8-armx.pl.) + + +.text +#if defined(__thumb2__) && !defined(__APPLE__) +.syntax unified +.thumb +#else +.code 32 +#undef __thumb2__ +#endif + + +.align 5 +AES_Te: +.word 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d +.word 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554 +.word 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d +.word 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a +.word 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87 +.word 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b +.word 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea +.word 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b +.word 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a +.word 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f +.word 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108 +.word 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f +.word 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e +.word 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5 +.word 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d +.word 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f +.word 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e +.word 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb +.word 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce +.word 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497 +.word 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c +.word 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed +.word 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b +.word 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a +.word 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16 +.word 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594 +.word 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81 +.word 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3 +.word 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a +.word 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504 +.word 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163 +.word 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d +.word 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f +.word 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739 +.word 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47 +.word 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395 +.word 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f +.word 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883 +.word 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c +.word 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76 +.word 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e +.word 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4 +.word 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6 +.word 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b +.word 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7 +.word 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0 +.word 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25 +.word 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818 +.word 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72 +.word 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651 +.word 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21 +.word 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85 +.word 
0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa +.word 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12 +.word 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0 +.word 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9 +.word 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133 +.word 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7 +.word 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920 +.word 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a +.word 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17 +.word 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8 +.word 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11 +.word 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a +@ Te4[256] +.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 +.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 +.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 +.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 +.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc +.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 +.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a +.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 +.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 +.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 +.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b +.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf +.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 +.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 +.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 +.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 +.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 +.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 +.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 +.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb +.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c +.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 +.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 +.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 +.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 +.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a +.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e +.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e +.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 +.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf +.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 +.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 +@ rcon[] +.word 0x01000000, 0x02000000, 0x04000000, 0x08000000 +.word 0x10000000, 0x20000000, 0x40000000, 0x80000000 +.word 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0 + + +@ void aes_nohw_encrypt(const unsigned char *in, unsigned char *out, +@ const AES_KEY *key) { +.globl _aes_nohw_encrypt +.private_extern _aes_nohw_encrypt +#ifdef __thumb2__ +.thumb_func _aes_nohw_encrypt +#endif +.align 5 +_aes_nohw_encrypt: +#ifndef __thumb2__ + sub r3,pc,#8 @ _aes_nohw_encrypt +#else + adr r3,. +#endif + stmdb sp!,{r1,r4-r12,lr} +#if defined(__thumb2__) || defined(__APPLE__) + adr r10,AES_Te +#else + sub r10,r3,#_aes_nohw_encrypt-AES_Te @ Te +#endif + mov r12,r0 @ inp + mov r11,r2 +#if __ARM_ARCH__<7 + ldrb r0,[r12,#3] @ load input data in endian-neutral + ldrb r4,[r12,#2] @ manner... 
+ ldrb r5,[r12,#1] + ldrb r6,[r12,#0] + orr r0,r0,r4,lsl#8 + ldrb r1,[r12,#7] + orr r0,r0,r5,lsl#16 + ldrb r4,[r12,#6] + orr r0,r0,r6,lsl#24 + ldrb r5,[r12,#5] + ldrb r6,[r12,#4] + orr r1,r1,r4,lsl#8 + ldrb r2,[r12,#11] + orr r1,r1,r5,lsl#16 + ldrb r4,[r12,#10] + orr r1,r1,r6,lsl#24 + ldrb r5,[r12,#9] + ldrb r6,[r12,#8] + orr r2,r2,r4,lsl#8 + ldrb r3,[r12,#15] + orr r2,r2,r5,lsl#16 + ldrb r4,[r12,#14] + orr r2,r2,r6,lsl#24 + ldrb r5,[r12,#13] + ldrb r6,[r12,#12] + orr r3,r3,r4,lsl#8 + orr r3,r3,r5,lsl#16 + orr r3,r3,r6,lsl#24 +#else + ldr r0,[r12,#0] + ldr r1,[r12,#4] + ldr r2,[r12,#8] + ldr r3,[r12,#12] +#ifdef __ARMEL__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +#endif +#endif + bl _armv4_AES_encrypt + + ldr r12,[sp],#4 @ pop out +#if __ARM_ARCH__>=7 +#ifdef __ARMEL__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +#endif + str r0,[r12,#0] + str r1,[r12,#4] + str r2,[r12,#8] + str r3,[r12,#12] +#else + mov r4,r0,lsr#24 @ write output in endian-neutral + mov r5,r0,lsr#16 @ manner... + mov r6,r0,lsr#8 + strb r4,[r12,#0] + strb r5,[r12,#1] + mov r4,r1,lsr#24 + strb r6,[r12,#2] + mov r5,r1,lsr#16 + strb r0,[r12,#3] + mov r6,r1,lsr#8 + strb r4,[r12,#4] + strb r5,[r12,#5] + mov r4,r2,lsr#24 + strb r6,[r12,#6] + mov r5,r2,lsr#16 + strb r1,[r12,#7] + mov r6,r2,lsr#8 + strb r4,[r12,#8] + strb r5,[r12,#9] + mov r4,r3,lsr#24 + strb r6,[r12,#10] + mov r5,r3,lsr#16 + strb r2,[r12,#11] + mov r6,r3,lsr#8 + strb r4,[r12,#12] + strb r5,[r12,#13] + strb r6,[r12,#14] + strb r3,[r12,#15] +#endif +#if __ARM_ARCH__>=5 + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} +#else + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif + + +#ifdef __thumb2__ +.thumb_func _armv4_AES_encrypt +#endif +.align 2 +_armv4_AES_encrypt: + str lr,[sp,#-4]! 
@ push lr + ldmia r11!,{r4,r5,r6,r7} + eor r0,r0,r4 + ldr r12,[r11,#240-16] + eor r1,r1,r5 + eor r2,r2,r6 + eor r3,r3,r7 + sub r12,r12,#1 + mov lr,#255 + + and r7,lr,r0 + and r8,lr,r0,lsr#8 + and r9,lr,r0,lsr#16 + mov r0,r0,lsr#24 +Lenc_loop: + ldr r4,[r10,r7,lsl#2] @ Te3[s0>>0] + and r7,lr,r1,lsr#16 @ i0 + ldr r5,[r10,r8,lsl#2] @ Te2[s0>>8] + and r8,lr,r1 + ldr r6,[r10,r9,lsl#2] @ Te1[s0>>16] + and r9,lr,r1,lsr#8 + ldr r0,[r10,r0,lsl#2] @ Te0[s0>>24] + mov r1,r1,lsr#24 + + ldr r7,[r10,r7,lsl#2] @ Te1[s1>>16] + ldr r8,[r10,r8,lsl#2] @ Te3[s1>>0] + ldr r9,[r10,r9,lsl#2] @ Te2[s1>>8] + eor r0,r0,r7,ror#8 + ldr r1,[r10,r1,lsl#2] @ Te0[s1>>24] + and r7,lr,r2,lsr#8 @ i0 + eor r5,r5,r8,ror#8 + and r8,lr,r2,lsr#16 @ i1 + eor r6,r6,r9,ror#8 + and r9,lr,r2 + ldr r7,[r10,r7,lsl#2] @ Te2[s2>>8] + eor r1,r1,r4,ror#24 + ldr r8,[r10,r8,lsl#2] @ Te1[s2>>16] + mov r2,r2,lsr#24 + + ldr r9,[r10,r9,lsl#2] @ Te3[s2>>0] + eor r0,r0,r7,ror#16 + ldr r2,[r10,r2,lsl#2] @ Te0[s2>>24] + and r7,lr,r3 @ i0 + eor r1,r1,r8,ror#8 + and r8,lr,r3,lsr#8 @ i1 + eor r6,r6,r9,ror#16 + and r9,lr,r3,lsr#16 @ i2 + ldr r7,[r10,r7,lsl#2] @ Te3[s3>>0] + eor r2,r2,r5,ror#16 + ldr r8,[r10,r8,lsl#2] @ Te2[s3>>8] + mov r3,r3,lsr#24 + + ldr r9,[r10,r9,lsl#2] @ Te1[s3>>16] + eor r0,r0,r7,ror#24 + ldr r7,[r11],#16 + eor r1,r1,r8,ror#16 + ldr r3,[r10,r3,lsl#2] @ Te0[s3>>24] + eor r2,r2,r9,ror#8 + ldr r4,[r11,#-12] + eor r3,r3,r6,ror#8 + + ldr r5,[r11,#-8] + eor r0,r0,r7 + ldr r6,[r11,#-4] + and r7,lr,r0 + eor r1,r1,r4 + and r8,lr,r0,lsr#8 + eor r2,r2,r5 + and r9,lr,r0,lsr#16 + eor r3,r3,r6 + mov r0,r0,lsr#24 + + subs r12,r12,#1 + bne Lenc_loop + + add r10,r10,#2 + + ldrb r4,[r10,r7,lsl#2] @ Te4[s0>>0] + and r7,lr,r1,lsr#16 @ i0 + ldrb r5,[r10,r8,lsl#2] @ Te4[s0>>8] + and r8,lr,r1 + ldrb r6,[r10,r9,lsl#2] @ Te4[s0>>16] + and r9,lr,r1,lsr#8 + ldrb r0,[r10,r0,lsl#2] @ Te4[s0>>24] + mov r1,r1,lsr#24 + + ldrb r7,[r10,r7,lsl#2] @ Te4[s1>>16] + ldrb r8,[r10,r8,lsl#2] @ Te4[s1>>0] + ldrb r9,[r10,r9,lsl#2] @ Te4[s1>>8] + eor r0,r7,r0,lsl#8 + ldrb r1,[r10,r1,lsl#2] @ Te4[s1>>24] + and r7,lr,r2,lsr#8 @ i0 + eor r5,r8,r5,lsl#8 + and r8,lr,r2,lsr#16 @ i1 + eor r6,r9,r6,lsl#8 + and r9,lr,r2 + ldrb r7,[r10,r7,lsl#2] @ Te4[s2>>8] + eor r1,r4,r1,lsl#24 + ldrb r8,[r10,r8,lsl#2] @ Te4[s2>>16] + mov r2,r2,lsr#24 + + ldrb r9,[r10,r9,lsl#2] @ Te4[s2>>0] + eor r0,r7,r0,lsl#8 + ldrb r2,[r10,r2,lsl#2] @ Te4[s2>>24] + and r7,lr,r3 @ i0 + eor r1,r1,r8,lsl#16 + and r8,lr,r3,lsr#8 @ i1 + eor r6,r9,r6,lsl#8 + and r9,lr,r3,lsr#16 @ i2 + ldrb r7,[r10,r7,lsl#2] @ Te4[s3>>0] + eor r2,r5,r2,lsl#24 + ldrb r8,[r10,r8,lsl#2] @ Te4[s3>>8] + mov r3,r3,lsr#24 + + ldrb r9,[r10,r9,lsl#2] @ Te4[s3>>16] + eor r0,r7,r0,lsl#8 + ldr r7,[r11,#0] + ldrb r3,[r10,r3,lsl#2] @ Te4[s3>>24] + eor r1,r1,r8,lsl#8 + ldr r4,[r11,#4] + eor r2,r2,r9,lsl#16 + ldr r5,[r11,#8] + eor r3,r6,r3,lsl#24 + ldr r6,[r11,#12] + + eor r0,r0,r7 + eor r1,r1,r4 + eor r2,r2,r5 + eor r3,r3,r6 + + sub r10,r10,#2 + ldr pc,[sp],#4 @ pop and return + + +.globl _aes_nohw_set_encrypt_key +.private_extern _aes_nohw_set_encrypt_key +#ifdef __thumb2__ +.thumb_func _aes_nohw_set_encrypt_key +#endif +.align 5 +_aes_nohw_set_encrypt_key: +_armv4_AES_set_encrypt_key: +#ifndef __thumb2__ + sub r3,pc,#8 @ _aes_nohw_set_encrypt_key +#else + adr r3,. 
+#endif + teq r0,#0 +#ifdef __thumb2__ + itt eq @ Thumb2 thing, sanity check in ARM +#endif + moveq r0,#-1 + beq Labrt + teq r2,#0 +#ifdef __thumb2__ + itt eq @ Thumb2 thing, sanity check in ARM +#endif + moveq r0,#-1 + beq Labrt + + teq r1,#128 + beq Lok + teq r1,#192 + beq Lok + teq r1,#256 +#ifdef __thumb2__ + itt ne @ Thumb2 thing, sanity check in ARM +#endif + movne r0,#-1 + bne Labrt + +Lok: stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + mov r12,r0 @ inp + mov lr,r1 @ bits + mov r11,r2 @ key + +#if defined(__thumb2__) || defined(__APPLE__) + adr r10,AES_Te+1024 @ Te4 +#else + sub r10,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4 +#endif + +#if __ARM_ARCH__<7 + ldrb r0,[r12,#3] @ load input data in endian-neutral + ldrb r4,[r12,#2] @ manner... + ldrb r5,[r12,#1] + ldrb r6,[r12,#0] + orr r0,r0,r4,lsl#8 + ldrb r1,[r12,#7] + orr r0,r0,r5,lsl#16 + ldrb r4,[r12,#6] + orr r0,r0,r6,lsl#24 + ldrb r5,[r12,#5] + ldrb r6,[r12,#4] + orr r1,r1,r4,lsl#8 + ldrb r2,[r12,#11] + orr r1,r1,r5,lsl#16 + ldrb r4,[r12,#10] + orr r1,r1,r6,lsl#24 + ldrb r5,[r12,#9] + ldrb r6,[r12,#8] + orr r2,r2,r4,lsl#8 + ldrb r3,[r12,#15] + orr r2,r2,r5,lsl#16 + ldrb r4,[r12,#14] + orr r2,r2,r6,lsl#24 + ldrb r5,[r12,#13] + ldrb r6,[r12,#12] + orr r3,r3,r4,lsl#8 + str r0,[r11],#16 + orr r3,r3,r5,lsl#16 + str r1,[r11,#-12] + orr r3,r3,r6,lsl#24 + str r2,[r11,#-8] + str r3,[r11,#-4] +#else + ldr r0,[r12,#0] + ldr r1,[r12,#4] + ldr r2,[r12,#8] + ldr r3,[r12,#12] +#ifdef __ARMEL__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +#endif + str r0,[r11],#16 + str r1,[r11,#-12] + str r2,[r11,#-8] + str r3,[r11,#-4] +#endif + + teq lr,#128 + bne Lnot128 + mov r12,#10 + str r12,[r11,#240-16] + add r6,r10,#256 @ rcon + mov lr,#255 + +L128_loop: + and r5,lr,r3,lsr#24 + and r7,lr,r3,lsr#16 + ldrb r5,[r10,r5] + and r8,lr,r3,lsr#8 + ldrb r7,[r10,r7] + and r9,lr,r3 + ldrb r8,[r10,r8] + orr r5,r5,r7,lsl#24 + ldrb r9,[r10,r9] + orr r5,r5,r8,lsl#16 + ldr r4,[r6],#4 @ rcon[i++] + orr r5,r5,r9,lsl#8 + eor r5,r5,r4 + eor r0,r0,r5 @ rk[4]=rk[0]^... + eor r1,r1,r0 @ rk[5]=rk[1]^rk[4] + str r0,[r11],#16 + eor r2,r2,r1 @ rk[6]=rk[2]^rk[5] + str r1,[r11,#-12] + eor r3,r3,r2 @ rk[7]=rk[3]^rk[6] + str r2,[r11,#-8] + subs r12,r12,#1 + str r3,[r11,#-4] + bne L128_loop + sub r2,r11,#176 + b Ldone + +Lnot128: +#if __ARM_ARCH__<7 + ldrb r8,[r12,#19] + ldrb r4,[r12,#18] + ldrb r5,[r12,#17] + ldrb r6,[r12,#16] + orr r8,r8,r4,lsl#8 + ldrb r9,[r12,#23] + orr r8,r8,r5,lsl#16 + ldrb r4,[r12,#22] + orr r8,r8,r6,lsl#24 + ldrb r5,[r12,#21] + ldrb r6,[r12,#20] + orr r9,r9,r4,lsl#8 + orr r9,r9,r5,lsl#16 + str r8,[r11],#8 + orr r9,r9,r6,lsl#24 + str r9,[r11,#-4] +#else + ldr r8,[r12,#16] + ldr r9,[r12,#20] +#ifdef __ARMEL__ + rev r8,r8 + rev r9,r9 +#endif + str r8,[r11],#8 + str r9,[r11,#-4] +#endif + + teq lr,#192 + bne Lnot192 + mov r12,#12 + str r12,[r11,#240-24] + add r6,r10,#256 @ rcon + mov lr,#255 + mov r12,#8 + +L192_loop: + and r5,lr,r9,lsr#24 + and r7,lr,r9,lsr#16 + ldrb r5,[r10,r5] + and r8,lr,r9,lsr#8 + ldrb r7,[r10,r7] + and r9,lr,r9 + ldrb r8,[r10,r8] + orr r5,r5,r7,lsl#24 + ldrb r9,[r10,r9] + orr r5,r5,r8,lsl#16 + ldr r4,[r6],#4 @ rcon[i++] + orr r5,r5,r9,lsl#8 + eor r9,r5,r4 + eor r0,r0,r9 @ rk[6]=rk[0]^... 
+ eor r1,r1,r0 @ rk[7]=rk[1]^rk[6] + str r0,[r11],#24 + eor r2,r2,r1 @ rk[8]=rk[2]^rk[7] + str r1,[r11,#-20] + eor r3,r3,r2 @ rk[9]=rk[3]^rk[8] + str r2,[r11,#-16] + subs r12,r12,#1 + str r3,[r11,#-12] +#ifdef __thumb2__ + itt eq @ Thumb2 thing, sanity check in ARM +#endif + subeq r2,r11,#216 + beq Ldone + + ldr r7,[r11,#-32] + ldr r8,[r11,#-28] + eor r7,r7,r3 @ rk[10]=rk[4]^rk[9] + eor r9,r8,r7 @ rk[11]=rk[5]^rk[10] + str r7,[r11,#-8] + str r9,[r11,#-4] + b L192_loop + +Lnot192: +#if __ARM_ARCH__<7 + ldrb r8,[r12,#27] + ldrb r4,[r12,#26] + ldrb r5,[r12,#25] + ldrb r6,[r12,#24] + orr r8,r8,r4,lsl#8 + ldrb r9,[r12,#31] + orr r8,r8,r5,lsl#16 + ldrb r4,[r12,#30] + orr r8,r8,r6,lsl#24 + ldrb r5,[r12,#29] + ldrb r6,[r12,#28] + orr r9,r9,r4,lsl#8 + orr r9,r9,r5,lsl#16 + str r8,[r11],#8 + orr r9,r9,r6,lsl#24 + str r9,[r11,#-4] +#else + ldr r8,[r12,#24] + ldr r9,[r12,#28] +#ifdef __ARMEL__ + rev r8,r8 + rev r9,r9 +#endif + str r8,[r11],#8 + str r9,[r11,#-4] +#endif + + mov r12,#14 + str r12,[r11,#240-32] + add r6,r10,#256 @ rcon + mov lr,#255 + mov r12,#7 + +L256_loop: + and r5,lr,r9,lsr#24 + and r7,lr,r9,lsr#16 + ldrb r5,[r10,r5] + and r8,lr,r9,lsr#8 + ldrb r7,[r10,r7] + and r9,lr,r9 + ldrb r8,[r10,r8] + orr r5,r5,r7,lsl#24 + ldrb r9,[r10,r9] + orr r5,r5,r8,lsl#16 + ldr r4,[r6],#4 @ rcon[i++] + orr r5,r5,r9,lsl#8 + eor r9,r5,r4 + eor r0,r0,r9 @ rk[8]=rk[0]^... + eor r1,r1,r0 @ rk[9]=rk[1]^rk[8] + str r0,[r11],#32 + eor r2,r2,r1 @ rk[10]=rk[2]^rk[9] + str r1,[r11,#-28] + eor r3,r3,r2 @ rk[11]=rk[3]^rk[10] + str r2,[r11,#-24] + subs r12,r12,#1 + str r3,[r11,#-20] +#ifdef __thumb2__ + itt eq @ Thumb2 thing, sanity check in ARM +#endif + subeq r2,r11,#256 + beq Ldone + + and r5,lr,r3 + and r7,lr,r3,lsr#8 + ldrb r5,[r10,r5] + and r8,lr,r3,lsr#16 + ldrb r7,[r10,r7] + and r9,lr,r3,lsr#24 + ldrb r8,[r10,r8] + orr r5,r5,r7,lsl#8 + ldrb r9,[r10,r9] + orr r5,r5,r8,lsl#16 + ldr r4,[r11,#-48] + orr r5,r5,r9,lsl#24 + + ldr r7,[r11,#-44] + ldr r8,[r11,#-40] + eor r4,r4,r5 @ rk[12]=rk[4]^... + ldr r9,[r11,#-36] + eor r7,r7,r4 @ rk[13]=rk[5]^rk[12] + str r4,[r11,#-16] + eor r8,r8,r7 @ rk[14]=rk[6]^rk[13] + str r7,[r11,#-12] + eor r9,r9,r8 @ rk[15]=rk[7]^rk[14] + str r8,[r11,#-8] + str r9,[r11,#-4] + b L256_loop + +.align 2 +Ldone: mov r0,#0 + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} +Labrt: +#if __ARM_ARCH__>=5 + bx lr @ .word 0xe12fff1e +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif + + +.globl _aes_nohw_set_decrypt_key +.private_extern _aes_nohw_set_decrypt_key +#ifdef __thumb2__ +.thumb_func _aes_nohw_set_decrypt_key +#endif +.align 5 +_aes_nohw_set_decrypt_key: + str lr,[sp,#-4]! 
@ push lr + bl _armv4_AES_set_encrypt_key + teq r0,#0 + ldr lr,[sp],#4 @ pop lr + bne Labrt + + mov r0,r2 @ _aes_nohw_set_encrypt_key preserves r2, + mov r1,r2 @ which is AES_KEY *key + b _armv4_AES_set_enc2dec_key + + +@ void AES_set_enc2dec_key(const AES_KEY *inp,AES_KEY *out) +.globl _AES_set_enc2dec_key +.private_extern _AES_set_enc2dec_key +#ifdef __thumb2__ +.thumb_func _AES_set_enc2dec_key +#endif +.align 5 +_AES_set_enc2dec_key: +_armv4_AES_set_enc2dec_key: + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + + ldr r12,[r0,#240] + mov r7,r0 @ input + add r8,r0,r12,lsl#4 + mov r11,r1 @ output + add r10,r1,r12,lsl#4 + str r12,[r1,#240] + +Linv: ldr r0,[r7],#16 + ldr r1,[r7,#-12] + ldr r2,[r7,#-8] + ldr r3,[r7,#-4] + ldr r4,[r8],#-16 + ldr r5,[r8,#16+4] + ldr r6,[r8,#16+8] + ldr r9,[r8,#16+12] + str r0,[r10],#-16 + str r1,[r10,#16+4] + str r2,[r10,#16+8] + str r3,[r10,#16+12] + str r4,[r11],#16 + str r5,[r11,#-12] + str r6,[r11,#-8] + str r9,[r11,#-4] + teq r7,r8 + bne Linv + + ldr r0,[r7] + ldr r1,[r7,#4] + ldr r2,[r7,#8] + ldr r3,[r7,#12] + str r0,[r11] + str r1,[r11,#4] + str r2,[r11,#8] + str r3,[r11,#12] + sub r11,r11,r12,lsl#3 + ldr r0,[r11,#16]! @ prefetch tp1 + mov r7,#0x80 + mov r8,#0x1b + orr r7,r7,#0x8000 + orr r8,r8,#0x1b00 + orr r7,r7,r7,lsl#16 + orr r8,r8,r8,lsl#16 + sub r12,r12,#1 + mvn r9,r7 + mov r12,r12,lsl#2 @ (rounds-1)*4 + +Lmix: and r4,r0,r7 + and r1,r0,r9 + sub r4,r4,r4,lsr#7 + and r4,r4,r8 + eor r1,r4,r1,lsl#1 @ tp2 + + and r4,r1,r7 + and r2,r1,r9 + sub r4,r4,r4,lsr#7 + and r4,r4,r8 + eor r2,r4,r2,lsl#1 @ tp4 + + and r4,r2,r7 + and r3,r2,r9 + sub r4,r4,r4,lsr#7 + and r4,r4,r8 + eor r3,r4,r3,lsl#1 @ tp8 + + eor r4,r1,r2 + eor r5,r0,r3 @ tp9 + eor r4,r4,r3 @ tpe + eor r4,r4,r1,ror#24 + eor r4,r4,r5,ror#24 @ ^= ROTATE(tpb=tp9^tp2,8) + eor r4,r4,r2,ror#16 + eor r4,r4,r5,ror#16 @ ^= ROTATE(tpd=tp9^tp4,16) + eor r4,r4,r5,ror#8 @ ^= ROTATE(tp9,24) + + ldr r0,[r11,#4] @ prefetch tp1 + str r4,[r11],#4 + subs r12,r12,#1 + bne Lmix + + mov r0,#0 +#if __ARM_ARCH__>=5 + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} +#else + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif + + + +.align 5 +AES_Td: +.word 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96 +.word 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393 +.word 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25 +.word 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f +.word 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1 +.word 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6 +.word 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da +.word 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844 +.word 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd +.word 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4 +.word 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45 +.word 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94 +.word 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7 +.word 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a +.word 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5 +.word 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c +.word 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1 +.word 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a +.word 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75 +.word 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051 +.word 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46 +.word 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff +.word 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77 +.word 0xb0e842bd, 0x07898b88, 0xe7195b38, 
0x79c8eedb +.word 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000 +.word 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e +.word 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927 +.word 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a +.word 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e +.word 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16 +.word 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d +.word 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8 +.word 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd +.word 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34 +.word 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163 +.word 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120 +.word 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d +.word 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0 +.word 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422 +.word 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef +.word 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36 +.word 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4 +.word 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662 +.word 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5 +.word 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3 +.word 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b +.word 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8 +.word 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6 +.word 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6 +.word 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0 +.word 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815 +.word 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f +.word 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df +.word 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f +.word 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e +.word 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713 +.word 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89 +.word 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c +.word 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf +.word 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86 +.word 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f +.word 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541 +.word 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190 +.word 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742 +@ Td4[256] +.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 +.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb +.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 +.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb +.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d +.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e +.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 +.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 +.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 +.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 +.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda +.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 +.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a +.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 +.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 +.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b +.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea +.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 +.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 +.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e +.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 +.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b +.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 +.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 +.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 +.byte 0xb1, 0x12, 
0x10, 0x59, 0x27, 0x80, 0xec, 0x5f +.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d +.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef +.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 +.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 +.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 +.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d + + +@ void aes_nohw_decrypt(const unsigned char *in, unsigned char *out, +@ const AES_KEY *key) { +.globl _aes_nohw_decrypt +.private_extern _aes_nohw_decrypt +#ifdef __thumb2__ +.thumb_func _aes_nohw_decrypt +#endif +.align 5 +_aes_nohw_decrypt: +#ifndef __thumb2__ + sub r3,pc,#8 @ _aes_nohw_decrypt +#else + adr r3,. +#endif + stmdb sp!,{r1,r4-r12,lr} +#if defined(__thumb2__) || defined(__APPLE__) + adr r10,AES_Td +#else + sub r10,r3,#_aes_nohw_decrypt-AES_Td @ Td +#endif + mov r12,r0 @ inp + mov r11,r2 +#if __ARM_ARCH__<7 + ldrb r0,[r12,#3] @ load input data in endian-neutral + ldrb r4,[r12,#2] @ manner... + ldrb r5,[r12,#1] + ldrb r6,[r12,#0] + orr r0,r0,r4,lsl#8 + ldrb r1,[r12,#7] + orr r0,r0,r5,lsl#16 + ldrb r4,[r12,#6] + orr r0,r0,r6,lsl#24 + ldrb r5,[r12,#5] + ldrb r6,[r12,#4] + orr r1,r1,r4,lsl#8 + ldrb r2,[r12,#11] + orr r1,r1,r5,lsl#16 + ldrb r4,[r12,#10] + orr r1,r1,r6,lsl#24 + ldrb r5,[r12,#9] + ldrb r6,[r12,#8] + orr r2,r2,r4,lsl#8 + ldrb r3,[r12,#15] + orr r2,r2,r5,lsl#16 + ldrb r4,[r12,#14] + orr r2,r2,r6,lsl#24 + ldrb r5,[r12,#13] + ldrb r6,[r12,#12] + orr r3,r3,r4,lsl#8 + orr r3,r3,r5,lsl#16 + orr r3,r3,r6,lsl#24 +#else + ldr r0,[r12,#0] + ldr r1,[r12,#4] + ldr r2,[r12,#8] + ldr r3,[r12,#12] +#ifdef __ARMEL__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +#endif +#endif + bl _armv4_AES_decrypt + + ldr r12,[sp],#4 @ pop out +#if __ARM_ARCH__>=7 +#ifdef __ARMEL__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +#endif + str r0,[r12,#0] + str r1,[r12,#4] + str r2,[r12,#8] + str r3,[r12,#12] +#else + mov r4,r0,lsr#24 @ write output in endian-neutral + mov r5,r0,lsr#16 @ manner... + mov r6,r0,lsr#8 + strb r4,[r12,#0] + strb r5,[r12,#1] + mov r4,r1,lsr#24 + strb r6,[r12,#2] + mov r5,r1,lsr#16 + strb r0,[r12,#3] + mov r6,r1,lsr#8 + strb r4,[r12,#4] + strb r5,[r12,#5] + mov r4,r2,lsr#24 + strb r6,[r12,#6] + mov r5,r2,lsr#16 + strb r1,[r12,#7] + mov r6,r2,lsr#8 + strb r4,[r12,#8] + strb r5,[r12,#9] + mov r4,r3,lsr#24 + strb r6,[r12,#10] + mov r5,r3,lsr#16 + strb r2,[r12,#11] + mov r6,r3,lsr#8 + strb r4,[r12,#12] + strb r5,[r12,#13] + strb r6,[r12,#14] + strb r3,[r12,#15] +#endif +#if __ARM_ARCH__>=5 + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} +#else + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif + + +#ifdef __thumb2__ +.thumb_func _armv4_AES_decrypt +#endif +.align 2 +_armv4_AES_decrypt: + str lr,[sp,#-4]! 
@ push lr + ldmia r11!,{r4,r5,r6,r7} + eor r0,r0,r4 + ldr r12,[r11,#240-16] + eor r1,r1,r5 + eor r2,r2,r6 + eor r3,r3,r7 + sub r12,r12,#1 + mov lr,#255 + + and r7,lr,r0,lsr#16 + and r8,lr,r0,lsr#8 + and r9,lr,r0 + mov r0,r0,lsr#24 +Ldec_loop: + ldr r4,[r10,r7,lsl#2] @ Td1[s0>>16] + and r7,lr,r1 @ i0 + ldr r5,[r10,r8,lsl#2] @ Td2[s0>>8] + and r8,lr,r1,lsr#16 + ldr r6,[r10,r9,lsl#2] @ Td3[s0>>0] + and r9,lr,r1,lsr#8 + ldr r0,[r10,r0,lsl#2] @ Td0[s0>>24] + mov r1,r1,lsr#24 + + ldr r7,[r10,r7,lsl#2] @ Td3[s1>>0] + ldr r8,[r10,r8,lsl#2] @ Td1[s1>>16] + ldr r9,[r10,r9,lsl#2] @ Td2[s1>>8] + eor r0,r0,r7,ror#24 + ldr r1,[r10,r1,lsl#2] @ Td0[s1>>24] + and r7,lr,r2,lsr#8 @ i0 + eor r5,r8,r5,ror#8 + and r8,lr,r2 @ i1 + eor r6,r9,r6,ror#8 + and r9,lr,r2,lsr#16 + ldr r7,[r10,r7,lsl#2] @ Td2[s2>>8] + eor r1,r1,r4,ror#8 + ldr r8,[r10,r8,lsl#2] @ Td3[s2>>0] + mov r2,r2,lsr#24 + + ldr r9,[r10,r9,lsl#2] @ Td1[s2>>16] + eor r0,r0,r7,ror#16 + ldr r2,[r10,r2,lsl#2] @ Td0[s2>>24] + and r7,lr,r3,lsr#16 @ i0 + eor r1,r1,r8,ror#24 + and r8,lr,r3,lsr#8 @ i1 + eor r6,r9,r6,ror#8 + and r9,lr,r3 @ i2 + ldr r7,[r10,r7,lsl#2] @ Td1[s3>>16] + eor r2,r2,r5,ror#8 + ldr r8,[r10,r8,lsl#2] @ Td2[s3>>8] + mov r3,r3,lsr#24 + + ldr r9,[r10,r9,lsl#2] @ Td3[s3>>0] + eor r0,r0,r7,ror#8 + ldr r7,[r11],#16 + eor r1,r1,r8,ror#16 + ldr r3,[r10,r3,lsl#2] @ Td0[s3>>24] + eor r2,r2,r9,ror#24 + + ldr r4,[r11,#-12] + eor r0,r0,r7 + ldr r5,[r11,#-8] + eor r3,r3,r6,ror#8 + ldr r6,[r11,#-4] + and r7,lr,r0,lsr#16 + eor r1,r1,r4 + and r8,lr,r0,lsr#8 + eor r2,r2,r5 + and r9,lr,r0 + eor r3,r3,r6 + mov r0,r0,lsr#24 + + subs r12,r12,#1 + bne Ldec_loop + + add r10,r10,#1024 + + ldr r5,[r10,#0] @ prefetch Td4 + ldr r6,[r10,#32] + ldr r4,[r10,#64] + ldr r5,[r10,#96] + ldr r6,[r10,#128] + ldr r4,[r10,#160] + ldr r5,[r10,#192] + ldr r6,[r10,#224] + + ldrb r0,[r10,r0] @ Td4[s0>>24] + ldrb r4,[r10,r7] @ Td4[s0>>16] + and r7,lr,r1 @ i0 + ldrb r5,[r10,r8] @ Td4[s0>>8] + and r8,lr,r1,lsr#16 + ldrb r6,[r10,r9] @ Td4[s0>>0] + and r9,lr,r1,lsr#8 + + add r1,r10,r1,lsr#24 + ldrb r7,[r10,r7] @ Td4[s1>>0] + ldrb r1,[r1] @ Td4[s1>>24] + ldrb r8,[r10,r8] @ Td4[s1>>16] + eor r0,r7,r0,lsl#24 + ldrb r9,[r10,r9] @ Td4[s1>>8] + eor r1,r4,r1,lsl#8 + and r7,lr,r2,lsr#8 @ i0 + eor r5,r5,r8,lsl#8 + and r8,lr,r2 @ i1 + ldrb r7,[r10,r7] @ Td4[s2>>8] + eor r6,r6,r9,lsl#8 + ldrb r8,[r10,r8] @ Td4[s2>>0] + and r9,lr,r2,lsr#16 + + add r2,r10,r2,lsr#24 + ldrb r2,[r2] @ Td4[s2>>24] + eor r0,r0,r7,lsl#8 + ldrb r9,[r10,r9] @ Td4[s2>>16] + eor r1,r8,r1,lsl#16 + and r7,lr,r3,lsr#16 @ i0 + eor r2,r5,r2,lsl#16 + and r8,lr,r3,lsr#8 @ i1 + ldrb r7,[r10,r7] @ Td4[s3>>16] + eor r6,r6,r9,lsl#16 + ldrb r8,[r10,r8] @ Td4[s3>>8] + and r9,lr,r3 @ i2 + + add r3,r10,r3,lsr#24 + ldrb r9,[r10,r9] @ Td4[s3>>0] + ldrb r3,[r3] @ Td4[s3>>24] + eor r0,r0,r7,lsl#16 + ldr r7,[r11,#0] + eor r1,r1,r8,lsl#8 + ldr r4,[r11,#4] + eor r2,r9,r2,lsl#8 + ldr r5,[r11,#8] + eor r3,r6,r3,lsl#24 + ldr r6,[r11,#12] + + eor r0,r0,r7 + eor r1,r1,r4 + eor r2,r2,r5 + eor r3,r3,r6 + + sub r10,r10,#1024 + ldr pc,[sp],#4 @ pop and return + +.byte 65,69,83,32,102,111,114,32,65,82,77,118,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/aesv8-armx32.S b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/aesv8-armx32.S new file mode 100644 index 0000000000..7392231df2 --- /dev/null +++ 
b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/aesv8-armx32.S @@ -0,0 +1,790 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include + +#if __ARM_MAX_ARCH__>=7 +.text + + +.code 32 +#undef __thumb2__ +.align 5 +Lrcon: +.long 0x01,0x01,0x01,0x01 +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d @ rotate-n-splat +.long 0x1b,0x1b,0x1b,0x1b + +.text + +.globl _aes_hw_set_encrypt_key +.private_extern _aes_hw_set_encrypt_key +#ifdef __thumb2__ +.thumb_func _aes_hw_set_encrypt_key +#endif +.align 5 +_aes_hw_set_encrypt_key: +Lenc_key: + mov r3,#-1 + cmp r0,#0 + beq Lenc_key_abort + cmp r2,#0 + beq Lenc_key_abort + mov r3,#-2 + cmp r1,#128 + blt Lenc_key_abort + cmp r1,#256 + bgt Lenc_key_abort + tst r1,#0x3f + bne Lenc_key_abort + + adr r3,Lrcon + cmp r1,#192 + + veor q0,q0,q0 + vld1.8 {q3},[r0]! + mov r1,#8 @ reuse r1 + vld1.32 {q1,q2},[r3]! + + blt Loop128 + beq L192 + b L256 + +.align 4 +Loop128: + vtbl.8 d20,{q3},d4 + vtbl.8 d21,{q3},d5 + vext.8 q9,q0,q3,#12 + vst1.32 {q3},[r2]! +.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + subs r1,r1,#1 + + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q10,q10,q1 + veor q3,q3,q9 + vshl.u8 q1,q1,#1 + veor q3,q3,q10 + bne Loop128 + + vld1.32 {q1},[r3] + + vtbl.8 d20,{q3},d4 + vtbl.8 d21,{q3},d5 + vext.8 q9,q0,q3,#12 + vst1.32 {q3},[r2]! +.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q10,q10,q1 + veor q3,q3,q9 + vshl.u8 q1,q1,#1 + veor q3,q3,q10 + + vtbl.8 d20,{q3},d4 + vtbl.8 d21,{q3},d5 + vext.8 q9,q0,q3,#12 + vst1.32 {q3},[r2]! +.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q10,q10,q1 + veor q3,q3,q9 + veor q3,q3,q10 + vst1.32 {q3},[r2] + add r2,r2,#0x50 + + mov r12,#10 + b Ldone + +.align 4 +L192: + vld1.8 {d16},[r0]! + vmov.i8 q10,#8 @ borrow q10 + vst1.32 {q3},[r2]! + vsub.i8 q2,q2,q10 @ adjust the mask + +Loop192: + vtbl.8 d20,{q8},d4 + vtbl.8 d21,{q8},d5 + vext.8 q9,q0,q3,#12 + vst1.32 {d16},[r2]! +.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + subs r1,r1,#1 + + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q3,q3,q9 + + vdup.32 q9,d7[1] + veor q9,q9,q8 + veor q10,q10,q1 + vext.8 q8,q0,q8,#12 + vshl.u8 q1,q1,#1 + veor q8,q8,q9 + veor q3,q3,q10 + veor q8,q8,q10 + vst1.32 {q3},[r2]! + bne Loop192 + + mov r12,#12 + add r2,r2,#0x20 + b Ldone + +.align 4 +L256: + vld1.8 {q8},[r0] + mov r1,#7 + mov r12,#14 + vst1.32 {q3},[r2]! + +Loop256: + vtbl.8 d20,{q8},d4 + vtbl.8 d21,{q8},d5 + vext.8 q9,q0,q3,#12 + vst1.32 {q8},[r2]! +.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + subs r1,r1,#1 + + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q10,q10,q1 + veor q3,q3,q9 + vshl.u8 q1,q1,#1 + veor q3,q3,q10 + vst1.32 {q3},[r2]! 
+ beq Ldone + + vdup.32 q10,d7[1] + vext.8 q9,q0,q8,#12 +.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + + veor q8,q8,q9 + vext.8 q9,q0,q9,#12 + veor q8,q8,q9 + vext.8 q9,q0,q9,#12 + veor q8,q8,q9 + + veor q8,q8,q10 + b Loop256 + +Ldone: + str r12,[r2] + mov r3,#0 + +Lenc_key_abort: + mov r0,r3 @ return value + + bx lr + + +.globl _aes_hw_set_decrypt_key +.private_extern _aes_hw_set_decrypt_key +#ifdef __thumb2__ +.thumb_func _aes_hw_set_decrypt_key +#endif +.align 5 +_aes_hw_set_decrypt_key: + stmdb sp!,{r4,lr} + bl Lenc_key + + cmp r0,#0 + bne Ldec_key_abort + + sub r2,r2,#240 @ restore original r2 + mov r4,#-16 + add r0,r2,r12,lsl#4 @ end of key schedule + + vld1.32 {q0},[r2] + vld1.32 {q1},[r0] + vst1.32 {q0},[r0],r4 + vst1.32 {q1},[r2]! + +Loop_imc: + vld1.32 {q0},[r2] + vld1.32 {q1},[r0] +.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + vst1.32 {q0},[r0],r4 + vst1.32 {q1},[r2]! + cmp r0,r2 + bhi Loop_imc + + vld1.32 {q0},[r2] +.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + vst1.32 {q0},[r0] + + eor r0,r0,r0 @ return value +Ldec_key_abort: + ldmia sp!,{r4,pc} + +.globl _aes_hw_encrypt +.private_extern _aes_hw_encrypt +#ifdef __thumb2__ +.thumb_func _aes_hw_encrypt +#endif +.align 5 +_aes_hw_encrypt: + ldr r3,[r2,#240] + vld1.32 {q0},[r2]! + vld1.8 {q2},[r0] + sub r3,r3,#2 + vld1.32 {q1},[r2]! + +Loop_enc: +.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0 +.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 + vld1.32 {q0},[r2]! + subs r3,r3,#2 +.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1 +.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 + vld1.32 {q1},[r2]! + bgt Loop_enc + +.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0 +.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 + vld1.32 {q0},[r2] +.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1 + veor q2,q2,q0 + + vst1.8 {q2},[r1] + bx lr + +.globl _aes_hw_decrypt +.private_extern _aes_hw_decrypt +#ifdef __thumb2__ +.thumb_func _aes_hw_decrypt +#endif +.align 5 +_aes_hw_decrypt: + ldr r3,[r2,#240] + vld1.32 {q0},[r2]! + vld1.8 {q2},[r0] + sub r3,r3,#2 + vld1.32 {q1},[r2]! + +Loop_dec: +.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0 +.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 + vld1.32 {q0},[r2]! + subs r3,r3,#2 +.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1 +.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 + vld1.32 {q1},[r2]! + bgt Loop_dec + +.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0 +.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 + vld1.32 {q0},[r2] +.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1 + veor q2,q2,q0 + + vst1.8 {q2},[r1] + bx lr + +.globl _aes_hw_cbc_encrypt +.private_extern _aes_hw_cbc_encrypt +#ifdef __thumb2__ +.thumb_func _aes_hw_cbc_encrypt +#endif +.align 5 +_aes_hw_cbc_encrypt: + mov ip,sp + stmdb sp!,{r4,r5,r6,r7,r8,lr} + vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so + ldmia ip,{r4,r5} @ load remaining args + subs r2,r2,#16 + mov r8,#16 + blo Lcbc_abort + moveq r8,#0 + + cmp r5,#0 @ en- or decrypting? + ldr r5,[r3,#240] + and r2,r2,#-16 + vld1.8 {q6},[r4] + vld1.8 {q0},[r0],r8 + + vld1.32 {q8,q9},[r3] @ load key schedule... + sub r5,r5,#6 + add r7,r3,r5,lsl#4 @ pointer to last 7 round keys + sub r5,r5,#2 + vld1.32 {q10,q11},[r7]! + vld1.32 {q12,q13},[r7]! + vld1.32 {q14,q15},[r7]! 
+ vld1.32 {q7},[r7] + + add r7,r3,#32 + mov r6,r5 + beq Lcbc_dec + + cmp r5,#2 + veor q0,q0,q6 + veor q5,q8,q7 + beq Lcbc_enc128 + + vld1.32 {q2,q3},[r7] + add r7,r3,#16 + add r6,r3,#16*4 + add r12,r3,#16*5 +.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + add r14,r3,#16*6 + add r3,r3,#16*7 + b Lenter_cbc_enc + +.align 4 +Loop_cbc_enc: +.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vst1.8 {q6},[r1]! +Lenter_cbc_enc: +.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.32 {q8},[r6] + cmp r5,#4 +.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.32 {q9},[r12] + beq Lcbc_enc192 + +.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.32 {q8},[r14] +.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.32 {q9},[r3] + nop + +Lcbc_enc192: +.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + subs r2,r2,#16 +.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + moveq r8,#0 +.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.8 {q8},[r0],r8 +.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + veor q8,q8,q5 +.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.32 {q9},[r7] @ re-pre-load rndkey[1] +.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 + veor q6,q0,q7 + bhs Loop_cbc_enc + + vst1.8 {q6},[r1]! + b Lcbc_done + +.align 5 +Lcbc_enc128: + vld1.32 {q2,q3},[r7] +.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + b Lenter_cbc_enc128 +Loop_cbc_enc128: +.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vst1.8 {q6},[r1]! +Lenter_cbc_enc128: +.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + subs r2,r2,#16 +.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + moveq r8,#0 +.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.8 {q8},[r0],r8 +.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + veor q8,q8,q5 +.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 + veor q6,q0,q7 + bhs Loop_cbc_enc128 + + vst1.8 {q6},[r1]! + b Lcbc_done +.align 5 +Lcbc_dec: + vld1.8 {q10},[r0]! + subs r2,r2,#32 @ bias + add r6,r5,#2 + vorr q3,q0,q0 + vorr q1,q0,q0 + vorr q11,q10,q10 + blo Lcbc_dec_tail + + vorr q1,q10,q10 + vld1.8 {q10},[r0]! + vorr q2,q0,q0 + vorr q3,q1,q1 + vorr q11,q10,q10 + +Loop3x_cbc_dec: +.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 +.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 +.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + vld1.32 {q8},[r7]! 
+ subs r6,r6,#2 +.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 +.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 +.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + vld1.32 {q9},[r7]! + bgt Loop3x_cbc_dec + +.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 +.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 +.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + veor q4,q6,q7 + subs r2,r2,#0x30 + veor q5,q2,q7 + movlo r6,r2 @ r6, r6, is zero at this point +.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 +.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 +.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + veor q9,q3,q7 + add r0,r0,r6 @ r0 is adjusted in such way that + @ at exit from the loop q1-q10 + @ are loaded with last "words" + vorr q6,q11,q11 + mov r7,r3 +.byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12 +.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 +.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + vld1.8 {q2},[r0]! +.byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13 +.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 +.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + vld1.8 {q3},[r0]! +.byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14 +.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 +.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + vld1.8 {q11},[r0]! +.byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15 +.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15 +.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15 + vld1.32 {q8},[r7]! @ re-pre-load rndkey[0] + add r6,r5,#2 + veor q4,q4,q0 + veor q5,q5,q1 + veor q10,q10,q9 + vld1.32 {q9},[r7]! @ re-pre-load rndkey[1] + vst1.8 {q4},[r1]! + vorr q0,q2,q2 + vst1.8 {q5},[r1]! + vorr q1,q3,q3 + vst1.8 {q10},[r1]! + vorr q10,q11,q11 + bhs Loop3x_cbc_dec + + cmn r2,#0x30 + beq Lcbc_done + nop + +Lcbc_dec_tail: +.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + vld1.32 {q8},[r7]! + subs r6,r6,#2 +.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + vld1.32 {q9},[r7]! 
+ bgt Lcbc_dec_tail + +.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 +.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 +.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + cmn r2,#0x20 +.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + veor q5,q6,q7 +.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + veor q9,q3,q7 +.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15 +.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15 + beq Lcbc_dec_one + veor q5,q5,q1 + veor q9,q9,q10 + vorr q6,q11,q11 + vst1.8 {q5},[r1]! + vst1.8 {q9},[r1]! + b Lcbc_done + +Lcbc_dec_one: + veor q5,q5,q10 + vorr q6,q11,q11 + vst1.8 {q5},[r1]! + +Lcbc_done: + vst1.8 {q6},[r4] +Lcbc_abort: + vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!,{r4,r5,r6,r7,r8,pc} + +.globl _aes_hw_ctr32_encrypt_blocks +.private_extern _aes_hw_ctr32_encrypt_blocks +#ifdef __thumb2__ +.thumb_func _aes_hw_ctr32_encrypt_blocks +#endif +.align 5 +_aes_hw_ctr32_encrypt_blocks: + mov ip,sp + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,lr} + vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so + ldr r4, [ip] @ load remaining arg + ldr r5,[r3,#240] + + ldr r8, [r4, #12] + vld1.32 {q0},[r4] + + vld1.32 {q8,q9},[r3] @ load key schedule... + sub r5,r5,#4 + mov r12,#16 + cmp r2,#2 + add r7,r3,r5,lsl#4 @ pointer to last 5 round keys + sub r5,r5,#2 + vld1.32 {q12,q13},[r7]! + vld1.32 {q14,q15},[r7]! + vld1.32 {q7},[r7] + add r7,r3,#32 + mov r6,r5 + movlo r12,#0 +#ifndef __ARMEB__ + rev r8, r8 +#endif + vorr q1,q0,q0 + add r10, r8, #1 + vorr q10,q0,q0 + add r8, r8, #2 + vorr q6,q0,q0 + rev r10, r10 + vmov.32 d3[1],r10 + bls Lctr32_tail + rev r12, r8 + sub r2,r2,#3 @ bias + vmov.32 d21[1],r12 + b Loop3x_ctr32 + +.align 4 +Loop3x_ctr32: +.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 +.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 +.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8 +.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 + vld1.32 {q8},[r7]! + subs r6,r6,#2 +.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 +.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 +.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9 +.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 + vld1.32 {q9},[r7]! + bgt Loop3x_ctr32 + +.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 +.byte 0x80,0x83,0xb0,0xf3 @ aesmc q4,q0 +.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 +.byte 0x82,0xa3,0xb0,0xf3 @ aesmc q5,q1 + vld1.8 {q2},[r0]! + vorr q0,q6,q6 +.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8 +.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 + vld1.8 {q3},[r0]! + vorr q1,q6,q6 +.byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9 +.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 +.byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9 +.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + vld1.8 {q11},[r0]! 
+ mov r7,r3 +.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9 +.byte 0xa4,0x23,0xf0,0xf3 @ aesmc q9,q10 + vorr q10,q6,q6 + add r9,r8,#1 +.byte 0x28,0x83,0xb0,0xf3 @ aese q4,q12 +.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 +.byte 0x28,0xa3,0xb0,0xf3 @ aese q5,q12 +.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + veor q2,q2,q7 + add r10,r8,#2 +.byte 0x28,0x23,0xf0,0xf3 @ aese q9,q12 +.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 + veor q3,q3,q7 + add r8,r8,#3 +.byte 0x2a,0x83,0xb0,0xf3 @ aese q4,q13 +.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 +.byte 0x2a,0xa3,0xb0,0xf3 @ aese q5,q13 +.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + veor q11,q11,q7 + rev r9,r9 +.byte 0x2a,0x23,0xf0,0xf3 @ aese q9,q13 +.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 + vmov.32 d1[1], r9 + rev r10,r10 +.byte 0x2c,0x83,0xb0,0xf3 @ aese q4,q14 +.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 +.byte 0x2c,0xa3,0xb0,0xf3 @ aese q5,q14 +.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + vmov.32 d3[1], r10 + rev r12,r8 +.byte 0x2c,0x23,0xf0,0xf3 @ aese q9,q14 +.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 + vmov.32 d21[1], r12 + subs r2,r2,#3 +.byte 0x2e,0x83,0xb0,0xf3 @ aese q4,q15 +.byte 0x2e,0xa3,0xb0,0xf3 @ aese q5,q15 +.byte 0x2e,0x23,0xf0,0xf3 @ aese q9,q15 + + veor q2,q2,q4 + vld1.32 {q8},[r7]! @ re-pre-load rndkey[0] + vst1.8 {q2},[r1]! + veor q3,q3,q5 + mov r6,r5 + vst1.8 {q3},[r1]! + veor q11,q11,q9 + vld1.32 {q9},[r7]! @ re-pre-load rndkey[1] + vst1.8 {q11},[r1]! + bhs Loop3x_ctr32 + + adds r2,r2,#3 + beq Lctr32_done + cmp r2,#1 + mov r12,#16 + moveq r12,#0 + +Lctr32_tail: +.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 +.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + vld1.32 {q8},[r7]! + subs r6,r6,#2 +.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 +.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + vld1.32 {q9},[r7]! + bgt Lctr32_tail + +.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 +.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 +.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 +.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + vld1.8 {q2},[r0],r12 +.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12 +.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + vld1.8 {q3},[r0] +.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13 +.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + veor q2,q2,q7 +.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14 +.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + veor q3,q3,q7 +.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 +.byte 0x2e,0x23,0xb0,0xf3 @ aese q1,q15 + + cmp r2,#1 + veor q2,q2,q0 + veor q3,q3,q1 + vst1.8 {q2},[r1]! + beq Lctr32_done + vst1.8 {q3},[r1] + +Lctr32_done: + vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,pc} + +#endif +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/armv4-mont.S b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/armv4-mont.S new file mode 100644 index 0000000000..e549d1f163 --- /dev/null +++ b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/armv4-mont.S @@ -0,0 +1,982 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. 
Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include + +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. + + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +#if __ARM_MAX_ARCH__>=7 +.align 5 +LOPENSSL_armcap: +.word OPENSSL_armcap_P-Lbn_mul_mont +#endif + +.globl _bn_mul_mont +.private_extern _bn_mul_mont +#ifdef __thumb2__ +.thumb_func _bn_mul_mont +#endif + +.align 5 +_bn_mul_mont: +Lbn_mul_mont: + ldr ip,[sp,#4] @ load num + stmdb sp!,{r0,r2} @ sp points at argument block +#if __ARM_MAX_ARCH__>=7 + tst ip,#7 + bne Lialu + adr r0,Lbn_mul_mont + ldr r2,LOPENSSL_armcap + ldr r0,[r0,r2] +#ifdef __APPLE__ + ldr r0,[r0] +#endif + tst r0,#ARMV7_NEON @ NEON available? + ldmia sp, {r0,r2} + beq Lialu + add sp,sp,#8 + b bn_mul8x_mont_neon +.align 4 +Lialu: +#endif + cmp ip,#2 + mov r0,ip @ load num +#ifdef __thumb2__ + ittt lt +#endif + movlt r0,#0 + addlt sp,sp,#2*4 + blt Labrt + + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ save 10 registers + + mov r0,r0,lsl#2 @ rescale r0 for byte count + sub sp,sp,r0 @ alloca(4*num) + sub sp,sp,#4 @ +extra dword + sub r0,r0,#4 @ "num=num-1" + add r4,r2,r0 @ &bp[num-1] + + add r0,sp,r0 @ r0 to point at &tp[num-1] + ldr r8,[r0,#14*4] @ &n0 + ldr r2,[r2] @ bp[0] + ldr r5,[r1],#4 @ ap[0],ap++ + ldr r6,[r3],#4 @ np[0],np++ + ldr r8,[r8] @ *n0 + str r4,[r0,#15*4] @ save &bp[num] + + umull r10,r11,r5,r2 @ ap[0]*bp[0] + str r8,[r0,#14*4] @ save n0 value + mul r8,r10,r8 @ "tp[0]"*n0 + mov r12,#0 + umlal r10,r12,r6,r8 @ np[0]*n0+"t[0]" + mov r4,sp + +L1st: + ldr r5,[r1],#4 @ ap[j],ap++ + mov r10,r11 + ldr r6,[r3],#4 @ np[j],np++ + mov r11,#0 + umlal r10,r11,r5,r2 @ ap[j]*bp[0] + mov r14,#0 + umlal r12,r14,r6,r8 @ np[j]*n0 + adds r12,r12,r10 + str r12,[r4],#4 @ tp[j-1]=,tp++ + adc r12,r14,#0 + cmp r4,r0 + bne L1st + + adds r12,r12,r11 + ldr r4,[r0,#13*4] @ restore bp + mov r14,#0 + ldr r8,[r0,#14*4] @ restore n0 + adc r14,r14,#0 + str r12,[r0] @ tp[num-1]= + mov r7,sp + str r14,[r0,#4] @ tp[num]= + +Louter: + sub r7,r0,r7 @ "original" r0-1 value + sub r1,r1,r7 @ "rewind" ap to &ap[1] + ldr r2,[r4,#4]! 
@ *(++bp) + sub r3,r3,r7 @ "rewind" np to &np[1] + ldr r5,[r1,#-4] @ ap[0] + ldr r10,[sp] @ tp[0] + ldr r6,[r3,#-4] @ np[0] + ldr r7,[sp,#4] @ tp[1] + + mov r11,#0 + umlal r10,r11,r5,r2 @ ap[0]*bp[i]+tp[0] + str r4,[r0,#13*4] @ save bp + mul r8,r10,r8 + mov r12,#0 + umlal r10,r12,r6,r8 @ np[0]*n0+"tp[0]" + mov r4,sp + +Linner: + ldr r5,[r1],#4 @ ap[j],ap++ + adds r10,r11,r7 @ +=tp[j] + ldr r6,[r3],#4 @ np[j],np++ + mov r11,#0 + umlal r10,r11,r5,r2 @ ap[j]*bp[i] + mov r14,#0 + umlal r12,r14,r6,r8 @ np[j]*n0 + adc r11,r11,#0 + ldr r7,[r4,#8] @ tp[j+1] + adds r12,r12,r10 + str r12,[r4],#4 @ tp[j-1]=,tp++ + adc r12,r14,#0 + cmp r4,r0 + bne Linner + + adds r12,r12,r11 + mov r14,#0 + ldr r4,[r0,#13*4] @ restore bp + adc r14,r14,#0 + ldr r8,[r0,#14*4] @ restore n0 + adds r12,r12,r7 + ldr r7,[r0,#15*4] @ restore &bp[num] + adc r14,r14,#0 + str r12,[r0] @ tp[num-1]= + str r14,[r0,#4] @ tp[num]= + + cmp r4,r7 +#ifdef __thumb2__ + itt ne +#endif + movne r7,sp + bne Louter + + ldr r2,[r0,#12*4] @ pull rp + mov r5,sp + add r0,r0,#4 @ r0 to point at &tp[num] + sub r5,r0,r5 @ "original" num value + mov r4,sp @ "rewind" r4 + mov r1,r4 @ "borrow" r1 + sub r3,r3,r5 @ "rewind" r3 to &np[0] + + subs r7,r7,r7 @ "clear" carry flag +Lsub: ldr r7,[r4],#4 + ldr r6,[r3],#4 + sbcs r7,r7,r6 @ tp[j]-np[j] + str r7,[r2],#4 @ rp[j]= + teq r4,r0 @ preserve carry + bne Lsub + sbcs r14,r14,#0 @ upmost carry + mov r4,sp @ "rewind" r4 + sub r2,r2,r5 @ "rewind" r2 + +Lcopy: ldr r7,[r4] @ conditional copy + ldr r5,[r2] + str sp,[r4],#4 @ zap tp +#ifdef __thumb2__ + it cc +#endif + movcc r5,r7 + str r5,[r2],#4 + teq r4,r0 @ preserve carry + bne Lcopy + + mov sp,r0 + add sp,sp,#4 @ skip over tp[num+1] + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ restore registers + add sp,sp,#2*4 @ skip over {r0,r2} + mov r0,#1 +Labrt: +#if __ARM_ARCH__>=5 + bx lr @ bx lr +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif + +#if __ARM_MAX_ARCH__>=7 + + + +#ifdef __thumb2__ +.thumb_func bn_mul8x_mont_neon +#endif +.align 5 +bn_mul8x_mont_neon: + mov ip,sp + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} + vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so + ldmia ip,{r4,r5} @ load rest of parameter block + mov ip,sp + + cmp r5,#8 + bhi LNEON_8n + + @ special case for r5==8, everything is in register bank... + + vld1.32 {d28[0]}, [r2,:32]! + veor d8,d8,d8 + sub r7,sp,r5,lsl#4 + vld1.32 {d0,d1,d2,d3}, [r1]! @ can't specify :32 :-( + and r7,r7,#-64 + vld1.32 {d30[0]}, [r4,:32] + mov sp,r7 @ alloca + vzip.16 d28,d8 + + vmull.u32 q6,d28,d0[0] + vmull.u32 q7,d28,d0[1] + vmull.u32 q8,d28,d1[0] + vshl.i64 d29,d13,#16 + vmull.u32 q9,d28,d1[1] + + vadd.u64 d29,d29,d12 + veor d8,d8,d8 + vmul.u32 d29,d29,d30 + + vmull.u32 q10,d28,d2[0] + vld1.32 {d4,d5,d6,d7}, [r3]! + vmull.u32 q11,d28,d2[1] + vmull.u32 q12,d28,d3[0] + vzip.16 d29,d8 + vmull.u32 q13,d28,d3[1] + + vmlal.u32 q6,d29,d4[0] + sub r9,r5,#1 + vmlal.u32 q7,d29,d4[1] + vmlal.u32 q8,d29,d5[0] + vmlal.u32 q9,d29,d5[1] + + vmlal.u32 q10,d29,d6[0] + vmov q5,q6 + vmlal.u32 q11,d29,d6[1] + vmov q6,q7 + vmlal.u32 q12,d29,d7[0] + vmov q7,q8 + vmlal.u32 q13,d29,d7[1] + vmov q8,q9 + vmov q9,q10 + vshr.u64 d10,d10,#16 + vmov q10,q11 + vmov q11,q12 + vadd.u64 d10,d10,d11 + vmov q12,q13 + veor q13,q13 + vshr.u64 d10,d10,#16 + + b LNEON_outer8 + +.align 4 +LNEON_outer8: + vld1.32 {d28[0]}, [r2,:32]! 
+ veor d8,d8,d8 + vzip.16 d28,d8 + vadd.u64 d12,d12,d10 + + vmlal.u32 q6,d28,d0[0] + vmlal.u32 q7,d28,d0[1] + vmlal.u32 q8,d28,d1[0] + vshl.i64 d29,d13,#16 + vmlal.u32 q9,d28,d1[1] + + vadd.u64 d29,d29,d12 + veor d8,d8,d8 + subs r9,r9,#1 + vmul.u32 d29,d29,d30 + + vmlal.u32 q10,d28,d2[0] + vmlal.u32 q11,d28,d2[1] + vmlal.u32 q12,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q13,d28,d3[1] + + vmlal.u32 q6,d29,d4[0] + vmlal.u32 q7,d29,d4[1] + vmlal.u32 q8,d29,d5[0] + vmlal.u32 q9,d29,d5[1] + + vmlal.u32 q10,d29,d6[0] + vmov q5,q6 + vmlal.u32 q11,d29,d6[1] + vmov q6,q7 + vmlal.u32 q12,d29,d7[0] + vmov q7,q8 + vmlal.u32 q13,d29,d7[1] + vmov q8,q9 + vmov q9,q10 + vshr.u64 d10,d10,#16 + vmov q10,q11 + vmov q11,q12 + vadd.u64 d10,d10,d11 + vmov q12,q13 + veor q13,q13 + vshr.u64 d10,d10,#16 + + bne LNEON_outer8 + + vadd.u64 d12,d12,d10 + mov r7,sp + vshr.u64 d10,d12,#16 + mov r8,r5 + vadd.u64 d13,d13,d10 + add r6,sp,#96 + vshr.u64 d10,d13,#16 + vzip.16 d12,d13 + + b LNEON_tail_entry + +.align 4 +LNEON_8n: + veor q6,q6,q6 + sub r7,sp,#128 + veor q7,q7,q7 + sub r7,r7,r5,lsl#4 + veor q8,q8,q8 + and r7,r7,#-64 + veor q9,q9,q9 + mov sp,r7 @ alloca + veor q10,q10,q10 + add r7,r7,#256 + veor q11,q11,q11 + sub r8,r5,#8 + veor q12,q12,q12 + veor q13,q13,q13 + +LNEON_8n_init: + vst1.64 {q6,q7},[r7,:256]! + subs r8,r8,#8 + vst1.64 {q8,q9},[r7,:256]! + vst1.64 {q10,q11},[r7,:256]! + vst1.64 {q12,q13},[r7,:256]! + bne LNEON_8n_init + + add r6,sp,#256 + vld1.32 {d0,d1,d2,d3},[r1]! + add r10,sp,#8 + vld1.32 {d30[0]},[r4,:32] + mov r9,r5 + b LNEON_8n_outer + +.align 4 +LNEON_8n_outer: + vld1.32 {d28[0]},[r2,:32]! @ *b++ + veor d8,d8,d8 + vzip.16 d28,d8 + add r7,sp,#128 + vld1.32 {d4,d5,d6,d7},[r3]! + + vmlal.u32 q6,d28,d0[0] + vmlal.u32 q7,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q8,d28,d1[0] + vshl.i64 d29,d13,#16 + vmlal.u32 q9,d28,d1[1] + vadd.u64 d29,d29,d12 + vmlal.u32 q10,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q11,d28,d2[1] + vst1.32 {d28},[sp,:64] @ put aside smashed b[8*i+0] + vmlal.u32 q12,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q13,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q6,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q7,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q8,d29,d5[0] + vshr.u64 d12,d12,#16 + vmlal.u32 q9,d29,d5[1] + vmlal.u32 q10,d29,d6[0] + vadd.u64 d12,d12,d13 + vmlal.u32 q11,d29,d6[1] + vshr.u64 d12,d12,#16 + vmlal.u32 q12,d29,d7[0] + vmlal.u32 q13,d29,d7[1] + vadd.u64 d14,d14,d12 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+0] + vmlal.u32 q7,d28,d0[0] + vld1.64 {q6},[r6,:128]! + vmlal.u32 q8,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q9,d28,d1[0] + vshl.i64 d29,d15,#16 + vmlal.u32 q10,d28,d1[1] + vadd.u64 d29,d29,d14 + vmlal.u32 q11,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q12,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+1] + vmlal.u32 q13,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q6,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q7,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q8,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q9,d29,d5[0] + vshr.u64 d14,d14,#16 + vmlal.u32 q10,d29,d5[1] + vmlal.u32 q11,d29,d6[0] + vadd.u64 d14,d14,d15 + vmlal.u32 q12,d29,d6[1] + vshr.u64 d14,d14,#16 + vmlal.u32 q13,d29,d7[0] + vmlal.u32 q6,d29,d7[1] + vadd.u64 d16,d16,d14 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+1] + vmlal.u32 q8,d28,d0[0] + vld1.64 {q7},[r6,:128]! 
+ vmlal.u32 q9,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q10,d28,d1[0] + vshl.i64 d29,d17,#16 + vmlal.u32 q11,d28,d1[1] + vadd.u64 d29,d29,d16 + vmlal.u32 q12,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q13,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+2] + vmlal.u32 q6,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q7,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q8,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q9,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q10,d29,d5[0] + vshr.u64 d16,d16,#16 + vmlal.u32 q11,d29,d5[1] + vmlal.u32 q12,d29,d6[0] + vadd.u64 d16,d16,d17 + vmlal.u32 q13,d29,d6[1] + vshr.u64 d16,d16,#16 + vmlal.u32 q6,d29,d7[0] + vmlal.u32 q7,d29,d7[1] + vadd.u64 d18,d18,d16 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+2] + vmlal.u32 q9,d28,d0[0] + vld1.64 {q8},[r6,:128]! + vmlal.u32 q10,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q11,d28,d1[0] + vshl.i64 d29,d19,#16 + vmlal.u32 q12,d28,d1[1] + vadd.u64 d29,d29,d18 + vmlal.u32 q13,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q6,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+3] + vmlal.u32 q7,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q8,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q9,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q10,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q11,d29,d5[0] + vshr.u64 d18,d18,#16 + vmlal.u32 q12,d29,d5[1] + vmlal.u32 q13,d29,d6[0] + vadd.u64 d18,d18,d19 + vmlal.u32 q6,d29,d6[1] + vshr.u64 d18,d18,#16 + vmlal.u32 q7,d29,d7[0] + vmlal.u32 q8,d29,d7[1] + vadd.u64 d20,d20,d18 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+3] + vmlal.u32 q10,d28,d0[0] + vld1.64 {q9},[r6,:128]! + vmlal.u32 q11,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q12,d28,d1[0] + vshl.i64 d29,d21,#16 + vmlal.u32 q13,d28,d1[1] + vadd.u64 d29,d29,d20 + vmlal.u32 q6,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q7,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+4] + vmlal.u32 q8,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q9,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q10,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q11,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q12,d29,d5[0] + vshr.u64 d20,d20,#16 + vmlal.u32 q13,d29,d5[1] + vmlal.u32 q6,d29,d6[0] + vadd.u64 d20,d20,d21 + vmlal.u32 q7,d29,d6[1] + vshr.u64 d20,d20,#16 + vmlal.u32 q8,d29,d7[0] + vmlal.u32 q9,d29,d7[1] + vadd.u64 d22,d22,d20 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+4] + vmlal.u32 q11,d28,d0[0] + vld1.64 {q10},[r6,:128]! + vmlal.u32 q12,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q13,d28,d1[0] + vshl.i64 d29,d23,#16 + vmlal.u32 q6,d28,d1[1] + vadd.u64 d29,d29,d22 + vmlal.u32 q7,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q8,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+5] + vmlal.u32 q9,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q10,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q11,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q12,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q13,d29,d5[0] + vshr.u64 d22,d22,#16 + vmlal.u32 q6,d29,d5[1] + vmlal.u32 q7,d29,d6[0] + vadd.u64 d22,d22,d23 + vmlal.u32 q8,d29,d6[1] + vshr.u64 d22,d22,#16 + vmlal.u32 q9,d29,d7[0] + vmlal.u32 q10,d29,d7[1] + vadd.u64 d24,d24,d22 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+5] + vmlal.u32 q12,d28,d0[0] + vld1.64 {q11},[r6,:128]! + vmlal.u32 q13,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q6,d28,d1[0] + vshl.i64 d29,d25,#16 + vmlal.u32 q7,d28,d1[1] + vadd.u64 d29,d29,d24 + vmlal.u32 q8,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q9,d28,d2[1] + vst1.32 {d28},[r10,:64]! 
@ put aside smashed b[8*i+6] + vmlal.u32 q10,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q11,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q12,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q13,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q6,d29,d5[0] + vshr.u64 d24,d24,#16 + vmlal.u32 q7,d29,d5[1] + vmlal.u32 q8,d29,d6[0] + vadd.u64 d24,d24,d25 + vmlal.u32 q9,d29,d6[1] + vshr.u64 d24,d24,#16 + vmlal.u32 q10,d29,d7[0] + vmlal.u32 q11,d29,d7[1] + vadd.u64 d26,d26,d24 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+6] + vmlal.u32 q13,d28,d0[0] + vld1.64 {q12},[r6,:128]! + vmlal.u32 q6,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q7,d28,d1[0] + vshl.i64 d29,d27,#16 + vmlal.u32 q8,d28,d1[1] + vadd.u64 d29,d29,d26 + vmlal.u32 q9,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q10,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+7] + vmlal.u32 q11,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q12,d28,d3[1] + vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0] + vmlal.u32 q13,d29,d4[0] + vld1.32 {d0,d1,d2,d3},[r1]! + vmlal.u32 q6,d29,d4[1] + vmlal.u32 q7,d29,d5[0] + vshr.u64 d26,d26,#16 + vmlal.u32 q8,d29,d5[1] + vmlal.u32 q9,d29,d6[0] + vadd.u64 d26,d26,d27 + vmlal.u32 q10,d29,d6[1] + vshr.u64 d26,d26,#16 + vmlal.u32 q11,d29,d7[0] + vmlal.u32 q12,d29,d7[1] + vadd.u64 d12,d12,d26 + vst1.32 {d29},[r10,:64] @ put aside smashed m[8*i+7] + add r10,sp,#8 @ rewind + sub r8,r5,#8 + b LNEON_8n_inner + +.align 4 +LNEON_8n_inner: + subs r8,r8,#8 + vmlal.u32 q6,d28,d0[0] + vld1.64 {q13},[r6,:128] + vmlal.u32 q7,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+0] + vmlal.u32 q8,d28,d1[0] + vld1.32 {d4,d5,d6,d7},[r3]! + vmlal.u32 q9,d28,d1[1] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q10,d28,d2[0] + vmlal.u32 q11,d28,d2[1] + vmlal.u32 q12,d28,d3[0] + vmlal.u32 q13,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+1] + vmlal.u32 q6,d29,d4[0] + vmlal.u32 q7,d29,d4[1] + vmlal.u32 q8,d29,d5[0] + vmlal.u32 q9,d29,d5[1] + vmlal.u32 q10,d29,d6[0] + vmlal.u32 q11,d29,d6[1] + vmlal.u32 q12,d29,d7[0] + vmlal.u32 q13,d29,d7[1] + vst1.64 {q6},[r7,:128]! + vmlal.u32 q7,d28,d0[0] + vld1.64 {q6},[r6,:128] + vmlal.u32 q8,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+1] + vmlal.u32 q9,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q10,d28,d1[1] + vmlal.u32 q11,d28,d2[0] + vmlal.u32 q12,d28,d2[1] + vmlal.u32 q13,d28,d3[0] + vmlal.u32 q6,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+2] + vmlal.u32 q7,d29,d4[0] + vmlal.u32 q8,d29,d4[1] + vmlal.u32 q9,d29,d5[0] + vmlal.u32 q10,d29,d5[1] + vmlal.u32 q11,d29,d6[0] + vmlal.u32 q12,d29,d6[1] + vmlal.u32 q13,d29,d7[0] + vmlal.u32 q6,d29,d7[1] + vst1.64 {q7},[r7,:128]! + vmlal.u32 q8,d28,d0[0] + vld1.64 {q7},[r6,:128] + vmlal.u32 q9,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+2] + vmlal.u32 q10,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q11,d28,d1[1] + vmlal.u32 q12,d28,d2[0] + vmlal.u32 q13,d28,d2[1] + vmlal.u32 q6,d28,d3[0] + vmlal.u32 q7,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+3] + vmlal.u32 q8,d29,d4[0] + vmlal.u32 q9,d29,d4[1] + vmlal.u32 q10,d29,d5[0] + vmlal.u32 q11,d29,d5[1] + vmlal.u32 q12,d29,d6[0] + vmlal.u32 q13,d29,d6[1] + vmlal.u32 q6,d29,d7[0] + vmlal.u32 q7,d29,d7[1] + vst1.64 {q8},[r7,:128]! + vmlal.u32 q9,d28,d0[0] + vld1.64 {q8},[r6,:128] + vmlal.u32 q10,d28,d0[1] + vld1.32 {d29},[r10,:64]! 
@ pull smashed m[8*i+3] + vmlal.u32 q11,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q12,d28,d1[1] + vmlal.u32 q13,d28,d2[0] + vmlal.u32 q6,d28,d2[1] + vmlal.u32 q7,d28,d3[0] + vmlal.u32 q8,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+4] + vmlal.u32 q9,d29,d4[0] + vmlal.u32 q10,d29,d4[1] + vmlal.u32 q11,d29,d5[0] + vmlal.u32 q12,d29,d5[1] + vmlal.u32 q13,d29,d6[0] + vmlal.u32 q6,d29,d6[1] + vmlal.u32 q7,d29,d7[0] + vmlal.u32 q8,d29,d7[1] + vst1.64 {q9},[r7,:128]! + vmlal.u32 q10,d28,d0[0] + vld1.64 {q9},[r6,:128] + vmlal.u32 q11,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+4] + vmlal.u32 q12,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q13,d28,d1[1] + vmlal.u32 q6,d28,d2[0] + vmlal.u32 q7,d28,d2[1] + vmlal.u32 q8,d28,d3[0] + vmlal.u32 q9,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+5] + vmlal.u32 q10,d29,d4[0] + vmlal.u32 q11,d29,d4[1] + vmlal.u32 q12,d29,d5[0] + vmlal.u32 q13,d29,d5[1] + vmlal.u32 q6,d29,d6[0] + vmlal.u32 q7,d29,d6[1] + vmlal.u32 q8,d29,d7[0] + vmlal.u32 q9,d29,d7[1] + vst1.64 {q10},[r7,:128]! + vmlal.u32 q11,d28,d0[0] + vld1.64 {q10},[r6,:128] + vmlal.u32 q12,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+5] + vmlal.u32 q13,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q6,d28,d1[1] + vmlal.u32 q7,d28,d2[0] + vmlal.u32 q8,d28,d2[1] + vmlal.u32 q9,d28,d3[0] + vmlal.u32 q10,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+6] + vmlal.u32 q11,d29,d4[0] + vmlal.u32 q12,d29,d4[1] + vmlal.u32 q13,d29,d5[0] + vmlal.u32 q6,d29,d5[1] + vmlal.u32 q7,d29,d6[0] + vmlal.u32 q8,d29,d6[1] + vmlal.u32 q9,d29,d7[0] + vmlal.u32 q10,d29,d7[1] + vst1.64 {q11},[r7,:128]! + vmlal.u32 q12,d28,d0[0] + vld1.64 {q11},[r6,:128] + vmlal.u32 q13,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+6] + vmlal.u32 q6,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q7,d28,d1[1] + vmlal.u32 q8,d28,d2[0] + vmlal.u32 q9,d28,d2[1] + vmlal.u32 q10,d28,d3[0] + vmlal.u32 q11,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+7] + vmlal.u32 q12,d29,d4[0] + vmlal.u32 q13,d29,d4[1] + vmlal.u32 q6,d29,d5[0] + vmlal.u32 q7,d29,d5[1] + vmlal.u32 q8,d29,d6[0] + vmlal.u32 q9,d29,d6[1] + vmlal.u32 q10,d29,d7[0] + vmlal.u32 q11,d29,d7[1] + vst1.64 {q12},[r7,:128]! + vmlal.u32 q13,d28,d0[0] + vld1.64 {q12},[r6,:128] + vmlal.u32 q6,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+7] + vmlal.u32 q7,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q8,d28,d1[1] + vmlal.u32 q9,d28,d2[0] + vmlal.u32 q10,d28,d2[1] + vmlal.u32 q11,d28,d3[0] + vmlal.u32 q12,d28,d3[1] + it eq + subeq r1,r1,r5,lsl#2 @ rewind + vmlal.u32 q13,d29,d4[0] + vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0] + vmlal.u32 q6,d29,d4[1] + vld1.32 {d0,d1,d2,d3},[r1]! + vmlal.u32 q7,d29,d5[0] + add r10,sp,#8 @ rewind + vmlal.u32 q8,d29,d5[1] + vmlal.u32 q9,d29,d6[0] + vmlal.u32 q10,d29,d6[1] + vmlal.u32 q11,d29,d7[0] + vst1.64 {q13},[r7,:128]! + vmlal.u32 q12,d29,d7[1] + + bne LNEON_8n_inner + add r6,sp,#128 + vst1.64 {q6,q7},[r7,:256]! + veor q2,q2,q2 @ d4-d5 + vst1.64 {q8,q9},[r7,:256]! + veor q3,q3,q3 @ d6-d7 + vst1.64 {q10,q11},[r7,:256]! + vst1.64 {q12},[r7,:128] + + subs r9,r9,#8 + vld1.64 {q6,q7},[r6,:256]! + vld1.64 {q8,q9},[r6,:256]! + vld1.64 {q10,q11},[r6,:256]! + vld1.64 {q12,q13},[r6,:256]! 
+ + itt ne + subne r3,r3,r5,lsl#2 @ rewind + bne LNEON_8n_outer + + add r7,sp,#128 + vst1.64 {q2,q3}, [sp,:256]! @ start wiping stack frame + vshr.u64 d10,d12,#16 + vst1.64 {q2,q3},[sp,:256]! + vadd.u64 d13,d13,d10 + vst1.64 {q2,q3}, [sp,:256]! + vshr.u64 d10,d13,#16 + vst1.64 {q2,q3}, [sp,:256]! + vzip.16 d12,d13 + + mov r8,r5 + b LNEON_tail_entry + +.align 4 +LNEON_tail: + vadd.u64 d12,d12,d10 + vshr.u64 d10,d12,#16 + vld1.64 {q8,q9}, [r6, :256]! + vadd.u64 d13,d13,d10 + vld1.64 {q10,q11}, [r6, :256]! + vshr.u64 d10,d13,#16 + vld1.64 {q12,q13}, [r6, :256]! + vzip.16 d12,d13 + +LNEON_tail_entry: + vadd.u64 d14,d14,d10 + vst1.32 {d12[0]}, [r7, :32]! + vshr.u64 d10,d14,#16 + vadd.u64 d15,d15,d10 + vshr.u64 d10,d15,#16 + vzip.16 d14,d15 + vadd.u64 d16,d16,d10 + vst1.32 {d14[0]}, [r7, :32]! + vshr.u64 d10,d16,#16 + vadd.u64 d17,d17,d10 + vshr.u64 d10,d17,#16 + vzip.16 d16,d17 + vadd.u64 d18,d18,d10 + vst1.32 {d16[0]}, [r7, :32]! + vshr.u64 d10,d18,#16 + vadd.u64 d19,d19,d10 + vshr.u64 d10,d19,#16 + vzip.16 d18,d19 + vadd.u64 d20,d20,d10 + vst1.32 {d18[0]}, [r7, :32]! + vshr.u64 d10,d20,#16 + vadd.u64 d21,d21,d10 + vshr.u64 d10,d21,#16 + vzip.16 d20,d21 + vadd.u64 d22,d22,d10 + vst1.32 {d20[0]}, [r7, :32]! + vshr.u64 d10,d22,#16 + vadd.u64 d23,d23,d10 + vshr.u64 d10,d23,#16 + vzip.16 d22,d23 + vadd.u64 d24,d24,d10 + vst1.32 {d22[0]}, [r7, :32]! + vshr.u64 d10,d24,#16 + vadd.u64 d25,d25,d10 + vshr.u64 d10,d25,#16 + vzip.16 d24,d25 + vadd.u64 d26,d26,d10 + vst1.32 {d24[0]}, [r7, :32]! + vshr.u64 d10,d26,#16 + vadd.u64 d27,d27,d10 + vshr.u64 d10,d27,#16 + vzip.16 d26,d27 + vld1.64 {q6,q7}, [r6, :256]! + subs r8,r8,#8 + vst1.32 {d26[0]}, [r7, :32]! + bne LNEON_tail + + vst1.32 {d10[0]}, [r7, :32] @ top-most bit + sub r3,r3,r5,lsl#2 @ rewind r3 + subs r1,sp,#0 @ clear carry flag + add r2,sp,r5,lsl#2 + +LNEON_sub: + ldmia r1!, {r4,r5,r6,r7} + ldmia r3!, {r8,r9,r10,r11} + sbcs r8, r4,r8 + sbcs r9, r5,r9 + sbcs r10,r6,r10 + sbcs r11,r7,r11 + teq r1,r2 @ preserves carry + stmia r0!, {r8,r9,r10,r11} + bne LNEON_sub + + ldr r10, [r1] @ load top-most bit + mov r11,sp + veor q0,q0,q0 + sub r11,r2,r11 @ this is num*4 + veor q1,q1,q1 + mov r1,sp + sub r0,r0,r11 @ rewind r0 + mov r3,r2 @ second 3/4th of frame + sbcs r10,r10,#0 @ result is carry flag + +LNEON_copy_n_zap: + ldmia r1!, {r4,r5,r6,r7} + ldmia r0, {r8,r9,r10,r11} + it cc + movcc r8, r4 + vst1.64 {q0,q1}, [r3,:256]! @ wipe + itt cc + movcc r9, r5 + movcc r10,r6 + vst1.64 {q0,q1}, [r3,:256]! @ wipe + it cc + movcc r11,r7 + ldmia r1, {r4,r5,r6,r7} + stmia r0!, {r8,r9,r10,r11} + sub r1,r1,#16 + ldmia r0, {r8,r9,r10,r11} + it cc + movcc r8, r4 + vst1.64 {q0,q1}, [r1,:256]! @ wipe + itt cc + movcc r9, r5 + movcc r10,r6 + vst1.64 {q0,q1}, [r3,:256]! 
@ wipe + it cc + movcc r11,r7 + teq r1,r2 @ preserves carry + stmia r0!, {r8,r9,r10,r11} + bne LNEON_copy_n_zap + + mov sp,ip + vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11} + bx lr @ bx lr + +#endif +.byte 77,111,110,116,103,111,109,101,114,121,32,109,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#if __ARM_MAX_ARCH__>=7 +.comm _OPENSSL_armcap_P,4 +.non_lazy_symbol_pointer +OPENSSL_armcap_P: +.indirect_symbol _OPENSSL_armcap_P +.long 0 +.private_extern _OPENSSL_armcap_P +#endif +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/bsaes-armv7.S b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/bsaes-armv7.S new file mode 100644 index 0000000000..8329a8c202 --- /dev/null +++ b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/bsaes-armv7.S @@ -0,0 +1,1536 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +@ Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. +@ +@ Licensed under the OpenSSL license (the "License"). You may not use +@ this file except in compliance with the License. You can obtain a copy +@ in the file LICENSE in the source distribution or at +@ https://www.openssl.org/source/license.html + + +@ ==================================================================== +@ Written by Andy Polyakov for the OpenSSL +@ project. The module is, however, dual licensed under OpenSSL and +@ CRYPTOGAMS licenses depending on where you obtain it. For further +@ details see http://www.openssl.org/~appro/cryptogams/. +@ +@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel +@ of Linaro. Permission to use under GPL terms is granted. +@ ==================================================================== + +@ Bit-sliced AES for ARM NEON +@ +@ February 2012. +@ +@ This implementation is direct adaptation of bsaes-x86_64 module for +@ ARM NEON. Except that this module is endian-neutral [in sense that +@ it can be compiled for either endianness] by courtesy of vld1.8's +@ neutrality. Initial version doesn't implement interface to OpenSSL, +@ only low-level primitives and unsupported entry points, just enough +@ to collect performance results, which for Cortex-A8 core are: +@ +@ encrypt 19.5 cycles per byte processed with 128-bit key +@ decrypt 22.1 cycles per byte processed with 128-bit key +@ key conv. 440 cycles per 128-bit key/0.18 of 8x block +@ +@ Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7, +@ which is [much] worse than anticipated (for further details see +@ http://www.openssl.org/~appro/Snapdragon-S4.html). +@ +@ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code +@ manages in 20.0 cycles]. +@ +@ When comparing to x86_64 results keep in mind that NEON unit is +@ [mostly] single-issue and thus can't [fully] benefit from +@ instruction-level parallelism. And when comparing to aes-armv4 +@ results keep in mind key schedule conversion overhead (see +@ bsaes-x86_64.pl for further details)... 
+@ +@ + +@ April-August 2013 +@ Add CBC, CTR and XTS subroutines and adapt for kernel use; courtesy of Ard. + +#ifndef __KERNEL__ +# include + +# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} +# define VFP_ABI_POP vldmia sp!,{d8-d15} +# define VFP_ABI_FRAME 0x40 +#else +# define VFP_ABI_PUSH +# define VFP_ABI_POP +# define VFP_ABI_FRAME 0 +# define BSAES_ASM_EXTENDED_KEY +# define XTS_CHAIN_TWEAK +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 +#endif + +#ifdef __thumb__ +# define adrl adr +#endif + +#if __ARM_MAX_ARCH__>=7 + + + +.text +.syntax unified @ ARMv7-capable assembler is expected to handle this +#if defined(__thumb2__) && !defined(__APPLE__) +.thumb +#else +.code 32 +# undef __thumb2__ +#endif + +#ifdef __thumb2__ +.thumb_func _bsaes_decrypt8 +#endif +.align 4 +_bsaes_decrypt8: + adr r6,. + vldmia r4!, {q9} @ round 0 key +#if defined(__thumb2__) || defined(__APPLE__) + adr r6,LM0ISR +#else + add r6,r6,#LM0ISR-_bsaes_decrypt8 +#endif + + vldmia r6!, {q8} @ LM0ISR + veor q10, q0, q9 @ xor with round0 key + veor q11, q1, q9 + vtbl.8 d0, {q10}, d16 + vtbl.8 d1, {q10}, d17 + veor q12, q2, q9 + vtbl.8 d2, {q11}, d16 + vtbl.8 d3, {q11}, d17 + veor q13, q3, q9 + vtbl.8 d4, {q12}, d16 + vtbl.8 d5, {q12}, d17 + veor q14, q4, q9 + vtbl.8 d6, {q13}, d16 + vtbl.8 d7, {q13}, d17 + veor q15, q5, q9 + vtbl.8 d8, {q14}, d16 + vtbl.8 d9, {q14}, d17 + veor q10, q6, q9 + vtbl.8 d10, {q15}, d16 + vtbl.8 d11, {q15}, d17 + veor q11, q7, q9 + vtbl.8 d12, {q10}, d16 + vtbl.8 d13, {q10}, d17 + vtbl.8 d14, {q11}, d16 + vtbl.8 d15, {q11}, d17 + vmov.i8 q8,#0x55 @ compose LBS0 + vmov.i8 q9,#0x33 @ compose LBS1 + vshr.u64 q10, q6, #1 + vshr.u64 q11, q4, #1 + veor q10, q10, q7 + veor q11, q11, q5 + vand q10, q10, q8 + vand q11, q11, q8 + veor q7, q7, q10 + vshl.u64 q10, q10, #1 + veor q5, q5, q11 + vshl.u64 q11, q11, #1 + veor q6, q6, q10 + veor q4, q4, q11 + vshr.u64 q10, q2, #1 + vshr.u64 q11, q0, #1 + veor q10, q10, q3 + veor q11, q11, q1 + vand q10, q10, q8 + vand q11, q11, q8 + veor q3, q3, q10 + vshl.u64 q10, q10, #1 + veor q1, q1, q11 + vshl.u64 q11, q11, #1 + veor q2, q2, q10 + veor q0, q0, q11 + vmov.i8 q8,#0x0f @ compose LBS2 + vshr.u64 q10, q5, #2 + vshr.u64 q11, q4, #2 + veor q10, q10, q7 + veor q11, q11, q6 + vand q10, q10, q9 + vand q11, q11, q9 + veor q7, q7, q10 + vshl.u64 q10, q10, #2 + veor q6, q6, q11 + vshl.u64 q11, q11, #2 + veor q5, q5, q10 + veor q4, q4, q11 + vshr.u64 q10, q1, #2 + vshr.u64 q11, q0, #2 + veor q10, q10, q3 + veor q11, q11, q2 + vand q10, q10, q9 + vand q11, q11, q9 + veor q3, q3, q10 + vshl.u64 q10, q10, #2 + veor q2, q2, q11 + vshl.u64 q11, q11, #2 + veor q1, q1, q10 + veor q0, q0, q11 + vshr.u64 q10, q3, #4 + vshr.u64 q11, q2, #4 + veor q10, q10, q7 + veor q11, q11, q6 + vand q10, q10, q8 + vand q11, q11, q8 + veor q7, q7, q10 + vshl.u64 q10, q10, #4 + veor q6, q6, q11 + vshl.u64 q11, q11, #4 + veor q3, q3, q10 + veor q2, q2, q11 + vshr.u64 q10, q1, #4 + vshr.u64 q11, q0, #4 + veor q10, q10, q5 + veor q11, q11, q4 + vand q10, q10, q8 + vand q11, q11, q8 + veor q5, q5, q10 + vshl.u64 q10, q10, #4 + veor q4, q4, q11 + vshl.u64 q11, q11, #4 + veor q1, q1, q10 + veor q0, q0, q11 + sub r5,r5,#1 + b Ldec_sbox +.align 4 +Ldec_loop: + vldmia r4!, {q8,q9,q10,q11} + veor q8, q8, q0 + veor q9, q9, q1 + vtbl.8 d0, {q8}, d24 + vtbl.8 d1, {q8}, d25 + vldmia r4!, {q8} + veor q10, q10, q2 + vtbl.8 d2, {q9}, d24 + vtbl.8 d3, {q9}, d25 + vldmia r4!, {q9} + veor q11, q11, q3 + vtbl.8 d4, {q10}, d24 + vtbl.8 d5, {q10}, d25 + vldmia r4!, {q10} + vtbl.8 d6, {q11}, d24 + 
vtbl.8 d7, {q11}, d25 + vldmia r4!, {q11} + veor q8, q8, q4 + veor q9, q9, q5 + vtbl.8 d8, {q8}, d24 + vtbl.8 d9, {q8}, d25 + veor q10, q10, q6 + vtbl.8 d10, {q9}, d24 + vtbl.8 d11, {q9}, d25 + veor q11, q11, q7 + vtbl.8 d12, {q10}, d24 + vtbl.8 d13, {q10}, d25 + vtbl.8 d14, {q11}, d24 + vtbl.8 d15, {q11}, d25 +Ldec_sbox: + veor q1, q1, q4 + veor q3, q3, q4 + + veor q4, q4, q7 + veor q1, q1, q6 + veor q2, q2, q7 + veor q6, q6, q4 + + veor q0, q0, q1 + veor q2, q2, q5 + veor q7, q7, q6 + veor q3, q3, q0 + veor q5, q5, q0 + veor q1, q1, q3 + veor q11, q3, q0 + veor q10, q7, q4 + veor q9, q1, q6 + veor q13, q4, q0 + vmov q8, q10 + veor q12, q5, q2 + + vorr q10, q10, q9 + veor q15, q11, q8 + vand q14, q11, q12 + vorr q11, q11, q12 + veor q12, q12, q9 + vand q8, q8, q9 + veor q9, q6, q2 + vand q15, q15, q12 + vand q13, q13, q9 + veor q9, q3, q7 + veor q12, q1, q5 + veor q11, q11, q13 + veor q10, q10, q13 + vand q13, q9, q12 + vorr q9, q9, q12 + veor q11, q11, q15 + veor q8, q8, q13 + veor q10, q10, q14 + veor q9, q9, q15 + veor q8, q8, q14 + vand q12, q4, q6 + veor q9, q9, q14 + vand q13, q0, q2 + vand q14, q7, q1 + vorr q15, q3, q5 + veor q11, q11, q12 + veor q9, q9, q14 + veor q8, q8, q15 + veor q10, q10, q13 + + @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3 + + @ new smaller inversion + + vand q14, q11, q9 + vmov q12, q8 + + veor q13, q10, q14 + veor q15, q8, q14 + veor q14, q8, q14 @ q14=q15 + + vbsl q13, q9, q8 + vbsl q15, q11, q10 + veor q11, q11, q10 + + vbsl q12, q13, q14 + vbsl q8, q14, q13 + + vand q14, q12, q15 + veor q9, q9, q8 + + veor q14, q14, q11 + veor q12, q5, q2 + veor q8, q1, q6 + veor q10, q15, q14 + vand q10, q10, q5 + veor q5, q5, q1 + vand q11, q1, q15 + vand q5, q5, q14 + veor q1, q11, q10 + veor q5, q5, q11 + veor q15, q15, q13 + veor q14, q14, q9 + veor q11, q15, q14 + veor q10, q13, q9 + vand q11, q11, q12 + vand q10, q10, q2 + veor q12, q12, q8 + veor q2, q2, q6 + vand q8, q8, q15 + vand q6, q6, q13 + vand q12, q12, q14 + vand q2, q2, q9 + veor q8, q8, q12 + veor q2, q2, q6 + veor q12, q12, q11 + veor q6, q6, q10 + veor q5, q5, q12 + veor q2, q2, q12 + veor q1, q1, q8 + veor q6, q6, q8 + + veor q12, q3, q0 + veor q8, q7, q4 + veor q11, q15, q14 + veor q10, q13, q9 + vand q11, q11, q12 + vand q10, q10, q0 + veor q12, q12, q8 + veor q0, q0, q4 + vand q8, q8, q15 + vand q4, q4, q13 + vand q12, q12, q14 + vand q0, q0, q9 + veor q8, q8, q12 + veor q0, q0, q4 + veor q12, q12, q11 + veor q4, q4, q10 + veor q15, q15, q13 + veor q14, q14, q9 + veor q10, q15, q14 + vand q10, q10, q3 + veor q3, q3, q7 + vand q11, q7, q15 + vand q3, q3, q14 + veor q7, q11, q10 + veor q3, q3, q11 + veor q3, q3, q12 + veor q0, q0, q12 + veor q7, q7, q8 + veor q4, q4, q8 + veor q1, q1, q7 + veor q6, q6, q5 + + veor q4, q4, q1 + veor q2, q2, q7 + veor q5, q5, q7 + veor q4, q4, q2 + veor q7, q7, q0 + veor q4, q4, q5 + veor q3, q3, q6 + veor q6, q6, q1 + veor q3, q3, q4 + + veor q4, q4, q0 + veor q7, q7, q3 + subs r5,r5,#1 + bcc Ldec_done + @ multiplication by 0x05-0x00-0x04-0x00 + vext.8 q8, q0, q0, #8 + vext.8 q14, q3, q3, #8 + vext.8 q15, q5, q5, #8 + veor q8, q8, q0 + vext.8 q9, q1, q1, #8 + veor q14, q14, q3 + vext.8 q10, q6, q6, #8 + veor q15, q15, q5 + vext.8 q11, q4, q4, #8 + veor q9, q9, q1 + vext.8 q12, q2, q2, #8 + veor q10, q10, q6 + vext.8 q13, q7, q7, #8 + veor q11, q11, q4 + veor q12, q12, q2 + veor q13, q13, q7 + + veor q0, q0, q14 + veor q1, q1, q14 + veor q6, q6, q8 + veor q2, q2, q10 + veor q4, q4, q9 + veor q1, q1, q15 + veor q6, q6, q15 + veor q2, q2, q14 + veor q7, q7, q11 + veor q4, 
q4, q14 + veor q3, q3, q12 + veor q2, q2, q15 + veor q7, q7, q15 + veor q5, q5, q13 + vext.8 q8, q0, q0, #12 @ x0 <<< 32 + vext.8 q9, q1, q1, #12 + veor q0, q0, q8 @ x0 ^ (x0 <<< 32) + vext.8 q10, q6, q6, #12 + veor q1, q1, q9 + vext.8 q11, q4, q4, #12 + veor q6, q6, q10 + vext.8 q12, q2, q2, #12 + veor q4, q4, q11 + vext.8 q13, q7, q7, #12 + veor q2, q2, q12 + vext.8 q14, q3, q3, #12 + veor q7, q7, q13 + vext.8 q15, q5, q5, #12 + veor q3, q3, q14 + + veor q9, q9, q0 + veor q5, q5, q15 + vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64) + veor q10, q10, q1 + veor q8, q8, q5 + veor q9, q9, q5 + vext.8 q1, q1, q1, #8 + veor q13, q13, q2 + veor q0, q0, q8 + veor q14, q14, q7 + veor q1, q1, q9 + vext.8 q8, q2, q2, #8 + veor q12, q12, q4 + vext.8 q9, q7, q7, #8 + veor q15, q15, q3 + vext.8 q2, q4, q4, #8 + veor q11, q11, q6 + vext.8 q7, q5, q5, #8 + veor q12, q12, q5 + vext.8 q4, q3, q3, #8 + veor q11, q11, q5 + vext.8 q3, q6, q6, #8 + veor q5, q9, q13 + veor q11, q11, q2 + veor q7, q7, q15 + veor q6, q4, q14 + veor q4, q8, q12 + veor q2, q3, q10 + vmov q3, q11 + @ vmov q5, q9 + vldmia r6, {q12} @ LISR + ite eq @ Thumb2 thing, sanity check in ARM + addeq r6,r6,#0x10 + bne Ldec_loop + vldmia r6, {q12} @ LISRM0 + b Ldec_loop +.align 4 +Ldec_done: + vmov.i8 q8,#0x55 @ compose LBS0 + vmov.i8 q9,#0x33 @ compose LBS1 + vshr.u64 q10, q3, #1 + vshr.u64 q11, q2, #1 + veor q10, q10, q5 + veor q11, q11, q7 + vand q10, q10, q8 + vand q11, q11, q8 + veor q5, q5, q10 + vshl.u64 q10, q10, #1 + veor q7, q7, q11 + vshl.u64 q11, q11, #1 + veor q3, q3, q10 + veor q2, q2, q11 + vshr.u64 q10, q6, #1 + vshr.u64 q11, q0, #1 + veor q10, q10, q4 + veor q11, q11, q1 + vand q10, q10, q8 + vand q11, q11, q8 + veor q4, q4, q10 + vshl.u64 q10, q10, #1 + veor q1, q1, q11 + vshl.u64 q11, q11, #1 + veor q6, q6, q10 + veor q0, q0, q11 + vmov.i8 q8,#0x0f @ compose LBS2 + vshr.u64 q10, q7, #2 + vshr.u64 q11, q2, #2 + veor q10, q10, q5 + veor q11, q11, q3 + vand q10, q10, q9 + vand q11, q11, q9 + veor q5, q5, q10 + vshl.u64 q10, q10, #2 + veor q3, q3, q11 + vshl.u64 q11, q11, #2 + veor q7, q7, q10 + veor q2, q2, q11 + vshr.u64 q10, q1, #2 + vshr.u64 q11, q0, #2 + veor q10, q10, q4 + veor q11, q11, q6 + vand q10, q10, q9 + vand q11, q11, q9 + veor q4, q4, q10 + vshl.u64 q10, q10, #2 + veor q6, q6, q11 + vshl.u64 q11, q11, #2 + veor q1, q1, q10 + veor q0, q0, q11 + vshr.u64 q10, q4, #4 + vshr.u64 q11, q6, #4 + veor q10, q10, q5 + veor q11, q11, q3 + vand q10, q10, q8 + vand q11, q11, q8 + veor q5, q5, q10 + vshl.u64 q10, q10, #4 + veor q3, q3, q11 + vshl.u64 q11, q11, #4 + veor q4, q4, q10 + veor q6, q6, q11 + vshr.u64 q10, q1, #4 + vshr.u64 q11, q0, #4 + veor q10, q10, q7 + veor q11, q11, q2 + vand q10, q10, q8 + vand q11, q11, q8 + veor q7, q7, q10 + vshl.u64 q10, q10, #4 + veor q2, q2, q11 + vshl.u64 q11, q11, #4 + veor q1, q1, q10 + veor q0, q0, q11 + vldmia r4, {q8} @ last round key + veor q6, q6, q8 + veor q4, q4, q8 + veor q2, q2, q8 + veor q7, q7, q8 + veor q3, q3, q8 + veor q5, q5, q8 + veor q0, q0, q8 + veor q1, q1, q8 + bx lr + + + +.align 6 +_bsaes_const: +LM0ISR:@ InvShiftRows constants +.quad 0x0a0e0206070b0f03, 0x0004080c0d010509 +LISR: +.quad 0x0504070602010003, 0x0f0e0d0c080b0a09 +LISRM0: +.quad 0x01040b0e0205080f, 0x0306090c00070a0d +LM0SR:@ ShiftRows constants +.quad 0x0a0e02060f03070b, 0x0004080c05090d01 +LSR: +.quad 0x0504070600030201, 0x0f0e0d0c0a09080b +LSRM0: +.quad 0x0304090e00050a0f, 0x01060b0c0207080d +LM0: +.quad 0x02060a0e03070b0f, 0x0004080c0105090d +LREVM0SR: +.quad 0x090d01050c000408, 
0x03070b0f060a0e02 +.byte 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 6 + + +#ifdef __thumb2__ +.thumb_func _bsaes_encrypt8 +#endif +.align 4 +_bsaes_encrypt8: + adr r6,. + vldmia r4!, {q9} @ round 0 key +#if defined(__thumb2__) || defined(__APPLE__) + adr r6,LM0SR +#else + sub r6,r6,#_bsaes_encrypt8-LM0SR +#endif + + vldmia r6!, {q8} @ LM0SR +_bsaes_encrypt8_alt: + veor q10, q0, q9 @ xor with round0 key + veor q11, q1, q9 + vtbl.8 d0, {q10}, d16 + vtbl.8 d1, {q10}, d17 + veor q12, q2, q9 + vtbl.8 d2, {q11}, d16 + vtbl.8 d3, {q11}, d17 + veor q13, q3, q9 + vtbl.8 d4, {q12}, d16 + vtbl.8 d5, {q12}, d17 + veor q14, q4, q9 + vtbl.8 d6, {q13}, d16 + vtbl.8 d7, {q13}, d17 + veor q15, q5, q9 + vtbl.8 d8, {q14}, d16 + vtbl.8 d9, {q14}, d17 + veor q10, q6, q9 + vtbl.8 d10, {q15}, d16 + vtbl.8 d11, {q15}, d17 + veor q11, q7, q9 + vtbl.8 d12, {q10}, d16 + vtbl.8 d13, {q10}, d17 + vtbl.8 d14, {q11}, d16 + vtbl.8 d15, {q11}, d17 +_bsaes_encrypt8_bitslice: + vmov.i8 q8,#0x55 @ compose LBS0 + vmov.i8 q9,#0x33 @ compose LBS1 + vshr.u64 q10, q6, #1 + vshr.u64 q11, q4, #1 + veor q10, q10, q7 + veor q11, q11, q5 + vand q10, q10, q8 + vand q11, q11, q8 + veor q7, q7, q10 + vshl.u64 q10, q10, #1 + veor q5, q5, q11 + vshl.u64 q11, q11, #1 + veor q6, q6, q10 + veor q4, q4, q11 + vshr.u64 q10, q2, #1 + vshr.u64 q11, q0, #1 + veor q10, q10, q3 + veor q11, q11, q1 + vand q10, q10, q8 + vand q11, q11, q8 + veor q3, q3, q10 + vshl.u64 q10, q10, #1 + veor q1, q1, q11 + vshl.u64 q11, q11, #1 + veor q2, q2, q10 + veor q0, q0, q11 + vmov.i8 q8,#0x0f @ compose LBS2 + vshr.u64 q10, q5, #2 + vshr.u64 q11, q4, #2 + veor q10, q10, q7 + veor q11, q11, q6 + vand q10, q10, q9 + vand q11, q11, q9 + veor q7, q7, q10 + vshl.u64 q10, q10, #2 + veor q6, q6, q11 + vshl.u64 q11, q11, #2 + veor q5, q5, q10 + veor q4, q4, q11 + vshr.u64 q10, q1, #2 + vshr.u64 q11, q0, #2 + veor q10, q10, q3 + veor q11, q11, q2 + vand q10, q10, q9 + vand q11, q11, q9 + veor q3, q3, q10 + vshl.u64 q10, q10, #2 + veor q2, q2, q11 + vshl.u64 q11, q11, #2 + veor q1, q1, q10 + veor q0, q0, q11 + vshr.u64 q10, q3, #4 + vshr.u64 q11, q2, #4 + veor q10, q10, q7 + veor q11, q11, q6 + vand q10, q10, q8 + vand q11, q11, q8 + veor q7, q7, q10 + vshl.u64 q10, q10, #4 + veor q6, q6, q11 + vshl.u64 q11, q11, #4 + veor q3, q3, q10 + veor q2, q2, q11 + vshr.u64 q10, q1, #4 + vshr.u64 q11, q0, #4 + veor q10, q10, q5 + veor q11, q11, q4 + vand q10, q10, q8 + vand q11, q11, q8 + veor q5, q5, q10 + vshl.u64 q10, q10, #4 + veor q4, q4, q11 + vshl.u64 q11, q11, #4 + veor q1, q1, q10 + veor q0, q0, q11 + sub r5,r5,#1 + b Lenc_sbox +.align 4 +Lenc_loop: + vldmia r4!, {q8,q9,q10,q11} + veor q8, q8, q0 + veor q9, q9, q1 + vtbl.8 d0, {q8}, d24 + vtbl.8 d1, {q8}, d25 + vldmia r4!, {q8} + veor q10, q10, q2 + vtbl.8 d2, {q9}, d24 + vtbl.8 d3, {q9}, d25 + vldmia r4!, {q9} + veor q11, q11, q3 + vtbl.8 d4, {q10}, d24 + vtbl.8 d5, {q10}, d25 + vldmia r4!, {q10} + vtbl.8 d6, {q11}, d24 + vtbl.8 d7, {q11}, d25 + vldmia r4!, {q11} + veor q8, q8, q4 + veor q9, q9, q5 + vtbl.8 d8, {q8}, d24 + vtbl.8 d9, {q8}, d25 + veor q10, q10, q6 + vtbl.8 d10, {q9}, d24 + vtbl.8 d11, {q9}, d25 + veor q11, q11, q7 + vtbl.8 d12, {q10}, d24 + vtbl.8 d13, {q10}, d25 + vtbl.8 d14, {q11}, d24 + vtbl.8 d15, {q11}, d25 +Lenc_sbox: + veor q2, q2, q1 + veor q5, q5, q6 + veor q3, q3, q0 + veor q6, q6, q2 + veor q5, q5, q0 + + veor q6, q6, q3 + 
veor q3, q3, q7 + veor q7, q7, q5 + veor q3, q3, q4 + veor q4, q4, q5 + + veor q2, q2, q7 + veor q3, q3, q1 + veor q1, q1, q5 + veor q11, q7, q4 + veor q10, q1, q2 + veor q9, q5, q3 + veor q13, q2, q4 + vmov q8, q10 + veor q12, q6, q0 + + vorr q10, q10, q9 + veor q15, q11, q8 + vand q14, q11, q12 + vorr q11, q11, q12 + veor q12, q12, q9 + vand q8, q8, q9 + veor q9, q3, q0 + vand q15, q15, q12 + vand q13, q13, q9 + veor q9, q7, q1 + veor q12, q5, q6 + veor q11, q11, q13 + veor q10, q10, q13 + vand q13, q9, q12 + vorr q9, q9, q12 + veor q11, q11, q15 + veor q8, q8, q13 + veor q10, q10, q14 + veor q9, q9, q15 + veor q8, q8, q14 + vand q12, q2, q3 + veor q9, q9, q14 + vand q13, q4, q0 + vand q14, q1, q5 + vorr q15, q7, q6 + veor q11, q11, q12 + veor q9, q9, q14 + veor q8, q8, q15 + veor q10, q10, q13 + + @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3 + + @ new smaller inversion + + vand q14, q11, q9 + vmov q12, q8 + + veor q13, q10, q14 + veor q15, q8, q14 + veor q14, q8, q14 @ q14=q15 + + vbsl q13, q9, q8 + vbsl q15, q11, q10 + veor q11, q11, q10 + + vbsl q12, q13, q14 + vbsl q8, q14, q13 + + vand q14, q12, q15 + veor q9, q9, q8 + + veor q14, q14, q11 + veor q12, q6, q0 + veor q8, q5, q3 + veor q10, q15, q14 + vand q10, q10, q6 + veor q6, q6, q5 + vand q11, q5, q15 + vand q6, q6, q14 + veor q5, q11, q10 + veor q6, q6, q11 + veor q15, q15, q13 + veor q14, q14, q9 + veor q11, q15, q14 + veor q10, q13, q9 + vand q11, q11, q12 + vand q10, q10, q0 + veor q12, q12, q8 + veor q0, q0, q3 + vand q8, q8, q15 + vand q3, q3, q13 + vand q12, q12, q14 + vand q0, q0, q9 + veor q8, q8, q12 + veor q0, q0, q3 + veor q12, q12, q11 + veor q3, q3, q10 + veor q6, q6, q12 + veor q0, q0, q12 + veor q5, q5, q8 + veor q3, q3, q8 + + veor q12, q7, q4 + veor q8, q1, q2 + veor q11, q15, q14 + veor q10, q13, q9 + vand q11, q11, q12 + vand q10, q10, q4 + veor q12, q12, q8 + veor q4, q4, q2 + vand q8, q8, q15 + vand q2, q2, q13 + vand q12, q12, q14 + vand q4, q4, q9 + veor q8, q8, q12 + veor q4, q4, q2 + veor q12, q12, q11 + veor q2, q2, q10 + veor q15, q15, q13 + veor q14, q14, q9 + veor q10, q15, q14 + vand q10, q10, q7 + veor q7, q7, q1 + vand q11, q1, q15 + vand q7, q7, q14 + veor q1, q11, q10 + veor q7, q7, q11 + veor q7, q7, q12 + veor q4, q4, q12 + veor q1, q1, q8 + veor q2, q2, q8 + veor q7, q7, q0 + veor q1, q1, q6 + veor q6, q6, q0 + veor q4, q4, q7 + veor q0, q0, q1 + + veor q1, q1, q5 + veor q5, q5, q2 + veor q2, q2, q3 + veor q3, q3, q5 + veor q4, q4, q5 + + veor q6, q6, q3 + subs r5,r5,#1 + bcc Lenc_done + vext.8 q8, q0, q0, #12 @ x0 <<< 32 + vext.8 q9, q1, q1, #12 + veor q0, q0, q8 @ x0 ^ (x0 <<< 32) + vext.8 q10, q4, q4, #12 + veor q1, q1, q9 + vext.8 q11, q6, q6, #12 + veor q4, q4, q10 + vext.8 q12, q3, q3, #12 + veor q6, q6, q11 + vext.8 q13, q7, q7, #12 + veor q3, q3, q12 + vext.8 q14, q2, q2, #12 + veor q7, q7, q13 + vext.8 q15, q5, q5, #12 + veor q2, q2, q14 + + veor q9, q9, q0 + veor q5, q5, q15 + vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64) + veor q10, q10, q1 + veor q8, q8, q5 + veor q9, q9, q5 + vext.8 q1, q1, q1, #8 + veor q13, q13, q3 + veor q0, q0, q8 + veor q14, q14, q7 + veor q1, q1, q9 + vext.8 q8, q3, q3, #8 + veor q12, q12, q6 + vext.8 q9, q7, q7, #8 + veor q15, q15, q2 + vext.8 q3, q6, q6, #8 + veor q11, q11, q4 + vext.8 q7, q5, q5, #8 + veor q12, q12, q5 + vext.8 q6, q2, q2, #8 + veor q11, q11, q5 + vext.8 q2, q4, q4, #8 + veor q5, q9, q13 + veor q4, q8, q12 + veor q3, q3, q11 + veor q7, q7, q15 + veor q6, q6, q14 + @ vmov q4, q8 + veor q2, q2, q10 + @ vmov q5, q9 + vldmia r6, {q12} @ LSR + 
ite eq @ Thumb2 thing, samity check in ARM + addeq r6,r6,#0x10 + bne Lenc_loop + vldmia r6, {q12} @ LSRM0 + b Lenc_loop +.align 4 +Lenc_done: + vmov.i8 q8,#0x55 @ compose LBS0 + vmov.i8 q9,#0x33 @ compose LBS1 + vshr.u64 q10, q2, #1 + vshr.u64 q11, q3, #1 + veor q10, q10, q5 + veor q11, q11, q7 + vand q10, q10, q8 + vand q11, q11, q8 + veor q5, q5, q10 + vshl.u64 q10, q10, #1 + veor q7, q7, q11 + vshl.u64 q11, q11, #1 + veor q2, q2, q10 + veor q3, q3, q11 + vshr.u64 q10, q4, #1 + vshr.u64 q11, q0, #1 + veor q10, q10, q6 + veor q11, q11, q1 + vand q10, q10, q8 + vand q11, q11, q8 + veor q6, q6, q10 + vshl.u64 q10, q10, #1 + veor q1, q1, q11 + vshl.u64 q11, q11, #1 + veor q4, q4, q10 + veor q0, q0, q11 + vmov.i8 q8,#0x0f @ compose LBS2 + vshr.u64 q10, q7, #2 + vshr.u64 q11, q3, #2 + veor q10, q10, q5 + veor q11, q11, q2 + vand q10, q10, q9 + vand q11, q11, q9 + veor q5, q5, q10 + vshl.u64 q10, q10, #2 + veor q2, q2, q11 + vshl.u64 q11, q11, #2 + veor q7, q7, q10 + veor q3, q3, q11 + vshr.u64 q10, q1, #2 + vshr.u64 q11, q0, #2 + veor q10, q10, q6 + veor q11, q11, q4 + vand q10, q10, q9 + vand q11, q11, q9 + veor q6, q6, q10 + vshl.u64 q10, q10, #2 + veor q4, q4, q11 + vshl.u64 q11, q11, #2 + veor q1, q1, q10 + veor q0, q0, q11 + vshr.u64 q10, q6, #4 + vshr.u64 q11, q4, #4 + veor q10, q10, q5 + veor q11, q11, q2 + vand q10, q10, q8 + vand q11, q11, q8 + veor q5, q5, q10 + vshl.u64 q10, q10, #4 + veor q2, q2, q11 + vshl.u64 q11, q11, #4 + veor q6, q6, q10 + veor q4, q4, q11 + vshr.u64 q10, q1, #4 + vshr.u64 q11, q0, #4 + veor q10, q10, q7 + veor q11, q11, q3 + vand q10, q10, q8 + vand q11, q11, q8 + veor q7, q7, q10 + vshl.u64 q10, q10, #4 + veor q3, q3, q11 + vshl.u64 q11, q11, #4 + veor q1, q1, q10 + veor q0, q0, q11 + vldmia r4, {q8} @ last round key + veor q4, q4, q8 + veor q6, q6, q8 + veor q3, q3, q8 + veor q7, q7, q8 + veor q2, q2, q8 + veor q5, q5, q8 + veor q0, q0, q8 + veor q1, q1, q8 + bx lr + +#ifdef __thumb2__ +.thumb_func _bsaes_key_convert +#endif +.align 4 +_bsaes_key_convert: + adr r6,. + vld1.8 {q7}, [r4]! @ load round 0 key +#if defined(__thumb2__) || defined(__APPLE__) + adr r6,LM0 +#else + sub r6,r6,#_bsaes_key_convert-LM0 +#endif + vld1.8 {q15}, [r4]! @ load round 1 key + + vmov.i8 q8, #0x01 @ bit masks + vmov.i8 q9, #0x02 + vmov.i8 q10, #0x04 + vmov.i8 q11, #0x08 + vmov.i8 q12, #0x10 + vmov.i8 q13, #0x20 + vldmia r6, {q14} @ LM0 + +#ifdef __ARMEL__ + vrev32.8 q7, q7 + vrev32.8 q15, q15 +#endif + sub r5,r5,#1 + vstmia r12!, {q7} @ save round 0 key + b Lkey_loop + +.align 4 +Lkey_loop: + vtbl.8 d14,{q15},d28 + vtbl.8 d15,{q15},d29 + vmov.i8 q6, #0x40 + vmov.i8 q15, #0x80 + + vtst.8 q0, q7, q8 + vtst.8 q1, q7, q9 + vtst.8 q2, q7, q10 + vtst.8 q3, q7, q11 + vtst.8 q4, q7, q12 + vtst.8 q5, q7, q13 + vtst.8 q6, q7, q6 + vtst.8 q7, q7, q15 + vld1.8 {q15}, [r4]! @ load next round key + vmvn q0, q0 @ "pnot" + vmvn q1, q1 + vmvn q5, q5 + vmvn q6, q6 +#ifdef __ARMEL__ + vrev32.8 q15, q15 +#endif + subs r5,r5,#1 + vstmia r12!,{q0,q1,q2,q3,q4,q5,q6,q7} @ write bit-sliced round key + bne Lkey_loop + + vmov.i8 q7,#0x63 @ compose L63 + @ don't save last round key + bx lr + +.globl _bsaes_cbc_encrypt +.private_extern _bsaes_cbc_encrypt +#ifdef __thumb2__ +.thumb_func _bsaes_cbc_encrypt +#endif +.align 5 +_bsaes_cbc_encrypt: + @ In OpenSSL, this function had a fallback to aes_nohw_cbc_encrypt for + @ short inputs. We patch this out, using bsaes for all input sizes. 
+ + @ it is up to the caller to make sure we are called with enc == 0 + + mov ip, sp + stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} + VFP_ABI_PUSH + ldr r8, [ip] @ IV is 1st arg on the stack + mov r2, r2, lsr#4 @ len in 16 byte blocks + sub sp, #0x10 @ scratch space to carry over the IV + mov r9, sp @ save sp + + ldr r10, [r3, #240] @ get # of rounds +#ifndef BSAES_ASM_EXTENDED_KEY + @ allocate the key schedule on the stack + sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key + add r12, #96 @ sifze of bit-slices key schedule + + @ populate the key schedule + mov r4, r3 @ pass key + mov r5, r10 @ pass # of rounds + mov sp, r12 @ sp is sp + bl _bsaes_key_convert + vldmia sp, {q6} + vstmia r12, {q15} @ save last round key + veor q7, q7, q6 @ fix up round 0 key + vstmia sp, {q7} +#else + ldr r12, [r3, #244] + eors r12, #1 + beq 0f + + @ populate the key schedule + str r12, [r3, #244] + mov r4, r3 @ pass key + mov r5, r10 @ pass # of rounds + add r12, r3, #248 @ pass key schedule + bl _bsaes_key_convert + add r4, r3, #248 + vldmia r4, {q6} + vstmia r12, {q15} @ save last round key + veor q7, q7, q6 @ fix up round 0 key + vstmia r4, {q7} + +.align 2 + +#endif + + vld1.8 {q15}, [r8] @ load IV + b Lcbc_dec_loop + +.align 4 +Lcbc_dec_loop: + subs r2, r2, #0x8 + bmi Lcbc_dec_loop_finish + + vld1.8 {q0,q1}, [r0]! @ load input + vld1.8 {q2,q3}, [r0]! +#ifndef BSAES_ASM_EXTENDED_KEY + mov r4, sp @ pass the key +#else + add r4, r3, #248 +#endif + vld1.8 {q4,q5}, [r0]! + mov r5, r10 + vld1.8 {q6,q7}, [r0] + sub r0, r0, #0x60 + vstmia r9, {q15} @ put aside IV + + bl _bsaes_decrypt8 + + vldmia r9, {q14} @ reload IV + vld1.8 {q8,q9}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vld1.8 {q10,q11}, [r0]! + veor q1, q1, q8 + veor q6, q6, q9 + vld1.8 {q12,q13}, [r0]! + veor q4, q4, q10 + veor q2, q2, q11 + vld1.8 {q14,q15}, [r0]! + veor q7, q7, q12 + vst1.8 {q0,q1}, [r1]! @ write output + veor q3, q3, q13 + vst1.8 {q6}, [r1]! + veor q5, q5, q14 + vst1.8 {q4}, [r1]! + vst1.8 {q2}, [r1]! + vst1.8 {q7}, [r1]! + vst1.8 {q3}, [r1]! + vst1.8 {q5}, [r1]! + + b Lcbc_dec_loop + +Lcbc_dec_loop_finish: + adds r2, r2, #8 + beq Lcbc_dec_done + + @ Set up most parameters for the _bsaes_decrypt8 call. +#ifndef BSAES_ASM_EXTENDED_KEY + mov r4, sp @ pass the key +#else + add r4, r3, #248 +#endif + mov r5, r10 + vstmia r9, {q15} @ put aside IV + + vld1.8 {q0}, [r0]! @ load input + cmp r2, #2 + blo Lcbc_dec_one + vld1.8 {q1}, [r0]! + beq Lcbc_dec_two + vld1.8 {q2}, [r0]! + cmp r2, #4 + blo Lcbc_dec_three + vld1.8 {q3}, [r0]! + beq Lcbc_dec_four + vld1.8 {q4}, [r0]! + cmp r2, #6 + blo Lcbc_dec_five + vld1.8 {q5}, [r0]! + beq Lcbc_dec_six + vld1.8 {q6}, [r0]! + sub r0, r0, #0x70 + + bl _bsaes_decrypt8 + + vldmia r9, {q14} @ reload IV + vld1.8 {q8,q9}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vld1.8 {q10,q11}, [r0]! + veor q1, q1, q8 + veor q6, q6, q9 + vld1.8 {q12,q13}, [r0]! + veor q4, q4, q10 + veor q2, q2, q11 + vld1.8 {q15}, [r0]! + veor q7, q7, q12 + vst1.8 {q0,q1}, [r1]! @ write output + veor q3, q3, q13 + vst1.8 {q6}, [r1]! + vst1.8 {q4}, [r1]! + vst1.8 {q2}, [r1]! + vst1.8 {q7}, [r1]! + vst1.8 {q3}, [r1]! + b Lcbc_dec_done +.align 4 +Lcbc_dec_six: + sub r0, r0, #0x60 + bl _bsaes_decrypt8 + vldmia r9,{q14} @ reload IV + vld1.8 {q8,q9}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vld1.8 {q10,q11}, [r0]! + veor q1, q1, q8 + veor q6, q6, q9 + vld1.8 {q12}, [r0]! + veor q4, q4, q10 + veor q2, q2, q11 + vld1.8 {q15}, [r0]! + veor q7, q7, q12 + vst1.8 {q0,q1}, [r1]! @ write output + vst1.8 {q6}, [r1]! 
+ vst1.8 {q4}, [r1]! + vst1.8 {q2}, [r1]! + vst1.8 {q7}, [r1]! + b Lcbc_dec_done +.align 4 +Lcbc_dec_five: + sub r0, r0, #0x50 + bl _bsaes_decrypt8 + vldmia r9, {q14} @ reload IV + vld1.8 {q8,q9}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vld1.8 {q10,q11}, [r0]! + veor q1, q1, q8 + veor q6, q6, q9 + vld1.8 {q15}, [r0]! + veor q4, q4, q10 + vst1.8 {q0,q1}, [r1]! @ write output + veor q2, q2, q11 + vst1.8 {q6}, [r1]! + vst1.8 {q4}, [r1]! + vst1.8 {q2}, [r1]! + b Lcbc_dec_done +.align 4 +Lcbc_dec_four: + sub r0, r0, #0x40 + bl _bsaes_decrypt8 + vldmia r9, {q14} @ reload IV + vld1.8 {q8,q9}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vld1.8 {q10}, [r0]! + veor q1, q1, q8 + veor q6, q6, q9 + vld1.8 {q15}, [r0]! + veor q4, q4, q10 + vst1.8 {q0,q1}, [r1]! @ write output + vst1.8 {q6}, [r1]! + vst1.8 {q4}, [r1]! + b Lcbc_dec_done +.align 4 +Lcbc_dec_three: + sub r0, r0, #0x30 + bl _bsaes_decrypt8 + vldmia r9, {q14} @ reload IV + vld1.8 {q8,q9}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vld1.8 {q15}, [r0]! + veor q1, q1, q8 + veor q6, q6, q9 + vst1.8 {q0,q1}, [r1]! @ write output + vst1.8 {q6}, [r1]! + b Lcbc_dec_done +.align 4 +Lcbc_dec_two: + sub r0, r0, #0x20 + bl _bsaes_decrypt8 + vldmia r9, {q14} @ reload IV + vld1.8 {q8}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vld1.8 {q15}, [r0]! @ reload input + veor q1, q1, q8 + vst1.8 {q0,q1}, [r1]! @ write output + b Lcbc_dec_done +.align 4 +Lcbc_dec_one: + sub r0, r0, #0x10 + bl _bsaes_decrypt8 + vldmia r9, {q14} @ reload IV + vld1.8 {q15}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vst1.8 {q0}, [r1]! @ write output + +Lcbc_dec_done: +#ifndef BSAES_ASM_EXTENDED_KEY + vmov.i32 q0, #0 + vmov.i32 q1, #0 +Lcbc_dec_bzero:@ wipe key schedule [if any] + vstmia sp!, {q0,q1} + cmp sp, r9 + bne Lcbc_dec_bzero +#endif + + mov sp, r9 + add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb + vst1.8 {q15}, [r8] @ return IV + VFP_ABI_POP + ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} + +.globl _bsaes_ctr32_encrypt_blocks +.private_extern _bsaes_ctr32_encrypt_blocks +#ifdef __thumb2__ +.thumb_func _bsaes_ctr32_encrypt_blocks +#endif +.align 5 +_bsaes_ctr32_encrypt_blocks: + @ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this + @ out to retain a constant-time implementation. 
+ mov ip, sp + stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} + VFP_ABI_PUSH + ldr r8, [ip] @ ctr is 1st arg on the stack + sub sp, sp, #0x10 @ scratch space to carry over the ctr + mov r9, sp @ save sp + + ldr r10, [r3, #240] @ get # of rounds +#ifndef BSAES_ASM_EXTENDED_KEY + @ allocate the key schedule on the stack + sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key + add r12, #96 @ size of bit-sliced key schedule + + @ populate the key schedule + mov r4, r3 @ pass key + mov r5, r10 @ pass # of rounds + mov sp, r12 @ sp is sp + bl _bsaes_key_convert + veor q7,q7,q15 @ fix up last round key + vstmia r12, {q7} @ save last round key + + vld1.8 {q0}, [r8] @ load counter +#ifdef __APPLE__ + mov r8, #:lower16:(LREVM0SR-LM0) + add r8, r6, r8 +#else + add r8, r6, #LREVM0SR-LM0 @ borrow r8 +#endif + vldmia sp, {q4} @ load round0 key +#else + ldr r12, [r3, #244] + eors r12, #1 + beq 0f + + @ populate the key schedule + str r12, [r3, #244] + mov r4, r3 @ pass key + mov r5, r10 @ pass # of rounds + add r12, r3, #248 @ pass key schedule + bl _bsaes_key_convert + veor q7,q7,q15 @ fix up last round key + vstmia r12, {q7} @ save last round key + +.align 2 + add r12, r3, #248 + vld1.8 {q0}, [r8] @ load counter + adrl r8, LREVM0SR @ borrow r8 + vldmia r12, {q4} @ load round0 key + sub sp, #0x10 @ place for adjusted round0 key +#endif + + vmov.i32 q8,#1 @ compose 1<<96 + veor q9,q9,q9 + vrev32.8 q0,q0 + vext.8 q8,q9,q8,#4 + vrev32.8 q4,q4 + vadd.u32 q9,q8,q8 @ compose 2<<96 + vstmia sp, {q4} @ save adjusted round0 key + b Lctr_enc_loop + +.align 4 +Lctr_enc_loop: + vadd.u32 q10, q8, q9 @ compose 3<<96 + vadd.u32 q1, q0, q8 @ +1 + vadd.u32 q2, q0, q9 @ +2 + vadd.u32 q3, q0, q10 @ +3 + vadd.u32 q4, q1, q10 + vadd.u32 q5, q2, q10 + vadd.u32 q6, q3, q10 + vadd.u32 q7, q4, q10 + vadd.u32 q10, q5, q10 @ next counter + + @ Borrow prologue from _bsaes_encrypt8 to use the opportunity + @ to flip byte order in 32-bit counter + + vldmia sp, {q9} @ load round0 key +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x10 @ pass next round key +#else + add r4, r3, #264 +#endif + vldmia r8, {q8} @ LREVM0SR + mov r5, r10 @ pass rounds + vstmia r9, {q10} @ save next counter +#ifdef __APPLE__ + mov r6, #:lower16:(LREVM0SR-LSR) + sub r6, r8, r6 +#else + sub r6, r8, #LREVM0SR-LSR @ pass constants +#endif + + bl _bsaes_encrypt8_alt + + subs r2, r2, #8 + blo Lctr_enc_loop_done + + vld1.8 {q8,q9}, [r0]! @ load input + vld1.8 {q10,q11}, [r0]! + veor q0, q8 + veor q1, q9 + vld1.8 {q12,q13}, [r0]! + veor q4, q10 + veor q6, q11 + vld1.8 {q14,q15}, [r0]! + veor q3, q12 + vst1.8 {q0,q1}, [r1]! @ write output + veor q7, q13 + veor q2, q14 + vst1.8 {q4}, [r1]! + veor q5, q15 + vst1.8 {q6}, [r1]! + vmov.i32 q8, #1 @ compose 1<<96 + vst1.8 {q3}, [r1]! + veor q9, q9, q9 + vst1.8 {q7}, [r1]! + vext.8 q8, q9, q8, #4 + vst1.8 {q2}, [r1]! + vadd.u32 q9,q8,q8 @ compose 2<<96 + vst1.8 {q5}, [r1]! + vldmia r9, {q0} @ load counter + + bne Lctr_enc_loop + b Lctr_enc_done + +.align 4 +Lctr_enc_loop_done: + add r2, r2, #8 + vld1.8 {q8}, [r0]! @ load input + veor q0, q8 + vst1.8 {q0}, [r1]! @ write output + cmp r2, #2 + blo Lctr_enc_done + vld1.8 {q9}, [r0]! + veor q1, q9 + vst1.8 {q1}, [r1]! + beq Lctr_enc_done + vld1.8 {q10}, [r0]! + veor q4, q10 + vst1.8 {q4}, [r1]! + cmp r2, #4 + blo Lctr_enc_done + vld1.8 {q11}, [r0]! + veor q6, q11 + vst1.8 {q6}, [r1]! + beq Lctr_enc_done + vld1.8 {q12}, [r0]! + veor q3, q12 + vst1.8 {q3}, [r1]! + cmp r2, #6 + blo Lctr_enc_done + vld1.8 {q13}, [r0]! + veor q7, q13 + vst1.8 {q7}, [r1]! 
+ beq Lctr_enc_done + vld1.8 {q14}, [r0] + veor q2, q14 + vst1.8 {q2}, [r1]! + +Lctr_enc_done: + vmov.i32 q0, #0 + vmov.i32 q1, #0 +#ifndef BSAES_ASM_EXTENDED_KEY +Lctr_enc_bzero:@ wipe key schedule [if any] + vstmia sp!, {q0,q1} + cmp sp, r9 + bne Lctr_enc_bzero +#else + vstmia sp, {q0,q1} +#endif + + mov sp, r9 + add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb + VFP_ABI_POP + ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return + + @ OpenSSL contains aes_nohw_* fallback code here. We patch this + @ out to retain a constant-time implementation. + +#endif +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/ghash-armv4.S b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/ghash-armv4.S new file mode 100644 index 0000000000..fccd57d30e --- /dev/null +++ b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/ghash-armv4.S @@ -0,0 +1,600 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include + +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL +@ instructions are in aesv8-armx.pl.) + + +.text +#if defined(__thumb2__) || defined(__clang__) +.syntax unified +#define ldrplb ldrbpl +#define ldrneb ldrbne +#endif +#if defined(__thumb2__) +.thumb +#else +.code 32 +#endif + + +.align 5 +rem_4bit: +.short 0x0000,0x1C20,0x3840,0x2460 +.short 0x7080,0x6CA0,0x48C0,0x54E0 +.short 0xE100,0xFD20,0xD940,0xC560 +.short 0x9180,0x8DA0,0xA9C0,0xB5E0 + + +#ifdef __thumb2__ +.thumb_func rem_4bit_get +#endif +rem_4bit_get: +#if defined(__thumb2__) + adr r2,rem_4bit +#else + sub r2,pc,#8+32 @ &rem_4bit +#endif + b Lrem_4bit_got + nop + nop + + +.globl _gcm_ghash_4bit +.private_extern _gcm_ghash_4bit +#ifdef __thumb2__ +.thumb_func _gcm_ghash_4bit +#endif +.align 4 +_gcm_ghash_4bit: +#if defined(__thumb2__) + adr r12,rem_4bit +#else + sub r12,pc,#8+48 @ &rem_4bit +#endif + add r3,r2,r3 @ r3 to point at the end + stmdb sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr} @ save r3/end too + + ldmia r12,{r4,r5,r6,r7,r8,r9,r10,r11} @ copy rem_4bit ... + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} @ ... 
to stack + + ldrb r12,[r2,#15] + ldrb r14,[r0,#15] +Louter: + eor r12,r12,r14 + and r14,r12,#0xf0 + and r12,r12,#0x0f + mov r3,#14 + + add r7,r1,r12,lsl#4 + ldmia r7,{r4,r5,r6,r7} @ load Htbl[nlo] + add r11,r1,r14 + ldrb r12,[r2,#14] + + and r14,r4,#0xf @ rem + ldmia r11,{r8,r9,r10,r11} @ load Htbl[nhi] + add r14,r14,r14 + eor r4,r8,r4,lsr#4 + ldrh r8,[sp,r14] @ rem_4bit[rem] + eor r4,r4,r5,lsl#28 + ldrb r14,[r0,#14] + eor r5,r9,r5,lsr#4 + eor r5,r5,r6,lsl#28 + eor r6,r10,r6,lsr#4 + eor r6,r6,r7,lsl#28 + eor r7,r11,r7,lsr#4 + eor r12,r12,r14 + and r14,r12,#0xf0 + and r12,r12,#0x0f + eor r7,r7,r8,lsl#16 + +Linner: + add r11,r1,r12,lsl#4 + and r12,r4,#0xf @ rem + subs r3,r3,#1 + add r12,r12,r12 + ldmia r11,{r8,r9,r10,r11} @ load Htbl[nlo] + eor r4,r8,r4,lsr#4 + eor r4,r4,r5,lsl#28 + eor r5,r9,r5,lsr#4 + eor r5,r5,r6,lsl#28 + ldrh r8,[sp,r12] @ rem_4bit[rem] + eor r6,r10,r6,lsr#4 +#ifdef __thumb2__ + it pl +#endif + ldrplb r12,[r2,r3] + eor r6,r6,r7,lsl#28 + eor r7,r11,r7,lsr#4 + + add r11,r1,r14 + and r14,r4,#0xf @ rem + eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem] + add r14,r14,r14 + ldmia r11,{r8,r9,r10,r11} @ load Htbl[nhi] + eor r4,r8,r4,lsr#4 +#ifdef __thumb2__ + it pl +#endif + ldrplb r8,[r0,r3] + eor r4,r4,r5,lsl#28 + eor r5,r9,r5,lsr#4 + ldrh r9,[sp,r14] + eor r5,r5,r6,lsl#28 + eor r6,r10,r6,lsr#4 + eor r6,r6,r7,lsl#28 +#ifdef __thumb2__ + it pl +#endif + eorpl r12,r12,r8 + eor r7,r11,r7,lsr#4 +#ifdef __thumb2__ + itt pl +#endif + andpl r14,r12,#0xf0 + andpl r12,r12,#0x0f + eor r7,r7,r9,lsl#16 @ ^= rem_4bit[rem] + bpl Linner + + ldr r3,[sp,#32] @ re-load r3/end + add r2,r2,#16 + mov r14,r4 +#if __ARM_ARCH__>=7 && defined(__ARMEL__) + rev r4,r4 + str r4,[r0,#12] +#elif defined(__ARMEB__) + str r4,[r0,#12] +#else + mov r9,r4,lsr#8 + strb r4,[r0,#12+3] + mov r10,r4,lsr#16 + strb r9,[r0,#12+2] + mov r11,r4,lsr#24 + strb r10,[r0,#12+1] + strb r11,[r0,#12] +#endif + cmp r2,r3 +#if __ARM_ARCH__>=7 && defined(__ARMEL__) + rev r5,r5 + str r5,[r0,#8] +#elif defined(__ARMEB__) + str r5,[r0,#8] +#else + mov r9,r5,lsr#8 + strb r5,[r0,#8+3] + mov r10,r5,lsr#16 + strb r9,[r0,#8+2] + mov r11,r5,lsr#24 + strb r10,[r0,#8+1] + strb r11,[r0,#8] +#endif + +#ifdef __thumb2__ + it ne +#endif + ldrneb r12,[r2,#15] +#if __ARM_ARCH__>=7 && defined(__ARMEL__) + rev r6,r6 + str r6,[r0,#4] +#elif defined(__ARMEB__) + str r6,[r0,#4] +#else + mov r9,r6,lsr#8 + strb r6,[r0,#4+3] + mov r10,r6,lsr#16 + strb r9,[r0,#4+2] + mov r11,r6,lsr#24 + strb r10,[r0,#4+1] + strb r11,[r0,#4] +#endif + +#if __ARM_ARCH__>=7 && defined(__ARMEL__) + rev r7,r7 + str r7,[r0,#0] +#elif defined(__ARMEB__) + str r7,[r0,#0] +#else + mov r9,r7,lsr#8 + strb r7,[r0,#0+3] + mov r10,r7,lsr#16 + strb r9,[r0,#0+2] + mov r11,r7,lsr#24 + strb r10,[r0,#0+1] + strb r11,[r0,#0] +#endif + + bne Louter + + add sp,sp,#36 +#if __ARM_ARCH__>=5 + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} +#else + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif + + +.globl _gcm_gmult_4bit +.private_extern _gcm_gmult_4bit +#ifdef __thumb2__ +.thumb_func _gcm_gmult_4bit +#endif +_gcm_gmult_4bit: + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr} + ldrb r12,[r0,#15] + b rem_4bit_get +Lrem_4bit_got: + and r14,r12,#0xf0 + and r12,r12,#0x0f + mov r3,#14 + + add r7,r1,r12,lsl#4 + ldmia r7,{r4,r5,r6,r7} @ load Htbl[nlo] + ldrb r12,[r0,#14] + + add r11,r1,r14 + and r14,r4,#0xf @ rem + ldmia r11,{r8,r9,r10,r11} @ load Htbl[nhi] + add r14,r14,r14 + eor r4,r8,r4,lsr#4 + ldrh r8,[r2,r14] @ 
rem_4bit[rem] + eor r4,r4,r5,lsl#28 + eor r5,r9,r5,lsr#4 + eor r5,r5,r6,lsl#28 + eor r6,r10,r6,lsr#4 + eor r6,r6,r7,lsl#28 + eor r7,r11,r7,lsr#4 + and r14,r12,#0xf0 + eor r7,r7,r8,lsl#16 + and r12,r12,#0x0f + +Loop: + add r11,r1,r12,lsl#4 + and r12,r4,#0xf @ rem + subs r3,r3,#1 + add r12,r12,r12 + ldmia r11,{r8,r9,r10,r11} @ load Htbl[nlo] + eor r4,r8,r4,lsr#4 + eor r4,r4,r5,lsl#28 + eor r5,r9,r5,lsr#4 + eor r5,r5,r6,lsl#28 + ldrh r8,[r2,r12] @ rem_4bit[rem] + eor r6,r10,r6,lsr#4 +#ifdef __thumb2__ + it pl +#endif + ldrplb r12,[r0,r3] + eor r6,r6,r7,lsl#28 + eor r7,r11,r7,lsr#4 + + add r11,r1,r14 + and r14,r4,#0xf @ rem + eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem] + add r14,r14,r14 + ldmia r11,{r8,r9,r10,r11} @ load Htbl[nhi] + eor r4,r8,r4,lsr#4 + eor r4,r4,r5,lsl#28 + eor r5,r9,r5,lsr#4 + ldrh r8,[r2,r14] @ rem_4bit[rem] + eor r5,r5,r6,lsl#28 + eor r6,r10,r6,lsr#4 + eor r6,r6,r7,lsl#28 + eor r7,r11,r7,lsr#4 +#ifdef __thumb2__ + itt pl +#endif + andpl r14,r12,#0xf0 + andpl r12,r12,#0x0f + eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem] + bpl Loop +#if __ARM_ARCH__>=7 && defined(__ARMEL__) + rev r4,r4 + str r4,[r0,#12] +#elif defined(__ARMEB__) + str r4,[r0,#12] +#else + mov r9,r4,lsr#8 + strb r4,[r0,#12+3] + mov r10,r4,lsr#16 + strb r9,[r0,#12+2] + mov r11,r4,lsr#24 + strb r10,[r0,#12+1] + strb r11,[r0,#12] +#endif + +#if __ARM_ARCH__>=7 && defined(__ARMEL__) + rev r5,r5 + str r5,[r0,#8] +#elif defined(__ARMEB__) + str r5,[r0,#8] +#else + mov r9,r5,lsr#8 + strb r5,[r0,#8+3] + mov r10,r5,lsr#16 + strb r9,[r0,#8+2] + mov r11,r5,lsr#24 + strb r10,[r0,#8+1] + strb r11,[r0,#8] +#endif + +#if __ARM_ARCH__>=7 && defined(__ARMEL__) + rev r6,r6 + str r6,[r0,#4] +#elif defined(__ARMEB__) + str r6,[r0,#4] +#else + mov r9,r6,lsr#8 + strb r6,[r0,#4+3] + mov r10,r6,lsr#16 + strb r9,[r0,#4+2] + mov r11,r6,lsr#24 + strb r10,[r0,#4+1] + strb r11,[r0,#4] +#endif + +#if __ARM_ARCH__>=7 && defined(__ARMEL__) + rev r7,r7 + str r7,[r0,#0] +#elif defined(__ARMEB__) + str r7,[r0,#0] +#else + mov r9,r7,lsr#8 + strb r7,[r0,#0+3] + mov r10,r7,lsr#16 + strb r9,[r0,#0+2] + mov r11,r7,lsr#24 + strb r10,[r0,#0+1] + strb r11,[r0,#0] +#endif + +#if __ARM_ARCH__>=5 + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} +#else + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif + +#if __ARM_MAX_ARCH__>=7 + + + +.globl _gcm_init_neon +.private_extern _gcm_init_neon +#ifdef __thumb2__ +.thumb_func _gcm_init_neon +#endif +.align 4 +_gcm_init_neon: + vld1.64 d7,[r1]! @ load H + vmov.i8 q8,#0xe1 + vld1.64 d6,[r1] + vshl.i64 d17,#57 + vshr.u64 d16,#63 @ t0=0xc2....01 + vdup.8 q9,d7[7] + vshr.u64 d26,d6,#63 + vshr.s8 q9,#7 @ broadcast carry bit + vshl.i64 q3,q3,#1 + vand q8,q8,q9 + vorr d7,d26 @ H<<<=1 + veor q3,q3,q8 @ twisted H + vstmia r0,{q3} + + bx lr @ bx lr + + +.globl _gcm_gmult_neon +.private_extern _gcm_gmult_neon +#ifdef __thumb2__ +.thumb_func _gcm_gmult_neon +#endif +.align 4 +_gcm_gmult_neon: + vld1.64 d7,[r0]! @ load Xi + vld1.64 d6,[r0]! + vmov.i64 d29,#0x0000ffffffffffff + vldmia r1,{d26,d27} @ load twisted H + vmov.i64 d30,#0x00000000ffffffff +#ifdef __ARMEL__ + vrev64.8 q3,q3 +#endif + vmov.i64 d31,#0x000000000000ffff + veor d28,d26,d27 @ Karatsuba pre-processing + mov r3,#16 + b Lgmult_neon + + +.globl _gcm_ghash_neon +.private_extern _gcm_ghash_neon +#ifdef __thumb2__ +.thumb_func _gcm_ghash_neon +#endif +.align 4 +_gcm_ghash_neon: + vld1.64 d1,[r0]! @ load Xi + vld1.64 d0,[r0]! 
+ vmov.i64 d29,#0x0000ffffffffffff + vldmia r1,{d26,d27} @ load twisted H + vmov.i64 d30,#0x00000000ffffffff +#ifdef __ARMEL__ + vrev64.8 q0,q0 +#endif + vmov.i64 d31,#0x000000000000ffff + veor d28,d26,d27 @ Karatsuba pre-processing + +Loop_neon: + vld1.64 d7,[r2]! @ load inp + vld1.64 d6,[r2]! +#ifdef __ARMEL__ + vrev64.8 q3,q3 +#endif + veor q3,q0 @ inp^=Xi +Lgmult_neon: + vext.8 d16, d26, d26, #1 @ A1 + vmull.p8 q8, d16, d6 @ F = A1*B + vext.8 d0, d6, d6, #1 @ B1 + vmull.p8 q0, d26, d0 @ E = A*B1 + vext.8 d18, d26, d26, #2 @ A2 + vmull.p8 q9, d18, d6 @ H = A2*B + vext.8 d22, d6, d6, #2 @ B2 + vmull.p8 q11, d26, d22 @ G = A*B2 + vext.8 d20, d26, d26, #3 @ A3 + veor q8, q8, q0 @ L = E + F + vmull.p8 q10, d20, d6 @ J = A3*B + vext.8 d0, d6, d6, #3 @ B3 + veor q9, q9, q11 @ M = G + H + vmull.p8 q0, d26, d0 @ I = A*B3 + veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 + vand d17, d17, d29 + vext.8 d22, d6, d6, #4 @ B4 + veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 + vand d19, d19, d30 + vmull.p8 q11, d26, d22 @ K = A*B4 + veor q10, q10, q0 @ N = I + J + veor d16, d16, d17 + veor d18, d18, d19 + veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 + vand d21, d21, d31 + vext.8 q8, q8, q8, #15 + veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 + vmov.i64 d23, #0 + vext.8 q9, q9, q9, #14 + veor d20, d20, d21 + vmull.p8 q0, d26, d6 @ D = A*B + vext.8 q11, q11, q11, #12 + vext.8 q10, q10, q10, #13 + veor q8, q8, q9 + veor q10, q10, q11 + veor q0, q0, q8 + veor q0, q0, q10 + veor d6,d6,d7 @ Karatsuba pre-processing + vext.8 d16, d28, d28, #1 @ A1 + vmull.p8 q8, d16, d6 @ F = A1*B + vext.8 d2, d6, d6, #1 @ B1 + vmull.p8 q1, d28, d2 @ E = A*B1 + vext.8 d18, d28, d28, #2 @ A2 + vmull.p8 q9, d18, d6 @ H = A2*B + vext.8 d22, d6, d6, #2 @ B2 + vmull.p8 q11, d28, d22 @ G = A*B2 + vext.8 d20, d28, d28, #3 @ A3 + veor q8, q8, q1 @ L = E + F + vmull.p8 q10, d20, d6 @ J = A3*B + vext.8 d2, d6, d6, #3 @ B3 + veor q9, q9, q11 @ M = G + H + vmull.p8 q1, d28, d2 @ I = A*B3 + veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 + vand d17, d17, d29 + vext.8 d22, d6, d6, #4 @ B4 + veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 + vand d19, d19, d30 + vmull.p8 q11, d28, d22 @ K = A*B4 + veor q10, q10, q1 @ N = I + J + veor d16, d16, d17 + veor d18, d18, d19 + veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 + vand d21, d21, d31 + vext.8 q8, q8, q8, #15 + veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 + vmov.i64 d23, #0 + vext.8 q9, q9, q9, #14 + veor d20, d20, d21 + vmull.p8 q1, d28, d6 @ D = A*B + vext.8 q11, q11, q11, #12 + vext.8 q10, q10, q10, #13 + veor q8, q8, q9 + veor q10, q10, q11 + veor q1, q1, q8 + veor q1, q1, q10 + vext.8 d16, d27, d27, #1 @ A1 + vmull.p8 q8, d16, d7 @ F = A1*B + vext.8 d4, d7, d7, #1 @ B1 + vmull.p8 q2, d27, d4 @ E = A*B1 + vext.8 d18, d27, d27, #2 @ A2 + vmull.p8 q9, d18, d7 @ H = A2*B + vext.8 d22, d7, d7, #2 @ B2 + vmull.p8 q11, d27, d22 @ G = A*B2 + vext.8 d20, d27, d27, #3 @ A3 + veor q8, q8, q2 @ L = E + F + vmull.p8 q10, d20, d7 @ J = A3*B + vext.8 d4, d7, d7, #3 @ B3 + veor q9, q9, q11 @ M = G + H + vmull.p8 q2, d27, d4 @ I = A*B3 + veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 + vand d17, d17, d29 + vext.8 d22, d7, d7, #4 @ B4 + veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 + vand d19, d19, d30 + vmull.p8 q11, d27, d22 @ K = A*B4 + veor q10, q10, q2 @ N = I + J + veor d16, d16, d17 + veor d18, d18, d19 + veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 + vand d21, d21, d31 + vext.8 q8, q8, q8, #15 + veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 + vmov.i64 d23, #0 + vext.8 q9, q9, q9, #14 + veor d20, d20, 
d21 + vmull.p8 q2, d27, d7 @ D = A*B + vext.8 q11, q11, q11, #12 + vext.8 q10, q10, q10, #13 + veor q8, q8, q9 + veor q10, q10, q11 + veor q2, q2, q8 + veor q2, q2, q10 + veor q1,q1,q0 @ Karatsuba post-processing + veor q1,q1,q2 + veor d1,d1,d2 + veor d4,d4,d3 @ Xh|Xl - 256-bit result + + @ equivalent of reduction_avx from ghash-x86_64.pl + vshl.i64 q9,q0,#57 @ 1st phase + vshl.i64 q10,q0,#62 + veor q10,q10,q9 @ + vshl.i64 q9,q0,#63 + veor q10, q10, q9 @ + veor d1,d1,d20 @ + veor d4,d4,d21 + + vshr.u64 q10,q0,#1 @ 2nd phase + veor q2,q2,q0 + veor q0,q0,q10 @ + vshr.u64 q10,q10,#6 + vshr.u64 q0,q0,#1 @ + veor q0,q0,q2 @ + veor q0,q0,q10 @ + + subs r3,#16 + bne Loop_neon + +#ifdef __ARMEL__ + vrev64.8 q0,q0 +#endif + sub r0,#16 + vst1.64 d1,[r0]! @ write out Xi + vst1.64 d0,[r0] + + bx lr @ bx lr + +#endif +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/ghashv8-armx32.S b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/ghashv8-armx32.S new file mode 100644 index 0000000000..f5de67f037 --- /dev/null +++ b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/ghashv8-armx32.S @@ -0,0 +1,256 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include + +.text + +.code 32 +#undef __thumb2__ +.globl _gcm_init_v8 +.private_extern _gcm_init_v8 +#ifdef __thumb2__ +.thumb_func _gcm_init_v8 +#endif +.align 4 +_gcm_init_v8: + vld1.64 {q9},[r1] @ load input H + vmov.i8 q11,#0xe1 + vshl.i64 q11,q11,#57 @ 0xc2.0 + vext.8 q3,q9,q9,#8 + vshr.u64 q10,q11,#63 + vdup.32 q9,d18[1] + vext.8 q8,q10,q11,#8 @ t0=0xc2....01 + vshr.u64 q10,q3,#63 + vshr.s32 q9,q9,#31 @ broadcast carry bit + vand q10,q10,q8 + vshl.i64 q3,q3,#1 + vext.8 q10,q10,q10,#8 + vand q8,q8,q9 + vorr q3,q3,q10 @ H<<<=1 + veor q12,q3,q8 @ twisted H + vst1.64 {q12},[r0]! @ store Htable[0] + + @ calculate H^2 + vext.8 q8,q12,q12,#8 @ Karatsuba pre-processing +.byte 0xa8,0x0e,0xa8,0xf2 @ pmull q0,q12,q12 + veor q8,q8,q12 +.byte 0xa9,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q12 +.byte 0xa0,0x2e,0xa0,0xf2 @ pmull q1,q8,q8 + + vext.8 q9,q0,q2,#8 @ Karatsuba post-processing + veor q10,q0,q2 + veor q1,q1,q9 + veor q1,q1,q10 +.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase + + vmov d4,d3 @ Xh|Xm - 256-bit result + vmov d3,d0 @ Xm is rotated Xl + veor q0,q1,q10 + + vext.8 q10,q0,q0,#8 @ 2nd phase +.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 + veor q10,q10,q2 + veor q14,q0,q10 + + vext.8 q9,q14,q14,#8 @ Karatsuba pre-processing + veor q9,q9,q14 + vext.8 q13,q8,q9,#8 @ pack Karatsuba pre-processed + vst1.64 {q13,q14},[r0] @ store Htable[1..2] + + bx lr + +.globl _gcm_gmult_v8 +.private_extern _gcm_gmult_v8 +#ifdef __thumb2__ +.thumb_func _gcm_gmult_v8 +#endif +.align 4 +_gcm_gmult_v8: + vld1.64 {q9},[r0] @ load Xi + vmov.i8 q11,#0xe1 + vld1.64 {q12,q13},[r1] @ load twisted H, ... 
+ vshl.u64 q11,q11,#57 +#ifndef __ARMEB__ + vrev64.8 q9,q9 +#endif + vext.8 q3,q9,q9,#8 + +.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo + veor q9,q9,q3 @ Karatsuba pre-processing +.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi +.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) + + vext.8 q9,q0,q2,#8 @ Karatsuba post-processing + veor q10,q0,q2 + veor q1,q1,q9 + veor q1,q1,q10 +.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction + + vmov d4,d3 @ Xh|Xm - 256-bit result + vmov d3,d0 @ Xm is rotated Xl + veor q0,q1,q10 + + vext.8 q10,q0,q0,#8 @ 2nd phase of reduction +.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 + veor q10,q10,q2 + veor q0,q0,q10 + +#ifndef __ARMEB__ + vrev64.8 q0,q0 +#endif + vext.8 q0,q0,q0,#8 + vst1.64 {q0},[r0] @ write out Xi + + bx lr + +.globl _gcm_ghash_v8 +.private_extern _gcm_ghash_v8 +#ifdef __thumb2__ +.thumb_func _gcm_ghash_v8 +#endif +.align 4 +_gcm_ghash_v8: + vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ 32-bit ABI says so + vld1.64 {q0},[r0] @ load [rotated] Xi + @ "[rotated]" means that + @ loaded value would have + @ to be rotated in order to + @ make it appear as in + @ algorithm specification + subs r3,r3,#32 @ see if r3 is 32 or larger + mov r12,#16 @ r12 is used as post- + @ increment for input pointer; + @ as loop is modulo-scheduled + @ r12 is zeroed just in time + @ to preclude overstepping + @ inp[len], which means that + @ last block[s] are actually + @ loaded twice, but last + @ copy is not processed + vld1.64 {q12,q13},[r1]! @ load twisted H, ..., H^2 + vmov.i8 q11,#0xe1 + vld1.64 {q14},[r1] + moveq r12,#0 @ is it time to zero r12? + vext.8 q0,q0,q0,#8 @ rotate Xi + vld1.64 {q8},[r2]! @ load [rotated] I[0] + vshl.u64 q11,q11,#57 @ compose 0xc2.0 constant +#ifndef __ARMEB__ + vrev64.8 q8,q8 + vrev64.8 q0,q0 +#endif + vext.8 q3,q8,q8,#8 @ rotate I[0] + blo Lodd_tail_v8 @ r3 was less than 32 + vld1.64 {q9},[r2],r12 @ load [rotated] I[1] +#ifndef __ARMEB__ + vrev64.8 q9,q9 +#endif + vext.8 q7,q9,q9,#8 + veor q3,q3,q0 @ I[i]^=Xi +.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 + veor q9,q9,q7 @ Karatsuba pre-processing +.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7 + b Loop_mod2x_v8 + +.align 4 +Loop_mod2x_v8: + vext.8 q10,q3,q3,#8 + subs r3,r3,#32 @ is there more data? +.byte 0x86,0x0e,0xac,0xf2 @ pmull q0,q14,q3 @ H^2.lo·Xi.lo + movlo r12,#0 @ is it time to zero r12? + +.byte 0xa2,0xae,0xaa,0xf2 @ pmull q5,q13,q9 + veor q10,q10,q3 @ Karatsuba pre-processing +.byte 0x87,0x4e,0xad,0xf2 @ pmull2 q2,q14,q3 @ H^2.hi·Xi.hi + veor q0,q0,q4 @ accumulate +.byte 0xa5,0x2e,0xab,0xf2 @ pmull2 q1,q13,q10 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) + vld1.64 {q8},[r2],r12 @ load [rotated] I[i+2] + + veor q2,q2,q6 + moveq r12,#0 @ is it time to zero r12? 
+ veor q1,q1,q5 + + vext.8 q9,q0,q2,#8 @ Karatsuba post-processing + veor q10,q0,q2 + veor q1,q1,q9 + vld1.64 {q9},[r2],r12 @ load [rotated] I[i+3] +#ifndef __ARMEB__ + vrev64.8 q8,q8 +#endif + veor q1,q1,q10 +.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction + +#ifndef __ARMEB__ + vrev64.8 q9,q9 +#endif + vmov d4,d3 @ Xh|Xm - 256-bit result + vmov d3,d0 @ Xm is rotated Xl + vext.8 q7,q9,q9,#8 + vext.8 q3,q8,q8,#8 + veor q0,q1,q10 +.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 + veor q3,q3,q2 @ accumulate q3 early + + vext.8 q10,q0,q0,#8 @ 2nd phase of reduction +.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 + veor q3,q3,q10 + veor q9,q9,q7 @ Karatsuba pre-processing + veor q3,q3,q0 +.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7 + bhs Loop_mod2x_v8 @ there was at least 32 more bytes + + veor q2,q2,q10 + vext.8 q3,q8,q8,#8 @ re-construct q3 + adds r3,r3,#32 @ re-construct r3 + veor q0,q0,q2 @ re-construct q0 + beq Ldone_v8 @ is r3 zero? +Lodd_tail_v8: + vext.8 q10,q0,q0,#8 + veor q3,q3,q0 @ inp^=Xi + veor q9,q8,q10 @ q9 is rotated inp^Xi + +.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo + veor q9,q9,q3 @ Karatsuba pre-processing +.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi +.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) + + vext.8 q9,q0,q2,#8 @ Karatsuba post-processing + veor q10,q0,q2 + veor q1,q1,q9 + veor q1,q1,q10 +.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction + + vmov d4,d3 @ Xh|Xm - 256-bit result + vmov d3,d0 @ Xm is rotated Xl + veor q0,q1,q10 + + vext.8 q10,q0,q0,#8 @ 2nd phase of reduction +.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 + veor q10,q10,q2 + veor q0,q0,q10 + +Ldone_v8: +#ifndef __ARMEB__ + vrev64.8 q0,q0 +#endif + vext.8 q0,q0,q0,#8 + vst1.64 {q0},[r0] @ write out Xi + + vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ 32-bit ABI says so + bx lr + +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/sha1-armv4-large.S b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/sha1-armv4-large.S new file mode 100644 index 0000000000..82ac8df4fc --- /dev/null +++ b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/sha1-armv4-large.S @@ -0,0 +1,1518 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +#include + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +.globl _sha1_block_data_order +.private_extern _sha1_block_data_order +#ifdef __thumb2__ +.thumb_func _sha1_block_data_order +#endif + +.align 5 +_sha1_block_data_order: +#if __ARM_MAX_ARCH__>=7 +Lsha1_block: + adr r3,Lsha1_block + ldr r12,LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P +#ifdef __APPLE__ + ldr r12,[r12] +#endif + tst r12,#ARMV8_SHA1 + bne LARMv8 + tst r12,#ARMV7_NEON + bne LNEON +#endif + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 + ldmia r0,{r3,r4,r5,r6,r7} +Lloop: + ldr r8,LK_00_19 + mov r14,sp + sub sp,sp,#15*4 + mov r5,r5,ror#30 + mov r6,r6,ror#30 + mov r7,r7,ror#30 @ [6] +L_00_15: +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r7,r8,r7,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r5,r6 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r7,r8,r7,ror#2 @ E+=K_00_19 + eor r10,r5,r6 @ F_xx_xx + add r7,r7,r3,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r4,r10,ror#2 + add r7,r7,r9 @ E+=X[i] + eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r7,r7,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r6,r8,r6,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r4,r5 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r6,r8,r6,ror#2 @ E+=K_00_19 + eor r10,r4,r5 @ F_xx_xx + add r6,r6,r7,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r3,r10,ror#2 + add r6,r6,r9 @ E+=X[i] + eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r6,r6,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r5,r8,r5,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r3,r4 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r5,r8,r5,ror#2 @ E+=K_00_19 + eor r10,r3,r4 @ F_xx_xx + add r5,r5,r6,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r7,r10,ror#2 + add r5,r5,r9 @ E+=X[i] + eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r5,r5,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r4,r8,r4,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r7,r3 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r4,r8,r4,ror#2 @ E+=K_00_19 + eor r10,r7,r3 @ F_xx_xx + add r4,r4,r5,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r6,r10,ror#2 + add r4,r4,r9 @ E+=X[i] + eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! 
+ add r4,r4,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r3,r8,r3,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r6,r7 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r3,r8,r3,ror#2 @ E+=K_00_19 + eor r10,r6,r7 @ F_xx_xx + add r3,r3,r4,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r5,r10,ror#2 + add r3,r3,r9 @ E+=X[i] + eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r3,r3,r10 @ E+=F_00_19(B,C,D) +#if defined(__thumb2__) + mov r12,sp + teq r14,r12 +#else + teq r14,sp +#endif + bne L_00_15 @ [((11+4)*5+2)*3] + sub sp,sp,#25*4 +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r7,r8,r7,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r5,r6 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r7,r8,r7,ror#2 @ E+=K_00_19 + eor r10,r5,r6 @ F_xx_xx + add r7,r7,r3,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r4,r10,ror#2 + add r7,r7,r9 @ E+=X[i] + eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r7,r7,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r6,r8,r6,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r4,r5 @ F_xx_xx + mov r9,r9,ror#31 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r3,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r6,r6,r9 @ E+=X[i] + eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) + add r6,r6,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r5,r8,r5,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r3,r4 @ F_xx_xx + mov r9,r9,ror#31 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r7,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r5,r5,r9 @ E+=X[i] + eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) + add r5,r5,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r4,r8,r4,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r7,r3 @ F_xx_xx + mov r9,r9,ror#31 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r6,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r4,r4,r9 @ E+=X[i] + eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) + add r4,r4,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r3,r8,r3,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r6,r7 @ F_xx_xx + mov r9,r9,ror#31 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r5,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r3,r3,r9 @ E+=X[i] + eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) + add r3,r3,r10 @ E+=F_00_19(B,C,D) + + ldr r8,LK_20_39 @ [+15+16*4] + cmn sp,#0 @ [+3], clear carry to denote 20_39 +L_20_39_or_60_79: + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r7,r8,r7,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r5,r6 @ F_xx_xx + mov r9,r9,ror#31 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! 
+ eor r10,r4,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r7,r7,r9 @ E+=X[i] + add r7,r7,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r6,r8,r6,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r4,r5 @ F_xx_xx + mov r9,r9,ror#31 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r3,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r6,r6,r9 @ E+=X[i] + add r6,r6,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r5,r8,r5,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r3,r4 @ F_xx_xx + mov r9,r9,ror#31 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r7,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r5,r5,r9 @ E+=X[i] + add r5,r5,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r4,r8,r4,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r7,r3 @ F_xx_xx + mov r9,r9,ror#31 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r6,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r4,r4,r9 @ E+=X[i] + add r4,r4,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r3,r8,r3,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r6,r7 @ F_xx_xx + mov r9,r9,ror#31 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r5,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r3,r3,r9 @ E+=X[i] + add r3,r3,r10 @ E+=F_20_39(B,C,D) +#if defined(__thumb2__) + mov r12,sp + teq r14,r12 +#else + teq r14,sp @ preserve carry +#endif + bne L_20_39_or_60_79 @ [+((12+3)*5+2)*4] + bcs L_done @ [+((12+3)*5+2)*4], spare 300 bytes + + ldr r8,LK_40_59 + sub sp,sp,#20*4 @ [+2] +L_40_59: + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r7,r8,r7,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r5,r6 @ F_xx_xx + mov r9,r9,ror#31 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r4,r10,ror#2 @ F_xx_xx + and r11,r5,r6 @ F_xx_xx + add r7,r7,r9 @ E+=X[i] + add r7,r7,r10 @ E+=F_40_59(B,C,D) + add r7,r7,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r6,r8,r6,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r4,r5 @ F_xx_xx + mov r9,r9,ror#31 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r3,r10,ror#2 @ F_xx_xx + and r11,r4,r5 @ F_xx_xx + add r6,r6,r9 @ E+=X[i] + add r6,r6,r10 @ E+=F_40_59(B,C,D) + add r6,r6,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r5,r8,r5,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r3,r4 @ F_xx_xx + mov r9,r9,ror#31 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r7,r10,ror#2 @ F_xx_xx + and r11,r3,r4 @ F_xx_xx + add r5,r5,r9 @ E+=X[i] + add r5,r5,r10 @ E+=F_40_59(B,C,D) + add r5,r5,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r4,r8,r4,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r7,r3 @ F_xx_xx + mov r9,r9,ror#31 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! 
+ and r10,r6,r10,ror#2 @ F_xx_xx + and r11,r7,r3 @ F_xx_xx + add r4,r4,r9 @ E+=X[i] + add r4,r4,r10 @ E+=F_40_59(B,C,D) + add r4,r4,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r3,r8,r3,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r6,r7 @ F_xx_xx + mov r9,r9,ror#31 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r5,r10,ror#2 @ F_xx_xx + and r11,r6,r7 @ F_xx_xx + add r3,r3,r9 @ E+=X[i] + add r3,r3,r10 @ E+=F_40_59(B,C,D) + add r3,r3,r11,ror#2 +#if defined(__thumb2__) + mov r12,sp + teq r14,r12 +#else + teq r14,sp +#endif + bne L_40_59 @ [+((12+5)*5+2)*4] + + ldr r8,LK_60_79 + sub sp,sp,#20*4 + cmp sp,#0 @ set carry to denote 60_79 + b L_20_39_or_60_79 @ [+4], spare 300 bytes +L_done: + add sp,sp,#80*4 @ "deallocate" stack frame + ldmia r0,{r8,r9,r10,r11,r12} + add r3,r8,r3 + add r4,r9,r4 + add r5,r10,r5,ror#2 + add r6,r11,r6,ror#2 + add r7,r12,r7,ror#2 + stmia r0,{r3,r4,r5,r6,r7} + teq r1,r2 + bne Lloop @ [+18], total 1307 + +#if __ARM_ARCH__>=5 + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} +#else + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif + + +.align 5 +LK_00_19:.word 0x5a827999 +LK_20_39:.word 0x6ed9eba1 +LK_40_59:.word 0x8f1bbcdc +LK_60_79:.word 0xca62c1d6 +#if __ARM_MAX_ARCH__>=7 +LOPENSSL_armcap: +.word OPENSSL_armcap_P-Lsha1_block +#endif +.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 5 +#if __ARM_MAX_ARCH__>=7 + + + +#ifdef __thumb2__ +.thumb_func sha1_block_data_order_neon +#endif +.align 4 +sha1_block_data_order_neon: +LNEON: + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 + @ dmb @ errata #451034 on early Cortex A8 + @ vstmdb sp!,{d8-d15} @ ABI specification says so + mov r14,sp + sub r12,sp,#64 + adr r8,LK_00_19 + bic r12,r12,#15 @ align for 128-bit stores + + ldmia r0,{r3,r4,r5,r6,r7} @ load context + mov sp,r12 @ alloca + + vld1.8 {q0,q1},[r1]! @ handles unaligned + veor q15,q15,q15 + vld1.8 {q2,q3},[r1]! + vld1.32 {d28[],d29[]},[r8,:32]! @ load K_00_19 + vrev32.8 q0,q0 @ yes, even on + vrev32.8 q1,q1 @ big-endian... + vrev32.8 q2,q2 + vadd.i32 q8,q0,q14 + vrev32.8 q3,q3 + vadd.i32 q9,q1,q14 + vst1.32 {q8},[r12,:128]! + vadd.i32 q10,q2,q14 + vst1.32 {q9},[r12,:128]! + vst1.32 {q10},[r12,:128]! + ldr r9,[sp] @ big RAW stall + +Loop_neon: + vext.8 q8,q0,q1,#8 + bic r10,r6,r4 + add r7,r7,r9 + and r11,r5,r4 + vadd.i32 q13,q3,q14 + ldr r9,[sp,#4] + add r7,r7,r3,ror#27 + vext.8 q12,q3,q15,#4 + eor r11,r11,r10 + mov r4,r4,ror#2 + add r7,r7,r11 + veor q8,q8,q0 + bic r10,r5,r3 + add r6,r6,r9 + veor q12,q12,q2 + and r11,r4,r3 + ldr r9,[sp,#8] + veor q12,q12,q8 + add r6,r6,r7,ror#27 + eor r11,r11,r10 + vst1.32 {q13},[r12,:128]! 
+ sub r12,r12,#64 + mov r3,r3,ror#2 + add r6,r6,r11 + vext.8 q13,q15,q12,#4 + bic r10,r4,r7 + add r5,r5,r9 + vadd.i32 q8,q12,q12 + and r11,r3,r7 + ldr r9,[sp,#12] + vsri.32 q8,q12,#31 + add r5,r5,r6,ror#27 + eor r11,r11,r10 + mov r7,r7,ror#2 + vshr.u32 q12,q13,#30 + add r5,r5,r11 + bic r10,r3,r6 + vshl.u32 q13,q13,#2 + add r4,r4,r9 + and r11,r7,r6 + veor q8,q8,q12 + ldr r9,[sp,#16] + add r4,r4,r5,ror#27 + veor q8,q8,q13 + eor r11,r11,r10 + mov r6,r6,ror#2 + add r4,r4,r11 + vext.8 q9,q1,q2,#8 + bic r10,r7,r5 + add r3,r3,r9 + and r11,r6,r5 + vadd.i32 q13,q8,q14 + ldr r9,[sp,#20] + vld1.32 {d28[],d29[]},[r8,:32]! + add r3,r3,r4,ror#27 + vext.8 q12,q8,q15,#4 + eor r11,r11,r10 + mov r5,r5,ror#2 + add r3,r3,r11 + veor q9,q9,q1 + bic r10,r6,r4 + add r7,r7,r9 + veor q12,q12,q3 + and r11,r5,r4 + ldr r9,[sp,#24] + veor q12,q12,q9 + add r7,r7,r3,ror#27 + eor r11,r11,r10 + vst1.32 {q13},[r12,:128]! + mov r4,r4,ror#2 + add r7,r7,r11 + vext.8 q13,q15,q12,#4 + bic r10,r5,r3 + add r6,r6,r9 + vadd.i32 q9,q12,q12 + and r11,r4,r3 + ldr r9,[sp,#28] + vsri.32 q9,q12,#31 + add r6,r6,r7,ror#27 + eor r11,r11,r10 + mov r3,r3,ror#2 + vshr.u32 q12,q13,#30 + add r6,r6,r11 + bic r10,r4,r7 + vshl.u32 q13,q13,#2 + add r5,r5,r9 + and r11,r3,r7 + veor q9,q9,q12 + ldr r9,[sp,#32] + add r5,r5,r6,ror#27 + veor q9,q9,q13 + eor r11,r11,r10 + mov r7,r7,ror#2 + add r5,r5,r11 + vext.8 q10,q2,q3,#8 + bic r10,r3,r6 + add r4,r4,r9 + and r11,r7,r6 + vadd.i32 q13,q9,q14 + ldr r9,[sp,#36] + add r4,r4,r5,ror#27 + vext.8 q12,q9,q15,#4 + eor r11,r11,r10 + mov r6,r6,ror#2 + add r4,r4,r11 + veor q10,q10,q2 + bic r10,r7,r5 + add r3,r3,r9 + veor q12,q12,q8 + and r11,r6,r5 + ldr r9,[sp,#40] + veor q12,q12,q10 + add r3,r3,r4,ror#27 + eor r11,r11,r10 + vst1.32 {q13},[r12,:128]! + mov r5,r5,ror#2 + add r3,r3,r11 + vext.8 q13,q15,q12,#4 + bic r10,r6,r4 + add r7,r7,r9 + vadd.i32 q10,q12,q12 + and r11,r5,r4 + ldr r9,[sp,#44] + vsri.32 q10,q12,#31 + add r7,r7,r3,ror#27 + eor r11,r11,r10 + mov r4,r4,ror#2 + vshr.u32 q12,q13,#30 + add r7,r7,r11 + bic r10,r5,r3 + vshl.u32 q13,q13,#2 + add r6,r6,r9 + and r11,r4,r3 + veor q10,q10,q12 + ldr r9,[sp,#48] + add r6,r6,r7,ror#27 + veor q10,q10,q13 + eor r11,r11,r10 + mov r3,r3,ror#2 + add r6,r6,r11 + vext.8 q11,q3,q8,#8 + bic r10,r4,r7 + add r5,r5,r9 + and r11,r3,r7 + vadd.i32 q13,q10,q14 + ldr r9,[sp,#52] + add r5,r5,r6,ror#27 + vext.8 q12,q10,q15,#4 + eor r11,r11,r10 + mov r7,r7,ror#2 + add r5,r5,r11 + veor q11,q11,q3 + bic r10,r3,r6 + add r4,r4,r9 + veor q12,q12,q9 + and r11,r7,r6 + ldr r9,[sp,#56] + veor q12,q12,q11 + add r4,r4,r5,ror#27 + eor r11,r11,r10 + vst1.32 {q13},[r12,:128]! + mov r6,r6,ror#2 + add r4,r4,r11 + vext.8 q13,q15,q12,#4 + bic r10,r7,r5 + add r3,r3,r9 + vadd.i32 q11,q12,q12 + and r11,r6,r5 + ldr r9,[sp,#60] + vsri.32 q11,q12,#31 + add r3,r3,r4,ror#27 + eor r11,r11,r10 + mov r5,r5,ror#2 + vshr.u32 q12,q13,#30 + add r3,r3,r11 + bic r10,r6,r4 + vshl.u32 q13,q13,#2 + add r7,r7,r9 + and r11,r5,r4 + veor q11,q11,q12 + ldr r9,[sp,#0] + add r7,r7,r3,ror#27 + veor q11,q11,q13 + eor r11,r11,r10 + mov r4,r4,ror#2 + add r7,r7,r11 + vext.8 q12,q10,q11,#8 + bic r10,r5,r3 + add r6,r6,r9 + and r11,r4,r3 + veor q0,q0,q8 + ldr r9,[sp,#4] + add r6,r6,r7,ror#27 + veor q0,q0,q1 + eor r11,r11,r10 + mov r3,r3,ror#2 + vadd.i32 q13,q11,q14 + add r6,r6,r11 + bic r10,r4,r7 + veor q12,q12,q0 + add r5,r5,r9 + and r11,r3,r7 + vshr.u32 q0,q12,#30 + ldr r9,[sp,#8] + add r5,r5,r6,ror#27 + vst1.32 {q13},[r12,:128]! 
+ sub r12,r12,#64 + eor r11,r11,r10 + mov r7,r7,ror#2 + vsli.32 q0,q12,#2 + add r5,r5,r11 + bic r10,r3,r6 + add r4,r4,r9 + and r11,r7,r6 + ldr r9,[sp,#12] + add r4,r4,r5,ror#27 + eor r11,r11,r10 + mov r6,r6,ror#2 + add r4,r4,r11 + bic r10,r7,r5 + add r3,r3,r9 + and r11,r6,r5 + ldr r9,[sp,#16] + add r3,r3,r4,ror#27 + eor r11,r11,r10 + mov r5,r5,ror#2 + add r3,r3,r11 + vext.8 q12,q11,q0,#8 + eor r10,r4,r6 + add r7,r7,r9 + ldr r9,[sp,#20] + veor q1,q1,q9 + eor r11,r10,r5 + add r7,r7,r3,ror#27 + veor q1,q1,q2 + mov r4,r4,ror#2 + add r7,r7,r11 + vadd.i32 q13,q0,q14 + eor r10,r3,r5 + add r6,r6,r9 + veor q12,q12,q1 + ldr r9,[sp,#24] + eor r11,r10,r4 + vshr.u32 q1,q12,#30 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + vst1.32 {q13},[r12,:128]! + add r6,r6,r11 + eor r10,r7,r4 + vsli.32 q1,q12,#2 + add r5,r5,r9 + ldr r9,[sp,#28] + eor r11,r10,r3 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + eor r10,r6,r3 + add r4,r4,r9 + ldr r9,[sp,#32] + eor r11,r10,r7 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + vext.8 q12,q0,q1,#8 + eor r10,r5,r7 + add r3,r3,r9 + ldr r9,[sp,#36] + veor q2,q2,q10 + eor r11,r10,r6 + add r3,r3,r4,ror#27 + veor q2,q2,q3 + mov r5,r5,ror#2 + add r3,r3,r11 + vadd.i32 q13,q1,q14 + eor r10,r4,r6 + vld1.32 {d28[],d29[]},[r8,:32]! + add r7,r7,r9 + veor q12,q12,q2 + ldr r9,[sp,#40] + eor r11,r10,r5 + vshr.u32 q2,q12,#30 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + vst1.32 {q13},[r12,:128]! + add r7,r7,r11 + eor r10,r3,r5 + vsli.32 q2,q12,#2 + add r6,r6,r9 + ldr r9,[sp,#44] + eor r11,r10,r4 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + add r6,r6,r11 + eor r10,r7,r4 + add r5,r5,r9 + ldr r9,[sp,#48] + eor r11,r10,r3 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + vext.8 q12,q1,q2,#8 + eor r10,r6,r3 + add r4,r4,r9 + ldr r9,[sp,#52] + veor q3,q3,q11 + eor r11,r10,r7 + add r4,r4,r5,ror#27 + veor q3,q3,q8 + mov r6,r6,ror#2 + add r4,r4,r11 + vadd.i32 q13,q2,q14 + eor r10,r5,r7 + add r3,r3,r9 + veor q12,q12,q3 + ldr r9,[sp,#56] + eor r11,r10,r6 + vshr.u32 q3,q12,#30 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + vst1.32 {q13},[r12,:128]! + add r3,r3,r11 + eor r10,r4,r6 + vsli.32 q3,q12,#2 + add r7,r7,r9 + ldr r9,[sp,#60] + eor r11,r10,r5 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + add r7,r7,r11 + eor r10,r3,r5 + add r6,r6,r9 + ldr r9,[sp,#0] + eor r11,r10,r4 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + add r6,r6,r11 + vext.8 q12,q2,q3,#8 + eor r10,r7,r4 + add r5,r5,r9 + ldr r9,[sp,#4] + veor q8,q8,q0 + eor r11,r10,r3 + add r5,r5,r6,ror#27 + veor q8,q8,q9 + mov r7,r7,ror#2 + add r5,r5,r11 + vadd.i32 q13,q3,q14 + eor r10,r6,r3 + add r4,r4,r9 + veor q12,q12,q8 + ldr r9,[sp,#8] + eor r11,r10,r7 + vshr.u32 q8,q12,#30 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + vst1.32 {q13},[r12,:128]! + sub r12,r12,#64 + add r4,r4,r11 + eor r10,r5,r7 + vsli.32 q8,q12,#2 + add r3,r3,r9 + ldr r9,[sp,#12] + eor r11,r10,r6 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + add r3,r3,r11 + eor r10,r4,r6 + add r7,r7,r9 + ldr r9,[sp,#16] + eor r11,r10,r5 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + add r7,r7,r11 + vext.8 q12,q3,q8,#8 + eor r10,r3,r5 + add r6,r6,r9 + ldr r9,[sp,#20] + veor q9,q9,q1 + eor r11,r10,r4 + add r6,r6,r7,ror#27 + veor q9,q9,q10 + mov r3,r3,ror#2 + add r6,r6,r11 + vadd.i32 q13,q8,q14 + eor r10,r7,r4 + add r5,r5,r9 + veor q12,q12,q9 + ldr r9,[sp,#24] + eor r11,r10,r3 + vshr.u32 q9,q12,#30 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + vst1.32 {q13},[r12,:128]! 
+ add r5,r5,r11 + eor r10,r6,r3 + vsli.32 q9,q12,#2 + add r4,r4,r9 + ldr r9,[sp,#28] + eor r11,r10,r7 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + eor r10,r5,r7 + add r3,r3,r9 + ldr r9,[sp,#32] + eor r11,r10,r6 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + add r3,r3,r11 + vext.8 q12,q8,q9,#8 + add r7,r7,r9 + and r10,r5,r6 + ldr r9,[sp,#36] + veor q10,q10,q2 + add r7,r7,r3,ror#27 + eor r11,r5,r6 + veor q10,q10,q11 + add r7,r7,r10 + and r11,r11,r4 + vadd.i32 q13,q9,q14 + mov r4,r4,ror#2 + add r7,r7,r11 + veor q12,q12,q10 + add r6,r6,r9 + and r10,r4,r5 + vshr.u32 q10,q12,#30 + ldr r9,[sp,#40] + add r6,r6,r7,ror#27 + vst1.32 {q13},[r12,:128]! + eor r11,r4,r5 + add r6,r6,r10 + vsli.32 q10,q12,#2 + and r11,r11,r3 + mov r3,r3,ror#2 + add r6,r6,r11 + add r5,r5,r9 + and r10,r3,r4 + ldr r9,[sp,#44] + add r5,r5,r6,ror#27 + eor r11,r3,r4 + add r5,r5,r10 + and r11,r11,r7 + mov r7,r7,ror#2 + add r5,r5,r11 + add r4,r4,r9 + and r10,r7,r3 + ldr r9,[sp,#48] + add r4,r4,r5,ror#27 + eor r11,r7,r3 + add r4,r4,r10 + and r11,r11,r6 + mov r6,r6,ror#2 + add r4,r4,r11 + vext.8 q12,q9,q10,#8 + add r3,r3,r9 + and r10,r6,r7 + ldr r9,[sp,#52] + veor q11,q11,q3 + add r3,r3,r4,ror#27 + eor r11,r6,r7 + veor q11,q11,q0 + add r3,r3,r10 + and r11,r11,r5 + vadd.i32 q13,q10,q14 + mov r5,r5,ror#2 + vld1.32 {d28[],d29[]},[r8,:32]! + add r3,r3,r11 + veor q12,q12,q11 + add r7,r7,r9 + and r10,r5,r6 + vshr.u32 q11,q12,#30 + ldr r9,[sp,#56] + add r7,r7,r3,ror#27 + vst1.32 {q13},[r12,:128]! + eor r11,r5,r6 + add r7,r7,r10 + vsli.32 q11,q12,#2 + and r11,r11,r4 + mov r4,r4,ror#2 + add r7,r7,r11 + add r6,r6,r9 + and r10,r4,r5 + ldr r9,[sp,#60] + add r6,r6,r7,ror#27 + eor r11,r4,r5 + add r6,r6,r10 + and r11,r11,r3 + mov r3,r3,ror#2 + add r6,r6,r11 + add r5,r5,r9 + and r10,r3,r4 + ldr r9,[sp,#0] + add r5,r5,r6,ror#27 + eor r11,r3,r4 + add r5,r5,r10 + and r11,r11,r7 + mov r7,r7,ror#2 + add r5,r5,r11 + vext.8 q12,q10,q11,#8 + add r4,r4,r9 + and r10,r7,r3 + ldr r9,[sp,#4] + veor q0,q0,q8 + add r4,r4,r5,ror#27 + eor r11,r7,r3 + veor q0,q0,q1 + add r4,r4,r10 + and r11,r11,r6 + vadd.i32 q13,q11,q14 + mov r6,r6,ror#2 + add r4,r4,r11 + veor q12,q12,q0 + add r3,r3,r9 + and r10,r6,r7 + vshr.u32 q0,q12,#30 + ldr r9,[sp,#8] + add r3,r3,r4,ror#27 + vst1.32 {q13},[r12,:128]! + sub r12,r12,#64 + eor r11,r6,r7 + add r3,r3,r10 + vsli.32 q0,q12,#2 + and r11,r11,r5 + mov r5,r5,ror#2 + add r3,r3,r11 + add r7,r7,r9 + and r10,r5,r6 + ldr r9,[sp,#12] + add r7,r7,r3,ror#27 + eor r11,r5,r6 + add r7,r7,r10 + and r11,r11,r4 + mov r4,r4,ror#2 + add r7,r7,r11 + add r6,r6,r9 + and r10,r4,r5 + ldr r9,[sp,#16] + add r6,r6,r7,ror#27 + eor r11,r4,r5 + add r6,r6,r10 + and r11,r11,r3 + mov r3,r3,ror#2 + add r6,r6,r11 + vext.8 q12,q11,q0,#8 + add r5,r5,r9 + and r10,r3,r4 + ldr r9,[sp,#20] + veor q1,q1,q9 + add r5,r5,r6,ror#27 + eor r11,r3,r4 + veor q1,q1,q2 + add r5,r5,r10 + and r11,r11,r7 + vadd.i32 q13,q0,q14 + mov r7,r7,ror#2 + add r5,r5,r11 + veor q12,q12,q1 + add r4,r4,r9 + and r10,r7,r3 + vshr.u32 q1,q12,#30 + ldr r9,[sp,#24] + add r4,r4,r5,ror#27 + vst1.32 {q13},[r12,:128]! 
+ eor r11,r7,r3 + add r4,r4,r10 + vsli.32 q1,q12,#2 + and r11,r11,r6 + mov r6,r6,ror#2 + add r4,r4,r11 + add r3,r3,r9 + and r10,r6,r7 + ldr r9,[sp,#28] + add r3,r3,r4,ror#27 + eor r11,r6,r7 + add r3,r3,r10 + and r11,r11,r5 + mov r5,r5,ror#2 + add r3,r3,r11 + add r7,r7,r9 + and r10,r5,r6 + ldr r9,[sp,#32] + add r7,r7,r3,ror#27 + eor r11,r5,r6 + add r7,r7,r10 + and r11,r11,r4 + mov r4,r4,ror#2 + add r7,r7,r11 + vext.8 q12,q0,q1,#8 + add r6,r6,r9 + and r10,r4,r5 + ldr r9,[sp,#36] + veor q2,q2,q10 + add r6,r6,r7,ror#27 + eor r11,r4,r5 + veor q2,q2,q3 + add r6,r6,r10 + and r11,r11,r3 + vadd.i32 q13,q1,q14 + mov r3,r3,ror#2 + add r6,r6,r11 + veor q12,q12,q2 + add r5,r5,r9 + and r10,r3,r4 + vshr.u32 q2,q12,#30 + ldr r9,[sp,#40] + add r5,r5,r6,ror#27 + vst1.32 {q13},[r12,:128]! + eor r11,r3,r4 + add r5,r5,r10 + vsli.32 q2,q12,#2 + and r11,r11,r7 + mov r7,r7,ror#2 + add r5,r5,r11 + add r4,r4,r9 + and r10,r7,r3 + ldr r9,[sp,#44] + add r4,r4,r5,ror#27 + eor r11,r7,r3 + add r4,r4,r10 + and r11,r11,r6 + mov r6,r6,ror#2 + add r4,r4,r11 + add r3,r3,r9 + and r10,r6,r7 + ldr r9,[sp,#48] + add r3,r3,r4,ror#27 + eor r11,r6,r7 + add r3,r3,r10 + and r11,r11,r5 + mov r5,r5,ror#2 + add r3,r3,r11 + vext.8 q12,q1,q2,#8 + eor r10,r4,r6 + add r7,r7,r9 + ldr r9,[sp,#52] + veor q3,q3,q11 + eor r11,r10,r5 + add r7,r7,r3,ror#27 + veor q3,q3,q8 + mov r4,r4,ror#2 + add r7,r7,r11 + vadd.i32 q13,q2,q14 + eor r10,r3,r5 + add r6,r6,r9 + veor q12,q12,q3 + ldr r9,[sp,#56] + eor r11,r10,r4 + vshr.u32 q3,q12,#30 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + vst1.32 {q13},[r12,:128]! + add r6,r6,r11 + eor r10,r7,r4 + vsli.32 q3,q12,#2 + add r5,r5,r9 + ldr r9,[sp,#60] + eor r11,r10,r3 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + eor r10,r6,r3 + add r4,r4,r9 + ldr r9,[sp,#0] + eor r11,r10,r7 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + vadd.i32 q13,q3,q14 + eor r10,r5,r7 + add r3,r3,r9 + vst1.32 {q13},[r12,:128]! + sub r12,r12,#64 + teq r1,r2 + sub r8,r8,#16 + it eq + subeq r1,r1,#64 + vld1.8 {q0,q1},[r1]! + ldr r9,[sp,#4] + eor r11,r10,r6 + vld1.8 {q2,q3},[r1]! + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + vld1.32 {d28[],d29[]},[r8,:32]! + add r3,r3,r11 + eor r10,r4,r6 + vrev32.8 q0,q0 + add r7,r7,r9 + ldr r9,[sp,#8] + eor r11,r10,r5 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + add r7,r7,r11 + eor r10,r3,r5 + add r6,r6,r9 + ldr r9,[sp,#12] + eor r11,r10,r4 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + add r6,r6,r11 + eor r10,r7,r4 + add r5,r5,r9 + ldr r9,[sp,#16] + eor r11,r10,r3 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + vrev32.8 q1,q1 + eor r10,r6,r3 + add r4,r4,r9 + vadd.i32 q8,q0,q14 + ldr r9,[sp,#20] + eor r11,r10,r7 + vst1.32 {q8},[r12,:128]! + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + eor r10,r5,r7 + add r3,r3,r9 + ldr r9,[sp,#24] + eor r11,r10,r6 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + add r3,r3,r11 + eor r10,r4,r6 + add r7,r7,r9 + ldr r9,[sp,#28] + eor r11,r10,r5 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + add r7,r7,r11 + eor r10,r3,r5 + add r6,r6,r9 + ldr r9,[sp,#32] + eor r11,r10,r4 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + add r6,r6,r11 + vrev32.8 q2,q2 + eor r10,r7,r4 + add r5,r5,r9 + vadd.i32 q9,q1,q14 + ldr r9,[sp,#36] + eor r11,r10,r3 + vst1.32 {q9},[r12,:128]! 
+ add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + eor r10,r6,r3 + add r4,r4,r9 + ldr r9,[sp,#40] + eor r11,r10,r7 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + eor r10,r5,r7 + add r3,r3,r9 + ldr r9,[sp,#44] + eor r11,r10,r6 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + add r3,r3,r11 + eor r10,r4,r6 + add r7,r7,r9 + ldr r9,[sp,#48] + eor r11,r10,r5 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + add r7,r7,r11 + vrev32.8 q3,q3 + eor r10,r3,r5 + add r6,r6,r9 + vadd.i32 q10,q2,q14 + ldr r9,[sp,#52] + eor r11,r10,r4 + vst1.32 {q10},[r12,:128]! + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + add r6,r6,r11 + eor r10,r7,r4 + add r5,r5,r9 + ldr r9,[sp,#56] + eor r11,r10,r3 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + eor r10,r6,r3 + add r4,r4,r9 + ldr r9,[sp,#60] + eor r11,r10,r7 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + eor r10,r5,r7 + add r3,r3,r9 + eor r11,r10,r6 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + add r3,r3,r11 + ldmia r0,{r9,r10,r11,r12} @ accumulate context + add r3,r3,r9 + ldr r9,[r0,#16] + add r4,r4,r10 + add r5,r5,r11 + add r6,r6,r12 + it eq + moveq sp,r14 + add r7,r7,r9 + it ne + ldrne r9,[sp] + stmia r0,{r3,r4,r5,r6,r7} + itt ne + addne r12,sp,#3*16 + bne Loop_neon + + @ vldmia sp!,{d8-d15} + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} + +#endif +#if __ARM_MAX_ARCH__>=7 + +# if defined(__thumb2__) +# define INST(a,b,c,d) .byte c,d|0xf,a,b +# else +# define INST(a,b,c,d) .byte a,b,c,d|0x10 +# endif + +#ifdef __thumb2__ +.thumb_func sha1_block_data_order_armv8 +#endif +.align 5 +sha1_block_data_order_armv8: +LARMv8: + vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so + + veor q1,q1,q1 + adr r3,LK_00_19 + vld1.32 {q0},[r0]! + vld1.32 {d2[0]},[r0] + sub r0,r0,#16 + vld1.32 {d16[],d17[]},[r3,:32]! + vld1.32 {d18[],d19[]},[r3,:32]! + vld1.32 {d20[],d21[]},[r3,:32]! + vld1.32 {d22[],d23[]},[r3,:32] + +Loop_v8: + vld1.8 {q4,q5},[r1]! + vld1.8 {q6,q7},[r1]! 
+ vrev32.8 q4,q4 + vrev32.8 q5,q5 + + vadd.i32 q12,q8,q4 + vrev32.8 q6,q6 + vmov q14,q0 @ offload + subs r2,r2,#1 + + vadd.i32 q13,q8,q5 + vrev32.8 q7,q7 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 0 + INST(0x68,0x0c,0x02,0xe2) @ sha1c q0,q1,q12 + vadd.i32 q12,q8,q6 + INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 1 + INST(0x6a,0x0c,0x06,0xe2) @ sha1c q0,q3,q13 + vadd.i32 q13,q8,q7 + INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 + INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 2 + INST(0x68,0x0c,0x04,0xe2) @ sha1c q0,q2,q12 + vadd.i32 q12,q8,q4 + INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 + INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 3 + INST(0x6a,0x0c,0x06,0xe2) @ sha1c q0,q3,q13 + vadd.i32 q13,q9,q5 + INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 + INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 4 + INST(0x68,0x0c,0x04,0xe2) @ sha1c q0,q2,q12 + vadd.i32 q12,q9,q6 + INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 + INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 5 + INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 + vadd.i32 q13,q9,q7 + INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 + INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 6 + INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 + vadd.i32 q12,q9,q4 + INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 + INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 7 + INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 + vadd.i32 q13,q9,q5 + INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 + INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 8 + INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 + vadd.i32 q12,q10,q6 + INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 + INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 9 + INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 + vadd.i32 q13,q10,q7 + INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 + INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 10 + INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12 + vadd.i32 q12,q10,q4 + INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 + INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 11 + INST(0x6a,0x0c,0x26,0xe2) @ sha1m q0,q3,q13 + vadd.i32 q13,q10,q5 + INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 + INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 12 + INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12 + vadd.i32 q12,q10,q6 + INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 + INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 13 + INST(0x6a,0x0c,0x26,0xe2) @ sha1m q0,q3,q13 + vadd.i32 q13,q11,q7 + INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 + INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 14 + INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12 + vadd.i32 q12,q11,q4 + INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 + INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 15 + INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 + vadd.i32 q13,q11,q5 + INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 + INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 16 + INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 + vadd.i32 q12,q11,q6 + INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 + 
INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 17 + INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 + vadd.i32 q13,q11,q7 + + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 18 + INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 + + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 19 + INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 + + vadd.i32 q1,q1,q2 + vadd.i32 q0,q0,q14 + bne Loop_v8 + + vst1.32 {q0},[r0]! + vst1.32 {d2[0]},[r0] + + vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} + bx lr @ bx lr + +#endif +#if __ARM_MAX_ARCH__>=7 +.comm _OPENSSL_armcap_P,4 +.non_lazy_symbol_pointer +OPENSSL_armcap_P: +.indirect_symbol _OPENSSL_armcap_P +.long 0 +.private_extern _OPENSSL_armcap_P +#endif +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/sha256-armv4.S b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/sha256-armv4.S new file mode 100644 index 0000000000..0cf36482d4 --- /dev/null +++ b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/sha256-armv4.S @@ -0,0 +1,2846 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include <boringssl_prefix_symbols_asm.h> +#endif +@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +@ +@ Licensed under the OpenSSL license (the "License"). You may not use +@ this file except in compliance with the License. You can obtain a copy +@ in the file LICENSE in the source distribution or at +@ https://www.openssl.org/source/license.html + + +@ ==================================================================== +@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +@ project. The module is, however, dual licensed under OpenSSL and +@ CRYPTOGAMS licenses depending on where you obtain it. For further +@ details see http://www.openssl.org/~appro/cryptogams/. +@ +@ Permission to use under GPL terms is granted. +@ ==================================================================== + +@ SHA256 block procedure for ARMv4. May 2007. + +@ Performance is ~2x better than gcc 3.4 generated code and in "abso- +@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per +@ byte [on single-issue Xscale PXA250 core]. + +@ July 2010. +@ +@ Rescheduling for dual-issue pipeline resulted in 22% improvement on +@ Cortex A8 core and ~20 cycles per processed byte. + +@ February 2011. +@ +@ Profiler-assisted and platform-specific optimization resulted in 16% +@ improvement on Cortex A8 core and ~15.4 cycles per processed byte. + +@ September 2013. +@ +@ Add NEON implementation. On Cortex A8 it was measured to process one +@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon +@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only +@ code (meaning that latter performs sub-optimally, nothing was done +@ about it). + +@ May 2014. +@ +@ Add ARMv8 code path performing at 2.0 cpb on Apple A7. + +#ifndef __KERNEL__ +# include <openssl/arm_arch.h> +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 +#endif + +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those +@ instructions are manually-encoded. (See unsha256.)
+ + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + + +.align 5 +K256: +.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.word 0 @ terminator +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +LOPENSSL_armcap: +.word OPENSSL_armcap_P-Lsha256_block_data_order +#endif +.align 5 + +.globl _sha256_block_data_order +.private_extern _sha256_block_data_order +#ifdef __thumb2__ +.thumb_func _sha256_block_data_order +#endif +_sha256_block_data_order: +Lsha256_block_data_order: +#if __ARM_ARCH__<7 && !defined(__thumb2__) + sub r3,pc,#8 @ _sha256_block_data_order +#else + adr r3,Lsha256_block_data_order +#endif +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + ldr r12,LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P +#ifdef __APPLE__ + ldr r12,[r12] +#endif + tst r12,#ARMV8_SHA256 + bne LARMv8 + tst r12,#ARMV7_NEON + bne LNEON +#endif + add r2,r1,r2,lsl#6 @ len to point at the end of inp + stmdb sp!,{r0,r1,r2,r4-r11,lr} + ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11} + sub r14,r3,#256+32 @ K256 + sub sp,sp,#16*4 @ alloca(X[16]) +Loop: +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ magic + eor r12,r12,r12 +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 0 +# if 0==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r8,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 0 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 0==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r8,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#0*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 0==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 0<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#2*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#15*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 1 +# if 1==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r7,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 1 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 1==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r7,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#1*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 1==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 1<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#3*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#0*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 2 +# if 2==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r6,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 2 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 2==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r6,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#2*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 2==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 2<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#4*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#1*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 3 +# if 3==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r5,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 3 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 3==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r5,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#3*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 3==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 3<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#5*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#2*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 4 +# if 4==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r4,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 4 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 4==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r4,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#4*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 4==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 4<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#6*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#3*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 5 +# if 5==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r11,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 5 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 5==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r11,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#5*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 5==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 5<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#7*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#4*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 6 +# if 6==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r10,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 6 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 6==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r10,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#6*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 6==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 6<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#8*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#5*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 7 +# if 7==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r9,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 7 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 7==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r9,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#7*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 7==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 7<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#9*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#6*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 8 +# if 8==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r8,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 8 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 8==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r8,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#8*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 8==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 8<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#10*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#7*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 9 +# if 9==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r7,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 9 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 9==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r7,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#9*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 9==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 9<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#11*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#8*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 10 +# if 10==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r6,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 10 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 10==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r6,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#10*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 10==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 10<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#12*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#9*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 11 +# if 11==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r5,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 11 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 11==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r5,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#11*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 11==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 11<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#13*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#10*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 12 +# if 12==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r4,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 12 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 12==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r4,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#12*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 12==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 12<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#14*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#11*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 13 +# if 13==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r11,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 13 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 13==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r11,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#13*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 13==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 13<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#15*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#12*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 14 +# if 14==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r10,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 14 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 14==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r10,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#14*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 14==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 14<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#0*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#13*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 15 +# if 15==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r9,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 15 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 15==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r9,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#15*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 15==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 15<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#1*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#14*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) +Lrounds_16_xx: + @ ldr r2,[sp,#1*4] @ 16 + @ ldr r1,[sp,#14*4] + mov r0,r2,ror#7 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#0*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#9*4] + + add r12,r12,r0 + eor r0,r8,r8,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r8,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#0*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 16==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 16<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#2*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#15*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#2*4] @ 17 + @ ldr r1,[sp,#15*4] + mov r0,r2,ror#7 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#1*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#10*4] + + add r3,r3,r0 + eor r0,r7,r7,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r7,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#1*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 17==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 17<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#3*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#0*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#3*4] @ 18 + @ ldr r1,[sp,#0*4] + mov r0,r2,ror#7 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#2*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#11*4] + + add r12,r12,r0 + eor r0,r6,r6,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r6,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#2*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 18==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 18<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#4*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#1*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#4*4] @ 19 + @ ldr r1,[sp,#1*4] + mov r0,r2,ror#7 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#3*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#12*4] + + add r3,r3,r0 + eor r0,r5,r5,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r5,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#3*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 19==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 19<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#5*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#2*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#5*4] @ 20 + @ ldr r1,[sp,#2*4] + mov r0,r2,ror#7 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#4*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#13*4] + + add r12,r12,r0 + eor r0,r4,r4,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r4,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#4*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 20==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 20<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#6*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#3*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#6*4] @ 21 + @ ldr r1,[sp,#3*4] + mov r0,r2,ror#7 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#5*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#14*4] + + add r3,r3,r0 + eor r0,r11,r11,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r11,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#5*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 21==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 21<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#7*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#4*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#7*4] @ 22 + @ ldr r1,[sp,#4*4] + mov r0,r2,ror#7 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#6*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#15*4] + + add r12,r12,r0 + eor r0,r10,r10,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r10,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#6*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 22==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 22<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#8*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#5*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#8*4] @ 23 + @ ldr r1,[sp,#5*4] + mov r0,r2,ror#7 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#7*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#0*4] + + add r3,r3,r0 + eor r0,r9,r9,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r9,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#7*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 23==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 23<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#9*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#6*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#9*4] @ 24 + @ ldr r1,[sp,#6*4] + mov r0,r2,ror#7 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#8*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#1*4] + + add r12,r12,r0 + eor r0,r8,r8,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r8,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#8*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 24==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 24<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#10*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#7*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#10*4] @ 25 + @ ldr r1,[sp,#7*4] + mov r0,r2,ror#7 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#9*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#2*4] + + add r3,r3,r0 + eor r0,r7,r7,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r7,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#9*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 25==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 25<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#11*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#8*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#11*4] @ 26 + @ ldr r1,[sp,#8*4] + mov r0,r2,ror#7 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#10*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#3*4] + + add r12,r12,r0 + eor r0,r6,r6,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r6,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#10*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 26==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 26<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#12*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#9*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#12*4] @ 27 + @ ldr r1,[sp,#9*4] + mov r0,r2,ror#7 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#11*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#4*4] + + add r3,r3,r0 + eor r0,r5,r5,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r5,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#11*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 27==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 27<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#13*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#10*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#13*4] @ 28 + @ ldr r1,[sp,#10*4] + mov r0,r2,ror#7 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#12*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#5*4] + + add r12,r12,r0 + eor r0,r4,r4,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r4,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#12*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 28==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 28<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#14*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#11*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#14*4] @ 29 + @ ldr r1,[sp,#11*4] + mov r0,r2,ror#7 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#13*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#6*4] + + add r3,r3,r0 + eor r0,r11,r11,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r11,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#13*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 29==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 29<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#15*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#12*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#15*4] @ 30 + @ ldr r1,[sp,#12*4] + mov r0,r2,ror#7 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#14*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#7*4] + + add r12,r12,r0 + eor r0,r10,r10,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r10,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#14*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 30==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 30<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#0*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#13*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#0*4] @ 31 + @ ldr r1,[sp,#13*4] + mov r0,r2,ror#7 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#15*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#8*4] + + add r3,r3,r0 + eor r0,r9,r9,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r9,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#15*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 31==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 31<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#1*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#14*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + ite eq @ Thumb2 thing, sanity check in ARM +#endif + ldreq r3,[sp,#16*4] @ pull ctx + bne Lrounds_16_xx + + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldr r0,[r3,#0] + ldr r2,[r3,#4] + ldr r12,[r3,#8] + add r4,r4,r0 + ldr r0,[r3,#12] + add r5,r5,r2 + ldr r2,[r3,#16] + add r6,r6,r12 + ldr r12,[r3,#20] + add r7,r7,r0 + ldr r0,[r3,#24] + add r8,r8,r2 + ldr r2,[r3,#28] + add r9,r9,r12 + ldr r1,[sp,#17*4] @ pull inp + ldr r12,[sp,#18*4] @ pull inp+len + add r10,r10,r0 + add r11,r11,r2 + stmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} + cmp r1,r12 + sub r14,r14,#256 @ rewind Ktbl + bne Loop + + add sp,sp,#19*4 @ destroy frame +#if __ARM_ARCH__>=5 + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} +#else + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif + +#if __ARM_MAX_ARCH__>=7 + + + +.globl _sha256_block_data_order_neon +.private_extern _sha256_block_data_order_neon +#ifdef __thumb2__ +.thumb_func _sha256_block_data_order_neon +#endif +.align 5 +.skip 16 +_sha256_block_data_order_neon: +LNEON: + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + + sub r11,sp,#16*4+16 + adr r14,K256 + bic r11,r11,#15 @ align for 128-bit stores + mov r12,sp + mov sp,r11 @ alloca + add r2,r1,r2,lsl#6 @ len to point at the end of inp + + vld1.8 {q0},[r1]! + vld1.8 {q1},[r1]! + vld1.8 {q2},[r1]! + vld1.8 {q3},[r1]! + vld1.32 {q8},[r14,:128]! + vld1.32 {q9},[r14,:128]! + vld1.32 {q10},[r14,:128]! + vld1.32 {q11},[r14,:128]! + vrev32.8 q0,q0 @ yes, even on + str r0,[sp,#64] + vrev32.8 q1,q1 @ big-endian + str r1,[sp,#68] + mov r1,sp + vrev32.8 q2,q2 + str r2,[sp,#72] + vrev32.8 q3,q3 + str r12,[sp,#76] @ save original sp + vadd.i32 q8,q8,q0 + vadd.i32 q9,q9,q1 + vst1.32 {q8},[r1,:128]! + vadd.i32 q10,q10,q2 + vst1.32 {q9},[r1,:128]! 
+ vadd.i32 q11,q11,q3 + vst1.32 {q10},[r1,:128]! + vst1.32 {q11},[r1,:128]! + + ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11} + sub r1,r1,#64 + ldr r2,[sp,#0] + eor r12,r12,r12 + eor r3,r5,r6 + b L_00_48 + +.align 4 +L_00_48: + vext.8 q8,q0,q1,#4 + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + vext.8 q9,q2,q3,#4 + add r4,r4,r12 + and r2,r2,r8 + eor r12,r0,r8,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vadd.i32 q0,q0,q9 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + vshr.u32 q9,q8,#3 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#4] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + veor q9,q9,q10 + add r10,r10,r2 + vsli.32 q11,q8,#14 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + vshr.u32 d24,d7,#17 + add r11,r11,r3 + and r2,r2,r7 + veor q9,q9,q11 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + vsli.32 d24,d7,#15 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + vshr.u32 d25,d7,#10 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + vadd.i32 q0,q0,q9 + add r10,r10,r2 + ldr r2,[sp,#8] + veor d25,d25,d24 + and r12,r12,r3 + add r6,r6,r10 + vshr.u32 d24,d7,#19 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + vsli.32 d24,d7,#13 + add r9,r9,r2 + eor r2,r7,r8 + veor d25,d25,d24 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + vadd.i32 d0,d0,d25 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + vshr.u32 d24,d0,#17 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + vsli.32 d24,d0,#15 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + vshr.u32 d25,d0,#10 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + veor d25,d25,d24 + ldr r2,[sp,#12] + and r3,r3,r12 + vshr.u32 d24,d0,#19 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + vld1.32 {q8},[r14,:128]! + add r8,r8,r2 + vsli.32 d24,d0,#13 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + veor d25,d25,d24 + add r9,r9,r3 + and r2,r2,r5 + vadd.i32 d1,d1,d25 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + vadd.i32 q8,q8,q0 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#16] + and r12,r12,r3 + add r4,r4,r8 + vst1.32 {q8},[r1,:128]! + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vext.8 q8,q1,q2,#4 + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + vext.8 q9,q3,q0,#4 + add r8,r8,r12 + and r2,r2,r4 + eor r12,r0,r4,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vadd.i32 q1,q1,q9 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + vshr.u32 q9,q8,#3 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#20] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + veor q9,q9,q10 + add r6,r6,r2 + vsli.32 q11,q8,#14 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + vshr.u32 d24,d1,#17 + add r7,r7,r3 + and r2,r2,r11 + veor q9,q9,q11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + vsli.32 d24,d1,#15 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + vshr.u32 d25,d1,#10 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + vadd.i32 q1,q1,q9 + add r6,r6,r2 + ldr r2,[sp,#24] + veor d25,d25,d24 + and r12,r12,r3 + add r10,r10,r6 + vshr.u32 d24,d1,#19 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + vsli.32 d24,d1,#13 + add r5,r5,r2 + eor r2,r11,r4 + veor d25,d25,d24 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + vadd.i32 d2,d2,d25 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + vshr.u32 d24,d2,#17 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + vsli.32 d24,d2,#15 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + vshr.u32 d25,d2,#10 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + veor d25,d25,d24 + ldr r2,[sp,#28] + and r3,r3,r12 + vshr.u32 d24,d2,#19 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + vld1.32 {q8},[r14,:128]! 
+ add r4,r4,r2 + vsli.32 d24,d2,#13 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + veor d25,d25,d24 + add r5,r5,r3 + and r2,r2,r9 + vadd.i32 d3,d3,d25 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + vadd.i32 q8,q8,q1 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[sp,#32] + and r12,r12,r3 + add r8,r8,r4 + vst1.32 {q8},[r1,:128]! + add r4,r4,r0,ror#2 + eor r12,r12,r6 + vext.8 q8,q2,q3,#4 + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + vext.8 q9,q0,q1,#4 + add r4,r4,r12 + and r2,r2,r8 + eor r12,r0,r8,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vadd.i32 q2,q2,q9 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + vshr.u32 q9,q8,#3 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#36] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + veor q9,q9,q10 + add r10,r10,r2 + vsli.32 q11,q8,#14 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + vshr.u32 d24,d3,#17 + add r11,r11,r3 + and r2,r2,r7 + veor q9,q9,q11 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + vsli.32 d24,d3,#15 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + vshr.u32 d25,d3,#10 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + vadd.i32 q2,q2,q9 + add r10,r10,r2 + ldr r2,[sp,#40] + veor d25,d25,d24 + and r12,r12,r3 + add r6,r6,r10 + vshr.u32 d24,d3,#19 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + vsli.32 d24,d3,#13 + add r9,r9,r2 + eor r2,r7,r8 + veor d25,d25,d24 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + vadd.i32 d4,d4,d25 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + vshr.u32 d24,d4,#17 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + vsli.32 d24,d4,#15 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + vshr.u32 d25,d4,#10 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + veor d25,d25,d24 + ldr r2,[sp,#44] + and r3,r3,r12 + vshr.u32 d24,d4,#19 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + vld1.32 {q8},[r14,:128]! + add r8,r8,r2 + vsli.32 d24,d4,#13 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + veor d25,d25,d24 + add r9,r9,r3 + and r2,r2,r5 + vadd.i32 d5,d5,d25 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + vadd.i32 q8,q8,q2 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#48] + and r12,r12,r3 + add r4,r4,r8 + vst1.32 {q8},[r1,:128]! 
+ add r8,r8,r0,ror#2 + eor r12,r12,r10 + vext.8 q8,q3,q0,#4 + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + vext.8 q9,q1,q2,#4 + add r8,r8,r12 + and r2,r2,r4 + eor r12,r0,r4,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vadd.i32 q3,q3,q9 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + vshr.u32 q9,q8,#3 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#52] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + veor q9,q9,q10 + add r6,r6,r2 + vsli.32 q11,q8,#14 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + vshr.u32 d24,d5,#17 + add r7,r7,r3 + and r2,r2,r11 + veor q9,q9,q11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + vsli.32 d24,d5,#15 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + vshr.u32 d25,d5,#10 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + vadd.i32 q3,q3,q9 + add r6,r6,r2 + ldr r2,[sp,#56] + veor d25,d25,d24 + and r12,r12,r3 + add r10,r10,r6 + vshr.u32 d24,d5,#19 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + vsli.32 d24,d5,#13 + add r5,r5,r2 + eor r2,r11,r4 + veor d25,d25,d24 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + vadd.i32 d6,d6,d25 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + vshr.u32 d24,d6,#17 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + vsli.32 d24,d6,#15 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + vshr.u32 d25,d6,#10 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + veor d25,d25,d24 + ldr r2,[sp,#60] + and r3,r3,r12 + vshr.u32 d24,d6,#19 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + vld1.32 {q8},[r14,:128]! + add r4,r4,r2 + vsli.32 d24,d6,#13 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + veor d25,d25,d24 + add r5,r5,r3 + and r2,r2,r9 + vadd.i32 d7,d7,d25 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + vadd.i32 q8,q8,q3 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[r14] + and r12,r12,r3 + add r8,r8,r4 + vst1.32 {q8},[r1,:128]! + add r4,r4,r0,ror#2 + eor r12,r12,r6 + teq r2,#0 @ check for K256 terminator + ldr r2,[sp,#0] + sub r1,r1,#64 + bne L_00_48 + + ldr r1,[sp,#68] + ldr r0,[sp,#72] + sub r14,r14,#256 @ rewind r14 + teq r1,r0 + it eq + subeq r1,r1,#64 @ avoid SEGV + vld1.8 {q0},[r1]! @ load next input block + vld1.8 {q1},[r1]! + vld1.8 {q2},[r1]! + vld1.8 {q3},[r1]! + it ne + strne r1,[sp,#68] + mov r1,sp + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + add r4,r4,r12 + vld1.32 {q8},[r14,:128]! 
+ and r2,r2,r8 + eor r12,r0,r8,ror#19 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vrev32.8 q0,q0 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vadd.i32 q8,q8,q0 + ldr r2,[sp,#4] + and r3,r3,r12 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + add r10,r10,r2 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + add r11,r11,r3 + and r2,r2,r7 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + add r10,r10,r2 + ldr r2,[sp,#8] + and r12,r12,r3 + add r6,r6,r10 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + add r9,r9,r2 + eor r2,r7,r8 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + ldr r2,[sp,#12] + and r3,r3,r12 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + add r8,r8,r2 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + add r9,r9,r3 + and r2,r2,r5 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#16] + and r12,r12,r3 + add r4,r4,r8 + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vst1.32 {q8},[r1,:128]! + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + add r8,r8,r12 + vld1.32 {q8},[r14,:128]! + and r2,r2,r4 + eor r12,r0,r4,ror#19 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vrev32.8 q1,q1 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vadd.i32 q8,q8,q1 + ldr r2,[sp,#20] + and r3,r3,r12 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + add r6,r6,r2 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + add r7,r7,r3 + and r2,r2,r11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + add r6,r6,r2 + ldr r2,[sp,#24] + and r12,r12,r3 + add r10,r10,r6 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + add r5,r5,r2 + eor r2,r11,r4 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + ldr r2,[sp,#28] + and r3,r3,r12 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + add r4,r4,r2 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + add r5,r5,r3 + and r2,r2,r9 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[sp,#32] + and r12,r12,r3 + add r8,r8,r4 + add r4,r4,r0,ror#2 + eor r12,r12,r6 + vst1.32 {q8},[r1,:128]! + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + add r4,r4,r12 + vld1.32 {q8},[r14,:128]! 
+ and r2,r2,r8 + eor r12,r0,r8,ror#19 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vrev32.8 q2,q2 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vadd.i32 q8,q8,q2 + ldr r2,[sp,#36] + and r3,r3,r12 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + add r10,r10,r2 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + add r11,r11,r3 + and r2,r2,r7 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + add r10,r10,r2 + ldr r2,[sp,#40] + and r12,r12,r3 + add r6,r6,r10 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + add r9,r9,r2 + eor r2,r7,r8 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + ldr r2,[sp,#44] + and r3,r3,r12 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + add r8,r8,r2 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + add r9,r9,r3 + and r2,r2,r5 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#48] + and r12,r12,r3 + add r4,r4,r8 + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vst1.32 {q8},[r1,:128]! + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + add r8,r8,r12 + vld1.32 {q8},[r14,:128]! + and r2,r2,r4 + eor r12,r0,r4,ror#19 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vrev32.8 q3,q3 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vadd.i32 q8,q8,q3 + ldr r2,[sp,#52] + and r3,r3,r12 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + add r6,r6,r2 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + add r7,r7,r3 + and r2,r2,r11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + add r6,r6,r2 + ldr r2,[sp,#56] + and r12,r12,r3 + add r10,r10,r6 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + add r5,r5,r2 + eor r2,r11,r4 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + ldr r2,[sp,#60] + and r3,r3,r12 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + add r4,r4,r2 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + add r5,r5,r3 + and r2,r2,r9 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[sp,#64] + and r12,r12,r3 + add r8,r8,r4 + add r4,r4,r0,ror#2 + eor r12,r12,r6 + vst1.32 {q8},[r1,:128]! 
+ ldr r0,[r2,#0] + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldr r12,[r2,#4] + ldr r3,[r2,#8] + ldr r1,[r2,#12] + add r4,r4,r0 @ accumulate + ldr r0,[r2,#16] + add r5,r5,r12 + ldr r12,[r2,#20] + add r6,r6,r3 + ldr r3,[r2,#24] + add r7,r7,r1 + ldr r1,[r2,#28] + add r8,r8,r0 + str r4,[r2],#4 + add r9,r9,r12 + str r5,[r2],#4 + add r10,r10,r3 + str r6,[r2],#4 + add r11,r11,r1 + str r7,[r2],#4 + stmia r2,{r8,r9,r10,r11} + + ittte ne + movne r1,sp + ldrne r2,[sp,#0] + eorne r12,r12,r12 + ldreq sp,[sp,#76] @ restore original sp + itt ne + eorne r3,r5,r6 + bne L_00_48 + + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} + +#endif +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + +# if defined(__thumb2__) +# define INST(a,b,c,d) .byte c,d|0xc,a,b +# else +# define INST(a,b,c,d) .byte a,b,c,d +# endif + +#ifdef __thumb2__ +.thumb_func sha256_block_data_order_armv8 +#endif +.align 5 +sha256_block_data_order_armv8: +LARMv8: + vld1.32 {q0,q1},[r0] + sub r3,r3,#256+32 + add r2,r1,r2,lsl#6 @ len to point at the end of inp + b Loop_v8 + +.align 4 +Loop_v8: + vld1.8 {q8,q9},[r1]! + vld1.8 {q10,q11},[r1]! + vld1.32 {q12},[r3]! + vrev32.8 q8,q8 + vrev32.8 q9,q9 + vrev32.8 q10,q10 + vrev32.8 q11,q11 + vmov q14,q0 @ offload + vmov q15,q1 + teq r1,r2 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q8 + INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q9 + INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q10 + INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q11 + INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q8 + INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q9 + INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q10 + INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q11 + INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q8 + INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 + vld1.32 {q12},[r3]! 
+ vadd.i32 q13,q13,q9 + INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q10 + INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q11 + INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q8 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q9 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + + vld1.32 {q13},[r3] + vadd.i32 q12,q12,q10 + sub r3,r3,#256-16 @ rewind + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + + vadd.i32 q13,q13,q11 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + + vadd.i32 q0,q0,q14 + vadd.i32 q1,q1,q15 + it ne + bne Loop_v8 + + vst1.32 {q0,q1},[r0] + + bx lr @ bx lr + +#endif +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.comm _OPENSSL_armcap_P,4 +.non_lazy_symbol_pointer +OPENSSL_armcap_P: +.indirect_symbol _OPENSSL_armcap_P +.long 0 +.private_extern _OPENSSL_armcap_P +#endif +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/sha512-armv4.S b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/sha512-armv4.S new file mode 100644 index 0000000000..21913cb2ba --- /dev/null +++ b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/sha512-armv4.S @@ -0,0 +1,1899 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +@ +@ Licensed under the OpenSSL license (the "License"). You may not use +@ this file except in compliance with the License. You can obtain a copy +@ in the file LICENSE in the source distribution or at +@ https://www.openssl.org/source/license.html + + +@ ==================================================================== +@ Written by Andy Polyakov for the OpenSSL +@ project. The module is, however, dual licensed under OpenSSL and +@ CRYPTOGAMS licenses depending on where you obtain it. For further +@ details see http://www.openssl.org/~appro/cryptogams/. +@ +@ Permission to use under GPL terms is granted. +@ ==================================================================== + +@ SHA512 block procedure for ARMv4. September 2007. 
+
+@ This code is ~4.5 (four and a half) times faster than code generated
+@ by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
+@ Xscale PXA250 core].
+@
+@ July 2010.
+@
+@ Rescheduling for dual-issue pipeline resulted in 6% improvement on
+@ Cortex A8 core and ~40 cycles per processed byte.
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in 7%
+@ improvement on Cortex A8 core and ~38 cycles per byte.
+
+@ March 2011.
+@
+@ Add NEON implementation. On Cortex A8 it was measured to process
+@ one byte in 23.3 cycles or ~60% faster than integer-only code.
+
+@ August 2012.
+@
+@ Improve NEON performance by 12% on Snapdragon S4. In absolute
+@ terms it's 22.6 cycles per byte, which is disappointing result.
+@ Technical writers asserted that 3-way S4 pipeline can sustain
+@ multiple NEON instructions per cycle, but dual NEON issue could
+@ not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
+@ for further details. On side note Cortex-A15 processes one byte in
+@ 16 cycles.
+
+@ Byte order [in]dependence. =========================================
+@
+@ Originally caller was expected to maintain specific *dword* order in
+@ h[0-7], namely with most significant dword at *lower* address, which
+@ was reflected in below two parameters as 0 and 4. Now caller is
+@ expected to maintain native byte order for whole 64-bit values.
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
+# define VFP_ABI_POP vldmia sp!,{d8-d15}
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+#endif
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
+ + +#ifdef __ARMEL__ +# define LO 0 +# define HI 4 +# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1 +#else +# define HI 0 +# define LO 4 +# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 +#endif + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +# define adrl adr +#else +.code 32 +#endif + + +.align 5 +K512: + WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd) + WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc) + WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019) + WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118) + WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe) + WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2) + WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1) + WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694) + WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3) + WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65) + WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483) + WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5) + WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210) + WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4) + WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725) + WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70) + WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926) + WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df) + WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8) + WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b) + WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001) + WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30) + WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910) + WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8) + WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) + WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) + WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) + WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) + WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) + WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec) + WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) + WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b) + WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) + WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) + WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) + WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) + WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) + WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) + WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) + WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) + +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +LOPENSSL_armcap: +.word OPENSSL_armcap_P-Lsha512_block_data_order +.skip 32-4 +#else +.skip 32 +#endif + +.globl _sha512_block_data_order +.private_extern _sha512_block_data_order +#ifdef __thumb2__ +.thumb_func _sha512_block_data_order +#endif +_sha512_block_data_order: +Lsha512_block_data_order: +#if __ARM_ARCH__<7 && !defined(__thumb2__) + sub r3,pc,#8 @ _sha512_block_data_order +#else + adr r3,Lsha512_block_data_order +#endif +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + ldr r12,LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P +#ifdef __APPLE__ + ldr r12,[r12] +#endif + tst r12,#ARMV7_NEON + bne LNEON +#endif + add r2,r1,r2,lsl#7 @ len to point at the end of inp + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + sub r14,r3,#672 @ K512 + sub sp,sp,#9*8 + + ldr r7,[r0,#32+LO] + ldr r8,[r0,#32+HI] + ldr r9, [r0,#48+LO] + ldr r10, [r0,#48+HI] + ldr r11, [r0,#56+LO] + ldr r12, [r0,#56+HI] +Loop: + str r9, [sp,#48+0] + str r10, [sp,#48+4] + str 
r11, [sp,#56+0] + str r12, [sp,#56+4] + ldr r5,[r0,#0+LO] + ldr r6,[r0,#0+HI] + ldr r3,[r0,#8+LO] + ldr r4,[r0,#8+HI] + ldr r9, [r0,#16+LO] + ldr r10, [r0,#16+HI] + ldr r11, [r0,#24+LO] + ldr r12, [r0,#24+HI] + str r3,[sp,#8+0] + str r4,[sp,#8+4] + str r9, [sp,#16+0] + str r10, [sp,#16+4] + str r11, [sp,#24+0] + str r12, [sp,#24+4] + ldr r3,[r0,#40+LO] + ldr r4,[r0,#40+HI] + str r3,[sp,#40+0] + str r4,[sp,#40+4] + +L00_15: +#if __ARM_ARCH__<7 + ldrb r3,[r1,#7] + ldrb r9, [r1,#6] + ldrb r10, [r1,#5] + ldrb r11, [r1,#4] + ldrb r4,[r1,#3] + ldrb r12, [r1,#2] + orr r3,r3,r9,lsl#8 + ldrb r9, [r1,#1] + orr r3,r3,r10,lsl#16 + ldrb r10, [r1],#8 + orr r3,r3,r11,lsl#24 + orr r4,r4,r12,lsl#8 + orr r4,r4,r9,lsl#16 + orr r4,r4,r10,lsl#24 +#else + ldr r3,[r1,#4] + ldr r4,[r1],#8 +#ifdef __ARMEL__ + rev r3,r3 + rev r4,r4 +#endif +#endif + @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) + @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 + @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 + mov r9,r7,lsr#14 + str r3,[sp,#64+0] + mov r10,r8,lsr#14 + str r4,[sp,#64+4] + eor r9,r9,r8,lsl#18 + ldr r11,[sp,#56+0] @ h.lo + eor r10,r10,r7,lsl#18 + ldr r12,[sp,#56+4] @ h.hi + eor r9,r9,r7,lsr#18 + eor r10,r10,r8,lsr#18 + eor r9,r9,r8,lsl#14 + eor r10,r10,r7,lsl#14 + eor r9,r9,r8,lsr#9 + eor r10,r10,r7,lsr#9 + eor r9,r9,r7,lsl#23 + eor r10,r10,r8,lsl#23 @ Sigma1(e) + adds r3,r3,r9 + ldr r9,[sp,#40+0] @ f.lo + adc r4,r4,r10 @ T += Sigma1(e) + ldr r10,[sp,#40+4] @ f.hi + adds r3,r3,r11 + ldr r11,[sp,#48+0] @ g.lo + adc r4,r4,r12 @ T += h + ldr r12,[sp,#48+4] @ g.hi + + eor r9,r9,r11 + str r7,[sp,#32+0] + eor r10,r10,r12 + str r8,[sp,#32+4] + and r9,r9,r7 + str r5,[sp,#0+0] + and r10,r10,r8 + str r6,[sp,#0+4] + eor r9,r9,r11 + ldr r11,[r14,#LO] @ K[i].lo + eor r10,r10,r12 @ Ch(e,f,g) + ldr r12,[r14,#HI] @ K[i].hi + + adds r3,r3,r9 + ldr r7,[sp,#24+0] @ d.lo + adc r4,r4,r10 @ T += Ch(e,f,g) + ldr r8,[sp,#24+4] @ d.hi + adds r3,r3,r11 + and r9,r11,#0xff + adc r4,r4,r12 @ T += K[i] + adds r7,r7,r3 + ldr r11,[sp,#8+0] @ b.lo + adc r8,r8,r4 @ d += T + teq r9,#148 + + ldr r12,[sp,#16+0] @ c.lo +#if __ARM_ARCH__>=7 + it eq @ Thumb2 thing, sanity check in ARM +#endif + orreq r14,r14,#1 + @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) + @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 + @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 + mov r9,r5,lsr#28 + mov r10,r6,lsr#28 + eor r9,r9,r6,lsl#4 + eor r10,r10,r5,lsl#4 + eor r9,r9,r6,lsr#2 + eor r10,r10,r5,lsr#2 + eor r9,r9,r5,lsl#30 + eor r10,r10,r6,lsl#30 + eor r9,r9,r6,lsr#7 + eor r10,r10,r5,lsr#7 + eor r9,r9,r5,lsl#25 + eor r10,r10,r6,lsl#25 @ Sigma0(a) + adds r3,r3,r9 + and r9,r5,r11 + adc r4,r4,r10 @ T += Sigma0(a) + + ldr r10,[sp,#8+4] @ b.hi + orr r5,r5,r11 + ldr r11,[sp,#16+4] @ c.hi + and r5,r5,r12 + and r12,r6,r10 + orr r6,r6,r10 + orr r5,r5,r9 @ Maj(a,b,c).lo + and r6,r6,r11 + adds r5,r5,r3 + orr r6,r6,r12 @ Maj(a,b,c).hi + sub sp,sp,#8 + adc r6,r6,r4 @ h += T + tst r14,#1 + add r14,r14,#8 + tst r14,#1 + beq L00_15 + ldr r9,[sp,#184+0] + ldr r10,[sp,#184+4] + bic r14,r14,#1 +L16_79: + @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) + @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 + @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 + mov r3,r9,lsr#1 + ldr r11,[sp,#80+0] + mov r4,r10,lsr#1 + ldr r12,[sp,#80+4] + eor r3,r3,r10,lsl#31 + eor r4,r4,r9,lsl#31 + eor r3,r3,r9,lsr#8 + eor r4,r4,r10,lsr#8 + eor r3,r3,r10,lsl#24 + eor r4,r4,r9,lsl#24 + eor r3,r3,r9,lsr#7 + eor r4,r4,r10,lsr#7 + eor r3,r3,r10,lsl#25 + + @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ 
((x)>>6)) + @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26 + @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6 + mov r9,r11,lsr#19 + mov r10,r12,lsr#19 + eor r9,r9,r12,lsl#13 + eor r10,r10,r11,lsl#13 + eor r9,r9,r12,lsr#29 + eor r10,r10,r11,lsr#29 + eor r9,r9,r11,lsl#3 + eor r10,r10,r12,lsl#3 + eor r9,r9,r11,lsr#6 + eor r10,r10,r12,lsr#6 + ldr r11,[sp,#120+0] + eor r9,r9,r12,lsl#26 + + ldr r12,[sp,#120+4] + adds r3,r3,r9 + ldr r9,[sp,#192+0] + adc r4,r4,r10 + + ldr r10,[sp,#192+4] + adds r3,r3,r11 + adc r4,r4,r12 + adds r3,r3,r9 + adc r4,r4,r10 + @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) + @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 + @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 + mov r9,r7,lsr#14 + str r3,[sp,#64+0] + mov r10,r8,lsr#14 + str r4,[sp,#64+4] + eor r9,r9,r8,lsl#18 + ldr r11,[sp,#56+0] @ h.lo + eor r10,r10,r7,lsl#18 + ldr r12,[sp,#56+4] @ h.hi + eor r9,r9,r7,lsr#18 + eor r10,r10,r8,lsr#18 + eor r9,r9,r8,lsl#14 + eor r10,r10,r7,lsl#14 + eor r9,r9,r8,lsr#9 + eor r10,r10,r7,lsr#9 + eor r9,r9,r7,lsl#23 + eor r10,r10,r8,lsl#23 @ Sigma1(e) + adds r3,r3,r9 + ldr r9,[sp,#40+0] @ f.lo + adc r4,r4,r10 @ T += Sigma1(e) + ldr r10,[sp,#40+4] @ f.hi + adds r3,r3,r11 + ldr r11,[sp,#48+0] @ g.lo + adc r4,r4,r12 @ T += h + ldr r12,[sp,#48+4] @ g.hi + + eor r9,r9,r11 + str r7,[sp,#32+0] + eor r10,r10,r12 + str r8,[sp,#32+4] + and r9,r9,r7 + str r5,[sp,#0+0] + and r10,r10,r8 + str r6,[sp,#0+4] + eor r9,r9,r11 + ldr r11,[r14,#LO] @ K[i].lo + eor r10,r10,r12 @ Ch(e,f,g) + ldr r12,[r14,#HI] @ K[i].hi + + adds r3,r3,r9 + ldr r7,[sp,#24+0] @ d.lo + adc r4,r4,r10 @ T += Ch(e,f,g) + ldr r8,[sp,#24+4] @ d.hi + adds r3,r3,r11 + and r9,r11,#0xff + adc r4,r4,r12 @ T += K[i] + adds r7,r7,r3 + ldr r11,[sp,#8+0] @ b.lo + adc r8,r8,r4 @ d += T + teq r9,#23 + + ldr r12,[sp,#16+0] @ c.lo +#if __ARM_ARCH__>=7 + it eq @ Thumb2 thing, sanity check in ARM +#endif + orreq r14,r14,#1 + @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) + @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 + @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 + mov r9,r5,lsr#28 + mov r10,r6,lsr#28 + eor r9,r9,r6,lsl#4 + eor r10,r10,r5,lsl#4 + eor r9,r9,r6,lsr#2 + eor r10,r10,r5,lsr#2 + eor r9,r9,r5,lsl#30 + eor r10,r10,r6,lsl#30 + eor r9,r9,r6,lsr#7 + eor r10,r10,r5,lsr#7 + eor r9,r9,r5,lsl#25 + eor r10,r10,r6,lsl#25 @ Sigma0(a) + adds r3,r3,r9 + and r9,r5,r11 + adc r4,r4,r10 @ T += Sigma0(a) + + ldr r10,[sp,#8+4] @ b.hi + orr r5,r5,r11 + ldr r11,[sp,#16+4] @ c.hi + and r5,r5,r12 + and r12,r6,r10 + orr r6,r6,r10 + orr r5,r5,r9 @ Maj(a,b,c).lo + and r6,r6,r11 + adds r5,r5,r3 + orr r6,r6,r12 @ Maj(a,b,c).hi + sub sp,sp,#8 + adc r6,r6,r4 @ h += T + tst r14,#1 + add r14,r14,#8 +#if __ARM_ARCH__>=7 + ittt eq @ Thumb2 thing, sanity check in ARM +#endif + ldreq r9,[sp,#184+0] + ldreq r10,[sp,#184+4] + beq L16_79 + bic r14,r14,#1 + + ldr r3,[sp,#8+0] + ldr r4,[sp,#8+4] + ldr r9, [r0,#0+LO] + ldr r10, [r0,#0+HI] + ldr r11, [r0,#8+LO] + ldr r12, [r0,#8+HI] + adds r9,r5,r9 + str r9, [r0,#0+LO] + adc r10,r6,r10 + str r10, [r0,#0+HI] + adds r11,r3,r11 + str r11, [r0,#8+LO] + adc r12,r4,r12 + str r12, [r0,#8+HI] + + ldr r5,[sp,#16+0] + ldr r6,[sp,#16+4] + ldr r3,[sp,#24+0] + ldr r4,[sp,#24+4] + ldr r9, [r0,#16+LO] + ldr r10, [r0,#16+HI] + ldr r11, [r0,#24+LO] + ldr r12, [r0,#24+HI] + adds r9,r5,r9 + str r9, [r0,#16+LO] + adc r10,r6,r10 + str r10, [r0,#16+HI] + adds r11,r3,r11 + str r11, [r0,#24+LO] + adc r12,r4,r12 + str r12, [r0,#24+HI] + + ldr r3,[sp,#40+0] + ldr r4,[sp,#40+4] + ldr r9, [r0,#32+LO] + ldr r10, [r0,#32+HI] + ldr 
r11, [r0,#40+LO] + ldr r12, [r0,#40+HI] + adds r7,r7,r9 + str r7,[r0,#32+LO] + adc r8,r8,r10 + str r8,[r0,#32+HI] + adds r11,r3,r11 + str r11, [r0,#40+LO] + adc r12,r4,r12 + str r12, [r0,#40+HI] + + ldr r5,[sp,#48+0] + ldr r6,[sp,#48+4] + ldr r3,[sp,#56+0] + ldr r4,[sp,#56+4] + ldr r9, [r0,#48+LO] + ldr r10, [r0,#48+HI] + ldr r11, [r0,#56+LO] + ldr r12, [r0,#56+HI] + adds r9,r5,r9 + str r9, [r0,#48+LO] + adc r10,r6,r10 + str r10, [r0,#48+HI] + adds r11,r3,r11 + str r11, [r0,#56+LO] + adc r12,r4,r12 + str r12, [r0,#56+HI] + + add sp,sp,#640 + sub r14,r14,#640 + + teq r1,r2 + bne Loop + + add sp,sp,#8*9 @ destroy frame +#if __ARM_ARCH__>=5 + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} +#else + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif + +#if __ARM_MAX_ARCH__>=7 + + + +.globl _sha512_block_data_order_neon +.private_extern _sha512_block_data_order_neon +#ifdef __thumb2__ +.thumb_func _sha512_block_data_order_neon +#endif +.align 4 +_sha512_block_data_order_neon: +LNEON: + dmb @ errata #451034 on early Cortex A8 + add r2,r1,r2,lsl#7 @ len to point at the end of inp + adr r3,K512 + VFP_ABI_PUSH + vldmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ load context +Loop_neon: + vshr.u64 d24,d20,#14 @ 0 +#if 0<16 + vld1.64 {d0},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d20,#18 +#if 0>0 + vadd.i64 d16,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d20,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vmov d29,d20 + vsli.64 d26,d20,#23 +#if 0<16 && defined(__ARMEL__) + vrev64.8 d0,d0 +#endif + veor d25,d24 + vbsl d29,d21,d22 @ Ch(e,f,g) + vshr.u64 d24,d16,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d23 + vshr.u64 d25,d16,#34 + vsli.64 d24,d16,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d16,#39 + vadd.i64 d28,d0 + vsli.64 d25,d16,#30 + veor d30,d16,d17 + vsli.64 d26,d16,#25 + veor d23,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d18,d17 @ Maj(a,b,c) + veor d23,d26 @ Sigma0(a) + vadd.i64 d19,d27 + vadd.i64 d30,d27 + @ vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 1 +#if 1<16 + vld1.64 {d1},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d19,#18 +#if 1>0 + vadd.i64 d23,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vmov d29,d19 + vsli.64 d26,d19,#23 +#if 1<16 && defined(__ARMEL__) + vrev64.8 d1,d1 +#endif + veor d25,d24 + vbsl d29,d20,d21 @ Ch(e,f,g) + vshr.u64 d24,d23,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d22 + vshr.u64 d25,d23,#34 + vsli.64 d24,d23,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d23,#39 + vadd.i64 d28,d1 + vsli.64 d25,d23,#30 + veor d30,d23,d16 + vsli.64 d26,d23,#25 + veor d22,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d17,d16 @ Maj(a,b,c) + veor d22,d26 @ Sigma0(a) + vadd.i64 d18,d27 + vadd.i64 d30,d27 + @ vadd.i64 d22,d30 + vshr.u64 d24,d18,#14 @ 2 +#if 2<16 + vld1.64 {d2},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d18,#18 +#if 2>0 + vadd.i64 d22,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d18,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vmov d29,d18 + vsli.64 d26,d18,#23 +#if 2<16 && defined(__ARMEL__) + vrev64.8 d2,d2 +#endif + veor d25,d24 + vbsl d29,d19,d20 @ Ch(e,f,g) + vshr.u64 d24,d22,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d21 + vshr.u64 d25,d22,#34 + vsli.64 d24,d22,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d22,#39 + vadd.i64 d28,d2 + vsli.64 d25,d22,#30 + veor d30,d22,d23 + vsli.64 d26,d22,#25 + veor d21,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d16,d23 @ Maj(a,b,c) + veor d21,d26 @ Sigma0(a) + vadd.i64 d17,d27 + vadd.i64 d30,d27 + @ vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 3 +#if 3<16 + vld1.64 {d3},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 +#if 3>0 + vadd.i64 d21,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vmov d29,d17 + vsli.64 d26,d17,#23 +#if 3<16 && defined(__ARMEL__) + vrev64.8 d3,d3 +#endif + veor d25,d24 + vbsl d29,d18,d19 @ Ch(e,f,g) + vshr.u64 d24,d21,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d20 + vshr.u64 d25,d21,#34 + vsli.64 d24,d21,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d21,#39 + vadd.i64 d28,d3 + vsli.64 d25,d21,#30 + veor d30,d21,d22 + vsli.64 d26,d21,#25 + veor d20,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d23,d22 @ Maj(a,b,c) + veor d20,d26 @ Sigma0(a) + vadd.i64 d16,d27 + vadd.i64 d30,d27 + @ vadd.i64 d20,d30 + vshr.u64 d24,d16,#14 @ 4 +#if 4<16 + vld1.64 {d4},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d16,#18 +#if 4>0 + vadd.i64 d20,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d16,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vmov d29,d16 + vsli.64 d26,d16,#23 +#if 4<16 && defined(__ARMEL__) + vrev64.8 d4,d4 +#endif + veor d25,d24 + vbsl d29,d17,d18 @ Ch(e,f,g) + vshr.u64 d24,d20,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d19 + vshr.u64 d25,d20,#34 + vsli.64 d24,d20,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d20,#39 + vadd.i64 d28,d4 + vsli.64 d25,d20,#30 + veor d30,d20,d21 + vsli.64 d26,d20,#25 + veor d19,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d22,d21 @ Maj(a,b,c) + veor d19,d26 @ Sigma0(a) + vadd.i64 d23,d27 + vadd.i64 d30,d27 + @ vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 5 +#if 5<16 + vld1.64 {d5},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 +#if 5>0 + vadd.i64 d19,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vmov d29,d23 + vsli.64 d26,d23,#23 +#if 5<16 && defined(__ARMEL__) + vrev64.8 d5,d5 +#endif + veor d25,d24 + vbsl d29,d16,d17 @ Ch(e,f,g) + vshr.u64 d24,d19,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d18 + vshr.u64 d25,d19,#34 + vsli.64 d24,d19,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d19,#39 + vadd.i64 d28,d5 + vsli.64 d25,d19,#30 + veor d30,d19,d20 + vsli.64 d26,d19,#25 + veor d18,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d21,d20 @ Maj(a,b,c) + veor d18,d26 @ Sigma0(a) + vadd.i64 d22,d27 + vadd.i64 d30,d27 + @ vadd.i64 d18,d30 + vshr.u64 d24,d22,#14 @ 6 +#if 6<16 + vld1.64 {d6},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d22,#18 +#if 6>0 + vadd.i64 d18,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d22,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vmov d29,d22 + vsli.64 d26,d22,#23 +#if 6<16 && defined(__ARMEL__) + vrev64.8 d6,d6 +#endif + veor d25,d24 + vbsl d29,d23,d16 @ Ch(e,f,g) + vshr.u64 d24,d18,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d17 + vshr.u64 d25,d18,#34 + vsli.64 d24,d18,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d18,#39 + vadd.i64 d28,d6 + vsli.64 d25,d18,#30 + veor d30,d18,d19 + vsli.64 d26,d18,#25 + veor d17,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d20,d19 @ Maj(a,b,c) + veor d17,d26 @ Sigma0(a) + vadd.i64 d21,d27 + vadd.i64 d30,d27 + @ vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 7 +#if 7<16 + vld1.64 {d7},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 +#if 7>0 + vadd.i64 d17,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vmov d29,d21 + vsli.64 d26,d21,#23 +#if 7<16 && defined(__ARMEL__) + vrev64.8 d7,d7 +#endif + veor d25,d24 + vbsl d29,d22,d23 @ Ch(e,f,g) + vshr.u64 d24,d17,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d16 + vshr.u64 d25,d17,#34 + vsli.64 d24,d17,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d17,#39 + vadd.i64 d28,d7 + vsli.64 d25,d17,#30 + veor d30,d17,d18 + vsli.64 d26,d17,#25 + veor d16,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d19,d18 @ Maj(a,b,c) + veor d16,d26 @ Sigma0(a) + vadd.i64 d20,d27 + vadd.i64 d30,d27 + @ vadd.i64 d16,d30 + vshr.u64 d24,d20,#14 @ 8 +#if 8<16 + vld1.64 {d8},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d20,#18 +#if 8>0 + vadd.i64 d16,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d20,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vmov d29,d20 + vsli.64 d26,d20,#23 +#if 8<16 && defined(__ARMEL__) + vrev64.8 d8,d8 +#endif + veor d25,d24 + vbsl d29,d21,d22 @ Ch(e,f,g) + vshr.u64 d24,d16,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d23 + vshr.u64 d25,d16,#34 + vsli.64 d24,d16,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d16,#39 + vadd.i64 d28,d8 + vsli.64 d25,d16,#30 + veor d30,d16,d17 + vsli.64 d26,d16,#25 + veor d23,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d18,d17 @ Maj(a,b,c) + veor d23,d26 @ Sigma0(a) + vadd.i64 d19,d27 + vadd.i64 d30,d27 + @ vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 9 +#if 9<16 + vld1.64 {d9},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d19,#18 +#if 9>0 + vadd.i64 d23,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vmov d29,d19 + vsli.64 d26,d19,#23 +#if 9<16 && defined(__ARMEL__) + vrev64.8 d9,d9 +#endif + veor d25,d24 + vbsl d29,d20,d21 @ Ch(e,f,g) + vshr.u64 d24,d23,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d22 + vshr.u64 d25,d23,#34 + vsli.64 d24,d23,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d23,#39 + vadd.i64 d28,d9 + vsli.64 d25,d23,#30 + veor d30,d23,d16 + vsli.64 d26,d23,#25 + veor d22,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d17,d16 @ Maj(a,b,c) + veor d22,d26 @ Sigma0(a) + vadd.i64 d18,d27 + vadd.i64 d30,d27 + @ vadd.i64 d22,d30 + vshr.u64 d24,d18,#14 @ 10 +#if 10<16 + vld1.64 {d10},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d18,#18 +#if 10>0 + vadd.i64 d22,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d18,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vmov d29,d18 + vsli.64 d26,d18,#23 +#if 10<16 && defined(__ARMEL__) + vrev64.8 d10,d10 +#endif + veor d25,d24 + vbsl d29,d19,d20 @ Ch(e,f,g) + vshr.u64 d24,d22,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d21 + vshr.u64 d25,d22,#34 + vsli.64 d24,d22,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d22,#39 + vadd.i64 d28,d10 + vsli.64 d25,d22,#30 + veor d30,d22,d23 + vsli.64 d26,d22,#25 + veor d21,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d16,d23 @ Maj(a,b,c) + veor d21,d26 @ Sigma0(a) + vadd.i64 d17,d27 + vadd.i64 d30,d27 + @ vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 11 +#if 11<16 + vld1.64 {d11},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 +#if 11>0 + vadd.i64 d21,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vmov d29,d17 + vsli.64 d26,d17,#23 +#if 11<16 && defined(__ARMEL__) + vrev64.8 d11,d11 +#endif + veor d25,d24 + vbsl d29,d18,d19 @ Ch(e,f,g) + vshr.u64 d24,d21,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d20 + vshr.u64 d25,d21,#34 + vsli.64 d24,d21,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d21,#39 + vadd.i64 d28,d11 + vsli.64 d25,d21,#30 + veor d30,d21,d22 + vsli.64 d26,d21,#25 + veor d20,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d23,d22 @ Maj(a,b,c) + veor d20,d26 @ Sigma0(a) + vadd.i64 d16,d27 + vadd.i64 d30,d27 + @ vadd.i64 d20,d30 + vshr.u64 d24,d16,#14 @ 12 +#if 12<16 + vld1.64 {d12},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d16,#18 +#if 12>0 + vadd.i64 d20,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d16,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vmov d29,d16 + vsli.64 d26,d16,#23 +#if 12<16 && defined(__ARMEL__) + vrev64.8 d12,d12 +#endif + veor d25,d24 + vbsl d29,d17,d18 @ Ch(e,f,g) + vshr.u64 d24,d20,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d19 + vshr.u64 d25,d20,#34 + vsli.64 d24,d20,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d20,#39 + vadd.i64 d28,d12 + vsli.64 d25,d20,#30 + veor d30,d20,d21 + vsli.64 d26,d20,#25 + veor d19,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d22,d21 @ Maj(a,b,c) + veor d19,d26 @ Sigma0(a) + vadd.i64 d23,d27 + vadd.i64 d30,d27 + @ vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 13 +#if 13<16 + vld1.64 {d13},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 +#if 13>0 + vadd.i64 d19,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vmov d29,d23 + vsli.64 d26,d23,#23 +#if 13<16 && defined(__ARMEL__) + vrev64.8 d13,d13 +#endif + veor d25,d24 + vbsl d29,d16,d17 @ Ch(e,f,g) + vshr.u64 d24,d19,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d18 + vshr.u64 d25,d19,#34 + vsli.64 d24,d19,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d19,#39 + vadd.i64 d28,d13 + vsli.64 d25,d19,#30 + veor d30,d19,d20 + vsli.64 d26,d19,#25 + veor d18,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d21,d20 @ Maj(a,b,c) + veor d18,d26 @ Sigma0(a) + vadd.i64 d22,d27 + vadd.i64 d30,d27 + @ vadd.i64 d18,d30 + vshr.u64 d24,d22,#14 @ 14 +#if 14<16 + vld1.64 {d14},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d22,#18 +#if 14>0 + vadd.i64 d18,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d22,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vmov d29,d22 + vsli.64 d26,d22,#23 +#if 14<16 && defined(__ARMEL__) + vrev64.8 d14,d14 +#endif + veor d25,d24 + vbsl d29,d23,d16 @ Ch(e,f,g) + vshr.u64 d24,d18,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d17 + vshr.u64 d25,d18,#34 + vsli.64 d24,d18,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d18,#39 + vadd.i64 d28,d14 + vsli.64 d25,d18,#30 + veor d30,d18,d19 + vsli.64 d26,d18,#25 + veor d17,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d20,d19 @ Maj(a,b,c) + veor d17,d26 @ Sigma0(a) + vadd.i64 d21,d27 + vadd.i64 d30,d27 + @ vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 15 +#if 15<16 + vld1.64 {d15},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 +#if 15>0 + vadd.i64 d17,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vmov d29,d21 + vsli.64 d26,d21,#23 +#if 15<16 && defined(__ARMEL__) + vrev64.8 d15,d15 +#endif + veor d25,d24 + vbsl d29,d22,d23 @ Ch(e,f,g) + vshr.u64 d24,d17,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d16 + vshr.u64 d25,d17,#34 + vsli.64 d24,d17,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d17,#39 + vadd.i64 d28,d15 + vsli.64 d25,d17,#30 + veor d30,d17,d18 + vsli.64 d26,d17,#25 + veor d16,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d19,d18 @ Maj(a,b,c) + veor d16,d26 @ Sigma0(a) + vadd.i64 d20,d27 + vadd.i64 d30,d27 + @ vadd.i64 d16,d30 + mov r12,#4 +L16_79_neon: + subs r12,#1 + vshr.u64 q12,q7,#19 + vshr.u64 q13,q7,#61 + vadd.i64 d16,d30 @ h+=Maj from the past + vshr.u64 q15,q7,#6 + vsli.64 q12,q7,#45 + vext.8 q14,q0,q1,#8 @ X[i+1] + vsli.64 q13,q7,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q0,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q4,q5,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d20,#14 @ from NEON_00_15 + vadd.i64 q0,q14 + vshr.u64 d25,d20,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d20,#41 @ from NEON_00_15 + vadd.i64 q0,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vmov d29,d20 + vsli.64 d26,d20,#23 +#if 16<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d21,d22 @ Ch(e,f,g) + vshr.u64 d24,d16,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d23 + vshr.u64 d25,d16,#34 + vsli.64 d24,d16,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d16,#39 + vadd.i64 d28,d0 + vsli.64 d25,d16,#30 + veor d30,d16,d17 + vsli.64 d26,d16,#25 + veor d23,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d18,d17 @ Maj(a,b,c) + veor d23,d26 @ Sigma0(a) + vadd.i64 d19,d27 + vadd.i64 d30,d27 + @ vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 17 +#if 17<16 + vld1.64 {d1},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d19,#18 +#if 17>0 + vadd.i64 d23,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vmov d29,d19 + vsli.64 d26,d19,#23 +#if 17<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d20,d21 @ Ch(e,f,g) + vshr.u64 d24,d23,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d22 + vshr.u64 d25,d23,#34 + vsli.64 d24,d23,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d23,#39 + vadd.i64 d28,d1 + vsli.64 d25,d23,#30 + veor d30,d23,d16 + vsli.64 d26,d23,#25 + veor d22,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d17,d16 @ Maj(a,b,c) + veor d22,d26 @ Sigma0(a) + vadd.i64 d18,d27 + vadd.i64 d30,d27 + @ vadd.i64 d22,d30 + vshr.u64 q12,q0,#19 + vshr.u64 q13,q0,#61 + vadd.i64 d22,d30 @ h+=Maj from the past + vshr.u64 q15,q0,#6 + vsli.64 q12,q0,#45 + vext.8 q14,q1,q2,#8 @ X[i+1] + vsli.64 q13,q0,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q1,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q5,q6,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d18,#14 @ from NEON_00_15 + vadd.i64 q1,q14 + vshr.u64 d25,d18,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d18,#41 @ from NEON_00_15 + vadd.i64 q1,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vmov d29,d18 + vsli.64 d26,d18,#23 +#if 18<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d19,d20 @ Ch(e,f,g) + vshr.u64 d24,d22,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d21 + vshr.u64 d25,d22,#34 + vsli.64 d24,d22,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d22,#39 + vadd.i64 d28,d2 + vsli.64 d25,d22,#30 + veor d30,d22,d23 + vsli.64 d26,d22,#25 + veor d21,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d16,d23 @ Maj(a,b,c) + veor d21,d26 @ Sigma0(a) + vadd.i64 d17,d27 + vadd.i64 d30,d27 + @ vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 19 +#if 19<16 + vld1.64 {d3},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 +#if 19>0 + vadd.i64 d21,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vmov d29,d17 + vsli.64 d26,d17,#23 +#if 19<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d18,d19 @ Ch(e,f,g) + vshr.u64 d24,d21,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d20 + vshr.u64 d25,d21,#34 + vsli.64 d24,d21,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d21,#39 + vadd.i64 d28,d3 + vsli.64 d25,d21,#30 + veor d30,d21,d22 + vsli.64 d26,d21,#25 + veor d20,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d23,d22 @ Maj(a,b,c) + veor d20,d26 @ Sigma0(a) + vadd.i64 d16,d27 + vadd.i64 d30,d27 + @ vadd.i64 d20,d30 + vshr.u64 q12,q1,#19 + vshr.u64 q13,q1,#61 + vadd.i64 d20,d30 @ h+=Maj from the past + vshr.u64 q15,q1,#6 + vsli.64 q12,q1,#45 + vext.8 q14,q2,q3,#8 @ X[i+1] + vsli.64 q13,q1,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q2,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q6,q7,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d16,#14 @ from NEON_00_15 + vadd.i64 q2,q14 + vshr.u64 d25,d16,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d16,#41 @ from NEON_00_15 + vadd.i64 q2,q15 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vmov d29,d16 + vsli.64 d26,d16,#23 +#if 20<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d17,d18 @ Ch(e,f,g) + vshr.u64 d24,d20,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d19 + vshr.u64 d25,d20,#34 + vsli.64 d24,d20,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d20,#39 + vadd.i64 d28,d4 + vsli.64 d25,d20,#30 + veor d30,d20,d21 + vsli.64 d26,d20,#25 + veor d19,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d22,d21 @ Maj(a,b,c) + veor d19,d26 @ Sigma0(a) + vadd.i64 d23,d27 + vadd.i64 d30,d27 + @ vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 21 +#if 21<16 + vld1.64 {d5},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 +#if 21>0 + vadd.i64 d19,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vmov d29,d23 + vsli.64 d26,d23,#23 +#if 21<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d16,d17 @ Ch(e,f,g) + vshr.u64 d24,d19,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d18 + vshr.u64 d25,d19,#34 + vsli.64 d24,d19,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d19,#39 + vadd.i64 d28,d5 + vsli.64 d25,d19,#30 + veor d30,d19,d20 + vsli.64 d26,d19,#25 + veor d18,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d21,d20 @ Maj(a,b,c) + veor d18,d26 @ Sigma0(a) + vadd.i64 d22,d27 + vadd.i64 d30,d27 + @ vadd.i64 d18,d30 + vshr.u64 q12,q2,#19 + vshr.u64 q13,q2,#61 + vadd.i64 d18,d30 @ h+=Maj from the past + vshr.u64 q15,q2,#6 + vsli.64 q12,q2,#45 + vext.8 q14,q3,q4,#8 @ X[i+1] + vsli.64 q13,q2,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q3,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q7,q0,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d22,#14 @ from NEON_00_15 + vadd.i64 q3,q14 + vshr.u64 d25,d22,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d22,#41 @ from NEON_00_15 + vadd.i64 q3,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vmov d29,d22 + vsli.64 d26,d22,#23 +#if 22<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d23,d16 @ Ch(e,f,g) + vshr.u64 d24,d18,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d17 + vshr.u64 d25,d18,#34 + vsli.64 d24,d18,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d18,#39 + vadd.i64 d28,d6 + vsli.64 d25,d18,#30 + veor d30,d18,d19 + vsli.64 d26,d18,#25 + veor d17,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d20,d19 @ Maj(a,b,c) + veor d17,d26 @ Sigma0(a) + vadd.i64 d21,d27 + vadd.i64 d30,d27 + @ vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 23 +#if 23<16 + vld1.64 {d7},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 +#if 23>0 + vadd.i64 d17,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vmov d29,d21 + vsli.64 d26,d21,#23 +#if 23<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d22,d23 @ Ch(e,f,g) + vshr.u64 d24,d17,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d16 + vshr.u64 d25,d17,#34 + vsli.64 d24,d17,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d17,#39 + vadd.i64 d28,d7 + vsli.64 d25,d17,#30 + veor d30,d17,d18 + vsli.64 d26,d17,#25 + veor d16,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d19,d18 @ Maj(a,b,c) + veor d16,d26 @ Sigma0(a) + vadd.i64 d20,d27 + vadd.i64 d30,d27 + @ vadd.i64 d16,d30 + vshr.u64 q12,q3,#19 + vshr.u64 q13,q3,#61 + vadd.i64 d16,d30 @ h+=Maj from the past + vshr.u64 q15,q3,#6 + vsli.64 q12,q3,#45 + vext.8 q14,q4,q5,#8 @ X[i+1] + vsli.64 q13,q3,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q4,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q0,q1,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d20,#14 @ from NEON_00_15 + vadd.i64 q4,q14 + vshr.u64 d25,d20,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d20,#41 @ from NEON_00_15 + vadd.i64 q4,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vmov d29,d20 + vsli.64 d26,d20,#23 +#if 24<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d21,d22 @ Ch(e,f,g) + vshr.u64 d24,d16,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d23 + vshr.u64 d25,d16,#34 + vsli.64 d24,d16,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d16,#39 + vadd.i64 d28,d8 + vsli.64 d25,d16,#30 + veor d30,d16,d17 + vsli.64 d26,d16,#25 + veor d23,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d18,d17 @ Maj(a,b,c) + veor d23,d26 @ Sigma0(a) + vadd.i64 d19,d27 + vadd.i64 d30,d27 + @ vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 25 +#if 25<16 + vld1.64 {d9},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d19,#18 +#if 25>0 + vadd.i64 d23,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vmov d29,d19 + vsli.64 d26,d19,#23 +#if 25<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d20,d21 @ Ch(e,f,g) + vshr.u64 d24,d23,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d22 + vshr.u64 d25,d23,#34 + vsli.64 d24,d23,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d23,#39 + vadd.i64 d28,d9 + vsli.64 d25,d23,#30 + veor d30,d23,d16 + vsli.64 d26,d23,#25 + veor d22,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d17,d16 @ Maj(a,b,c) + veor d22,d26 @ Sigma0(a) + vadd.i64 d18,d27 + vadd.i64 d30,d27 + @ vadd.i64 d22,d30 + vshr.u64 q12,q4,#19 + vshr.u64 q13,q4,#61 + vadd.i64 d22,d30 @ h+=Maj from the past + vshr.u64 q15,q4,#6 + vsli.64 q12,q4,#45 + vext.8 q14,q5,q6,#8 @ X[i+1] + vsli.64 q13,q4,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q5,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q1,q2,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d18,#14 @ from NEON_00_15 + vadd.i64 q5,q14 + vshr.u64 d25,d18,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d18,#41 @ from NEON_00_15 + vadd.i64 q5,q15 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vmov d29,d18 + vsli.64 d26,d18,#23 +#if 26<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d19,d20 @ Ch(e,f,g) + vshr.u64 d24,d22,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d21 + vshr.u64 d25,d22,#34 + vsli.64 d24,d22,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d22,#39 + vadd.i64 d28,d10 + vsli.64 d25,d22,#30 + veor d30,d22,d23 + vsli.64 d26,d22,#25 + veor d21,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d16,d23 @ Maj(a,b,c) + veor d21,d26 @ Sigma0(a) + vadd.i64 d17,d27 + vadd.i64 d30,d27 + @ vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 27 +#if 27<16 + vld1.64 {d11},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 +#if 27>0 + vadd.i64 d21,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vmov d29,d17 + vsli.64 d26,d17,#23 +#if 27<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d18,d19 @ Ch(e,f,g) + vshr.u64 d24,d21,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d20 + vshr.u64 d25,d21,#34 + vsli.64 d24,d21,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d21,#39 + vadd.i64 d28,d11 + vsli.64 d25,d21,#30 + veor d30,d21,d22 + vsli.64 d26,d21,#25 + veor d20,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d23,d22 @ Maj(a,b,c) + veor d20,d26 @ Sigma0(a) + vadd.i64 d16,d27 + vadd.i64 d30,d27 + @ vadd.i64 d20,d30 + vshr.u64 q12,q5,#19 + vshr.u64 q13,q5,#61 + vadd.i64 d20,d30 @ h+=Maj from the past + vshr.u64 q15,q5,#6 + vsli.64 q12,q5,#45 + vext.8 q14,q6,q7,#8 @ X[i+1] + vsli.64 q13,q5,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q6,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q2,q3,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d16,#14 @ from NEON_00_15 + vadd.i64 q6,q14 + vshr.u64 d25,d16,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d16,#41 @ from NEON_00_15 + vadd.i64 q6,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vmov d29,d16 + vsli.64 d26,d16,#23 +#if 28<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d17,d18 @ Ch(e,f,g) + vshr.u64 d24,d20,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d19 + vshr.u64 d25,d20,#34 + vsli.64 d24,d20,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d20,#39 + vadd.i64 d28,d12 + vsli.64 d25,d20,#30 + veor d30,d20,d21 + vsli.64 d26,d20,#25 + veor d19,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d22,d21 @ Maj(a,b,c) + veor d19,d26 @ Sigma0(a) + vadd.i64 d23,d27 + vadd.i64 d30,d27 + @ vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 29 +#if 29<16 + vld1.64 {d13},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 +#if 29>0 + vadd.i64 d19,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vmov d29,d23 + vsli.64 d26,d23,#23 +#if 29<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d16,d17 @ Ch(e,f,g) + vshr.u64 d24,d19,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d18 + vshr.u64 d25,d19,#34 + vsli.64 d24,d19,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d19,#39 + vadd.i64 d28,d13 + vsli.64 d25,d19,#30 + veor d30,d19,d20 + vsli.64 d26,d19,#25 + veor d18,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d21,d20 @ Maj(a,b,c) + veor d18,d26 @ Sigma0(a) + vadd.i64 d22,d27 + vadd.i64 d30,d27 + @ vadd.i64 d18,d30 + vshr.u64 q12,q6,#19 + vshr.u64 q13,q6,#61 + vadd.i64 d18,d30 @ h+=Maj from the past + vshr.u64 q15,q6,#6 + vsli.64 q12,q6,#45 + vext.8 q14,q7,q0,#8 @ X[i+1] + vsli.64 q13,q6,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q7,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q3,q4,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d22,#14 @ from NEON_00_15 + vadd.i64 q7,q14 + vshr.u64 d25,d22,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d22,#41 @ from NEON_00_15 + vadd.i64 q7,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vmov d29,d22 + vsli.64 d26,d22,#23 +#if 30<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d23,d16 @ Ch(e,f,g) + vshr.u64 d24,d18,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d17 + vshr.u64 d25,d18,#34 + vsli.64 d24,d18,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d18,#39 + vadd.i64 d28,d14 + vsli.64 d25,d18,#30 + veor d30,d18,d19 + vsli.64 d26,d18,#25 + veor d17,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d20,d19 @ Maj(a,b,c) + veor d17,d26 @ Sigma0(a) + vadd.i64 d21,d27 + vadd.i64 d30,d27 + @ vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 31 +#if 31<16 + vld1.64 {d15},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 +#if 31>0 + vadd.i64 d17,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vmov d29,d21 + vsli.64 d26,d21,#23 +#if 31<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d22,d23 @ Ch(e,f,g) + vshr.u64 d24,d17,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d16 + vshr.u64 d25,d17,#34 + vsli.64 d24,d17,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d17,#39 + vadd.i64 d28,d15 + vsli.64 d25,d17,#30 + veor d30,d17,d18 + vsli.64 d26,d17,#25 + veor d16,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d19,d18 @ Maj(a,b,c) + veor d16,d26 @ Sigma0(a) + vadd.i64 d20,d27 + vadd.i64 d30,d27 + @ vadd.i64 d16,d30 + bne L16_79_neon + + vadd.i64 d16,d30 @ h+=Maj from the past + vldmia r0,{d24,d25,d26,d27,d28,d29,d30,d31} @ load context to temp + vadd.i64 q8,q12 @ vectorized accumulate + vadd.i64 q9,q13 + vadd.i64 q10,q14 + vadd.i64 q11,q15 + vstmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ save context + teq r1,r2 + sub r3,#640 @ rewind K512 + bne Loop_neon + + VFP_ABI_POP + bx lr @ .word 0xe12fff1e + +#endif +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.comm _OPENSSL_armcap_P,4 +.non_lazy_symbol_pointer +OPENSSL_armcap_P: +.indirect_symbol _OPENSSL_armcap_P +.long 0 +.private_extern _OPENSSL_armcap_P +#endif +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/vpaes-armv7.S b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/vpaes-armv7.S new file mode 100644 index 0000000000..6aead7cac2 --- /dev/null +++ b/packager/third_party/boringssl/ios-arm/crypto/fipsmodule/vpaes-armv7.S @@ -0,0 +1,1265 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
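+// vpaes ("vector permutation" AES, by Mike Hamburg, per the credit string
+// below) computes AES with NEON table lookups over 4-bit nibbles rather than
+// a byte-indexed S-box in memory, so no secret-dependent addresses are used.
+// The recurring pattern is, roughly (illustrative C-like sketch only):
+//     lo = x & 0x0f;  hi = x >> 4;
+//     y  = tbl_lo[lo] ^ tbl_hi[hi];
+// which appears throughout this file as vand / vshr.u8 #4 / vtbl.8 / veor.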
+ +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.syntax unified + + + + +#if defined(__thumb2__) +.thumb +#else +.code 32 +#endif + +.text + + +.align 7 @ totally strategic alignment +_vpaes_consts: +Lk_mc_forward:@ mc_forward +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 +Lk_mc_backward:@ mc_backward +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F +Lk_sr:@ sr +.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +@ +@ "Hot" constants +@ +Lk_inv:@ inv, inva +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 +Lk_ipt:@ input transform (lo, hi) +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 +Lk_sbo:@ sbou, sbot +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA +Lk_sb1:@ sb1u, sb1t +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +Lk_sb2:@ sb2u, sb2t +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,55,32,78,69,79,78,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.align 2 + +.align 6 +@@ +@@ _aes_preheat +@@ +@@ Fills q9-q15 as specified below. +@@ +#ifdef __thumb2__ +.thumb_func _vpaes_preheat +#endif +.align 4 +_vpaes_preheat: + adr r10, Lk_inv + vmov.i8 q9, #0x0f @ Lk_s0F + vld1.64 {q10,q11}, [r10]! @ Lk_inv + add r10, r10, #64 @ Skip Lk_ipt, Lk_sbo + vld1.64 {q12,q13}, [r10]! @ Lk_sb1 + vld1.64 {q14,q15}, [r10] @ Lk_sb2 + bx lr + +@@ +@@ _aes_encrypt_core +@@ +@@ AES-encrypt q0. +@@ +@@ Inputs: +@@ q0 = input +@@ q9-q15 as in _vpaes_preheat +@@ [r2] = scheduled keys +@@ +@@ Output in q0 +@@ Clobbers q1-q5, r8-r11 +@@ Preserves q6-q8 so you get some local vectors +@@ +@@ +#ifdef __thumb2__ +.thumb_func _vpaes_encrypt_core +#endif +.align 4 +_vpaes_encrypt_core: + mov r9, r2 + ldr r8, [r2,#240] @ pull rounds + adr r11, Lk_ipt + @ vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + @ vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + vld1.64 {q2, q3}, [r11] + adr r11, Lk_mc_forward+16 + vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 # round0 key + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 + vtbl.8 d2, {q2}, d2 @ vpshufb %xmm1, %xmm2, %xmm1 + vtbl.8 d3, {q2}, d3 + vtbl.8 d4, {q3}, d0 @ vpshufb %xmm0, %xmm3, %xmm2 + vtbl.8 d5, {q3}, d1 + veor q0, q1, q5 @ vpxor %xmm5, %xmm1, %xmm0 + veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 + + @ .Lenc_entry ends with a bnz instruction which is normally paired with + @ subs in .Lenc_loop. + tst r8, r8 + b Lenc_entry + +.align 4 +Lenc_loop: + @ middle of middle round + add r10, r11, #0x40 + vtbl.8 d8, {q13}, d4 @ vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + vtbl.8 d9, {q13}, d5 + vld1.64 {q1}, [r11]! 
@ vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[] + vtbl.8 d0, {q12}, d6 @ vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + vtbl.8 d1, {q12}, d7 + veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + vtbl.8 d10, {q15}, d4 @ vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + vtbl.8 d11, {q15}, d5 + veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A + vtbl.8 d4, {q14}, d6 @ vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + vtbl.8 d5, {q14}, d7 + vld1.64 {q4}, [r10] @ vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[] + vtbl.8 d6, {q0}, d2 @ vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + vtbl.8 d7, {q0}, d3 + veor q2, q2, q5 @ vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + @ Write to q5 instead of q0, so the table and destination registers do + @ not overlap. + vtbl.8 d10, {q0}, d8 @ vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + vtbl.8 d11, {q0}, d9 + veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + vtbl.8 d8, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + vtbl.8 d9, {q3}, d3 + @ Here we restore the original q0/q5 usage. + veor q0, q5, q3 @ vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + and r11, r11, #~(1<<6) @ and $0x30, %r11 # ... mod 4 + veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + subs r8, r8, #1 @ nr-- + +Lenc_entry: + @ top of round + vand q1, q0, q9 @ vpand %xmm0, %xmm9, %xmm1 # 0 = k + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i + vtbl.8 d10, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + vtbl.8 d11, {q11}, d3 + veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j + vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + vtbl.8 d7, {q10}, d1 + vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + vtbl.8 d9, {q10}, d3 + veor q3, q3, q5 @ vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + vtbl.8 d5, {q10}, d7 + vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + vtbl.8 d7, {q10}, d9 + veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io + veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 + bne Lenc_loop + + @ middle of last round + add r10, r11, #0x80 + + adr r11, Lk_sbo + @ Read to q1 instead of q4, so the vtbl.8 instruction below does not + @ overlap table and destination registers. + vld1.64 {q1}, [r11]! @ vmovdqa -0x60(%r10), %xmm4 # 3 : sbou + vld1.64 {q0}, [r11] @ vmovdqa -0x50(%r10), %xmm0 # 0 : sbot Lk_sbo+16 + vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + vtbl.8 d9, {q1}, d5 + vld1.64 {q1}, [r10] @ vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[] + @ Write to q2 instead of q0 below, to avoid overlapping table and + @ destination registers. + vtbl.8 d4, {q0}, d6 @ vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + vtbl.8 d5, {q0}, d7 + veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + veor q2, q2, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A + @ Here we restore the original q0/q2 usage. + vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0 + vtbl.8 d1, {q2}, d3 + bx lr + + +.globl _vpaes_encrypt +.private_extern _vpaes_encrypt +#ifdef __thumb2__ +.thumb_func _vpaes_encrypt +#endif +.align 4 +_vpaes_encrypt: + @ _vpaes_encrypt_core uses r8-r11. Round up to r7-r11 to maintain stack + @ alignment. + stmdb sp!, {r7,r8,r9,r10,r11,lr} + @ _vpaes_encrypt_core uses q4-q5 (d8-d11), which are callee-saved. 
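+ @ Per AAPCS, d8-d15 are callee-saved, hence the vstmdb/vldmia of d8-d11
+ @ around the core call; r0, r1 and r2 carry the input block, output block
+ @ and expanded key. As a hedged sketch, the corresponding C-level call looks
+ @ roughly like:
+ @     vpaes_encrypt(in /* r0 */, out /* r1 */, key /* r2 */);
+ @ with a single 16-byte block passing through q0.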
+ vstmdb sp!, {d8,d9,d10,d11} + + vld1.64 {q0}, [r0] + bl _vpaes_preheat + bl _vpaes_encrypt_core + vst1.64 {q0}, [r1] + + vldmia sp!, {d8,d9,d10,d11} + ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return + + +@ +@ Decryption stuff +@ + +.align 4 +_vpaes_decrypt_consts: +Lk_dipt:@ decryption input transform +.quad 0x0F505B040B545F00, 0x154A411E114E451A +.quad 0x86E383E660056500, 0x12771772F491F194 +Lk_dsbo:@ decryption sbox final output +.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D +.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C +Lk_dsb9:@ decryption sbox output *9*u, *9*t +.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 +.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 +Lk_dsbd:@ decryption sbox output *D*u, *D*t +.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 +.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 +Lk_dsbb:@ decryption sbox output *B*u, *B*t +.quad 0xD022649296B44200, 0x602646F6B0F2D404 +.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B +Lk_dsbe:@ decryption sbox output *E*u, *E*t +.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 +.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 + + +@@ +@@ Decryption core +@@ +@@ Same API as encryption core, except it clobbers q12-q15 rather than using +@@ the values from _vpaes_preheat. q9-q11 must still be set from +@@ _vpaes_preheat. +@@ +#ifdef __thumb2__ +.thumb_func _vpaes_decrypt_core +#endif +.align 4 +_vpaes_decrypt_core: + mov r9, r2 + ldr r8, [r2,#240] @ pull rounds + + @ This function performs shuffles with various constants. The x86_64 + @ version loads them on-demand into %xmm0-%xmm5. This does not work well + @ for ARMv7 because those registers are shuffle destinations. The ARMv8 + @ version preloads those constants into registers, but ARMv7 has half + @ the registers to work with. Instead, we load them on-demand into + @ q12-q15, registers normally use for preloaded constants. This is fine + @ because decryption doesn't use those constants. The values are + @ constant, so this does not interfere with potential 2x optimizations. + adr r7, Lk_dipt + + vld1.64 {q12,q13}, [r7] @ vmovdqa Lk_dipt(%rip), %xmm2 # iptlo + lsl r11, r8, #4 @ mov %rax, %r11; shl $4, %r11 + eor r11, r11, #0x30 @ xor $0x30, %r11 + adr r10, Lk_sr + and r11, r11, #0x30 @ and $0x30, %r11 + add r11, r11, r10 + adr r10, Lk_mc_forward+48 + + vld1.64 {q4}, [r9]! @ vmovdqu (%r9), %xmm4 # round0 key + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 + vtbl.8 d4, {q12}, d2 @ vpshufb %xmm1, %xmm2, %xmm2 + vtbl.8 d5, {q12}, d3 + vld1.64 {q5}, [r10] @ vmovdqa Lk_mc_forward+48(%rip), %xmm5 + @ vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi + vtbl.8 d0, {q13}, d0 @ vpshufb %xmm0, %xmm1, %xmm0 + vtbl.8 d1, {q13}, d1 + veor q2, q2, q4 @ vpxor %xmm4, %xmm2, %xmm2 + veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 + + @ .Ldec_entry ends with a bnz instruction which is normally paired with + @ subs in .Ldec_loop. + tst r8, r8 + b Ldec_entry + +.align 4 +Ldec_loop: +@ +@ Inverse mix columns +@ + + @ We load .Lk_dsb* into q12-q15 on-demand. See the comment at the top of + @ the function. + adr r10, Lk_dsb9 + vld1.64 {q12,q13}, [r10]! @ vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + @ vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + @ Load sbd* ahead of time. + vld1.64 {q14,q15}, [r10]! 
@ vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + @ vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt + vtbl.8 d8, {q12}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u + vtbl.8 d9, {q12}, d5 + vtbl.8 d2, {q13}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + vtbl.8 d3, {q13}, d7 + veor q0, q4, q0 @ vpxor %xmm4, %xmm0, %xmm0 + + veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + + @ Load sbb* ahead of time. + vld1.64 {q12,q13}, [r10]! @ vmovdqa 0x20(%r10),%xmm4 # 4 : sbbu + @ vmovdqa 0x30(%r10),%xmm1 # 0 : sbbt + + vtbl.8 d8, {q14}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + vtbl.8 d9, {q14}, d5 + @ Write to q1 instead of q0, so the table and destination registers do + @ not overlap. + vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch + vtbl.8 d3, {q0}, d11 + @ Here we restore the original q0/q1 usage. This instruction is + @ reordered from the ARMv8 version so we do not clobber the vtbl.8 + @ below. + veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + vtbl.8 d2, {q15}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + vtbl.8 d3, {q15}, d7 + @ vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + @ vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt + + @ Load sbd* ahead of time. + vld1.64 {q14,q15}, [r10]! @ vmovdqa 0x40(%r10),%xmm4 # 4 : sbeu + @ vmovdqa 0x50(%r10),%xmm1 # 0 : sbet + + vtbl.8 d8, {q12}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + vtbl.8 d9, {q12}, d5 + @ Write to q1 instead of q0, so the table and destination registers do + @ not overlap. + vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch + vtbl.8 d3, {q0}, d11 + @ Here we restore the original q0/q1 usage. This instruction is + @ reordered from the ARMv8 version so we do not clobber the vtbl.8 + @ below. + veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + vtbl.8 d2, {q13}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + vtbl.8 d3, {q13}, d7 + veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + + vtbl.8 d8, {q14}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + vtbl.8 d9, {q14}, d5 + @ Write to q1 instead of q0, so the table and destination registers do + @ not overlap. + vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch + vtbl.8 d3, {q0}, d11 + @ Here we restore the original q0/q1 usage. This instruction is + @ reordered from the ARMv8 version so we do not clobber the vtbl.8 + @ below. + veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + vtbl.8 d2, {q15}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + vtbl.8 d3, {q15}, d7 + vext.8 q5, q5, q5, #12 @ vpalignr $12, %xmm5, %xmm5, %xmm5 + veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + subs r8, r8, #1 @ sub $1,%rax # nr-- + +Ldec_entry: + @ top of round + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i + vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + vtbl.8 d5, {q11}, d3 + veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j + vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + vtbl.8 d7, {q10}, d1 + vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + vtbl.8 d9, {q10}, d3 + veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + vtbl.8 d5, {q10}, d7 + vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + vtbl.8 d7, {q10}, d9 + veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io + veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + vld1.64 {q0}, [r9]! 
@ vmovdqu (%r9), %xmm0 + bne Ldec_loop + + @ middle of last round + + adr r10, Lk_dsbo + + @ Write to q1 rather than q4 to avoid overlapping table and destination. + vld1.64 {q1}, [r10]! @ vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + vtbl.8 d9, {q1}, d5 + @ Write to q2 rather than q1 to avoid overlapping table and destination. + vld1.64 {q2}, [r10] @ vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + vtbl.8 d2, {q2}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + vtbl.8 d3, {q2}, d7 + vld1.64 {q2}, [r11] @ vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160 + veor q4, q4, q0 @ vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + @ Write to q1 rather than q0 so the table and destination registers + @ below do not overlap. + veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm0 # 0 = A + vtbl.8 d0, {q1}, d4 @ vpshufb %xmm2, %xmm0, %xmm0 + vtbl.8 d1, {q1}, d5 + bx lr + + +.globl _vpaes_decrypt +.private_extern _vpaes_decrypt +#ifdef __thumb2__ +.thumb_func _vpaes_decrypt +#endif +.align 4 +_vpaes_decrypt: + @ _vpaes_decrypt_core uses r7-r11. + stmdb sp!, {r7,r8,r9,r10,r11,lr} + @ _vpaes_decrypt_core uses q4-q5 (d8-d11), which are callee-saved. + vstmdb sp!, {d8,d9,d10,d11} + + vld1.64 {q0}, [r0] + bl _vpaes_preheat + bl _vpaes_decrypt_core + vst1.64 {q0}, [r1] + + vldmia sp!, {d8,d9,d10,d11} + ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return + +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +@@ @@ +@@ AES key schedule @@ +@@ @@ +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + +@ This function diverges from both x86_64 and armv7 in which constants are +@ pinned. x86_64 has a common preheat function for all operations. aarch64 +@ separates them because it has enough registers to pin nearly all constants. +@ armv7 does not have enough registers, but needing explicit loads and stores +@ also complicates using x86_64's register allocation directly. +@ +@ We pin some constants for convenience and leave q14 and q15 free to load +@ others on demand. + +@ +@ Key schedule constants +@ + +.align 4 +_vpaes_key_consts: +Lk_dksd:@ decryption key schedule: invskew x*D +.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 +.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +Lk_dksb:@ decryption key schedule: invskew x*B +.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 +.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +Lk_dkse:@ decryption key schedule: invskew x*E + 0x63 +.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 +.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +Lk_dks9:@ decryption key schedule: invskew x*9 +.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC +.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + +Lk_rcon:@ rcon +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +Lk_opt:@ output transform +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 +Lk_deskew:@ deskew tables: inverts the sbox's "skew" +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + + +#ifdef __thumb2__ +.thumb_func _vpaes_key_preheat +#endif +.align 4 +_vpaes_key_preheat: + adr r11, Lk_rcon + vmov.i8 q12, #0x5b @ Lk_s63 + adr r10, Lk_inv @ Must be aligned to 8 mod 16. + vmov.i8 q9, #0x0f @ Lk_s0F + vld1.64 {q10,q11}, [r10] @ Lk_inv + vld1.64 {q8}, [r11] @ Lk_rcon + bx lr + + +#ifdef __thumb2__ +.thumb_func _vpaes_schedule_core +#endif +.align 4 +_vpaes_schedule_core: + @ We only need to save lr, but ARM requires an 8-byte stack alignment, + @ so save an extra register. 
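+ @ _vpaes_schedule_core drives key setup for both directions: r1 holds the
+ @ key size in bits, r3 selects the variant (0 = encrypting, non-zero =
+ @ decrypting; see the tst r3,r3 branches below) and r8 carries the current
+ @ Lk_sr offset. The stored round count is nbits/32+5, i.e. 9/11/13 for
+ @ 128/192/256-bit keys, one less than the conventional count, as the
+ @ *_key_to_bsaes converters below point out.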
+ stmdb sp!, {r3,lr} + + bl _vpaes_key_preheat @ load the tables + + adr r11, Lk_ipt @ Must be aligned to 8 mod 16. + vld1.64 {q0}, [r0]! @ vmovdqu (%rdi), %xmm0 # load key (unaligned) + + @ input transform + @ Use q4 here rather than q3 so .Lschedule_am_decrypting does not + @ overlap table and destination. + vmov q4, q0 @ vmovdqa %xmm0, %xmm3 + bl _vpaes_schedule_transform + adr r10, Lk_sr @ Must be aligned to 8 mod 16. + vmov q7, q0 @ vmovdqa %xmm0, %xmm7 + + add r8, r8, r10 + tst r3, r3 + bne Lschedule_am_decrypting + + @ encrypting, output zeroth round key after transform + vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx) + b Lschedule_go + +Lschedule_am_decrypting: + @ decrypting, output zeroth round key after shiftrows + vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 + vtbl.8 d6, {q4}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 + vtbl.8 d7, {q4}, d3 + vst1.64 {q3}, [r2] @ vmovdqu %xmm3, (%rdx) + eor r8, r8, #0x30 @ xor $0x30, %r8 + +Lschedule_go: + cmp r1, #192 @ cmp $192, %esi + bhi Lschedule_256 + beq Lschedule_192 + @ 128: fall though + +@@ +@@ .schedule_128 +@@ +@@ 128-bit specific part of key schedule. +@@ +@@ This schedule is really simple, because all its parts +@@ are accomplished by the subroutines. +@@ +Lschedule_128: + mov r0, #10 @ mov $10, %esi + +Loop_schedule_128: + bl _vpaes_schedule_round + subs r0, r0, #1 @ dec %esi + beq Lschedule_mangle_last + bl _vpaes_schedule_mangle @ write output + b Loop_schedule_128 + +@@ +@@ .aes_schedule_192 +@@ +@@ 192-bit specific part of key schedule. +@@ +@@ The main body of this schedule is the same as the 128-bit +@@ schedule, but with more smearing. The long, high side is +@@ stored in q7 as before, and the short, low side is in +@@ the high bits of q6. +@@ +@@ This schedule is somewhat nastier, however, because each +@@ round produces 192 bits of key material, or 1.5 round keys. +@@ Therefore, on each cycle we do 2 rounds and produce 3 round +@@ keys. +@@ +.align 4 +Lschedule_192: + sub r0, r0, #8 + vld1.64 {q0}, [r0] @ vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) + bl _vpaes_schedule_transform @ input transform + vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save short part + vmov.i8 d12, #0 @ vpxor %xmm4, %xmm4, %xmm4 # clear 4 + @ vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros + mov r0, #4 @ mov $4, %esi + +Loop_schedule_192: + bl _vpaes_schedule_round + vext.8 q0, q6, q0, #8 @ vpalignr $8,%xmm6,%xmm0,%xmm0 + bl _vpaes_schedule_mangle @ save key n + bl _vpaes_schedule_192_smear + bl _vpaes_schedule_mangle @ save key n+1 + bl _vpaes_schedule_round + subs r0, r0, #1 @ dec %esi + beq Lschedule_mangle_last + bl _vpaes_schedule_mangle @ save key n+2 + bl _vpaes_schedule_192_smear + b Loop_schedule_192 + +@@ +@@ .aes_schedule_256 +@@ +@@ 256-bit specific part of key schedule. +@@ +@@ The structure here is very similar to the 128-bit +@@ schedule, but with an additional "low side" in +@@ q6. The low side's rounds are the same as the +@@ high side's, except no rcon and no rotation. +@@ +.align 4 +Lschedule_256: + vld1.64 {q0}, [r0] @ vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + bl _vpaes_schedule_transform @ input transform + mov r0, #7 @ mov $7, %esi + +Loop_schedule_256: + bl _vpaes_schedule_mangle @ output low result + vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + @ high round + bl _vpaes_schedule_round + subs r0, r0, #1 @ dec %esi + beq Lschedule_mangle_last + bl _vpaes_schedule_mangle + + @ low round. 
swap xmm7 and xmm6 + vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0 + vmov.i8 q4, #0 + vmov q5, q7 @ vmovdqa %xmm7, %xmm5 + vmov q7, q6 @ vmovdqa %xmm6, %xmm7 + bl _vpaes_schedule_low_round + vmov q7, q5 @ vmovdqa %xmm5, %xmm7 + + b Loop_schedule_256 + +@@ +@@ .aes_schedule_mangle_last +@@ +@@ Mangler for last round of key schedule +@@ Mangles q0 +@@ when encrypting, outputs out(q0) ^ 63 +@@ when decrypting, outputs unskew(q0) +@@ +@@ Always called right before return... jumps to cleanup and exits +@@ +.align 4 +Lschedule_mangle_last: + @ schedule last round key from xmm0 + adr r11, Lk_deskew @ lea Lk_deskew(%rip),%r11 # prepare to deskew + tst r3, r3 + bne Lschedule_mangle_last_dec + + @ encrypting + vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10),%xmm1 + adr r11, Lk_opt @ lea Lk_opt(%rip), %r11 # prepare to output transform + add r2, r2, #32 @ add $32, %rdx + vmov q2, q0 + vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0 # output permute + vtbl.8 d1, {q2}, d3 + +Lschedule_mangle_last_dec: + sub r2, r2, #16 @ add $-16, %rdx + veor q0, q0, q12 @ vpxor Lk_s63(%rip), %xmm0, %xmm0 + bl _vpaes_schedule_transform @ output transform + vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx) # save last key + + @ cleanup + veor q0, q0, q0 @ vpxor %xmm0, %xmm0, %xmm0 + veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1 + veor q2, q2, q2 @ vpxor %xmm2, %xmm2, %xmm2 + veor q3, q3, q3 @ vpxor %xmm3, %xmm3, %xmm3 + veor q4, q4, q4 @ vpxor %xmm4, %xmm4, %xmm4 + veor q5, q5, q5 @ vpxor %xmm5, %xmm5, %xmm5 + veor q6, q6, q6 @ vpxor %xmm6, %xmm6, %xmm6 + veor q7, q7, q7 @ vpxor %xmm7, %xmm7, %xmm7 + ldmia sp!, {r3,pc} @ return + + +@@ +@@ .aes_schedule_192_smear +@@ +@@ Smear the short, low side in the 192-bit key schedule. +@@ +@@ Inputs: +@@ q7: high side, b a x y +@@ q6: low side, d c 0 0 +@@ +@@ Outputs: +@@ q6: b+c+d b+c 0 0 +@@ q0: b+c+d b+c b a +@@ +#ifdef __thumb2__ +.thumb_func _vpaes_schedule_192_smear +#endif +.align 4 +_vpaes_schedule_192_smear: + vmov.i8 q1, #0 + vdup.32 q0, d15[1] + vshl.i64 q1, q6, #32 @ vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 + vmov d0, d15 @ vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a + veor q6, q6, q1 @ vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 + veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1 + veor q6, q6, q0 @ vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a + vmov q0, q6 @ vmovdqa %xmm6, %xmm0 + vmov d12, d2 @ vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros + bx lr + + +@@ +@@ .aes_schedule_round +@@ +@@ Runs one main round of the key schedule on q0, q7 +@@ +@@ Specifically, runs subbytes on the high dword of q0 +@@ then rotates it by one byte and xors into the low dword of +@@ q7. +@@ +@@ Adds rcon from low byte of q8, then rotates q8 for +@@ next rcon. +@@ +@@ Smears the dwords of q7 by xoring the low into the +@@ second low, result into third, result into highest. +@@ +@@ Returns results in q7 = q0. +@@ Clobbers q1-q4, r11. +@@ +#ifdef __thumb2__ +.thumb_func _vpaes_schedule_round +#endif +.align 4 +_vpaes_schedule_round: + @ extract rcon from xmm8 + vmov.i8 q4, #0 @ vpxor %xmm4, %xmm4, %xmm4 + vext.8 q1, q8, q4, #15 @ vpalignr $15, %xmm8, %xmm4, %xmm1 + vext.8 q8, q8, q8, #15 @ vpalignr $15, %xmm8, %xmm8, %xmm8 + veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 + + @ rotate + vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0 + vext.8 q0, q0, q0, #1 @ vpalignr $1, %xmm0, %xmm0, %xmm0 + + @ fall through... + + @ low round: same as high round, but no rotation and no rcon. +_vpaes_schedule_low_round: + @ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12. 
+ @ We pin other values in _vpaes_key_preheat, so load them now. + adr r11, Lk_sb1 + vld1.64 {q14,q15}, [r11] + + @ smear xmm7 + vext.8 q1, q4, q7, #12 @ vpslldq $4, %xmm7, %xmm1 + veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 + vext.8 q4, q4, q7, #8 @ vpslldq $8, %xmm7, %xmm4 + + @ subbytes + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i + veor q7, q7, q4 @ vpxor %xmm4, %xmm7, %xmm7 + vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + vtbl.8 d5, {q11}, d3 + veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j + vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + vtbl.8 d7, {q10}, d1 + veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + vtbl.8 d9, {q10}, d3 + veor q7, q7, q12 @ vpxor Lk_s63(%rip), %xmm7, %xmm7 + vtbl.8 d6, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak + vtbl.8 d7, {q10}, d7 + veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + vtbl.8 d4, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak + vtbl.8 d5, {q10}, d9 + veor q3, q3, q1 @ vpxor %xmm1, %xmm3, %xmm3 # 2 = io + veor q2, q2, q0 @ vpxor %xmm0, %xmm2, %xmm2 # 3 = jo + vtbl.8 d8, {q15}, d6 @ vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou + vtbl.8 d9, {q15}, d7 + vtbl.8 d2, {q14}, d4 @ vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t + vtbl.8 d3, {q14}, d5 + veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output + + @ add in smeared stuff + veor q0, q1, q7 @ vpxor %xmm7, %xmm1, %xmm0 + veor q7, q1, q7 @ vmovdqa %xmm0, %xmm7 + bx lr + + +@@ +@@ .aes_schedule_transform +@@ +@@ Linear-transform q0 according to tables at [r11] +@@ +@@ Requires that q9 = 0x0F0F... as in preheat +@@ Output in q0 +@@ Clobbers q1, q2, q14, q15 +@@ +#ifdef __thumb2__ +.thumb_func _vpaes_schedule_transform +#endif +.align 4 +_vpaes_schedule_transform: + vld1.64 {q14,q15}, [r11] @ vmovdqa (%r11), %xmm2 # lo + @ vmovdqa 16(%r11), %xmm1 # hi + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 + vtbl.8 d4, {q14}, d2 @ vpshufb %xmm1, %xmm2, %xmm2 + vtbl.8 d5, {q14}, d3 + vtbl.8 d0, {q15}, d0 @ vpshufb %xmm0, %xmm1, %xmm0 + vtbl.8 d1, {q15}, d1 + veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 + bx lr + + +@@ +@@ .aes_schedule_mangle +@@ +@@ Mangles q0 from (basis-transformed) standard version +@@ to our version. +@@ +@@ On encrypt, +@@ xor with 0x63 +@@ multiply by circulant 0,1,1,1 +@@ apply shiftrows transform +@@ +@@ On decrypt, +@@ xor with 0x63 +@@ multiply by "inverse mixcolumns" circulant E,B,D,9 +@@ deskew +@@ apply shiftrows transform +@@ +@@ +@@ Writes out to [r2], and increments or decrements it +@@ Keeps track of round number mod 4 in r8 +@@ Preserves q0 +@@ Clobbers q1-q5 +@@ +#ifdef __thumb2__ +.thumb_func _vpaes_schedule_mangle +#endif +.align 4 +_vpaes_schedule_mangle: + tst r3, r3 + vmov q4, q0 @ vmovdqa %xmm0, %xmm4 # save xmm0 for later + adr r11, Lk_mc_forward @ Must be aligned to 8 mod 16. + vld1.64 {q5}, [r11] @ vmovdqa Lk_mc_forward(%rip),%xmm5 + bne Lschedule_mangle_dec + + @ encrypting + @ Write to q2 so we do not overlap table and destination below. 
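+ @ The encrypting branch is the "xor with 0x63, multiply by the circulant
+ @ 0,1,1,1, apply shiftrows" sequence from this function's header comment:
+ @ veor with q12 (Lk_s63 expressed in the vpaes basis), three successive
+ @ Lk_mc_forward permutes accumulated with veor for the circulant, then
+ @ Lschedule_mangle_both applies the Lk_sr row selected by r8 and stores the
+ @ result to [r2].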
+ veor q2, q0, q12 @ vpxor Lk_s63(%rip), %xmm0, %xmm4 + add r2, r2, #16 @ add $16, %rdx + vtbl.8 d8, {q2}, d10 @ vpshufb %xmm5, %xmm4, %xmm4 + vtbl.8 d9, {q2}, d11 + vtbl.8 d2, {q4}, d10 @ vpshufb %xmm5, %xmm4, %xmm1 + vtbl.8 d3, {q4}, d11 + vtbl.8 d6, {q1}, d10 @ vpshufb %xmm5, %xmm1, %xmm3 + vtbl.8 d7, {q1}, d11 + veor q4, q4, q1 @ vpxor %xmm1, %xmm4, %xmm4 + vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 + veor q3, q3, q4 @ vpxor %xmm4, %xmm3, %xmm3 + + b Lschedule_mangle_both +.align 4 +Lschedule_mangle_dec: + @ inverse mix columns + adr r11, Lk_dksd @ lea Lk_dksd(%rip),%r11 + vshr.u8 q1, q4, #4 @ vpsrlb $4, %xmm4, %xmm1 # 1 = hi + vand q4, q4, q9 @ vpand %xmm9, %xmm4, %xmm4 # 4 = lo + + vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x00(%r11), %xmm2 + @ vmovdqa 0x10(%r11), %xmm3 + vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 + vtbl.8 d5, {q14}, d9 + vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 + vtbl.8 d7, {q15}, d3 + @ Load .Lk_dksb ahead of time. + vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x20(%r11), %xmm2 + @ vmovdqa 0x30(%r11), %xmm3 + @ Write to q13 so we do not overlap table and destination. + veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 + vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3 + vtbl.8 d7, {q13}, d11 + + vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 + vtbl.8 d5, {q14}, d9 + veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2 + vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 + vtbl.8 d7, {q15}, d3 + @ Load .Lk_dkse ahead of time. + vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x40(%r11), %xmm2 + @ vmovdqa 0x50(%r11), %xmm3 + @ Write to q13 so we do not overlap table and destination. + veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 + vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3 + vtbl.8 d7, {q13}, d11 + + vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 + vtbl.8 d5, {q14}, d9 + veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2 + vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 + vtbl.8 d7, {q15}, d3 + @ Load .Lk_dkse ahead of time. + vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x60(%r11), %xmm2 + @ vmovdqa 0x70(%r11), %xmm4 + @ Write to q13 so we do not overlap table and destination. + veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 + + vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 + vtbl.8 d5, {q14}, d9 + vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3 + vtbl.8 d7, {q13}, d11 + vtbl.8 d8, {q15}, d2 @ vpshufb %xmm1, %xmm4, %xmm4 + vtbl.8 d9, {q15}, d3 + vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 + veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2 + veor q3, q4, q2 @ vpxor %xmm2, %xmm4, %xmm3 + + sub r2, r2, #16 @ add $-16, %rdx + +Lschedule_mangle_both: + @ Write to q2 so table and destination do not overlap. 
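+ @ This is the final ShiftRows permute of Lschedule_mangle_both: q1 holds the
+ @ Lk_sr row previously loaded via r8, and the add/and pair below steps r8 to
+ @ the next row, the "round number mod 4" bookkeeping described in this
+ @ function's header comment.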
+ vtbl.8 d4, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 + vtbl.8 d5, {q3}, d3 + add r8, r8, #64-16 @ add $-16, %r8 + and r8, r8, #~(1<<6) @ and $0x30, %r8 + vst1.64 {q2}, [r2] @ vmovdqu %xmm3, (%rdx) + bx lr + + +.globl _vpaes_set_encrypt_key +.private_extern _vpaes_set_encrypt_key +#ifdef __thumb2__ +.thumb_func _vpaes_set_encrypt_key +#endif +.align 4 +_vpaes_set_encrypt_key: + stmdb sp!, {r7,r8,r9,r10,r11, lr} + vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + + lsr r9, r1, #5 @ shr $5,%eax + add r9, r9, #5 @ $5,%eax + str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov r3, #0 @ mov $0,%ecx + mov r8, #0x30 @ mov $0x30,%r8d + bl _vpaes_schedule_core + eor r0, r0, r0 + + vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return + + +.globl _vpaes_set_decrypt_key +.private_extern _vpaes_set_decrypt_key +#ifdef __thumb2__ +.thumb_func _vpaes_set_decrypt_key +#endif +.align 4 +_vpaes_set_decrypt_key: + stmdb sp!, {r7,r8,r9,r10,r11, lr} + vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + + lsr r9, r1, #5 @ shr $5,%eax + add r9, r9, #5 @ $5,%eax + str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + lsl r9, r9, #4 @ shl $4,%eax + add r2, r2, #16 @ lea 16(%rdx,%rax),%rdx + add r2, r2, r9 + + mov r3, #1 @ mov $1,%ecx + lsr r8, r1, #1 @ shr $1,%r8d + and r8, r8, #32 @ and $32,%r8d + eor r8, r8, #32 @ xor $32,%r8d # nbits==192?0:32 + bl _vpaes_schedule_core + + vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return + + +@ Additional constants for converting to bsaes. + +.align 4 +_vpaes_convert_consts: +@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear +@ transform in the AES S-box. 0x63 is incorporated into the low half of the +@ table. This was computed with the following script: +@ +@ def u64s_to_u128(x, y): +@ return x | (y << 64) +@ def u128_to_u64s(w): +@ return w & ((1<<64)-1), w >> 64 +@ def get_byte(w, i): +@ return (w >> (i*8)) & 0xff +@ def apply_table(table, b): +@ lo = b & 0xf +@ hi = b >> 4 +@ return get_byte(table[0], lo) ^ get_byte(table[1], hi) +@ def opt(b): +@ table = [ +@ u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808), +@ u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0), +@ ] +@ return apply_table(table, b) +@ def rot_byte(b, n): +@ return 0xff & ((b << n) | (b >> (8-n))) +@ def skew(x): +@ return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^ +@ rot_byte(x, 4)) +@ table = [0, 0] +@ for i in range(16): +@ table[0] |= (skew(opt(i)) ^ 0x63) << (i*8) +@ table[1] |= skew(opt(i<<4)) << (i*8) +@ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[0])) +@ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[1])) +Lk_opt_then_skew: +.quad 0x9cb8436798bc4763, 0x6440bb9f6044bf9b +.quad 0x1f30062936192f00, 0xb49bad829db284ab + +@ .Lk_decrypt_transform is a permutation which performs an 8-bit left-rotation +@ followed by a byte-swap on each 32-bit word of a vector. E.g., 0x11223344 +@ becomes 0x22334411 and then 0x11443322. +Lk_decrypt_transform: +.quad 0x0704050603000102, 0x0f0c0d0e0b08090a + + +@ void vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes); +.globl _vpaes_encrypt_key_to_bsaes +.private_extern _vpaes_encrypt_key_to_bsaes +#ifdef __thumb2__ +.thumb_func _vpaes_encrypt_key_to_bsaes +#endif +.align 4 +_vpaes_encrypt_key_to_bsaes: + stmdb sp!, {r11, lr} + + @ See _vpaes_schedule_core for the key schedule logic. 
In particular, + @ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper), + @ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last + @ contain the transformations not in the bsaes representation. This + @ function inverts those transforms. + @ + @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key + @ representation, which does not match the other aes_nohw_* + @ implementations. The ARM aes_nohw_* stores each 32-bit word + @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the + @ cost of extra REV and VREV32 operations in little-endian ARM. + + vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform + adr r2, Lk_mc_forward @ Must be aligned to 8 mod 16. + add r3, r2, 0x90 @ Lk_sr+0x10-Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression) + + vld1.64 {q12}, [r2] + vmov.i8 q10, #0x5b @ Lk_s63 from vpaes-x86_64 + adr r11, Lk_opt @ Must be aligned to 8 mod 16. + vmov.i8 q11, #0x63 @ LK_s63 without Lk_ipt applied + + @ vpaes stores one fewer round count than bsaes, but the number of keys + @ is the same. + ldr r2, [r1,#240] + add r2, r2, #1 + str r2, [r0,#240] + + @ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt). + @ Invert this with .Lk_opt. + vld1.64 {q0}, [r1]! + bl _vpaes_schedule_transform + vrev32.8 q0, q0 + vst1.64 {q0}, [r0]! + + @ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied, + @ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63, + @ multiplies by the circulant 0,1,1,1, then applies ShiftRows. +Loop_enc_key_to_bsaes: + vld1.64 {q0}, [r1]! + + @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle + @ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30. + @ We use r3 rather than r8 to avoid a callee-saved register. + vld1.64 {q1}, [r3] + vtbl.8 d4, {q0}, d2 + vtbl.8 d5, {q0}, d3 + add r3, r3, #16 + and r3, r3, #~(1<<6) + vmov q0, q2 + + @ Handle the last key differently. + subs r2, r2, #1 + beq Loop_enc_key_to_bsaes_last + + @ Multiply by the circulant. This is its own inverse. + vtbl.8 d2, {q0}, d24 + vtbl.8 d3, {q0}, d25 + vmov q0, q1 + vtbl.8 d4, {q1}, d24 + vtbl.8 d5, {q1}, d25 + veor q0, q0, q2 + vtbl.8 d2, {q2}, d24 + vtbl.8 d3, {q2}, d25 + veor q0, q0, q1 + + @ XOR and finish. + veor q0, q0, q10 + bl _vpaes_schedule_transform + vrev32.8 q0, q0 + vst1.64 {q0}, [r0]! + b Loop_enc_key_to_bsaes + +Loop_enc_key_to_bsaes_last: + @ The final key does not have a basis transform (note + @ .Lschedule_mangle_last inverts the original transform). It only XORs + @ 0x63 and applies ShiftRows. The latter was already inverted in the + @ loop. Note that, because we act on the original representation, we use + @ q11, not q10. + veor q0, q0, q11 + vrev32.8 q0, q0 + vst1.64 {q0}, [r0] + + @ Wipe registers which contained key material. + veor q0, q0, q0 + veor q1, q1, q1 + veor q2, q2, q2 + + ldmia sp!, {r11, pc} @ return + + +@ void vpaes_decrypt_key_to_bsaes(AES_KEY *vpaes, const AES_KEY *bsaes); +.globl _vpaes_decrypt_key_to_bsaes +.private_extern _vpaes_decrypt_key_to_bsaes +#ifdef __thumb2__ +.thumb_func _vpaes_decrypt_key_to_bsaes +#endif +.align 4 +_vpaes_decrypt_key_to_bsaes: + stmdb sp!, {r11, lr} + + @ See _vpaes_schedule_core for the key schedule logic. Note vpaes + @ computes the decryption key schedule in reverse. Additionally, + @ aes-x86_64.pl shares some transformations, so we must only partially + @ invert vpaes's transformations. 
In general, vpaes computes in a + @ different basis (.Lk_ipt and .Lk_opt) and applies the inverses of + @ MixColumns, ShiftRows, and the affine part of the AES S-box (which is + @ split into a linear skew and XOR of 0x63). We undo all but MixColumns. + @ + @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key + @ representation, which does not match the other aes_nohw_* + @ implementations. The ARM aes_nohw_* stores each 32-bit word + @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the + @ cost of extra REV and VREV32 operations in little-endian ARM. + + adr r2, Lk_decrypt_transform + adr r3, Lk_sr+0x30 + adr r11, Lk_opt_then_skew @ Input to _vpaes_schedule_transform. + vld1.64 {q12}, [r2] @ Reuse q12 from encryption. + vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform + + @ vpaes stores one fewer round count than bsaes, but the number of keys + @ is the same. + ldr r2, [r1,#240] + add r2, r2, #1 + str r2, [r0,#240] + + @ Undo the basis change and reapply the S-box affine transform. See + @ .Lschedule_mangle_last. + vld1.64 {q0}, [r1]! + bl _vpaes_schedule_transform + vrev32.8 q0, q0 + vst1.64 {q0}, [r0]! + + @ See _vpaes_schedule_mangle for the transform on the middle keys. Note + @ it simultaneously inverts MixColumns and the S-box affine transform. + @ See .Lk_dksd through .Lk_dks9. +Loop_dec_key_to_bsaes: + vld1.64 {q0}, [r1]! + + @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note going + @ forwards cancels inverting for which direction we cycle r3. We use r3 + @ rather than r8 to avoid a callee-saved register. + vld1.64 {q1}, [r3] + vtbl.8 d4, {q0}, d2 + vtbl.8 d5, {q0}, d3 + add r3, r3, #64-16 + and r3, r3, #~(1<<6) + vmov q0, q2 + + @ Handle the last key differently. + subs r2, r2, #1 + beq Loop_dec_key_to_bsaes_last + + @ Undo the basis change and reapply the S-box affine transform. + bl _vpaes_schedule_transform + + @ Rotate each word by 8 bytes (cycle the rows) and then byte-swap. We + @ combine the two operations in .Lk_decrypt_transform. + @ + @ TODO(davidben): Where does the rotation come from? + vtbl.8 d2, {q0}, d24 + vtbl.8 d3, {q0}, d25 + + vst1.64 {q1}, [r0]! + b Loop_dec_key_to_bsaes + +Loop_dec_key_to_bsaes_last: + @ The final key only inverts ShiftRows (already done in the loop). See + @ .Lschedule_am_decrypting. Its basis is not transformed. + vrev32.8 q0, q0 + vst1.64 {q0}, [r0]! + + @ Wipe registers which contained key material. + veor q0, q0, q0 + veor q1, q1, q1 + veor q2, q2, q2 + + ldmia sp!, {r11, pc} @ return + +.globl _vpaes_ctr32_encrypt_blocks +.private_extern _vpaes_ctr32_encrypt_blocks +#ifdef __thumb2__ +.thumb_func _vpaes_ctr32_encrypt_blocks +#endif +.align 4 +_vpaes_ctr32_encrypt_blocks: + mov ip, sp + stmdb sp!, {r7,r8,r9,r10,r11, lr} + @ This function uses q4-q7 (d8-d15), which are callee-saved. + vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + + cmp r2, #0 + @ r8 is passed on the stack. + ldr r8, [ip] + beq Lctr32_done + + @ _vpaes_encrypt_core expects the key in r2, so swap r2 and r3. + mov r9, r3 + mov r3, r2 + mov r2, r9 + + @ Load the IV and counter portion. + ldr r7, [r8, #12] + vld1.8 {q7}, [r8] + + bl _vpaes_preheat + rev r7, r7 @ The counter is big-endian. + +Lctr32_loop: + vmov q0, q7 + vld1.8 {q6}, [r0]! @ Load input ahead of time + bl _vpaes_encrypt_core + veor q0, q0, q6 @ XOR input and result + vst1.8 {q0}, [r1]! + subs r3, r3, #1 + @ Update the counter. 
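+ @ Only the final (big-endian) 32-bit word of the counter block in q7 is
+ @ incremented, per the ctr32 convention. One iteration of this loop is,
+ @ roughly (illustrative sketch only):
+ @     block = counter_block;          /* vmov q0, q7 */
+ @     encrypt block in place;         /* _vpaes_encrypt_core */
+ @     out[i] = in[i] ^ block;         /* veor + vst1, 16 bytes per block */
+ @     counter++;                      /* rev + vmov.32 d15[1] below */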
+ add r7, r7, #1 + rev r9, r7 + vmov.32 d15[1], r9 + bne Lctr32_loop + +Lctr32_done: + vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return + +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/ios-arm/crypto/test/trampoline-armv4.S b/packager/third_party/boringssl/ios-arm/crypto/test/trampoline-armv4.S new file mode 100644 index 0000000000..51ac249ef5 --- /dev/null +++ b/packager/third_party/boringssl/ios-arm/crypto/test/trampoline-armv4.S @@ -0,0 +1,377 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.syntax unified + + + + +.text + +@ abi_test_trampoline loads callee-saved registers from |state|, calls |func| +@ with |argv|, then saves the callee-saved registers into |state|. It returns +@ the result of |func|. The |unwind| argument is unused. +@ uint32_t abi_test_trampoline(void (*func)(...), CallerState *state, +@ const uint32_t *argv, size_t argc, +@ int unwind); + +.globl _abi_test_trampoline +.private_extern _abi_test_trampoline +.align 4 +_abi_test_trampoline: +Labi_test_trampoline_begin: + @ Save parameters and all callee-saved registers. For convenience, we + @ save r9 on iOS even though it's volatile. + vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + stmdb sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,lr} + + @ Reserve stack space for six (10-4) stack parameters, plus an extra 4 + @ bytes to keep it 8-byte-aligned (see AAPCS, section 5.3). + sub sp, sp, #28 + + @ Every register in AAPCS is either non-volatile or a parameter (except + @ r9 on iOS), so this code, by the actual call, loses all its scratch + @ registers. First fill in stack parameters while there are registers + @ to spare. + cmp r3, #4 + bls Lstack_args_done + mov r4, sp @ r4 is the output pointer. + add r5, r2, r3, lsl #2 @ Set r5 to the end of argv. + add r2, r2, #16 @ Skip four arguments. +Lstack_args_loop: + ldr r6, [r2], #4 + cmp r2, r5 + str r6, [r4], #4 + bne Lstack_args_loop + +Lstack_args_done: + @ Load registers from |r1|. + vldmia r1!, {d8,d9,d10,d11,d12,d13,d14,d15} +#if defined(__APPLE__) + @ r9 is not volatile on iOS. + ldmia r1!, {r4,r5,r6,r7,r8,r10-r11} +#else + ldmia r1!, {r4,r5,r6,r7,r8,r9,r10,r11} +#endif + + @ Load register parameters. This uses up our remaining registers, so we + @ repurpose lr as scratch space. + ldr r3, [sp, #40] @ Reload argc. + ldr lr, [sp, #36] @ Load argv into lr. + cmp r3, #3 + bhi Larg_r3 + beq Larg_r2 + cmp r3, #1 + bhi Larg_r1 + beq Larg_r0 + b Largs_done + +Larg_r3: + ldr r3, [lr, #12] @ argv[3] +Larg_r2: + ldr r2, [lr, #8] @ argv[2] +Larg_r1: + ldr r1, [lr, #4] @ argv[1] +Larg_r0: + ldr r0, [lr] @ argv[0] +Largs_done: + + @ With every other register in use, load the function pointer into lr + @ and call the function. + ldr lr, [sp, #28] + blx lr + + @ r1-r3 are free for use again. The trampoline only supports + @ single-return functions. Pass r4-r11 to the caller. + ldr r1, [sp, #32] + vstmia r1!, {d8,d9,d10,d11,d12,d13,d14,d15} +#if defined(__APPLE__) + @ r9 is not volatile on iOS. + stmia r1!, {r4,r5,r6,r7,r8,r10-r11} +#else + stmia r1!, {r4,r5,r6,r7,r8,r9,r10,r11} +#endif + + @ Unwind the stack and restore registers. 
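+ @ Stack math for the epilogue: the prologue saved r0-r11 and lr (52 bytes)
+ @ plus d8-d15, then reserved 28 bytes (six stack arguments plus 4 bytes of
+ @ padding) for the callee. 44 = 28 (argument area) + 16 (skip the dead saved
+ @ r0-r3); the ldmia/vldmia that follow restore the remaining core and VFP
+ @ callee-saved registers.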
+ add sp, sp, #44 @ 44 = 28+16 + ldmia sp!, {r4,r5,r6,r7,r8,r9,r10,r11,lr} @ Skip r0-r3 (see +16 above). + vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + + bx lr + + +.globl _abi_test_clobber_r0 +.private_extern _abi_test_clobber_r0 +.align 4 +_abi_test_clobber_r0: + mov r0, #0 + bx lr + + +.globl _abi_test_clobber_r1 +.private_extern _abi_test_clobber_r1 +.align 4 +_abi_test_clobber_r1: + mov r1, #0 + bx lr + + +.globl _abi_test_clobber_r2 +.private_extern _abi_test_clobber_r2 +.align 4 +_abi_test_clobber_r2: + mov r2, #0 + bx lr + + +.globl _abi_test_clobber_r3 +.private_extern _abi_test_clobber_r3 +.align 4 +_abi_test_clobber_r3: + mov r3, #0 + bx lr + + +.globl _abi_test_clobber_r4 +.private_extern _abi_test_clobber_r4 +.align 4 +_abi_test_clobber_r4: + mov r4, #0 + bx lr + + +.globl _abi_test_clobber_r5 +.private_extern _abi_test_clobber_r5 +.align 4 +_abi_test_clobber_r5: + mov r5, #0 + bx lr + + +.globl _abi_test_clobber_r6 +.private_extern _abi_test_clobber_r6 +.align 4 +_abi_test_clobber_r6: + mov r6, #0 + bx lr + + +.globl _abi_test_clobber_r7 +.private_extern _abi_test_clobber_r7 +.align 4 +_abi_test_clobber_r7: + mov r7, #0 + bx lr + + +.globl _abi_test_clobber_r8 +.private_extern _abi_test_clobber_r8 +.align 4 +_abi_test_clobber_r8: + mov r8, #0 + bx lr + + +.globl _abi_test_clobber_r9 +.private_extern _abi_test_clobber_r9 +.align 4 +_abi_test_clobber_r9: + mov r9, #0 + bx lr + + +.globl _abi_test_clobber_r10 +.private_extern _abi_test_clobber_r10 +.align 4 +_abi_test_clobber_r10: + mov r10, #0 + bx lr + + +.globl _abi_test_clobber_r11 +.private_extern _abi_test_clobber_r11 +.align 4 +_abi_test_clobber_r11: + mov r11, #0 + bx lr + + +.globl _abi_test_clobber_r12 +.private_extern _abi_test_clobber_r12 +.align 4 +_abi_test_clobber_r12: + mov r12, #0 + bx lr + + +.globl _abi_test_clobber_d0 +.private_extern _abi_test_clobber_d0 +.align 4 +_abi_test_clobber_d0: + mov r0, #0 + vmov s0, r0 + vmov s1, r0 + bx lr + + +.globl _abi_test_clobber_d1 +.private_extern _abi_test_clobber_d1 +.align 4 +_abi_test_clobber_d1: + mov r0, #0 + vmov s2, r0 + vmov s3, r0 + bx lr + + +.globl _abi_test_clobber_d2 +.private_extern _abi_test_clobber_d2 +.align 4 +_abi_test_clobber_d2: + mov r0, #0 + vmov s4, r0 + vmov s5, r0 + bx lr + + +.globl _abi_test_clobber_d3 +.private_extern _abi_test_clobber_d3 +.align 4 +_abi_test_clobber_d3: + mov r0, #0 + vmov s6, r0 + vmov s7, r0 + bx lr + + +.globl _abi_test_clobber_d4 +.private_extern _abi_test_clobber_d4 +.align 4 +_abi_test_clobber_d4: + mov r0, #0 + vmov s8, r0 + vmov s9, r0 + bx lr + + +.globl _abi_test_clobber_d5 +.private_extern _abi_test_clobber_d5 +.align 4 +_abi_test_clobber_d5: + mov r0, #0 + vmov s10, r0 + vmov s11, r0 + bx lr + + +.globl _abi_test_clobber_d6 +.private_extern _abi_test_clobber_d6 +.align 4 +_abi_test_clobber_d6: + mov r0, #0 + vmov s12, r0 + vmov s13, r0 + bx lr + + +.globl _abi_test_clobber_d7 +.private_extern _abi_test_clobber_d7 +.align 4 +_abi_test_clobber_d7: + mov r0, #0 + vmov s14, r0 + vmov s15, r0 + bx lr + + +.globl _abi_test_clobber_d8 +.private_extern _abi_test_clobber_d8 +.align 4 +_abi_test_clobber_d8: + mov r0, #0 + vmov s16, r0 + vmov s17, r0 + bx lr + + +.globl _abi_test_clobber_d9 +.private_extern _abi_test_clobber_d9 +.align 4 +_abi_test_clobber_d9: + mov r0, #0 + vmov s18, r0 + vmov s19, r0 + bx lr + + +.globl _abi_test_clobber_d10 +.private_extern _abi_test_clobber_d10 +.align 4 +_abi_test_clobber_d10: + mov r0, #0 + vmov s20, r0 + vmov s21, r0 + bx lr + + +.globl _abi_test_clobber_d11 +.private_extern 
_abi_test_clobber_d11 +.align 4 +_abi_test_clobber_d11: + mov r0, #0 + vmov s22, r0 + vmov s23, r0 + bx lr + + +.globl _abi_test_clobber_d12 +.private_extern _abi_test_clobber_d12 +.align 4 +_abi_test_clobber_d12: + mov r0, #0 + vmov s24, r0 + vmov s25, r0 + bx lr + + +.globl _abi_test_clobber_d13 +.private_extern _abi_test_clobber_d13 +.align 4 +_abi_test_clobber_d13: + mov r0, #0 + vmov s26, r0 + vmov s27, r0 + bx lr + + +.globl _abi_test_clobber_d14 +.private_extern _abi_test_clobber_d14 +.align 4 +_abi_test_clobber_d14: + mov r0, #0 + vmov s28, r0 + vmov s29, r0 + bx lr + + +.globl _abi_test_clobber_d15 +.private_extern _abi_test_clobber_d15 +.align 4 +_abi_test_clobber_d15: + mov r0, #0 + vmov s30, r0 + vmov s31, r0 + bx lr + +#endif // !OPENSSL_NO_ASM diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/chacha/chacha-armv8.S b/packager/third_party/boringssl/linux-aarch64/crypto/chacha/chacha-armv8.S index 6ff6bffb66..49449bf532 100644 --- a/packager/third_party/boringssl/linux-aarch64/crypto/chacha/chacha-armv8.S +++ b/packager/third_party/boringssl/linux-aarch64/crypto/chacha/chacha-armv8.S @@ -1,39 +1,48 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__aarch64__) +#if defined(BORINGSSL_PREFIX) +#include +#endif #include -.text +.section .rodata .align 5 .Lsigma: .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral .Lone: .long 1,0,0,0 -.LOPENSSL_armcap_P: -#ifdef __ILP32__ -.long OPENSSL_armcap_P-. -#else -.quad OPENSSL_armcap_P-. -#endif .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 +.text + .globl ChaCha20_ctr32 .hidden ChaCha20_ctr32 .type ChaCha20_ctr32,%function .align 5 ChaCha20_ctr32: cbz x2,.Labort - adr x5,.LOPENSSL_armcap_P +#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10 + adrp x5,:pg_hi21_nc:OPENSSL_armcap_P +#else + adrp x5,OPENSSL_armcap_P +#endif cmp x2,#192 b.lo .Lshort -#ifdef __ILP32__ - ldrsw x6,[x5] -#else - ldr x6,[x5] -#endif - ldr w17,[x6,x5] + ldr w17,[x5,:lo12:OPENSSL_armcap_P] tst w17,#ARMV7_NEON b.ne ChaCha20_neon @@ -41,7 +50,8 @@ ChaCha20_ctr32: stp x29,x30,[sp,#-96]! add x29,sp,#0 - adr x5,.Lsigma + adrp x5,.Lsigma + add x5,x5,:lo12:.Lsigma stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] @@ -314,7 +324,8 @@ ChaCha20_neon: stp x29,x30,[sp,#-96]! add x29,sp,#0 - adr x5,.Lsigma + adrp x5,.Lsigma + add x5,x5,:lo12:.Lsigma stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] @@ -807,7 +818,8 @@ ChaCha20_512_neon: stp x29,x30,[sp,#-96]! 
add x29,sp,#0 - adr x5,.Lsigma + adrp x5,.Lsigma + add x5,x5,:lo12:.Lsigma stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] @@ -1969,3 +1981,5 @@ ChaCha20_512_neon: ret .size ChaCha20_512_neon,.-ChaCha20_512_neon #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/aesv8-armx64.S b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/aesv8-armx64.S index 51e2464487..60c70a24fd 100644 --- a/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/aesv8-armx64.S +++ b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/aesv8-armx64.S @@ -1,17 +1,32 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__aarch64__) +#if defined(BORINGSSL_PREFIX) +#include +#endif #include #if __ARM_MAX_ARCH__>=7 .text -#if !defined(__clang__) || defined(BORINGSSL_CLANG_SUPPORTS_DOT_ARCH) .arch armv8-a+crypto -#endif +.section .rodata .align 5 .Lrcon: .long 0x01,0x01,0x01,0x01 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat .long 0x1b,0x1b,0x1b,0x1b +.text + .globl aes_hw_set_encrypt_key .hidden aes_hw_set_encrypt_key .type aes_hw_set_encrypt_key,%function @@ -33,7 +48,8 @@ aes_hw_set_encrypt_key: tst w1,#0x3f b.ne .Lenc_key_abort - adr x3,.Lrcon + adrp x3,.Lrcon + add x3,x3,:lo12:.Lrcon cmp w1,#192 eor v0.16b,v0.16b,v0.16b @@ -755,3 +771,5 @@ aes_hw_ctr32_encrypt_blocks: .size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks #endif #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/armv8-mont.S b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/armv8-mont.S index 74702db6aa..360bf4c7fe 100644 --- a/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/armv8-mont.S +++ b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/armv8-mont.S @@ -1,4 +1,18 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__aarch64__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl bn_mul_mont @@ -1405,3 +1419,5 @@ __bn_mul4x_mont: .align 2 .align 4 #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S new file mode 100644 index 0000000000..f876db3f89 --- /dev/null +++ b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S @@ -0,0 +1,341 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
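+// ghash-neon-armv8.S carries the pure-NEON GHASH routines (gcm_init_neon,
+// gcm_gmult_neon, gcm_ghash_neon), used when the PMULL-based routines in
+// ghashv8-armx64.S are not available.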
+ +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(__aarch64__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text + +.globl gcm_init_neon +.hidden gcm_init_neon +.type gcm_init_neon,%function +.align 4 +gcm_init_neon: + // This function is adapted from gcm_init_v8. xC2 is t3. + ld1 {v17.2d}, [x1] // load H + movi v19.16b, #0xe1 + shl v19.2d, v19.2d, #57 // 0xc2.0 + ext v3.16b, v17.16b, v17.16b, #8 + ushr v18.2d, v19.2d, #63 + dup v17.4s, v17.s[1] + ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01 + ushr v18.2d, v3.2d, #63 + sshr v17.4s, v17.4s, #31 // broadcast carry bit + and v18.16b, v18.16b, v16.16b + shl v3.2d, v3.2d, #1 + ext v18.16b, v18.16b, v18.16b, #8 + and v16.16b, v16.16b, v17.16b + orr v3.16b, v3.16b, v18.16b // H<<<=1 + eor v5.16b, v3.16b, v16.16b // twisted H + st1 {v5.2d}, [x0] // store Htable[0] + ret +.size gcm_init_neon,.-gcm_init_neon + +.globl gcm_gmult_neon +.hidden gcm_gmult_neon +.type gcm_gmult_neon,%function +.align 4 +gcm_gmult_neon: + ld1 {v3.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, .Lmasks // load constants + add x9, x9, :lo12:.Lmasks + ld1 {v24.2d, v25.2d}, [x9] + rev64 v3.16b, v3.16b // byteswap Xi + ext v3.16b, v3.16b, v3.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + + mov x3, #16 + b .Lgmult_neon +.size gcm_gmult_neon,.-gcm_gmult_neon + +.globl gcm_ghash_neon +.hidden gcm_ghash_neon +.type gcm_ghash_neon,%function +.align 4 +gcm_ghash_neon: + ld1 {v0.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, .Lmasks // load constants + add x9, x9, :lo12:.Lmasks + ld1 {v24.2d, v25.2d}, [x9] + rev64 v0.16b, v0.16b // byteswap Xi + ext v0.16b, v0.16b, v0.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + +.Loop_neon: + ld1 {v3.16b}, [x2], #16 // load inp + rev64 v3.16b, v3.16b // byteswap inp + ext v3.16b, v3.16b, v3.16b, #8 + eor v3.16b, v3.16b, v0.16b // inp ^= Xi + +.Lgmult_neon: + // Split the input into v3 and v4. (The upper halves are unused, + // so it is okay to leave them alone.) + ins v4.d[0], v3.d[1] + ext v16.8b, v5.8b, v5.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v0.8b, v3.8b, v3.8b, #1 // B1 + pmull v0.8h, v5.8b, v0.8b // E = A*B1 + ext v17.8b, v5.8b, v5.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v5.8b, v19.8b // G = A*B2 + ext v18.8b, v5.8b, v5.8b, #3 // A3 + eor v16.16b, v16.16b, v0.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v0.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v0.8h, v5.8b, v0.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. 
To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v0.16b // N = I + J + pmull v19.8h, v5.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. + zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v0.8h, v5.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v0.16b, v0.16b, v16.16b + eor v0.16b, v0.16b, v18.16b + eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing + ext v16.8b, v7.8b, v7.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v1.8b, v3.8b, v3.8b, #1 // B1 + pmull v1.8h, v7.8b, v1.8b // E = A*B1 + ext v17.8b, v7.8b, v7.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v7.8b, v19.8b // G = A*B2 + ext v18.8b, v7.8b, v7.8b, #3 // A3 + eor v16.16b, v16.16b, v1.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v1.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v1.8h, v7.8b, v1.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v1.16b // N = I + J + pmull v19.8h, v7.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. 
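	// The zips below pair up t0..t3 (v16-v19) so that the mask-and-fold step
	// described above runs on two products per full-width eor/and; the second
	// set of zips then splits the results back into per-product registers.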
+ zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v1.8h, v7.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v1.16b, v1.16b, v16.16b + eor v1.16b, v1.16b, v18.16b + ext v16.8b, v6.8b, v6.8b, #1 // A1 + pmull v16.8h, v16.8b, v4.8b // F = A1*B + ext v2.8b, v4.8b, v4.8b, #1 // B1 + pmull v2.8h, v6.8b, v2.8b // E = A*B1 + ext v17.8b, v6.8b, v6.8b, #2 // A2 + pmull v17.8h, v17.8b, v4.8b // H = A2*B + ext v19.8b, v4.8b, v4.8b, #2 // B2 + pmull v19.8h, v6.8b, v19.8b // G = A*B2 + ext v18.8b, v6.8b, v6.8b, #3 // A3 + eor v16.16b, v16.16b, v2.16b // L = E + F + pmull v18.8h, v18.8b, v4.8b // J = A3*B + ext v2.8b, v4.8b, v4.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v2.8h, v6.8b, v2.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v4.8b, v4.8b, #4 // B4 + eor v18.16b, v18.16b, v2.16b // N = I + J + pmull v19.8h, v6.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. + zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v2.8h, v6.8b, v4.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v2.16b, v2.16b, v16.16b + eor v2.16b, v2.16b, v18.16b + ext v16.16b, v0.16b, v2.16b, #8 + eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing + eor v1.16b, v1.16b, v2.16b + eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi + ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result + // This is a no-op due to the ins instruction below. 
+ // ins v2.d[0], v1.d[1] + + // equivalent of reduction_avx from ghash-x86_64.pl + shl v17.2d, v0.2d, #57 // 1st phase + shl v18.2d, v0.2d, #62 + eor v18.16b, v18.16b, v17.16b // + shl v17.2d, v0.2d, #63 + eor v18.16b, v18.16b, v17.16b // + // Note Xm contains {Xl.d[1], Xh.d[0]}. + eor v18.16b, v18.16b, v1.16b + ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0] + ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1] + + ushr v18.2d, v0.2d, #1 // 2nd phase + eor v2.16b, v2.16b,v0.16b + eor v0.16b, v0.16b,v18.16b // + ushr v18.2d, v18.2d, #6 + ushr v0.2d, v0.2d, #1 // + eor v0.16b, v0.16b, v2.16b // + eor v0.16b, v0.16b, v18.16b // + + subs x3, x3, #16 + bne .Loop_neon + + rev64 v0.16b, v0.16b // byteswap Xi and write + ext v0.16b, v0.16b, v0.16b, #8 + st1 {v0.16b}, [x0] + + ret +.size gcm_ghash_neon,.-gcm_ghash_neon + +.section .rodata +.align 4 +.Lmasks: +.quad 0x0000ffffffffffff // k48 +.quad 0x00000000ffffffff // k32 +.quad 0x000000000000ffff // k16 +.quad 0x0000000000000000 // k0 +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S index 89d780ff69..37d97317aa 100644 --- a/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S +++ b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S @@ -1,10 +1,22 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__aarch64__) +#if defined(BORINGSSL_PREFIX) +#include +#endif #include .text -#if !defined(__clang__) || defined(BORINGSSL_CLANG_SUPPORTS_DOT_ARCH) .arch armv8-a+crypto -#endif .globl gcm_init_v8 .hidden gcm_init_v8 .type gcm_init_v8,%function @@ -108,13 +120,13 @@ gcm_ghash_v8: //loaded value would have //to be rotated in order to //make it appear as in - //alorithm specification + //algorithm specification subs x3,x3,#32 //see if x3 is 32 or larger mov x12,#16 //x12 is used as post- //increment for input pointer; //as loop is modulo-scheduled //x12 is zeroed just in time - //to preclude oversteping + //to preclude overstepping //inp[len], which means that //last block[s] are actually //loaded twice, but last @@ -233,3 +245,5 @@ gcm_ghash_v8: .align 2 .align 2 #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha1-armv8.S b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha1-armv8.S index ff361f454a..f681b9983f 100644 --- a/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha1-armv8.S +++ b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha1-armv8.S @@ -1,4 +1,18 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__aarch64__) +#if defined(BORINGSSL_PREFIX) +#include +#endif #include .text @@ -9,14 +23,12 @@ .type sha1_block_data_order,%function .align 6 sha1_block_data_order: -#ifdef __ILP32__ - ldrsw x16,.LOPENSSL_armcap_P +#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10 + adrp x16,:pg_hi21_nc:OPENSSL_armcap_P #else - ldr x16,.LOPENSSL_armcap_P + adrp x16,OPENSSL_armcap_P #endif - adr x17,.LOPENSSL_armcap_P - add x16,x16,x17 - ldr w16,[x16] + ldr w16,[x16,:lo12:OPENSSL_armcap_P] tst w16,#ARMV8_SHA1 b.ne .Lv8_entry @@ -1082,7 +1094,8 @@ sha1_block_armv8: stp x29,x30,[sp,#-16]! add x29,sp,#0 - adr x4,.Lconst + adrp x4,.Lconst + add x4,x4,:lo12:.Lconst eor v1.16b,v1.16b,v1.16b ld1 {v0.4s},[x0],#16 ld1 {v1.s}[0],[x0] @@ -1205,20 +1218,18 @@ sha1_block_armv8: ldr x29,[sp],#16 ret .size sha1_block_armv8,.-sha1_block_armv8 +.section .rodata .align 6 .Lconst: .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19 .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39 .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79 -.LOPENSSL_armcap_P: -#ifdef __ILP32__ -.long OPENSSL_armcap_P-. -#else -.quad OPENSSL_armcap_P-. -#endif .byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 .comm OPENSSL_armcap_P,4,4 +.hidden OPENSSL_armcap_P #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha256-armv8.S b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha256-armv8.S index 19db33937e..6e09f69a94 100644 --- a/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha256-armv8.S +++ b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha256-armv8.S @@ -1,4 +1,18 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__aarch64__) +#if defined(BORINGSSL_PREFIX) +#include +#endif // Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. // // Licensed under the OpenSSL license (the "License"). 
You may not use @@ -51,14 +65,12 @@ .align 6 sha256_block_data_order: #ifndef __KERNEL__ -# ifdef __ILP32__ - ldrsw x16,.LOPENSSL_armcap_P -# else - ldr x16,.LOPENSSL_armcap_P -# endif - adr x17,.LOPENSSL_armcap_P - add x16,x16,x17 - ldr w16,[x16] +#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10 + adrp x16,:pg_hi21_nc:OPENSSL_armcap_P +#else + adrp x16,OPENSSL_armcap_P +#endif + ldr w16,[x16,:lo12:OPENSSL_armcap_P] tst w16,#ARMV8_SHA256 b.ne .Lv8_entry #endif @@ -77,7 +89,8 @@ sha256_block_data_order: ldp w24,w25,[x0,#4*4] add x2,x1,x2,lsl#6 // end of input ldp w26,w27,[x0,#6*4] - adr x30,.LK256 + adrp x30,.LK256 + add x30,x30,:lo12:.LK256 stp x0,x2,[x29,#96] .Loop: @@ -1024,6 +1037,7 @@ sha256_block_data_order: ret .size sha256_block_data_order,.-sha256_block_data_order +.section .rodata .align 6 .type .LK256,%object .LK256: @@ -1045,18 +1059,10 @@ sha256_block_data_order: .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0 //terminator .size .LK256,.-.LK256 -#ifndef __KERNEL__ -.align 3 -.LOPENSSL_armcap_P: -# ifdef __ILP32__ -.long OPENSSL_armcap_P-. -# else -.quad OPENSSL_armcap_P-. -# endif -#endif .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 +.text #ifndef __KERNEL__ .type sha256_block_armv8,%function .align 6 @@ -1066,7 +1072,8 @@ sha256_block_armv8: add x29,sp,#0 ld1 {v0.4s,v1.4s},[x0] - adr x3,.LK256 + adrp x3,.LK256 + add x3,x3,:lo12:.LK256 .Loop_hw: ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 @@ -1199,5 +1206,8 @@ sha256_block_armv8: #endif #ifndef __KERNEL__ .comm OPENSSL_armcap_P,4,4 +.hidden OPENSSL_armcap_P #endif #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha512-armv8.S b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha512-armv8.S index bb052b7551..7b9b22a02a 100644 --- a/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha512-armv8.S +++ b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha512-armv8.S @@ -1,4 +1,18 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__aarch64__) +#if defined(BORINGSSL_PREFIX) +#include +#endif // Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. // // Licensed under the OpenSSL license (the "License"). You may not use @@ -65,7 +79,8 @@ sha512_block_data_order: ldp x24,x25,[x0,#4*8] add x2,x1,x2,lsl#7 // end of input ldp x26,x27,[x0,#6*8] - adr x30,.LK512 + adrp x30,.LK512 + add x30,x30,:lo12:.LK512 stp x0,x2,[x29,#96] .Loop: @@ -1012,6 +1027,7 @@ sha512_block_data_order: ret .size sha512_block_data_order,.-sha512_block_data_order +.section .rodata .align 6 .type .LK512,%object .LK512: @@ -1057,19 +1073,13 @@ sha512_block_data_order: .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .quad 0 // terminator .size .LK512,.-.LK512 -#ifndef __KERNEL__ -.align 3 -.LOPENSSL_armcap_P: -# ifdef __ILP32__ -.long OPENSSL_armcap_P-. -# else -.quad OPENSSL_armcap_P-. 
-# endif -#endif .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 #ifndef __KERNEL__ .comm OPENSSL_armcap_P,4,4 +.hidden OPENSSL_armcap_P #endif #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/vpaes-armv8.S b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/vpaes-armv8.S new file mode 100644 index 0000000000..f57b7b5174 --- /dev/null +++ b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/vpaes-armv8.S @@ -0,0 +1,1216 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(__aarch64__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.section .rodata + +.type _vpaes_consts,%object +.align 7 // totally strategic alignment +_vpaes_consts: +.Lk_mc_forward: // mc_forward +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 +.Lk_mc_backward: // mc_backward +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F +.Lk_sr: // sr +.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +// +// "Hot" constants +// +.Lk_inv: // inv, inva +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 +.Lk_ipt: // input transform (lo, hi) +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 +.Lk_sbo: // sbou, sbot +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA +.Lk_sb1: // sb1u, sb1t +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.Lk_sb2: // sb2u, sb2t +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + +// +// Decryption stuff +// +.Lk_dipt: // decryption input transform +.quad 0x0F505B040B545F00, 0x154A411E114E451A +.quad 0x86E383E660056500, 0x12771772F491F194 +.Lk_dsbo: // decryption sbox final output +.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D +.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C +.Lk_dsb9: // decryption sbox output *9*u, *9*t +.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 +.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 +.Lk_dsbd: // decryption sbox output *D*u, *D*t +.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 +.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 +.Lk_dsbb: // decryption sbox output *B*u, *B*t +.quad 0xD022649296B44200, 0x602646F6B0F2D404 +.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B +.Lk_dsbe: // decryption sbox output *E*u, *E*t +.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 +.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 + +// +// Key schedule constants +// +.Lk_dksd: // decryption key schedule: invskew x*D +.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 +.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E 
+.Lk_dksb: // decryption key schedule: invskew x*B +.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 +.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +.Lk_dkse: // decryption key schedule: invskew x*E + 0x63 +.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 +.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +.Lk_dks9: // decryption key schedule: invskew x*9 +.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC +.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + +.Lk_rcon: // rcon +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_opt: // output transform +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 +.Lk_deskew: // deskew tables: inverts the sbox's "skew" +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.align 2 +.size _vpaes_consts,.-_vpaes_consts +.align 6 + +.text +## +## _aes_preheat +## +## Fills register %r10 -> .aes_consts (so you can -fPIC) +## and %xmm9-%xmm15 as specified below. +## +.type _vpaes_encrypt_preheat,%function +.align 4 +_vpaes_encrypt_preheat: + adrp x10, .Lk_inv + add x10, x10, :lo12:.Lk_inv + movi v17.16b, #0x0f + ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv + ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // .Lk_sb1, .Lk_sb2 + ret +.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. +## +## Inputs: +## %xmm0 = input +## %xmm9-%xmm15 as in _vpaes_preheat +## (%rdx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax +## Preserves %xmm6 - %xmm8 so you get some local vectors +## +## +.type _vpaes_encrypt_core,%function +.align 4 +_vpaes_encrypt_core: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adrp x11, .Lk_mc_forward+16 + add x11, x11, :lo12:.Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b .Lenc_entry + +.align 4 +.Lenc_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + tbl 
v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + sub w8, w8, #1 // nr-- + +.Lenc_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, .Lenc_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + ret +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core + +.globl vpaes_encrypt +.hidden vpaes_encrypt +.type vpaes_encrypt,%function +.align 4 +vpaes_encrypt: + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ld1 {v7.16b}, [x0] + bl _vpaes_encrypt_preheat + bl _vpaes_encrypt_core + st1 {v0.16b}, [x1] + + ldp x29,x30,[sp],#16 + ret +.size vpaes_encrypt,.-vpaes_encrypt + +.type _vpaes_encrypt_2x,%function +.align 4 +_vpaes_encrypt_2x: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adrp x11, .Lk_mc_forward+16 + add x11, x11, :lo12:.Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + tbl v9.16b, {v20.16b}, v9.16b + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + tbl v10.16b, {v21.16b}, v8.16b + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v8.16b, v9.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b .Lenc_2x_entry + +.align 4 +.Lenc_2x_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + tbl v12.16b, {v25.16b}, v10.16b + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + tbl v8.16b, {v24.16b}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + tbl v13.16b, {v27.16b}, v10.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + tbl v10.16b, {v26.16b}, v11.16b + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + tbl v11.16b, {v8.16b}, v1.16b + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + eor v10.16b, v10.16b, v13.16b + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + tbl v8.16b, {v8.16b}, v4.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + eor v11.16b, v11.16b, v10.16b + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + tbl v12.16b, {v11.16b},v1.16b + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + eor v8.16b, v8.16b, v11.16b + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... 
mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + eor v8.16b, v8.16b, v12.16b + sub w8, w8, #1 // nr-- + +.Lenc_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + tbl v13.16b, {v19.16b},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {v18.16b},v8.16b + tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {v18.16b},v9.16b + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v13.16b + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v13.16b + tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {v18.16b},v11.16b + tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {v18.16b},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, .Lenc_2x_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {v22.16b}, v10.16b + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + tbl v8.16b, {v23.16b}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v1.16b + ret +.size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x + +.type _vpaes_decrypt_preheat,%function +.align 4 +_vpaes_decrypt_preheat: + adrp x10, .Lk_inv + add x10, x10, :lo12:.Lk_inv + movi v17.16b, #0x0f + adrp x11, .Lk_dipt + add x11, x11, :lo12:.Lk_dipt + ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv + ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64 // .Lk_dipt, .Lk_dsbo + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64 // .Lk_dsb9, .Lk_dsbd + ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x11] // .Lk_dsbb, .Lk_dsbe + ret +.size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat + +## +## Decryption core +## +## Same API as encryption core. 
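+## Input is taken in v7 and the result is returned in v0; the key pointer
+## arrives in x2 with the round count at byte offset 240, as in the
+## encryption core.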
+## +.type _vpaes_decrypt_core,%function +.align 4 +_vpaes_decrypt_core: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + + // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo + lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 + eor x11, x11, #0x30 // xor $0x30, %r11 + adrp x10, .Lk_sr + add x10, x10, :lo12:.Lk_sr + and x11, x11, #0x30 // and $0x30, %r11 + add x11, x11, x10 + adrp x10, .Lk_mc_forward+48 + add x10, x10, :lo12:.Lk_mc_forward+48 + + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 + // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi + tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b .Ldec_entry + +.align 4 +.Ldec_loop: +// +// Inverse mix columns +// + // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u + tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 + // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt + + tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt + + tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet + + tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5 + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + sub w8, w8, #1 // sub $1,%rax # nr-- + +.Ldec_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, 
%xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 + cbnz w8, .Ldec_loop + + // middle of last round + // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 + tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0 + ret +.size _vpaes_decrypt_core,.-_vpaes_decrypt_core + +.globl vpaes_decrypt +.hidden vpaes_decrypt +.type vpaes_decrypt,%function +.align 4 +vpaes_decrypt: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v7.16b}, [x0] + bl _vpaes_decrypt_preheat + bl _vpaes_decrypt_core + st1 {v0.16b}, [x1] + + ldp x29,x30,[sp],#16 + ret +.size vpaes_decrypt,.-vpaes_decrypt + +// v14-v15 input, v0-v1 output +.type _vpaes_decrypt_2x,%function +.align 4 +_vpaes_decrypt_2x: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + + // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo + lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 + eor x11, x11, #0x30 // xor $0x30, %r11 + adrp x10, .Lk_sr + add x10, x10, :lo12:.Lk_sr + and x11, x11, #0x30 // and $0x30, %r11 + add x11, x11, x10 + adrp x10, .Lk_mc_forward+48 + add x10, x10, :lo12:.Lk_mc_forward+48 + + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + tbl v10.16b, {v20.16b},v9.16b + ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 + // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi + tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + tbl v8.16b, {v21.16b},v8.16b + eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 + eor v10.16b, v10.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b .Ldec_2x_entry + +.align 4 +.Ldec_2x_loop: +// +// Inverse mix columns +// + // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u + tbl v12.16b, {v24.16b}, v10.16b + tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + tbl v9.16b, {v25.16b}, v11.16b + eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 + eor v8.16b, v12.16b, v16.16b + // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt + + tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + tbl v12.16b, {v26.16b}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + tbl v9.16b, {v27.16b}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + // 
vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt + + tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + tbl v12.16b, {v28.16b}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + tbl v9.16b, {v29.16b}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet + + tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + tbl v12.16b, {v30.16b}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + tbl v9.16b, {v31.16b}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5 + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + sub w8, w8, #1 // sub $1,%rax # nr-- + +.Ldec_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + tbl v10.16b, {v19.16b},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {v18.16b},v8.16b + tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {v18.16b},v9.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v10.16b + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v10.16b + tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {v18.16b},v11.16b + tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {v18.16b},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 + cbnz w8, .Ldec_2x_loop + + // middle of last round + // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {v22.16b}, v10.16b + // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + tbl v9.16b, {v23.16b}, v11.16b + ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 + eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A + eor v8.16b, v9.16b, v12.16b + tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v2.16b + ret +.size _vpaes_decrypt_2x,.-_vpaes_decrypt_2x +######################################################## +## ## +## AES key schedule ## +## ## +######################################################## +.type _vpaes_key_preheat,%function +.align 4 
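+## _vpaes_key_preheat loads the key-schedule constants: .Lk_inv/.Lk_ipt into
+## v18-v21, .Lk_sb1 into v22-v23, the .Lk_dks* tables into v24-v31,
+## .Lk_rcon into v8 and .Lk_mc_forward[0] into v9, with the .Lk_s63 (0x5b)
+## and .Lk_s0F (0x0f) byte constants built as immediates in v16/v17.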
+_vpaes_key_preheat: + adrp x10, .Lk_inv + add x10, x10, :lo12:.Lk_inv + movi v16.16b, #0x5b // .Lk_s63 + adrp x11, .Lk_sb1 + add x11, x11, :lo12:.Lk_sb1 + movi v17.16b, #0x0f // .Lk_s0F + ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // .Lk_inv, .Lk_ipt + adrp x10, .Lk_dksd + add x10, x10, :lo12:.Lk_dksd + ld1 {v22.2d,v23.2d}, [x11] // .Lk_sb1 + adrp x11, .Lk_mc_forward + add x11, x11, :lo12:.Lk_mc_forward + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb + ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9 + ld1 {v8.2d}, [x10] // .Lk_rcon + ld1 {v9.2d}, [x11] // .Lk_mc_forward[0] + ret +.size _vpaes_key_preheat,.-_vpaes_key_preheat + +.type _vpaes_schedule_core,%function +.align 4 +_vpaes_schedule_core: + stp x29, x30, [sp,#-16]! + add x29,sp,#0 + + bl _vpaes_key_preheat // load the tables + + ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) + + // input transform + mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3 + bl _vpaes_schedule_transform + mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 + + adrp x10, .Lk_sr // lea .Lk_sr(%rip),%r10 + add x10, x10, :lo12:.Lk_sr + + add x8, x8, x10 + cbnz w3, .Lschedule_am_decrypting + + // encrypting, output zeroth round key after transform + st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) + b .Lschedule_go + +.Lschedule_am_decrypting: + // decrypting, output zeroth round key after shiftrows + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) + eor x8, x8, #0x30 // xor $0x30, %r8 + +.Lschedule_go: + cmp w1, #192 // cmp $192, %esi + b.hi .Lschedule_256 + b.eq .Lschedule_192 + // 128: fall though + +## +## .schedule_128 +## +## 128-bit specific part of key schedule. +## +## This schedule is really simple, because all its parts +## are accomplished by the subroutines. +## +.Lschedule_128: + mov x0, #10 // mov $10, %esi + +.Loop_schedule_128: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_round + cbz x0, .Lschedule_mangle_last + bl _vpaes_schedule_mangle // write output + b .Loop_schedule_128 + +## +## .aes_schedule_192 +## +## 192-bit specific part of key schedule. +## +## The main body of this schedule is the same as the 128-bit +## schedule, but with more smearing. The long, high side is +## stored in %xmm7 as before, and the short, low side is in +## the high bits of %xmm6. +## +## This schedule is somewhat nastier, however, because each +## round produces 192 bits of key material, or 1.5 round keys. +## Therefore, on each cycle we do 2 rounds and produce 3 round +## keys. +## +.align 4 +.Lschedule_192: + sub x0, x0, #8 + ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) + bl _vpaes_schedule_transform // input transform + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 + ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros + mov x0, #4 // mov $4, %esi + +.Loop_schedule_192: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_round + ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0 + bl _vpaes_schedule_mangle // save key n + bl _vpaes_schedule_192_smear + bl _vpaes_schedule_mangle // save key n+1 + bl _vpaes_schedule_round + cbz x0, .Lschedule_mangle_last + bl _vpaes_schedule_mangle // save key n+2 + bl _vpaes_schedule_192_smear + b .Loop_schedule_192 + +## +## .aes_schedule_256 +## +## 256-bit specific part of key schedule. 
+## +## The structure here is very similar to the 128-bit +## schedule, but with an additional "low side" in +## %xmm6. The low side's rounds are the same as the +## high side's, except no rcon and no rotation. +## +.align 4 +.Lschedule_256: + ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + bl _vpaes_schedule_transform // input transform + mov x0, #7 // mov $7, %esi + +.Loop_schedule_256: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_mangle // output low result + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + // high round + bl _vpaes_schedule_round + cbz x0, .Lschedule_mangle_last + bl _vpaes_schedule_mangle + + // low round. swap xmm7 and xmm6 + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + movi v4.16b, #0 + mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 + mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 + bl _vpaes_schedule_low_round + mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 + + b .Loop_schedule_256 + +## +## .aes_schedule_mangle_last +## +## Mangler for last round of key schedule +## Mangles %xmm0 +## when encrypting, outputs out(%xmm0) ^ 63 +## when decrypting, outputs unskew(%xmm0) +## +## Always called right before return... jumps to cleanup and exits +## +.align 4 +.Lschedule_mangle_last: + // schedule last round key from xmm0 + adrp x11, .Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew + add x11, x11, :lo12:.Lk_deskew + + cbnz w3, .Lschedule_mangle_last_dec + + // encrypting + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 + adrp x11, .Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform + add x11, x11, :lo12:.Lk_opt + add x2, x2, #32 // add $32, %rdx + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute + +.Lschedule_mangle_last_dec: + ld1 {v20.2d,v21.2d}, [x11] // reload constants + sub x2, x2, #16 // add $-16, %rdx + eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0 + bl _vpaes_schedule_transform // output transform + st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key + + // cleanup + eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2 + eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 + eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 + eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 + eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 + ldp x29, x30, [sp],#16 + ret +.size _vpaes_schedule_core,.-_vpaes_schedule_core + +## +## .aes_schedule_192_smear +## +## Smear the short, low side in the 192-bit key schedule. 
+## +## Inputs: +## %xmm7: high side, b a x y +## %xmm6: low side, d c 0 0 +## %xmm13: 0 +## +## Outputs: +## %xmm6: b+c+d b+c 0 0 +## %xmm0: b+c+d b+c b a +## +.type _vpaes_schedule_192_smear,%function +.align 4 +_vpaes_schedule_192_smear: + movi v1.16b, #0 + dup v0.4s, v7.s[3] + ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 + ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a + eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a + mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 + ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros + ret +.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear + +## +## .aes_schedule_round +## +## Runs one main round of the key schedule on %xmm0, %xmm7 +## +## Specifically, runs subbytes on the high dword of %xmm0 +## then rotates it by one byte and xors into the low dword of +## %xmm7. +## +## Adds rcon from low byte of %xmm8, then rotates %xmm8 for +## next rcon. +## +## Smears the dwords of %xmm7 by xoring the low into the +## second low, result into third, result into highest. +## +## Returns results in %xmm7 = %xmm0. +## Clobbers %xmm1-%xmm4, %r11. +## +.type _vpaes_schedule_round,%function +.align 4 +_vpaes_schedule_round: + // extract rcon from xmm8 + movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 + ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1 + ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + + // rotate + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0 + + // fall through... + + // low round: same as high round, but no rotation and no rcon. 
+_vpaes_schedule_low_round: + // smear xmm7 + ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4 + + // subbytes + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 + tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7 + tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak + eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io + eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo + tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou + tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t + eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output + + // add in smeared stuff + eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 + eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 + ret +.size _vpaes_schedule_round,.-_vpaes_schedule_round + +## +## .aes_schedule_transform +## +## Linear-transform %xmm0 according to tables at (%r11) +## +## Requires that %xmm9 = 0x0F0F... as in preheat +## Output in %xmm0 +## Clobbers %xmm1, %xmm2 +## +.type _vpaes_schedule_transform,%function +.align 4 +_vpaes_schedule_transform: + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + // vmovdqa (%r11), %xmm2 # lo + tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + // vmovdqa 16(%r11), %xmm1 # hi + tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + ret +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform + +## +## .aes_schedule_mangle +## +## Mangle xmm0 from (basis-transformed) standard version +## to our version. 
+## +## On encrypt, +## xor with 0x63 +## multiply by circulant 0,1,1,1 +## apply shiftrows transform +## +## On decrypt, +## xor with 0x63 +## multiply by "inverse mixcolumns" circulant E,B,D,9 +## deskew +## apply shiftrows transform +## +## +## Writes out to (%rdx), and increments or decrements it +## Keeps track of round number mod 4 in %r8 +## Preserves xmm0 +## Clobbers xmm1-xmm5 +## +.type _vpaes_schedule_mangle,%function +.align 4 +_vpaes_schedule_mangle: + mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later + // vmovdqa .Lk_mc_forward(%rip),%xmm5 + cbnz w3, .Lschedule_mangle_dec + + // encrypting + eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4 + add x2, x2, #16 // add $16, %rdx + tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 + tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 + tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 + eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 + + b .Lschedule_mangle_both +.align 4 +.Lschedule_mangle_dec: + // inverse mix columns + // lea .Lk_dksd(%rip),%r11 + ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi + and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo + + // vmovdqa 0x00(%r11), %xmm2 + tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + // vmovdqa 0x10(%r11), %xmm3 + tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + + // vmovdqa 0x20(%r11), %xmm2 + tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + // vmovdqa 0x30(%r11), %xmm3 + tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + + // vmovdqa 0x40(%r11), %xmm2 + tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + // vmovdqa 0x50(%r11), %xmm3 + tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + + // vmovdqa 0x60(%r11), %xmm2 + tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + // vmovdqa 0x70(%r11), %xmm4 + tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4 + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3 + + sub x2, x2, #16 // add $-16, %rdx + +.Lschedule_mangle_both: + tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + add x8, x8, #64-16 // add $-16, %r8 + and x8, x8, #~(1<<6) // and $0x30, %r8 + st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) + ret +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle + +.globl vpaes_set_encrypt_key +.hidden vpaes_set_encrypt_key +.type vpaes_set_encrypt_key,%function +.align 4 +vpaes_set_encrypt_key: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! 
// ABI spec says so + + lsr w9, w1, #5 // shr $5,%eax + add w9, w9, #5 // $5,%eax + str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov w3, #0 // mov $0,%ecx + mov x8, #0x30 // mov $0x30,%r8d + bl _vpaes_schedule_core + eor x0, x0, x0 + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + ret +.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key + +.globl vpaes_set_decrypt_key +.hidden vpaes_set_decrypt_key +.type vpaes_set_decrypt_key,%function +.align 4 +vpaes_set_decrypt_key: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + + lsr w9, w1, #5 // shr $5,%eax + add w9, w9, #5 // $5,%eax + str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + lsl w9, w9, #4 // shl $4,%eax + add x2, x2, #16 // lea 16(%rdx,%rax),%rdx + add x2, x2, x9 + + mov w3, #1 // mov $1,%ecx + lsr w8, w1, #1 // shr $1,%r8d + and x8, x8, #32 // and $32,%r8d + eor x8, x8, #32 // xor $32,%r8d # nbits==192?0:32 + bl _vpaes_schedule_core + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + ret +.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key +.globl vpaes_cbc_encrypt +.hidden vpaes_cbc_encrypt +.type vpaes_cbc_encrypt,%function +.align 4 +vpaes_cbc_encrypt: + cbz x2, .Lcbc_abort + cmp w5, #0 // check direction + b.eq vpaes_cbc_decrypt + + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov x17, x2 // reassign + mov x2, x3 // reassign + + ld1 {v0.16b}, [x4] // load ivec + bl _vpaes_encrypt_preheat + b .Lcbc_enc_loop + +.align 4 +.Lcbc_enc_loop: + ld1 {v7.16b}, [x0],#16 // load input + eor v7.16b, v7.16b, v0.16b // xor with ivec + bl _vpaes_encrypt_core + st1 {v0.16b}, [x1],#16 // save output + subs x17, x17, #16 + b.hi .Lcbc_enc_loop + + st1 {v0.16b}, [x4] // write ivec + + ldp x29,x30,[sp],#16 +.Lcbc_abort: + ret +.size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt + +.type vpaes_cbc_decrypt,%function +.align 4 +vpaes_cbc_decrypt: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! + + mov x17, x2 // reassign + mov x2, x3 // reassign + ld1 {v6.16b}, [x4] // load ivec + bl _vpaes_decrypt_preheat + tst x17, #16 + b.eq .Lcbc_dec_loop2x + + ld1 {v7.16b}, [x0], #16 // load input + bl _vpaes_decrypt_core + eor v0.16b, v0.16b, v6.16b // xor with ivec + orr v6.16b, v7.16b, v7.16b // next ivec value + st1 {v0.16b}, [x1], #16 + subs x17, x17, #16 + b.ls .Lcbc_dec_done + +.align 4 +.Lcbc_dec_loop2x: + ld1 {v14.16b,v15.16b}, [x0], #32 + bl _vpaes_decrypt_2x + eor v0.16b, v0.16b, v6.16b // xor with ivec + eor v1.16b, v1.16b, v14.16b + orr v6.16b, v15.16b, v15.16b + st1 {v0.16b,v1.16b}, [x1], #32 + subs x17, x17, #32 + b.hi .Lcbc_dec_loop2x + +.Lcbc_dec_done: + st1 {v6.16b}, [x4] + + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + ret +.size vpaes_cbc_decrypt,.-vpaes_cbc_decrypt +.globl vpaes_ctr32_encrypt_blocks +.hidden vpaes_ctr32_encrypt_blocks +.type vpaes_ctr32_encrypt_blocks,%function +.align 4 +vpaes_ctr32_encrypt_blocks: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! + + cbz x2, .Lctr32_done + + // Note, unlike the other functions, x2 here is measured in blocks, + // not bytes. + mov x17, x2 + mov x2, x3 + + // Load the IV and counter portion. + ldr w6, [x4, #12] + ld1 {v7.16b}, [x4] + + bl _vpaes_encrypt_preheat + tst x17, #1 + rev w6, w6 // The counter is big-endian. 
+ b.eq .Lctr32_prep_loop + + // Handle one block so the remaining block count is even for + // _vpaes_encrypt_2x. + ld1 {v6.16b}, [x0], #16 // .Load input ahead of time + bl _vpaes_encrypt_core + eor v0.16b, v0.16b, v6.16b // XOR input and result + st1 {v0.16b}, [x1], #16 + subs x17, x17, #1 + // Update the counter. + add w6, w6, #1 + rev w7, w6 + mov v7.s[3], w7 + b.ls .Lctr32_done + +.Lctr32_prep_loop: + // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x + // uses v14 and v15. + mov v15.16b, v7.16b + mov v14.16b, v7.16b + add w6, w6, #1 + rev w7, w6 + mov v15.s[3], w7 + +.Lctr32_loop: + ld1 {v6.16b,v7.16b}, [x0], #32 // .Load input ahead of time + bl _vpaes_encrypt_2x + eor v0.16b, v0.16b, v6.16b // XOR input and result + eor v1.16b, v1.16b, v7.16b // XOR input and result (#2) + st1 {v0.16b,v1.16b}, [x1], #32 + subs x17, x17, #2 + // Update the counter. + add w7, w6, #1 + add w6, w6, #2 + rev w7, w7 + mov v14.s[3], w7 + rev w7, w6 + mov v15.s[3], w7 + b.hi .Lctr32_loop + +.Lctr32_done: + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + ret +.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks +#endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/test/trampoline-armv8.S b/packager/third_party/boringssl/linux-aarch64/crypto/test/trampoline-armv8.S new file mode 100644 index 0000000000..9a21cc2c6e --- /dev/null +++ b/packager/third_party/boringssl/linux-aarch64/crypto/test/trampoline-armv8.S @@ -0,0 +1,688 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(__aarch64__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text + +// abi_test_trampoline loads callee-saved registers from |state|, calls |func| +// with |argv|, then saves the callee-saved registers into |state|. It returns +// the result of |func|. The |unwind| argument is unused. +// uint64_t abi_test_trampoline(void (*func)(...), CallerState *state, +// const uint64_t *argv, size_t argc, +// uint64_t unwind); +.type abi_test_trampoline, %function +.globl abi_test_trampoline +.hidden abi_test_trampoline +.align 4 +abi_test_trampoline: +.Labi_test_trampoline_begin: + // Stack layout (low to high addresses) + // x29,x30 (16 bytes) + // d8-d15 (64 bytes) + // x19-x28 (80 bytes) + // x1 (8 bytes) + // padding (8 bytes) + stp x29, x30, [sp, #-176]! + mov x29, sp + + // Saved callee-saved registers and |state|. + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] + stp x19, x20, [sp, #80] + stp x21, x22, [sp, #96] + stp x23, x24, [sp, #112] + stp x25, x26, [sp, #128] + stp x27, x28, [sp, #144] + str x1, [sp, #160] + + // Load registers from |state|, with the exception of x29. x29 is the + // frame pointer and also callee-saved, but AAPCS64 allows platforms to + // mandate that x29 always point to a frame. iOS64 does so, which means + // we cannot fill x29 with entropy without violating ABI rules + // ourselves. x29 is tested separately below. 
+ ldp d8, d9, [x1], #16 + ldp d10, d11, [x1], #16 + ldp d12, d13, [x1], #16 + ldp d14, d15, [x1], #16 + ldp x19, x20, [x1], #16 + ldp x21, x22, [x1], #16 + ldp x23, x24, [x1], #16 + ldp x25, x26, [x1], #16 + ldp x27, x28, [x1], #16 + + // Move parameters into temporary registers. + mov x9, x0 + mov x10, x2 + mov x11, x3 + + // Load parameters into registers. + cbz x11, .Largs_done + ldr x0, [x10], #8 + subs x11, x11, #1 + b.eq .Largs_done + ldr x1, [x10], #8 + subs x11, x11, #1 + b.eq .Largs_done + ldr x2, [x10], #8 + subs x11, x11, #1 + b.eq .Largs_done + ldr x3, [x10], #8 + subs x11, x11, #1 + b.eq .Largs_done + ldr x4, [x10], #8 + subs x11, x11, #1 + b.eq .Largs_done + ldr x5, [x10], #8 + subs x11, x11, #1 + b.eq .Largs_done + ldr x6, [x10], #8 + subs x11, x11, #1 + b.eq .Largs_done + ldr x7, [x10], #8 + +.Largs_done: + blr x9 + + // Reload |state| and store registers. + ldr x1, [sp, #160] + stp d8, d9, [x1], #16 + stp d10, d11, [x1], #16 + stp d12, d13, [x1], #16 + stp d14, d15, [x1], #16 + stp x19, x20, [x1], #16 + stp x21, x22, [x1], #16 + stp x23, x24, [x1], #16 + stp x25, x26, [x1], #16 + stp x27, x28, [x1], #16 + + // |func| is required to preserve x29, the frame pointer. We cannot load + // random values into x29 (see comment above), so compare it against the + // expected value and zero the field of |state| if corrupted. + mov x9, sp + cmp x29, x9 + b.eq .Lx29_ok + str xzr, [x1] + +.Lx29_ok: + // Restore callee-saved registers. + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] + ldp x19, x20, [sp, #80] + ldp x21, x22, [sp, #96] + ldp x23, x24, [sp, #112] + ldp x25, x26, [sp, #128] + ldp x27, x28, [sp, #144] + + ldp x29, x30, [sp], #176 + ret +.size abi_test_trampoline,.-abi_test_trampoline +.type abi_test_clobber_x0, %function +.globl abi_test_clobber_x0 +.hidden abi_test_clobber_x0 +.align 4 +abi_test_clobber_x0: + mov x0, xzr + ret +.size abi_test_clobber_x0,.-abi_test_clobber_x0 +.type abi_test_clobber_x1, %function +.globl abi_test_clobber_x1 +.hidden abi_test_clobber_x1 +.align 4 +abi_test_clobber_x1: + mov x1, xzr + ret +.size abi_test_clobber_x1,.-abi_test_clobber_x1 +.type abi_test_clobber_x2, %function +.globl abi_test_clobber_x2 +.hidden abi_test_clobber_x2 +.align 4 +abi_test_clobber_x2: + mov x2, xzr + ret +.size abi_test_clobber_x2,.-abi_test_clobber_x2 +.type abi_test_clobber_x3, %function +.globl abi_test_clobber_x3 +.hidden abi_test_clobber_x3 +.align 4 +abi_test_clobber_x3: + mov x3, xzr + ret +.size abi_test_clobber_x3,.-abi_test_clobber_x3 +.type abi_test_clobber_x4, %function +.globl abi_test_clobber_x4 +.hidden abi_test_clobber_x4 +.align 4 +abi_test_clobber_x4: + mov x4, xzr + ret +.size abi_test_clobber_x4,.-abi_test_clobber_x4 +.type abi_test_clobber_x5, %function +.globl abi_test_clobber_x5 +.hidden abi_test_clobber_x5 +.align 4 +abi_test_clobber_x5: + mov x5, xzr + ret +.size abi_test_clobber_x5,.-abi_test_clobber_x5 +.type abi_test_clobber_x6, %function +.globl abi_test_clobber_x6 +.hidden abi_test_clobber_x6 +.align 4 +abi_test_clobber_x6: + mov x6, xzr + ret +.size abi_test_clobber_x6,.-abi_test_clobber_x6 +.type abi_test_clobber_x7, %function +.globl abi_test_clobber_x7 +.hidden abi_test_clobber_x7 +.align 4 +abi_test_clobber_x7: + mov x7, xzr + ret +.size abi_test_clobber_x7,.-abi_test_clobber_x7 +.type abi_test_clobber_x8, %function +.globl abi_test_clobber_x8 +.hidden abi_test_clobber_x8 +.align 4 +abi_test_clobber_x8: + mov x8, xzr + ret +.size abi_test_clobber_x8,.-abi_test_clobber_x8 +.type 
abi_test_clobber_x9, %function +.globl abi_test_clobber_x9 +.hidden abi_test_clobber_x9 +.align 4 +abi_test_clobber_x9: + mov x9, xzr + ret +.size abi_test_clobber_x9,.-abi_test_clobber_x9 +.type abi_test_clobber_x10, %function +.globl abi_test_clobber_x10 +.hidden abi_test_clobber_x10 +.align 4 +abi_test_clobber_x10: + mov x10, xzr + ret +.size abi_test_clobber_x10,.-abi_test_clobber_x10 +.type abi_test_clobber_x11, %function +.globl abi_test_clobber_x11 +.hidden abi_test_clobber_x11 +.align 4 +abi_test_clobber_x11: + mov x11, xzr + ret +.size abi_test_clobber_x11,.-abi_test_clobber_x11 +.type abi_test_clobber_x12, %function +.globl abi_test_clobber_x12 +.hidden abi_test_clobber_x12 +.align 4 +abi_test_clobber_x12: + mov x12, xzr + ret +.size abi_test_clobber_x12,.-abi_test_clobber_x12 +.type abi_test_clobber_x13, %function +.globl abi_test_clobber_x13 +.hidden abi_test_clobber_x13 +.align 4 +abi_test_clobber_x13: + mov x13, xzr + ret +.size abi_test_clobber_x13,.-abi_test_clobber_x13 +.type abi_test_clobber_x14, %function +.globl abi_test_clobber_x14 +.hidden abi_test_clobber_x14 +.align 4 +abi_test_clobber_x14: + mov x14, xzr + ret +.size abi_test_clobber_x14,.-abi_test_clobber_x14 +.type abi_test_clobber_x15, %function +.globl abi_test_clobber_x15 +.hidden abi_test_clobber_x15 +.align 4 +abi_test_clobber_x15: + mov x15, xzr + ret +.size abi_test_clobber_x15,.-abi_test_clobber_x15 +.type abi_test_clobber_x16, %function +.globl abi_test_clobber_x16 +.hidden abi_test_clobber_x16 +.align 4 +abi_test_clobber_x16: + mov x16, xzr + ret +.size abi_test_clobber_x16,.-abi_test_clobber_x16 +.type abi_test_clobber_x17, %function +.globl abi_test_clobber_x17 +.hidden abi_test_clobber_x17 +.align 4 +abi_test_clobber_x17: + mov x17, xzr + ret +.size abi_test_clobber_x17,.-abi_test_clobber_x17 +.type abi_test_clobber_x19, %function +.globl abi_test_clobber_x19 +.hidden abi_test_clobber_x19 +.align 4 +abi_test_clobber_x19: + mov x19, xzr + ret +.size abi_test_clobber_x19,.-abi_test_clobber_x19 +.type abi_test_clobber_x20, %function +.globl abi_test_clobber_x20 +.hidden abi_test_clobber_x20 +.align 4 +abi_test_clobber_x20: + mov x20, xzr + ret +.size abi_test_clobber_x20,.-abi_test_clobber_x20 +.type abi_test_clobber_x21, %function +.globl abi_test_clobber_x21 +.hidden abi_test_clobber_x21 +.align 4 +abi_test_clobber_x21: + mov x21, xzr + ret +.size abi_test_clobber_x21,.-abi_test_clobber_x21 +.type abi_test_clobber_x22, %function +.globl abi_test_clobber_x22 +.hidden abi_test_clobber_x22 +.align 4 +abi_test_clobber_x22: + mov x22, xzr + ret +.size abi_test_clobber_x22,.-abi_test_clobber_x22 +.type abi_test_clobber_x23, %function +.globl abi_test_clobber_x23 +.hidden abi_test_clobber_x23 +.align 4 +abi_test_clobber_x23: + mov x23, xzr + ret +.size abi_test_clobber_x23,.-abi_test_clobber_x23 +.type abi_test_clobber_x24, %function +.globl abi_test_clobber_x24 +.hidden abi_test_clobber_x24 +.align 4 +abi_test_clobber_x24: + mov x24, xzr + ret +.size abi_test_clobber_x24,.-abi_test_clobber_x24 +.type abi_test_clobber_x25, %function +.globl abi_test_clobber_x25 +.hidden abi_test_clobber_x25 +.align 4 +abi_test_clobber_x25: + mov x25, xzr + ret +.size abi_test_clobber_x25,.-abi_test_clobber_x25 +.type abi_test_clobber_x26, %function +.globl abi_test_clobber_x26 +.hidden abi_test_clobber_x26 +.align 4 +abi_test_clobber_x26: + mov x26, xzr + ret +.size abi_test_clobber_x26,.-abi_test_clobber_x26 +.type abi_test_clobber_x27, %function +.globl abi_test_clobber_x27 +.hidden abi_test_clobber_x27 +.align 4 
+abi_test_clobber_x27: + mov x27, xzr + ret +.size abi_test_clobber_x27,.-abi_test_clobber_x27 +.type abi_test_clobber_x28, %function +.globl abi_test_clobber_x28 +.hidden abi_test_clobber_x28 +.align 4 +abi_test_clobber_x28: + mov x28, xzr + ret +.size abi_test_clobber_x28,.-abi_test_clobber_x28 +.type abi_test_clobber_x29, %function +.globl abi_test_clobber_x29 +.hidden abi_test_clobber_x29 +.align 4 +abi_test_clobber_x29: + mov x29, xzr + ret +.size abi_test_clobber_x29,.-abi_test_clobber_x29 +.type abi_test_clobber_d0, %function +.globl abi_test_clobber_d0 +.hidden abi_test_clobber_d0 +.align 4 +abi_test_clobber_d0: + fmov d0, xzr + ret +.size abi_test_clobber_d0,.-abi_test_clobber_d0 +.type abi_test_clobber_d1, %function +.globl abi_test_clobber_d1 +.hidden abi_test_clobber_d1 +.align 4 +abi_test_clobber_d1: + fmov d1, xzr + ret +.size abi_test_clobber_d1,.-abi_test_clobber_d1 +.type abi_test_clobber_d2, %function +.globl abi_test_clobber_d2 +.hidden abi_test_clobber_d2 +.align 4 +abi_test_clobber_d2: + fmov d2, xzr + ret +.size abi_test_clobber_d2,.-abi_test_clobber_d2 +.type abi_test_clobber_d3, %function +.globl abi_test_clobber_d3 +.hidden abi_test_clobber_d3 +.align 4 +abi_test_clobber_d3: + fmov d3, xzr + ret +.size abi_test_clobber_d3,.-abi_test_clobber_d3 +.type abi_test_clobber_d4, %function +.globl abi_test_clobber_d4 +.hidden abi_test_clobber_d4 +.align 4 +abi_test_clobber_d4: + fmov d4, xzr + ret +.size abi_test_clobber_d4,.-abi_test_clobber_d4 +.type abi_test_clobber_d5, %function +.globl abi_test_clobber_d5 +.hidden abi_test_clobber_d5 +.align 4 +abi_test_clobber_d5: + fmov d5, xzr + ret +.size abi_test_clobber_d5,.-abi_test_clobber_d5 +.type abi_test_clobber_d6, %function +.globl abi_test_clobber_d6 +.hidden abi_test_clobber_d6 +.align 4 +abi_test_clobber_d6: + fmov d6, xzr + ret +.size abi_test_clobber_d6,.-abi_test_clobber_d6 +.type abi_test_clobber_d7, %function +.globl abi_test_clobber_d7 +.hidden abi_test_clobber_d7 +.align 4 +abi_test_clobber_d7: + fmov d7, xzr + ret +.size abi_test_clobber_d7,.-abi_test_clobber_d7 +.type abi_test_clobber_d8, %function +.globl abi_test_clobber_d8 +.hidden abi_test_clobber_d8 +.align 4 +abi_test_clobber_d8: + fmov d8, xzr + ret +.size abi_test_clobber_d8,.-abi_test_clobber_d8 +.type abi_test_clobber_d9, %function +.globl abi_test_clobber_d9 +.hidden abi_test_clobber_d9 +.align 4 +abi_test_clobber_d9: + fmov d9, xzr + ret +.size abi_test_clobber_d9,.-abi_test_clobber_d9 +.type abi_test_clobber_d10, %function +.globl abi_test_clobber_d10 +.hidden abi_test_clobber_d10 +.align 4 +abi_test_clobber_d10: + fmov d10, xzr + ret +.size abi_test_clobber_d10,.-abi_test_clobber_d10 +.type abi_test_clobber_d11, %function +.globl abi_test_clobber_d11 +.hidden abi_test_clobber_d11 +.align 4 +abi_test_clobber_d11: + fmov d11, xzr + ret +.size abi_test_clobber_d11,.-abi_test_clobber_d11 +.type abi_test_clobber_d12, %function +.globl abi_test_clobber_d12 +.hidden abi_test_clobber_d12 +.align 4 +abi_test_clobber_d12: + fmov d12, xzr + ret +.size abi_test_clobber_d12,.-abi_test_clobber_d12 +.type abi_test_clobber_d13, %function +.globl abi_test_clobber_d13 +.hidden abi_test_clobber_d13 +.align 4 +abi_test_clobber_d13: + fmov d13, xzr + ret +.size abi_test_clobber_d13,.-abi_test_clobber_d13 +.type abi_test_clobber_d14, %function +.globl abi_test_clobber_d14 +.hidden abi_test_clobber_d14 +.align 4 +abi_test_clobber_d14: + fmov d14, xzr + ret +.size abi_test_clobber_d14,.-abi_test_clobber_d14 +.type abi_test_clobber_d15, %function +.globl 
abi_test_clobber_d15 +.hidden abi_test_clobber_d15 +.align 4 +abi_test_clobber_d15: + fmov d15, xzr + ret +.size abi_test_clobber_d15,.-abi_test_clobber_d15 +.type abi_test_clobber_d16, %function +.globl abi_test_clobber_d16 +.hidden abi_test_clobber_d16 +.align 4 +abi_test_clobber_d16: + fmov d16, xzr + ret +.size abi_test_clobber_d16,.-abi_test_clobber_d16 +.type abi_test_clobber_d17, %function +.globl abi_test_clobber_d17 +.hidden abi_test_clobber_d17 +.align 4 +abi_test_clobber_d17: + fmov d17, xzr + ret +.size abi_test_clobber_d17,.-abi_test_clobber_d17 +.type abi_test_clobber_d18, %function +.globl abi_test_clobber_d18 +.hidden abi_test_clobber_d18 +.align 4 +abi_test_clobber_d18: + fmov d18, xzr + ret +.size abi_test_clobber_d18,.-abi_test_clobber_d18 +.type abi_test_clobber_d19, %function +.globl abi_test_clobber_d19 +.hidden abi_test_clobber_d19 +.align 4 +abi_test_clobber_d19: + fmov d19, xzr + ret +.size abi_test_clobber_d19,.-abi_test_clobber_d19 +.type abi_test_clobber_d20, %function +.globl abi_test_clobber_d20 +.hidden abi_test_clobber_d20 +.align 4 +abi_test_clobber_d20: + fmov d20, xzr + ret +.size abi_test_clobber_d20,.-abi_test_clobber_d20 +.type abi_test_clobber_d21, %function +.globl abi_test_clobber_d21 +.hidden abi_test_clobber_d21 +.align 4 +abi_test_clobber_d21: + fmov d21, xzr + ret +.size abi_test_clobber_d21,.-abi_test_clobber_d21 +.type abi_test_clobber_d22, %function +.globl abi_test_clobber_d22 +.hidden abi_test_clobber_d22 +.align 4 +abi_test_clobber_d22: + fmov d22, xzr + ret +.size abi_test_clobber_d22,.-abi_test_clobber_d22 +.type abi_test_clobber_d23, %function +.globl abi_test_clobber_d23 +.hidden abi_test_clobber_d23 +.align 4 +abi_test_clobber_d23: + fmov d23, xzr + ret +.size abi_test_clobber_d23,.-abi_test_clobber_d23 +.type abi_test_clobber_d24, %function +.globl abi_test_clobber_d24 +.hidden abi_test_clobber_d24 +.align 4 +abi_test_clobber_d24: + fmov d24, xzr + ret +.size abi_test_clobber_d24,.-abi_test_clobber_d24 +.type abi_test_clobber_d25, %function +.globl abi_test_clobber_d25 +.hidden abi_test_clobber_d25 +.align 4 +abi_test_clobber_d25: + fmov d25, xzr + ret +.size abi_test_clobber_d25,.-abi_test_clobber_d25 +.type abi_test_clobber_d26, %function +.globl abi_test_clobber_d26 +.hidden abi_test_clobber_d26 +.align 4 +abi_test_clobber_d26: + fmov d26, xzr + ret +.size abi_test_clobber_d26,.-abi_test_clobber_d26 +.type abi_test_clobber_d27, %function +.globl abi_test_clobber_d27 +.hidden abi_test_clobber_d27 +.align 4 +abi_test_clobber_d27: + fmov d27, xzr + ret +.size abi_test_clobber_d27,.-abi_test_clobber_d27 +.type abi_test_clobber_d28, %function +.globl abi_test_clobber_d28 +.hidden abi_test_clobber_d28 +.align 4 +abi_test_clobber_d28: + fmov d28, xzr + ret +.size abi_test_clobber_d28,.-abi_test_clobber_d28 +.type abi_test_clobber_d29, %function +.globl abi_test_clobber_d29 +.hidden abi_test_clobber_d29 +.align 4 +abi_test_clobber_d29: + fmov d29, xzr + ret +.size abi_test_clobber_d29,.-abi_test_clobber_d29 +.type abi_test_clobber_d30, %function +.globl abi_test_clobber_d30 +.hidden abi_test_clobber_d30 +.align 4 +abi_test_clobber_d30: + fmov d30, xzr + ret +.size abi_test_clobber_d30,.-abi_test_clobber_d30 +.type abi_test_clobber_d31, %function +.globl abi_test_clobber_d31 +.hidden abi_test_clobber_d31 +.align 4 +abi_test_clobber_d31: + fmov d31, xzr + ret +.size abi_test_clobber_d31,.-abi_test_clobber_d31 +.type abi_test_clobber_v8_upper, %function +.globl abi_test_clobber_v8_upper +.hidden abi_test_clobber_v8_upper +.align 4 
+abi_test_clobber_v8_upper: + fmov v8.d[1], xzr + ret +.size abi_test_clobber_v8_upper,.-abi_test_clobber_v8_upper +.type abi_test_clobber_v9_upper, %function +.globl abi_test_clobber_v9_upper +.hidden abi_test_clobber_v9_upper +.align 4 +abi_test_clobber_v9_upper: + fmov v9.d[1], xzr + ret +.size abi_test_clobber_v9_upper,.-abi_test_clobber_v9_upper +.type abi_test_clobber_v10_upper, %function +.globl abi_test_clobber_v10_upper +.hidden abi_test_clobber_v10_upper +.align 4 +abi_test_clobber_v10_upper: + fmov v10.d[1], xzr + ret +.size abi_test_clobber_v10_upper,.-abi_test_clobber_v10_upper +.type abi_test_clobber_v11_upper, %function +.globl abi_test_clobber_v11_upper +.hidden abi_test_clobber_v11_upper +.align 4 +abi_test_clobber_v11_upper: + fmov v11.d[1], xzr + ret +.size abi_test_clobber_v11_upper,.-abi_test_clobber_v11_upper +.type abi_test_clobber_v12_upper, %function +.globl abi_test_clobber_v12_upper +.hidden abi_test_clobber_v12_upper +.align 4 +abi_test_clobber_v12_upper: + fmov v12.d[1], xzr + ret +.size abi_test_clobber_v12_upper,.-abi_test_clobber_v12_upper +.type abi_test_clobber_v13_upper, %function +.globl abi_test_clobber_v13_upper +.hidden abi_test_clobber_v13_upper +.align 4 +abi_test_clobber_v13_upper: + fmov v13.d[1], xzr + ret +.size abi_test_clobber_v13_upper,.-abi_test_clobber_v13_upper +.type abi_test_clobber_v14_upper, %function +.globl abi_test_clobber_v14_upper +.hidden abi_test_clobber_v14_upper +.align 4 +abi_test_clobber_v14_upper: + fmov v14.d[1], xzr + ret +.size abi_test_clobber_v14_upper,.-abi_test_clobber_v14_upper +.type abi_test_clobber_v15_upper, %function +.globl abi_test_clobber_v15_upper +.hidden abi_test_clobber_v15_upper +.align 4 +abi_test_clobber_v15_upper: + fmov v15.d[1], xzr + ret +.size abi_test_clobber_v15_upper,.-abi_test_clobber_v15_upper +#endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/third_party/sike/asm/fp-armv8.S b/packager/third_party/boringssl/linux-aarch64/crypto/third_party/sike/asm/fp-armv8.S new file mode 100644 index 0000000000..63e8d1c381 --- /dev/null +++ b/packager/third_party/boringssl/linux-aarch64/crypto/third_party/sike/asm/fp-armv8.S @@ -0,0 +1,999 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(__aarch64__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.section .rodata + +# p434 x 2 +.Lp434x2: +.quad 0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF +.quad 0xFB82ECF5C5FFFFFF, 0xF78CB8F062B15D47 +.quad 0xD9F8BFAD038A40AC, 0x0004683E4E2EE688 + +# p434 + 1 +.Lp434p1: +.quad 0xFDC1767AE3000000, 0x7BC65C783158AEA3 +.quad 0x6CFC5FD681C52056, 0x0002341F27177344 + +.text +.globl sike_mpmul +.hidden sike_mpmul +.align 4 +sike_mpmul: + stp x29, x30, [sp,#-96]! 
+ add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + + ldp x3, x4, [x0] + ldp x5, x6, [x0,#16] + ldp x7, x8, [x0,#32] + ldr x9, [x0,#48] + ldp x10, x11, [x1,#0] + ldp x12, x13, [x1,#16] + ldp x14, x15, [x1,#32] + ldr x16, [x1,#48] + + // x3-x7 <- AH + AL, x7 <- carry + adds x3, x3, x7 + adcs x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, xzr + adc x7, xzr, xzr + + // x10-x13 <- BH + BL, x8 <- carry + adds x10, x10, x14 + adcs x11, x11, x15 + adcs x12, x12, x16 + adcs x13, x13, xzr + adc x8, xzr, xzr + + // x9 <- combined carry + and x9, x7, x8 + // x7-x8 <- mask + sub x7, xzr, x7 + sub x8, xzr, x8 + + // x15-x19 <- masked (BH + BL) + and x14, x10, x7 + and x15, x11, x7 + and x16, x12, x7 + and x17, x13, x7 + + // x20-x23 <- masked (AH + AL) + and x20, x3, x8 + and x21, x4, x8 + and x22, x5, x8 + and x23, x6, x8 + + // x15-x19, x7 <- masked (AH+AL) + masked (BH+BL), step 1 + adds x14, x14, x20 + adcs x15, x15, x21 + adcs x16, x16, x22 + adcs x17, x17, x23 + adc x7, x9, xzr + + // x8-x9,x19,x20-x24 <- (AH+AL) x (BH+BL), low part + stp x3, x4, [x2,#0] + // A0-A1 <- AH + AL, T0 <- mask + adds x3, x3, x5 + adcs x4, x4, x6 + adc x25, xzr, xzr + + // C6, T1 <- BH + BL, C7 <- mask + adds x23, x10, x12 + adcs x26, x11, x13 + adc x24, xzr, xzr + + // C0-C1 <- masked (BH + BL) + sub x19, xzr, x25 + sub x20, xzr, x24 + and x8, x23, x19 + and x9, x26, x19 + + // C4-C5 <- masked (AH + AL), T0 <- combined carry + and x21, x3, x20 + and x22, x4, x20 + mul x19, x3, x23 + mul x20, x3, x26 + and x25, x25, x24 + + // C0-C1, T0 <- (AH+AL) x (BH+BL), part 1 + adds x8, x21, x8 + umulh x21, x3, x26 + adcs x9, x22, x9 + umulh x22, x3, x23 + adc x25, x25, xzr + + // C2-C5 <- (AH+AL) x (BH+BL), low part + mul x3, x4, x23 + umulh x23, x4, x23 + adds x20, x20, x22 + adc x21, x21, xzr + + mul x24, x4, x26 + umulh x26, x4, x26 + adds x20, x20, x3 + adcs x21, x21, x23 + adc x22, xzr, xzr + + adds x21, x21, x24 + adc x22, x22, x26 + + ldp x3, x4, [x2,#0] + + // C2-C5, T0 <- (AH+AL) x (BH+BL), final part + adds x21, x8, x21 + umulh x24, x3, x10 + umulh x26, x3, x11 + adcs x22, x9, x22 + mul x8, x3, x10 + mul x9, x3, x11 + adc x25, x25, xzr + + // C0-C1, T1, C7 <- AL x BL + mul x3, x4, x10 + umulh x10, x4, x10 + adds x9, x9, x24 + adc x26, x26, xzr + + mul x23, x4, x11 + umulh x11, x4, x11 + adds x9, x9, x3 + adcs x26, x26, x10 + adc x24, xzr, xzr + + adds x26, x26, x23 + adc x24, x24, x11 + + + // C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL + mul x3, x5, x12 + umulh x10, x5, x12 + subs x19, x19, x8 + sbcs x20, x20, x9 + sbcs x21, x21, x26 + mul x4, x5, x13 + umulh x23, x5, x13 + sbcs x22, x22, x24 + sbc x25, x25, xzr + + // A0, A1, C6, B0 <- AH x BH + mul x5, x6, x12 + umulh x12, x6, x12 + adds x4, x4, x10 + adc x23, x23, xzr + + mul x11, x6, x13 + umulh x13, x6, x13 + adds x4, x4, x5 + adcs x23, x23, x12 + adc x10, xzr, xzr + + adds x23, x23, x11 + adc x10, x10, x13 + + + // C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH + subs x19, x19, x3 + sbcs x20, x20, x4 + sbcs x21, x21, x23 + sbcs x22, x22, x10 + sbc x25, x25, xzr + + adds x19, x19, x26 + adcs x20, x20, x24 + adcs x21, x21, x3 + adcs x22, x22, x4 + adcs x23, x25, x23 + adc x24, x10, xzr + + + // x15-x19, x7 <- (AH+AL) x (BH+BL), final step + adds x14, x14, x21 + adcs x15, x15, x22 + adcs x16, x16, x23 + adcs x17, x17, x24 + adc x7, x7, xzr + + // Load AL + ldp x3, x4, [x0] + ldp x5, x6, [x0,#16] + // Load BL + ldp x10, x11, [x1,#0] + ldp x12, x13, [x1,#16] + + // Temporarily store 
x8 in x2 + stp x8, x9, [x2,#0] + // x21-x28 <- AL x BL + // A0-A1 <- AH + AL, T0 <- mask + adds x3, x3, x5 + adcs x4, x4, x6 + adc x8, xzr, xzr + + // C6, T1 <- BH + BL, C7 <- mask + adds x27, x10, x12 + adcs x9, x11, x13 + adc x28, xzr, xzr + + // C0-C1 <- masked (BH + BL) + sub x23, xzr, x8 + sub x24, xzr, x28 + and x21, x27, x23 + and x22, x9, x23 + + // C4-C5 <- masked (AH + AL), T0 <- combined carry + and x25, x3, x24 + and x26, x4, x24 + mul x23, x3, x27 + mul x24, x3, x9 + and x8, x8, x28 + + // C0-C1, T0 <- (AH+AL) x (BH+BL), part 1 + adds x21, x25, x21 + umulh x25, x3, x9 + adcs x22, x26, x22 + umulh x26, x3, x27 + adc x8, x8, xzr + + // C2-C5 <- (AH+AL) x (BH+BL), low part + mul x3, x4, x27 + umulh x27, x4, x27 + adds x24, x24, x26 + adc x25, x25, xzr + + mul x28, x4, x9 + umulh x9, x4, x9 + adds x24, x24, x3 + adcs x25, x25, x27 + adc x26, xzr, xzr + + adds x25, x25, x28 + adc x26, x26, x9 + + ldp x3, x4, [x0,#0] + + // C2-C5, T0 <- (AH+AL) x (BH+BL), final part + adds x25, x21, x25 + umulh x28, x3, x10 + umulh x9, x3, x11 + adcs x26, x22, x26 + mul x21, x3, x10 + mul x22, x3, x11 + adc x8, x8, xzr + + // C0-C1, T1, C7 <- AL x BL + mul x3, x4, x10 + umulh x10, x4, x10 + adds x22, x22, x28 + adc x9, x9, xzr + + mul x27, x4, x11 + umulh x11, x4, x11 + adds x22, x22, x3 + adcs x9, x9, x10 + adc x28, xzr, xzr + + adds x9, x9, x27 + adc x28, x28, x11 + + + // C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL + mul x3, x5, x12 + umulh x10, x5, x12 + subs x23, x23, x21 + sbcs x24, x24, x22 + sbcs x25, x25, x9 + mul x4, x5, x13 + umulh x27, x5, x13 + sbcs x26, x26, x28 + sbc x8, x8, xzr + + // A0, A1, C6, B0 <- AH x BH + mul x5, x6, x12 + umulh x12, x6, x12 + adds x4, x4, x10 + adc x27, x27, xzr + + mul x11, x6, x13 + umulh x13, x6, x13 + adds x4, x4, x5 + adcs x27, x27, x12 + adc x10, xzr, xzr + + adds x27, x27, x11 + adc x10, x10, x13 + + + // C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH + subs x23, x23, x3 + sbcs x24, x24, x4 + sbcs x25, x25, x27 + sbcs x26, x26, x10 + sbc x8, x8, xzr + + adds x23, x23, x9 + adcs x24, x24, x28 + adcs x25, x25, x3 + adcs x26, x26, x4 + adcs x27, x8, x27 + adc x28, x10, xzr + + // Restore x8 + ldp x8, x9, [x2,#0] + + // x8-x10,x20,x15-x17,x19 <- maskd (AH+AL) x (BH+BL) - ALxBL + subs x8, x8, x21 + sbcs x9, x9, x22 + sbcs x19, x19, x23 + sbcs x20, x20, x24 + sbcs x14, x14, x25 + sbcs x15, x15, x26 + sbcs x16, x16, x27 + sbcs x17, x17, x28 + sbc x7, x7, xzr + + // Store ALxBL, low + stp x21, x22, [x2] + stp x23, x24, [x2,#16] + + // Load AH + ldp x3, x4, [x0,#32] + ldr x5, [x0,#48] + // Load BH + ldp x10, x11, [x1,#32] + ldr x12, [x1,#48] + + adds x8, x8, x25 + adcs x9, x9, x26 + adcs x19, x19, x27 + adcs x20, x20, x28 + adc x1, xzr, xzr + + add x0, x0, #32 + // Temporarily store x8,x9 in x2 + stp x8,x9, [x2,#32] + // x21-x28 <- AH x BH + + // A0 * B0 + mul x21, x3, x10 // C0 + umulh x24, x3, x10 + + // A0 * B1 + mul x22, x3, x11 + umulh x23, x3, x11 + + // A1 * B0 + mul x8, x4, x10 + umulh x9, x4, x10 + adds x22, x22, x24 + adc x23, x23, xzr + + // A0 * B2 + mul x27, x3, x12 + umulh x28, x3, x12 + adds x22, x22, x8 // C1 + adcs x23, x23, x9 + adc x24, xzr, xzr + + // A2 * B0 + mul x8, x5, x10 + umulh x25, x5, x10 + adds x23, x23, x27 + adcs x24, x24, x25 + adc x25, xzr, xzr + + // A1 * B1 + mul x27, x4, x11 + umulh x9, x4, x11 + adds x23, x23, x8 + adcs x24, x24, x28 + adc x25, x25, xzr + + // A1 * B2 + mul x8, x4, x12 + umulh x28, x4, x12 + adds x23, x23, x27 // C2 + adcs x24, x24, x9 + adc x25, x25, xzr + + // A2 * B1 + mul x27, x5, x11 + umulh x9, x5, x11 + 
adds x24, x24, x8 + adcs x25, x25, x28 + adc x26, xzr, xzr + + // A2 * B2 + mul x8, x5, x12 + umulh x28, x5, x12 + adds x24, x24, x27 // C3 + adcs x25, x25, x9 + adc x26, x26, xzr + + adds x25, x25, x8 // C4 + adc x26, x26, x28 // C5 + + // Restore x8,x9 + ldp x8,x9, [x2,#32] + + neg x1, x1 + + // x8-x9,x19,x20,x14-x17 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH + subs x8, x8, x21 + sbcs x9, x9, x22 + sbcs x19, x19, x23 + sbcs x20, x20, x24 + sbcs x14, x14, x25 + sbcs x15, x15, x26 + sbcs x16, x16, xzr + sbcs x17, x17, xzr + sbc x7, x7, xzr + + // Store (AH+AL) x (BH+BL) - ALxBL - AHxBH, low + stp x8, x9, [x2,#32] + stp x19, x20, [x2,#48] + + adds x1, x1, #1 + adcs x14, x14, x21 + adcs x15, x15, x22 + adcs x16, x16, x23 + adcs x17, x17, x24 + adcs x25, x7, x25 + adc x26, x26, xzr + + stp x14, x15, [x2,#64] + stp x16, x17, [x2,#80] + stp x25, x26, [x2,#96] + + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldp x29, x30, [sp],#96 + ret +.globl sike_fprdc +.hidden sike_fprdc +.align 4 +sike_fprdc: + stp x29, x30, [sp, #-96]! + add x29, sp, xzr + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + + ldp x2, x3, [x0,#0] // a[0-1] + + // Load the prime constant + adrp x26, .Lp434p1 + add x26, x26, :lo12:.Lp434p1 + ldp x23, x24, [x26, #0x0] + ldp x25, x26, [x26,#0x10] + + // a[0-1] * p434+1 + mul x4, x2, x23 // C0 + umulh x7, x2, x23 + + mul x5, x2, x24 + umulh x6, x2, x24 + + mul x10, x3, x23 + umulh x11, x3, x23 + adds x5, x5, x7 + adc x6, x6, xzr + + mul x27, x2, x25 + umulh x28, x2, x25 + adds x5, x5, x10 // C1 + adcs x6, x6, x11 + adc x7, xzr, xzr + + mul x10, x3, x24 + umulh x11, x3, x24 + adds x6, x6, x27 + adcs x7, x7, x28 + adc x8, xzr, xzr + + mul x27, x2, x26 + umulh x28, x2, x26 + adds x6, x6, x10 // C2 + adcs x7, x7, x11 + adc x8, x8, xzr + + mul x10, x3, x25 + umulh x11, x3, x25 + adds x7, x7, x27 + adcs x8, x8, x28 + adc x9, xzr, xzr + + mul x27, x3, x26 + umulh x28, x3, x26 + adds x7, x7, x10 // C3 + adcs x8, x8, x11 + adc x9, x9, xzr + adds x8, x8, x27 // C4 + adc x9, x9, x28 // C5 + + + + ldp x10, x11, [x0, #0x18] + ldp x12, x13, [x0, #0x28] + ldp x14, x15, [x0, #0x38] + ldp x16, x17, [x0, #0x48] + ldp x19, x20, [x0, #0x58] + ldr x21, [x0, #0x68] + + adds x10, x10, x4 + adcs x11, x11, x5 + adcs x12, x12, x6 + adcs x13, x13, x7 + adcs x14, x14, x8 + adcs x15, x15, x9 + adcs x22, x16, xzr + adcs x17, x17, xzr + adcs x19, x19, xzr + adcs x20, x20, xzr + adc x21, x21, xzr + + ldr x2, [x0,#0x10] // a[2] + // a[2-3] * p434+1 + mul x4, x2, x23 // C0 + umulh x7, x2, x23 + + mul x5, x2, x24 + umulh x6, x2, x24 + + mul x0, x10, x23 + umulh x3, x10, x23 + adds x5, x5, x7 + adc x6, x6, xzr + + mul x27, x2, x25 + umulh x28, x2, x25 + adds x5, x5, x0 // C1 + adcs x6, x6, x3 + adc x7, xzr, xzr + + mul x0, x10, x24 + umulh x3, x10, x24 + adds x6, x6, x27 + adcs x7, x7, x28 + adc x8, xzr, xzr + + mul x27, x2, x26 + umulh x28, x2, x26 + adds x6, x6, x0 // C2 + adcs x7, x7, x3 + adc x8, x8, xzr + + mul x0, x10, x25 + umulh x3, x10, x25 + adds x7, x7, x27 + adcs x8, x8, x28 + adc x9, xzr, xzr + + mul x27, x10, x26 + umulh x28, x10, x26 + adds x7, x7, x0 // C3 + adcs x8, x8, x3 + adc x9, x9, xzr + adds x8, x8, x27 // C4 + adc x9, x9, x28 // C5 + + + + adds x12, x12, x4 + adcs x13, x13, x5 + adcs x14, x14, x6 + adcs x15, x15, x7 + adcs x16, x22, x8 + adcs x17, x17, x9 + adcs x22, x19, xzr + adcs x20, x20, xzr + adc x21, x21, xzr + + mul x4, x11, x23 // 
C0 + umulh x7, x11, x23 + + mul x5, x11, x24 + umulh x6, x11, x24 + + mul x10, x12, x23 + umulh x3, x12, x23 + adds x5, x5, x7 + adc x6, x6, xzr + + mul x27, x11, x25 + umulh x28, x11, x25 + adds x5, x5, x10 // C1 + adcs x6, x6, x3 + adc x7, xzr, xzr + + mul x10, x12, x24 + umulh x3, x12, x24 + adds x6, x6, x27 + adcs x7, x7, x28 + adc x8, xzr, xzr + + mul x27, x11, x26 + umulh x28, x11, x26 + adds x6, x6, x10 // C2 + adcs x7, x7, x3 + adc x8, x8, xzr + + mul x10, x12, x25 + umulh x3, x12, x25 + adds x7, x7, x27 + adcs x8, x8, x28 + adc x9, xzr, xzr + + mul x27, x12, x26 + umulh x28, x12, x26 + adds x7, x7, x10 // C3 + adcs x8, x8, x3 + adc x9, x9, xzr + adds x8, x8, x27 // C4 + adc x9, x9, x28 // C5 + + + adds x14, x14, x4 + adcs x15, x15, x5 + adcs x16, x16, x6 + adcs x17, x17, x7 + adcs x19, x22, x8 + adcs x20, x20, x9 + adc x22, x21, xzr + + stp x14, x15, [x1, #0x0] // C0, C1 + + mul x4, x13, x23 // C0 + umulh x10, x13, x23 + + mul x5, x13, x24 + umulh x27, x13, x24 + adds x5, x5, x10 // C1 + adc x10, xzr, xzr + + mul x6, x13, x25 + umulh x28, x13, x25 + adds x27, x10, x27 + adcs x6, x6, x27 // C2 + adc x10, xzr, xzr + + mul x7, x13, x26 + umulh x8, x13, x26 + adds x28, x10, x28 + adcs x7, x7, x28 // C3 + adc x8, x8, xzr // C4 + + adds x16, x16, x4 + adcs x17, x17, x5 + adcs x19, x19, x6 + adcs x20, x20, x7 + adc x21, x22, x8 + + str x16, [x1, #0x10] + stp x17, x19, [x1, #0x18] + stp x20, x21, [x1, #0x28] + + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldp x29, x30, [sp],#96 + ret +.globl sike_fpadd +.hidden sike_fpadd +.align 4 +sike_fpadd: + stp x29,x30, [sp,#-16]! + add x29, sp, #0 + + ldp x3, x4, [x0,#0] + ldp x5, x6, [x0,#16] + ldp x7, x8, [x0,#32] + ldr x9, [x0,#48] + ldp x11, x12, [x1,#0] + ldp x13, x14, [x1,#16] + ldp x15, x16, [x1,#32] + ldr x17, [x1,#48] + + // Add a + b + adds x3, x3, x11 + adcs x4, x4, x12 + adcs x5, x5, x13 + adcs x6, x6, x14 + adcs x7, x7, x15 + adcs x8, x8, x16 + adc x9, x9, x17 + + // Subtract 2xp434 + adrp x17, .Lp434x2 + add x17, x17, :lo12:.Lp434x2 + ldp x11, x12, [x17, #0] + ldp x13, x14, [x17, #16] + ldp x15, x16, [x17, #32] + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x12 + sbcs x6, x6, x13 + sbcs x7, x7, x14 + sbcs x8, x8, x15 + sbcs x9, x9, x16 + sbc x0, xzr, xzr // x0 can be reused now + + // Add 2xp434 anded with the mask in x0 + and x11, x11, x0 + and x12, x12, x0 + and x13, x13, x0 + and x14, x14, x0 + and x15, x15, x0 + and x16, x16, x0 + + adds x3, x3, x11 + adcs x4, x4, x12 + adcs x5, x5, x12 + adcs x6, x6, x13 + adcs x7, x7, x14 + adcs x8, x8, x15 + adc x9, x9, x16 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + str x9, [x2,#48] + + ldp x29, x30, [sp],#16 + ret +.globl sike_fpsub +.hidden sike_fpsub +.align 4 +sike_fpsub: + stp x29, x30, [sp,#-16]! 
+ add x29, sp, #0 + + ldp x3, x4, [x0,#0] + ldp x5, x6, [x0,#16] + ldp x7, x8, [x0,#32] + ldr x9, [x0,#48] + ldp x11, x12, [x1,#0] + ldp x13, x14, [x1,#16] + ldp x15, x16, [x1,#32] + ldr x17, [x1,#48] + + // Subtract a - b + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + sbcs x7, x7, x15 + sbcs x8, x8, x16 + sbcs x9, x9, x17 + sbc x0, xzr, xzr + + // Add 2xp434 anded with the mask in x0 + adrp x17, .Lp434x2 + add x17, x17, :lo12:.Lp434x2 + + // First half + ldp x11, x12, [x17, #0] + ldp x13, x14, [x17, #16] + ldp x15, x16, [x17, #32] + + // Add 2xp434 anded with the mask in x0 + and x11, x11, x0 + and x12, x12, x0 + and x13, x13, x0 + and x14, x14, x0 + and x15, x15, x0 + and x16, x16, x0 + + adds x3, x3, x11 + adcs x4, x4, x12 + adcs x5, x5, x12 + adcs x6, x6, x13 + adcs x7, x7, x14 + adcs x8, x8, x15 + adc x9, x9, x16 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + str x9, [x2,#48] + + ldp x29, x30, [sp],#16 + ret +.globl sike_mpadd_asm +.hidden sike_mpadd_asm +.align 4 +sike_mpadd_asm: + stp x29, x30, [sp,#-16]! + add x29, sp, #0 + + ldp x3, x4, [x0,#0] + ldp x5, x6, [x0,#16] + ldp x7, x8, [x0,#32] + ldr x9, [x0,#48] + ldp x11, x12, [x1,#0] + ldp x13, x14, [x1,#16] + ldp x15, x16, [x1,#32] + ldr x17, [x1,#48] + + adds x3, x3, x11 + adcs x4, x4, x12 + adcs x5, x5, x13 + adcs x6, x6, x14 + adcs x7, x7, x15 + adcs x8, x8, x16 + adc x9, x9, x17 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + str x9, [x2,#48] + + ldp x29, x30, [sp],#16 + ret +.globl sike_mpsubx2_asm +.hidden sike_mpsubx2_asm +.align 4 +sike_mpsubx2_asm: + stp x29, x30, [sp,#-16]! + add x29, sp, #0 + + ldp x3, x4, [x0,#0] + ldp x5, x6, [x0,#16] + ldp x11, x12, [x1,#0] + ldp x13, x14, [x1,#16] + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + ldp x7, x8, [x0,#32] + ldp x9, x10, [x0,#48] + ldp x11, x12, [x1,#32] + ldp x13, x14, [x1,#48] + sbcs x7, x7, x11 + sbcs x8, x8, x12 + sbcs x9, x9, x13 + sbcs x10, x10, x14 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + stp x9, x10, [x2,#48] + + ldp x3, x4, [x0,#64] + ldp x5, x6, [x0,#80] + ldp x11, x12, [x1,#64] + ldp x13, x14, [x1,#80] + sbcs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + ldp x7, x8, [x0,#96] + ldp x11, x12, [x1,#96] + sbcs x7, x7, x11 + sbcs x8, x8, x12 + sbc x0, xzr, xzr + + stp x3, x4, [x2,#64] + stp x5, x6, [x2,#80] + stp x7, x8, [x2,#96] + + ldp x29, x30, [sp],#16 + ret +.globl sike_mpdblsubx2_asm +.hidden sike_mpdblsubx2_asm +.align 4 +sike_mpdblsubx2_asm: + stp x29, x30, [sp, #-16]! 
+ add x29, sp, #0 + + ldp x3, x4, [x2, #0] + ldp x5, x6, [x2,#16] + ldp x7, x8, [x2,#32] + + ldp x11, x12, [x0, #0] + ldp x13, x14, [x0,#16] + ldp x15, x16, [x0,#32] + + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + sbcs x7, x7, x15 + sbcs x8, x8, x16 + + // x9 stores carry + adc x9, xzr, xzr + + ldp x11, x12, [x1, #0] + ldp x13, x14, [x1,#16] + ldp x15, x16, [x1,#32] + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + sbcs x7, x7, x15 + sbcs x8, x8, x16 + adc x9, x9, xzr + + stp x3, x4, [x2, #0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + + ldp x3, x4, [x2,#48] + ldp x5, x6, [x2,#64] + ldp x7, x8, [x2,#80] + + ldp x11, x12, [x0,#48] + ldp x13, x14, [x0,#64] + ldp x15, x16, [x0,#80] + + // x9 = 2 - x9 + neg x9, x9 + add x9, x9, #2 + + subs x3, x3, x9 + sbcs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + sbcs x7, x7, x15 + sbcs x8, x8, x16 + adc x9, xzr, xzr + + ldp x11, x12, [x1,#48] + ldp x13, x14, [x1,#64] + ldp x15, x16, [x1,#80] + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + sbcs x7, x7, x15 + sbcs x8, x8, x16 + adc x9, x9, xzr + + stp x3, x4, [x2,#48] + stp x5, x6, [x2,#64] + stp x7, x8, [x2,#80] + + ldp x3, x4, [x2,#96] + ldp x11, x12, [x0,#96] + ldp x13, x14, [x1,#96] + + // x9 = 2 - x9 + neg x9, x9 + add x9, x9, #2 + + subs x3, x3, x9 + sbcs x3, x3, x11 + sbcs x4, x4, x12 + subs x3, x3, x13 + sbc x4, x4, x14 + stp x3, x4, [x2,#96] + + ldp x29, x30, [sp],#16 + ret +#endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-arm/crypto/chacha/chacha-armv4.S b/packager/third_party/boringssl/linux-arm/crypto/chacha/chacha-armv4.S index 6c947734fe..363aeee5f5 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/chacha/chacha-armv4.S +++ b/packager/third_party/boringssl/linux-arm/crypto/chacha/chacha-armv4.S @@ -1,6 +1,24 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__arm__) +#if defined(BORINGSSL_PREFIX) +#include +#endif #include +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. +.arch armv7-a + .text #if defined(__thumb2__) || defined(__clang__) .syntax unified @@ -1471,3 +1489,5 @@ ChaCha20_neon: .comm OPENSSL_armcap_P,4,4 #endif #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/aes-armv4.S b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/aes-armv4.S index d401fc78f1..cfe2a36649 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/aes-armv4.S +++ b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/aes-armv4.S @@ -1,4 +1,18 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__arm__) +#if defined(BORINGSSL_PREFIX) +#include +#endif @ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. 
@ @ Licensed under the OpenSSL license (the "License"). You may not use @@ -45,6 +59,11 @@ # define __ARM_ARCH__ __LINUX_ARM_ARCH__ #endif +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 AES +@ instructions are in aesv8-armx.pl.) +.arch armv7-a + .text #if defined(__thumb2__) && !defined(__APPLE__) .syntax unified @@ -160,23 +179,23 @@ AES_Te: .word 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0 .size AES_Te,.-AES_Te -@ void asm_AES_encrypt(const unsigned char *in, unsigned char *out, -@ const AES_KEY *key) { -.globl asm_AES_encrypt -.hidden asm_AES_encrypt -.type asm_AES_encrypt,%function +@ void aes_nohw_encrypt(const unsigned char *in, unsigned char *out, +@ const AES_KEY *key) { +.globl aes_nohw_encrypt +.hidden aes_nohw_encrypt +.type aes_nohw_encrypt,%function .align 5 -asm_AES_encrypt: +aes_nohw_encrypt: #ifndef __thumb2__ - sub r3,pc,#8 @ asm_AES_encrypt + sub r3,pc,#8 @ aes_nohw_encrypt #else adr r3,. #endif stmdb sp!,{r1,r4-r12,lr} -#ifdef __APPLE__ +#if defined(__thumb2__) || defined(__APPLE__) adr r10,AES_Te #else - sub r10,r3,#asm_AES_encrypt-AES_Te @ Te + sub r10,r3,#aes_nohw_encrypt-AES_Te @ Te #endif mov r12,r0 @ inp mov r11,r2 @@ -273,7 +292,7 @@ asm_AES_encrypt: moveq pc,lr @ be binary compatible with V4, yet .word 0xe12fff1e @ interoperable with Thumb ISA:-) #endif -.size asm_AES_encrypt,.-asm_AES_encrypt +.size aes_nohw_encrypt,.-aes_nohw_encrypt .type _armv4_AES_encrypt,%function .align 2 @@ -412,14 +431,14 @@ _armv4_AES_encrypt: ldr pc,[sp],#4 @ pop and return .size _armv4_AES_encrypt,.-_armv4_AES_encrypt -.globl asm_AES_set_encrypt_key -.hidden asm_AES_set_encrypt_key -.type asm_AES_set_encrypt_key,%function +.globl aes_nohw_set_encrypt_key +.hidden aes_nohw_set_encrypt_key +.type aes_nohw_set_encrypt_key,%function .align 5 -asm_AES_set_encrypt_key: +aes_nohw_set_encrypt_key: _armv4_AES_set_encrypt_key: #ifndef __thumb2__ - sub r3,pc,#8 @ asm_AES_set_encrypt_key + sub r3,pc,#8 @ aes_nohw_set_encrypt_key #else adr r3,. #endif @@ -452,7 +471,7 @@ _armv4_AES_set_encrypt_key: mov lr,r1 @ bits mov r11,r2 @ key -#ifdef __APPLE__ +#if defined(__thumb2__) || defined(__APPLE__) adr r10,AES_Te+1024 @ Te4 #else sub r10,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4 @@ -717,23 +736,23 @@ _armv4_AES_set_encrypt_key: moveq pc,lr @ be binary compatible with V4, yet .word 0xe12fff1e @ interoperable with Thumb ISA:-) #endif -.size asm_AES_set_encrypt_key,.-asm_AES_set_encrypt_key +.size aes_nohw_set_encrypt_key,.-aes_nohw_set_encrypt_key -.globl asm_AES_set_decrypt_key -.hidden asm_AES_set_decrypt_key -.type asm_AES_set_decrypt_key,%function +.globl aes_nohw_set_decrypt_key +.hidden aes_nohw_set_decrypt_key +.type aes_nohw_set_decrypt_key,%function .align 5 -asm_AES_set_decrypt_key: +aes_nohw_set_decrypt_key: str lr,[sp,#-4]! 
@ push lr bl _armv4_AES_set_encrypt_key teq r0,#0 ldr lr,[sp],#4 @ pop lr bne .Labrt - mov r0,r2 @ asm_AES_set_encrypt_key preserves r2, + mov r0,r2 @ aes_nohw_set_encrypt_key preserves r2, mov r1,r2 @ which is AES_KEY *key b _armv4_AES_set_enc2dec_key -.size asm_AES_set_decrypt_key,.-asm_AES_set_decrypt_key +.size aes_nohw_set_decrypt_key,.-aes_nohw_set_decrypt_key @ void AES_set_enc2dec_key(const AES_KEY *inp,AES_KEY *out) .globl AES_set_enc2dec_key @@ -935,23 +954,23 @@ AES_Td: .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d .size AES_Td,.-AES_Td -@ void asm_AES_decrypt(const unsigned char *in, unsigned char *out, -@ const AES_KEY *key) { -.globl asm_AES_decrypt -.hidden asm_AES_decrypt -.type asm_AES_decrypt,%function +@ void aes_nohw_decrypt(const unsigned char *in, unsigned char *out, +@ const AES_KEY *key) { +.globl aes_nohw_decrypt +.hidden aes_nohw_decrypt +.type aes_nohw_decrypt,%function .align 5 -asm_AES_decrypt: +aes_nohw_decrypt: #ifndef __thumb2__ - sub r3,pc,#8 @ asm_AES_decrypt + sub r3,pc,#8 @ aes_nohw_decrypt #else adr r3,. #endif stmdb sp!,{r1,r4-r12,lr} -#ifdef __APPLE__ +#if defined(__thumb2__) || defined(__APPLE__) adr r10,AES_Td #else - sub r10,r3,#asm_AES_decrypt-AES_Td @ Td + sub r10,r3,#aes_nohw_decrypt-AES_Td @ Td #endif mov r12,r0 @ inp mov r11,r2 @@ -1048,7 +1067,7 @@ asm_AES_decrypt: moveq pc,lr @ be binary compatible with V4, yet .word 0xe12fff1e @ interoperable with Thumb ISA:-) #endif -.size asm_AES_decrypt,.-asm_AES_decrypt +.size aes_nohw_decrypt,.-aes_nohw_decrypt .type _armv4_AES_decrypt,%function .align 2 @@ -1199,3 +1218,5 @@ _armv4_AES_decrypt: .align 2 .align 2 #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/aesv8-armx32.S b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/aesv8-armx32.S index 7c7ef19c79..5d6e22d029 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/aesv8-armx32.S +++ b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/aesv8-armx32.S @@ -1,4 +1,18 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__arm__) +#if defined(BORINGSSL_PREFIX) +#include +#endif #include #if __ARM_MAX_ARCH__>=7 @@ -13,6 +27,8 @@ .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d @ rotate-n-splat .long 0x1b,0x1b,0x1b,0x1b +.text + .globl aes_hw_set_encrypt_key .hidden aes_hw_set_encrypt_key .type aes_hw_set_encrypt_key,%function @@ -761,3 +777,5 @@ aes_hw_ctr32_encrypt_blocks: .size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks #endif #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/armv4-mont.S b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/armv4-mont.S index e77a9ea613..029689475b 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/armv4-mont.S +++ b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/armv4-mont.S @@ -1,6 +1,24 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__arm__) +#if defined(BORINGSSL_PREFIX) +#include +#endif #include +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. +.arch armv7-a + .text #if defined(__thumb2__) .syntax unified @@ -167,14 +185,15 @@ bn_mul_mont: mov r4,sp @ "rewind" r4 sub r2,r2,r5 @ "rewind" r2 - and r1,r4,r14 - bic r3,r2,r14 - orr r1,r1,r3 @ ap=borrow?tp:rp - -.Lcopy: ldr r7,[r1],#4 @ copy or in-place refresh +.Lcopy: ldr r7,[r4] @ conditional copy + ldr r5,[r2] str sp,[r4],#4 @ zap tp - str r7,[r2],#4 - cmp r4,r0 +#ifdef __thumb2__ + it cc +#endif + movcc r5,r7 + str r5,[r2],#4 + teq r4,r0 @ preserve carry bne .Lcopy mov sp,r0 @@ -954,3 +973,5 @@ bn_mul8x_mont_neon: .hidden OPENSSL_armcap_P #endif #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/bsaes-armv7.S b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/bsaes-armv7.S index f9c6de73ff..69a8fcacd0 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/bsaes-armv7.S +++ b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/bsaes-armv7.S @@ -1,4 +1,18 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__arm__) +#if defined(BORINGSSL_PREFIX) +#include +#endif @ Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. @ @ Licensed under the OpenSSL license (the "License"). You may not use @@ -14,8 +28,7 @@ @ details see http://www.openssl.org/~appro/cryptogams/. @ @ Specific modes and adaptation for Linux kernel by Ard Biesheuvel -@ . Permission to use under GPL terms is -@ granted. +@ of Linaro. Permission to use under GPL terms is granted. @ ==================================================================== @ Bit-sliced AES for ARM NEON @@ -49,10 +62,7 @@ @ @ April-August 2013 -@ -@ Add CBC, CTR and XTS subroutines, adapt for kernel use. -@ -@ +@ Add CBC, CTR and XTS subroutines and adapt for kernel use; courtesy of Ard. #ifndef __KERNEL__ # include @@ -92,7 +102,7 @@ _bsaes_decrypt8: adr r6,. vldmia r4!, {q9} @ round 0 key -#ifdef __APPLE__ +#if defined(__thumb2__) || defined(__APPLE__) adr r6,.LM0ISR #else add r6,r6,#.LM0ISR-_bsaes_decrypt8 @@ -583,7 +593,7 @@ _bsaes_const: _bsaes_encrypt8: adr r6,. vldmia r4!, {q9} @ round 0 key -#ifdef __APPLE__ +#if defined(__thumb2__) || defined(__APPLE__) adr r6,.LM0SR #else sub r6,r6,#_bsaes_encrypt8-.LM0SR @@ -1018,7 +1028,7 @@ _bsaes_encrypt8_bitslice: _bsaes_key_convert: adr r6,. vld1.8 {q7}, [r4]! 
@ load round 0 key -#ifdef __APPLE__ +#if defined(__thumb2__) || defined(__APPLE__) adr r6,.LM0 #else sub r6,r6,#_bsaes_key_convert-.LM0 @@ -1072,24 +1082,13 @@ _bsaes_key_convert: @ don't save last round key bx lr .size _bsaes_key_convert,.-_bsaes_key_convert - - - .globl bsaes_cbc_encrypt .hidden bsaes_cbc_encrypt .type bsaes_cbc_encrypt,%function .align 5 bsaes_cbc_encrypt: -#ifndef __KERNEL__ - cmp r2, #128 -#ifndef __thumb__ - blo AES_cbc_encrypt -#else - bhs 1f - b AES_cbc_encrypt -1: -#endif -#endif + @ In OpenSSL, this function had a fallback to aes_nohw_cbc_encrypt for + @ short inputs. We patch this out, using bsaes for all input sizes. @ it is up to the caller to make sure we are called with enc == 0 @@ -1187,10 +1186,7 @@ bsaes_cbc_encrypt: adds r2, r2, #8 beq .Lcbc_dec_done - vld1.8 {q0}, [r0]! @ load input - cmp r2, #2 - blo .Lcbc_dec_one - vld1.8 {q1}, [r0]! + @ Set up most parameters for the _bsaes_decrypt8 call. #ifndef BSAES_ASM_EXTENDED_KEY mov r4, sp @ pass the key #else @@ -1198,6 +1194,11 @@ bsaes_cbc_encrypt: #endif mov r5, r10 vstmia r9, {q15} @ put aside IV + + vld1.8 {q0}, [r0]! @ load input + cmp r2, #2 + blo .Lcbc_dec_one + vld1.8 {q1}, [r0]! beq .Lcbc_dec_two vld1.8 {q2}, [r0]! cmp r2, #4 @@ -1315,16 +1316,11 @@ bsaes_cbc_encrypt: .align 4 .Lcbc_dec_one: sub r0, r0, #0x10 - mov r10, r1 @ save original out pointer - mov r1, r9 @ use the iv scratch space as out buffer - mov r2, r3 - vmov q4,q15 @ just in case ensure that IV - vmov q5,q0 @ and input are preserved - bl AES_decrypt - vld1.8 {q0}, [r9] @ load result - veor q0, q0, q4 @ ^= IV - vmov q15, q5 @ q5 holds input - vst1.8 {q0}, [r10] @ write output + bl _bsaes_decrypt8 + vldmia r9, {q14} @ reload IV + vld1.8 {q15}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vst1.8 {q0}, [r1]! @ write output .Lcbc_dec_done: #ifndef BSAES_ASM_EXTENDED_KEY @@ -1342,15 +1338,13 @@ bsaes_cbc_encrypt: VFP_ABI_POP ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt - .globl bsaes_ctr32_encrypt_blocks .hidden bsaes_ctr32_encrypt_blocks .type bsaes_ctr32_encrypt_blocks,%function .align 5 bsaes_ctr32_encrypt_blocks: - cmp r2, #8 @ use plain AES for - blo .Lctr_enc_short @ small sizes - + @ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this + @ out to retain a constant-time implementation. mov ip, sp stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} VFP_ABI_PUSH @@ -1526,1042 +1520,10 @@ bsaes_ctr32_encrypt_blocks: VFP_ABI_POP ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return -.align 4 -.Lctr_enc_short: - ldr ip, [sp] @ ctr pointer is passed on stack - stmdb sp!, {r4,r5,r6,r7,r8, lr} - - mov r4, r0 @ copy arguments - mov r5, r1 - mov r6, r2 - mov r7, r3 - ldr r8, [ip, #12] @ load counter .LSW - vld1.8 {q1}, [ip] @ load whole counter value -#ifdef __ARMEL__ - rev r8, r8 -#endif - sub sp, sp, #0x10 - vst1.8 {q1}, [sp] @ copy counter value - sub sp, sp, #0x10 - -.Lctr_enc_short_loop: - add r0, sp, #0x10 @ input counter value - mov r1, sp @ output on the stack - mov r2, r7 @ key - - bl AES_encrypt - - vld1.8 {q0}, [r4]! @ load input - vld1.8 {q1}, [sp] @ load encrypted counter - add r8, r8, #1 -#ifdef __ARMEL__ - rev r0, r8 - str r0, [sp, #0x1c] @ next counter value -#else - str r8, [sp, #0x1c] @ next counter value -#endif - veor q0,q0,q1 - vst1.8 {q0}, [r5]! @ store output - subs r6, r6, #1 - bne .Lctr_enc_short_loop - - vmov.i32 q0, #0 - vmov.i32 q1, #0 - vstmia sp!, {q0,q1} - - ldmia sp!, {r4,r5,r6,r7,r8, pc} + @ OpenSSL contains aes_nohw_* fallback code here. 
We patch this + @ out to retain a constant-time implementation. .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks -.globl bsaes_xts_encrypt -.hidden bsaes_xts_encrypt -.type bsaes_xts_encrypt,%function -.align 4 -bsaes_xts_encrypt: - mov ip, sp - stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} @ 0x20 - VFP_ABI_PUSH - mov r6, sp @ future r3 - - mov r7, r0 - mov r8, r1 - mov r9, r2 - mov r10, r3 - - sub r0, sp, #0x10 @ 0x10 - bic r0, #0xf @ align at 16 bytes - mov sp, r0 - -#ifdef XTS_CHAIN_TWEAK - ldr r0, [ip] @ pointer to input tweak -#else - @ generate initial tweak - ldr r0, [ip, #4] @ iv[] - mov r1, sp - ldr r2, [ip, #0] @ key2 - bl AES_encrypt - mov r0,sp @ pointer to initial tweak -#endif - - ldr r1, [r10, #240] @ get # of rounds - mov r3, r6 -#ifndef BSAES_ASM_EXTENDED_KEY - @ allocate the key schedule on the stack - sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key - @ add r12, #96 @ size of bit-sliced key schedule - sub r12, #48 @ place for tweak[9] - - @ populate the key schedule - mov r4, r10 @ pass key - mov r5, r1 @ pass # of rounds - mov sp, r12 - add r12, #0x90 @ pass key schedule - bl _bsaes_key_convert - veor q7, q7, q15 @ fix up last round key - vstmia r12, {q7} @ save last round key -#else - ldr r12, [r10, #244] - eors r12, #1 - beq 0f - - str r12, [r10, #244] - mov r4, r10 @ pass key - mov r5, r1 @ pass # of rounds - add r12, r10, #248 @ pass key schedule - bl _bsaes_key_convert - veor q7, q7, q15 @ fix up last round key - vstmia r12, {q7} - -.align 2 - sub sp, #0x90 @ place for tweak[9] -#endif - - vld1.8 {q8}, [r0] @ initial tweak - adr r2, .Lxts_magic - - subs r9, #0x80 - blo .Lxts_enc_short - b .Lxts_enc_loop - -.align 4 -.Lxts_enc_loop: - vldmia r2, {q5} @ load XTS magic - vshr.s64 q6, q8, #63 - mov r0, sp - vand q6, q6, q5 - vadd.u64 q9, q8, q8 - vst1.64 {q8}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q9, #63 - veor q9, q9, q6 - vand q7, q7, q5 - vadd.u64 q10, q9, q9 - vst1.64 {q9}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q10, #63 - veor q10, q10, q7 - vand q6, q6, q5 - vld1.8 {q0}, [r7]! - vadd.u64 q11, q10, q10 - vst1.64 {q10}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q11, #63 - veor q11, q11, q6 - vand q7, q7, q5 - vld1.8 {q1}, [r7]! - veor q0, q0, q8 - vadd.u64 q12, q11, q11 - vst1.64 {q11}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q12, #63 - veor q12, q12, q7 - vand q6, q6, q5 - vld1.8 {q2}, [r7]! - veor q1, q1, q9 - vadd.u64 q13, q12, q12 - vst1.64 {q12}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q13, #63 - veor q13, q13, q6 - vand q7, q7, q5 - vld1.8 {q3}, [r7]! - veor q2, q2, q10 - vadd.u64 q14, q13, q13 - vst1.64 {q13}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q14, #63 - veor q14, q14, q7 - vand q6, q6, q5 - vld1.8 {q4}, [r7]! - veor q3, q3, q11 - vadd.u64 q15, q14, q14 - vst1.64 {q14}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q15, #63 - veor q15, q15, q6 - vand q7, q7, q5 - vld1.8 {q5}, [r7]! - veor q4, q4, q12 - vadd.u64 q8, q15, q15 - vst1.64 {q15}, [r0,:128]! - vswp d15,d14 - veor q8, q8, q7 - vst1.64 {q8}, [r0,:128] @ next round tweak - - vld1.8 {q6,q7}, [r7]! - veor q5, q5, q13 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q6, q6, q14 - mov r5, r1 @ pass rounds - veor q7, q7, q15 - mov r0, sp - - bl _bsaes_encrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - vld1.64 {q12,q13}, [r0,:128]! - veor q1, q1, q9 - veor q8, q4, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q6, q11 - vld1.64 {q14,q15}, [r0,:128]! 
- veor q10, q3, q12 - vst1.8 {q8,q9}, [r8]! - veor q11, q7, q13 - veor q12, q2, q14 - vst1.8 {q10,q11}, [r8]! - veor q13, q5, q15 - vst1.8 {q12,q13}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - - subs r9, #0x80 - bpl .Lxts_enc_loop - -.Lxts_enc_short: - adds r9, #0x70 - bmi .Lxts_enc_done - - vldmia r2, {q5} @ load XTS magic - vshr.s64 q7, q8, #63 - mov r0, sp - vand q7, q7, q5 - vadd.u64 q9, q8, q8 - vst1.64 {q8}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q9, #63 - veor q9, q9, q7 - vand q6, q6, q5 - vadd.u64 q10, q9, q9 - vst1.64 {q9}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q10, #63 - veor q10, q10, q6 - vand q7, q7, q5 - vld1.8 {q0}, [r7]! - subs r9, #0x10 - bmi .Lxts_enc_1 - vadd.u64 q11, q10, q10 - vst1.64 {q10}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q11, #63 - veor q11, q11, q7 - vand q6, q6, q5 - vld1.8 {q1}, [r7]! - subs r9, #0x10 - bmi .Lxts_enc_2 - veor q0, q0, q8 - vadd.u64 q12, q11, q11 - vst1.64 {q11}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q12, #63 - veor q12, q12, q6 - vand q7, q7, q5 - vld1.8 {q2}, [r7]! - subs r9, #0x10 - bmi .Lxts_enc_3 - veor q1, q1, q9 - vadd.u64 q13, q12, q12 - vst1.64 {q12}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q13, #63 - veor q13, q13, q7 - vand q6, q6, q5 - vld1.8 {q3}, [r7]! - subs r9, #0x10 - bmi .Lxts_enc_4 - veor q2, q2, q10 - vadd.u64 q14, q13, q13 - vst1.64 {q13}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q14, #63 - veor q14, q14, q6 - vand q7, q7, q5 - vld1.8 {q4}, [r7]! - subs r9, #0x10 - bmi .Lxts_enc_5 - veor q3, q3, q11 - vadd.u64 q15, q14, q14 - vst1.64 {q14}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q15, #63 - veor q15, q15, q7 - vand q6, q6, q5 - vld1.8 {q5}, [r7]! - subs r9, #0x10 - bmi .Lxts_enc_6 - veor q4, q4, q12 - sub r9, #0x10 - vst1.64 {q15}, [r0,:128] @ next round tweak - - vld1.8 {q6}, [r7]! - veor q5, q5, q13 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q6, q6, q14 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_encrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - vld1.64 {q12,q13}, [r0,:128]! - veor q1, q1, q9 - veor q8, q4, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q6, q11 - vld1.64 {q14}, [r0,:128]! - veor q10, q3, q12 - vst1.8 {q8,q9}, [r8]! - veor q11, q7, q13 - veor q12, q2, q14 - vst1.8 {q10,q11}, [r8]! - vst1.8 {q12}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_enc_done -.align 4 -.Lxts_enc_6: - veor q4, q4, q12 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q5, q5, q13 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_encrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - vld1.64 {q12,q13}, [r0,:128]! - veor q1, q1, q9 - veor q8, q4, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q6, q11 - veor q10, q3, q12 - vst1.8 {q8,q9}, [r8]! - veor q11, q7, q13 - vst1.8 {q10,q11}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_enc_done - -@ put this in range for both ARM and Thumb mode adr instructions -.align 5 -.Lxts_magic: -.quad 1, 0x87 - -.align 5 -.Lxts_enc_5: - veor q3, q3, q11 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q4, q4, q12 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_encrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - vld1.64 {q12}, [r0,:128]! 
- veor q1, q1, q9 - veor q8, q4, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q6, q11 - veor q10, q3, q12 - vst1.8 {q8,q9}, [r8]! - vst1.8 {q10}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_enc_done -.align 4 -.Lxts_enc_4: - veor q2, q2, q10 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q3, q3, q11 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_encrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - veor q1, q1, q9 - veor q8, q4, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q6, q11 - vst1.8 {q8,q9}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_enc_done -.align 4 -.Lxts_enc_3: - veor q1, q1, q9 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q2, q2, q10 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_encrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10}, [r0,:128]! - veor q0, q0, q8 - veor q1, q1, q9 - veor q8, q4, q10 - vst1.8 {q0,q1}, [r8]! - vst1.8 {q8}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_enc_done -.align 4 -.Lxts_enc_2: - veor q0, q0, q8 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q1, q1, q9 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_encrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - veor q0, q0, q8 - veor q1, q1, q9 - vst1.8 {q0,q1}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_enc_done -.align 4 -.Lxts_enc_1: - mov r0, sp - veor q0, q0, q8 - mov r1, sp - vst1.8 {q0}, [sp,:128] - mov r2, r10 - mov r4, r3 @ preserve fp - - bl AES_encrypt - - vld1.8 {q0}, [sp,:128] - veor q0, q0, q8 - vst1.8 {q0}, [r8]! 
- mov r3, r4 - - vmov q8, q9 @ next round tweak - -.Lxts_enc_done: -#ifndef XTS_CHAIN_TWEAK - adds r9, #0x10 - beq .Lxts_enc_ret - sub r6, r8, #0x10 - -.Lxts_enc_steal: - ldrb r0, [r7], #1 - ldrb r1, [r8, #-0x10] - strb r0, [r8, #-0x10] - strb r1, [r8], #1 - - subs r9, #1 - bhi .Lxts_enc_steal - - vld1.8 {q0}, [r6] - mov r0, sp - veor q0, q0, q8 - mov r1, sp - vst1.8 {q0}, [sp,:128] - mov r2, r10 - mov r4, r3 @ preserve fp - - bl AES_encrypt - - vld1.8 {q0}, [sp,:128] - veor q0, q0, q8 - vst1.8 {q0}, [r6] - mov r3, r4 -#endif - -.Lxts_enc_ret: - bic r0, r3, #0xf - vmov.i32 q0, #0 - vmov.i32 q1, #0 -#ifdef XTS_CHAIN_TWEAK - ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak -#endif -.Lxts_enc_bzero:@ wipe key schedule [if any] - vstmia sp!, {q0,q1} - cmp sp, r0 - bne .Lxts_enc_bzero - - mov sp, r3 -#ifdef XTS_CHAIN_TWEAK - vst1.8 {q8}, [r1] -#endif - VFP_ABI_POP - ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return - -.size bsaes_xts_encrypt,.-bsaes_xts_encrypt - -.globl bsaes_xts_decrypt -.hidden bsaes_xts_decrypt -.type bsaes_xts_decrypt,%function -.align 4 -bsaes_xts_decrypt: - mov ip, sp - stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} @ 0x20 - VFP_ABI_PUSH - mov r6, sp @ future r3 - - mov r7, r0 - mov r8, r1 - mov r9, r2 - mov r10, r3 - - sub r0, sp, #0x10 @ 0x10 - bic r0, #0xf @ align at 16 bytes - mov sp, r0 - -#ifdef XTS_CHAIN_TWEAK - ldr r0, [ip] @ pointer to input tweak -#else - @ generate initial tweak - ldr r0, [ip, #4] @ iv[] - mov r1, sp - ldr r2, [ip, #0] @ key2 - bl AES_encrypt - mov r0, sp @ pointer to initial tweak -#endif - - ldr r1, [r10, #240] @ get # of rounds - mov r3, r6 -#ifndef BSAES_ASM_EXTENDED_KEY - @ allocate the key schedule on the stack - sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key - @ add r12, #96 @ size of bit-sliced key schedule - sub r12, #48 @ place for tweak[9] - - @ populate the key schedule - mov r4, r10 @ pass key - mov r5, r1 @ pass # of rounds - mov sp, r12 - add r12, #0x90 @ pass key schedule - bl _bsaes_key_convert - add r4, sp, #0x90 - vldmia r4, {q6} - vstmia r12, {q15} @ save last round key - veor q7, q7, q6 @ fix up round 0 key - vstmia r4, {q7} -#else - ldr r12, [r10, #244] - eors r12, #1 - beq 0f - - str r12, [r10, #244] - mov r4, r10 @ pass key - mov r5, r1 @ pass # of rounds - add r12, r10, #248 @ pass key schedule - bl _bsaes_key_convert - add r4, r10, #248 - vldmia r4, {q6} - vstmia r12, {q15} @ save last round key - veor q7, q7, q6 @ fix up round 0 key - vstmia r4, {q7} - -.align 2 - sub sp, #0x90 @ place for tweak[9] -#endif - vld1.8 {q8}, [r0] @ initial tweak - adr r2, .Lxts_magic - -#ifndef XTS_CHAIN_TWEAK - tst r9, #0xf @ if not multiple of 16 - it ne @ Thumb2 thing, sanity check in ARM - subne r9, #0x10 @ subtract another 16 bytes -#endif - subs r9, #0x80 - - blo .Lxts_dec_short - b .Lxts_dec_loop - -.align 4 -.Lxts_dec_loop: - vldmia r2, {q5} @ load XTS magic - vshr.s64 q6, q8, #63 - mov r0, sp - vand q6, q6, q5 - vadd.u64 q9, q8, q8 - vst1.64 {q8}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q9, #63 - veor q9, q9, q6 - vand q7, q7, q5 - vadd.u64 q10, q9, q9 - vst1.64 {q9}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q10, #63 - veor q10, q10, q7 - vand q6, q6, q5 - vld1.8 {q0}, [r7]! - vadd.u64 q11, q10, q10 - vst1.64 {q10}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q11, #63 - veor q11, q11, q6 - vand q7, q7, q5 - vld1.8 {q1}, [r7]! - veor q0, q0, q8 - vadd.u64 q12, q11, q11 - vst1.64 {q11}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q12, #63 - veor q12, q12, q7 - vand q6, q6, q5 - vld1.8 {q2}, [r7]! 
- veor q1, q1, q9 - vadd.u64 q13, q12, q12 - vst1.64 {q12}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q13, #63 - veor q13, q13, q6 - vand q7, q7, q5 - vld1.8 {q3}, [r7]! - veor q2, q2, q10 - vadd.u64 q14, q13, q13 - vst1.64 {q13}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q14, #63 - veor q14, q14, q7 - vand q6, q6, q5 - vld1.8 {q4}, [r7]! - veor q3, q3, q11 - vadd.u64 q15, q14, q14 - vst1.64 {q14}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q15, #63 - veor q15, q15, q6 - vand q7, q7, q5 - vld1.8 {q5}, [r7]! - veor q4, q4, q12 - vadd.u64 q8, q15, q15 - vst1.64 {q15}, [r0,:128]! - vswp d15,d14 - veor q8, q8, q7 - vst1.64 {q8}, [r0,:128] @ next round tweak - - vld1.8 {q6,q7}, [r7]! - veor q5, q5, q13 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q6, q6, q14 - mov r5, r1 @ pass rounds - veor q7, q7, q15 - mov r0, sp - - bl _bsaes_decrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - vld1.64 {q12,q13}, [r0,:128]! - veor q1, q1, q9 - veor q8, q6, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q4, q11 - vld1.64 {q14,q15}, [r0,:128]! - veor q10, q2, q12 - vst1.8 {q8,q9}, [r8]! - veor q11, q7, q13 - veor q12, q3, q14 - vst1.8 {q10,q11}, [r8]! - veor q13, q5, q15 - vst1.8 {q12,q13}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - - subs r9, #0x80 - bpl .Lxts_dec_loop - -.Lxts_dec_short: - adds r9, #0x70 - bmi .Lxts_dec_done - - vldmia r2, {q5} @ load XTS magic - vshr.s64 q7, q8, #63 - mov r0, sp - vand q7, q7, q5 - vadd.u64 q9, q8, q8 - vst1.64 {q8}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q9, #63 - veor q9, q9, q7 - vand q6, q6, q5 - vadd.u64 q10, q9, q9 - vst1.64 {q9}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q10, #63 - veor q10, q10, q6 - vand q7, q7, q5 - vld1.8 {q0}, [r7]! - subs r9, #0x10 - bmi .Lxts_dec_1 - vadd.u64 q11, q10, q10 - vst1.64 {q10}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q11, #63 - veor q11, q11, q7 - vand q6, q6, q5 - vld1.8 {q1}, [r7]! - subs r9, #0x10 - bmi .Lxts_dec_2 - veor q0, q0, q8 - vadd.u64 q12, q11, q11 - vst1.64 {q11}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q12, #63 - veor q12, q12, q6 - vand q7, q7, q5 - vld1.8 {q2}, [r7]! - subs r9, #0x10 - bmi .Lxts_dec_3 - veor q1, q1, q9 - vadd.u64 q13, q12, q12 - vst1.64 {q12}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q13, #63 - veor q13, q13, q7 - vand q6, q6, q5 - vld1.8 {q3}, [r7]! - subs r9, #0x10 - bmi .Lxts_dec_4 - veor q2, q2, q10 - vadd.u64 q14, q13, q13 - vst1.64 {q13}, [r0,:128]! - vswp d13,d12 - vshr.s64 q7, q14, #63 - veor q14, q14, q6 - vand q7, q7, q5 - vld1.8 {q4}, [r7]! - subs r9, #0x10 - bmi .Lxts_dec_5 - veor q3, q3, q11 - vadd.u64 q15, q14, q14 - vst1.64 {q14}, [r0,:128]! - vswp d15,d14 - vshr.s64 q6, q15, #63 - veor q15, q15, q7 - vand q6, q6, q5 - vld1.8 {q5}, [r7]! - subs r9, #0x10 - bmi .Lxts_dec_6 - veor q4, q4, q12 - sub r9, #0x10 - vst1.64 {q15}, [r0,:128] @ next round tweak - - vld1.8 {q6}, [r7]! - veor q5, q5, q13 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q6, q6, q14 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_decrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - vld1.64 {q12,q13}, [r0,:128]! - veor q1, q1, q9 - veor q8, q6, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q4, q11 - vld1.64 {q14}, [r0,:128]! - veor q10, q2, q12 - vst1.8 {q8,q9}, [r8]! - veor q11, q7, q13 - veor q12, q3, q14 - vst1.8 {q10,q11}, [r8]! - vst1.8 {q12}, [r8]! 
- - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_dec_done -.align 4 -.Lxts_dec_6: - vst1.64 {q14}, [r0,:128] @ next round tweak - - veor q4, q4, q12 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q5, q5, q13 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_decrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - vld1.64 {q12,q13}, [r0,:128]! - veor q1, q1, q9 - veor q8, q6, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q4, q11 - veor q10, q2, q12 - vst1.8 {q8,q9}, [r8]! - veor q11, q7, q13 - vst1.8 {q10,q11}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_dec_done -.align 4 -.Lxts_dec_5: - veor q3, q3, q11 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q4, q4, q12 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_decrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - vld1.64 {q12}, [r0,:128]! - veor q1, q1, q9 - veor q8, q6, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q4, q11 - veor q10, q2, q12 - vst1.8 {q8,q9}, [r8]! - vst1.8 {q10}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_dec_done -.align 4 -.Lxts_dec_4: - veor q2, q2, q10 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q3, q3, q11 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_decrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10,q11}, [r0,:128]! - veor q0, q0, q8 - veor q1, q1, q9 - veor q8, q6, q10 - vst1.8 {q0,q1}, [r8]! - veor q9, q4, q11 - vst1.8 {q8,q9}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_dec_done -.align 4 -.Lxts_dec_3: - veor q1, q1, q9 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q2, q2, q10 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_decrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - vld1.64 {q10}, [r0,:128]! - veor q0, q0, q8 - veor q1, q1, q9 - veor q8, q6, q10 - vst1.8 {q0,q1}, [r8]! - vst1.8 {q8}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_dec_done -.align 4 -.Lxts_dec_2: - veor q0, q0, q8 -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x90 @ pass key schedule -#else - add r4, r10, #248 @ pass key schedule -#endif - veor q1, q1, q9 - mov r5, r1 @ pass rounds - mov r0, sp - - bl _bsaes_decrypt8 - - vld1.64 {q8,q9}, [r0,:128]! - veor q0, q0, q8 - veor q1, q1, q9 - vst1.8 {q0,q1}, [r8]! - - vld1.64 {q8}, [r0,:128] @ next round tweak - b .Lxts_dec_done -.align 4 -.Lxts_dec_1: - mov r0, sp - veor q0, q0, q8 - mov r1, sp - vst1.8 {q0}, [sp,:128] - mov r5, r2 @ preserve magic - mov r2, r10 - mov r4, r3 @ preserve fp - - bl AES_decrypt - - vld1.8 {q0}, [sp,:128] - veor q0, q0, q8 - vst1.8 {q0}, [r8]! - mov r3, r4 - mov r2, r5 - - vmov q8, q9 @ next round tweak - -.Lxts_dec_done: -#ifndef XTS_CHAIN_TWEAK - adds r9, #0x10 - beq .Lxts_dec_ret - - @ calculate one round of extra tweak for the stolen ciphertext - vldmia r2, {q5} - vshr.s64 q6, q8, #63 - vand q6, q6, q5 - vadd.u64 q9, q8, q8 - vswp d13,d12 - veor q9, q9, q6 - - @ perform the final decryption with the last tweak value - vld1.8 {q0}, [r7]! 
- mov r0, sp - veor q0, q0, q9 - mov r1, sp - vst1.8 {q0}, [sp,:128] - mov r2, r10 - mov r4, r3 @ preserve fp - - bl AES_decrypt - - vld1.8 {q0}, [sp,:128] - veor q0, q0, q9 - vst1.8 {q0}, [r8] - - mov r6, r8 -.Lxts_dec_steal: - ldrb r1, [r8] - ldrb r0, [r7], #1 - strb r1, [r8, #0x10] - strb r0, [r8], #1 - - subs r9, #1 - bhi .Lxts_dec_steal - - vld1.8 {q0}, [r6] - mov r0, sp - veor q0, q8 - mov r1, sp - vst1.8 {q0}, [sp,:128] - mov r2, r10 - - bl AES_decrypt - - vld1.8 {q0}, [sp,:128] - veor q0, q0, q8 - vst1.8 {q0}, [r6] - mov r3, r4 -#endif - -.Lxts_dec_ret: - bic r0, r3, #0xf - vmov.i32 q0, #0 - vmov.i32 q1, #0 -#ifdef XTS_CHAIN_TWEAK - ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak -#endif -.Lxts_dec_bzero:@ wipe key schedule [if any] - vstmia sp!, {q0,q1} - cmp sp, r0 - bne .Lxts_dec_bzero - - mov sp, r3 -#ifdef XTS_CHAIN_TWEAK - vst1.8 {q8}, [r1] -#endif - VFP_ABI_POP - ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return - -.size bsaes_xts_decrypt,.-bsaes_xts_decrypt #endif #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/ghash-armv4.S b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/ghash-armv4.S index 5f8b50d551..42cce5831f 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/ghash-armv4.S +++ b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/ghash-armv4.S @@ -1,9 +1,30 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__arm__) +#if defined(BORINGSSL_PREFIX) +#include +#endif #include +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL +@ instructions are in aesv8-armx.pl.) +.arch armv7-a + .text #if defined(__thumb2__) || defined(__clang__) .syntax unified +#define ldrplb ldrbpl +#define ldrneb ldrbne #endif #if defined(__thumb2__) .thumb @@ -11,11 +32,6 @@ .code 32 #endif -#ifdef __clang__ -#define ldrplb ldrbpl -#define ldrneb ldrbne -#endif - .type rem_4bit,%object .align 5 rem_4bit: @@ -571,3 +587,5 @@ gcm_ghash_neon: .align 2 .align 2 #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/ghashv8-armx32.S b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/ghashv8-armx32.S index e83a9c7313..d6842945f0 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/ghashv8-armx32.S +++ b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/ghashv8-armx32.S @@ -1,4 +1,18 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__arm__) +#if defined(BORINGSSL_PREFIX) +#include +#endif #include .text @@ -109,13 +123,13 @@ gcm_ghash_v8: @ loaded value would have @ to be rotated in order to @ make it appear as in - @ alorithm specification + @ algorithm specification subs r3,r3,#32 @ see if r3 is 32 or larger mov r12,#16 @ r12 is used as post- @ increment for input pointer; @ as loop is modulo-scheduled @ r12 is zeroed just in time - @ to preclude oversteping + @ to preclude overstepping @ inp[len], which means that @ last block[s] are actually @ loaded twice, but last @@ -235,3 +249,5 @@ gcm_ghash_v8: .align 2 .align 2 #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/sha1-armv4-large.S b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/sha1-armv4-large.S index a5d88f71e2..61deddf8e7 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/sha1-armv4-large.S +++ b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/sha1-armv4-large.S @@ -1,4 +1,18 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__arm__) +#if defined(BORINGSSL_PREFIX) +#include +#endif #include .text @@ -1493,3 +1507,5 @@ sha1_block_data_order_armv8: .hidden OPENSSL_armcap_P #endif #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/sha256-armv4.S b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/sha256-armv4.S index f37fd7c7cf..aee04785c0 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/sha256-armv4.S +++ b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/sha256-armv4.S @@ -1,4 +1,18 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__arm__) +#if defined(BORINGSSL_PREFIX) +#include +#endif @ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. @ @ Licensed under the OpenSSL license (the "License"). You may not use @@ -51,6 +65,11 @@ # define __ARM_MAX_ARCH__ 7 #endif +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those +@ instructions are manually-encoded. (See unsha256.) 
+.arch armv7-a + .text #if defined(__thumb2__) .syntax unified @@ -2816,3 +2835,5 @@ sha256_block_data_order_armv8: .hidden OPENSSL_armcap_P #endif #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/sha512-armv4.S b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/sha512-armv4.S index bbeddf9220..a06d41fee5 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/sha512-armv4.S +++ b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/sha512-armv4.S @@ -1,4 +1,18 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) #if defined(__arm__) +#if defined(BORINGSSL_PREFIX) +#include +#endif @ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. @ @ Licensed under the OpenSSL license (the "License"). You may not use @@ -64,6 +78,10 @@ # define VFP_ABI_POP #endif +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. +.arch armv7-a + #ifdef __ARMEL__ # define LO 0 # define HI 4 @@ -1872,3 +1890,5 @@ sha512_block_data_order_neon: .hidden OPENSSL_armcap_P #endif #endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/vpaes-armv7.S b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/vpaes-armv7.S new file mode 100644 index 0000000000..e5ad6ed99b --- /dev/null +++ b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/vpaes-armv7.S @@ -0,0 +1,1236 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(__arm__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.syntax unified + +.arch armv7-a +.fpu neon + +#if defined(__thumb2__) +.thumb +#else +.code 32 +#endif + +.text + +.type _vpaes_consts,%object +.align 7 @ totally strategic alignment +_vpaes_consts: +.Lk_mc_forward:@ mc_forward +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 +.Lk_mc_backward:@ mc_backward +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F +.Lk_sr:@ sr +.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +@ +@ "Hot" constants +@ +.Lk_inv:@ inv, inva +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 +.Lk_ipt:@ input transform (lo, hi) +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 +.Lk_sbo:@ sbou, sbot +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA +.Lk_sb1:@ sb1u, sb1t +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.Lk_sb2:@ sb2u, sb2t +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,55,32,78,69,79,78,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.align 2 +.size _vpaes_consts,.-_vpaes_consts +.align 6 +@@ +@@ _aes_preheat +@@ +@@ Fills q9-q15 as specified below. +@@ +.type _vpaes_preheat,%function +.align 4 +_vpaes_preheat: + adr r10, .Lk_inv + vmov.i8 q9, #0x0f @ .Lk_s0F + vld1.64 {q10,q11}, [r10]! @ .Lk_inv + add r10, r10, #64 @ Skip .Lk_ipt, .Lk_sbo + vld1.64 {q12,q13}, [r10]! @ .Lk_sb1 + vld1.64 {q14,q15}, [r10] @ .Lk_sb2 + bx lr + +@@ +@@ _aes_encrypt_core +@@ +@@ AES-encrypt q0. +@@ +@@ Inputs: +@@ q0 = input +@@ q9-q15 as in _vpaes_preheat +@@ [r2] = scheduled keys +@@ +@@ Output in q0 +@@ Clobbers q1-q5, r8-r11 +@@ Preserves q6-q8 so you get some local vectors +@@ +@@ +.type _vpaes_encrypt_core,%function +.align 4 +_vpaes_encrypt_core: + mov r9, r2 + ldr r8, [r2,#240] @ pull rounds + adr r11, .Lk_ipt + @ vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + @ vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + vld1.64 {q2, q3}, [r11] + adr r11, .Lk_mc_forward+16 + vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 # round0 key + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 + vtbl.8 d2, {q2}, d2 @ vpshufb %xmm1, %xmm2, %xmm1 + vtbl.8 d3, {q2}, d3 + vtbl.8 d4, {q3}, d0 @ vpshufb %xmm0, %xmm3, %xmm2 + vtbl.8 d5, {q3}, d1 + veor q0, q1, q5 @ vpxor %xmm5, %xmm1, %xmm0 + veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 + + @ .Lenc_entry ends with a bnz instruction which is normally paired with + @ subs in .Lenc_loop. 
+ tst r8, r8 + b .Lenc_entry + +.align 4 +.Lenc_loop: + @ middle of middle round + add r10, r11, #0x40 + vtbl.8 d8, {q13}, d4 @ vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + vtbl.8 d9, {q13}, d5 + vld1.64 {q1}, [r11]! @ vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + vtbl.8 d0, {q12}, d6 @ vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + vtbl.8 d1, {q12}, d7 + veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + vtbl.8 d10, {q15}, d4 @ vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + vtbl.8 d11, {q15}, d5 + veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A + vtbl.8 d4, {q14}, d6 @ vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + vtbl.8 d5, {q14}, d7 + vld1.64 {q4}, [r10] @ vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + vtbl.8 d6, {q0}, d2 @ vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + vtbl.8 d7, {q0}, d3 + veor q2, q2, q5 @ vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + @ Write to q5 instead of q0, so the table and destination registers do + @ not overlap. + vtbl.8 d10, {q0}, d8 @ vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + vtbl.8 d11, {q0}, d9 + veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + vtbl.8 d8, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + vtbl.8 d9, {q3}, d3 + @ Here we restore the original q0/q5 usage. + veor q0, q5, q3 @ vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + and r11, r11, #~(1<<6) @ and $0x30, %r11 # ... mod 4 + veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + subs r8, r8, #1 @ nr-- + +.Lenc_entry: + @ top of round + vand q1, q0, q9 @ vpand %xmm0, %xmm9, %xmm1 # 0 = k + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i + vtbl.8 d10, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + vtbl.8 d11, {q11}, d3 + veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j + vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + vtbl.8 d7, {q10}, d1 + vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + vtbl.8 d9, {q10}, d3 + veor q3, q3, q5 @ vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + vtbl.8 d5, {q10}, d7 + vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + vtbl.8 d7, {q10}, d9 + veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io + veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 + bne .Lenc_loop + + @ middle of last round + add r10, r11, #0x80 + + adr r11, .Lk_sbo + @ Read to q1 instead of q4, so the vtbl.8 instruction below does not + @ overlap table and destination registers. + vld1.64 {q1}, [r11]! @ vmovdqa -0x60(%r10), %xmm4 # 3 : sbou + vld1.64 {q0}, [r11] @ vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + vtbl.8 d9, {q1}, d5 + vld1.64 {q1}, [r10] @ vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + @ Write to q2 instead of q0 below, to avoid overlapping table and + @ destination registers. + vtbl.8 d4, {q0}, d6 @ vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + vtbl.8 d5, {q0}, d7 + veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + veor q2, q2, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A + @ Here we restore the original q0/q2 usage. + vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0 + vtbl.8 d1, {q2}, d3 + bx lr +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core + +.globl vpaes_encrypt +.hidden vpaes_encrypt +.type vpaes_encrypt,%function +.align 4 +vpaes_encrypt: + @ _vpaes_encrypt_core uses r8-r11. Round up to r7-r11 to maintain stack + @ alignment. 
+ stmdb sp!, {r7,r8,r9,r10,r11,lr} + @ _vpaes_encrypt_core uses q4-q5 (d8-d11), which are callee-saved. + vstmdb sp!, {d8,d9,d10,d11} + + vld1.64 {q0}, [r0] + bl _vpaes_preheat + bl _vpaes_encrypt_core + vst1.64 {q0}, [r1] + + vldmia sp!, {d8,d9,d10,d11} + ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return +.size vpaes_encrypt,.-vpaes_encrypt + +@ +@ Decryption stuff +@ +.type _vpaes_decrypt_consts,%object +.align 4 +_vpaes_decrypt_consts: +.Lk_dipt:@ decryption input transform +.quad 0x0F505B040B545F00, 0x154A411E114E451A +.quad 0x86E383E660056500, 0x12771772F491F194 +.Lk_dsbo:@ decryption sbox final output +.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D +.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C +.Lk_dsb9:@ decryption sbox output *9*u, *9*t +.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 +.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 +.Lk_dsbd:@ decryption sbox output *D*u, *D*t +.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 +.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 +.Lk_dsbb:@ decryption sbox output *B*u, *B*t +.quad 0xD022649296B44200, 0x602646F6B0F2D404 +.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B +.Lk_dsbe:@ decryption sbox output *E*u, *E*t +.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 +.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 +.size _vpaes_decrypt_consts,.-_vpaes_decrypt_consts + +@@ +@@ Decryption core +@@ +@@ Same API as encryption core, except it clobbers q12-q15 rather than using +@@ the values from _vpaes_preheat. q9-q11 must still be set from +@@ _vpaes_preheat. +@@ +.type _vpaes_decrypt_core,%function +.align 4 +_vpaes_decrypt_core: + mov r9, r2 + ldr r8, [r2,#240] @ pull rounds + + @ This function performs shuffles with various constants. The x86_64 + @ version loads them on-demand into %xmm0-%xmm5. This does not work well + @ for ARMv7 because those registers are shuffle destinations. The ARMv8 + @ version preloads those constants into registers, but ARMv7 has half + @ the registers to work with. Instead, we load them on-demand into + @ q12-q15, registers normally use for preloaded constants. This is fine + @ because decryption doesn't use those constants. The values are + @ constant, so this does not interfere with potential 2x optimizations. + adr r7, .Lk_dipt + + vld1.64 {q12,q13}, [r7] @ vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo + lsl r11, r8, #4 @ mov %rax, %r11; shl $4, %r11 + eor r11, r11, #0x30 @ xor $0x30, %r11 + adr r10, .Lk_sr + and r11, r11, #0x30 @ and $0x30, %r11 + add r11, r11, r10 + adr r10, .Lk_mc_forward+48 + + vld1.64 {q4}, [r9]! @ vmovdqu (%r9), %xmm4 # round0 key + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 + vtbl.8 d4, {q12}, d2 @ vpshufb %xmm1, %xmm2, %xmm2 + vtbl.8 d5, {q12}, d3 + vld1.64 {q5}, [r10] @ vmovdqa .Lk_mc_forward+48(%rip), %xmm5 + @ vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi + vtbl.8 d0, {q13}, d0 @ vpshufb %xmm0, %xmm1, %xmm0 + vtbl.8 d1, {q13}, d1 + veor q2, q2, q4 @ vpxor %xmm4, %xmm2, %xmm2 + veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 + + @ .Ldec_entry ends with a bnz instruction which is normally paired with + @ subs in .Ldec_loop. + tst r8, r8 + b .Ldec_entry + +.align 4 +.Ldec_loop: +@ +@ Inverse mix columns +@ + + @ We load .Lk_dsb* into q12-q15 on-demand. See the comment at the top of + @ the function. + adr r10, .Lk_dsb9 + vld1.64 {q12,q13}, [r10]! @ vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + @ vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + @ Load sbd* ahead of time. + vld1.64 {q14,q15}, [r10]! 
@ vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + @ vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt + vtbl.8 d8, {q12}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u + vtbl.8 d9, {q12}, d5 + vtbl.8 d2, {q13}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + vtbl.8 d3, {q13}, d7 + veor q0, q4, q0 @ vpxor %xmm4, %xmm0, %xmm0 + + veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + + @ Load sbb* ahead of time. + vld1.64 {q12,q13}, [r10]! @ vmovdqa 0x20(%r10),%xmm4 # 4 : sbbu + @ vmovdqa 0x30(%r10),%xmm1 # 0 : sbbt + + vtbl.8 d8, {q14}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + vtbl.8 d9, {q14}, d5 + @ Write to q1 instead of q0, so the table and destination registers do + @ not overlap. + vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch + vtbl.8 d3, {q0}, d11 + @ Here we restore the original q0/q1 usage. This instruction is + @ reordered from the ARMv8 version so we do not clobber the vtbl.8 + @ below. + veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + vtbl.8 d2, {q15}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + vtbl.8 d3, {q15}, d7 + @ vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + @ vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt + + @ Load sbd* ahead of time. + vld1.64 {q14,q15}, [r10]! @ vmovdqa 0x40(%r10),%xmm4 # 4 : sbeu + @ vmovdqa 0x50(%r10),%xmm1 # 0 : sbet + + vtbl.8 d8, {q12}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + vtbl.8 d9, {q12}, d5 + @ Write to q1 instead of q0, so the table and destination registers do + @ not overlap. + vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch + vtbl.8 d3, {q0}, d11 + @ Here we restore the original q0/q1 usage. This instruction is + @ reordered from the ARMv8 version so we do not clobber the vtbl.8 + @ below. + veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + vtbl.8 d2, {q13}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + vtbl.8 d3, {q13}, d7 + veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + + vtbl.8 d8, {q14}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + vtbl.8 d9, {q14}, d5 + @ Write to q1 instead of q0, so the table and destination registers do + @ not overlap. + vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch + vtbl.8 d3, {q0}, d11 + @ Here we restore the original q0/q1 usage. This instruction is + @ reordered from the ARMv8 version so we do not clobber the vtbl.8 + @ below. + veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + vtbl.8 d2, {q15}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + vtbl.8 d3, {q15}, d7 + vext.8 q5, q5, q5, #12 @ vpalignr $12, %xmm5, %xmm5, %xmm5 + veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + subs r8, r8, #1 @ sub $1,%rax # nr-- + +.Ldec_entry: + @ top of round + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i + vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + vtbl.8 d5, {q11}, d3 + veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j + vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + vtbl.8 d7, {q10}, d1 + vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + vtbl.8 d9, {q10}, d3 + veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + vtbl.8 d5, {q10}, d7 + vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + vtbl.8 d7, {q10}, d9 + veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io + veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + vld1.64 {q0}, [r9]! 
@ vmovdqu (%r9), %xmm0 + bne .Ldec_loop + + @ middle of last round + + adr r10, .Lk_dsbo + + @ Write to q1 rather than q4 to avoid overlapping table and destination. + vld1.64 {q1}, [r10]! @ vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + vtbl.8 d9, {q1}, d5 + @ Write to q2 rather than q1 to avoid overlapping table and destination. + vld1.64 {q2}, [r10] @ vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + vtbl.8 d2, {q2}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + vtbl.8 d3, {q2}, d7 + vld1.64 {q2}, [r11] @ vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 + veor q4, q4, q0 @ vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + @ Write to q1 rather than q0 so the table and destination registers + @ below do not overlap. + veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm0 # 0 = A + vtbl.8 d0, {q1}, d4 @ vpshufb %xmm2, %xmm0, %xmm0 + vtbl.8 d1, {q1}, d5 + bx lr +.size _vpaes_decrypt_core,.-_vpaes_decrypt_core + +.globl vpaes_decrypt +.hidden vpaes_decrypt +.type vpaes_decrypt,%function +.align 4 +vpaes_decrypt: + @ _vpaes_decrypt_core uses r7-r11. + stmdb sp!, {r7,r8,r9,r10,r11,lr} + @ _vpaes_decrypt_core uses q4-q5 (d8-d11), which are callee-saved. + vstmdb sp!, {d8,d9,d10,d11} + + vld1.64 {q0}, [r0] + bl _vpaes_preheat + bl _vpaes_decrypt_core + vst1.64 {q0}, [r1] + + vldmia sp!, {d8,d9,d10,d11} + ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return +.size vpaes_decrypt,.-vpaes_decrypt +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +@@ @@ +@@ AES key schedule @@ +@@ @@ +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + +@ This function diverges from both x86_64 and armv7 in which constants are +@ pinned. x86_64 has a common preheat function for all operations. aarch64 +@ separates them because it has enough registers to pin nearly all constants. +@ armv7 does not have enough registers, but needing explicit loads and stores +@ also complicates using x86_64's register allocation directly. +@ +@ We pin some constants for convenience and leave q14 and q15 free to load +@ others on demand. + +@ +@ Key schedule constants +@ +.type _vpaes_key_consts,%object +.align 4 +_vpaes_key_consts: +.Lk_dksd:@ decryption key schedule: invskew x*D +.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 +.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +.Lk_dksb:@ decryption key schedule: invskew x*B +.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 +.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +.Lk_dkse:@ decryption key schedule: invskew x*E + 0x63 +.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 +.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +.Lk_dks9:@ decryption key schedule: invskew x*9 +.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC +.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + +.Lk_rcon:@ rcon +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_opt:@ output transform +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 +.Lk_deskew:@ deskew tables: inverts the sbox's "skew" +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 +.size _vpaes_key_consts,.-_vpaes_key_consts + +.type _vpaes_key_preheat,%function +.align 4 +_vpaes_key_preheat: + adr r11, .Lk_rcon + vmov.i8 q12, #0x5b @ .Lk_s63 + adr r10, .Lk_inv @ Must be aligned to 8 mod 16. 
+ vmov.i8 q9, #0x0f @ .Lk_s0F + vld1.64 {q10,q11}, [r10] @ .Lk_inv + vld1.64 {q8}, [r11] @ .Lk_rcon + bx lr +.size _vpaes_key_preheat,.-_vpaes_key_preheat + +.type _vpaes_schedule_core,%function +.align 4 +_vpaes_schedule_core: + @ We only need to save lr, but ARM requires an 8-byte stack alignment, + @ so save an extra register. + stmdb sp!, {r3,lr} + + bl _vpaes_key_preheat @ load the tables + + adr r11, .Lk_ipt @ Must be aligned to 8 mod 16. + vld1.64 {q0}, [r0]! @ vmovdqu (%rdi), %xmm0 # load key (unaligned) + + @ input transform + @ Use q4 here rather than q3 so .Lschedule_am_decrypting does not + @ overlap table and destination. + vmov q4, q0 @ vmovdqa %xmm0, %xmm3 + bl _vpaes_schedule_transform + adr r10, .Lk_sr @ Must be aligned to 8 mod 16. + vmov q7, q0 @ vmovdqa %xmm0, %xmm7 + + add r8, r8, r10 + tst r3, r3 + bne .Lschedule_am_decrypting + + @ encrypting, output zeroth round key after transform + vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx) + b .Lschedule_go + +.Lschedule_am_decrypting: + @ decrypting, output zeroth round key after shiftrows + vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 + vtbl.8 d6, {q4}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 + vtbl.8 d7, {q4}, d3 + vst1.64 {q3}, [r2] @ vmovdqu %xmm3, (%rdx) + eor r8, r8, #0x30 @ xor $0x30, %r8 + +.Lschedule_go: + cmp r1, #192 @ cmp $192, %esi + bhi .Lschedule_256 + beq .Lschedule_192 + @ 128: fall though + +@@ +@@ .schedule_128 +@@ +@@ 128-bit specific part of key schedule. +@@ +@@ This schedule is really simple, because all its parts +@@ are accomplished by the subroutines. +@@ +.Lschedule_128: + mov r0, #10 @ mov $10, %esi + +.Loop_schedule_128: + bl _vpaes_schedule_round + subs r0, r0, #1 @ dec %esi + beq .Lschedule_mangle_last + bl _vpaes_schedule_mangle @ write output + b .Loop_schedule_128 + +@@ +@@ .aes_schedule_192 +@@ +@@ 192-bit specific part of key schedule. +@@ +@@ The main body of this schedule is the same as the 128-bit +@@ schedule, but with more smearing. The long, high side is +@@ stored in q7 as before, and the short, low side is in +@@ the high bits of q6. +@@ +@@ This schedule is somewhat nastier, however, because each +@@ round produces 192 bits of key material, or 1.5 round keys. +@@ Therefore, on each cycle we do 2 rounds and produce 3 round +@@ keys. +@@ +.align 4 +.Lschedule_192: + sub r0, r0, #8 + vld1.64 {q0}, [r0] @ vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) + bl _vpaes_schedule_transform @ input transform + vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save short part + vmov.i8 d12, #0 @ vpxor %xmm4, %xmm4, %xmm4 # clear 4 + @ vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros + mov r0, #4 @ mov $4, %esi + +.Loop_schedule_192: + bl _vpaes_schedule_round + vext.8 q0, q6, q0, #8 @ vpalignr $8,%xmm6,%xmm0,%xmm0 + bl _vpaes_schedule_mangle @ save key n + bl _vpaes_schedule_192_smear + bl _vpaes_schedule_mangle @ save key n+1 + bl _vpaes_schedule_round + subs r0, r0, #1 @ dec %esi + beq .Lschedule_mangle_last + bl _vpaes_schedule_mangle @ save key n+2 + bl _vpaes_schedule_192_smear + b .Loop_schedule_192 + +@@ +@@ .aes_schedule_256 +@@ +@@ 256-bit specific part of key schedule. +@@ +@@ The structure here is very similar to the 128-bit +@@ schedule, but with an additional "low side" in +@@ q6. The low side's rounds are the same as the +@@ high side's, except no rcon and no rotation. 
+@@ +.align 4 +.Lschedule_256: + vld1.64 {q0}, [r0] @ vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + bl _vpaes_schedule_transform @ input transform + mov r0, #7 @ mov $7, %esi + +.Loop_schedule_256: + bl _vpaes_schedule_mangle @ output low result + vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + @ high round + bl _vpaes_schedule_round + subs r0, r0, #1 @ dec %esi + beq .Lschedule_mangle_last + bl _vpaes_schedule_mangle + + @ low round. swap xmm7 and xmm6 + vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0 + vmov.i8 q4, #0 + vmov q5, q7 @ vmovdqa %xmm7, %xmm5 + vmov q7, q6 @ vmovdqa %xmm6, %xmm7 + bl _vpaes_schedule_low_round + vmov q7, q5 @ vmovdqa %xmm5, %xmm7 + + b .Loop_schedule_256 + +@@ +@@ .aes_schedule_mangle_last +@@ +@@ Mangler for last round of key schedule +@@ Mangles q0 +@@ when encrypting, outputs out(q0) ^ 63 +@@ when decrypting, outputs unskew(q0) +@@ +@@ Always called right before return... jumps to cleanup and exits +@@ +.align 4 +.Lschedule_mangle_last: + @ schedule last round key from xmm0 + adr r11, .Lk_deskew @ lea .Lk_deskew(%rip),%r11 # prepare to deskew + tst r3, r3 + bne .Lschedule_mangle_last_dec + + @ encrypting + vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10),%xmm1 + adr r11, .Lk_opt @ lea .Lk_opt(%rip), %r11 # prepare to output transform + add r2, r2, #32 @ add $32, %rdx + vmov q2, q0 + vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0 # output permute + vtbl.8 d1, {q2}, d3 + +.Lschedule_mangle_last_dec: + sub r2, r2, #16 @ add $-16, %rdx + veor q0, q0, q12 @ vpxor .Lk_s63(%rip), %xmm0, %xmm0 + bl _vpaes_schedule_transform @ output transform + vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx) # save last key + + @ cleanup + veor q0, q0, q0 @ vpxor %xmm0, %xmm0, %xmm0 + veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1 + veor q2, q2, q2 @ vpxor %xmm2, %xmm2, %xmm2 + veor q3, q3, q3 @ vpxor %xmm3, %xmm3, %xmm3 + veor q4, q4, q4 @ vpxor %xmm4, %xmm4, %xmm4 + veor q5, q5, q5 @ vpxor %xmm5, %xmm5, %xmm5 + veor q6, q6, q6 @ vpxor %xmm6, %xmm6, %xmm6 + veor q7, q7, q7 @ vpxor %xmm7, %xmm7, %xmm7 + ldmia sp!, {r3,pc} @ return +.size _vpaes_schedule_core,.-_vpaes_schedule_core + +@@ +@@ .aes_schedule_192_smear +@@ +@@ Smear the short, low side in the 192-bit key schedule. +@@ +@@ Inputs: +@@ q7: high side, b a x y +@@ q6: low side, d c 0 0 +@@ +@@ Outputs: +@@ q6: b+c+d b+c 0 0 +@@ q0: b+c+d b+c b a +@@ +.type _vpaes_schedule_192_smear,%function +.align 4 +_vpaes_schedule_192_smear: + vmov.i8 q1, #0 + vdup.32 q0, d15[1] + vshl.i64 q1, q6, #32 @ vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 + vmov d0, d15 @ vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a + veor q6, q6, q1 @ vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 + veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1 + veor q6, q6, q0 @ vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a + vmov q0, q6 @ vmovdqa %xmm6, %xmm0 + vmov d12, d2 @ vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros + bx lr +.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear + +@@ +@@ .aes_schedule_round +@@ +@@ Runs one main round of the key schedule on q0, q7 +@@ +@@ Specifically, runs subbytes on the high dword of q0 +@@ then rotates it by one byte and xors into the low dword of +@@ q7. +@@ +@@ Adds rcon from low byte of q8, then rotates q8 for +@@ next rcon. +@@ +@@ Smears the dwords of q7 by xoring the low into the +@@ second low, result into third, result into highest. +@@ +@@ Returns results in q7 = q0. +@@ Clobbers q1-q4, r11. 
+@@ +.type _vpaes_schedule_round,%function +.align 4 +_vpaes_schedule_round: + @ extract rcon from xmm8 + vmov.i8 q4, #0 @ vpxor %xmm4, %xmm4, %xmm4 + vext.8 q1, q8, q4, #15 @ vpalignr $15, %xmm8, %xmm4, %xmm1 + vext.8 q8, q8, q8, #15 @ vpalignr $15, %xmm8, %xmm8, %xmm8 + veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 + + @ rotate + vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0 + vext.8 q0, q0, q0, #1 @ vpalignr $1, %xmm0, %xmm0, %xmm0 + + @ fall through... + + @ low round: same as high round, but no rotation and no rcon. +_vpaes_schedule_low_round: + @ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12. + @ We pin other values in _vpaes_key_preheat, so load them now. + adr r11, .Lk_sb1 + vld1.64 {q14,q15}, [r11] + + @ smear xmm7 + vext.8 q1, q4, q7, #12 @ vpslldq $4, %xmm7, %xmm1 + veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 + vext.8 q4, q4, q7, #8 @ vpslldq $8, %xmm7, %xmm4 + + @ subbytes + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i + veor q7, q7, q4 @ vpxor %xmm4, %xmm7, %xmm7 + vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + vtbl.8 d5, {q11}, d3 + veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j + vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + vtbl.8 d7, {q10}, d1 + veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + vtbl.8 d9, {q10}, d3 + veor q7, q7, q12 @ vpxor .Lk_s63(%rip), %xmm7, %xmm7 + vtbl.8 d6, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak + vtbl.8 d7, {q10}, d7 + veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + vtbl.8 d4, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak + vtbl.8 d5, {q10}, d9 + veor q3, q3, q1 @ vpxor %xmm1, %xmm3, %xmm3 # 2 = io + veor q2, q2, q0 @ vpxor %xmm0, %xmm2, %xmm2 # 3 = jo + vtbl.8 d8, {q15}, d6 @ vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou + vtbl.8 d9, {q15}, d7 + vtbl.8 d2, {q14}, d4 @ vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t + vtbl.8 d3, {q14}, d5 + veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output + + @ add in smeared stuff + veor q0, q1, q7 @ vpxor %xmm7, %xmm1, %xmm0 + veor q7, q1, q7 @ vmovdqa %xmm0, %xmm7 + bx lr +.size _vpaes_schedule_round,.-_vpaes_schedule_round + +@@ +@@ .aes_schedule_transform +@@ +@@ Linear-transform q0 according to tables at [r11] +@@ +@@ Requires that q9 = 0x0F0F... as in preheat +@@ Output in q0 +@@ Clobbers q1, q2, q14, q15 +@@ +.type _vpaes_schedule_transform,%function +.align 4 +_vpaes_schedule_transform: + vld1.64 {q14,q15}, [r11] @ vmovdqa (%r11), %xmm2 # lo + @ vmovdqa 16(%r11), %xmm1 # hi + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 + vtbl.8 d4, {q14}, d2 @ vpshufb %xmm1, %xmm2, %xmm2 + vtbl.8 d5, {q14}, d3 + vtbl.8 d0, {q15}, d0 @ vpshufb %xmm0, %xmm1, %xmm0 + vtbl.8 d1, {q15}, d1 + veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 + bx lr +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform + +@@ +@@ .aes_schedule_mangle +@@ +@@ Mangles q0 from (basis-transformed) standard version +@@ to our version. 
+@@ +@@ On encrypt, +@@ xor with 0x63 +@@ multiply by circulant 0,1,1,1 +@@ apply shiftrows transform +@@ +@@ On decrypt, +@@ xor with 0x63 +@@ multiply by "inverse mixcolumns" circulant E,B,D,9 +@@ deskew +@@ apply shiftrows transform +@@ +@@ +@@ Writes out to [r2], and increments or decrements it +@@ Keeps track of round number mod 4 in r8 +@@ Preserves q0 +@@ Clobbers q1-q5 +@@ +.type _vpaes_schedule_mangle,%function +.align 4 +_vpaes_schedule_mangle: + tst r3, r3 + vmov q4, q0 @ vmovdqa %xmm0, %xmm4 # save xmm0 for later + adr r11, .Lk_mc_forward @ Must be aligned to 8 mod 16. + vld1.64 {q5}, [r11] @ vmovdqa .Lk_mc_forward(%rip),%xmm5 + bne .Lschedule_mangle_dec + + @ encrypting + @ Write to q2 so we do not overlap table and destination below. + veor q2, q0, q12 @ vpxor .Lk_s63(%rip), %xmm0, %xmm4 + add r2, r2, #16 @ add $16, %rdx + vtbl.8 d8, {q2}, d10 @ vpshufb %xmm5, %xmm4, %xmm4 + vtbl.8 d9, {q2}, d11 + vtbl.8 d2, {q4}, d10 @ vpshufb %xmm5, %xmm4, %xmm1 + vtbl.8 d3, {q4}, d11 + vtbl.8 d6, {q1}, d10 @ vpshufb %xmm5, %xmm1, %xmm3 + vtbl.8 d7, {q1}, d11 + veor q4, q4, q1 @ vpxor %xmm1, %xmm4, %xmm4 + vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 + veor q3, q3, q4 @ vpxor %xmm4, %xmm3, %xmm3 + + b .Lschedule_mangle_both +.align 4 +.Lschedule_mangle_dec: + @ inverse mix columns + adr r11, .Lk_dksd @ lea .Lk_dksd(%rip),%r11 + vshr.u8 q1, q4, #4 @ vpsrlb $4, %xmm4, %xmm1 # 1 = hi + vand q4, q4, q9 @ vpand %xmm9, %xmm4, %xmm4 # 4 = lo + + vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x00(%r11), %xmm2 + @ vmovdqa 0x10(%r11), %xmm3 + vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 + vtbl.8 d5, {q14}, d9 + vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 + vtbl.8 d7, {q15}, d3 + @ Load .Lk_dksb ahead of time. + vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x20(%r11), %xmm2 + @ vmovdqa 0x30(%r11), %xmm3 + @ Write to q13 so we do not overlap table and destination. + veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 + vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3 + vtbl.8 d7, {q13}, d11 + + vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 + vtbl.8 d5, {q14}, d9 + veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2 + vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 + vtbl.8 d7, {q15}, d3 + @ Load .Lk_dkse ahead of time. + vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x40(%r11), %xmm2 + @ vmovdqa 0x50(%r11), %xmm3 + @ Write to q13 so we do not overlap table and destination. + veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 + vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3 + vtbl.8 d7, {q13}, d11 + + vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 + vtbl.8 d5, {q14}, d9 + veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2 + vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 + vtbl.8 d7, {q15}, d3 + @ Load .Lk_dkse ahead of time. + vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x60(%r11), %xmm2 + @ vmovdqa 0x70(%r11), %xmm4 + @ Write to q13 so we do not overlap table and destination. + veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 + + vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 + vtbl.8 d5, {q14}, d9 + vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3 + vtbl.8 d7, {q13}, d11 + vtbl.8 d8, {q15}, d2 @ vpshufb %xmm1, %xmm4, %xmm4 + vtbl.8 d9, {q15}, d3 + vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 + veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2 + veor q3, q4, q2 @ vpxor %xmm2, %xmm4, %xmm3 + + sub r2, r2, #16 @ add $-16, %rdx + +.Lschedule_mangle_both: + @ Write to q2 so table and destination do not overlap. 
+ vtbl.8 d4, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 + vtbl.8 d5, {q3}, d3 + add r8, r8, #64-16 @ add $-16, %r8 + and r8, r8, #~(1<<6) @ and $0x30, %r8 + vst1.64 {q2}, [r2] @ vmovdqu %xmm3, (%rdx) + bx lr +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle + +.globl vpaes_set_encrypt_key +.hidden vpaes_set_encrypt_key +.type vpaes_set_encrypt_key,%function +.align 4 +vpaes_set_encrypt_key: + stmdb sp!, {r7,r8,r9,r10,r11, lr} + vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + + lsr r9, r1, #5 @ shr $5,%eax + add r9, r9, #5 @ $5,%eax + str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov r3, #0 @ mov $0,%ecx + mov r8, #0x30 @ mov $0x30,%r8d + bl _vpaes_schedule_core + eor r0, r0, r0 + + vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return +.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key + +.globl vpaes_set_decrypt_key +.hidden vpaes_set_decrypt_key +.type vpaes_set_decrypt_key,%function +.align 4 +vpaes_set_decrypt_key: + stmdb sp!, {r7,r8,r9,r10,r11, lr} + vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + + lsr r9, r1, #5 @ shr $5,%eax + add r9, r9, #5 @ $5,%eax + str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + lsl r9, r9, #4 @ shl $4,%eax + add r2, r2, #16 @ lea 16(%rdx,%rax),%rdx + add r2, r2, r9 + + mov r3, #1 @ mov $1,%ecx + lsr r8, r1, #1 @ shr $1,%r8d + and r8, r8, #32 @ and $32,%r8d + eor r8, r8, #32 @ xor $32,%r8d # nbits==192?0:32 + bl _vpaes_schedule_core + + vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return +.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key + +@ Additional constants for converting to bsaes. +.type _vpaes_convert_consts,%object +.align 4 +_vpaes_convert_consts: +@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear +@ transform in the AES S-box. 0x63 is incorporated into the low half of the +@ table. This was computed with the following script: +@ +@ def u64s_to_u128(x, y): +@ return x | (y << 64) +@ def u128_to_u64s(w): +@ return w & ((1<<64)-1), w >> 64 +@ def get_byte(w, i): +@ return (w >> (i*8)) & 0xff +@ def apply_table(table, b): +@ lo = b & 0xf +@ hi = b >> 4 +@ return get_byte(table[0], lo) ^ get_byte(table[1], hi) +@ def opt(b): +@ table = [ +@ u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808), +@ u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0), +@ ] +@ return apply_table(table, b) +@ def rot_byte(b, n): +@ return 0xff & ((b << n) | (b >> (8-n))) +@ def skew(x): +@ return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^ +@ rot_byte(x, 4)) +@ table = [0, 0] +@ for i in range(16): +@ table[0] |= (skew(opt(i)) ^ 0x63) << (i*8) +@ table[1] |= skew(opt(i<<4)) << (i*8) +@ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[0])) +@ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[1])) +.Lk_opt_then_skew: +.quad 0x9cb8436798bc4763, 0x6440bb9f6044bf9b +.quad 0x1f30062936192f00, 0xb49bad829db284ab + +@ .Lk_decrypt_transform is a permutation which performs an 8-bit left-rotation +@ followed by a byte-swap on each 32-bit word of a vector. E.g., 0x11223344 +@ becomes 0x22334411 and then 0x11443322. 
+.Lk_decrypt_transform: +.quad 0x0704050603000102, 0x0f0c0d0e0b08090a +.size _vpaes_convert_consts,.-_vpaes_convert_consts + +@ void vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes); +.globl vpaes_encrypt_key_to_bsaes +.hidden vpaes_encrypt_key_to_bsaes +.type vpaes_encrypt_key_to_bsaes,%function +.align 4 +vpaes_encrypt_key_to_bsaes: + stmdb sp!, {r11, lr} + + @ See _vpaes_schedule_core for the key schedule logic. In particular, + @ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper), + @ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last + @ contain the transformations not in the bsaes representation. This + @ function inverts those transforms. + @ + @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key + @ representation, which does not match the other aes_nohw_* + @ implementations. The ARM aes_nohw_* stores each 32-bit word + @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the + @ cost of extra REV and VREV32 operations in little-endian ARM. + + vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform + adr r2, .Lk_mc_forward @ Must be aligned to 8 mod 16. + add r3, r2, 0x90 @ .Lk_sr+0x10-.Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression) + + vld1.64 {q12}, [r2] + vmov.i8 q10, #0x5b @ .Lk_s63 from vpaes-x86_64 + adr r11, .Lk_opt @ Must be aligned to 8 mod 16. + vmov.i8 q11, #0x63 @ .LK_s63 without .Lk_ipt applied + + @ vpaes stores one fewer round count than bsaes, but the number of keys + @ is the same. + ldr r2, [r1,#240] + add r2, r2, #1 + str r2, [r0,#240] + + @ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt). + @ Invert this with .Lk_opt. + vld1.64 {q0}, [r1]! + bl _vpaes_schedule_transform + vrev32.8 q0, q0 + vst1.64 {q0}, [r0]! + + @ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied, + @ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63, + @ multiplies by the circulant 0,1,1,1, then applies ShiftRows. +.Loop_enc_key_to_bsaes: + vld1.64 {q0}, [r1]! + + @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle + @ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30. + @ We use r3 rather than r8 to avoid a callee-saved register. + vld1.64 {q1}, [r3] + vtbl.8 d4, {q0}, d2 + vtbl.8 d5, {q0}, d3 + add r3, r3, #16 + and r3, r3, #~(1<<6) + vmov q0, q2 + + @ Handle the last key differently. + subs r2, r2, #1 + beq .Loop_enc_key_to_bsaes_last + + @ Multiply by the circulant. This is its own inverse. + vtbl.8 d2, {q0}, d24 + vtbl.8 d3, {q0}, d25 + vmov q0, q1 + vtbl.8 d4, {q1}, d24 + vtbl.8 d5, {q1}, d25 + veor q0, q0, q2 + vtbl.8 d2, {q2}, d24 + vtbl.8 d3, {q2}, d25 + veor q0, q0, q1 + + @ XOR and finish. + veor q0, q0, q10 + bl _vpaes_schedule_transform + vrev32.8 q0, q0 + vst1.64 {q0}, [r0]! + b .Loop_enc_key_to_bsaes + +.Loop_enc_key_to_bsaes_last: + @ The final key does not have a basis transform (note + @ .Lschedule_mangle_last inverts the original transform). It only XORs + @ 0x63 and applies ShiftRows. The latter was already inverted in the + @ loop. Note that, because we act on the original representation, we use + @ q11, not q10. + veor q0, q0, q11 + vrev32.8 q0, q0 + vst1.64 {q0}, [r0] + + @ Wipe registers which contained key material. 
+ veor q0, q0, q0 + veor q1, q1, q1 + veor q2, q2, q2 + + ldmia sp!, {r11, pc} @ return +.size vpaes_encrypt_key_to_bsaes,.-vpaes_encrypt_key_to_bsaes + +@ void vpaes_decrypt_key_to_bsaes(AES_KEY *vpaes, const AES_KEY *bsaes); +.globl vpaes_decrypt_key_to_bsaes +.hidden vpaes_decrypt_key_to_bsaes +.type vpaes_decrypt_key_to_bsaes,%function +.align 4 +vpaes_decrypt_key_to_bsaes: + stmdb sp!, {r11, lr} + + @ See _vpaes_schedule_core for the key schedule logic. Note vpaes + @ computes the decryption key schedule in reverse. Additionally, + @ aes-x86_64.pl shares some transformations, so we must only partially + @ invert vpaes's transformations. In general, vpaes computes in a + @ different basis (.Lk_ipt and .Lk_opt) and applies the inverses of + @ MixColumns, ShiftRows, and the affine part of the AES S-box (which is + @ split into a linear skew and XOR of 0x63). We undo all but MixColumns. + @ + @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key + @ representation, which does not match the other aes_nohw_* + @ implementations. The ARM aes_nohw_* stores each 32-bit word + @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the + @ cost of extra REV and VREV32 operations in little-endian ARM. + + adr r2, .Lk_decrypt_transform + adr r3, .Lk_sr+0x30 + adr r11, .Lk_opt_then_skew @ Input to _vpaes_schedule_transform. + vld1.64 {q12}, [r2] @ Reuse q12 from encryption. + vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform + + @ vpaes stores one fewer round count than bsaes, but the number of keys + @ is the same. + ldr r2, [r1,#240] + add r2, r2, #1 + str r2, [r0,#240] + + @ Undo the basis change and reapply the S-box affine transform. See + @ .Lschedule_mangle_last. + vld1.64 {q0}, [r1]! + bl _vpaes_schedule_transform + vrev32.8 q0, q0 + vst1.64 {q0}, [r0]! + + @ See _vpaes_schedule_mangle for the transform on the middle keys. Note + @ it simultaneously inverts MixColumns and the S-box affine transform. + @ See .Lk_dksd through .Lk_dks9. +.Loop_dec_key_to_bsaes: + vld1.64 {q0}, [r1]! + + @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note going + @ forwards cancels inverting for which direction we cycle r3. We use r3 + @ rather than r8 to avoid a callee-saved register. + vld1.64 {q1}, [r3] + vtbl.8 d4, {q0}, d2 + vtbl.8 d5, {q0}, d3 + add r3, r3, #64-16 + and r3, r3, #~(1<<6) + vmov q0, q2 + + @ Handle the last key differently. + subs r2, r2, #1 + beq .Loop_dec_key_to_bsaes_last + + @ Undo the basis change and reapply the S-box affine transform. + bl _vpaes_schedule_transform + + @ Rotate each word by 8 bytes (cycle the rows) and then byte-swap. We + @ combine the two operations in .Lk_decrypt_transform. + @ + @ TODO(davidben): Where does the rotation come from? + vtbl.8 d2, {q0}, d24 + vtbl.8 d3, {q0}, d25 + + vst1.64 {q1}, [r0]! + b .Loop_dec_key_to_bsaes + +.Loop_dec_key_to_bsaes_last: + @ The final key only inverts ShiftRows (already done in the loop). See + @ .Lschedule_am_decrypting. Its basis is not transformed. + vrev32.8 q0, q0 + vst1.64 {q0}, [r0]! + + @ Wipe registers which contained key material. + veor q0, q0, q0 + veor q1, q1, q1 + veor q2, q2, q2 + + ldmia sp!, {r11, pc} @ return +.size vpaes_decrypt_key_to_bsaes,.-vpaes_decrypt_key_to_bsaes +.globl vpaes_ctr32_encrypt_blocks +.hidden vpaes_ctr32_encrypt_blocks +.type vpaes_ctr32_encrypt_blocks,%function +.align 4 +vpaes_ctr32_encrypt_blocks: + mov ip, sp + stmdb sp!, {r7,r8,r9,r10,r11, lr} + @ This function uses q4-q7 (d8-d15), which are callee-saved. 
+ vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + + cmp r2, #0 + @ r8 is passed on the stack. + ldr r8, [ip] + beq .Lctr32_done + + @ _vpaes_encrypt_core expects the key in r2, so swap r2 and r3. + mov r9, r3 + mov r3, r2 + mov r2, r9 + + @ Load the IV and counter portion. + ldr r7, [r8, #12] + vld1.8 {q7}, [r8] + + bl _vpaes_preheat + rev r7, r7 @ The counter is big-endian. + +.Lctr32_loop: + vmov q0, q7 + vld1.8 {q6}, [r0]! @ .Load input ahead of time + bl _vpaes_encrypt_core + veor q0, q0, q6 @ XOR input and result + vst1.8 {q0}, [r1]! + subs r3, r3, #1 + @ Update the counter. + add r7, r7, #1 + rev r9, r7 + vmov.32 d15[1], r9 + bne .Lctr32_loop + +.Lctr32_done: + vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return +.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks +#endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-arm/crypto/test/trampoline-armv4.S b/packager/third_party/boringssl/linux-arm/crypto/test/trampoline-armv4.S new file mode 100644 index 0000000000..5c788b3569 --- /dev/null +++ b/packager/third_party/boringssl/linux-arm/crypto/test/trampoline-armv4.S @@ -0,0 +1,380 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(__arm__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.syntax unified + +.arch armv7-a +.fpu vfp + +.text + +@ abi_test_trampoline loads callee-saved registers from |state|, calls |func| +@ with |argv|, then saves the callee-saved registers into |state|. It returns +@ the result of |func|. The |unwind| argument is unused. +@ uint32_t abi_test_trampoline(void (*func)(...), CallerState *state, +@ const uint32_t *argv, size_t argc, +@ int unwind); +.type abi_test_trampoline, %function +.globl abi_test_trampoline +.hidden abi_test_trampoline +.align 4 +abi_test_trampoline: +.Labi_test_trampoline_begin: + @ Save parameters and all callee-saved registers. For convenience, we + @ save r9 on iOS even though it's volatile. + vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + stmdb sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,lr} + + @ Reserve stack space for six (10-4) stack parameters, plus an extra 4 + @ bytes to keep it 8-byte-aligned (see AAPCS, section 5.3). + sub sp, sp, #28 + + @ Every register in AAPCS is either non-volatile or a parameter (except + @ r9 on iOS), so this code, by the actual call, loses all its scratch + @ registers. First fill in stack parameters while there are registers + @ to spare. + cmp r3, #4 + bls .Lstack_args_done + mov r4, sp @ r4 is the output pointer. + add r5, r2, r3, lsl #2 @ Set r5 to the end of argv. + add r2, r2, #16 @ Skip four arguments. +.Lstack_args_loop: + ldr r6, [r2], #4 + cmp r2, r5 + str r6, [r4], #4 + bne .Lstack_args_loop + +.Lstack_args_done: + @ Load registers from |r1|. + vldmia r1!, {d8,d9,d10,d11,d12,d13,d14,d15} +#if defined(__APPLE__) + @ r9 is not volatile on iOS. + ldmia r1!, {r4,r5,r6,r7,r8,r10-r11} +#else + ldmia r1!, {r4,r5,r6,r7,r8,r9,r10,r11} +#endif + + @ Load register parameters. This uses up our remaining registers, so we + @ repurpose lr as scratch space. + ldr r3, [sp, #40] @ Reload argc. + ldr lr, [sp, #36] @ .Load argv into lr. 
+ cmp r3, #3 + bhi .Larg_r3 + beq .Larg_r2 + cmp r3, #1 + bhi .Larg_r1 + beq .Larg_r0 + b .Largs_done + +.Larg_r3: + ldr r3, [lr, #12] @ argv[3] +.Larg_r2: + ldr r2, [lr, #8] @ argv[2] +.Larg_r1: + ldr r1, [lr, #4] @ argv[1] +.Larg_r0: + ldr r0, [lr] @ argv[0] +.Largs_done: + + @ With every other register in use, load the function pointer into lr + @ and call the function. + ldr lr, [sp, #28] + blx lr + + @ r1-r3 are free for use again. The trampoline only supports + @ single-return functions. Pass r4-r11 to the caller. + ldr r1, [sp, #32] + vstmia r1!, {d8,d9,d10,d11,d12,d13,d14,d15} +#if defined(__APPLE__) + @ r9 is not volatile on iOS. + stmia r1!, {r4,r5,r6,r7,r8,r10-r11} +#else + stmia r1!, {r4,r5,r6,r7,r8,r9,r10,r11} +#endif + + @ Unwind the stack and restore registers. + add sp, sp, #44 @ 44 = 28+16 + ldmia sp!, {r4,r5,r6,r7,r8,r9,r10,r11,lr} @ Skip r0-r3 (see +16 above). + vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + + bx lr +.size abi_test_trampoline,.-abi_test_trampoline +.type abi_test_clobber_r0, %function +.globl abi_test_clobber_r0 +.hidden abi_test_clobber_r0 +.align 4 +abi_test_clobber_r0: + mov r0, #0 + bx lr +.size abi_test_clobber_r0,.-abi_test_clobber_r0 +.type abi_test_clobber_r1, %function +.globl abi_test_clobber_r1 +.hidden abi_test_clobber_r1 +.align 4 +abi_test_clobber_r1: + mov r1, #0 + bx lr +.size abi_test_clobber_r1,.-abi_test_clobber_r1 +.type abi_test_clobber_r2, %function +.globl abi_test_clobber_r2 +.hidden abi_test_clobber_r2 +.align 4 +abi_test_clobber_r2: + mov r2, #0 + bx lr +.size abi_test_clobber_r2,.-abi_test_clobber_r2 +.type abi_test_clobber_r3, %function +.globl abi_test_clobber_r3 +.hidden abi_test_clobber_r3 +.align 4 +abi_test_clobber_r3: + mov r3, #0 + bx lr +.size abi_test_clobber_r3,.-abi_test_clobber_r3 +.type abi_test_clobber_r4, %function +.globl abi_test_clobber_r4 +.hidden abi_test_clobber_r4 +.align 4 +abi_test_clobber_r4: + mov r4, #0 + bx lr +.size abi_test_clobber_r4,.-abi_test_clobber_r4 +.type abi_test_clobber_r5, %function +.globl abi_test_clobber_r5 +.hidden abi_test_clobber_r5 +.align 4 +abi_test_clobber_r5: + mov r5, #0 + bx lr +.size abi_test_clobber_r5,.-abi_test_clobber_r5 +.type abi_test_clobber_r6, %function +.globl abi_test_clobber_r6 +.hidden abi_test_clobber_r6 +.align 4 +abi_test_clobber_r6: + mov r6, #0 + bx lr +.size abi_test_clobber_r6,.-abi_test_clobber_r6 +.type abi_test_clobber_r7, %function +.globl abi_test_clobber_r7 +.hidden abi_test_clobber_r7 +.align 4 +abi_test_clobber_r7: + mov r7, #0 + bx lr +.size abi_test_clobber_r7,.-abi_test_clobber_r7 +.type abi_test_clobber_r8, %function +.globl abi_test_clobber_r8 +.hidden abi_test_clobber_r8 +.align 4 +abi_test_clobber_r8: + mov r8, #0 + bx lr +.size abi_test_clobber_r8,.-abi_test_clobber_r8 +.type abi_test_clobber_r9, %function +.globl abi_test_clobber_r9 +.hidden abi_test_clobber_r9 +.align 4 +abi_test_clobber_r9: + mov r9, #0 + bx lr +.size abi_test_clobber_r9,.-abi_test_clobber_r9 +.type abi_test_clobber_r10, %function +.globl abi_test_clobber_r10 +.hidden abi_test_clobber_r10 +.align 4 +abi_test_clobber_r10: + mov r10, #0 + bx lr +.size abi_test_clobber_r10,.-abi_test_clobber_r10 +.type abi_test_clobber_r11, %function +.globl abi_test_clobber_r11 +.hidden abi_test_clobber_r11 +.align 4 +abi_test_clobber_r11: + mov r11, #0 + bx lr +.size abi_test_clobber_r11,.-abi_test_clobber_r11 +.type abi_test_clobber_r12, %function +.globl abi_test_clobber_r12 +.hidden abi_test_clobber_r12 +.align 4 +abi_test_clobber_r12: + mov r12, #0 + bx lr +.size 
abi_test_clobber_r12,.-abi_test_clobber_r12 +.type abi_test_clobber_d0, %function +.globl abi_test_clobber_d0 +.hidden abi_test_clobber_d0 +.align 4 +abi_test_clobber_d0: + mov r0, #0 + vmov s0, r0 + vmov s1, r0 + bx lr +.size abi_test_clobber_d0,.-abi_test_clobber_d0 +.type abi_test_clobber_d1, %function +.globl abi_test_clobber_d1 +.hidden abi_test_clobber_d1 +.align 4 +abi_test_clobber_d1: + mov r0, #0 + vmov s2, r0 + vmov s3, r0 + bx lr +.size abi_test_clobber_d1,.-abi_test_clobber_d1 +.type abi_test_clobber_d2, %function +.globl abi_test_clobber_d2 +.hidden abi_test_clobber_d2 +.align 4 +abi_test_clobber_d2: + mov r0, #0 + vmov s4, r0 + vmov s5, r0 + bx lr +.size abi_test_clobber_d2,.-abi_test_clobber_d2 +.type abi_test_clobber_d3, %function +.globl abi_test_clobber_d3 +.hidden abi_test_clobber_d3 +.align 4 +abi_test_clobber_d3: + mov r0, #0 + vmov s6, r0 + vmov s7, r0 + bx lr +.size abi_test_clobber_d3,.-abi_test_clobber_d3 +.type abi_test_clobber_d4, %function +.globl abi_test_clobber_d4 +.hidden abi_test_clobber_d4 +.align 4 +abi_test_clobber_d4: + mov r0, #0 + vmov s8, r0 + vmov s9, r0 + bx lr +.size abi_test_clobber_d4,.-abi_test_clobber_d4 +.type abi_test_clobber_d5, %function +.globl abi_test_clobber_d5 +.hidden abi_test_clobber_d5 +.align 4 +abi_test_clobber_d5: + mov r0, #0 + vmov s10, r0 + vmov s11, r0 + bx lr +.size abi_test_clobber_d5,.-abi_test_clobber_d5 +.type abi_test_clobber_d6, %function +.globl abi_test_clobber_d6 +.hidden abi_test_clobber_d6 +.align 4 +abi_test_clobber_d6: + mov r0, #0 + vmov s12, r0 + vmov s13, r0 + bx lr +.size abi_test_clobber_d6,.-abi_test_clobber_d6 +.type abi_test_clobber_d7, %function +.globl abi_test_clobber_d7 +.hidden abi_test_clobber_d7 +.align 4 +abi_test_clobber_d7: + mov r0, #0 + vmov s14, r0 + vmov s15, r0 + bx lr +.size abi_test_clobber_d7,.-abi_test_clobber_d7 +.type abi_test_clobber_d8, %function +.globl abi_test_clobber_d8 +.hidden abi_test_clobber_d8 +.align 4 +abi_test_clobber_d8: + mov r0, #0 + vmov s16, r0 + vmov s17, r0 + bx lr +.size abi_test_clobber_d8,.-abi_test_clobber_d8 +.type abi_test_clobber_d9, %function +.globl abi_test_clobber_d9 +.hidden abi_test_clobber_d9 +.align 4 +abi_test_clobber_d9: + mov r0, #0 + vmov s18, r0 + vmov s19, r0 + bx lr +.size abi_test_clobber_d9,.-abi_test_clobber_d9 +.type abi_test_clobber_d10, %function +.globl abi_test_clobber_d10 +.hidden abi_test_clobber_d10 +.align 4 +abi_test_clobber_d10: + mov r0, #0 + vmov s20, r0 + vmov s21, r0 + bx lr +.size abi_test_clobber_d10,.-abi_test_clobber_d10 +.type abi_test_clobber_d11, %function +.globl abi_test_clobber_d11 +.hidden abi_test_clobber_d11 +.align 4 +abi_test_clobber_d11: + mov r0, #0 + vmov s22, r0 + vmov s23, r0 + bx lr +.size abi_test_clobber_d11,.-abi_test_clobber_d11 +.type abi_test_clobber_d12, %function +.globl abi_test_clobber_d12 +.hidden abi_test_clobber_d12 +.align 4 +abi_test_clobber_d12: + mov r0, #0 + vmov s24, r0 + vmov s25, r0 + bx lr +.size abi_test_clobber_d12,.-abi_test_clobber_d12 +.type abi_test_clobber_d13, %function +.globl abi_test_clobber_d13 +.hidden abi_test_clobber_d13 +.align 4 +abi_test_clobber_d13: + mov r0, #0 + vmov s26, r0 + vmov s27, r0 + bx lr +.size abi_test_clobber_d13,.-abi_test_clobber_d13 +.type abi_test_clobber_d14, %function +.globl abi_test_clobber_d14 +.hidden abi_test_clobber_d14 +.align 4 +abi_test_clobber_d14: + mov r0, #0 + vmov s28, r0 + vmov s29, r0 + bx lr +.size abi_test_clobber_d14,.-abi_test_clobber_d14 +.type abi_test_clobber_d15, %function +.globl abi_test_clobber_d15 +.hidden 
abi_test_clobber_d15 +.align 4 +abi_test_clobber_d15: + mov r0, #0 + vmov s30, r0 + vmov s31, r0 + bx lr +.size abi_test_clobber_d15,.-abi_test_clobber_d15 +#endif +#endif // !OPENSSL_NO_ASM +.section .note.GNU-stack,"",%progbits diff --git a/packager/third_party/boringssl/linux-ppc64le/crypto/fipsmodule/aesp8-ppc.S b/packager/third_party/boringssl/linux-ppc64le/crypto/fipsmodule/aesp8-ppc.S new file mode 100644 index 0000000000..86b06fc2ef --- /dev/null +++ b/packager/third_party/boringssl/linux-ppc64le/crypto/fipsmodule/aesp8-ppc.S @@ -0,0 +1,3670 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + +#if !defined(OPENSSL_NO_ASM) && defined(__powerpc64__) +.machine "any" + +.abiversion 2 +.text + +.align 7 +.Lrcon: +.byte 0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01 +.byte 0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x1b +.byte 0x0c,0x0f,0x0e,0x0d,0x0c,0x0f,0x0e,0x0d,0x0c,0x0f,0x0e,0x0d,0x0c,0x0f,0x0e,0x0d +.byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.Lconsts: + mflr 0 + bcl 20,31,$+4 + mflr 6 + addi 6,6,-0x48 + mtlr 0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.byte 65,69,83,32,102,111,114,32,80,111,119,101,114,73,83,65,32,50,46,48,55,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 + +.globl aes_hw_set_encrypt_key +.type aes_hw_set_encrypt_key,@function +.align 5 +aes_hw_set_encrypt_key: +.localentry aes_hw_set_encrypt_key,0 + +.Lset_encrypt_key: + mflr 11 + std 11,16(1) + + li 6,-1 + cmpldi 3,0 + beq- .Lenc_key_abort + cmpldi 5,0 + beq- .Lenc_key_abort + li 6,-2 + cmpwi 4,128 + blt- .Lenc_key_abort + cmpwi 4,256 + bgt- .Lenc_key_abort + andi. 
0,4,0x3f + bne- .Lenc_key_abort + + lis 0,0xfff0 + li 12,-1 + or 0,0,0 + + bl .Lconsts + mtlr 11 + + neg 9,3 + lvx 1,0,3 + addi 3,3,15 + lvsr 3,0,9 + li 8,0x20 + cmpwi 4,192 + lvx 2,0,3 + vspltisb 5,0x0f + lvx 4,0,6 + vxor 3,3,5 + lvx 5,8,6 + addi 6,6,0x10 + vperm 1,1,2,3 + li 7,8 + vxor 0,0,0 + mtctr 7 + + lvsl 8,0,5 + vspltisb 9,-1 + lvx 10,0,5 + vperm 9,9,0,8 + + blt .Loop128 + addi 3,3,8 + beq .L192 + addi 3,3,8 + b .L256 + +.align 4 +.Loop128: + vperm 3,1,1,5 + vsldoi 6,0,1,12 + vperm 11,1,1,8 + vsel 7,10,11,9 + vor 10,11,11 + .long 0x10632509 + stvx 7,0,5 + addi 5,5,16 + + vxor 1,1,6 + vsldoi 6,0,6,12 + vxor 1,1,6 + vsldoi 6,0,6,12 + vxor 1,1,6 + vadduwm 4,4,4 + vxor 1,1,3 + bdnz .Loop128 + + lvx 4,0,6 + + vperm 3,1,1,5 + vsldoi 6,0,1,12 + vperm 11,1,1,8 + vsel 7,10,11,9 + vor 10,11,11 + .long 0x10632509 + stvx 7,0,5 + addi 5,5,16 + + vxor 1,1,6 + vsldoi 6,0,6,12 + vxor 1,1,6 + vsldoi 6,0,6,12 + vxor 1,1,6 + vadduwm 4,4,4 + vxor 1,1,3 + + vperm 3,1,1,5 + vsldoi 6,0,1,12 + vperm 11,1,1,8 + vsel 7,10,11,9 + vor 10,11,11 + .long 0x10632509 + stvx 7,0,5 + addi 5,5,16 + + vxor 1,1,6 + vsldoi 6,0,6,12 + vxor 1,1,6 + vsldoi 6,0,6,12 + vxor 1,1,6 + vxor 1,1,3 + vperm 11,1,1,8 + vsel 7,10,11,9 + vor 10,11,11 + stvx 7,0,5 + + addi 3,5,15 + addi 5,5,0x50 + + li 8,10 + b .Ldone + +.align 4 +.L192: + lvx 6,0,3 + li 7,4 + vperm 11,1,1,8 + vsel 7,10,11,9 + vor 10,11,11 + stvx 7,0,5 + addi 5,5,16 + vperm 2,2,6,3 + vspltisb 3,8 + mtctr 7 + vsububm 5,5,3 + +.Loop192: + vperm 3,2,2,5 + vsldoi 6,0,1,12 + .long 0x10632509 + + vxor 1,1,6 + vsldoi 6,0,6,12 + vxor 1,1,6 + vsldoi 6,0,6,12 + vxor 1,1,6 + + vsldoi 7,0,2,8 + vspltw 6,1,3 + vxor 6,6,2 + vsldoi 2,0,2,12 + vadduwm 4,4,4 + vxor 2,2,6 + vxor 1,1,3 + vxor 2,2,3 + vsldoi 7,7,1,8 + + vperm 3,2,2,5 + vsldoi 6,0,1,12 + vperm 11,7,7,8 + vsel 7,10,11,9 + vor 10,11,11 + .long 0x10632509 + stvx 7,0,5 + addi 5,5,16 + + vsldoi 7,1,2,8 + vxor 1,1,6 + vsldoi 6,0,6,12 + vperm 11,7,7,8 + vsel 7,10,11,9 + vor 10,11,11 + vxor 1,1,6 + vsldoi 6,0,6,12 + vxor 1,1,6 + stvx 7,0,5 + addi 5,5,16 + + vspltw 6,1,3 + vxor 6,6,2 + vsldoi 2,0,2,12 + vadduwm 4,4,4 + vxor 2,2,6 + vxor 1,1,3 + vxor 2,2,3 + vperm 11,1,1,8 + vsel 7,10,11,9 + vor 10,11,11 + stvx 7,0,5 + addi 3,5,15 + addi 5,5,16 + bdnz .Loop192 + + li 8,12 + addi 5,5,0x20 + b .Ldone + +.align 4 +.L256: + lvx 6,0,3 + li 7,7 + li 8,14 + vperm 11,1,1,8 + vsel 7,10,11,9 + vor 10,11,11 + stvx 7,0,5 + addi 5,5,16 + vperm 2,2,6,3 + mtctr 7 + +.Loop256: + vperm 3,2,2,5 + vsldoi 6,0,1,12 + vperm 11,2,2,8 + vsel 7,10,11,9 + vor 10,11,11 + .long 0x10632509 + stvx 7,0,5 + addi 5,5,16 + + vxor 1,1,6 + vsldoi 6,0,6,12 + vxor 1,1,6 + vsldoi 6,0,6,12 + vxor 1,1,6 + vadduwm 4,4,4 + vxor 1,1,3 + vperm 11,1,1,8 + vsel 7,10,11,9 + vor 10,11,11 + stvx 7,0,5 + addi 3,5,15 + addi 5,5,16 + bdz .Ldone + + vspltw 3,1,3 + vsldoi 6,0,2,12 + .long 0x106305C8 + + vxor 2,2,6 + vsldoi 6,0,6,12 + vxor 2,2,6 + vsldoi 6,0,6,12 + vxor 2,2,6 + + vxor 2,2,3 + b .Loop256 + +.align 4 +.Ldone: + lvx 2,0,3 + vsel 2,10,2,9 + stvx 2,0,3 + li 6,0 + or 12,12,12 + stw 8,0(5) + +.Lenc_key_abort: + mr 3,6 + blr +.long 0 +.byte 0,12,0x14,1,0,0,3,0 +.long 0 +.size aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key + +.globl aes_hw_set_decrypt_key +.type aes_hw_set_decrypt_key,@function +.align 5 +aes_hw_set_decrypt_key: +.localentry aes_hw_set_decrypt_key,0 + + stdu 1,-64(1) + mflr 10 + std 10,80(1) + bl .Lset_encrypt_key + mtlr 10 + + cmpwi 3,0 + bne- .Ldec_key_abort + + slwi 7,8,4 + subi 3,5,240 + srwi 8,8,1 + add 5,3,7 + mtctr 8 + +.Ldeckey: + lwz 0, 0(3) + lwz 6, 
4(3) + lwz 7, 8(3) + lwz 8, 12(3) + addi 3,3,16 + lwz 9, 0(5) + lwz 10,4(5) + lwz 11,8(5) + lwz 12,12(5) + stw 0, 0(5) + stw 6, 4(5) + stw 7, 8(5) + stw 8, 12(5) + subi 5,5,16 + stw 9, -16(3) + stw 10,-12(3) + stw 11,-8(3) + stw 12,-4(3) + bdnz .Ldeckey + + xor 3,3,3 +.Ldec_key_abort: + addi 1,1,64 + blr +.long 0 +.byte 0,12,4,1,0x80,0,3,0 +.long 0 +.size aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key +.globl aes_hw_encrypt +.type aes_hw_encrypt,@function +.align 5 +aes_hw_encrypt: +.localentry aes_hw_encrypt,0 + + lwz 6,240(5) + lis 0,0xfc00 + li 12,-1 + li 7,15 + or 0,0,0 + + lvx 0,0,3 + neg 11,4 + lvx 1,7,3 + lvsl 2,0,3 + vspltisb 4,0x0f + lvsr 3,0,11 + vxor 2,2,4 + li 7,16 + vperm 0,0,1,2 + lvx 1,0,5 + lvsr 5,0,5 + srwi 6,6,1 + lvx 2,7,5 + addi 7,7,16 + subi 6,6,1 + vperm 1,2,1,5 + + vxor 0,0,1 + lvx 1,7,5 + addi 7,7,16 + mtctr 6 + +.Loop_enc: + vperm 2,1,2,5 + .long 0x10001508 + lvx 2,7,5 + addi 7,7,16 + vperm 1,2,1,5 + .long 0x10000D08 + lvx 1,7,5 + addi 7,7,16 + bdnz .Loop_enc + + vperm 2,1,2,5 + .long 0x10001508 + lvx 2,7,5 + vperm 1,2,1,5 + .long 0x10000D09 + + vspltisb 2,-1 + vxor 1,1,1 + li 7,15 + vperm 2,2,1,3 + vxor 3,3,4 + lvx 1,0,4 + vperm 0,0,0,3 + vsel 1,1,0,2 + lvx 4,7,4 + stvx 1,0,4 + vsel 0,0,4,2 + stvx 0,7,4 + + or 12,12,12 + blr +.long 0 +.byte 0,12,0x14,0,0,0,3,0 +.long 0 +.size aes_hw_encrypt,.-aes_hw_encrypt +.globl aes_hw_decrypt +.type aes_hw_decrypt,@function +.align 5 +aes_hw_decrypt: +.localentry aes_hw_decrypt,0 + + lwz 6,240(5) + lis 0,0xfc00 + li 12,-1 + li 7,15 + or 0,0,0 + + lvx 0,0,3 + neg 11,4 + lvx 1,7,3 + lvsl 2,0,3 + vspltisb 4,0x0f + lvsr 3,0,11 + vxor 2,2,4 + li 7,16 + vperm 0,0,1,2 + lvx 1,0,5 + lvsr 5,0,5 + srwi 6,6,1 + lvx 2,7,5 + addi 7,7,16 + subi 6,6,1 + vperm 1,2,1,5 + + vxor 0,0,1 + lvx 1,7,5 + addi 7,7,16 + mtctr 6 + +.Loop_dec: + vperm 2,1,2,5 + .long 0x10001548 + lvx 2,7,5 + addi 7,7,16 + vperm 1,2,1,5 + .long 0x10000D48 + lvx 1,7,5 + addi 7,7,16 + bdnz .Loop_dec + + vperm 2,1,2,5 + .long 0x10001548 + lvx 2,7,5 + vperm 1,2,1,5 + .long 0x10000D49 + + vspltisb 2,-1 + vxor 1,1,1 + li 7,15 + vperm 2,2,1,3 + vxor 3,3,4 + lvx 1,0,4 + vperm 0,0,0,3 + vsel 1,1,0,2 + lvx 4,7,4 + stvx 1,0,4 + vsel 0,0,4,2 + stvx 0,7,4 + + or 12,12,12 + blr +.long 0 +.byte 0,12,0x14,0,0,0,3,0 +.long 0 +.size aes_hw_decrypt,.-aes_hw_decrypt +.globl aes_hw_cbc_encrypt +.type aes_hw_cbc_encrypt,@function +.align 5 +aes_hw_cbc_encrypt: +.localentry aes_hw_cbc_encrypt,0 + + cmpldi 5,16 + .long 0x4dc00020 + + cmpwi 8,0 + lis 0,0xffe0 + li 12,-1 + or 0,0,0 + + li 10,15 + vxor 0,0,0 + vspltisb 3,0x0f + + lvx 4,0,7 + lvsl 6,0,7 + lvx 5,10,7 + vxor 6,6,3 + vperm 4,4,5,6 + + neg 11,3 + lvsr 10,0,6 + lwz 9,240(6) + + lvsr 6,0,11 + lvx 5,0,3 + addi 3,3,15 + vxor 6,6,3 + + lvsl 8,0,4 + vspltisb 9,-1 + lvx 7,0,4 + vperm 9,9,0,8 + vxor 8,8,3 + + srwi 9,9,1 + li 10,16 + subi 9,9,1 + beq .Lcbc_dec + +.Lcbc_enc: + vor 2,5,5 + lvx 5,0,3 + addi 3,3,16 + mtctr 9 + subi 5,5,16 + + lvx 0,0,6 + vperm 2,2,5,6 + lvx 1,10,6 + addi 10,10,16 + vperm 0,1,0,10 + vxor 2,2,0 + lvx 0,10,6 + addi 10,10,16 + vxor 2,2,4 + +.Loop_cbc_enc: + vperm 1,0,1,10 + .long 0x10420D08 + lvx 1,10,6 + addi 10,10,16 + vperm 0,1,0,10 + .long 0x10420508 + lvx 0,10,6 + addi 10,10,16 + bdnz .Loop_cbc_enc + + vperm 1,0,1,10 + .long 0x10420D08 + lvx 1,10,6 + li 10,16 + vperm 0,1,0,10 + .long 0x10820509 + cmpldi 5,16 + + vperm 3,4,4,8 + vsel 2,7,3,9 + vor 7,3,3 + stvx 2,0,4 + addi 4,4,16 + bge .Lcbc_enc + + b .Lcbc_done + +.align 4 +.Lcbc_dec: + cmpldi 5,128 + bge _aesp8_cbc_decrypt8x + vor 3,5,5 + lvx 5,0,3 + addi 3,3,16 
+ mtctr 9 + subi 5,5,16 + + lvx 0,0,6 + vperm 3,3,5,6 + lvx 1,10,6 + addi 10,10,16 + vperm 0,1,0,10 + vxor 2,3,0 + lvx 0,10,6 + addi 10,10,16 + +.Loop_cbc_dec: + vperm 1,0,1,10 + .long 0x10420D48 + lvx 1,10,6 + addi 10,10,16 + vperm 0,1,0,10 + .long 0x10420548 + lvx 0,10,6 + addi 10,10,16 + bdnz .Loop_cbc_dec + + vperm 1,0,1,10 + .long 0x10420D48 + lvx 1,10,6 + li 10,16 + vperm 0,1,0,10 + .long 0x10420549 + cmpldi 5,16 + + vxor 2,2,4 + vor 4,3,3 + vperm 3,2,2,8 + vsel 2,7,3,9 + vor 7,3,3 + stvx 2,0,4 + addi 4,4,16 + bge .Lcbc_dec + +.Lcbc_done: + addi 4,4,-1 + lvx 2,0,4 + vsel 2,7,2,9 + stvx 2,0,4 + + neg 8,7 + li 10,15 + vxor 0,0,0 + vspltisb 9,-1 + vspltisb 3,0x0f + lvsr 8,0,8 + vperm 9,9,0,8 + vxor 8,8,3 + lvx 7,0,7 + vperm 4,4,4,8 + vsel 2,7,4,9 + lvx 5,10,7 + stvx 2,0,7 + vsel 2,4,5,9 + stvx 2,10,7 + + or 12,12,12 + blr +.long 0 +.byte 0,12,0x14,0,0,0,6,0 +.long 0 +.align 5 +_aesp8_cbc_decrypt8x: + stdu 1,-448(1) + li 10,207 + li 11,223 + stvx 20,10,1 + addi 10,10,32 + stvx 21,11,1 + addi 11,11,32 + stvx 22,10,1 + addi 10,10,32 + stvx 23,11,1 + addi 11,11,32 + stvx 24,10,1 + addi 10,10,32 + stvx 25,11,1 + addi 11,11,32 + stvx 26,10,1 + addi 10,10,32 + stvx 27,11,1 + addi 11,11,32 + stvx 28,10,1 + addi 10,10,32 + stvx 29,11,1 + addi 11,11,32 + stvx 30,10,1 + stvx 31,11,1 + li 0,-1 + stw 12,396(1) + li 8,0x10 + std 26,400(1) + li 26,0x20 + std 27,408(1) + li 27,0x30 + std 28,416(1) + li 28,0x40 + std 29,424(1) + li 29,0x50 + std 30,432(1) + li 30,0x60 + std 31,440(1) + li 31,0x70 + or 0,0,0 + + subi 9,9,3 + subi 5,5,128 + + lvx 23,0,6 + lvx 30,8,6 + addi 6,6,0x20 + lvx 31,0,6 + vperm 23,30,23,10 + addi 11,1,79 + mtctr 9 + +.Load_cbc_dec_key: + vperm 24,31,30,10 + lvx 30,8,6 + addi 6,6,0x20 + stvx 24,0,11 + vperm 25,30,31,10 + lvx 31,0,6 + stvx 25,8,11 + addi 11,11,0x20 + bdnz .Load_cbc_dec_key + + lvx 26,8,6 + vperm 24,31,30,10 + lvx 27,26,6 + stvx 24,0,11 + vperm 25,26,31,10 + lvx 28,27,6 + stvx 25,8,11 + addi 11,1,79 + vperm 26,27,26,10 + lvx 29,28,6 + vperm 27,28,27,10 + lvx 30,29,6 + vperm 28,29,28,10 + lvx 31,30,6 + vperm 29,30,29,10 + lvx 14,31,6 + vperm 30,31,30,10 + lvx 24,0,11 + vperm 31,14,31,10 + lvx 25,8,11 + + + + subi 3,3,15 + + li 10,8 + .long 0x7C001E99 + lvsl 6,0,10 + vspltisb 3,0x0f + .long 0x7C281E99 + vxor 6,6,3 + .long 0x7C5A1E99 + vperm 0,0,0,6 + .long 0x7C7B1E99 + vperm 1,1,1,6 + .long 0x7D5C1E99 + vperm 2,2,2,6 + vxor 14,0,23 + .long 0x7D7D1E99 + vperm 3,3,3,6 + vxor 15,1,23 + .long 0x7D9E1E99 + vperm 10,10,10,6 + vxor 16,2,23 + .long 0x7DBF1E99 + addi 3,3,0x80 + vperm 11,11,11,6 + vxor 17,3,23 + vperm 12,12,12,6 + vxor 18,10,23 + vperm 13,13,13,6 + vxor 19,11,23 + vxor 20,12,23 + vxor 21,13,23 + + mtctr 9 + b .Loop_cbc_dec8x +.align 5 +.Loop_cbc_dec8x: + .long 0x11CEC548 + .long 0x11EFC548 + .long 0x1210C548 + .long 0x1231C548 + .long 0x1252C548 + .long 0x1273C548 + .long 0x1294C548 + .long 0x12B5C548 + lvx 24,26,11 + addi 11,11,0x20 + + .long 0x11CECD48 + .long 0x11EFCD48 + .long 0x1210CD48 + .long 0x1231CD48 + .long 0x1252CD48 + .long 0x1273CD48 + .long 0x1294CD48 + .long 0x12B5CD48 + lvx 25,8,11 + bdnz .Loop_cbc_dec8x + + subic 5,5,128 + .long 0x11CEC548 + .long 0x11EFC548 + .long 0x1210C548 + .long 0x1231C548 + .long 0x1252C548 + .long 0x1273C548 + .long 0x1294C548 + .long 0x12B5C548 + + subfe. 
0,0,0 + .long 0x11CECD48 + .long 0x11EFCD48 + .long 0x1210CD48 + .long 0x1231CD48 + .long 0x1252CD48 + .long 0x1273CD48 + .long 0x1294CD48 + .long 0x12B5CD48 + + and 0,0,5 + .long 0x11CED548 + .long 0x11EFD548 + .long 0x1210D548 + .long 0x1231D548 + .long 0x1252D548 + .long 0x1273D548 + .long 0x1294D548 + .long 0x12B5D548 + + add 3,3,0 + + + + .long 0x11CEDD48 + .long 0x11EFDD48 + .long 0x1210DD48 + .long 0x1231DD48 + .long 0x1252DD48 + .long 0x1273DD48 + .long 0x1294DD48 + .long 0x12B5DD48 + + addi 11,1,79 + .long 0x11CEE548 + .long 0x11EFE548 + .long 0x1210E548 + .long 0x1231E548 + .long 0x1252E548 + .long 0x1273E548 + .long 0x1294E548 + .long 0x12B5E548 + lvx 24,0,11 + + .long 0x11CEED48 + .long 0x11EFED48 + .long 0x1210ED48 + .long 0x1231ED48 + .long 0x1252ED48 + .long 0x1273ED48 + .long 0x1294ED48 + .long 0x12B5ED48 + lvx 25,8,11 + + .long 0x11CEF548 + vxor 4,4,31 + .long 0x11EFF548 + vxor 0,0,31 + .long 0x1210F548 + vxor 1,1,31 + .long 0x1231F548 + vxor 2,2,31 + .long 0x1252F548 + vxor 3,3,31 + .long 0x1273F548 + vxor 10,10,31 + .long 0x1294F548 + vxor 11,11,31 + .long 0x12B5F548 + vxor 12,12,31 + + .long 0x11CE2549 + .long 0x11EF0549 + .long 0x7C001E99 + .long 0x12100D49 + .long 0x7C281E99 + .long 0x12311549 + vperm 0,0,0,6 + .long 0x7C5A1E99 + .long 0x12521D49 + vperm 1,1,1,6 + .long 0x7C7B1E99 + .long 0x12735549 + vperm 2,2,2,6 + .long 0x7D5C1E99 + .long 0x12945D49 + vperm 3,3,3,6 + .long 0x7D7D1E99 + .long 0x12B56549 + vperm 10,10,10,6 + .long 0x7D9E1E99 + vor 4,13,13 + vperm 11,11,11,6 + .long 0x7DBF1E99 + addi 3,3,0x80 + + vperm 14,14,14,6 + vperm 15,15,15,6 + .long 0x7DC02799 + vperm 12,12,12,6 + vxor 14,0,23 + vperm 16,16,16,6 + .long 0x7DE82799 + vperm 13,13,13,6 + vxor 15,1,23 + vperm 17,17,17,6 + .long 0x7E1A2799 + vxor 16,2,23 + vperm 18,18,18,6 + .long 0x7E3B2799 + vxor 17,3,23 + vperm 19,19,19,6 + .long 0x7E5C2799 + vxor 18,10,23 + vperm 20,20,20,6 + .long 0x7E7D2799 + vxor 19,11,23 + vperm 21,21,21,6 + .long 0x7E9E2799 + vxor 20,12,23 + .long 0x7EBF2799 + addi 4,4,0x80 + vxor 21,13,23 + + mtctr 9 + beq .Loop_cbc_dec8x + + addic. 
5,5,128 + beq .Lcbc_dec8x_done + nop + nop + +.Loop_cbc_dec8x_tail: + .long 0x11EFC548 + .long 0x1210C548 + .long 0x1231C548 + .long 0x1252C548 + .long 0x1273C548 + .long 0x1294C548 + .long 0x12B5C548 + lvx 24,26,11 + addi 11,11,0x20 + + .long 0x11EFCD48 + .long 0x1210CD48 + .long 0x1231CD48 + .long 0x1252CD48 + .long 0x1273CD48 + .long 0x1294CD48 + .long 0x12B5CD48 + lvx 25,8,11 + bdnz .Loop_cbc_dec8x_tail + + .long 0x11EFC548 + .long 0x1210C548 + .long 0x1231C548 + .long 0x1252C548 + .long 0x1273C548 + .long 0x1294C548 + .long 0x12B5C548 + + .long 0x11EFCD48 + .long 0x1210CD48 + .long 0x1231CD48 + .long 0x1252CD48 + .long 0x1273CD48 + .long 0x1294CD48 + .long 0x12B5CD48 + + .long 0x11EFD548 + .long 0x1210D548 + .long 0x1231D548 + .long 0x1252D548 + .long 0x1273D548 + .long 0x1294D548 + .long 0x12B5D548 + + .long 0x11EFDD48 + .long 0x1210DD48 + .long 0x1231DD48 + .long 0x1252DD48 + .long 0x1273DD48 + .long 0x1294DD48 + .long 0x12B5DD48 + + .long 0x11EFE548 + .long 0x1210E548 + .long 0x1231E548 + .long 0x1252E548 + .long 0x1273E548 + .long 0x1294E548 + .long 0x12B5E548 + + .long 0x11EFED48 + .long 0x1210ED48 + .long 0x1231ED48 + .long 0x1252ED48 + .long 0x1273ED48 + .long 0x1294ED48 + .long 0x12B5ED48 + + .long 0x11EFF548 + vxor 4,4,31 + .long 0x1210F548 + vxor 1,1,31 + .long 0x1231F548 + vxor 2,2,31 + .long 0x1252F548 + vxor 3,3,31 + .long 0x1273F548 + vxor 10,10,31 + .long 0x1294F548 + vxor 11,11,31 + .long 0x12B5F548 + vxor 12,12,31 + + cmplwi 5,32 + blt .Lcbc_dec8x_one + nop + beq .Lcbc_dec8x_two + cmplwi 5,64 + blt .Lcbc_dec8x_three + nop + beq .Lcbc_dec8x_four + cmplwi 5,96 + blt .Lcbc_dec8x_five + nop + beq .Lcbc_dec8x_six + +.Lcbc_dec8x_seven: + .long 0x11EF2549 + .long 0x12100D49 + .long 0x12311549 + .long 0x12521D49 + .long 0x12735549 + .long 0x12945D49 + .long 0x12B56549 + vor 4,13,13 + + vperm 15,15,15,6 + vperm 16,16,16,6 + .long 0x7DE02799 + vperm 17,17,17,6 + .long 0x7E082799 + vperm 18,18,18,6 + .long 0x7E3A2799 + vperm 19,19,19,6 + .long 0x7E5B2799 + vperm 20,20,20,6 + .long 0x7E7C2799 + vperm 21,21,21,6 + .long 0x7E9D2799 + .long 0x7EBE2799 + addi 4,4,0x70 + b .Lcbc_dec8x_done + +.align 5 +.Lcbc_dec8x_six: + .long 0x12102549 + .long 0x12311549 + .long 0x12521D49 + .long 0x12735549 + .long 0x12945D49 + .long 0x12B56549 + vor 4,13,13 + + vperm 16,16,16,6 + vperm 17,17,17,6 + .long 0x7E002799 + vperm 18,18,18,6 + .long 0x7E282799 + vperm 19,19,19,6 + .long 0x7E5A2799 + vperm 20,20,20,6 + .long 0x7E7B2799 + vperm 21,21,21,6 + .long 0x7E9C2799 + .long 0x7EBD2799 + addi 4,4,0x60 + b .Lcbc_dec8x_done + +.align 5 +.Lcbc_dec8x_five: + .long 0x12312549 + .long 0x12521D49 + .long 0x12735549 + .long 0x12945D49 + .long 0x12B56549 + vor 4,13,13 + + vperm 17,17,17,6 + vperm 18,18,18,6 + .long 0x7E202799 + vperm 19,19,19,6 + .long 0x7E482799 + vperm 20,20,20,6 + .long 0x7E7A2799 + vperm 21,21,21,6 + .long 0x7E9B2799 + .long 0x7EBC2799 + addi 4,4,0x50 + b .Lcbc_dec8x_done + +.align 5 +.Lcbc_dec8x_four: + .long 0x12522549 + .long 0x12735549 + .long 0x12945D49 + .long 0x12B56549 + vor 4,13,13 + + vperm 18,18,18,6 + vperm 19,19,19,6 + .long 0x7E402799 + vperm 20,20,20,6 + .long 0x7E682799 + vperm 21,21,21,6 + .long 0x7E9A2799 + .long 0x7EBB2799 + addi 4,4,0x40 + b .Lcbc_dec8x_done + +.align 5 +.Lcbc_dec8x_three: + .long 0x12732549 + .long 0x12945D49 + .long 0x12B56549 + vor 4,13,13 + + vperm 19,19,19,6 + vperm 20,20,20,6 + .long 0x7E602799 + vperm 21,21,21,6 + .long 0x7E882799 + .long 0x7EBA2799 + addi 4,4,0x30 + b .Lcbc_dec8x_done + +.align 5 +.Lcbc_dec8x_two: + .long 0x12942549 + .long 
0x12B56549 + vor 4,13,13 + + vperm 20,20,20,6 + vperm 21,21,21,6 + .long 0x7E802799 + .long 0x7EA82799 + addi 4,4,0x20 + b .Lcbc_dec8x_done + +.align 5 +.Lcbc_dec8x_one: + .long 0x12B52549 + vor 4,13,13 + + vperm 21,21,21,6 + .long 0x7EA02799 + addi 4,4,0x10 + +.Lcbc_dec8x_done: + vperm 4,4,4,6 + .long 0x7C803F99 + + li 10,79 + li 11,95 + stvx 6,10,1 + addi 10,10,32 + stvx 6,11,1 + addi 11,11,32 + stvx 6,10,1 + addi 10,10,32 + stvx 6,11,1 + addi 11,11,32 + stvx 6,10,1 + addi 10,10,32 + stvx 6,11,1 + addi 11,11,32 + stvx 6,10,1 + addi 10,10,32 + stvx 6,11,1 + addi 11,11,32 + + or 12,12,12 + lvx 20,10,1 + addi 10,10,32 + lvx 21,11,1 + addi 11,11,32 + lvx 22,10,1 + addi 10,10,32 + lvx 23,11,1 + addi 11,11,32 + lvx 24,10,1 + addi 10,10,32 + lvx 25,11,1 + addi 11,11,32 + lvx 26,10,1 + addi 10,10,32 + lvx 27,11,1 + addi 11,11,32 + lvx 28,10,1 + addi 10,10,32 + lvx 29,11,1 + addi 11,11,32 + lvx 30,10,1 + lvx 31,11,1 + ld 26,400(1) + ld 27,408(1) + ld 28,416(1) + ld 29,424(1) + ld 30,432(1) + ld 31,440(1) + addi 1,1,448 + blr +.long 0 +.byte 0,12,0x04,0,0x80,6,6,0 +.long 0 +.size aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt +.globl aes_hw_ctr32_encrypt_blocks +.type aes_hw_ctr32_encrypt_blocks,@function +.align 5 +aes_hw_ctr32_encrypt_blocks: +.localentry aes_hw_ctr32_encrypt_blocks,0 + + cmpldi 5,1 + .long 0x4dc00020 + + lis 0,0xfff0 + li 12,-1 + or 0,0,0 + + li 10,15 + vxor 0,0,0 + vspltisb 3,0x0f + + lvx 4,0,7 + lvsl 6,0,7 + lvx 5,10,7 + vspltisb 11,1 + vxor 6,6,3 + vperm 4,4,5,6 + vsldoi 11,0,11,1 + + neg 11,3 + lvsr 10,0,6 + lwz 9,240(6) + + lvsr 6,0,11 + lvx 5,0,3 + addi 3,3,15 + vxor 6,6,3 + + srwi 9,9,1 + li 10,16 + subi 9,9,1 + + cmpldi 5,8 + bge _aesp8_ctr32_encrypt8x + + lvsl 8,0,4 + vspltisb 9,-1 + lvx 7,0,4 + vperm 9,9,0,8 + vxor 8,8,3 + + lvx 0,0,6 + mtctr 9 + lvx 1,10,6 + addi 10,10,16 + vperm 0,1,0,10 + vxor 2,4,0 + lvx 0,10,6 + addi 10,10,16 + b .Loop_ctr32_enc + +.align 5 +.Loop_ctr32_enc: + vperm 1,0,1,10 + .long 0x10420D08 + lvx 1,10,6 + addi 10,10,16 + vperm 0,1,0,10 + .long 0x10420508 + lvx 0,10,6 + addi 10,10,16 + bdnz .Loop_ctr32_enc + + vadduwm 4,4,11 + vor 3,5,5 + lvx 5,0,3 + addi 3,3,16 + subic. 
5,5,1 + + vperm 1,0,1,10 + .long 0x10420D08 + lvx 1,10,6 + vperm 3,3,5,6 + li 10,16 + vperm 1,1,0,10 + lvx 0,0,6 + vxor 3,3,1 + .long 0x10421D09 + + lvx 1,10,6 + addi 10,10,16 + vperm 2,2,2,8 + vsel 3,7,2,9 + mtctr 9 + vperm 0,1,0,10 + vor 7,2,2 + vxor 2,4,0 + lvx 0,10,6 + addi 10,10,16 + stvx 3,0,4 + addi 4,4,16 + bne .Loop_ctr32_enc + + addi 4,4,-1 + lvx 2,0,4 + vsel 2,7,2,9 + stvx 2,0,4 + + or 12,12,12 + blr +.long 0 +.byte 0,12,0x14,0,0,0,6,0 +.long 0 +.align 5 +_aesp8_ctr32_encrypt8x: + stdu 1,-448(1) + li 10,207 + li 11,223 + stvx 20,10,1 + addi 10,10,32 + stvx 21,11,1 + addi 11,11,32 + stvx 22,10,1 + addi 10,10,32 + stvx 23,11,1 + addi 11,11,32 + stvx 24,10,1 + addi 10,10,32 + stvx 25,11,1 + addi 11,11,32 + stvx 26,10,1 + addi 10,10,32 + stvx 27,11,1 + addi 11,11,32 + stvx 28,10,1 + addi 10,10,32 + stvx 29,11,1 + addi 11,11,32 + stvx 30,10,1 + stvx 31,11,1 + li 0,-1 + stw 12,396(1) + li 8,0x10 + std 26,400(1) + li 26,0x20 + std 27,408(1) + li 27,0x30 + std 28,416(1) + li 28,0x40 + std 29,424(1) + li 29,0x50 + std 30,432(1) + li 30,0x60 + std 31,440(1) + li 31,0x70 + or 0,0,0 + + subi 9,9,3 + + lvx 23,0,6 + lvx 30,8,6 + addi 6,6,0x20 + lvx 31,0,6 + vperm 23,30,23,10 + addi 11,1,79 + mtctr 9 + +.Load_ctr32_enc_key: + vperm 24,31,30,10 + lvx 30,8,6 + addi 6,6,0x20 + stvx 24,0,11 + vperm 25,30,31,10 + lvx 31,0,6 + stvx 25,8,11 + addi 11,11,0x20 + bdnz .Load_ctr32_enc_key + + lvx 26,8,6 + vperm 24,31,30,10 + lvx 27,26,6 + stvx 24,0,11 + vperm 25,26,31,10 + lvx 28,27,6 + stvx 25,8,11 + addi 11,1,79 + vperm 26,27,26,10 + lvx 29,28,6 + vperm 27,28,27,10 + lvx 30,29,6 + vperm 28,29,28,10 + lvx 31,30,6 + vperm 29,30,29,10 + lvx 15,31,6 + vperm 30,31,30,10 + lvx 24,0,11 + vperm 31,15,31,10 + lvx 25,8,11 + + vadduwm 7,11,11 + subi 3,3,15 + sldi 5,5,4 + + vadduwm 16,4,11 + vadduwm 17,4,7 + vxor 15,4,23 + li 10,8 + vadduwm 18,16,7 + vxor 16,16,23 + lvsl 6,0,10 + vadduwm 19,17,7 + vxor 17,17,23 + vspltisb 3,0x0f + vadduwm 20,18,7 + vxor 18,18,23 + vxor 6,6,3 + vadduwm 21,19,7 + vxor 19,19,23 + vadduwm 22,20,7 + vxor 20,20,23 + vadduwm 4,21,7 + vxor 21,21,23 + vxor 22,22,23 + + mtctr 9 + b .Loop_ctr32_enc8x +.align 5 +.Loop_ctr32_enc8x: + .long 0x11EFC508 + .long 0x1210C508 + .long 0x1231C508 + .long 0x1252C508 + .long 0x1273C508 + .long 0x1294C508 + .long 0x12B5C508 + .long 0x12D6C508 +.Loop_ctr32_enc8x_middle: + lvx 24,26,11 + addi 11,11,0x20 + + .long 0x11EFCD08 + .long 0x1210CD08 + .long 0x1231CD08 + .long 0x1252CD08 + .long 0x1273CD08 + .long 0x1294CD08 + .long 0x12B5CD08 + .long 0x12D6CD08 + lvx 25,8,11 + bdnz .Loop_ctr32_enc8x + + subic 11,5,256 + .long 0x11EFC508 + .long 0x1210C508 + .long 0x1231C508 + .long 0x1252C508 + .long 0x1273C508 + .long 0x1294C508 + .long 0x12B5C508 + .long 0x12D6C508 + + subfe 0,0,0 + .long 0x11EFCD08 + .long 0x1210CD08 + .long 0x1231CD08 + .long 0x1252CD08 + .long 0x1273CD08 + .long 0x1294CD08 + .long 0x12B5CD08 + .long 0x12D6CD08 + + and 0,0,11 + addi 11,1,79 + .long 0x11EFD508 + .long 0x1210D508 + .long 0x1231D508 + .long 0x1252D508 + .long 0x1273D508 + .long 0x1294D508 + .long 0x12B5D508 + .long 0x12D6D508 + lvx 24,0,11 + + subic 5,5,129 + .long 0x11EFDD08 + addi 5,5,1 + .long 0x1210DD08 + .long 0x1231DD08 + .long 0x1252DD08 + .long 0x1273DD08 + .long 0x1294DD08 + .long 0x12B5DD08 + .long 0x12D6DD08 + lvx 25,8,11 + + .long 0x11EFE508 + .long 0x7C001E99 + .long 0x1210E508 + .long 0x7C281E99 + .long 0x1231E508 + .long 0x7C5A1E99 + .long 0x1252E508 + .long 0x7C7B1E99 + .long 0x1273E508 + .long 0x7D5C1E99 + .long 0x1294E508 + .long 0x7D9D1E99 + .long 0x12B5E508 + 
.long 0x7DBE1E99 + .long 0x12D6E508 + .long 0x7DDF1E99 + addi 3,3,0x80 + + .long 0x11EFED08 + vperm 0,0,0,6 + .long 0x1210ED08 + vperm 1,1,1,6 + .long 0x1231ED08 + vperm 2,2,2,6 + .long 0x1252ED08 + vperm 3,3,3,6 + .long 0x1273ED08 + vperm 10,10,10,6 + .long 0x1294ED08 + vperm 12,12,12,6 + .long 0x12B5ED08 + vperm 13,13,13,6 + .long 0x12D6ED08 + vperm 14,14,14,6 + + add 3,3,0 + + + + subfe. 0,0,0 + .long 0x11EFF508 + vxor 0,0,31 + .long 0x1210F508 + vxor 1,1,31 + .long 0x1231F508 + vxor 2,2,31 + .long 0x1252F508 + vxor 3,3,31 + .long 0x1273F508 + vxor 10,10,31 + .long 0x1294F508 + vxor 12,12,31 + .long 0x12B5F508 + vxor 13,13,31 + .long 0x12D6F508 + vxor 14,14,31 + + bne .Lctr32_enc8x_break + + .long 0x100F0509 + .long 0x10300D09 + vadduwm 16,4,11 + .long 0x10511509 + vadduwm 17,4,7 + vxor 15,4,23 + .long 0x10721D09 + vadduwm 18,16,7 + vxor 16,16,23 + .long 0x11535509 + vadduwm 19,17,7 + vxor 17,17,23 + .long 0x11946509 + vadduwm 20,18,7 + vxor 18,18,23 + .long 0x11B56D09 + vadduwm 21,19,7 + vxor 19,19,23 + .long 0x11D67509 + vadduwm 22,20,7 + vxor 20,20,23 + vperm 0,0,0,6 + vadduwm 4,21,7 + vxor 21,21,23 + vperm 1,1,1,6 + vxor 22,22,23 + mtctr 9 + + .long 0x11EFC508 + .long 0x7C002799 + vperm 2,2,2,6 + .long 0x1210C508 + .long 0x7C282799 + vperm 3,3,3,6 + .long 0x1231C508 + .long 0x7C5A2799 + vperm 10,10,10,6 + .long 0x1252C508 + .long 0x7C7B2799 + vperm 12,12,12,6 + .long 0x1273C508 + .long 0x7D5C2799 + vperm 13,13,13,6 + .long 0x1294C508 + .long 0x7D9D2799 + vperm 14,14,14,6 + .long 0x12B5C508 + .long 0x7DBE2799 + .long 0x12D6C508 + .long 0x7DDF2799 + addi 4,4,0x80 + + b .Loop_ctr32_enc8x_middle + +.align 5 +.Lctr32_enc8x_break: + cmpwi 5,-0x60 + blt .Lctr32_enc8x_one + nop + beq .Lctr32_enc8x_two + cmpwi 5,-0x40 + blt .Lctr32_enc8x_three + nop + beq .Lctr32_enc8x_four + cmpwi 5,-0x20 + blt .Lctr32_enc8x_five + nop + beq .Lctr32_enc8x_six + cmpwi 5,0x00 + blt .Lctr32_enc8x_seven + +.Lctr32_enc8x_eight: + .long 0x11EF0509 + .long 0x12100D09 + .long 0x12311509 + .long 0x12521D09 + .long 0x12735509 + .long 0x12946509 + .long 0x12B56D09 + .long 0x12D67509 + + vperm 15,15,15,6 + vperm 16,16,16,6 + .long 0x7DE02799 + vperm 17,17,17,6 + .long 0x7E082799 + vperm 18,18,18,6 + .long 0x7E3A2799 + vperm 19,19,19,6 + .long 0x7E5B2799 + vperm 20,20,20,6 + .long 0x7E7C2799 + vperm 21,21,21,6 + .long 0x7E9D2799 + vperm 22,22,22,6 + .long 0x7EBE2799 + .long 0x7EDF2799 + addi 4,4,0x80 + b .Lctr32_enc8x_done + +.align 5 +.Lctr32_enc8x_seven: + .long 0x11EF0D09 + .long 0x12101509 + .long 0x12311D09 + .long 0x12525509 + .long 0x12736509 + .long 0x12946D09 + .long 0x12B57509 + + vperm 15,15,15,6 + vperm 16,16,16,6 + .long 0x7DE02799 + vperm 17,17,17,6 + .long 0x7E082799 + vperm 18,18,18,6 + .long 0x7E3A2799 + vperm 19,19,19,6 + .long 0x7E5B2799 + vperm 20,20,20,6 + .long 0x7E7C2799 + vperm 21,21,21,6 + .long 0x7E9D2799 + .long 0x7EBE2799 + addi 4,4,0x70 + b .Lctr32_enc8x_done + +.align 5 +.Lctr32_enc8x_six: + .long 0x11EF1509 + .long 0x12101D09 + .long 0x12315509 + .long 0x12526509 + .long 0x12736D09 + .long 0x12947509 + + vperm 15,15,15,6 + vperm 16,16,16,6 + .long 0x7DE02799 + vperm 17,17,17,6 + .long 0x7E082799 + vperm 18,18,18,6 + .long 0x7E3A2799 + vperm 19,19,19,6 + .long 0x7E5B2799 + vperm 20,20,20,6 + .long 0x7E7C2799 + .long 0x7E9D2799 + addi 4,4,0x60 + b .Lctr32_enc8x_done + +.align 5 +.Lctr32_enc8x_five: + .long 0x11EF1D09 + .long 0x12105509 + .long 0x12316509 + .long 0x12526D09 + .long 0x12737509 + + vperm 15,15,15,6 + vperm 16,16,16,6 + .long 0x7DE02799 + vperm 17,17,17,6 + .long 0x7E082799 + 
vperm 18,18,18,6 + .long 0x7E3A2799 + vperm 19,19,19,6 + .long 0x7E5B2799 + .long 0x7E7C2799 + addi 4,4,0x50 + b .Lctr32_enc8x_done + +.align 5 +.Lctr32_enc8x_four: + .long 0x11EF5509 + .long 0x12106509 + .long 0x12316D09 + .long 0x12527509 + + vperm 15,15,15,6 + vperm 16,16,16,6 + .long 0x7DE02799 + vperm 17,17,17,6 + .long 0x7E082799 + vperm 18,18,18,6 + .long 0x7E3A2799 + .long 0x7E5B2799 + addi 4,4,0x40 + b .Lctr32_enc8x_done + +.align 5 +.Lctr32_enc8x_three: + .long 0x11EF6509 + .long 0x12106D09 + .long 0x12317509 + + vperm 15,15,15,6 + vperm 16,16,16,6 + .long 0x7DE02799 + vperm 17,17,17,6 + .long 0x7E082799 + .long 0x7E3A2799 + addi 4,4,0x30 + b .Lctr32_enc8x_done + +.align 5 +.Lctr32_enc8x_two: + .long 0x11EF6D09 + .long 0x12107509 + + vperm 15,15,15,6 + vperm 16,16,16,6 + .long 0x7DE02799 + .long 0x7E082799 + addi 4,4,0x20 + b .Lctr32_enc8x_done + +.align 5 +.Lctr32_enc8x_one: + .long 0x11EF7509 + + vperm 15,15,15,6 + .long 0x7DE02799 + addi 4,4,0x10 + +.Lctr32_enc8x_done: + li 10,79 + li 11,95 + stvx 6,10,1 + addi 10,10,32 + stvx 6,11,1 + addi 11,11,32 + stvx 6,10,1 + addi 10,10,32 + stvx 6,11,1 + addi 11,11,32 + stvx 6,10,1 + addi 10,10,32 + stvx 6,11,1 + addi 11,11,32 + stvx 6,10,1 + addi 10,10,32 + stvx 6,11,1 + addi 11,11,32 + + or 12,12,12 + lvx 20,10,1 + addi 10,10,32 + lvx 21,11,1 + addi 11,11,32 + lvx 22,10,1 + addi 10,10,32 + lvx 23,11,1 + addi 11,11,32 + lvx 24,10,1 + addi 10,10,32 + lvx 25,11,1 + addi 11,11,32 + lvx 26,10,1 + addi 10,10,32 + lvx 27,11,1 + addi 11,11,32 + lvx 28,10,1 + addi 10,10,32 + lvx 29,11,1 + addi 11,11,32 + lvx 30,10,1 + lvx 31,11,1 + ld 26,400(1) + ld 27,408(1) + ld 28,416(1) + ld 29,424(1) + ld 30,432(1) + ld 31,440(1) + addi 1,1,448 + blr +.long 0 +.byte 0,12,0x04,0,0x80,6,6,0 +.long 0 +.size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks +.globl aes_hw_xts_encrypt +.type aes_hw_xts_encrypt,@function +.align 5 +aes_hw_xts_encrypt: +.localentry aes_hw_xts_encrypt,0 + + mr 10,3 + li 3,-1 + cmpldi 5,16 + .long 0x4dc00020 + + lis 0,0xfff0 + li 12,-1 + li 11,0 + or 0,0,0 + + vspltisb 9,0x07 + lvsl 6,11,11 + vspltisb 11,0x0f + vxor 6,6,9 + + li 3,15 + lvx 8,0,8 + lvsl 5,0,8 + lvx 4,3,8 + vxor 5,5,11 + vperm 8,8,4,5 + + neg 11,10 + lvsr 5,0,11 + lvx 2,0,10 + addi 10,10,15 + vxor 5,5,11 + + cmpldi 7,0 + beq .Lxts_enc_no_key2 + + lvsr 7,0,7 + lwz 9,240(7) + srwi 9,9,1 + subi 9,9,1 + li 3,16 + + lvx 0,0,7 + lvx 1,3,7 + addi 3,3,16 + vperm 0,1,0,7 + vxor 8,8,0 + lvx 0,3,7 + addi 3,3,16 + mtctr 9 + +.Ltweak_xts_enc: + vperm 1,0,1,7 + .long 0x11080D08 + lvx 1,3,7 + addi 3,3,16 + vperm 0,1,0,7 + .long 0x11080508 + lvx 0,3,7 + addi 3,3,16 + bdnz .Ltweak_xts_enc + + vperm 1,0,1,7 + .long 0x11080D08 + lvx 1,3,7 + vperm 0,1,0,7 + .long 0x11080509 + + li 8,0 + b .Lxts_enc + +.Lxts_enc_no_key2: + li 3,-16 + and 5,5,3 + + +.Lxts_enc: + lvx 4,0,10 + addi 10,10,16 + + lvsr 7,0,6 + lwz 9,240(6) + srwi 9,9,1 + subi 9,9,1 + li 3,16 + + vslb 10,9,9 + vor 10,10,9 + vspltisb 11,1 + vsldoi 10,10,11,15 + + cmpldi 5,96 + bge _aesp8_xts_encrypt6x + + andi. 
7,5,15 + subic 0,5,32 + subi 7,7,16 + subfe 0,0,0 + and 0,0,7 + add 10,10,0 + + lvx 0,0,6 + lvx 1,3,6 + addi 3,3,16 + vperm 2,2,4,5 + vperm 0,1,0,7 + vxor 2,2,8 + vxor 2,2,0 + lvx 0,3,6 + addi 3,3,16 + mtctr 9 + b .Loop_xts_enc + +.align 5 +.Loop_xts_enc: + vperm 1,0,1,7 + .long 0x10420D08 + lvx 1,3,6 + addi 3,3,16 + vperm 0,1,0,7 + .long 0x10420508 + lvx 0,3,6 + addi 3,3,16 + bdnz .Loop_xts_enc + + vperm 1,0,1,7 + .long 0x10420D08 + lvx 1,3,6 + li 3,16 + vperm 0,1,0,7 + vxor 0,0,8 + .long 0x10620509 + + vperm 11,3,3,6 + + .long 0x7D602799 + + addi 4,4,16 + + subic. 5,5,16 + beq .Lxts_enc_done + + vor 2,4,4 + lvx 4,0,10 + addi 10,10,16 + lvx 0,0,6 + lvx 1,3,6 + addi 3,3,16 + + subic 0,5,32 + subfe 0,0,0 + and 0,0,7 + add 10,10,0 + + vsrab 11,8,9 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + vand 11,11,10 + vxor 8,8,11 + + vperm 2,2,4,5 + vperm 0,1,0,7 + vxor 2,2,8 + vxor 3,3,0 + vxor 2,2,0 + lvx 0,3,6 + addi 3,3,16 + + mtctr 9 + cmpldi 5,16 + bge .Loop_xts_enc + + vxor 3,3,8 + lvsr 5,0,5 + vxor 4,4,4 + vspltisb 11,-1 + vperm 4,4,11,5 + vsel 2,2,3,4 + + subi 11,4,17 + subi 4,4,16 + mtctr 5 + li 5,16 +.Loop_xts_enc_steal: + lbzu 0,1(11) + stb 0,16(11) + bdnz .Loop_xts_enc_steal + + mtctr 9 + b .Loop_xts_enc + +.Lxts_enc_done: + cmpldi 8,0 + beq .Lxts_enc_ret + + vsrab 11,8,9 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + vand 11,11,10 + vxor 8,8,11 + + vperm 8,8,8,6 + .long 0x7D004799 + +.Lxts_enc_ret: + or 12,12,12 + li 3,0 + blr +.long 0 +.byte 0,12,0x04,0,0x80,6,6,0 +.long 0 +.size aes_hw_xts_encrypt,.-aes_hw_xts_encrypt + +.globl aes_hw_xts_decrypt +.type aes_hw_xts_decrypt,@function +.align 5 +aes_hw_xts_decrypt: +.localentry aes_hw_xts_decrypt,0 + + mr 10,3 + li 3,-1 + cmpldi 5,16 + .long 0x4dc00020 + + lis 0,0xfff8 + li 12,-1 + li 11,0 + or 0,0,0 + + andi. 0,5,15 + neg 0,0 + andi. 0,0,16 + sub 5,5,0 + + vspltisb 9,0x07 + lvsl 6,11,11 + vspltisb 11,0x0f + vxor 6,6,9 + + li 3,15 + lvx 8,0,8 + lvsl 5,0,8 + lvx 4,3,8 + vxor 5,5,11 + vperm 8,8,4,5 + + neg 11,10 + lvsr 5,0,11 + lvx 2,0,10 + addi 10,10,15 + vxor 5,5,11 + + cmpldi 7,0 + beq .Lxts_dec_no_key2 + + lvsr 7,0,7 + lwz 9,240(7) + srwi 9,9,1 + subi 9,9,1 + li 3,16 + + lvx 0,0,7 + lvx 1,3,7 + addi 3,3,16 + vperm 0,1,0,7 + vxor 8,8,0 + lvx 0,3,7 + addi 3,3,16 + mtctr 9 + +.Ltweak_xts_dec: + vperm 1,0,1,7 + .long 0x11080D08 + lvx 1,3,7 + addi 3,3,16 + vperm 0,1,0,7 + .long 0x11080508 + lvx 0,3,7 + addi 3,3,16 + bdnz .Ltweak_xts_dec + + vperm 1,0,1,7 + .long 0x11080D08 + lvx 1,3,7 + vperm 0,1,0,7 + .long 0x11080509 + + li 8,0 + b .Lxts_dec + +.Lxts_dec_no_key2: + neg 3,5 + andi. 3,3,15 + add 5,5,3 + + +.Lxts_dec: + lvx 4,0,10 + addi 10,10,16 + + lvsr 7,0,6 + lwz 9,240(6) + srwi 9,9,1 + subi 9,9,1 + li 3,16 + + vslb 10,9,9 + vor 10,10,9 + vspltisb 11,1 + vsldoi 10,10,11,15 + + cmpldi 5,96 + bge _aesp8_xts_decrypt6x + + lvx 0,0,6 + lvx 1,3,6 + addi 3,3,16 + vperm 2,2,4,5 + vperm 0,1,0,7 + vxor 2,2,8 + vxor 2,2,0 + lvx 0,3,6 + addi 3,3,16 + mtctr 9 + + cmpldi 5,16 + blt .Ltail_xts_dec + + +.align 5 +.Loop_xts_dec: + vperm 1,0,1,7 + .long 0x10420D48 + lvx 1,3,6 + addi 3,3,16 + vperm 0,1,0,7 + .long 0x10420548 + lvx 0,3,6 + addi 3,3,16 + bdnz .Loop_xts_dec + + vperm 1,0,1,7 + .long 0x10420D48 + lvx 1,3,6 + li 3,16 + vperm 0,1,0,7 + vxor 0,0,8 + .long 0x10620549 + + vperm 11,3,3,6 + + .long 0x7D602799 + + addi 4,4,16 + + subic. 
5,5,16 + beq .Lxts_dec_done + + vor 2,4,4 + lvx 4,0,10 + addi 10,10,16 + lvx 0,0,6 + lvx 1,3,6 + addi 3,3,16 + + vsrab 11,8,9 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + vand 11,11,10 + vxor 8,8,11 + + vperm 2,2,4,5 + vperm 0,1,0,7 + vxor 2,2,8 + vxor 2,2,0 + lvx 0,3,6 + addi 3,3,16 + + mtctr 9 + cmpldi 5,16 + bge .Loop_xts_dec + +.Ltail_xts_dec: + vsrab 11,8,9 + vaddubm 12,8,8 + vsldoi 11,11,11,15 + vand 11,11,10 + vxor 12,12,11 + + subi 10,10,16 + add 10,10,5 + + vxor 2,2,8 + vxor 2,2,12 + +.Loop_xts_dec_short: + vperm 1,0,1,7 + .long 0x10420D48 + lvx 1,3,6 + addi 3,3,16 + vperm 0,1,0,7 + .long 0x10420548 + lvx 0,3,6 + addi 3,3,16 + bdnz .Loop_xts_dec_short + + vperm 1,0,1,7 + .long 0x10420D48 + lvx 1,3,6 + li 3,16 + vperm 0,1,0,7 + vxor 0,0,12 + .long 0x10620549 + + vperm 11,3,3,6 + + .long 0x7D602799 + + + vor 2,4,4 + lvx 4,0,10 + + lvx 0,0,6 + lvx 1,3,6 + addi 3,3,16 + vperm 2,2,4,5 + vperm 0,1,0,7 + + lvsr 5,0,5 + vxor 4,4,4 + vspltisb 11,-1 + vperm 4,4,11,5 + vsel 2,2,3,4 + + vxor 0,0,8 + vxor 2,2,0 + lvx 0,3,6 + addi 3,3,16 + + subi 11,4,1 + mtctr 5 + li 5,16 +.Loop_xts_dec_steal: + lbzu 0,1(11) + stb 0,16(11) + bdnz .Loop_xts_dec_steal + + mtctr 9 + b .Loop_xts_dec + +.Lxts_dec_done: + cmpldi 8,0 + beq .Lxts_dec_ret + + vsrab 11,8,9 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + vand 11,11,10 + vxor 8,8,11 + + vperm 8,8,8,6 + .long 0x7D004799 + +.Lxts_dec_ret: + or 12,12,12 + li 3,0 + blr +.long 0 +.byte 0,12,0x04,0,0x80,6,6,0 +.long 0 +.size aes_hw_xts_decrypt,.-aes_hw_xts_decrypt +.align 5 +_aesp8_xts_encrypt6x: + stdu 1,-448(1) + mflr 11 + li 7,207 + li 3,223 + std 11,464(1) + stvx 20,7,1 + addi 7,7,32 + stvx 21,3,1 + addi 3,3,32 + stvx 22,7,1 + addi 7,7,32 + stvx 23,3,1 + addi 3,3,32 + stvx 24,7,1 + addi 7,7,32 + stvx 25,3,1 + addi 3,3,32 + stvx 26,7,1 + addi 7,7,32 + stvx 27,3,1 + addi 3,3,32 + stvx 28,7,1 + addi 7,7,32 + stvx 29,3,1 + addi 3,3,32 + stvx 30,7,1 + stvx 31,3,1 + li 0,-1 + stw 12,396(1) + li 3,0x10 + std 26,400(1) + li 26,0x20 + std 27,408(1) + li 27,0x30 + std 28,416(1) + li 28,0x40 + std 29,424(1) + li 29,0x50 + std 30,432(1) + li 30,0x60 + std 31,440(1) + li 31,0x70 + or 0,0,0 + + subi 9,9,3 + + lvx 23,0,6 + lvx 30,3,6 + addi 6,6,0x20 + lvx 31,0,6 + vperm 23,30,23,7 + addi 7,1,79 + mtctr 9 + +.Load_xts_enc_key: + vperm 24,31,30,7 + lvx 30,3,6 + addi 6,6,0x20 + stvx 24,0,7 + vperm 25,30,31,7 + lvx 31,0,6 + stvx 25,3,7 + addi 7,7,0x20 + bdnz .Load_xts_enc_key + + lvx 26,3,6 + vperm 24,31,30,7 + lvx 27,26,6 + stvx 24,0,7 + vperm 25,26,31,7 + lvx 28,27,6 + stvx 25,3,7 + addi 7,1,79 + vperm 26,27,26,7 + lvx 29,28,6 + vperm 27,28,27,7 + lvx 30,29,6 + vperm 28,29,28,7 + lvx 31,30,6 + vperm 29,30,29,7 + lvx 22,31,6 + vperm 30,31,30,7 + lvx 24,0,7 + vperm 31,22,31,7 + lvx 25,3,7 + + vperm 0,2,4,5 + subi 10,10,31 + vxor 17,8,23 + vsrab 11,8,9 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + vand 11,11,10 + vxor 7,0,17 + vxor 8,8,11 + + .long 0x7C235699 + vxor 18,8,23 + vsrab 11,8,9 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + vperm 1,1,1,6 + vand 11,11,10 + vxor 12,1,18 + vxor 8,8,11 + + .long 0x7C5A5699 + andi. 
31,5,15 + vxor 19,8,23 + vsrab 11,8,9 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + vperm 2,2,2,6 + vand 11,11,10 + vxor 13,2,19 + vxor 8,8,11 + + .long 0x7C7B5699 + sub 5,5,31 + vxor 20,8,23 + vsrab 11,8,9 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + vperm 3,3,3,6 + vand 11,11,10 + vxor 14,3,20 + vxor 8,8,11 + + .long 0x7C9C5699 + subi 5,5,0x60 + vxor 21,8,23 + vsrab 11,8,9 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + vperm 4,4,4,6 + vand 11,11,10 + vxor 15,4,21 + vxor 8,8,11 + + .long 0x7CBD5699 + addi 10,10,0x60 + vxor 22,8,23 + vsrab 11,8,9 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + vperm 5,5,5,6 + vand 11,11,10 + vxor 16,5,22 + vxor 8,8,11 + + vxor 31,31,23 + mtctr 9 + b .Loop_xts_enc6x + +.align 5 +.Loop_xts_enc6x: + .long 0x10E7C508 + .long 0x118CC508 + .long 0x11ADC508 + .long 0x11CEC508 + .long 0x11EFC508 + .long 0x1210C508 + lvx 24,26,7 + addi 7,7,0x20 + + .long 0x10E7CD08 + .long 0x118CCD08 + .long 0x11ADCD08 + .long 0x11CECD08 + .long 0x11EFCD08 + .long 0x1210CD08 + lvx 25,3,7 + bdnz .Loop_xts_enc6x + + subic 5,5,96 + vxor 0,17,31 + .long 0x10E7C508 + .long 0x118CC508 + vsrab 11,8,9 + vxor 17,8,23 + vaddubm 8,8,8 + .long 0x11ADC508 + .long 0x11CEC508 + vsldoi 11,11,11,15 + .long 0x11EFC508 + .long 0x1210C508 + + subfe. 0,0,0 + vand 11,11,10 + .long 0x10E7CD08 + .long 0x118CCD08 + vxor 8,8,11 + .long 0x11ADCD08 + .long 0x11CECD08 + vxor 1,18,31 + vsrab 11,8,9 + vxor 18,8,23 + .long 0x11EFCD08 + .long 0x1210CD08 + + and 0,0,5 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + .long 0x10E7D508 + .long 0x118CD508 + vand 11,11,10 + .long 0x11ADD508 + .long 0x11CED508 + vxor 8,8,11 + .long 0x11EFD508 + .long 0x1210D508 + + add 10,10,0 + + + + vxor 2,19,31 + vsrab 11,8,9 + vxor 19,8,23 + vaddubm 8,8,8 + .long 0x10E7DD08 + .long 0x118CDD08 + vsldoi 11,11,11,15 + .long 0x11ADDD08 + .long 0x11CEDD08 + vand 11,11,10 + .long 0x11EFDD08 + .long 0x1210DD08 + + addi 7,1,79 + vxor 8,8,11 + .long 0x10E7E508 + .long 0x118CE508 + vxor 3,20,31 + vsrab 11,8,9 + vxor 20,8,23 + .long 0x11ADE508 + .long 0x11CEE508 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + .long 0x11EFE508 + .long 0x1210E508 + lvx 24,0,7 + vand 11,11,10 + + .long 0x10E7ED08 + .long 0x118CED08 + vxor 8,8,11 + .long 0x11ADED08 + .long 0x11CEED08 + vxor 4,21,31 + vsrab 11,8,9 + vxor 21,8,23 + .long 0x11EFED08 + .long 0x1210ED08 + lvx 25,3,7 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + + .long 0x10E7F508 + .long 0x118CF508 + vand 11,11,10 + .long 0x11ADF508 + .long 0x11CEF508 + vxor 8,8,11 + .long 0x11EFF508 + .long 0x1210F508 + vxor 5,22,31 + vsrab 11,8,9 + vxor 22,8,23 + + .long 0x10E70509 + .long 0x7C005699 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + .long 0x118C0D09 + .long 0x7C235699 + .long 0x11AD1509 + vperm 0,0,0,6 + .long 0x7C5A5699 + vand 11,11,10 + .long 0x11CE1D09 + vperm 1,1,1,6 + .long 0x7C7B5699 + .long 0x11EF2509 + vperm 2,2,2,6 + .long 0x7C9C5699 + vxor 8,8,11 + .long 0x11702D09 + + vperm 3,3,3,6 + .long 0x7CBD5699 + addi 10,10,0x60 + vperm 4,4,4,6 + vperm 5,5,5,6 + + vperm 7,7,7,6 + vperm 12,12,12,6 + .long 0x7CE02799 + vxor 7,0,17 + vperm 13,13,13,6 + .long 0x7D832799 + vxor 12,1,18 + vperm 14,14,14,6 + .long 0x7DBA2799 + vxor 13,2,19 + vperm 15,15,15,6 + .long 0x7DDB2799 + vxor 14,3,20 + vperm 16,11,11,6 + .long 0x7DFC2799 + vxor 15,4,21 + .long 0x7E1D2799 + + vxor 16,5,22 + addi 4,4,0x60 + + mtctr 9 + beq .Loop_xts_enc6x + + addic. 
5,5,0x60 + beq .Lxts_enc6x_zero + cmpwi 5,0x20 + blt .Lxts_enc6x_one + nop + beq .Lxts_enc6x_two + cmpwi 5,0x40 + blt .Lxts_enc6x_three + nop + beq .Lxts_enc6x_four + +.Lxts_enc6x_five: + vxor 7,1,17 + vxor 12,2,18 + vxor 13,3,19 + vxor 14,4,20 + vxor 15,5,21 + + bl _aesp8_xts_enc5x + + vperm 7,7,7,6 + vor 17,22,22 + vperm 12,12,12,6 + .long 0x7CE02799 + vperm 13,13,13,6 + .long 0x7D832799 + vperm 14,14,14,6 + .long 0x7DBA2799 + vxor 11,15,22 + vperm 15,15,15,6 + .long 0x7DDB2799 + .long 0x7DFC2799 + addi 4,4,0x50 + bne .Lxts_enc6x_steal + b .Lxts_enc6x_done + +.align 4 +.Lxts_enc6x_four: + vxor 7,2,17 + vxor 12,3,18 + vxor 13,4,19 + vxor 14,5,20 + vxor 15,15,15 + + bl _aesp8_xts_enc5x + + vperm 7,7,7,6 + vor 17,21,21 + vperm 12,12,12,6 + .long 0x7CE02799 + vperm 13,13,13,6 + .long 0x7D832799 + vxor 11,14,21 + vperm 14,14,14,6 + .long 0x7DBA2799 + .long 0x7DDB2799 + addi 4,4,0x40 + bne .Lxts_enc6x_steal + b .Lxts_enc6x_done + +.align 4 +.Lxts_enc6x_three: + vxor 7,3,17 + vxor 12,4,18 + vxor 13,5,19 + vxor 14,14,14 + vxor 15,15,15 + + bl _aesp8_xts_enc5x + + vperm 7,7,7,6 + vor 17,20,20 + vperm 12,12,12,6 + .long 0x7CE02799 + vxor 11,13,20 + vperm 13,13,13,6 + .long 0x7D832799 + .long 0x7DBA2799 + addi 4,4,0x30 + bne .Lxts_enc6x_steal + b .Lxts_enc6x_done + +.align 4 +.Lxts_enc6x_two: + vxor 7,4,17 + vxor 12,5,18 + vxor 13,13,13 + vxor 14,14,14 + vxor 15,15,15 + + bl _aesp8_xts_enc5x + + vperm 7,7,7,6 + vor 17,19,19 + vxor 11,12,19 + vperm 12,12,12,6 + .long 0x7CE02799 + .long 0x7D832799 + addi 4,4,0x20 + bne .Lxts_enc6x_steal + b .Lxts_enc6x_done + +.align 4 +.Lxts_enc6x_one: + vxor 7,5,17 + nop +.Loop_xts_enc1x: + .long 0x10E7C508 + lvx 24,26,7 + addi 7,7,0x20 + + .long 0x10E7CD08 + lvx 25,3,7 + bdnz .Loop_xts_enc1x + + add 10,10,31 + cmpwi 31,0 + .long 0x10E7C508 + + subi 10,10,16 + .long 0x10E7CD08 + + lvsr 5,0,31 + .long 0x10E7D508 + + .long 0x7C005699 + .long 0x10E7DD08 + + addi 7,1,79 + .long 0x10E7E508 + lvx 24,0,7 + + .long 0x10E7ED08 + lvx 25,3,7 + vxor 17,17,31 + + vperm 0,0,0,6 + .long 0x10E7F508 + + vperm 0,0,0,5 + .long 0x10E78D09 + + vor 17,18,18 + vxor 11,7,18 + vperm 7,7,7,6 + .long 0x7CE02799 + addi 4,4,0x10 + bne .Lxts_enc6x_steal + b .Lxts_enc6x_done + +.align 4 +.Lxts_enc6x_zero: + cmpwi 31,0 + beq .Lxts_enc6x_done + + add 10,10,31 + subi 10,10,16 + .long 0x7C005699 + lvsr 5,0,31 + vperm 0,0,0,6 + vperm 0,0,0,5 + vxor 11,11,17 +.Lxts_enc6x_steal: + vxor 0,0,17 + vxor 7,7,7 + vspltisb 12,-1 + vperm 7,7,12,5 + vsel 7,0,11,7 + + subi 30,4,17 + subi 4,4,16 + mtctr 31 +.Loop_xts_enc6x_steal: + lbzu 0,1(30) + stb 0,16(30) + bdnz .Loop_xts_enc6x_steal + + li 31,0 + mtctr 9 + b .Loop_xts_enc1x + +.align 4 +.Lxts_enc6x_done: + cmpldi 8,0 + beq .Lxts_enc6x_ret + + vxor 8,17,23 + vperm 8,8,8,6 + .long 0x7D004799 + +.Lxts_enc6x_ret: + mtlr 11 + li 10,79 + li 11,95 + stvx 9,10,1 + addi 10,10,32 + stvx 9,11,1 + addi 11,11,32 + stvx 9,10,1 + addi 10,10,32 + stvx 9,11,1 + addi 11,11,32 + stvx 9,10,1 + addi 10,10,32 + stvx 9,11,1 + addi 11,11,32 + stvx 9,10,1 + addi 10,10,32 + stvx 9,11,1 + addi 11,11,32 + + or 12,12,12 + lvx 20,10,1 + addi 10,10,32 + lvx 21,11,1 + addi 11,11,32 + lvx 22,10,1 + addi 10,10,32 + lvx 23,11,1 + addi 11,11,32 + lvx 24,10,1 + addi 10,10,32 + lvx 25,11,1 + addi 11,11,32 + lvx 26,10,1 + addi 10,10,32 + lvx 27,11,1 + addi 11,11,32 + lvx 28,10,1 + addi 10,10,32 + lvx 29,11,1 + addi 11,11,32 + lvx 30,10,1 + lvx 31,11,1 + ld 26,400(1) + ld 27,408(1) + ld 28,416(1) + ld 29,424(1) + ld 30,432(1) + ld 31,440(1) + addi 1,1,448 + blr +.long 0 +.byte 
0,12,0x04,1,0x80,6,6,0 +.long 0 + +.align 5 +_aesp8_xts_enc5x: + .long 0x10E7C508 + .long 0x118CC508 + .long 0x11ADC508 + .long 0x11CEC508 + .long 0x11EFC508 + lvx 24,26,7 + addi 7,7,0x20 + + .long 0x10E7CD08 + .long 0x118CCD08 + .long 0x11ADCD08 + .long 0x11CECD08 + .long 0x11EFCD08 + lvx 25,3,7 + bdnz _aesp8_xts_enc5x + + add 10,10,31 + cmpwi 31,0 + .long 0x10E7C508 + .long 0x118CC508 + .long 0x11ADC508 + .long 0x11CEC508 + .long 0x11EFC508 + + subi 10,10,16 + .long 0x10E7CD08 + .long 0x118CCD08 + .long 0x11ADCD08 + .long 0x11CECD08 + .long 0x11EFCD08 + vxor 17,17,31 + + .long 0x10E7D508 + lvsr 5,0,31 + .long 0x118CD508 + .long 0x11ADD508 + .long 0x11CED508 + .long 0x11EFD508 + vxor 1,18,31 + + .long 0x10E7DD08 + .long 0x7C005699 + .long 0x118CDD08 + .long 0x11ADDD08 + .long 0x11CEDD08 + .long 0x11EFDD08 + vxor 2,19,31 + + addi 7,1,79 + .long 0x10E7E508 + .long 0x118CE508 + .long 0x11ADE508 + .long 0x11CEE508 + .long 0x11EFE508 + lvx 24,0,7 + vxor 3,20,31 + + .long 0x10E7ED08 + vperm 0,0,0,6 + .long 0x118CED08 + .long 0x11ADED08 + .long 0x11CEED08 + .long 0x11EFED08 + lvx 25,3,7 + vxor 4,21,31 + + .long 0x10E7F508 + vperm 0,0,0,5 + .long 0x118CF508 + .long 0x11ADF508 + .long 0x11CEF508 + .long 0x11EFF508 + + .long 0x10E78D09 + .long 0x118C0D09 + .long 0x11AD1509 + .long 0x11CE1D09 + .long 0x11EF2509 + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 + +.align 5 +_aesp8_xts_decrypt6x: + stdu 1,-448(1) + mflr 11 + li 7,207 + li 3,223 + std 11,464(1) + stvx 20,7,1 + addi 7,7,32 + stvx 21,3,1 + addi 3,3,32 + stvx 22,7,1 + addi 7,7,32 + stvx 23,3,1 + addi 3,3,32 + stvx 24,7,1 + addi 7,7,32 + stvx 25,3,1 + addi 3,3,32 + stvx 26,7,1 + addi 7,7,32 + stvx 27,3,1 + addi 3,3,32 + stvx 28,7,1 + addi 7,7,32 + stvx 29,3,1 + addi 3,3,32 + stvx 30,7,1 + stvx 31,3,1 + li 0,-1 + stw 12,396(1) + li 3,0x10 + std 26,400(1) + li 26,0x20 + std 27,408(1) + li 27,0x30 + std 28,416(1) + li 28,0x40 + std 29,424(1) + li 29,0x50 + std 30,432(1) + li 30,0x60 + std 31,440(1) + li 31,0x70 + or 0,0,0 + + subi 9,9,3 + + lvx 23,0,6 + lvx 30,3,6 + addi 6,6,0x20 + lvx 31,0,6 + vperm 23,30,23,7 + addi 7,1,79 + mtctr 9 + +.Load_xts_dec_key: + vperm 24,31,30,7 + lvx 30,3,6 + addi 6,6,0x20 + stvx 24,0,7 + vperm 25,30,31,7 + lvx 31,0,6 + stvx 25,3,7 + addi 7,7,0x20 + bdnz .Load_xts_dec_key + + lvx 26,3,6 + vperm 24,31,30,7 + lvx 27,26,6 + stvx 24,0,7 + vperm 25,26,31,7 + lvx 28,27,6 + stvx 25,3,7 + addi 7,1,79 + vperm 26,27,26,7 + lvx 29,28,6 + vperm 27,28,27,7 + lvx 30,29,6 + vperm 28,29,28,7 + lvx 31,30,6 + vperm 29,30,29,7 + lvx 22,31,6 + vperm 30,31,30,7 + lvx 24,0,7 + vperm 31,22,31,7 + lvx 25,3,7 + + vperm 0,2,4,5 + subi 10,10,31 + vxor 17,8,23 + vsrab 11,8,9 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + vand 11,11,10 + vxor 7,0,17 + vxor 8,8,11 + + .long 0x7C235699 + vxor 18,8,23 + vsrab 11,8,9 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + vperm 1,1,1,6 + vand 11,11,10 + vxor 12,1,18 + vxor 8,8,11 + + .long 0x7C5A5699 + andi. 
31,5,15 + vxor 19,8,23 + vsrab 11,8,9 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + vperm 2,2,2,6 + vand 11,11,10 + vxor 13,2,19 + vxor 8,8,11 + + .long 0x7C7B5699 + sub 5,5,31 + vxor 20,8,23 + vsrab 11,8,9 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + vperm 3,3,3,6 + vand 11,11,10 + vxor 14,3,20 + vxor 8,8,11 + + .long 0x7C9C5699 + subi 5,5,0x60 + vxor 21,8,23 + vsrab 11,8,9 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + vperm 4,4,4,6 + vand 11,11,10 + vxor 15,4,21 + vxor 8,8,11 + + .long 0x7CBD5699 + addi 10,10,0x60 + vxor 22,8,23 + vsrab 11,8,9 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + vperm 5,5,5,6 + vand 11,11,10 + vxor 16,5,22 + vxor 8,8,11 + + vxor 31,31,23 + mtctr 9 + b .Loop_xts_dec6x + +.align 5 +.Loop_xts_dec6x: + .long 0x10E7C548 + .long 0x118CC548 + .long 0x11ADC548 + .long 0x11CEC548 + .long 0x11EFC548 + .long 0x1210C548 + lvx 24,26,7 + addi 7,7,0x20 + + .long 0x10E7CD48 + .long 0x118CCD48 + .long 0x11ADCD48 + .long 0x11CECD48 + .long 0x11EFCD48 + .long 0x1210CD48 + lvx 25,3,7 + bdnz .Loop_xts_dec6x + + subic 5,5,96 + vxor 0,17,31 + .long 0x10E7C548 + .long 0x118CC548 + vsrab 11,8,9 + vxor 17,8,23 + vaddubm 8,8,8 + .long 0x11ADC548 + .long 0x11CEC548 + vsldoi 11,11,11,15 + .long 0x11EFC548 + .long 0x1210C548 + + subfe. 0,0,0 + vand 11,11,10 + .long 0x10E7CD48 + .long 0x118CCD48 + vxor 8,8,11 + .long 0x11ADCD48 + .long 0x11CECD48 + vxor 1,18,31 + vsrab 11,8,9 + vxor 18,8,23 + .long 0x11EFCD48 + .long 0x1210CD48 + + and 0,0,5 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + .long 0x10E7D548 + .long 0x118CD548 + vand 11,11,10 + .long 0x11ADD548 + .long 0x11CED548 + vxor 8,8,11 + .long 0x11EFD548 + .long 0x1210D548 + + add 10,10,0 + + + + vxor 2,19,31 + vsrab 11,8,9 + vxor 19,8,23 + vaddubm 8,8,8 + .long 0x10E7DD48 + .long 0x118CDD48 + vsldoi 11,11,11,15 + .long 0x11ADDD48 + .long 0x11CEDD48 + vand 11,11,10 + .long 0x11EFDD48 + .long 0x1210DD48 + + addi 7,1,79 + vxor 8,8,11 + .long 0x10E7E548 + .long 0x118CE548 + vxor 3,20,31 + vsrab 11,8,9 + vxor 20,8,23 + .long 0x11ADE548 + .long 0x11CEE548 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + .long 0x11EFE548 + .long 0x1210E548 + lvx 24,0,7 + vand 11,11,10 + + .long 0x10E7ED48 + .long 0x118CED48 + vxor 8,8,11 + .long 0x11ADED48 + .long 0x11CEED48 + vxor 4,21,31 + vsrab 11,8,9 + vxor 21,8,23 + .long 0x11EFED48 + .long 0x1210ED48 + lvx 25,3,7 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + + .long 0x10E7F548 + .long 0x118CF548 + vand 11,11,10 + .long 0x11ADF548 + .long 0x11CEF548 + vxor 8,8,11 + .long 0x11EFF548 + .long 0x1210F548 + vxor 5,22,31 + vsrab 11,8,9 + vxor 22,8,23 + + .long 0x10E70549 + .long 0x7C005699 + vaddubm 8,8,8 + vsldoi 11,11,11,15 + .long 0x118C0D49 + .long 0x7C235699 + .long 0x11AD1549 + vperm 0,0,0,6 + .long 0x7C5A5699 + vand 11,11,10 + .long 0x11CE1D49 + vperm 1,1,1,6 + .long 0x7C7B5699 + .long 0x11EF2549 + vperm 2,2,2,6 + .long 0x7C9C5699 + vxor 8,8,11 + .long 0x12102D49 + vperm 3,3,3,6 + .long 0x7CBD5699 + addi 10,10,0x60 + vperm 4,4,4,6 + vperm 5,5,5,6 + + vperm 7,7,7,6 + vperm 12,12,12,6 + .long 0x7CE02799 + vxor 7,0,17 + vperm 13,13,13,6 + .long 0x7D832799 + vxor 12,1,18 + vperm 14,14,14,6 + .long 0x7DBA2799 + vxor 13,2,19 + vperm 15,15,15,6 + .long 0x7DDB2799 + vxor 14,3,20 + vperm 16,16,16,6 + .long 0x7DFC2799 + vxor 15,4,21 + .long 0x7E1D2799 + vxor 16,5,22 + addi 4,4,0x60 + + mtctr 9 + beq .Loop_xts_dec6x + + addic. 
5,5,0x60 + beq .Lxts_dec6x_zero + cmpwi 5,0x20 + blt .Lxts_dec6x_one + nop + beq .Lxts_dec6x_two + cmpwi 5,0x40 + blt .Lxts_dec6x_three + nop + beq .Lxts_dec6x_four + +.Lxts_dec6x_five: + vxor 7,1,17 + vxor 12,2,18 + vxor 13,3,19 + vxor 14,4,20 + vxor 15,5,21 + + bl _aesp8_xts_dec5x + + vperm 7,7,7,6 + vor 17,22,22 + vxor 18,8,23 + vperm 12,12,12,6 + .long 0x7CE02799 + vxor 7,0,18 + vperm 13,13,13,6 + .long 0x7D832799 + vperm 14,14,14,6 + .long 0x7DBA2799 + vperm 15,15,15,6 + .long 0x7DDB2799 + .long 0x7DFC2799 + addi 4,4,0x50 + bne .Lxts_dec6x_steal + b .Lxts_dec6x_done + +.align 4 +.Lxts_dec6x_four: + vxor 7,2,17 + vxor 12,3,18 + vxor 13,4,19 + vxor 14,5,20 + vxor 15,15,15 + + bl _aesp8_xts_dec5x + + vperm 7,7,7,6 + vor 17,21,21 + vor 18,22,22 + vperm 12,12,12,6 + .long 0x7CE02799 + vxor 7,0,22 + vperm 13,13,13,6 + .long 0x7D832799 + vperm 14,14,14,6 + .long 0x7DBA2799 + .long 0x7DDB2799 + addi 4,4,0x40 + bne .Lxts_dec6x_steal + b .Lxts_dec6x_done + +.align 4 +.Lxts_dec6x_three: + vxor 7,3,17 + vxor 12,4,18 + vxor 13,5,19 + vxor 14,14,14 + vxor 15,15,15 + + bl _aesp8_xts_dec5x + + vperm 7,7,7,6 + vor 17,20,20 + vor 18,21,21 + vperm 12,12,12,6 + .long 0x7CE02799 + vxor 7,0,21 + vperm 13,13,13,6 + .long 0x7D832799 + .long 0x7DBA2799 + addi 4,4,0x30 + bne .Lxts_dec6x_steal + b .Lxts_dec6x_done + +.align 4 +.Lxts_dec6x_two: + vxor 7,4,17 + vxor 12,5,18 + vxor 13,13,13 + vxor 14,14,14 + vxor 15,15,15 + + bl _aesp8_xts_dec5x + + vperm 7,7,7,6 + vor 17,19,19 + vor 18,20,20 + vperm 12,12,12,6 + .long 0x7CE02799 + vxor 7,0,20 + .long 0x7D832799 + addi 4,4,0x20 + bne .Lxts_dec6x_steal + b .Lxts_dec6x_done + +.align 4 +.Lxts_dec6x_one: + vxor 7,5,17 + nop +.Loop_xts_dec1x: + .long 0x10E7C548 + lvx 24,26,7 + addi 7,7,0x20 + + .long 0x10E7CD48 + lvx 25,3,7 + bdnz .Loop_xts_dec1x + + subi 0,31,1 + .long 0x10E7C548 + + andi. 
0,0,16 + cmpwi 31,0 + .long 0x10E7CD48 + + sub 10,10,0 + .long 0x10E7D548 + + .long 0x7C005699 + .long 0x10E7DD48 + + addi 7,1,79 + .long 0x10E7E548 + lvx 24,0,7 + + .long 0x10E7ED48 + lvx 25,3,7 + vxor 17,17,31 + + vperm 0,0,0,6 + .long 0x10E7F548 + + mtctr 9 + .long 0x10E78D49 + + vor 17,18,18 + vor 18,19,19 + vperm 7,7,7,6 + .long 0x7CE02799 + addi 4,4,0x10 + vxor 7,0,19 + bne .Lxts_dec6x_steal + b .Lxts_dec6x_done + +.align 4 +.Lxts_dec6x_zero: + cmpwi 31,0 + beq .Lxts_dec6x_done + + .long 0x7C005699 + vperm 0,0,0,6 + vxor 7,0,18 +.Lxts_dec6x_steal: + .long 0x10E7C548 + lvx 24,26,7 + addi 7,7,0x20 + + .long 0x10E7CD48 + lvx 25,3,7 + bdnz .Lxts_dec6x_steal + + add 10,10,31 + .long 0x10E7C548 + + cmpwi 31,0 + .long 0x10E7CD48 + + .long 0x7C005699 + .long 0x10E7D548 + + lvsr 5,0,31 + .long 0x10E7DD48 + + addi 7,1,79 + .long 0x10E7E548 + lvx 24,0,7 + + .long 0x10E7ED48 + lvx 25,3,7 + vxor 18,18,31 + + vperm 0,0,0,6 + .long 0x10E7F548 + + vperm 0,0,0,5 + .long 0x11679549 + + vperm 7,11,11,6 + .long 0x7CE02799 + + + vxor 7,7,7 + vspltisb 12,-1 + vperm 7,7,12,5 + vsel 7,0,11,7 + vxor 7,7,17 + + subi 30,4,1 + mtctr 31 +.Loop_xts_dec6x_steal: + lbzu 0,1(30) + stb 0,16(30) + bdnz .Loop_xts_dec6x_steal + + li 31,0 + mtctr 9 + b .Loop_xts_dec1x + +.align 4 +.Lxts_dec6x_done: + cmpldi 8,0 + beq .Lxts_dec6x_ret + + vxor 8,17,23 + vperm 8,8,8,6 + .long 0x7D004799 + +.Lxts_dec6x_ret: + mtlr 11 + li 10,79 + li 11,95 + stvx 9,10,1 + addi 10,10,32 + stvx 9,11,1 + addi 11,11,32 + stvx 9,10,1 + addi 10,10,32 + stvx 9,11,1 + addi 11,11,32 + stvx 9,10,1 + addi 10,10,32 + stvx 9,11,1 + addi 11,11,32 + stvx 9,10,1 + addi 10,10,32 + stvx 9,11,1 + addi 11,11,32 + + or 12,12,12 + lvx 20,10,1 + addi 10,10,32 + lvx 21,11,1 + addi 11,11,32 + lvx 22,10,1 + addi 10,10,32 + lvx 23,11,1 + addi 11,11,32 + lvx 24,10,1 + addi 10,10,32 + lvx 25,11,1 + addi 11,11,32 + lvx 26,10,1 + addi 10,10,32 + lvx 27,11,1 + addi 11,11,32 + lvx 28,10,1 + addi 10,10,32 + lvx 29,11,1 + addi 11,11,32 + lvx 30,10,1 + lvx 31,11,1 + ld 26,400(1) + ld 27,408(1) + ld 28,416(1) + ld 29,424(1) + ld 30,432(1) + ld 31,440(1) + addi 1,1,448 + blr +.long 0 +.byte 0,12,0x04,1,0x80,6,6,0 +.long 0 + +.align 5 +_aesp8_xts_dec5x: + .long 0x10E7C548 + .long 0x118CC548 + .long 0x11ADC548 + .long 0x11CEC548 + .long 0x11EFC548 + lvx 24,26,7 + addi 7,7,0x20 + + .long 0x10E7CD48 + .long 0x118CCD48 + .long 0x11ADCD48 + .long 0x11CECD48 + .long 0x11EFCD48 + lvx 25,3,7 + bdnz _aesp8_xts_dec5x + + subi 0,31,1 + .long 0x10E7C548 + .long 0x118CC548 + .long 0x11ADC548 + .long 0x11CEC548 + .long 0x11EFC548 + + andi. 
0,0,16 + cmpwi 31,0 + .long 0x10E7CD48 + .long 0x118CCD48 + .long 0x11ADCD48 + .long 0x11CECD48 + .long 0x11EFCD48 + vxor 17,17,31 + + sub 10,10,0 + .long 0x10E7D548 + .long 0x118CD548 + .long 0x11ADD548 + .long 0x11CED548 + .long 0x11EFD548 + vxor 1,18,31 + + .long 0x10E7DD48 + .long 0x7C005699 + .long 0x118CDD48 + .long 0x11ADDD48 + .long 0x11CEDD48 + .long 0x11EFDD48 + vxor 2,19,31 + + addi 7,1,79 + .long 0x10E7E548 + .long 0x118CE548 + .long 0x11ADE548 + .long 0x11CEE548 + .long 0x11EFE548 + lvx 24,0,7 + vxor 3,20,31 + + .long 0x10E7ED48 + vperm 0,0,0,6 + .long 0x118CED48 + .long 0x11ADED48 + .long 0x11CEED48 + .long 0x11EFED48 + lvx 25,3,7 + vxor 4,21,31 + + .long 0x10E7F548 + .long 0x118CF548 + .long 0x11ADF548 + .long 0x11CEF548 + .long 0x11EFF548 + + .long 0x10E78D49 + .long 0x118C0D49 + .long 0x11AD1549 + .long 0x11CE1D49 + .long 0x11EF2549 + mtctr 9 + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +#endif // !OPENSSL_NO_ASM && __powerpc64__ +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-ppc64le/crypto/fipsmodule/ghashp8-ppc.S b/packager/third_party/boringssl/linux-ppc64le/crypto/fipsmodule/ghashp8-ppc.S new file mode 100644 index 0000000000..5b909a38d3 --- /dev/null +++ b/packager/third_party/boringssl/linux-ppc64le/crypto/fipsmodule/ghashp8-ppc.S @@ -0,0 +1,587 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + +#if !defined(OPENSSL_NO_ASM) && defined(__powerpc64__) +.machine "any" + +.abiversion 2 +.text + +.globl gcm_init_p8 +.type gcm_init_p8,@function +.align 5 +gcm_init_p8: +.localentry gcm_init_p8,0 + + li 0,-4096 + li 8,0x10 + li 12,-1 + li 9,0x20 + or 0,0,0 + li 10,0x30 + .long 0x7D202699 + + vspltisb 8,-16 + vspltisb 5,1 + vaddubm 8,8,8 + vxor 4,4,4 + vor 8,8,5 + vsldoi 8,8,4,15 + vsldoi 6,4,5,1 + vaddubm 8,8,8 + vspltisb 7,7 + vor 8,8,6 + vspltb 6,9,0 + vsl 9,9,5 + vsrab 6,6,7 + vand 6,6,8 + vxor 3,9,6 + + vsldoi 9,3,3,8 + vsldoi 8,4,8,8 + vsldoi 11,4,9,8 + vsldoi 10,9,4,8 + + .long 0x7D001F99 + .long 0x7D681F99 + li 8,0x40 + .long 0x7D291F99 + li 9,0x50 + .long 0x7D4A1F99 + li 10,0x60 + + .long 0x10035CC8 + .long 0x10234CC8 + .long 0x104354C8 + + .long 0x10E044C8 + + vsldoi 5,1,4,8 + vsldoi 6,4,1,8 + vxor 0,0,5 + vxor 2,2,6 + + vsldoi 0,0,0,8 + vxor 0,0,7 + + vsldoi 6,0,0,8 + .long 0x100044C8 + vxor 6,6,2 + vxor 16,0,6 + + vsldoi 17,16,16,8 + vsldoi 19,4,17,8 + vsldoi 18,17,4,8 + + .long 0x7E681F99 + li 8,0x70 + .long 0x7E291F99 + li 9,0x80 + .long 0x7E4A1F99 + li 10,0x90 + .long 0x10039CC8 + .long 0x11B09CC8 + .long 0x10238CC8 + .long 0x11D08CC8 + .long 0x104394C8 + .long 0x11F094C8 + + .long 0x10E044C8 + .long 0x114D44C8 + + vsldoi 5,1,4,8 + vsldoi 6,4,1,8 + vsldoi 11,14,4,8 + vsldoi 9,4,14,8 + vxor 0,0,5 + vxor 2,2,6 + vxor 13,13,11 + vxor 15,15,9 + + vsldoi 0,0,0,8 + vsldoi 13,13,13,8 + vxor 0,0,7 + vxor 13,13,10 + + vsldoi 6,0,0,8 + vsldoi 9,13,13,8 + .long 0x100044C8 + .long 0x11AD44C8 + vxor 6,6,2 + vxor 9,9,15 + vxor 0,0,6 + vxor 13,13,9 + + vsldoi 9,0,0,8 + vsldoi 17,13,13,8 + vsldoi 11,4,9,8 + vsldoi 10,9,4,8 + vsldoi 19,4,17,8 + vsldoi 18,17,4,8 + + .long 0x7D681F99 + li 8,0xa0 + .long 0x7D291F99 + li 9,0xb0 + .long 0x7D4A1F99 + li 10,0xc0 + .long 0x7E681F99 + .long 0x7E291F99 + .long 0x7E4A1F99 + + or 12,12,12 + blr +.long 0 +.byte 0,12,0x14,0,0,0,2,0 +.long 0 +.size gcm_init_p8,.-gcm_init_p8 +.globl gcm_gmult_p8 +.type 
gcm_gmult_p8,@function +.align 5 +gcm_gmult_p8: +.localentry gcm_gmult_p8,0 + + lis 0,0xfff8 + li 8,0x10 + li 12,-1 + li 9,0x20 + or 0,0,0 + li 10,0x30 + .long 0x7C601E99 + + .long 0x7D682699 + lvsl 12,0,0 + .long 0x7D292699 + vspltisb 5,0x07 + .long 0x7D4A2699 + vxor 12,12,5 + .long 0x7D002699 + vperm 3,3,3,12 + vxor 4,4,4 + + .long 0x10035CC8 + .long 0x10234CC8 + .long 0x104354C8 + + .long 0x10E044C8 + + vsldoi 5,1,4,8 + vsldoi 6,4,1,8 + vxor 0,0,5 + vxor 2,2,6 + + vsldoi 0,0,0,8 + vxor 0,0,7 + + vsldoi 6,0,0,8 + .long 0x100044C8 + vxor 6,6,2 + vxor 0,0,6 + + vperm 0,0,0,12 + .long 0x7C001F99 + + or 12,12,12 + blr +.long 0 +.byte 0,12,0x14,0,0,0,2,0 +.long 0 +.size gcm_gmult_p8,.-gcm_gmult_p8 + +.globl gcm_ghash_p8 +.type gcm_ghash_p8,@function +.align 5 +gcm_ghash_p8: +.localentry gcm_ghash_p8,0 + + li 0,-4096 + li 8,0x10 + li 12,-1 + li 9,0x20 + or 0,0,0 + li 10,0x30 + .long 0x7C001E99 + + .long 0x7D682699 + li 8,0x40 + lvsl 12,0,0 + .long 0x7D292699 + li 9,0x50 + vspltisb 5,0x07 + .long 0x7D4A2699 + li 10,0x60 + vxor 12,12,5 + .long 0x7D002699 + vperm 0,0,0,12 + vxor 4,4,4 + + cmpldi 6,64 + bge .Lgcm_ghash_p8_4x + + .long 0x7C602E99 + addi 5,5,16 + subic. 6,6,16 + vperm 3,3,3,12 + vxor 3,3,0 + beq .Lshort + + .long 0x7E682699 + li 8,16 + .long 0x7E292699 + add 9,5,6 + .long 0x7E4A2699 + + +.align 5 +.Loop_2x: + .long 0x7E002E99 + vperm 16,16,16,12 + + subic 6,6,32 + .long 0x10039CC8 + .long 0x11B05CC8 + subfe 0,0,0 + .long 0x10238CC8 + .long 0x11D04CC8 + and 0,0,6 + .long 0x104394C8 + .long 0x11F054C8 + add 5,5,0 + + vxor 0,0,13 + vxor 1,1,14 + + .long 0x10E044C8 + + vsldoi 5,1,4,8 + vsldoi 6,4,1,8 + vxor 2,2,15 + vxor 0,0,5 + vxor 2,2,6 + + vsldoi 0,0,0,8 + vxor 0,0,7 + .long 0x7C682E99 + addi 5,5,32 + + vsldoi 6,0,0,8 + .long 0x100044C8 + vperm 3,3,3,12 + vxor 6,6,2 + vxor 3,3,6 + vxor 3,3,0 + cmpld 9,5 + bgt .Loop_2x + + cmplwi 6,0 + bne .Leven + +.Lshort: + .long 0x10035CC8 + .long 0x10234CC8 + .long 0x104354C8 + + .long 0x10E044C8 + + vsldoi 5,1,4,8 + vsldoi 6,4,1,8 + vxor 0,0,5 + vxor 2,2,6 + + vsldoi 0,0,0,8 + vxor 0,0,7 + + vsldoi 6,0,0,8 + .long 0x100044C8 + vxor 6,6,2 + +.Leven: + vxor 0,0,6 + vperm 0,0,0,12 + .long 0x7C001F99 + + or 12,12,12 + blr +.long 0 +.byte 0,12,0x14,0,0,0,4,0 +.long 0 +.align 5 +.gcm_ghash_p8_4x: +.Lgcm_ghash_p8_4x: + stdu 1,-256(1) + li 10,63 + li 11,79 + stvx 20,10,1 + addi 10,10,32 + stvx 21,11,1 + addi 11,11,32 + stvx 22,10,1 + addi 10,10,32 + stvx 23,11,1 + addi 11,11,32 + stvx 24,10,1 + addi 10,10,32 + stvx 25,11,1 + addi 11,11,32 + stvx 26,10,1 + addi 10,10,32 + stvx 27,11,1 + addi 11,11,32 + stvx 28,10,1 + addi 10,10,32 + stvx 29,11,1 + addi 11,11,32 + stvx 30,10,1 + li 10,0x60 + stvx 31,11,1 + li 0,-1 + stw 12,252(1) + or 0,0,0 + + lvsl 5,0,8 + + li 8,0x70 + .long 0x7E292699 + li 9,0x80 + vspltisb 6,8 + + li 10,0x90 + .long 0x7EE82699 + li 8,0xa0 + .long 0x7F092699 + li 9,0xb0 + .long 0x7F2A2699 + li 10,0xc0 + .long 0x7FA82699 + li 8,0x10 + .long 0x7FC92699 + li 9,0x20 + .long 0x7FEA2699 + li 10,0x30 + + vsldoi 7,4,6,8 + vaddubm 18,5,7 + vaddubm 19,6,18 + + srdi 6,6,4 + + .long 0x7C602E99 + .long 0x7E082E99 + subic. 
6,6,8 + .long 0x7EC92E99 + .long 0x7F8A2E99 + addi 5,5,0x40 + vperm 3,3,3,12 + vperm 16,16,16,12 + vperm 22,22,22,12 + vperm 28,28,28,12 + + vxor 2,3,0 + + .long 0x11B0BCC8 + .long 0x11D0C4C8 + .long 0x11F0CCC8 + + vperm 11,17,9,18 + vperm 5,22,28,19 + vperm 10,17,9,19 + vperm 6,22,28,18 + .long 0x12B68CC8 + .long 0x12855CC8 + .long 0x137C4CC8 + .long 0x134654C8 + + vxor 21,21,14 + vxor 20,20,13 + vxor 27,27,21 + vxor 26,26,15 + + blt .Ltail_4x + +.Loop_4x: + .long 0x7C602E99 + .long 0x7E082E99 + subic. 6,6,4 + .long 0x7EC92E99 + .long 0x7F8A2E99 + addi 5,5,0x40 + vperm 16,16,16,12 + vperm 22,22,22,12 + vperm 28,28,28,12 + vperm 3,3,3,12 + + .long 0x1002ECC8 + .long 0x1022F4C8 + .long 0x1042FCC8 + .long 0x11B0BCC8 + .long 0x11D0C4C8 + .long 0x11F0CCC8 + + vxor 0,0,20 + vxor 1,1,27 + vxor 2,2,26 + vperm 5,22,28,19 + vperm 6,22,28,18 + + .long 0x10E044C8 + .long 0x12855CC8 + .long 0x134654C8 + + vsldoi 5,1,4,8 + vsldoi 6,4,1,8 + vxor 0,0,5 + vxor 2,2,6 + + vsldoi 0,0,0,8 + vxor 0,0,7 + + vsldoi 6,0,0,8 + .long 0x12B68CC8 + .long 0x137C4CC8 + .long 0x100044C8 + + vxor 20,20,13 + vxor 26,26,15 + vxor 2,2,3 + vxor 21,21,14 + vxor 2,2,6 + vxor 27,27,21 + vxor 2,2,0 + bge .Loop_4x + +.Ltail_4x: + .long 0x1002ECC8 + .long 0x1022F4C8 + .long 0x1042FCC8 + + vxor 0,0,20 + vxor 1,1,27 + + .long 0x10E044C8 + + vsldoi 5,1,4,8 + vsldoi 6,4,1,8 + vxor 2,2,26 + vxor 0,0,5 + vxor 2,2,6 + + vsldoi 0,0,0,8 + vxor 0,0,7 + + vsldoi 6,0,0,8 + .long 0x100044C8 + vxor 6,6,2 + vxor 0,0,6 + + addic. 6,6,4 + beq .Ldone_4x + + .long 0x7C602E99 + cmpldi 6,2 + li 6,-4 + blt .Lone + .long 0x7E082E99 + beq .Ltwo + +.Lthree: + .long 0x7EC92E99 + vperm 3,3,3,12 + vperm 16,16,16,12 + vperm 22,22,22,12 + + vxor 2,3,0 + vor 29,23,23 + vor 30, 24, 24 + vor 31,25,25 + + vperm 5,16,22,19 + vperm 6,16,22,18 + .long 0x12B08CC8 + .long 0x13764CC8 + .long 0x12855CC8 + .long 0x134654C8 + + vxor 27,27,21 + b .Ltail_4x + +.align 4 +.Ltwo: + vperm 3,3,3,12 + vperm 16,16,16,12 + + vxor 2,3,0 + vperm 5,4,16,19 + vperm 6,4,16,18 + + vsldoi 29,4,17,8 + vor 30, 17, 17 + vsldoi 31,17,4,8 + + .long 0x12855CC8 + .long 0x13704CC8 + .long 0x134654C8 + + b .Ltail_4x + +.align 4 +.Lone: + vperm 3,3,3,12 + + vsldoi 29,4,9,8 + vor 30, 9, 9 + vsldoi 31,9,4,8 + + vxor 2,3,0 + vxor 20,20,20 + vxor 27,27,27 + vxor 26,26,26 + + b .Ltail_4x + +.Ldone_4x: + vperm 0,0,0,12 + .long 0x7C001F99 + + li 10,63 + li 11,79 + or 12,12,12 + lvx 20,10,1 + addi 10,10,32 + lvx 21,11,1 + addi 11,11,32 + lvx 22,10,1 + addi 10,10,32 + lvx 23,11,1 + addi 11,11,32 + lvx 24,10,1 + addi 10,10,32 + lvx 25,11,1 + addi 11,11,32 + lvx 26,10,1 + addi 10,10,32 + lvx 27,11,1 + addi 11,11,32 + lvx 28,10,1 + addi 10,10,32 + lvx 29,11,1 + addi 11,11,32 + lvx 30,10,1 + lvx 31,11,1 + addi 1,1,256 + blr +.long 0 +.byte 0,12,0x04,0,0x80,0,4,0 +.long 0 +.size gcm_ghash_p8,.-gcm_ghash_p8 + +.byte 71,72,65,83,72,32,102,111,114,32,80,111,119,101,114,73,83,65,32,50,46,48,55,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM && __powerpc64__ +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86/crypto/chacha/chacha-x86.S b/packager/third_party/boringssl/linux-x86/crypto/chacha/chacha-x86.S index 519081bb98..feceb5d9f8 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/chacha/chacha-x86.S +++ b/packager/third_party/boringssl/linux-x86/crypto/chacha/chacha-x86.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl 
script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl ChaCha20_ctr32 .hidden ChaCha20_ctr32 @@ -966,3 +972,4 @@ ChaCha20_ssse3: .byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 .byte 114,103,62,0 #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/aes-586.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/aes-586.S index 319ed627f5..e43cfea942 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/aes-586.S +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/aes-586.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .hidden _x86_AES_encrypt_compact .type _x86_AES_encrypt_compact,@function @@ -979,12 +985,12 @@ _x86_AES_encrypt: .long 27,54,0,0 .long 0,0,0,0 .size _x86_AES_encrypt,.-_x86_AES_encrypt -.globl asm_AES_encrypt -.hidden asm_AES_encrypt -.type asm_AES_encrypt,@function +.globl aes_nohw_encrypt +.hidden aes_nohw_encrypt +.type aes_nohw_encrypt,@function .align 16 -asm_AES_encrypt: -.L_asm_AES_encrypt_begin: +aes_nohw_encrypt: +.L_aes_nohw_encrypt_begin: pushl %ebp pushl %ebx pushl %esi @@ -1044,7 +1050,7 @@ asm_AES_encrypt: popl %ebx popl %ebp ret -.size asm_AES_encrypt,.-.L_asm_AES_encrypt_begin +.size aes_nohw_encrypt,.-.L_aes_nohw_encrypt_begin .hidden _x86_AES_decrypt_compact .type _x86_AES_decrypt_compact,@function .align 16 @@ -2175,12 +2181,12 @@ _x86_AES_decrypt: .byte 23,43,4,126,186,119,214,38 .byte 225,105,20,99,85,33,12,125 .size _x86_AES_decrypt,.-_x86_AES_decrypt -.globl asm_AES_decrypt -.hidden asm_AES_decrypt -.type asm_AES_decrypt,@function +.globl aes_nohw_decrypt +.hidden aes_nohw_decrypt +.type aes_nohw_decrypt,@function .align 16 -asm_AES_decrypt: -.L_asm_AES_decrypt_begin: +aes_nohw_decrypt: +.L_aes_nohw_decrypt_begin: pushl %ebp pushl %ebx pushl %esi @@ -2240,13 +2246,13 @@ asm_AES_decrypt: popl %ebx popl %ebp ret -.size asm_AES_decrypt,.-.L_asm_AES_decrypt_begin -.globl asm_AES_cbc_encrypt -.hidden asm_AES_cbc_encrypt -.type asm_AES_cbc_encrypt,@function +.size aes_nohw_decrypt,.-.L_aes_nohw_decrypt_begin +.globl aes_nohw_cbc_encrypt +.hidden aes_nohw_cbc_encrypt +.type aes_nohw_cbc_encrypt,@function .align 16 -asm_AES_cbc_encrypt: -.L_asm_AES_cbc_encrypt_begin: +aes_nohw_cbc_encrypt: +.L_aes_nohw_cbc_encrypt_begin: pushl %ebp pushl %ebx pushl %esi @@ -2774,7 +2780,7 @@ asm_AES_cbc_encrypt: popl %ebx popl %ebp ret -.size asm_AES_cbc_encrypt,.-.L_asm_AES_cbc_encrypt_begin +.size aes_nohw_cbc_encrypt,.-.L_aes_nohw_cbc_encrypt_begin .hidden _x86_AES_set_encrypt_key .type _x86_AES_set_encrypt_key,@function .align 16 @@ -3006,21 +3012,21 @@ _x86_AES_set_encrypt_key: popl %ebp ret .size _x86_AES_set_encrypt_key,.-_x86_AES_set_encrypt_key -.globl asm_AES_set_encrypt_key -.hidden asm_AES_set_encrypt_key -.type asm_AES_set_encrypt_key,@function +.globl aes_nohw_set_encrypt_key +.hidden aes_nohw_set_encrypt_key +.type aes_nohw_set_encrypt_key,@function .align 16 -asm_AES_set_encrypt_key: -.L_asm_AES_set_encrypt_key_begin: +aes_nohw_set_encrypt_key: +.L_aes_nohw_set_encrypt_key_begin: call _x86_AES_set_encrypt_key ret -.size asm_AES_set_encrypt_key,.-.L_asm_AES_set_encrypt_key_begin -.globl asm_AES_set_decrypt_key -.hidden asm_AES_set_decrypt_key -.type asm_AES_set_decrypt_key,@function +.size 
aes_nohw_set_encrypt_key,.-.L_aes_nohw_set_encrypt_key_begin +.globl aes_nohw_set_decrypt_key +.hidden aes_nohw_set_decrypt_key +.type aes_nohw_set_decrypt_key,@function .align 16 -asm_AES_set_decrypt_key: -.L_asm_AES_set_decrypt_key_begin: +aes_nohw_set_decrypt_key: +.L_aes_nohw_set_decrypt_key_begin: call _x86_AES_set_encrypt_key cmpl $0,%eax je .L054proceed @@ -3249,8 +3255,9 @@ asm_AES_set_decrypt_key: popl %ebx popl %ebp ret -.size asm_AES_set_decrypt_key,.-.L_asm_AES_set_decrypt_key_begin +.size aes_nohw_set_decrypt_key,.-.L_aes_nohw_set_decrypt_key_begin .byte 65,69,83,32,102,111,114,32,120,56,54,44,32,67,82,89 .byte 80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114 .byte 111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/aesni-x86.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/aesni-x86.S index cc53fa46df..a418869701 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/aesni-x86.S +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/aesni-x86.S @@ -1,11 +1,31 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text -.globl aesni_encrypt -.hidden aesni_encrypt -.type aesni_encrypt,@function +#ifdef BORINGSSL_DISPATCH_TEST +#endif +.globl aes_hw_encrypt +.hidden aes_hw_encrypt +.type aes_hw_encrypt,@function .align 16 -aesni_encrypt: -.L_aesni_encrypt_begin: +aes_hw_encrypt: +.L_aes_hw_encrypt_begin: +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call .L000pic +.L000pic: + popl %ebx + leal BORINGSSL_function_hit+1-.L000pic(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif movl 4(%esp),%eax movl 12(%esp),%edx movups (%eax),%xmm2 @@ -15,25 +35,25 @@ aesni_encrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L000enc1_loop_1: +.L001enc1_loop_1: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L000enc1_loop_1 + jnz .L001enc1_loop_1 .byte 102,15,56,221,209 pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 movups %xmm2,(%eax) pxor %xmm2,%xmm2 ret -.size aesni_encrypt,.-.L_aesni_encrypt_begin -.globl aesni_decrypt -.hidden aesni_decrypt -.type aesni_decrypt,@function +.size aes_hw_encrypt,.-.L_aes_hw_encrypt_begin +.globl aes_hw_decrypt +.hidden aes_hw_decrypt +.type aes_hw_decrypt,@function .align 16 -aesni_decrypt: -.L_aesni_decrypt_begin: +aes_hw_decrypt: +.L_aes_hw_decrypt_begin: movl 4(%esp),%eax movl 12(%esp),%edx movups (%eax),%xmm2 @@ -43,19 +63,19 @@ aesni_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L001dec1_loop_2: +.L002dec1_loop_2: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L001dec1_loop_2 + jnz .L002dec1_loop_2 .byte 102,15,56,223,209 pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 movups %xmm2,(%eax) pxor %xmm2,%xmm2 ret -.size aesni_decrypt,.-.L_aesni_decrypt_begin +.size aes_hw_decrypt,.-.L_aes_hw_decrypt_begin .hidden _aesni_encrypt2 .type _aesni_encrypt2,@function .align 16 @@ -69,7 +89,7 @@ _aesni_encrypt2: leal 32(%edx,%ecx,1),%edx negl %ecx addl $16,%ecx -.L002enc2_loop: +.L003enc2_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 movups (%edx,%ecx,1),%xmm1 @@ -77,7 +97,7 @@ _aesni_encrypt2: .byte 102,15,56,220,208 .byte 102,15,56,220,216 movups -16(%edx,%ecx,1),%xmm0 - jnz .L002enc2_loop + jnz .L003enc2_loop .byte 102,15,56,220,209 .byte 
102,15,56,220,217 .byte 102,15,56,221,208 @@ -97,7 +117,7 @@ _aesni_decrypt2: leal 32(%edx,%ecx,1),%edx negl %ecx addl $16,%ecx -.L003dec2_loop: +.L004dec2_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 movups (%edx,%ecx,1),%xmm1 @@ -105,7 +125,7 @@ _aesni_decrypt2: .byte 102,15,56,222,208 .byte 102,15,56,222,216 movups -16(%edx,%ecx,1),%xmm0 - jnz .L003dec2_loop + jnz .L004dec2_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,223,208 @@ -126,7 +146,7 @@ _aesni_encrypt3: leal 32(%edx,%ecx,1),%edx negl %ecx addl $16,%ecx -.L004enc3_loop: +.L005enc3_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -136,7 +156,7 @@ _aesni_encrypt3: .byte 102,15,56,220,216 .byte 102,15,56,220,224 movups -16(%edx,%ecx,1),%xmm0 - jnz .L004enc3_loop + jnz .L005enc3_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -159,7 +179,7 @@ _aesni_decrypt3: leal 32(%edx,%ecx,1),%edx negl %ecx addl $16,%ecx -.L005dec3_loop: +.L006dec3_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -169,7 +189,7 @@ _aesni_decrypt3: .byte 102,15,56,222,216 .byte 102,15,56,222,224 movups -16(%edx,%ecx,1),%xmm0 - jnz .L005dec3_loop + jnz .L006dec3_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -194,7 +214,7 @@ _aesni_encrypt4: negl %ecx .byte 15,31,64,0 addl $16,%ecx -.L006enc4_loop: +.L007enc4_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -206,7 +226,7 @@ _aesni_encrypt4: .byte 102,15,56,220,224 .byte 102,15,56,220,232 movups -16(%edx,%ecx,1),%xmm0 - jnz .L006enc4_loop + jnz .L007enc4_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -233,7 +253,7 @@ _aesni_decrypt4: negl %ecx .byte 15,31,64,0 addl $16,%ecx -.L007dec4_loop: +.L008dec4_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -245,7 +265,7 @@ _aesni_decrypt4: .byte 102,15,56,222,224 .byte 102,15,56,222,232 movups -16(%edx,%ecx,1),%xmm0 - jnz .L007dec4_loop + jnz .L008dec4_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -276,13 +296,13 @@ _aesni_encrypt6: pxor %xmm0,%xmm7 movups (%edx,%ecx,1),%xmm0 addl $16,%ecx - jmp .L008_aesni_encrypt6_inner + jmp .L009_aesni_encrypt6_inner .align 16 -.L009enc6_loop: +.L010enc6_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 -.L008_aesni_encrypt6_inner: +.L009_aesni_encrypt6_inner: .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 @@ -296,7 +316,7 @@ _aesni_encrypt6: .byte 102,15,56,220,240 .byte 102,15,56,220,248 movups -16(%edx,%ecx,1),%xmm0 - jnz .L009enc6_loop + jnz .L010enc6_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -331,13 +351,13 @@ _aesni_decrypt6: pxor %xmm0,%xmm7 movups (%edx,%ecx,1),%xmm0 addl $16,%ecx - jmp .L010_aesni_decrypt6_inner + jmp .L011_aesni_decrypt6_inner .align 16 -.L011dec6_loop: +.L012dec6_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 -.L010_aesni_decrypt6_inner: +.L011_aesni_decrypt6_inner: .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 @@ -351,7 +371,7 @@ _aesni_decrypt6: .byte 102,15,56,222,240 .byte 102,15,56,222,248 movups -16(%edx,%ecx,1),%xmm0 - jnz .L011dec6_loop + jnz .L012dec6_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -366,12 +386,12 @@ _aesni_decrypt6: .byte 102,15,56,223,248 ret .size _aesni_decrypt6,.-_aesni_decrypt6 -.globl aesni_ecb_encrypt -.hidden 
aesni_ecb_encrypt -.type aesni_ecb_encrypt,@function +.globl aes_hw_ecb_encrypt +.hidden aes_hw_ecb_encrypt +.type aes_hw_ecb_encrypt,@function .align 16 -aesni_ecb_encrypt: -.L_aesni_ecb_encrypt_begin: +aes_hw_ecb_encrypt: +.L_aes_hw_ecb_encrypt_begin: pushl %ebp pushl %ebx pushl %esi @@ -382,14 +402,14 @@ aesni_ecb_encrypt: movl 32(%esp),%edx movl 36(%esp),%ebx andl $-16,%eax - jz .L012ecb_ret + jz .L013ecb_ret movl 240(%edx),%ecx testl %ebx,%ebx - jz .L013ecb_decrypt + jz .L014ecb_decrypt movl %edx,%ebp movl %ecx,%ebx cmpl $96,%eax - jb .L014ecb_enc_tail + jb .L015ecb_enc_tail movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -398,9 +418,9 @@ aesni_ecb_encrypt: movdqu 80(%esi),%xmm7 leal 96(%esi),%esi subl $96,%eax - jmp .L015ecb_enc_loop6_enter + jmp .L016ecb_enc_loop6_enter .align 16 -.L016ecb_enc_loop6: +.L017ecb_enc_loop6: movups %xmm2,(%edi) movdqu (%esi),%xmm2 movups %xmm3,16(%edi) @@ -415,12 +435,12 @@ aesni_ecb_encrypt: leal 96(%edi),%edi movdqu 80(%esi),%xmm7 leal 96(%esi),%esi -.L015ecb_enc_loop6_enter: +.L016ecb_enc_loop6_enter: call _aesni_encrypt6 movl %ebp,%edx movl %ebx,%ecx subl $96,%eax - jnc .L016ecb_enc_loop6 + jnc .L017ecb_enc_loop6 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) @@ -429,18 +449,18 @@ aesni_ecb_encrypt: movups %xmm7,80(%edi) leal 96(%edi),%edi addl $96,%eax - jz .L012ecb_ret -.L014ecb_enc_tail: + jz .L013ecb_ret +.L015ecb_enc_tail: movups (%esi),%xmm2 cmpl $32,%eax - jb .L017ecb_enc_one + jb .L018ecb_enc_one movups 16(%esi),%xmm3 - je .L018ecb_enc_two + je .L019ecb_enc_two movups 32(%esi),%xmm4 cmpl $64,%eax - jb .L019ecb_enc_three + jb .L020ecb_enc_three movups 48(%esi),%xmm5 - je .L020ecb_enc_four + je .L021ecb_enc_four movups 64(%esi),%xmm6 xorps %xmm7,%xmm7 call _aesni_encrypt6 @@ -449,49 +469,49 @@ aesni_ecb_encrypt: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp .L012ecb_ret + jmp .L013ecb_ret .align 16 -.L017ecb_enc_one: +.L018ecb_enc_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L021enc1_loop_3: +.L022enc1_loop_3: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L021enc1_loop_3 + jnz .L022enc1_loop_3 .byte 102,15,56,221,209 movups %xmm2,(%edi) - jmp .L012ecb_ret + jmp .L013ecb_ret .align 16 -.L018ecb_enc_two: +.L019ecb_enc_two: call _aesni_encrypt2 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp .L012ecb_ret + jmp .L013ecb_ret .align 16 -.L019ecb_enc_three: +.L020ecb_enc_three: call _aesni_encrypt3 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp .L012ecb_ret + jmp .L013ecb_ret .align 16 -.L020ecb_enc_four: +.L021ecb_enc_four: call _aesni_encrypt4 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) movups %xmm5,48(%edi) - jmp .L012ecb_ret + jmp .L013ecb_ret .align 16 -.L013ecb_decrypt: +.L014ecb_decrypt: movl %edx,%ebp movl %ecx,%ebx cmpl $96,%eax - jb .L022ecb_dec_tail + jb .L023ecb_dec_tail movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -500,9 +520,9 @@ aesni_ecb_encrypt: movdqu 80(%esi),%xmm7 leal 96(%esi),%esi subl $96,%eax - jmp .L023ecb_dec_loop6_enter + jmp .L024ecb_dec_loop6_enter .align 16 -.L024ecb_dec_loop6: +.L025ecb_dec_loop6: movups %xmm2,(%edi) movdqu (%esi),%xmm2 movups %xmm3,16(%edi) @@ -517,12 +537,12 @@ aesni_ecb_encrypt: leal 96(%edi),%edi movdqu 80(%esi),%xmm7 leal 96(%esi),%esi -.L023ecb_dec_loop6_enter: +.L024ecb_dec_loop6_enter: call _aesni_decrypt6 movl %ebp,%edx movl %ebx,%ecx subl $96,%eax - jnc .L024ecb_dec_loop6 + jnc 
.L025ecb_dec_loop6 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) @@ -531,18 +551,18 @@ aesni_ecb_encrypt: movups %xmm7,80(%edi) leal 96(%edi),%edi addl $96,%eax - jz .L012ecb_ret -.L022ecb_dec_tail: + jz .L013ecb_ret +.L023ecb_dec_tail: movups (%esi),%xmm2 cmpl $32,%eax - jb .L025ecb_dec_one + jb .L026ecb_dec_one movups 16(%esi),%xmm3 - je .L026ecb_dec_two + je .L027ecb_dec_two movups 32(%esi),%xmm4 cmpl $64,%eax - jb .L027ecb_dec_three + jb .L028ecb_dec_three movups 48(%esi),%xmm5 - je .L028ecb_dec_four + je .L029ecb_dec_four movups 64(%esi),%xmm6 xorps %xmm7,%xmm7 call _aesni_decrypt6 @@ -551,43 +571,43 @@ aesni_ecb_encrypt: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp .L012ecb_ret + jmp .L013ecb_ret .align 16 -.L025ecb_dec_one: +.L026ecb_dec_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L029dec1_loop_4: +.L030dec1_loop_4: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L029dec1_loop_4 + jnz .L030dec1_loop_4 .byte 102,15,56,223,209 movups %xmm2,(%edi) - jmp .L012ecb_ret + jmp .L013ecb_ret .align 16 -.L026ecb_dec_two: +.L027ecb_dec_two: call _aesni_decrypt2 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp .L012ecb_ret + jmp .L013ecb_ret .align 16 -.L027ecb_dec_three: +.L028ecb_dec_three: call _aesni_decrypt3 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp .L012ecb_ret + jmp .L013ecb_ret .align 16 -.L028ecb_dec_four: +.L029ecb_dec_four: call _aesni_decrypt4 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) movups %xmm5,48(%edi) -.L012ecb_ret: +.L013ecb_ret: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 @@ -601,13 +621,13 @@ aesni_ecb_encrypt: popl %ebx popl %ebp ret -.size aesni_ecb_encrypt,.-.L_aesni_ecb_encrypt_begin -.globl aesni_ccm64_encrypt_blocks -.hidden aesni_ccm64_encrypt_blocks -.type aesni_ccm64_encrypt_blocks,@function +.size aes_hw_ecb_encrypt,.-.L_aes_hw_ecb_encrypt_begin +.globl aes_hw_ccm64_encrypt_blocks +.hidden aes_hw_ccm64_encrypt_blocks +.type aes_hw_ccm64_encrypt_blocks,@function .align 16 -aesni_ccm64_encrypt_blocks: -.L_aesni_ccm64_encrypt_blocks_begin: +aes_hw_ccm64_encrypt_blocks: +.L_aes_hw_ccm64_encrypt_blocks_begin: pushl %ebp pushl %ebx pushl %esi @@ -643,7 +663,7 @@ aesni_ccm64_encrypt_blocks: leal 32(%edx,%ecx,1),%edx subl %ecx,%ebx .byte 102,15,56,0,253 -.L030ccm64_enc_outer: +.L031ccm64_enc_outer: movups (%ebp),%xmm0 movl %ebx,%ecx movups (%esi),%xmm6 @@ -652,7 +672,7 @@ aesni_ccm64_encrypt_blocks: xorps %xmm6,%xmm0 xorps %xmm0,%xmm3 movups 32(%ebp),%xmm0 -.L031ccm64_enc2_loop: +.L032ccm64_enc2_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 movups (%edx,%ecx,1),%xmm1 @@ -660,7 +680,7 @@ aesni_ccm64_encrypt_blocks: .byte 102,15,56,220,208 .byte 102,15,56,220,216 movups -16(%edx,%ecx,1),%xmm0 - jnz .L031ccm64_enc2_loop + jnz .L032ccm64_enc2_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 paddq 16(%esp),%xmm7 @@ -673,7 +693,7 @@ aesni_ccm64_encrypt_blocks: movups %xmm6,(%edi) .byte 102,15,56,0,213 leal 16(%edi),%edi - jnz .L030ccm64_enc_outer + jnz .L031ccm64_enc_outer movl 48(%esp),%esp movl 40(%esp),%edi movups %xmm3,(%edi) @@ -690,13 +710,13 @@ aesni_ccm64_encrypt_blocks: popl %ebx popl %ebp ret -.size aesni_ccm64_encrypt_blocks,.-.L_aesni_ccm64_encrypt_blocks_begin -.globl aesni_ccm64_decrypt_blocks -.hidden aesni_ccm64_decrypt_blocks -.type aesni_ccm64_decrypt_blocks,@function +.size aes_hw_ccm64_encrypt_blocks,.-.L_aes_hw_ccm64_encrypt_blocks_begin +.globl aes_hw_ccm64_decrypt_blocks 
+.hidden aes_hw_ccm64_decrypt_blocks +.type aes_hw_ccm64_decrypt_blocks,@function .align 16 -aesni_ccm64_decrypt_blocks: -.L_aesni_ccm64_decrypt_blocks_begin: +aes_hw_ccm64_decrypt_blocks: +.L_aes_hw_ccm64_decrypt_blocks_begin: pushl %ebp pushl %ebx pushl %esi @@ -733,12 +753,12 @@ aesni_ccm64_decrypt_blocks: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L032enc1_loop_5: +.L033enc1_loop_5: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L032enc1_loop_5 + jnz .L033enc1_loop_5 .byte 102,15,56,221,209 shll $4,%ebx movl $16,%ecx @@ -748,16 +768,16 @@ aesni_ccm64_decrypt_blocks: subl %ebx,%ecx leal 32(%ebp,%ebx,1),%edx movl %ecx,%ebx - jmp .L033ccm64_dec_outer + jmp .L034ccm64_dec_outer .align 16 -.L033ccm64_dec_outer: +.L034ccm64_dec_outer: xorps %xmm2,%xmm6 movdqa %xmm7,%xmm2 movups %xmm6,(%edi) leal 16(%edi),%edi .byte 102,15,56,0,213 subl $1,%eax - jz .L034ccm64_dec_break + jz .L035ccm64_dec_break movups (%ebp),%xmm0 movl %ebx,%ecx movups 16(%ebp),%xmm1 @@ -765,7 +785,7 @@ aesni_ccm64_decrypt_blocks: xorps %xmm0,%xmm2 xorps %xmm6,%xmm3 movups 32(%ebp),%xmm0 -.L035ccm64_dec2_loop: +.L036ccm64_dec2_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 movups (%edx,%ecx,1),%xmm1 @@ -773,7 +793,7 @@ aesni_ccm64_decrypt_blocks: .byte 102,15,56,220,208 .byte 102,15,56,220,216 movups -16(%edx,%ecx,1),%xmm0 - jnz .L035ccm64_dec2_loop + jnz .L036ccm64_dec2_loop movups (%esi),%xmm6 paddq 16(%esp),%xmm7 .byte 102,15,56,220,209 @@ -781,9 +801,9 @@ aesni_ccm64_decrypt_blocks: .byte 102,15,56,221,208 .byte 102,15,56,221,216 leal 16(%esi),%esi - jmp .L033ccm64_dec_outer + jmp .L034ccm64_dec_outer .align 16 -.L034ccm64_dec_break: +.L035ccm64_dec_break: movl 240(%ebp),%ecx movl %ebp,%edx movups (%edx),%xmm0 @@ -791,12 +811,12 @@ aesni_ccm64_decrypt_blocks: xorps %xmm0,%xmm6 leal 32(%edx),%edx xorps %xmm6,%xmm3 -.L036enc1_loop_6: +.L037enc1_loop_6: .byte 102,15,56,220,217 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L036enc1_loop_6 + jnz .L037enc1_loop_6 .byte 102,15,56,221,217 movl 48(%esp),%esp movl 40(%esp),%edi @@ -814,17 +834,29 @@ aesni_ccm64_decrypt_blocks: popl %ebx popl %ebp ret -.size aesni_ccm64_decrypt_blocks,.-.L_aesni_ccm64_decrypt_blocks_begin -.globl aesni_ctr32_encrypt_blocks -.hidden aesni_ctr32_encrypt_blocks -.type aesni_ctr32_encrypt_blocks,@function +.size aes_hw_ccm64_decrypt_blocks,.-.L_aes_hw_ccm64_decrypt_blocks_begin +.globl aes_hw_ctr32_encrypt_blocks +.hidden aes_hw_ctr32_encrypt_blocks +.type aes_hw_ctr32_encrypt_blocks,@function .align 16 -aesni_ctr32_encrypt_blocks: -.L_aesni_ctr32_encrypt_blocks_begin: +aes_hw_ctr32_encrypt_blocks: +.L_aes_hw_ctr32_encrypt_blocks_begin: pushl %ebp pushl %ebx pushl %esi pushl %edi +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call .L038pic +.L038pic: + popl %ebx + leal BORINGSSL_function_hit+0-.L038pic(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif movl 20(%esp),%esi movl 24(%esp),%edi movl 28(%esp),%eax @@ -835,7 +867,7 @@ aesni_ctr32_encrypt_blocks: andl $-16,%esp movl %ebp,80(%esp) cmpl $1,%eax - je .L037ctr32_one_shortcut + je .L039ctr32_one_shortcut movdqu (%ebx),%xmm7 movl $202182159,(%esp) movl $134810123,4(%esp) @@ -873,7 +905,7 @@ aesni_ctr32_encrypt_blocks: pshufd $192,%xmm0,%xmm2 pshufd $128,%xmm0,%xmm3 cmpl $6,%eax - jb .L038ctr32_tail + jb .L040ctr32_tail pxor %xmm6,%xmm7 shll $4,%ecx movl $16,%ebx @@ -882,9 +914,9 @@ aesni_ctr32_encrypt_blocks: subl %ecx,%ebx leal 32(%edx,%ecx,1),%edx subl $6,%eax - jmp .L039ctr32_loop6 + jmp 
.L041ctr32_loop6 .align 16 -.L039ctr32_loop6: +.L041ctr32_loop6: pshufd $64,%xmm0,%xmm4 movdqa 32(%esp),%xmm0 pshufd $192,%xmm1,%xmm5 @@ -938,27 +970,27 @@ aesni_ctr32_encrypt_blocks: leal 96(%edi),%edi pshufd $128,%xmm0,%xmm3 subl $6,%eax - jnc .L039ctr32_loop6 + jnc .L041ctr32_loop6 addl $6,%eax - jz .L040ctr32_ret + jz .L042ctr32_ret movdqu (%ebp),%xmm7 movl %ebp,%edx pxor 32(%esp),%xmm7 movl 240(%ebp),%ecx -.L038ctr32_tail: +.L040ctr32_tail: por %xmm7,%xmm2 cmpl $2,%eax - jb .L041ctr32_one + jb .L043ctr32_one pshufd $64,%xmm0,%xmm4 por %xmm7,%xmm3 - je .L042ctr32_two + je .L044ctr32_two pshufd $192,%xmm1,%xmm5 por %xmm7,%xmm4 cmpl $4,%eax - jb .L043ctr32_three + jb .L045ctr32_three pshufd $128,%xmm1,%xmm6 por %xmm7,%xmm5 - je .L044ctr32_four + je .L046ctr32_four por %xmm7,%xmm6 call _aesni_encrypt6 movups (%esi),%xmm1 @@ -976,29 +1008,29 @@ aesni_ctr32_encrypt_blocks: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp .L040ctr32_ret + jmp .L042ctr32_ret .align 16 -.L037ctr32_one_shortcut: +.L039ctr32_one_shortcut: movups (%ebx),%xmm2 movl 240(%edx),%ecx -.L041ctr32_one: +.L043ctr32_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L045enc1_loop_7: +.L047enc1_loop_7: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L045enc1_loop_7 + jnz .L047enc1_loop_7 .byte 102,15,56,221,209 movups (%esi),%xmm6 xorps %xmm2,%xmm6 movups %xmm6,(%edi) - jmp .L040ctr32_ret + jmp .L042ctr32_ret .align 16 -.L042ctr32_two: +.L044ctr32_two: call _aesni_encrypt2 movups (%esi),%xmm5 movups 16(%esi),%xmm6 @@ -1006,9 +1038,9 @@ aesni_ctr32_encrypt_blocks: xorps %xmm6,%xmm3 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp .L040ctr32_ret + jmp .L042ctr32_ret .align 16 -.L043ctr32_three: +.L045ctr32_three: call _aesni_encrypt3 movups (%esi),%xmm5 movups 16(%esi),%xmm6 @@ -1019,9 +1051,9 @@ aesni_ctr32_encrypt_blocks: xorps %xmm7,%xmm4 movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp .L040ctr32_ret + jmp .L042ctr32_ret .align 16 -.L044ctr32_four: +.L046ctr32_four: call _aesni_encrypt4 movups (%esi),%xmm6 movups 16(%esi),%xmm7 @@ -1035,7 +1067,7 @@ aesni_ctr32_encrypt_blocks: xorps %xmm0,%xmm5 movups %xmm4,32(%edi) movups %xmm5,48(%edi) -.L040ctr32_ret: +.L042ctr32_ret: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 @@ -1053,13 +1085,13 @@ aesni_ctr32_encrypt_blocks: popl %ebx popl %ebp ret -.size aesni_ctr32_encrypt_blocks,.-.L_aesni_ctr32_encrypt_blocks_begin -.globl aesni_xts_encrypt -.hidden aesni_xts_encrypt -.type aesni_xts_encrypt,@function +.size aes_hw_ctr32_encrypt_blocks,.-.L_aes_hw_ctr32_encrypt_blocks_begin +.globl aes_hw_xts_encrypt +.hidden aes_hw_xts_encrypt +.type aes_hw_xts_encrypt,@function .align 16 -aesni_xts_encrypt: -.L_aesni_xts_encrypt_begin: +aes_hw_xts_encrypt: +.L_aes_hw_xts_encrypt_begin: pushl %ebp pushl %ebx pushl %esi @@ -1072,12 +1104,12 @@ aesni_xts_encrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L046enc1_loop_8: +.L048enc1_loop_8: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L046enc1_loop_8 + jnz .L048enc1_loop_8 .byte 102,15,56,221,209 movl 20(%esp),%esi movl 24(%esp),%edi @@ -1101,14 +1133,14 @@ aesni_xts_encrypt: movl %edx,%ebp movl %ecx,%ebx subl $96,%eax - jc .L047xts_enc_short + jc .L049xts_enc_short shll $4,%ecx movl $16,%ebx subl %ecx,%ebx leal 32(%edx,%ecx,1),%edx - jmp .L048xts_enc_loop6 + jmp .L050xts_enc_loop6 .align 16 -.L048xts_enc_loop6: +.L050xts_enc_loop6: pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,(%esp) @@ 
-1197,23 +1229,23 @@ aesni_xts_encrypt: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 subl $96,%eax - jnc .L048xts_enc_loop6 + jnc .L050xts_enc_loop6 movl 240(%ebp),%ecx movl %ebp,%edx movl %ecx,%ebx -.L047xts_enc_short: +.L049xts_enc_short: addl $96,%eax - jz .L049xts_enc_done6x + jz .L051xts_enc_done6x movdqa %xmm1,%xmm5 cmpl $32,%eax - jb .L050xts_enc_one + jb .L052xts_enc_one pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 - je .L051xts_enc_two + je .L053xts_enc_two pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm6 @@ -1222,7 +1254,7 @@ aesni_xts_encrypt: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 cmpl $64,%eax - jb .L052xts_enc_three + jb .L054xts_enc_three pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm7 @@ -1232,7 +1264,7 @@ aesni_xts_encrypt: pxor %xmm2,%xmm1 movdqa %xmm5,(%esp) movdqa %xmm6,16(%esp) - je .L053xts_enc_four + je .L055xts_enc_four movdqa %xmm7,32(%esp) pshufd $19,%xmm0,%xmm7 movdqa %xmm1,48(%esp) @@ -1264,9 +1296,9 @@ aesni_xts_encrypt: movups %xmm5,48(%edi) movups %xmm6,64(%edi) leal 80(%edi),%edi - jmp .L054xts_enc_done + jmp .L056xts_enc_done .align 16 -.L050xts_enc_one: +.L052xts_enc_one: movups (%esi),%xmm2 leal 16(%esi),%esi xorps %xmm5,%xmm2 @@ -1274,20 +1306,20 @@ aesni_xts_encrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L055enc1_loop_9: +.L057enc1_loop_9: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L055enc1_loop_9 + jnz .L057enc1_loop_9 .byte 102,15,56,221,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) leal 16(%edi),%edi movdqa %xmm5,%xmm1 - jmp .L054xts_enc_done + jmp .L056xts_enc_done .align 16 -.L051xts_enc_two: +.L053xts_enc_two: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1301,9 +1333,9 @@ aesni_xts_encrypt: movups %xmm3,16(%edi) leal 32(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L054xts_enc_done + jmp .L056xts_enc_done .align 16 -.L052xts_enc_three: +.L054xts_enc_three: movaps %xmm1,%xmm7 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1321,9 +1353,9 @@ aesni_xts_encrypt: movups %xmm4,32(%edi) leal 48(%edi),%edi movdqa %xmm7,%xmm1 - jmp .L054xts_enc_done + jmp .L056xts_enc_done .align 16 -.L053xts_enc_four: +.L055xts_enc_four: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1345,28 +1377,28 @@ aesni_xts_encrypt: movups %xmm5,48(%edi) leal 64(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L054xts_enc_done + jmp .L056xts_enc_done .align 16 -.L049xts_enc_done6x: +.L051xts_enc_done6x: movl 112(%esp),%eax andl $15,%eax - jz .L056xts_enc_ret + jz .L058xts_enc_ret movdqa %xmm1,%xmm5 movl %eax,112(%esp) - jmp .L057xts_enc_steal + jmp .L059xts_enc_steal .align 16 -.L054xts_enc_done: +.L056xts_enc_done: movl 112(%esp),%eax pxor %xmm0,%xmm0 andl $15,%eax - jz .L056xts_enc_ret + jz .L058xts_enc_ret pcmpgtd %xmm1,%xmm0 movl %eax,112(%esp) pshufd $19,%xmm0,%xmm5 paddq %xmm1,%xmm1 pand 96(%esp),%xmm5 pxor %xmm1,%xmm5 -.L057xts_enc_steal: +.L059xts_enc_steal: movzbl (%esi),%ecx movzbl -16(%edi),%edx leal 1(%esi),%esi @@ -1374,7 +1406,7 @@ aesni_xts_encrypt: movb %dl,(%edi) leal 1(%edi),%edi subl $1,%eax - jnz .L057xts_enc_steal + jnz .L059xts_enc_steal subl 112(%esp),%edi movl %ebp,%edx movl %ebx,%ecx @@ -1384,16 +1416,16 @@ aesni_xts_encrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L058enc1_loop_10: +.L060enc1_loop_10: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L058enc1_loop_10 + jnz .L060enc1_loop_10 .byte 102,15,56,221,209 xorps %xmm5,%xmm2 movups %xmm2,-16(%edi) 
-.L056xts_enc_ret: +.L058xts_enc_ret: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 @@ -1414,13 +1446,13 @@ aesni_xts_encrypt: popl %ebx popl %ebp ret -.size aesni_xts_encrypt,.-.L_aesni_xts_encrypt_begin -.globl aesni_xts_decrypt -.hidden aesni_xts_decrypt -.type aesni_xts_decrypt,@function +.size aes_hw_xts_encrypt,.-.L_aes_hw_xts_encrypt_begin +.globl aes_hw_xts_decrypt +.hidden aes_hw_xts_decrypt +.type aes_hw_xts_decrypt,@function .align 16 -aesni_xts_decrypt: -.L_aesni_xts_decrypt_begin: +aes_hw_xts_decrypt: +.L_aes_hw_xts_decrypt_begin: pushl %ebp pushl %ebx pushl %esi @@ -1433,12 +1465,12 @@ aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L059enc1_loop_11: +.L061enc1_loop_11: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L059enc1_loop_11 + jnz .L061enc1_loop_11 .byte 102,15,56,221,209 movl 20(%esp),%esi movl 24(%esp),%edi @@ -1467,14 +1499,14 @@ aesni_xts_decrypt: pcmpgtd %xmm1,%xmm0 andl $-16,%eax subl $96,%eax - jc .L060xts_dec_short + jc .L062xts_dec_short shll $4,%ecx movl $16,%ebx subl %ecx,%ebx leal 32(%edx,%ecx,1),%edx - jmp .L061xts_dec_loop6 + jmp .L063xts_dec_loop6 .align 16 -.L061xts_dec_loop6: +.L063xts_dec_loop6: pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,(%esp) @@ -1563,23 +1595,23 @@ aesni_xts_decrypt: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 subl $96,%eax - jnc .L061xts_dec_loop6 + jnc .L063xts_dec_loop6 movl 240(%ebp),%ecx movl %ebp,%edx movl %ecx,%ebx -.L060xts_dec_short: +.L062xts_dec_short: addl $96,%eax - jz .L062xts_dec_done6x + jz .L064xts_dec_done6x movdqa %xmm1,%xmm5 cmpl $32,%eax - jb .L063xts_dec_one + jb .L065xts_dec_one pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 - je .L064xts_dec_two + je .L066xts_dec_two pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm6 @@ -1588,7 +1620,7 @@ aesni_xts_decrypt: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 cmpl $64,%eax - jb .L065xts_dec_three + jb .L067xts_dec_three pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm7 @@ -1598,7 +1630,7 @@ aesni_xts_decrypt: pxor %xmm2,%xmm1 movdqa %xmm5,(%esp) movdqa %xmm6,16(%esp) - je .L066xts_dec_four + je .L068xts_dec_four movdqa %xmm7,32(%esp) pshufd $19,%xmm0,%xmm7 movdqa %xmm1,48(%esp) @@ -1630,9 +1662,9 @@ aesni_xts_decrypt: movups %xmm5,48(%edi) movups %xmm6,64(%edi) leal 80(%edi),%edi - jmp .L067xts_dec_done + jmp .L069xts_dec_done .align 16 -.L063xts_dec_one: +.L065xts_dec_one: movups (%esi),%xmm2 leal 16(%esi),%esi xorps %xmm5,%xmm2 @@ -1640,20 +1672,20 @@ aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L068dec1_loop_12: +.L070dec1_loop_12: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L068dec1_loop_12 + jnz .L070dec1_loop_12 .byte 102,15,56,223,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) leal 16(%edi),%edi movdqa %xmm5,%xmm1 - jmp .L067xts_dec_done + jmp .L069xts_dec_done .align 16 -.L064xts_dec_two: +.L066xts_dec_two: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1667,9 +1699,9 @@ aesni_xts_decrypt: movups %xmm3,16(%edi) leal 32(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L067xts_dec_done + jmp .L069xts_dec_done .align 16 -.L065xts_dec_three: +.L067xts_dec_three: movaps %xmm1,%xmm7 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1687,9 +1719,9 @@ aesni_xts_decrypt: movups %xmm4,32(%edi) leal 48(%edi),%edi movdqa %xmm7,%xmm1 - jmp .L067xts_dec_done + jmp .L069xts_dec_done .align 16 -.L066xts_dec_four: +.L068xts_dec_four: movaps %xmm1,%xmm6 movups 
(%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1711,20 +1743,20 @@ aesni_xts_decrypt: movups %xmm5,48(%edi) leal 64(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L067xts_dec_done + jmp .L069xts_dec_done .align 16 -.L062xts_dec_done6x: +.L064xts_dec_done6x: movl 112(%esp),%eax andl $15,%eax - jz .L069xts_dec_ret + jz .L071xts_dec_ret movl %eax,112(%esp) - jmp .L070xts_dec_only_one_more + jmp .L072xts_dec_only_one_more .align 16 -.L067xts_dec_done: +.L069xts_dec_done: movl 112(%esp),%eax pxor %xmm0,%xmm0 andl $15,%eax - jz .L069xts_dec_ret + jz .L071xts_dec_ret pcmpgtd %xmm1,%xmm0 movl %eax,112(%esp) pshufd $19,%xmm0,%xmm2 @@ -1734,7 +1766,7 @@ aesni_xts_decrypt: pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 -.L070xts_dec_only_one_more: +.L072xts_dec_only_one_more: pshufd $19,%xmm0,%xmm5 movdqa %xmm1,%xmm6 paddq %xmm1,%xmm1 @@ -1748,16 +1780,16 @@ aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L071dec1_loop_13: +.L073dec1_loop_13: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L071dec1_loop_13 + jnz .L073dec1_loop_13 .byte 102,15,56,223,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) -.L072xts_dec_steal: +.L074xts_dec_steal: movzbl 16(%esi),%ecx movzbl (%edi),%edx leal 1(%esi),%esi @@ -1765,7 +1797,7 @@ aesni_xts_decrypt: movb %dl,16(%edi) leal 1(%edi),%edi subl $1,%eax - jnz .L072xts_dec_steal + jnz .L074xts_dec_steal subl 112(%esp),%edi movl %ebp,%edx movl %ebx,%ecx @@ -1775,16 +1807,16 @@ aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L073dec1_loop_14: +.L075dec1_loop_14: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L073dec1_loop_14 + jnz .L075dec1_loop_14 .byte 102,15,56,223,209 xorps %xmm6,%xmm2 movups %xmm2,(%edi) -.L069xts_dec_ret: +.L071xts_dec_ret: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 @@ -1805,13 +1837,13 @@ aesni_xts_decrypt: popl %ebx popl %ebp ret -.size aesni_xts_decrypt,.-.L_aesni_xts_decrypt_begin -.globl aesni_cbc_encrypt -.hidden aesni_cbc_encrypt -.type aesni_cbc_encrypt,@function +.size aes_hw_xts_decrypt,.-.L_aes_hw_xts_decrypt_begin +.globl aes_hw_cbc_encrypt +.hidden aes_hw_cbc_encrypt +.type aes_hw_cbc_encrypt,@function .align 16 -aesni_cbc_encrypt: -.L_aesni_cbc_encrypt_begin: +aes_hw_cbc_encrypt: +.L_aes_hw_cbc_encrypt_begin: pushl %ebp pushl %ebx pushl %esi @@ -1825,7 +1857,7 @@ aesni_cbc_encrypt: movl 32(%esp),%edx movl 36(%esp),%ebp testl %eax,%eax - jz .L074cbc_abort + jz .L076cbc_abort cmpl $0,40(%esp) xchgl %esp,%ebx movups (%ebp),%xmm7 @@ -1833,14 +1865,14 @@ aesni_cbc_encrypt: movl %edx,%ebp movl %ebx,16(%esp) movl %ecx,%ebx - je .L075cbc_decrypt + je .L077cbc_decrypt movaps %xmm7,%xmm2 cmpl $16,%eax - jb .L076cbc_enc_tail + jb .L078cbc_enc_tail subl $16,%eax - jmp .L077cbc_enc_loop + jmp .L079cbc_enc_loop .align 16 -.L077cbc_enc_loop: +.L079cbc_enc_loop: movups (%esi),%xmm7 leal 16(%esi),%esi movups (%edx),%xmm0 @@ -1848,25 +1880,25 @@ aesni_cbc_encrypt: xorps %xmm0,%xmm7 leal 32(%edx),%edx xorps %xmm7,%xmm2 -.L078enc1_loop_15: +.L080enc1_loop_15: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L078enc1_loop_15 + jnz .L080enc1_loop_15 .byte 102,15,56,221,209 movl %ebx,%ecx movl %ebp,%edx movups %xmm2,(%edi) leal 16(%edi),%edi subl $16,%eax - jnc .L077cbc_enc_loop + jnc .L079cbc_enc_loop addl $16,%eax - jnz .L076cbc_enc_tail + jnz .L078cbc_enc_tail movaps %xmm2,%xmm7 pxor %xmm2,%xmm2 - jmp .L079cbc_ret -.L076cbc_enc_tail: + jmp .L081cbc_ret +.L078cbc_enc_tail: movl %eax,%ecx .long 2767451785 movl 
$16,%ecx @@ -1877,20 +1909,20 @@ aesni_cbc_encrypt: movl %ebx,%ecx movl %edi,%esi movl %ebp,%edx - jmp .L077cbc_enc_loop + jmp .L079cbc_enc_loop .align 16 -.L075cbc_decrypt: +.L077cbc_decrypt: cmpl $80,%eax - jbe .L080cbc_dec_tail + jbe .L082cbc_dec_tail movaps %xmm7,(%esp) subl $80,%eax - jmp .L081cbc_dec_loop6_enter + jmp .L083cbc_dec_loop6_enter .align 16 -.L082cbc_dec_loop6: +.L084cbc_dec_loop6: movaps %xmm0,(%esp) movups %xmm7,(%edi) leal 16(%edi),%edi -.L081cbc_dec_loop6_enter: +.L083cbc_dec_loop6_enter: movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -1920,28 +1952,28 @@ aesni_cbc_encrypt: movups %xmm6,64(%edi) leal 80(%edi),%edi subl $96,%eax - ja .L082cbc_dec_loop6 + ja .L084cbc_dec_loop6 movaps %xmm7,%xmm2 movaps %xmm0,%xmm7 addl $80,%eax - jle .L083cbc_dec_clear_tail_collected + jle .L085cbc_dec_clear_tail_collected movups %xmm2,(%edi) leal 16(%edi),%edi -.L080cbc_dec_tail: +.L082cbc_dec_tail: movups (%esi),%xmm2 movaps %xmm2,%xmm6 cmpl $16,%eax - jbe .L084cbc_dec_one + jbe .L086cbc_dec_one movups 16(%esi),%xmm3 movaps %xmm3,%xmm5 cmpl $32,%eax - jbe .L085cbc_dec_two + jbe .L087cbc_dec_two movups 32(%esi),%xmm4 cmpl $48,%eax - jbe .L086cbc_dec_three + jbe .L088cbc_dec_three movups 48(%esi),%xmm5 cmpl $64,%eax - jbe .L087cbc_dec_four + jbe .L089cbc_dec_four movups 64(%esi),%xmm6 movaps %xmm7,(%esp) movups (%esi),%xmm2 @@ -1968,26 +2000,26 @@ aesni_cbc_encrypt: movaps %xmm6,%xmm2 pxor %xmm6,%xmm6 subl $80,%eax - jmp .L088cbc_dec_tail_collected + jmp .L090cbc_dec_tail_collected .align 16 -.L084cbc_dec_one: +.L086cbc_dec_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L089dec1_loop_16: +.L091dec1_loop_16: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L089dec1_loop_16 + jnz .L091dec1_loop_16 .byte 102,15,56,223,209 xorps %xmm7,%xmm2 movaps %xmm6,%xmm7 subl $16,%eax - jmp .L088cbc_dec_tail_collected + jmp .L090cbc_dec_tail_collected .align 16 -.L085cbc_dec_two: +.L087cbc_dec_two: call _aesni_decrypt2 xorps %xmm7,%xmm2 xorps %xmm6,%xmm3 @@ -1997,9 +2029,9 @@ aesni_cbc_encrypt: leal 16(%edi),%edi movaps %xmm5,%xmm7 subl $32,%eax - jmp .L088cbc_dec_tail_collected + jmp .L090cbc_dec_tail_collected .align 16 -.L086cbc_dec_three: +.L088cbc_dec_three: call _aesni_decrypt3 xorps %xmm7,%xmm2 xorps %xmm6,%xmm3 @@ -2012,9 +2044,9 @@ aesni_cbc_encrypt: leal 32(%edi),%edi movups 32(%esi),%xmm7 subl $48,%eax - jmp .L088cbc_dec_tail_collected + jmp .L090cbc_dec_tail_collected .align 16 -.L087cbc_dec_four: +.L089cbc_dec_four: call _aesni_decrypt4 movups 16(%esi),%xmm1 movups 32(%esi),%xmm0 @@ -2032,21 +2064,21 @@ aesni_cbc_encrypt: movaps %xmm5,%xmm2 pxor %xmm5,%xmm5 subl $64,%eax - jmp .L088cbc_dec_tail_collected + jmp .L090cbc_dec_tail_collected .align 16 -.L083cbc_dec_clear_tail_collected: +.L085cbc_dec_clear_tail_collected: pxor %xmm3,%xmm3 pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 pxor %xmm6,%xmm6 -.L088cbc_dec_tail_collected: +.L090cbc_dec_tail_collected: andl $15,%eax - jnz .L090cbc_dec_tail_partial + jnz .L092cbc_dec_tail_partial movups %xmm2,(%edi) pxor %xmm0,%xmm0 - jmp .L079cbc_ret + jmp .L081cbc_ret .align 16 -.L090cbc_dec_tail_partial: +.L092cbc_dec_tail_partial: movaps %xmm2,(%esp) pxor %xmm0,%xmm0 movl $16,%ecx @@ -2054,20 +2086,20 @@ aesni_cbc_encrypt: subl %eax,%ecx .long 2767451785 movdqa %xmm2,(%esp) -.L079cbc_ret: +.L081cbc_ret: movl 16(%esp),%esp movl 36(%esp),%ebp pxor %xmm2,%xmm2 pxor %xmm1,%xmm1 movups %xmm7,(%ebp) pxor %xmm7,%xmm7 -.L074cbc_abort: +.L076cbc_abort: popl %edi popl %esi popl %ebx 
popl %ebp ret -.size aesni_cbc_encrypt,.-.L_aesni_cbc_encrypt_begin +.size aes_hw_cbc_encrypt,.-.L_aes_hw_cbc_encrypt_begin .hidden _aesni_set_encrypt_key .type _aesni_set_encrypt_key,@function .align 16 @@ -2075,13 +2107,13 @@ _aesni_set_encrypt_key: pushl %ebp pushl %ebx testl %eax,%eax - jz .L091bad_pointer + jz .L093bad_pointer testl %edx,%edx - jz .L091bad_pointer - call .L092pic -.L092pic: + jz .L093bad_pointer + call .L094pic +.L094pic: popl %ebx - leal .Lkey_const-.L092pic(%ebx),%ebx + leal .Lkey_const-.L094pic(%ebx),%ebx leal OPENSSL_ia32cap_P-.Lkey_const(%ebx),%ebp movups (%eax),%xmm0 xorps %xmm4,%xmm4 @@ -2089,45 +2121,45 @@ _aesni_set_encrypt_key: leal 16(%edx),%edx andl $268437504,%ebp cmpl $256,%ecx - je .L09314rounds + je .L09514rounds cmpl $192,%ecx - je .L09412rounds + je .L09612rounds cmpl $128,%ecx - jne .L095bad_keybits + jne .L097bad_keybits .align 16 -.L09610rounds: +.L09810rounds: cmpl $268435456,%ebp - je .L09710rounds_alt + je .L09910rounds_alt movl $9,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,200,1 - call .L098key_128_cold + call .L100key_128_cold .byte 102,15,58,223,200,2 - call .L099key_128 + call .L101key_128 .byte 102,15,58,223,200,4 - call .L099key_128 + call .L101key_128 .byte 102,15,58,223,200,8 - call .L099key_128 + call .L101key_128 .byte 102,15,58,223,200,16 - call .L099key_128 + call .L101key_128 .byte 102,15,58,223,200,32 - call .L099key_128 + call .L101key_128 .byte 102,15,58,223,200,64 - call .L099key_128 + call .L101key_128 .byte 102,15,58,223,200,128 - call .L099key_128 + call .L101key_128 .byte 102,15,58,223,200,27 - call .L099key_128 + call .L101key_128 .byte 102,15,58,223,200,54 - call .L099key_128 + call .L101key_128 movups %xmm0,(%edx) movl %ecx,80(%edx) - jmp .L100good_key + jmp .L102good_key .align 16 -.L099key_128: +.L101key_128: movups %xmm0,(%edx) leal 16(%edx),%edx -.L098key_128_cold: +.L100key_128_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -2136,13 +2168,13 @@ _aesni_set_encrypt_key: xorps %xmm1,%xmm0 ret .align 16 -.L09710rounds_alt: +.L09910rounds_alt: movdqa (%ebx),%xmm5 movl $8,%ecx movdqa 32(%ebx),%xmm4 movdqa %xmm0,%xmm2 movdqu %xmm0,-16(%edx) -.L101loop_key128: +.L103loop_key128: .byte 102,15,56,0,197 .byte 102,15,56,221,196 pslld $1,%xmm4 @@ -2158,7 +2190,7 @@ _aesni_set_encrypt_key: movdqu %xmm0,-16(%edx) movdqa %xmm0,%xmm2 decl %ecx - jnz .L101loop_key128 + jnz .L103loop_key128 movdqa 48(%ebx),%xmm4 .byte 102,15,56,0,197 .byte 102,15,56,221,196 @@ -2186,41 +2218,41 @@ _aesni_set_encrypt_key: movdqu %xmm0,16(%edx) movl $9,%ecx movl %ecx,96(%edx) - jmp .L100good_key + jmp .L102good_key .align 16 -.L09412rounds: +.L09612rounds: movq 16(%eax),%xmm2 cmpl $268435456,%ebp - je .L10212rounds_alt + je .L10412rounds_alt movl $11,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,202,1 - call .L103key_192a_cold + call .L105key_192a_cold .byte 102,15,58,223,202,2 - call .L104key_192b + call .L106key_192b .byte 102,15,58,223,202,4 - call .L105key_192a + call .L107key_192a .byte 102,15,58,223,202,8 - call .L104key_192b + call .L106key_192b .byte 102,15,58,223,202,16 - call .L105key_192a + call .L107key_192a .byte 102,15,58,223,202,32 - call .L104key_192b + call .L106key_192b .byte 102,15,58,223,202,64 - call .L105key_192a + call .L107key_192a .byte 102,15,58,223,202,128 - call .L104key_192b + call .L106key_192b movups %xmm0,(%edx) movl %ecx,48(%edx) - jmp .L100good_key + jmp .L102good_key .align 16 -.L105key_192a: +.L107key_192a: movups %xmm0,(%edx) leal 16(%edx),%edx .align 16 -.L103key_192a_cold: 
+.L105key_192a_cold: movaps %xmm2,%xmm5 -.L106key_192b_warm: +.L108key_192b_warm: shufps $16,%xmm0,%xmm4 movdqa %xmm2,%xmm3 xorps %xmm4,%xmm0 @@ -2234,21 +2266,21 @@ _aesni_set_encrypt_key: pxor %xmm3,%xmm2 ret .align 16 -.L104key_192b: +.L106key_192b: movaps %xmm0,%xmm3 shufps $68,%xmm0,%xmm5 movups %xmm5,(%edx) shufps $78,%xmm2,%xmm3 movups %xmm3,16(%edx) leal 32(%edx),%edx - jmp .L106key_192b_warm + jmp .L108key_192b_warm .align 16 -.L10212rounds_alt: +.L10412rounds_alt: movdqa 16(%ebx),%xmm5 movdqa 32(%ebx),%xmm4 movl $8,%ecx movdqu %xmm0,-16(%edx) -.L107loop_key192: +.L109loop_key192: movq %xmm2,(%edx) movdqa %xmm2,%xmm1 .byte 102,15,56,0,213 @@ -2270,54 +2302,54 @@ _aesni_set_encrypt_key: pxor %xmm3,%xmm2 movdqu %xmm0,-16(%edx) decl %ecx - jnz .L107loop_key192 + jnz .L109loop_key192 movl $11,%ecx movl %ecx,32(%edx) - jmp .L100good_key + jmp .L102good_key .align 16 -.L09314rounds: +.L09514rounds: movups 16(%eax),%xmm2 leal 16(%edx),%edx cmpl $268435456,%ebp - je .L10814rounds_alt + je .L11014rounds_alt movl $13,%ecx movups %xmm0,-32(%edx) movups %xmm2,-16(%edx) .byte 102,15,58,223,202,1 - call .L109key_256a_cold + call .L111key_256a_cold .byte 102,15,58,223,200,1 - call .L110key_256b + call .L112key_256b .byte 102,15,58,223,202,2 - call .L111key_256a + call .L113key_256a .byte 102,15,58,223,200,2 - call .L110key_256b + call .L112key_256b .byte 102,15,58,223,202,4 - call .L111key_256a + call .L113key_256a .byte 102,15,58,223,200,4 - call .L110key_256b + call .L112key_256b .byte 102,15,58,223,202,8 - call .L111key_256a + call .L113key_256a .byte 102,15,58,223,200,8 - call .L110key_256b + call .L112key_256b .byte 102,15,58,223,202,16 - call .L111key_256a + call .L113key_256a .byte 102,15,58,223,200,16 - call .L110key_256b + call .L112key_256b .byte 102,15,58,223,202,32 - call .L111key_256a + call .L113key_256a .byte 102,15,58,223,200,32 - call .L110key_256b + call .L112key_256b .byte 102,15,58,223,202,64 - call .L111key_256a + call .L113key_256a movups %xmm0,(%edx) movl %ecx,16(%edx) xorl %eax,%eax - jmp .L100good_key + jmp .L102good_key .align 16 -.L111key_256a: +.L113key_256a: movups %xmm2,(%edx) leal 16(%edx),%edx -.L109key_256a_cold: +.L111key_256a_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -2326,7 +2358,7 @@ _aesni_set_encrypt_key: xorps %xmm1,%xmm0 ret .align 16 -.L110key_256b: +.L112key_256b: movups %xmm0,(%edx) leal 16(%edx),%edx shufps $16,%xmm2,%xmm4 @@ -2337,14 +2369,14 @@ _aesni_set_encrypt_key: xorps %xmm1,%xmm2 ret .align 16 -.L10814rounds_alt: +.L11014rounds_alt: movdqa (%ebx),%xmm5 movdqa 32(%ebx),%xmm4 movl $7,%ecx movdqu %xmm0,-32(%edx) movdqa %xmm2,%xmm1 movdqu %xmm2,-16(%edx) -.L112loop_key256: +.L114loop_key256: .byte 102,15,56,0,213 .byte 102,15,56,221,212 movdqa %xmm0,%xmm3 @@ -2358,7 +2390,7 @@ _aesni_set_encrypt_key: pxor %xmm2,%xmm0 movdqu %xmm0,(%edx) decl %ecx - jz .L113done_key256 + jz .L115done_key256 pshufd $255,%xmm0,%xmm2 pxor %xmm3,%xmm3 .byte 102,15,56,221,211 @@ -2373,11 +2405,11 @@ _aesni_set_encrypt_key: movdqu %xmm2,16(%edx) leal 32(%edx),%edx movdqa %xmm2,%xmm1 - jmp .L112loop_key256 -.L113done_key256: + jmp .L114loop_key256 +.L115done_key256: movl $13,%ecx movl %ecx,16(%edx) -.L100good_key: +.L102good_key: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 @@ -2389,37 +2421,49 @@ _aesni_set_encrypt_key: popl %ebp ret .align 4 -.L091bad_pointer: +.L093bad_pointer: movl $-1,%eax popl %ebx popl %ebp ret .align 4 -.L095bad_keybits: +.L097bad_keybits: pxor %xmm0,%xmm0 movl $-2,%eax popl %ebx popl %ebp ret .size 
_aesni_set_encrypt_key,.-_aesni_set_encrypt_key -.globl aesni_set_encrypt_key -.hidden aesni_set_encrypt_key -.type aesni_set_encrypt_key,@function +.globl aes_hw_set_encrypt_key +.hidden aes_hw_set_encrypt_key +.type aes_hw_set_encrypt_key,@function .align 16 -aesni_set_encrypt_key: -.L_aesni_set_encrypt_key_begin: +aes_hw_set_encrypt_key: +.L_aes_hw_set_encrypt_key_begin: +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call .L116pic +.L116pic: + popl %ebx + leal BORINGSSL_function_hit+3-.L116pic(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif movl 4(%esp),%eax movl 8(%esp),%ecx movl 12(%esp),%edx call _aesni_set_encrypt_key ret -.size aesni_set_encrypt_key,.-.L_aesni_set_encrypt_key_begin -.globl aesni_set_decrypt_key -.hidden aesni_set_decrypt_key -.type aesni_set_decrypt_key,@function +.size aes_hw_set_encrypt_key,.-.L_aes_hw_set_encrypt_key_begin +.globl aes_hw_set_decrypt_key +.hidden aes_hw_set_decrypt_key +.type aes_hw_set_decrypt_key,@function .align 16 -aesni_set_decrypt_key: -.L_aesni_set_decrypt_key_begin: +aes_hw_set_decrypt_key: +.L_aes_hw_set_decrypt_key_begin: movl 4(%esp),%eax movl 8(%esp),%ecx movl 12(%esp),%edx @@ -2427,7 +2471,7 @@ aesni_set_decrypt_key: movl 12(%esp),%edx shll $4,%ecx testl %eax,%eax - jnz .L114dec_key_ret + jnz .L117dec_key_ret leal 16(%edx,%ecx,1),%eax movups (%edx),%xmm0 movups (%eax),%xmm1 @@ -2435,7 +2479,7 @@ aesni_set_decrypt_key: movups %xmm1,(%edx) leal 16(%edx),%edx leal -16(%eax),%eax -.L115dec_key_inverse: +.L118dec_key_inverse: movups (%edx),%xmm0 movups (%eax),%xmm1 .byte 102,15,56,219,192 @@ -2445,16 +2489,16 @@ aesni_set_decrypt_key: movups %xmm0,16(%eax) movups %xmm1,-16(%edx) cmpl %edx,%eax - ja .L115dec_key_inverse + ja .L118dec_key_inverse movups (%edx),%xmm0 .byte 102,15,56,219,192 movups %xmm0,(%edx) pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 xorl %eax,%eax -.L114dec_key_ret: +.L117dec_key_ret: ret -.size aesni_set_decrypt_key,.-.L_aesni_set_decrypt_key_begin +.size aes_hw_set_decrypt_key,.-.L_aes_hw_set_decrypt_key_begin .align 64 .Lkey_const: .long 202313229,202313229,202313229,202313229 @@ -2466,3 +2510,4 @@ aesni_set_decrypt_key: .byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 .byte 115,108,46,111,114,103,62,0 #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/bn-586.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/bn-586.S index cc067f717e..4a6ccfbfac 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/bn-586.S +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/bn-586.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl bn_mul_add_words .hidden bn_mul_add_words @@ -1535,3 +1541,4 @@ bn_sub_part_words: ret .size bn_sub_part_words,.-.L_bn_sub_part_words_begin #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/co-586.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/co-586.S index 56834d0a6f..837b0cb5c7 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/co-586.S +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/co-586.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl bn_mul_comba8 .hidden bn_mul_comba8 @@ -1257,3 +1263,4 @@ bn_sqr_comba4: ret .size bn_sqr_comba4,.-.L_bn_sqr_comba4_begin #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/ghash-ssse3-x86.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/ghash-ssse3-x86.S new file mode 100644 index 0000000000..3e5f2d7e54 --- /dev/null +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/ghash-ssse3-x86.S @@ -0,0 +1,294 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text +.globl gcm_gmult_ssse3 +.hidden gcm_gmult_ssse3 +.type gcm_gmult_ssse3,@function +.align 16 +gcm_gmult_ssse3: +.L_gcm_gmult_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%edi + movl 24(%esp),%esi + movdqu (%edi),%xmm0 + call .L000pic_point +.L000pic_point: + popl %eax + movdqa .Lreverse_bytes-.L000pic_point(%eax),%xmm7 + movdqa .Llow4_mask-.L000pic_point(%eax),%xmm2 +.byte 102,15,56,0,199 + movdqa %xmm2,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm0 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + movl $5,%eax +.L001loop_row_1: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz .L001loop_row_1 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movl $5,%eax +.L002loop_row_2: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz .L002loop_row_2 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movl $6,%eax +.L003loop_row_3: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz .L003loop_row_3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 +.byte 102,15,56,0,215 + movdqu %xmm2,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size gcm_gmult_ssse3,.-.L_gcm_gmult_ssse3_begin +.globl gcm_ghash_ssse3 +.hidden gcm_ghash_ssse3 +.type gcm_ghash_ssse3,@function +.align 16 +gcm_ghash_ssse3: +.L_gcm_ghash_ssse3_begin: + 
pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%edx + movl 32(%esp),%ecx + movdqu (%edi),%xmm0 + call .L004pic_point +.L004pic_point: + popl %ebx + movdqa .Lreverse_bytes-.L004pic_point(%ebx),%xmm7 + andl $-16,%ecx +.byte 102,15,56,0,199 + pxor %xmm3,%xmm3 +.L005loop_ghash: + movdqa .Llow4_mask-.L004pic_point(%ebx),%xmm2 + movdqu (%edx),%xmm1 +.byte 102,15,56,0,207 + pxor %xmm1,%xmm0 + movdqa %xmm2,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm0 + pxor %xmm2,%xmm2 + movl $5,%eax +.L006loop_row_4: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz .L006loop_row_4 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movl $5,%eax +.L007loop_row_5: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz .L007loop_row_5 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movl $6,%eax +.L008loop_row_6: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz .L008loop_row_6 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movdqa %xmm2,%xmm0 + leal -256(%esi),%esi + leal 16(%edx),%edx + subl $16,%ecx + jnz .L005loop_ghash +.byte 102,15,56,0,199 + movdqu %xmm0,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size gcm_ghash_ssse3,.-.L_gcm_ghash_ssse3_begin +.align 16 +.Lreverse_bytes: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.align 16 +.Llow4_mask: +.long 252645135,252645135,252645135,252645135 +#endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/ghash-x86.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/ghash-x86.S index a384d9a039..7016235c0a 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/ghash-x86.S +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/ghash-x86.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl gcm_gmult_4bit_mmx .hidden gcm_gmult_4bit_mmx @@ -1066,3 +1072,4 @@ gcm_ghash_clmul: .byte 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62 .byte 0 #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/md5-586.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/md5-586.S index 7237f95bec..6de8ff886a 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/md5-586.S +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/md5-586.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl md5_block_asm_data_order .hidden md5_block_asm_data_order @@ -679,3 +685,4 @@ md5_block_asm_data_order: ret .size md5_block_asm_data_order,.-.L_md5_block_asm_data_order_begin #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/sha1-586.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/sha1-586.S index 2c022ec4af..4449e38f72 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/sha1-586.S +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/sha1-586.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl sha1_block_data_order .hidden sha1_block_data_order @@ -3799,3 +3805,4 @@ _sha1_block_data_order_avx: .byte 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 .byte 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/sha256-586.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/sha256-586.S index 984758f3b2..f61fa3df72 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/sha256-586.S +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/sha256-586.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl sha256_block_data_order .hidden sha256_block_data_order @@ -5558,3 +5564,4 @@ sha256_block_data_order: ret .size sha256_block_data_order,.-.L_sha256_block_data_order_begin #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/sha512-586.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/sha512-586.S index 3617ce48b4..89fb50b4ca 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/sha512-586.S +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/sha512-586.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl sha512_block_data_order .hidden sha512_block_data_order @@ -2828,3 +2834,4 @@ sha512_block_data_order: .byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 .byte 62,0 #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/vpaes-x86.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/vpaes-x86.S index 0417b7e353..8807116950 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/vpaes-x86.S +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/vpaes-x86.S @@ -1,5 +1,13 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text +#ifdef BORINGSSL_DISPATCH_TEST +#endif .align 64 .L_vpaes_consts: .long 218628480,235210255,168496130,67568393 @@ -477,6 +485,18 @@ vpaes_set_encrypt_key: pushl %ebx pushl %esi pushl %edi +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call .L016pic +.L016pic: + popl %ebx + leal BORINGSSL_function_hit+5-.L016pic(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif movl 20(%esp),%esi leal -56(%esp),%ebx movl 24(%esp),%eax @@ -490,9 +510,9 @@ vpaes_set_encrypt_key: movl %ebx,240(%edx) movl $48,%ecx movl $0,%edi - leal .L_vpaes_consts+0x30-.L016pic_point,%ebp + leal .L_vpaes_consts+0x30-.L017pic_point,%ebp call _vpaes_schedule_core -.L016pic_point: +.L017pic_point: movl 48(%esp),%esp xorl %eax,%eax popl %edi @@ -529,9 +549,9 @@ vpaes_set_decrypt_key: shrl $1,%ecx andl $32,%ecx xorl $32,%ecx - leal .L_vpaes_consts+0x30-.L017pic_point,%ebp + leal .L_vpaes_consts+0x30-.L018pic_point,%ebp call _vpaes_schedule_core -.L017pic_point: +.L018pic_point: movl 48(%esp),%esp xorl %eax,%eax popl %edi @@ -550,9 +570,21 @@ vpaes_encrypt: pushl %ebx pushl %esi pushl %edi - leal .L_vpaes_consts+0x30-.L018pic_point,%ebp +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call .L019pic +.L019pic: + popl %ebx + leal BORINGSSL_function_hit+4-.L019pic(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + leal .L_vpaes_consts+0x30-.L020pic_point,%ebp call _vpaes_preheat -.L018pic_point: +.L020pic_point: movl 20(%esp),%esi leal -56(%esp),%ebx movl 24(%esp),%edi @@ -580,9 +612,9 @@ vpaes_decrypt: pushl %ebx pushl %esi pushl %edi - leal .L_vpaes_consts+0x30-.L019pic_point,%ebp + leal .L_vpaes_consts+0x30-.L021pic_point,%ebp call _vpaes_preheat -.L019pic_point: +.L021pic_point: movl 20(%esp),%esi leal -56(%esp),%ebx movl 24(%esp),%edi @@ -615,7 +647,7 @@ vpaes_cbc_encrypt: movl 28(%esp),%eax movl 32(%esp),%edx subl $16,%eax - jc .L020cbc_abort + jc .L022cbc_abort leal -56(%esp),%ebx movl 36(%esp),%ebp andl $-16,%ebx @@ -628,14 +660,14 @@ vpaes_cbc_encrypt: movl %edx,4(%esp) movl %ebp,8(%esp) movl %eax,%edi - leal .L_vpaes_consts+0x30-.L021pic_point,%ebp + leal .L_vpaes_consts+0x30-.L023pic_point,%ebp call _vpaes_preheat -.L021pic_point: +.L023pic_point: cmpl $0,%ecx - je .L022cbc_dec_loop - jmp .L023cbc_enc_loop + je .L024cbc_dec_loop + jmp .L025cbc_enc_loop .align 16 -.L023cbc_enc_loop: +.L025cbc_enc_loop: movdqu (%esi),%xmm0 pxor %xmm1,%xmm0 call _vpaes_encrypt_core @@ -645,10 +677,10 @@ vpaes_cbc_encrypt: movdqu %xmm0,(%ebx,%esi,1) leal 16(%esi),%esi subl $16,%edi - jnc .L023cbc_enc_loop - jmp .L024cbc_done + jnc .L025cbc_enc_loop + jmp .L026cbc_done .align 16 -.L022cbc_dec_loop: 
+.L024cbc_dec_loop: movdqu (%esi),%xmm0 movdqa %xmm1,16(%esp) movdqa %xmm0,32(%esp) @@ -660,12 +692,12 @@ vpaes_cbc_encrypt: movdqu %xmm0,(%ebx,%esi,1) leal 16(%esi),%esi subl $16,%edi - jnc .L022cbc_dec_loop -.L024cbc_done: + jnc .L024cbc_dec_loop +.L026cbc_done: movl 8(%esp),%ebx movl 48(%esp),%esp movdqu %xmm1,(%ebx) -.L020cbc_abort: +.L022cbc_abort: popl %edi popl %esi popl %ebx @@ -673,3 +705,4 @@ vpaes_cbc_encrypt: ret .size vpaes_cbc_encrypt,.-.L_vpaes_cbc_encrypt_begin #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/x86-mont.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/x86-mont.S index 3fb668826b..f2c6fde7c6 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/x86-mont.S +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/x86-mont.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl bn_mul_mont .hidden bn_mul_mont @@ -446,16 +452,18 @@ bn_mul_mont: leal 1(%edx),%edx jge .L017sub sbbl $0,%eax - andl %eax,%esi - notl %eax - movl %edi,%ebp - andl %eax,%ebp - orl %ebp,%esi + movl $-1,%edx + xorl %eax,%edx + jmp .L018copy .align 16 .L018copy: - movl (%esi,%ebx,4),%eax - movl %eax,(%edi,%ebx,4) + movl 32(%esp,%ebx,4),%esi + movl (%edi,%ebx,4),%ebp movl %ecx,32(%esp,%ebx,4) + andl %eax,%esi + andl %edx,%ebp + orl %esi,%ebp + movl %ebp,(%edi,%ebx,4) decl %ebx jge .L018copy movl 24(%esp),%esp @@ -473,3 +481,4 @@ bn_mul_mont: .byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 .byte 111,114,103,62,0 #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86/crypto/test/trampoline-x86.S b/packager/third_party/boringssl/linux-x86/crypto/test/trampoline-x86.S new file mode 100644 index 0000000000..13eb677c97 --- /dev/null +++ b/packager/third_party/boringssl/linux-x86/crypto/test/trampoline-x86.S @@ -0,0 +1,206 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text +.globl abi_test_trampoline +.hidden abi_test_trampoline +.type abi_test_trampoline,@function +.align 16 +abi_test_trampoline: +.L_abi_test_trampoline_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 24(%esp),%ecx + movl (%ecx),%esi + movl 4(%ecx),%edi + movl 8(%ecx),%ebx + movl 12(%ecx),%ebp + subl $44,%esp + movl 72(%esp),%eax + xorl %ecx,%ecx +.L000loop: + cmpl 76(%esp),%ecx + jae .L001loop_done + movl (%eax,%ecx,4),%edx + movl %edx,(%esp,%ecx,4) + addl $1,%ecx + jmp .L000loop +.L001loop_done: + call *64(%esp) + addl $44,%esp + movl 24(%esp),%ecx + movl %esi,(%ecx) + movl %edi,4(%ecx) + movl %ebx,8(%ecx) + movl %ebp,12(%ecx) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size abi_test_trampoline,.-.L_abi_test_trampoline_begin +.globl abi_test_get_and_clear_direction_flag +.hidden abi_test_get_and_clear_direction_flag +.type abi_test_get_and_clear_direction_flag,@function +.align 16 +abi_test_get_and_clear_direction_flag: +.L_abi_test_get_and_clear_direction_flag_begin: + pushfl + popl %eax + andl $1024,%eax + shrl $10,%eax + cld + ret +.size abi_test_get_and_clear_direction_flag,.-.L_abi_test_get_and_clear_direction_flag_begin +.globl abi_test_set_direction_flag +.hidden abi_test_set_direction_flag +.type abi_test_set_direction_flag,@function +.align 16 +abi_test_set_direction_flag: +.L_abi_test_set_direction_flag_begin: + std + ret +.size abi_test_set_direction_flag,.-.L_abi_test_set_direction_flag_begin +.globl abi_test_clobber_eax +.hidden abi_test_clobber_eax +.type abi_test_clobber_eax,@function +.align 16 +abi_test_clobber_eax: +.L_abi_test_clobber_eax_begin: + xorl %eax,%eax + ret +.size abi_test_clobber_eax,.-.L_abi_test_clobber_eax_begin +.globl abi_test_clobber_ebx +.hidden abi_test_clobber_ebx +.type abi_test_clobber_ebx,@function +.align 16 +abi_test_clobber_ebx: +.L_abi_test_clobber_ebx_begin: + xorl %ebx,%ebx + ret +.size abi_test_clobber_ebx,.-.L_abi_test_clobber_ebx_begin +.globl abi_test_clobber_ecx +.hidden abi_test_clobber_ecx +.type abi_test_clobber_ecx,@function +.align 16 +abi_test_clobber_ecx: +.L_abi_test_clobber_ecx_begin: + xorl %ecx,%ecx + ret +.size abi_test_clobber_ecx,.-.L_abi_test_clobber_ecx_begin +.globl abi_test_clobber_edx +.hidden abi_test_clobber_edx +.type abi_test_clobber_edx,@function +.align 16 +abi_test_clobber_edx: +.L_abi_test_clobber_edx_begin: + xorl %edx,%edx + ret +.size abi_test_clobber_edx,.-.L_abi_test_clobber_edx_begin +.globl abi_test_clobber_edi +.hidden abi_test_clobber_edi +.type abi_test_clobber_edi,@function +.align 16 +abi_test_clobber_edi: +.L_abi_test_clobber_edi_begin: + xorl %edi,%edi + ret +.size abi_test_clobber_edi,.-.L_abi_test_clobber_edi_begin +.globl abi_test_clobber_esi +.hidden abi_test_clobber_esi +.type abi_test_clobber_esi,@function +.align 16 +abi_test_clobber_esi: +.L_abi_test_clobber_esi_begin: + xorl %esi,%esi + ret +.size abi_test_clobber_esi,.-.L_abi_test_clobber_esi_begin +.globl abi_test_clobber_ebp +.hidden abi_test_clobber_ebp +.type abi_test_clobber_ebp,@function +.align 16 +abi_test_clobber_ebp: +.L_abi_test_clobber_ebp_begin: + xorl %ebp,%ebp + ret +.size abi_test_clobber_ebp,.-.L_abi_test_clobber_ebp_begin +.globl abi_test_clobber_xmm0 +.hidden abi_test_clobber_xmm0 +.type abi_test_clobber_xmm0,@function +.align 16 +abi_test_clobber_xmm0: +.L_abi_test_clobber_xmm0_begin: + pxor %xmm0,%xmm0 + ret +.size abi_test_clobber_xmm0,.-.L_abi_test_clobber_xmm0_begin +.globl abi_test_clobber_xmm1 +.hidden 
abi_test_clobber_xmm1 +.type abi_test_clobber_xmm1,@function +.align 16 +abi_test_clobber_xmm1: +.L_abi_test_clobber_xmm1_begin: + pxor %xmm1,%xmm1 + ret +.size abi_test_clobber_xmm1,.-.L_abi_test_clobber_xmm1_begin +.globl abi_test_clobber_xmm2 +.hidden abi_test_clobber_xmm2 +.type abi_test_clobber_xmm2,@function +.align 16 +abi_test_clobber_xmm2: +.L_abi_test_clobber_xmm2_begin: + pxor %xmm2,%xmm2 + ret +.size abi_test_clobber_xmm2,.-.L_abi_test_clobber_xmm2_begin +.globl abi_test_clobber_xmm3 +.hidden abi_test_clobber_xmm3 +.type abi_test_clobber_xmm3,@function +.align 16 +abi_test_clobber_xmm3: +.L_abi_test_clobber_xmm3_begin: + pxor %xmm3,%xmm3 + ret +.size abi_test_clobber_xmm3,.-.L_abi_test_clobber_xmm3_begin +.globl abi_test_clobber_xmm4 +.hidden abi_test_clobber_xmm4 +.type abi_test_clobber_xmm4,@function +.align 16 +abi_test_clobber_xmm4: +.L_abi_test_clobber_xmm4_begin: + pxor %xmm4,%xmm4 + ret +.size abi_test_clobber_xmm4,.-.L_abi_test_clobber_xmm4_begin +.globl abi_test_clobber_xmm5 +.hidden abi_test_clobber_xmm5 +.type abi_test_clobber_xmm5,@function +.align 16 +abi_test_clobber_xmm5: +.L_abi_test_clobber_xmm5_begin: + pxor %xmm5,%xmm5 + ret +.size abi_test_clobber_xmm5,.-.L_abi_test_clobber_xmm5_begin +.globl abi_test_clobber_xmm6 +.hidden abi_test_clobber_xmm6 +.type abi_test_clobber_xmm6,@function +.align 16 +abi_test_clobber_xmm6: +.L_abi_test_clobber_xmm6_begin: + pxor %xmm6,%xmm6 + ret +.size abi_test_clobber_xmm6,.-.L_abi_test_clobber_xmm6_begin +.globl abi_test_clobber_xmm7 +.hidden abi_test_clobber_xmm7 +.type abi_test_clobber_xmm7,@function +.align 16 +abi_test_clobber_xmm7: +.L_abi_test_clobber_xmm7_begin: + pxor %xmm7,%xmm7 + ret +.size abi_test_clobber_xmm7,.-.L_abi_test_clobber_xmm7_begin +#endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/chacha/chacha-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/chacha/chacha-x86_64.S index 62dc77999a..b76713398d 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/chacha/chacha-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/chacha/chacha-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .extern OPENSSL_ia32cap_P @@ -38,6 +50,7 @@ .type ChaCha20_ctr32,@function .align 64 ChaCha20_ctr32: +.cfi_startproc cmpq $0,%rdx je .Lno_data movq OPENSSL_ia32cap_P+4(%rip),%r10 @@ -45,12 +58,25 @@ ChaCha20_ctr32: jnz .LChaCha20_ssse3 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset r15,-56 subq $64+24,%rsp +.cfi_adjust_cfa_offset 88 .Lctr32_body: @@ -291,20 +317,30 @@ ChaCha20_ctr32: .Ldone: leaq 64+24+48(%rsp),%rsi movq -48(%rsi),%r15 +.cfi_restore r15 movq -40(%rsi),%r14 +.cfi_restore r14 movq -32(%rsi),%r13 +.cfi_restore r13 movq -24(%rsi),%r12 +.cfi_restore r12 movq -16(%rsi),%rbp +.cfi_restore rbp movq -8(%rsi),%rbx +.cfi_restore rbx leaq (%rsi),%rsp +.cfi_adjust_cfa_offset -136 .Lno_data: .byte 0xf3,0xc3 +.cfi_endproc .size ChaCha20_ctr32,.-ChaCha20_ctr32 .type ChaCha20_ssse3,@function .align 32 ChaCha20_ssse3: .LChaCha20_ssse3: +.cfi_startproc movq %rsp,%r9 +.cfi_def_cfa_register r9 cmpq $128,%rdx ja .LChaCha20_4x @@ -430,14 +466,18 @@ ChaCha20_ssse3: .Ldone_ssse3: leaq (%r9),%rsp +.cfi_def_cfa_register rsp .Lssse3_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ChaCha20_ssse3,.-ChaCha20_ssse3 .type ChaCha20_4x,@function .align 32 ChaCha20_4x: .LChaCha20_4x: +.cfi_startproc movq %rsp,%r9 +.cfi_def_cfa_register r9 movq %r10,%r11 shrq $32,%r10 testq $32,%r10 @@ -978,14 +1018,18 @@ ChaCha20_4x: .Ldone4x: leaq (%r9),%rsp +.cfi_def_cfa_register rsp .L4x_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ChaCha20_4x,.-ChaCha20_4x .type ChaCha20_8x,@function .align 32 ChaCha20_8x: .LChaCha20_8x: +.cfi_startproc movq %rsp,%r9 +.cfi_def_cfa_register r9 subq $0x280+8,%rsp andq $-32,%rsp vzeroupper @@ -1580,7 +1624,10 @@ ChaCha20_8x: .Ldone8x: vzeroall leaq (%r9),%rsp +.cfi_def_cfa_register rsp .L8x_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ChaCha20_8x,.-ChaCha20_8x #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S index 42e25f4817..a22bee8fcf 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .data .align 16 @@ -3064,3 +3076,4 @@ aes256gcmsiv_kdf: .cfi_endproc .size aes256gcmsiv_kdf, .-aes256gcmsiv_kdf #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S index a6f5e07d9c..e313348808 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .extern OPENSSL_ia32cap_P .hidden OPENSSL_ia32cap_P @@ -8972,3 +8984,4 @@ seal_avx2_short_tail: jmp seal_sse_tail_16 .cfi_endproc #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aes-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aes-x86_64.S index ff87f9824e..47a69ec862 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aes-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aes-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .type _x86_64_AES_encrypt,@function .align 16 @@ -156,6 +168,7 @@ _x86_64_AES_encrypt: .type _x86_64_AES_encrypt_compact,@function .align 16 _x86_64_AES_encrypt_compact: +.cfi_startproc leaq 128(%r14),%r8 movl 0-128(%r8),%edi movl 32-128(%r8),%ebp @@ -325,20 +338,29 @@ _x86_64_AES_encrypt_compact: xorl 8(%r15),%ecx xorl 12(%r15),%edx .byte 0xf3,0xc3 +.cfi_endproc .size _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact .align 16 -.globl asm_AES_encrypt -.hidden asm_AES_encrypt -.type asm_AES_encrypt,@function -.hidden asm_AES_encrypt -asm_AES_encrypt: +.globl aes_nohw_encrypt +.hidden aes_nohw_encrypt +.type aes_nohw_encrypt,@function +.hidden aes_nohw_encrypt +aes_nohw_encrypt: +.cfi_startproc movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 leaq -63(%rdx),%rcx @@ -351,6 +373,7 @@ asm_AES_encrypt: movq %rsi,16(%rsp) movq %rax,24(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x18,0x06,0x23,0x08 .Lenc_prologue: movq %rdx,%r15 @@ -377,21 +400,30 @@ asm_AES_encrypt: movq 16(%rsp),%r9 movq 24(%rsp),%rsi +.cfi_def_cfa %rsi,8 movl %eax,0(%r9) movl %ebx,4(%r9) movl %ecx,8(%r9) movl %edx,12(%r9) movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lenc_epilogue: .byte 0xf3,0xc3 -.size asm_AES_encrypt,.-asm_AES_encrypt +.cfi_endproc +.size aes_nohw_encrypt,.-aes_nohw_encrypt .type _x86_64_AES_decrypt,@function .align 16 _x86_64_AES_decrypt: @@ -550,6 +582,7 @@ _x86_64_AES_decrypt: .type _x86_64_AES_decrypt_compact,@function .align 16 _x86_64_AES_decrypt_compact: +.cfi_startproc leaq 128(%r14),%r8 movl 0-128(%r8),%edi movl 32-128(%r8),%ebp @@ -771,20 +804,29 @@ _x86_64_AES_decrypt_compact: xorl 8(%r15),%ecx xorl 12(%r15),%edx .byte 0xf3,0xc3 +.cfi_endproc .size _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact .align 16 -.globl asm_AES_decrypt -.hidden asm_AES_decrypt -.type asm_AES_decrypt,@function -.hidden asm_AES_decrypt -asm_AES_decrypt: +.globl aes_nohw_decrypt +.hidden aes_nohw_decrypt +.type aes_nohw_decrypt,@function +.hidden aes_nohw_decrypt +aes_nohw_decrypt: +.cfi_startproc movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 leaq -63(%rdx),%rcx @@ -797,6 +839,7 @@ asm_AES_decrypt: movq %rsi,16(%rsp) movq %rax,24(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x18,0x06,0x23,0x08 .Ldec_prologue: movq %rdx,%r15 @@ -825,47 +868,75 @@ asm_AES_decrypt: movq 16(%rsp),%r9 movq 24(%rsp),%rsi +.cfi_def_cfa %rsi,8 movl %eax,0(%r9) movl %ebx,4(%r9) movl %ecx,8(%r9) movl %edx,12(%r9) movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Ldec_epilogue: 
.byte 0xf3,0xc3 -.size asm_AES_decrypt,.-asm_AES_decrypt +.cfi_endproc +.size aes_nohw_decrypt,.-aes_nohw_decrypt .align 16 -.globl asm_AES_set_encrypt_key -.hidden asm_AES_set_encrypt_key -.type asm_AES_set_encrypt_key,@function -asm_AES_set_encrypt_key: +.globl aes_nohw_set_encrypt_key +.hidden aes_nohw_set_encrypt_key +.type aes_nohw_set_encrypt_key,@function +aes_nohw_set_encrypt_key: +.cfi_startproc pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 subq $8,%rsp +.cfi_adjust_cfa_offset 8 .Lenc_key_prologue: call _x86_64_AES_set_encrypt_key movq 40(%rsp),%rbp +.cfi_restore %rbp movq 48(%rsp),%rbx +.cfi_restore %rbx addq $56,%rsp +.cfi_adjust_cfa_offset -56 .Lenc_key_epilogue: .byte 0xf3,0xc3 -.size asm_AES_set_encrypt_key,.-asm_AES_set_encrypt_key +.cfi_endproc +.size aes_nohw_set_encrypt_key,.-aes_nohw_set_encrypt_key .type _x86_64_AES_set_encrypt_key,@function .align 16 _x86_64_AES_set_encrypt_key: +.cfi_startproc movl %esi,%ecx movq %rdi,%rsi movq %rdx,%rdi @@ -1101,19 +1172,34 @@ _x86_64_AES_set_encrypt_key: movq $-1,%rax .Lexit: .byte 0xf3,0xc3 +.cfi_endproc .size _x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key .align 16 -.globl asm_AES_set_decrypt_key -.hidden asm_AES_set_decrypt_key -.type asm_AES_set_decrypt_key,@function -asm_AES_set_decrypt_key: +.globl aes_nohw_set_decrypt_key +.hidden aes_nohw_set_decrypt_key +.type aes_nohw_set_decrypt_key,@function +aes_nohw_set_decrypt_key: +.cfi_startproc pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 pushq %rdx +.cfi_adjust_cfa_offset 8 .Ldec_key_prologue: call _x86_64_AES_set_encrypt_key @@ -1281,32 +1367,56 @@ asm_AES_set_decrypt_key: xorq %rax,%rax .Labort: movq 8(%rsp),%r15 +.cfi_restore %r15 movq 16(%rsp),%r14 +.cfi_restore %r14 movq 24(%rsp),%r13 +.cfi_restore %r13 movq 32(%rsp),%r12 +.cfi_restore %r12 movq 40(%rsp),%rbp +.cfi_restore %rbp movq 48(%rsp),%rbx +.cfi_restore %rbx addq $56,%rsp +.cfi_adjust_cfa_offset -56 .Ldec_key_epilogue: .byte 0xf3,0xc3 -.size asm_AES_set_decrypt_key,.-asm_AES_set_decrypt_key +.cfi_endproc +.size aes_nohw_set_decrypt_key,.-aes_nohw_set_decrypt_key .align 16 -.globl asm_AES_cbc_encrypt -.hidden asm_AES_cbc_encrypt -.type asm_AES_cbc_encrypt,@function +.globl aes_nohw_cbc_encrypt +.hidden aes_nohw_cbc_encrypt +.type aes_nohw_cbc_encrypt,@function .extern OPENSSL_ia32cap_P .hidden OPENSSL_ia32cap_P -.hidden asm_AES_cbc_encrypt -asm_AES_cbc_encrypt: +.hidden aes_nohw_cbc_encrypt +aes_nohw_cbc_encrypt: +.cfi_startproc cmpq $0,%rdx je .Lcbc_epilogue pushfq + + +.cfi_adjust_cfa_offset 8 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-32 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-40 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-48 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-56 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-64 .Lcbc_prologue: cld @@ -1317,6 +1427,7 @@ asm_AES_cbc_encrypt: 
cmpq $0,%r9 cmoveq %r10,%r14 +.cfi_remember_state leaq OPENSSL_ia32cap_P(%rip),%r10 movl (%r10),%r10d cmpq $512,%rdx @@ -1352,8 +1463,10 @@ asm_AES_cbc_encrypt: .Lcbc_te_ok: xchgq %rsp,%r15 +.cfi_def_cfa_register %r15 movq %r15,16(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x40 .Lcbc_fast_body: movq %rdi,24(%rsp) movq %rsi,32(%rsp) @@ -1551,6 +1664,7 @@ asm_AES_cbc_encrypt: .align 16 .Lcbc_slow_prologue: +.cfi_restore_state leaq -88(%rsp),%rbp andq $-64,%rbp @@ -1562,8 +1676,10 @@ asm_AES_cbc_encrypt: subq %r10,%rbp xchgq %rsp,%rbp +.cfi_def_cfa_register %rbp movq %rbp,16(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x40 .Lcbc_slow_body: @@ -1735,18 +1851,30 @@ asm_AES_cbc_encrypt: .align 16 .Lcbc_exit: movq 16(%rsp),%rsi +.cfi_def_cfa %rsi,64 movq (%rsi),%r15 +.cfi_restore %r15 movq 8(%rsi),%r14 +.cfi_restore %r14 movq 16(%rsi),%r13 +.cfi_restore %r13 movq 24(%rsi),%r12 +.cfi_restore %r12 movq 32(%rsi),%rbp +.cfi_restore %rbp movq 40(%rsi),%rbx +.cfi_restore %rbx leaq 48(%rsi),%rsp +.cfi_def_cfa %rsp,16 .Lcbc_popfq: popfq + + +.cfi_adjust_cfa_offset -8 .Lcbc_epilogue: .byte 0xf3,0xc3 -.size asm_AES_cbc_encrypt,.-asm_AES_cbc_encrypt +.cfi_endproc +.size aes_nohw_cbc_encrypt,.-aes_nohw_cbc_encrypt .align 64 .LAES_Te: .long 0xa56363c6,0xa56363c6 @@ -2534,3 +2662,4 @@ asm_AES_cbc_encrypt: .byte 65,69,83,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S index e7b4c48bef..65ab5c78fe 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .type _aesni_ctr32_ghash_6x,@function @@ -544,6 +556,11 @@ _aesni_ctr32_6x: .align 32 aesni_gcm_encrypt: .cfi_startproc +#ifdef BORINGSSL_DISPATCH_TEST +.extern BORINGSSL_function_hit +.hidden BORINGSSL_function_hit + movb $1,BORINGSSL_function_hit+2(%rip) +#endif xorq %r10,%r10 @@ -832,3 +849,4 @@ aesni_gcm_encrypt: .byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-x86_64.S index 0c980a304b..b98107f369 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-x86_64.S @@ -1,12 +1,30 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .extern OPENSSL_ia32cap_P .hidden OPENSSL_ia32cap_P -.globl aesni_encrypt -.hidden aesni_encrypt -.type aesni_encrypt,@function +.globl aes_hw_encrypt +.hidden aes_hw_encrypt +.type aes_hw_encrypt,@function .align 16 -aesni_encrypt: +aes_hw_encrypt: +.cfi_startproc +#ifdef BORINGSSL_DISPATCH_TEST +.extern BORINGSSL_function_hit +.hidden BORINGSSL_function_hit + movb $1,BORINGSSL_function_hit+1(%rip) +#endif movups (%rdi),%xmm2 movl 240(%rdx),%eax movups (%rdx),%xmm0 @@ -25,13 +43,15 @@ aesni_encrypt: movups %xmm2,(%rsi) pxor %xmm2,%xmm2 .byte 0xf3,0xc3 -.size aesni_encrypt,.-aesni_encrypt +.cfi_endproc +.size aes_hw_encrypt,.-aes_hw_encrypt -.globl aesni_decrypt -.hidden aesni_decrypt -.type aesni_decrypt,@function +.globl aes_hw_decrypt +.hidden aes_hw_decrypt +.type aes_hw_decrypt,@function .align 16 -aesni_decrypt: +aes_hw_decrypt: +.cfi_startproc movups (%rdi),%xmm2 movl 240(%rdx),%eax movups (%rdx),%xmm0 @@ -50,10 +70,12 @@ aesni_decrypt: movups %xmm2,(%rsi) pxor %xmm2,%xmm2 .byte 0xf3,0xc3 -.size aesni_decrypt, .-aesni_decrypt +.cfi_endproc +.size aes_hw_decrypt, .-aes_hw_decrypt .type _aesni_encrypt2,@function .align 16 _aesni_encrypt2: +.cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -79,10 +101,12 @@ _aesni_encrypt2: .byte 102,15,56,221,208 .byte 102,15,56,221,216 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_encrypt2,.-_aesni_encrypt2 .type _aesni_decrypt2,@function .align 16 _aesni_decrypt2: +.cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -108,10 +132,12 @@ _aesni_decrypt2: .byte 102,15,56,223,208 .byte 102,15,56,223,216 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_decrypt2,.-_aesni_decrypt2 .type _aesni_encrypt3,@function .align 16 _aesni_encrypt3: +.cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -142,10 +168,12 @@ _aesni_encrypt3: .byte 102,15,56,221,216 .byte 102,15,56,221,224 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_encrypt3,.-_aesni_encrypt3 .type _aesni_decrypt3,@function .align 16 _aesni_decrypt3: +.cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -176,10 +204,12 @@ _aesni_decrypt3: .byte 102,15,56,223,216 .byte 102,15,56,223,224 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_decrypt3,.-_aesni_decrypt3 .type _aesni_encrypt4,@function .align 16 _aesni_encrypt4: +.cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -216,10 +246,12 @@ _aesni_encrypt4: .byte 102,15,56,221,224 .byte 102,15,56,221,232 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_encrypt4,.-_aesni_encrypt4 .type _aesni_decrypt4,@function .align 16 _aesni_decrypt4: +.cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -256,10 +288,12 @@ _aesni_decrypt4: .byte 102,15,56,223,224 .byte 102,15,56,223,232 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_decrypt4,.-_aesni_decrypt4 .type _aesni_encrypt6,@function .align 16 _aesni_encrypt6: +.cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -310,10 +344,12 @@ _aesni_encrypt6: .byte 102,15,56,221,240 .byte 102,15,56,221,248 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_encrypt6,.-_aesni_encrypt6 .type _aesni_decrypt6,@function .align 16 _aesni_decrypt6: +.cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -364,10 +400,12 @@ _aesni_decrypt6: .byte 102,15,56,223,240 .byte 
102,15,56,223,248 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_decrypt6,.-_aesni_decrypt6 .type _aesni_encrypt8,@function .align 16 _aesni_encrypt8: +.cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -428,10 +466,12 @@ _aesni_encrypt8: .byte 102,68,15,56,221,192 .byte 102,68,15,56,221,200 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_encrypt8,.-_aesni_encrypt8 .type _aesni_decrypt8,@function .align 16 _aesni_decrypt8: +.cfi_startproc movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -492,12 +532,14 @@ _aesni_decrypt8: .byte 102,68,15,56,223,192 .byte 102,68,15,56,223,200 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_decrypt8,.-_aesni_decrypt8 -.globl aesni_ecb_encrypt -.hidden aesni_ecb_encrypt -.type aesni_ecb_encrypt,@function +.globl aes_hw_ecb_encrypt +.hidden aes_hw_ecb_encrypt +.type aes_hw_ecb_encrypt,@function .align 16 -aesni_ecb_encrypt: +aes_hw_ecb_encrypt: +.cfi_startproc andq $-16,%rdx jz .Lecb_ret @@ -835,174 +877,17 @@ aesni_ecb_encrypt: xorps %xmm0,%xmm0 pxor %xmm1,%xmm1 .byte 0xf3,0xc3 -.size aesni_ecb_encrypt,.-aesni_ecb_encrypt -.globl aesni_ccm64_encrypt_blocks -.hidden aesni_ccm64_encrypt_blocks -.type aesni_ccm64_encrypt_blocks,@function +.cfi_endproc +.size aes_hw_ecb_encrypt,.-aes_hw_ecb_encrypt +.globl aes_hw_ctr32_encrypt_blocks +.hidden aes_hw_ctr32_encrypt_blocks +.type aes_hw_ctr32_encrypt_blocks,@function .align 16 -aesni_ccm64_encrypt_blocks: - movl 240(%rcx),%eax - movdqu (%r8),%xmm6 - movdqa .Lincrement64(%rip),%xmm9 - movdqa .Lbswap_mask(%rip),%xmm7 - - shll $4,%eax - movl $16,%r10d - leaq 0(%rcx),%r11 - movdqu (%r9),%xmm3 - movdqa %xmm6,%xmm2 - leaq 32(%rcx,%rax,1),%rcx -.byte 102,15,56,0,247 - subq %rax,%r10 - jmp .Lccm64_enc_outer -.align 16 -.Lccm64_enc_outer: - movups (%r11),%xmm0 - movq %r10,%rax - movups (%rdi),%xmm8 - - xorps %xmm0,%xmm2 - movups 16(%r11),%xmm1 - xorps %xmm8,%xmm0 - xorps %xmm0,%xmm3 - movups 32(%r11),%xmm0 - -.Lccm64_enc2_loop: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 - movups -16(%rcx,%rax,1),%xmm0 - jnz .Lccm64_enc2_loop -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - paddq %xmm9,%xmm6 - decq %rdx -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 - - leaq 16(%rdi),%rdi - xorps %xmm2,%xmm8 - movdqa %xmm6,%xmm2 - movups %xmm8,(%rsi) -.byte 102,15,56,0,215 - leaq 16(%rsi),%rsi - jnz .Lccm64_enc_outer - - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - movups %xmm3,(%r9) - pxor %xmm3,%xmm3 - pxor %xmm8,%xmm8 - pxor %xmm6,%xmm6 - .byte 0xf3,0xc3 -.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks -.globl aesni_ccm64_decrypt_blocks -.hidden aesni_ccm64_decrypt_blocks -.type aesni_ccm64_decrypt_blocks,@function -.align 16 -aesni_ccm64_decrypt_blocks: - movl 240(%rcx),%eax - movups (%r8),%xmm6 - movdqu (%r9),%xmm3 - movdqa .Lincrement64(%rip),%xmm9 - movdqa .Lbswap_mask(%rip),%xmm7 - - movaps %xmm6,%xmm2 - movl %eax,%r10d - movq %rcx,%r11 -.byte 102,15,56,0,247 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -.Loop_enc1_5: -.byte 102,15,56,220,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz .Loop_enc1_5 -.byte 102,15,56,221,209 - shll $4,%r10d - movl $16,%eax - movups (%rdi),%xmm8 - paddq %xmm9,%xmm6 - leaq 16(%rdi),%rdi - subq %r10,%rax - leaq 32(%r11,%r10,1),%rcx - movq %rax,%r10 - jmp .Lccm64_dec_outer -.align 16 -.Lccm64_dec_outer: - xorps %xmm2,%xmm8 - movdqa %xmm6,%xmm2 - movups %xmm8,(%rsi) - leaq 16(%rsi),%rsi -.byte 
102,15,56,0,215 - - subq $1,%rdx - jz .Lccm64_dec_break - - movups (%r11),%xmm0 - movq %r10,%rax - movups 16(%r11),%xmm1 - xorps %xmm0,%xmm8 - xorps %xmm0,%xmm2 - xorps %xmm8,%xmm3 - movups 32(%r11),%xmm0 - jmp .Lccm64_dec2_loop -.align 16 -.Lccm64_dec2_loop: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 - movups -16(%rcx,%rax,1),%xmm0 - jnz .Lccm64_dec2_loop - movups (%rdi),%xmm8 - paddq %xmm9,%xmm6 -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 - leaq 16(%rdi),%rdi - jmp .Lccm64_dec_outer - -.align 16 -.Lccm64_dec_break: - - movl 240(%r11),%eax - movups (%r11),%xmm0 - movups 16(%r11),%xmm1 - xorps %xmm0,%xmm8 - leaq 32(%r11),%r11 - xorps %xmm8,%xmm3 -.Loop_enc1_6: -.byte 102,15,56,220,217 - decl %eax - movups (%r11),%xmm1 - leaq 16(%r11),%r11 - jnz .Loop_enc1_6 -.byte 102,15,56,221,217 - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - movups %xmm3,(%r9) - pxor %xmm3,%xmm3 - pxor %xmm8,%xmm8 - pxor %xmm6,%xmm6 - .byte 0xf3,0xc3 -.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks -.globl aesni_ctr32_encrypt_blocks -.hidden aesni_ctr32_encrypt_blocks -.type aesni_ctr32_encrypt_blocks,@function -.align 16 -aesni_ctr32_encrypt_blocks: +aes_hw_ctr32_encrypt_blocks: +.cfi_startproc +#ifdef BORINGSSL_DISPATCH_TEST + movb $1,BORINGSSL_function_hit(%rip) +#endif cmpq $1,%rdx jne .Lctr32_bulk @@ -1015,12 +900,12 @@ aesni_ctr32_encrypt_blocks: movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 -.Loop_enc1_7: +.Loop_enc1_5: .byte 102,15,56,220,209 decl %edx movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_enc1_7 + jnz .Loop_enc1_5 .byte 102,15,56,221,209 pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 @@ -1033,7 +918,9 @@ aesni_ctr32_encrypt_blocks: .align 16 .Lctr32_bulk: leaq (%rsp),%r11 +.cfi_def_cfa_register %r11 pushq %rbp +.cfi_offset %rbp,-16 subq $128,%rsp andq $-16,%rsp @@ -1568,1798 +1455,19 @@ aesni_ctr32_encrypt_blocks: movaps %xmm0,112(%rsp) pxor %xmm15,%xmm15 movq -8(%r11),%rbp +.cfi_restore %rbp leaq (%r11),%rsp +.cfi_def_cfa_register %rsp .Lctr32_epilogue: .byte 0xf3,0xc3 -.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks -.globl aesni_xts_encrypt -.hidden aesni_xts_encrypt -.type aesni_xts_encrypt,@function +.cfi_endproc +.size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks +.globl aes_hw_cbc_encrypt +.hidden aes_hw_cbc_encrypt +.type aes_hw_cbc_encrypt,@function .align 16 -aesni_xts_encrypt: - leaq (%rsp),%r11 - pushq %rbp - subq $112,%rsp - andq $-16,%rsp - movups (%r9),%xmm2 - movl 240(%r8),%eax - movl 240(%rcx),%r10d - movups (%r8),%xmm0 - movups 16(%r8),%xmm1 - leaq 32(%r8),%r8 - xorps %xmm0,%xmm2 -.Loop_enc1_8: -.byte 102,15,56,220,209 - decl %eax - movups (%r8),%xmm1 - leaq 16(%r8),%r8 - jnz .Loop_enc1_8 -.byte 102,15,56,221,209 - movups (%rcx),%xmm0 - movq %rcx,%rbp - movl %r10d,%eax - shll $4,%r10d - movq %rdx,%r9 - andq $-16,%rdx - - movups 16(%rcx,%r10,1),%xmm1 - - movdqa .Lxts_magic(%rip),%xmm8 - movdqa %xmm2,%xmm15 - pshufd $0x5f,%xmm2,%xmm9 - pxor %xmm0,%xmm1 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm10 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm10 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm11 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm11 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm12 - psrad $31,%xmm14 - paddq 
%xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm12 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm13 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm13 - pxor %xmm14,%xmm15 - movdqa %xmm15,%xmm14 - psrad $31,%xmm9 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pxor %xmm0,%xmm14 - pxor %xmm9,%xmm15 - movaps %xmm1,96(%rsp) - - subq $96,%rdx - jc .Lxts_enc_short - - movl $16+96,%eax - leaq 32(%rbp,%r10,1),%rcx - subq %r10,%rax - movups 16(%rbp),%xmm1 - movq %rax,%r10 - leaq .Lxts_magic(%rip),%r8 - jmp .Lxts_enc_grandloop - -.align 32 -.Lxts_enc_grandloop: - movdqu 0(%rdi),%xmm2 - movdqa %xmm0,%xmm8 - movdqu 16(%rdi),%xmm3 - pxor %xmm10,%xmm2 - movdqu 32(%rdi),%xmm4 - pxor %xmm11,%xmm3 -.byte 102,15,56,220,209 - movdqu 48(%rdi),%xmm5 - pxor %xmm12,%xmm4 -.byte 102,15,56,220,217 - movdqu 64(%rdi),%xmm6 - pxor %xmm13,%xmm5 -.byte 102,15,56,220,225 - movdqu 80(%rdi),%xmm7 - pxor %xmm15,%xmm8 - movdqa 96(%rsp),%xmm9 - pxor %xmm14,%xmm6 -.byte 102,15,56,220,233 - movups 32(%rbp),%xmm0 - leaq 96(%rdi),%rdi - pxor %xmm8,%xmm7 - - pxor %xmm9,%xmm10 -.byte 102,15,56,220,241 - pxor %xmm9,%xmm11 - movdqa %xmm10,0(%rsp) -.byte 102,15,56,220,249 - movups 48(%rbp),%xmm1 - pxor %xmm9,%xmm12 - -.byte 102,15,56,220,208 - pxor %xmm9,%xmm13 - movdqa %xmm11,16(%rsp) -.byte 102,15,56,220,216 - pxor %xmm9,%xmm14 - movdqa %xmm12,32(%rsp) -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - pxor %xmm9,%xmm8 - movdqa %xmm14,64(%rsp) -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 - movups 64(%rbp),%xmm0 - movdqa %xmm8,80(%rsp) - pshufd $0x5f,%xmm15,%xmm9 - jmp .Lxts_enc_loop6 -.align 32 -.Lxts_enc_loop6: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups -64(%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 - movups -80(%rcx,%rax,1),%xmm0 - jnz .Lxts_enc_loop6 - - movdqa (%r8),%xmm8 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 -.byte 102,15,56,220,209 - paddq %xmm15,%xmm15 - psrad $31,%xmm14 -.byte 102,15,56,220,217 - pand %xmm8,%xmm14 - movups (%rbp),%xmm10 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 - pxor %xmm14,%xmm15 - movaps %xmm10,%xmm11 -.byte 102,15,56,220,249 - movups -64(%rcx),%xmm1 - - movdqa %xmm9,%xmm14 -.byte 102,15,56,220,208 - paddd %xmm9,%xmm9 - pxor %xmm15,%xmm10 -.byte 102,15,56,220,216 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - pand %xmm8,%xmm14 - movaps %xmm11,%xmm12 -.byte 102,15,56,220,240 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 -.byte 102,15,56,220,248 - movups -48(%rcx),%xmm0 - - paddd %xmm9,%xmm9 -.byte 102,15,56,220,209 - pxor %xmm15,%xmm11 - psrad $31,%xmm14 -.byte 102,15,56,220,217 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movdqa %xmm13,48(%rsp) - pxor %xmm14,%xmm15 -.byte 102,15,56,220,241 - movaps %xmm12,%xmm13 - movdqa %xmm9,%xmm14 -.byte 102,15,56,220,249 - movups -32(%rcx),%xmm1 - - paddd %xmm9,%xmm9 -.byte 102,15,56,220,208 - pxor %xmm15,%xmm12 - psrad $31,%xmm14 -.byte 102,15,56,220,216 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 -.byte 102,15,56,220,240 - pxor %xmm14,%xmm15 - movaps %xmm13,%xmm14 -.byte 102,15,56,220,248 - - movdqa %xmm9,%xmm0 - paddd %xmm9,%xmm9 -.byte 102,15,56,220,209 - pxor 
%xmm15,%xmm13 - psrad $31,%xmm0 -.byte 102,15,56,220,217 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm0 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - pxor %xmm0,%xmm15 - movups (%rbp),%xmm0 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups 16(%rbp),%xmm1 - - pxor %xmm15,%xmm14 -.byte 102,15,56,221,84,36,0 - psrad $31,%xmm9 - paddq %xmm15,%xmm15 -.byte 102,15,56,221,92,36,16 -.byte 102,15,56,221,100,36,32 - pand %xmm8,%xmm9 - movq %r10,%rax -.byte 102,15,56,221,108,36,48 -.byte 102,15,56,221,116,36,64 -.byte 102,15,56,221,124,36,80 - pxor %xmm9,%xmm15 - - leaq 96(%rsi),%rsi - movups %xmm2,-96(%rsi) - movups %xmm3,-80(%rsi) - movups %xmm4,-64(%rsi) - movups %xmm5,-48(%rsi) - movups %xmm6,-32(%rsi) - movups %xmm7,-16(%rsi) - subq $96,%rdx - jnc .Lxts_enc_grandloop - - movl $16+96,%eax - subl %r10d,%eax - movq %rbp,%rcx - shrl $4,%eax - -.Lxts_enc_short: - - movl %eax,%r10d - pxor %xmm0,%xmm10 - addq $96,%rdx - jz .Lxts_enc_done - - pxor %xmm0,%xmm11 - cmpq $0x20,%rdx - jb .Lxts_enc_one - pxor %xmm0,%xmm12 - je .Lxts_enc_two - - pxor %xmm0,%xmm13 - cmpq $0x40,%rdx - jb .Lxts_enc_three - pxor %xmm0,%xmm14 - je .Lxts_enc_four - - movdqu (%rdi),%xmm2 - movdqu 16(%rdi),%xmm3 - movdqu 32(%rdi),%xmm4 - pxor %xmm10,%xmm2 - movdqu 48(%rdi),%xmm5 - pxor %xmm11,%xmm3 - movdqu 64(%rdi),%xmm6 - leaq 80(%rdi),%rdi - pxor %xmm12,%xmm4 - pxor %xmm13,%xmm5 - pxor %xmm14,%xmm6 - pxor %xmm7,%xmm7 - - call _aesni_encrypt6 - - xorps %xmm10,%xmm2 - movdqa %xmm15,%xmm10 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - movdqu %xmm2,(%rsi) - xorps %xmm13,%xmm5 - movdqu %xmm3,16(%rsi) - xorps %xmm14,%xmm6 - movdqu %xmm4,32(%rsi) - movdqu %xmm5,48(%rsi) - movdqu %xmm6,64(%rsi) - leaq 80(%rsi),%rsi - jmp .Lxts_enc_done - -.align 16 -.Lxts_enc_one: - movups (%rdi),%xmm2 - leaq 16(%rdi),%rdi - xorps %xmm10,%xmm2 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -.Loop_enc1_9: -.byte 102,15,56,220,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz .Loop_enc1_9 -.byte 102,15,56,221,209 - xorps %xmm10,%xmm2 - movdqa %xmm11,%xmm10 - movups %xmm2,(%rsi) - leaq 16(%rsi),%rsi - jmp .Lxts_enc_done - -.align 16 -.Lxts_enc_two: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - leaq 32(%rdi),%rdi - xorps %xmm10,%xmm2 - xorps %xmm11,%xmm3 - - call _aesni_encrypt2 - - xorps %xmm10,%xmm2 - movdqa %xmm12,%xmm10 - xorps %xmm11,%xmm3 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - leaq 32(%rsi),%rsi - jmp .Lxts_enc_done - -.align 16 -.Lxts_enc_three: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - movups 32(%rdi),%xmm4 - leaq 48(%rdi),%rdi - xorps %xmm10,%xmm2 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - - call _aesni_encrypt3 - - xorps %xmm10,%xmm2 - movdqa %xmm13,%xmm10 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - leaq 48(%rsi),%rsi - jmp .Lxts_enc_done - -.align 16 -.Lxts_enc_four: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - movups 32(%rdi),%xmm4 - xorps %xmm10,%xmm2 - movups 48(%rdi),%xmm5 - leaq 64(%rdi),%rdi - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - xorps %xmm13,%xmm5 - - call _aesni_encrypt4 - - pxor %xmm10,%xmm2 - movdqa %xmm14,%xmm10 - pxor %xmm11,%xmm3 - pxor %xmm12,%xmm4 - movdqu %xmm2,(%rsi) - pxor %xmm13,%xmm5 - movdqu %xmm3,16(%rsi) - movdqu %xmm4,32(%rsi) - movdqu %xmm5,48(%rsi) - leaq 64(%rsi),%rsi - jmp .Lxts_enc_done - -.align 16 -.Lxts_enc_done: - andq $15,%r9 - jz .Lxts_enc_ret - movq %r9,%rdx - -.Lxts_enc_steal: - movzbl (%rdi),%eax - movzbl -16(%rsi),%ecx - leaq 1(%rdi),%rdi - movb 
%al,-16(%rsi) - movb %cl,0(%rsi) - leaq 1(%rsi),%rsi - subq $1,%rdx - jnz .Lxts_enc_steal - - subq %r9,%rsi - movq %rbp,%rcx - movl %r10d,%eax - - movups -16(%rsi),%xmm2 - xorps %xmm10,%xmm2 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -.Loop_enc1_10: -.byte 102,15,56,220,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz .Loop_enc1_10 -.byte 102,15,56,221,209 - xorps %xmm10,%xmm2 - movups %xmm2,-16(%rsi) - -.Lxts_enc_ret: - xorps %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - movaps %xmm0,0(%rsp) - pxor %xmm8,%xmm8 - movaps %xmm0,16(%rsp) - pxor %xmm9,%xmm9 - movaps %xmm0,32(%rsp) - pxor %xmm10,%xmm10 - movaps %xmm0,48(%rsp) - pxor %xmm11,%xmm11 - movaps %xmm0,64(%rsp) - pxor %xmm12,%xmm12 - movaps %xmm0,80(%rsp) - pxor %xmm13,%xmm13 - movaps %xmm0,96(%rsp) - pxor %xmm14,%xmm14 - pxor %xmm15,%xmm15 - movq -8(%r11),%rbp - leaq (%r11),%rsp -.Lxts_enc_epilogue: - .byte 0xf3,0xc3 -.size aesni_xts_encrypt,.-aesni_xts_encrypt -.globl aesni_xts_decrypt -.hidden aesni_xts_decrypt -.type aesni_xts_decrypt,@function -.align 16 -aesni_xts_decrypt: - leaq (%rsp),%r11 - pushq %rbp - subq $112,%rsp - andq $-16,%rsp - movups (%r9),%xmm2 - movl 240(%r8),%eax - movl 240(%rcx),%r10d - movups (%r8),%xmm0 - movups 16(%r8),%xmm1 - leaq 32(%r8),%r8 - xorps %xmm0,%xmm2 -.Loop_enc1_11: -.byte 102,15,56,220,209 - decl %eax - movups (%r8),%xmm1 - leaq 16(%r8),%r8 - jnz .Loop_enc1_11 -.byte 102,15,56,221,209 - xorl %eax,%eax - testq $15,%rdx - setnz %al - shlq $4,%rax - subq %rax,%rdx - - movups (%rcx),%xmm0 - movq %rcx,%rbp - movl %r10d,%eax - shll $4,%r10d - movq %rdx,%r9 - andq $-16,%rdx - - movups 16(%rcx,%r10,1),%xmm1 - - movdqa .Lxts_magic(%rip),%xmm8 - movdqa %xmm2,%xmm15 - pshufd $0x5f,%xmm2,%xmm9 - pxor %xmm0,%xmm1 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm10 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm10 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm11 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm11 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm12 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm12 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm13 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm13 - pxor %xmm14,%xmm15 - movdqa %xmm15,%xmm14 - psrad $31,%xmm9 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pxor %xmm0,%xmm14 - pxor %xmm9,%xmm15 - movaps %xmm1,96(%rsp) - - subq $96,%rdx - jc .Lxts_dec_short - - movl $16+96,%eax - leaq 32(%rbp,%r10,1),%rcx - subq %r10,%rax - movups 16(%rbp),%xmm1 - movq %rax,%r10 - leaq .Lxts_magic(%rip),%r8 - jmp .Lxts_dec_grandloop - -.align 32 -.Lxts_dec_grandloop: - movdqu 0(%rdi),%xmm2 - movdqa %xmm0,%xmm8 - movdqu 16(%rdi),%xmm3 - pxor %xmm10,%xmm2 - movdqu 32(%rdi),%xmm4 - pxor %xmm11,%xmm3 -.byte 102,15,56,222,209 - movdqu 48(%rdi),%xmm5 - pxor %xmm12,%xmm4 -.byte 102,15,56,222,217 - movdqu 64(%rdi),%xmm6 - pxor %xmm13,%xmm5 -.byte 102,15,56,222,225 - movdqu 80(%rdi),%xmm7 - pxor %xmm15,%xmm8 - movdqa 96(%rsp),%xmm9 - pxor %xmm14,%xmm6 -.byte 102,15,56,222,233 - movups 32(%rbp),%xmm0 - leaq 96(%rdi),%rdi - pxor %xmm8,%xmm7 - - pxor %xmm9,%xmm10 -.byte 102,15,56,222,241 - pxor %xmm9,%xmm11 - movdqa %xmm10,0(%rsp) -.byte 102,15,56,222,249 - movups 48(%rbp),%xmm1 - pxor 
%xmm9,%xmm12 - -.byte 102,15,56,222,208 - pxor %xmm9,%xmm13 - movdqa %xmm11,16(%rsp) -.byte 102,15,56,222,216 - pxor %xmm9,%xmm14 - movdqa %xmm12,32(%rsp) -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 - pxor %xmm9,%xmm8 - movdqa %xmm14,64(%rsp) -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 - movups 64(%rbp),%xmm0 - movdqa %xmm8,80(%rsp) - pshufd $0x5f,%xmm15,%xmm9 - jmp .Lxts_dec_loop6 -.align 32 -.Lxts_dec_loop6: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups -64(%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 - movups -80(%rcx,%rax,1),%xmm0 - jnz .Lxts_dec_loop6 - - movdqa (%r8),%xmm8 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 -.byte 102,15,56,222,209 - paddq %xmm15,%xmm15 - psrad $31,%xmm14 -.byte 102,15,56,222,217 - pand %xmm8,%xmm14 - movups (%rbp),%xmm10 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 - pxor %xmm14,%xmm15 - movaps %xmm10,%xmm11 -.byte 102,15,56,222,249 - movups -64(%rcx),%xmm1 - - movdqa %xmm9,%xmm14 -.byte 102,15,56,222,208 - paddd %xmm9,%xmm9 - pxor %xmm15,%xmm10 -.byte 102,15,56,222,216 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 - pand %xmm8,%xmm14 - movaps %xmm11,%xmm12 -.byte 102,15,56,222,240 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 -.byte 102,15,56,222,248 - movups -48(%rcx),%xmm0 - - paddd %xmm9,%xmm9 -.byte 102,15,56,222,209 - pxor %xmm15,%xmm11 - psrad $31,%xmm14 -.byte 102,15,56,222,217 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - movdqa %xmm13,48(%rsp) - pxor %xmm14,%xmm15 -.byte 102,15,56,222,241 - movaps %xmm12,%xmm13 - movdqa %xmm9,%xmm14 -.byte 102,15,56,222,249 - movups -32(%rcx),%xmm1 - - paddd %xmm9,%xmm9 -.byte 102,15,56,222,208 - pxor %xmm15,%xmm12 - psrad $31,%xmm14 -.byte 102,15,56,222,216 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 - pxor %xmm14,%xmm15 - movaps %xmm13,%xmm14 -.byte 102,15,56,222,248 - - movdqa %xmm9,%xmm0 - paddd %xmm9,%xmm9 -.byte 102,15,56,222,209 - pxor %xmm15,%xmm13 - psrad $31,%xmm0 -.byte 102,15,56,222,217 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm0 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - pxor %xmm0,%xmm15 - movups (%rbp),%xmm0 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups 16(%rbp),%xmm1 - - pxor %xmm15,%xmm14 -.byte 102,15,56,223,84,36,0 - psrad $31,%xmm9 - paddq %xmm15,%xmm15 -.byte 102,15,56,223,92,36,16 -.byte 102,15,56,223,100,36,32 - pand %xmm8,%xmm9 - movq %r10,%rax -.byte 102,15,56,223,108,36,48 -.byte 102,15,56,223,116,36,64 -.byte 102,15,56,223,124,36,80 - pxor %xmm9,%xmm15 - - leaq 96(%rsi),%rsi - movups %xmm2,-96(%rsi) - movups %xmm3,-80(%rsi) - movups %xmm4,-64(%rsi) - movups %xmm5,-48(%rsi) - movups %xmm6,-32(%rsi) - movups %xmm7,-16(%rsi) - subq $96,%rdx - jnc .Lxts_dec_grandloop - - movl $16+96,%eax - subl %r10d,%eax - movq %rbp,%rcx - shrl $4,%eax - -.Lxts_dec_short: - - movl %eax,%r10d - pxor %xmm0,%xmm10 - pxor %xmm0,%xmm11 - addq $96,%rdx - jz .Lxts_dec_done - - pxor %xmm0,%xmm12 - cmpq $0x20,%rdx - jb .Lxts_dec_one - pxor %xmm0,%xmm13 - je .Lxts_dec_two - - pxor %xmm0,%xmm14 - cmpq $0x40,%rdx - jb .Lxts_dec_three - je .Lxts_dec_four - - movdqu (%rdi),%xmm2 - movdqu 16(%rdi),%xmm3 - movdqu 32(%rdi),%xmm4 - pxor %xmm10,%xmm2 - 
movdqu 48(%rdi),%xmm5 - pxor %xmm11,%xmm3 - movdqu 64(%rdi),%xmm6 - leaq 80(%rdi),%rdi - pxor %xmm12,%xmm4 - pxor %xmm13,%xmm5 - pxor %xmm14,%xmm6 - - call _aesni_decrypt6 - - xorps %xmm10,%xmm2 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - movdqu %xmm2,(%rsi) - xorps %xmm13,%xmm5 - movdqu %xmm3,16(%rsi) - xorps %xmm14,%xmm6 - movdqu %xmm4,32(%rsi) - pxor %xmm14,%xmm14 - movdqu %xmm5,48(%rsi) - pcmpgtd %xmm15,%xmm14 - movdqu %xmm6,64(%rsi) - leaq 80(%rsi),%rsi - pshufd $0x13,%xmm14,%xmm11 - andq $15,%r9 - jz .Lxts_dec_ret - - movdqa %xmm15,%xmm10 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm11 - pxor %xmm15,%xmm11 - jmp .Lxts_dec_done2 - -.align 16 -.Lxts_dec_one: - movups (%rdi),%xmm2 - leaq 16(%rdi),%rdi - xorps %xmm10,%xmm2 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -.Loop_dec1_12: -.byte 102,15,56,222,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz .Loop_dec1_12 -.byte 102,15,56,223,209 - xorps %xmm10,%xmm2 - movdqa %xmm11,%xmm10 - movups %xmm2,(%rsi) - movdqa %xmm12,%xmm11 - leaq 16(%rsi),%rsi - jmp .Lxts_dec_done - -.align 16 -.Lxts_dec_two: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - leaq 32(%rdi),%rdi - xorps %xmm10,%xmm2 - xorps %xmm11,%xmm3 - - call _aesni_decrypt2 - - xorps %xmm10,%xmm2 - movdqa %xmm12,%xmm10 - xorps %xmm11,%xmm3 - movdqa %xmm13,%xmm11 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - leaq 32(%rsi),%rsi - jmp .Lxts_dec_done - -.align 16 -.Lxts_dec_three: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - movups 32(%rdi),%xmm4 - leaq 48(%rdi),%rdi - xorps %xmm10,%xmm2 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - - call _aesni_decrypt3 - - xorps %xmm10,%xmm2 - movdqa %xmm13,%xmm10 - xorps %xmm11,%xmm3 - movdqa %xmm14,%xmm11 - xorps %xmm12,%xmm4 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - leaq 48(%rsi),%rsi - jmp .Lxts_dec_done - -.align 16 -.Lxts_dec_four: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - movups 32(%rdi),%xmm4 - xorps %xmm10,%xmm2 - movups 48(%rdi),%xmm5 - leaq 64(%rdi),%rdi - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - xorps %xmm13,%xmm5 - - call _aesni_decrypt4 - - pxor %xmm10,%xmm2 - movdqa %xmm14,%xmm10 - pxor %xmm11,%xmm3 - movdqa %xmm15,%xmm11 - pxor %xmm12,%xmm4 - movdqu %xmm2,(%rsi) - pxor %xmm13,%xmm5 - movdqu %xmm3,16(%rsi) - movdqu %xmm4,32(%rsi) - movdqu %xmm5,48(%rsi) - leaq 64(%rsi),%rsi - jmp .Lxts_dec_done - -.align 16 -.Lxts_dec_done: - andq $15,%r9 - jz .Lxts_dec_ret -.Lxts_dec_done2: - movq %r9,%rdx - movq %rbp,%rcx - movl %r10d,%eax - - movups (%rdi),%xmm2 - xorps %xmm11,%xmm2 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -.Loop_dec1_13: -.byte 102,15,56,222,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz .Loop_dec1_13 -.byte 102,15,56,223,209 - xorps %xmm11,%xmm2 - movups %xmm2,(%rsi) - -.Lxts_dec_steal: - movzbl 16(%rdi),%eax - movzbl (%rsi),%ecx - leaq 1(%rdi),%rdi - movb %al,(%rsi) - movb %cl,16(%rsi) - leaq 1(%rsi),%rsi - subq $1,%rdx - jnz .Lxts_dec_steal - - subq %r9,%rsi - movq %rbp,%rcx - movl %r10d,%eax - - movups (%rsi),%xmm2 - xorps %xmm10,%xmm2 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -.Loop_dec1_14: -.byte 102,15,56,222,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz .Loop_dec1_14 -.byte 102,15,56,223,209 - xorps %xmm10,%xmm2 - movups %xmm2,(%rsi) - -.Lxts_dec_ret: - xorps %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - 
movaps %xmm0,0(%rsp) - pxor %xmm8,%xmm8 - movaps %xmm0,16(%rsp) - pxor %xmm9,%xmm9 - movaps %xmm0,32(%rsp) - pxor %xmm10,%xmm10 - movaps %xmm0,48(%rsp) - pxor %xmm11,%xmm11 - movaps %xmm0,64(%rsp) - pxor %xmm12,%xmm12 - movaps %xmm0,80(%rsp) - pxor %xmm13,%xmm13 - movaps %xmm0,96(%rsp) - pxor %xmm14,%xmm14 - pxor %xmm15,%xmm15 - movq -8(%r11),%rbp - leaq (%r11),%rsp -.Lxts_dec_epilogue: - .byte 0xf3,0xc3 -.size aesni_xts_decrypt,.-aesni_xts_decrypt -.globl aesni_ocb_encrypt -.hidden aesni_ocb_encrypt -.type aesni_ocb_encrypt,@function -.align 32 -aesni_ocb_encrypt: - leaq (%rsp),%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - movq 8(%rax),%rbx - movq 8+8(%rax),%rbp - - movl 240(%rcx),%r10d - movq %rcx,%r11 - shll $4,%r10d - movups (%rcx),%xmm9 - movups 16(%rcx,%r10,1),%xmm1 - - movdqu (%r9),%xmm15 - pxor %xmm1,%xmm9 - pxor %xmm1,%xmm15 - - movl $16+32,%eax - leaq 32(%r11,%r10,1),%rcx - movups 16(%r11),%xmm1 - subq %r10,%rax - movq %rax,%r10 - - movdqu (%rbx),%xmm10 - movdqu (%rbp),%xmm8 - - testq $1,%r8 - jnz .Locb_enc_odd - - bsfq %r8,%r12 - addq $1,%r8 - shlq $4,%r12 - movdqu (%rbx,%r12,1),%xmm7 - movdqu (%rdi),%xmm2 - leaq 16(%rdi),%rdi - - call __ocb_encrypt1 - - movdqa %xmm7,%xmm15 - movups %xmm2,(%rsi) - leaq 16(%rsi),%rsi - subq $1,%rdx - jz .Locb_enc_done - -.Locb_enc_odd: - leaq 1(%r8),%r12 - leaq 3(%r8),%r13 - leaq 5(%r8),%r14 - leaq 6(%r8),%r8 - bsfq %r12,%r12 - bsfq %r13,%r13 - bsfq %r14,%r14 - shlq $4,%r12 - shlq $4,%r13 - shlq $4,%r14 - - subq $6,%rdx - jc .Locb_enc_short - jmp .Locb_enc_grandloop - -.align 32 -.Locb_enc_grandloop: - movdqu 0(%rdi),%xmm2 - movdqu 16(%rdi),%xmm3 - movdqu 32(%rdi),%xmm4 - movdqu 48(%rdi),%xmm5 - movdqu 64(%rdi),%xmm6 - movdqu 80(%rdi),%xmm7 - leaq 96(%rdi),%rdi - - call __ocb_encrypt6 - - movups %xmm2,0(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - movups %xmm6,64(%rsi) - movups %xmm7,80(%rsi) - leaq 96(%rsi),%rsi - subq $6,%rdx - jnc .Locb_enc_grandloop - -.Locb_enc_short: - addq $6,%rdx - jz .Locb_enc_done - - movdqu 0(%rdi),%xmm2 - cmpq $2,%rdx - jb .Locb_enc_one - movdqu 16(%rdi),%xmm3 - je .Locb_enc_two - - movdqu 32(%rdi),%xmm4 - cmpq $4,%rdx - jb .Locb_enc_three - movdqu 48(%rdi),%xmm5 - je .Locb_enc_four - - movdqu 64(%rdi),%xmm6 - pxor %xmm7,%xmm7 - - call __ocb_encrypt6 - - movdqa %xmm14,%xmm15 - movups %xmm2,0(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - movups %xmm6,64(%rsi) - - jmp .Locb_enc_done - -.align 16 -.Locb_enc_one: - movdqa %xmm10,%xmm7 - - call __ocb_encrypt1 - - movdqa %xmm7,%xmm15 - movups %xmm2,0(%rsi) - jmp .Locb_enc_done - -.align 16 -.Locb_enc_two: - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - - call __ocb_encrypt4 - - movdqa %xmm11,%xmm15 - movups %xmm2,0(%rsi) - movups %xmm3,16(%rsi) - - jmp .Locb_enc_done - -.align 16 -.Locb_enc_three: - pxor %xmm5,%xmm5 - - call __ocb_encrypt4 - - movdqa %xmm12,%xmm15 - movups %xmm2,0(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - - jmp .Locb_enc_done - -.align 16 -.Locb_enc_four: - call __ocb_encrypt4 - - movdqa %xmm13,%xmm15 - movups %xmm2,0(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - -.Locb_enc_done: - pxor %xmm0,%xmm15 - movdqu %xmm8,(%rbp) - movdqu %xmm15,(%r9) - - xorps %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - pxor %xmm8,%xmm8 - pxor %xmm9,%xmm9 - pxor %xmm10,%xmm10 - pxor %xmm11,%xmm11 - pxor %xmm12,%xmm12 - pxor %xmm13,%xmm13 - pxor 
%xmm14,%xmm14 - pxor %xmm15,%xmm15 - leaq 40(%rsp),%rax - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -.Locb_enc_epilogue: - .byte 0xf3,0xc3 -.size aesni_ocb_encrypt,.-aesni_ocb_encrypt - -.type __ocb_encrypt6,@function -.align 32 -__ocb_encrypt6: - pxor %xmm9,%xmm15 - movdqu (%rbx,%r12,1),%xmm11 - movdqa %xmm10,%xmm12 - movdqu (%rbx,%r13,1),%xmm13 - movdqa %xmm10,%xmm14 - pxor %xmm15,%xmm10 - movdqu (%rbx,%r14,1),%xmm15 - pxor %xmm10,%xmm11 - pxor %xmm2,%xmm8 - pxor %xmm10,%xmm2 - pxor %xmm11,%xmm12 - pxor %xmm3,%xmm8 - pxor %xmm11,%xmm3 - pxor %xmm12,%xmm13 - pxor %xmm4,%xmm8 - pxor %xmm12,%xmm4 - pxor %xmm13,%xmm14 - pxor %xmm5,%xmm8 - pxor %xmm13,%xmm5 - pxor %xmm14,%xmm15 - pxor %xmm6,%xmm8 - pxor %xmm14,%xmm6 - pxor %xmm7,%xmm8 - pxor %xmm15,%xmm7 - movups 32(%r11),%xmm0 - - leaq 1(%r8),%r12 - leaq 3(%r8),%r13 - leaq 5(%r8),%r14 - addq $6,%r8 - pxor %xmm9,%xmm10 - bsfq %r12,%r12 - bsfq %r13,%r13 - bsfq %r14,%r14 - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - pxor %xmm9,%xmm11 - pxor %xmm9,%xmm12 -.byte 102,15,56,220,241 - pxor %xmm9,%xmm13 - pxor %xmm9,%xmm14 -.byte 102,15,56,220,249 - movups 48(%r11),%xmm1 - pxor %xmm9,%xmm15 - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 - movups 64(%r11),%xmm0 - shlq $4,%r12 - shlq $4,%r13 - jmp .Locb_enc_loop6 - -.align 32 -.Locb_enc_loop6: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 - movups -16(%rcx,%rax,1),%xmm0 - jnz .Locb_enc_loop6 - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups 16(%r11),%xmm1 - shlq $4,%r14 - -.byte 102,65,15,56,221,210 - movdqu (%rbx),%xmm10 - movq %r10,%rax -.byte 102,65,15,56,221,219 -.byte 102,65,15,56,221,228 -.byte 102,65,15,56,221,237 -.byte 102,65,15,56,221,246 -.byte 102,65,15,56,221,255 - .byte 0xf3,0xc3 -.size __ocb_encrypt6,.-__ocb_encrypt6 - -.type __ocb_encrypt4,@function -.align 32 -__ocb_encrypt4: - pxor %xmm9,%xmm15 - movdqu (%rbx,%r12,1),%xmm11 - movdqa %xmm10,%xmm12 - movdqu (%rbx,%r13,1),%xmm13 - pxor %xmm15,%xmm10 - pxor %xmm10,%xmm11 - pxor %xmm2,%xmm8 - pxor %xmm10,%xmm2 - pxor %xmm11,%xmm12 - pxor %xmm3,%xmm8 - pxor %xmm11,%xmm3 - pxor %xmm12,%xmm13 - pxor %xmm4,%xmm8 - pxor %xmm12,%xmm4 - pxor %xmm5,%xmm8 - pxor %xmm13,%xmm5 - movups 32(%r11),%xmm0 - - pxor %xmm9,%xmm10 - pxor %xmm9,%xmm11 - pxor %xmm9,%xmm12 - pxor %xmm9,%xmm13 - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movups 48(%r11),%xmm1 - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - movups 64(%r11),%xmm0 - jmp .Locb_enc_loop4 - -.align 32 -.Locb_enc_loop4: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - movups -16(%rcx,%rax,1),%xmm0 - jnz .Locb_enc_loop4 - -.byte 102,15,56,220,209 
-.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movups 16(%r11),%xmm1 - movq %r10,%rax - -.byte 102,65,15,56,221,210 -.byte 102,65,15,56,221,219 -.byte 102,65,15,56,221,228 -.byte 102,65,15,56,221,237 - .byte 0xf3,0xc3 -.size __ocb_encrypt4,.-__ocb_encrypt4 - -.type __ocb_encrypt1,@function -.align 32 -__ocb_encrypt1: - pxor %xmm15,%xmm7 - pxor %xmm9,%xmm7 - pxor %xmm2,%xmm8 - pxor %xmm7,%xmm2 - movups 32(%r11),%xmm0 - -.byte 102,15,56,220,209 - movups 48(%r11),%xmm1 - pxor %xmm9,%xmm7 - -.byte 102,15,56,220,208 - movups 64(%r11),%xmm0 - jmp .Locb_enc_loop1 - -.align 32 -.Locb_enc_loop1: -.byte 102,15,56,220,209 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,220,208 - movups -16(%rcx,%rax,1),%xmm0 - jnz .Locb_enc_loop1 - -.byte 102,15,56,220,209 - movups 16(%r11),%xmm1 - movq %r10,%rax - -.byte 102,15,56,221,215 - .byte 0xf3,0xc3 -.size __ocb_encrypt1,.-__ocb_encrypt1 - -.globl aesni_ocb_decrypt -.hidden aesni_ocb_decrypt -.type aesni_ocb_decrypt,@function -.align 32 -aesni_ocb_decrypt: - leaq (%rsp),%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - movq 8(%rax),%rbx - movq 8+8(%rax),%rbp - - movl 240(%rcx),%r10d - movq %rcx,%r11 - shll $4,%r10d - movups (%rcx),%xmm9 - movups 16(%rcx,%r10,1),%xmm1 - - movdqu (%r9),%xmm15 - pxor %xmm1,%xmm9 - pxor %xmm1,%xmm15 - - movl $16+32,%eax - leaq 32(%r11,%r10,1),%rcx - movups 16(%r11),%xmm1 - subq %r10,%rax - movq %rax,%r10 - - movdqu (%rbx),%xmm10 - movdqu (%rbp),%xmm8 - - testq $1,%r8 - jnz .Locb_dec_odd - - bsfq %r8,%r12 - addq $1,%r8 - shlq $4,%r12 - movdqu (%rbx,%r12,1),%xmm7 - movdqu (%rdi),%xmm2 - leaq 16(%rdi),%rdi - - call __ocb_decrypt1 - - movdqa %xmm7,%xmm15 - movups %xmm2,(%rsi) - xorps %xmm2,%xmm8 - leaq 16(%rsi),%rsi - subq $1,%rdx - jz .Locb_dec_done - -.Locb_dec_odd: - leaq 1(%r8),%r12 - leaq 3(%r8),%r13 - leaq 5(%r8),%r14 - leaq 6(%r8),%r8 - bsfq %r12,%r12 - bsfq %r13,%r13 - bsfq %r14,%r14 - shlq $4,%r12 - shlq $4,%r13 - shlq $4,%r14 - - subq $6,%rdx - jc .Locb_dec_short - jmp .Locb_dec_grandloop - -.align 32 -.Locb_dec_grandloop: - movdqu 0(%rdi),%xmm2 - movdqu 16(%rdi),%xmm3 - movdqu 32(%rdi),%xmm4 - movdqu 48(%rdi),%xmm5 - movdqu 64(%rdi),%xmm6 - movdqu 80(%rdi),%xmm7 - leaq 96(%rdi),%rdi - - call __ocb_decrypt6 - - movups %xmm2,0(%rsi) - pxor %xmm2,%xmm8 - movups %xmm3,16(%rsi) - pxor %xmm3,%xmm8 - movups %xmm4,32(%rsi) - pxor %xmm4,%xmm8 - movups %xmm5,48(%rsi) - pxor %xmm5,%xmm8 - movups %xmm6,64(%rsi) - pxor %xmm6,%xmm8 - movups %xmm7,80(%rsi) - pxor %xmm7,%xmm8 - leaq 96(%rsi),%rsi - subq $6,%rdx - jnc .Locb_dec_grandloop - -.Locb_dec_short: - addq $6,%rdx - jz .Locb_dec_done - - movdqu 0(%rdi),%xmm2 - cmpq $2,%rdx - jb .Locb_dec_one - movdqu 16(%rdi),%xmm3 - je .Locb_dec_two - - movdqu 32(%rdi),%xmm4 - cmpq $4,%rdx - jb .Locb_dec_three - movdqu 48(%rdi),%xmm5 - je .Locb_dec_four - - movdqu 64(%rdi),%xmm6 - pxor %xmm7,%xmm7 - - call __ocb_decrypt6 - - movdqa %xmm14,%xmm15 - movups %xmm2,0(%rsi) - pxor %xmm2,%xmm8 - movups %xmm3,16(%rsi) - pxor %xmm3,%xmm8 - movups %xmm4,32(%rsi) - pxor %xmm4,%xmm8 - movups %xmm5,48(%rsi) - pxor %xmm5,%xmm8 - movups %xmm6,64(%rsi) - pxor %xmm6,%xmm8 - - jmp .Locb_dec_done - -.align 16 -.Locb_dec_one: - movdqa %xmm10,%xmm7 - - call __ocb_decrypt1 - - movdqa %xmm7,%xmm15 - movups %xmm2,0(%rsi) - xorps %xmm2,%xmm8 - jmp .Locb_dec_done - -.align 16 -.Locb_dec_two: - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - - call __ocb_decrypt4 - - movdqa %xmm11,%xmm15 - movups %xmm2,0(%rsi) - xorps %xmm2,%xmm8 - movups %xmm3,16(%rsi) - xorps 
%xmm3,%xmm8 - - jmp .Locb_dec_done - -.align 16 -.Locb_dec_three: - pxor %xmm5,%xmm5 - - call __ocb_decrypt4 - - movdqa %xmm12,%xmm15 - movups %xmm2,0(%rsi) - xorps %xmm2,%xmm8 - movups %xmm3,16(%rsi) - xorps %xmm3,%xmm8 - movups %xmm4,32(%rsi) - xorps %xmm4,%xmm8 - - jmp .Locb_dec_done - -.align 16 -.Locb_dec_four: - call __ocb_decrypt4 - - movdqa %xmm13,%xmm15 - movups %xmm2,0(%rsi) - pxor %xmm2,%xmm8 - movups %xmm3,16(%rsi) - pxor %xmm3,%xmm8 - movups %xmm4,32(%rsi) - pxor %xmm4,%xmm8 - movups %xmm5,48(%rsi) - pxor %xmm5,%xmm8 - -.Locb_dec_done: - pxor %xmm0,%xmm15 - movdqu %xmm8,(%rbp) - movdqu %xmm15,(%r9) - - xorps %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - pxor %xmm8,%xmm8 - pxor %xmm9,%xmm9 - pxor %xmm10,%xmm10 - pxor %xmm11,%xmm11 - pxor %xmm12,%xmm12 - pxor %xmm13,%xmm13 - pxor %xmm14,%xmm14 - pxor %xmm15,%xmm15 - leaq 40(%rsp),%rax - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -.Locb_dec_epilogue: - .byte 0xf3,0xc3 -.size aesni_ocb_decrypt,.-aesni_ocb_decrypt - -.type __ocb_decrypt6,@function -.align 32 -__ocb_decrypt6: - pxor %xmm9,%xmm15 - movdqu (%rbx,%r12,1),%xmm11 - movdqa %xmm10,%xmm12 - movdqu (%rbx,%r13,1),%xmm13 - movdqa %xmm10,%xmm14 - pxor %xmm15,%xmm10 - movdqu (%rbx,%r14,1),%xmm15 - pxor %xmm10,%xmm11 - pxor %xmm10,%xmm2 - pxor %xmm11,%xmm12 - pxor %xmm11,%xmm3 - pxor %xmm12,%xmm13 - pxor %xmm12,%xmm4 - pxor %xmm13,%xmm14 - pxor %xmm13,%xmm5 - pxor %xmm14,%xmm15 - pxor %xmm14,%xmm6 - pxor %xmm15,%xmm7 - movups 32(%r11),%xmm0 - - leaq 1(%r8),%r12 - leaq 3(%r8),%r13 - leaq 5(%r8),%r14 - addq $6,%r8 - pxor %xmm9,%xmm10 - bsfq %r12,%r12 - bsfq %r13,%r13 - bsfq %r14,%r14 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - pxor %xmm9,%xmm11 - pxor %xmm9,%xmm12 -.byte 102,15,56,222,241 - pxor %xmm9,%xmm13 - pxor %xmm9,%xmm14 -.byte 102,15,56,222,249 - movups 48(%r11),%xmm1 - pxor %xmm9,%xmm15 - -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 - movups 64(%r11),%xmm0 - shlq $4,%r12 - shlq $4,%r13 - jmp .Locb_dec_loop6 - -.align 32 -.Locb_dec_loop6: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 - movups -16(%rcx,%rax,1),%xmm0 - jnz .Locb_dec_loop6 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups 16(%r11),%xmm1 - shlq $4,%r14 - -.byte 102,65,15,56,223,210 - movdqu (%rbx),%xmm10 - movq %r10,%rax -.byte 102,65,15,56,223,219 -.byte 102,65,15,56,223,228 -.byte 102,65,15,56,223,237 -.byte 102,65,15,56,223,246 -.byte 102,65,15,56,223,255 - .byte 0xf3,0xc3 -.size __ocb_decrypt6,.-__ocb_decrypt6 - -.type __ocb_decrypt4,@function -.align 32 -__ocb_decrypt4: - pxor %xmm9,%xmm15 - movdqu (%rbx,%r12,1),%xmm11 - movdqa %xmm10,%xmm12 - movdqu (%rbx,%r13,1),%xmm13 - pxor %xmm15,%xmm10 - pxor %xmm10,%xmm11 - pxor %xmm10,%xmm2 - pxor %xmm11,%xmm12 - pxor %xmm11,%xmm3 - pxor %xmm12,%xmm13 - pxor %xmm12,%xmm4 - pxor %xmm13,%xmm5 - movups 32(%r11),%xmm0 - - 
pxor %xmm9,%xmm10 - pxor %xmm9,%xmm11 - pxor %xmm9,%xmm12 - pxor %xmm9,%xmm13 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - movups 48(%r11),%xmm1 - -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 - movups 64(%r11),%xmm0 - jmp .Locb_dec_loop4 - -.align 32 -.Locb_dec_loop4: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 - movups -16(%rcx,%rax,1),%xmm0 - jnz .Locb_dec_loop4 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - movups 16(%r11),%xmm1 - movq %r10,%rax - -.byte 102,65,15,56,223,210 -.byte 102,65,15,56,223,219 -.byte 102,65,15,56,223,228 -.byte 102,65,15,56,223,237 - .byte 0xf3,0xc3 -.size __ocb_decrypt4,.-__ocb_decrypt4 - -.type __ocb_decrypt1,@function -.align 32 -__ocb_decrypt1: - pxor %xmm15,%xmm7 - pxor %xmm9,%xmm7 - pxor %xmm7,%xmm2 - movups 32(%r11),%xmm0 - -.byte 102,15,56,222,209 - movups 48(%r11),%xmm1 - pxor %xmm9,%xmm7 - -.byte 102,15,56,222,208 - movups 64(%r11),%xmm0 - jmp .Locb_dec_loop1 - -.align 32 -.Locb_dec_loop1: -.byte 102,15,56,222,209 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,222,208 - movups -16(%rcx,%rax,1),%xmm0 - jnz .Locb_dec_loop1 - -.byte 102,15,56,222,209 - movups 16(%r11),%xmm1 - movq %r10,%rax - -.byte 102,15,56,223,215 - .byte 0xf3,0xc3 -.size __ocb_decrypt1,.-__ocb_decrypt1 -.globl aesni_cbc_encrypt -.hidden aesni_cbc_encrypt -.type aesni_cbc_encrypt,@function -.align 16 -aesni_cbc_encrypt: +aes_hw_cbc_encrypt: +.cfi_startproc testq %rdx,%rdx jz .Lcbc_ret @@ -3384,12 +1492,12 @@ aesni_cbc_encrypt: xorps %xmm0,%xmm3 leaq 32(%rcx),%rcx xorps %xmm3,%xmm2 -.Loop_enc1_15: +.Loop_enc1_6: .byte 102,15,56,220,209 decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_enc1_15 + jnz .Loop_enc1_6 .byte 102,15,56,221,209 movl %r10d,%eax movq %r11,%rcx @@ -3435,12 +1543,12 @@ aesni_cbc_encrypt: movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 -.Loop_dec1_16: +.Loop_dec1_7: .byte 102,15,56,222,209 decl %r10d movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_dec1_16 + jnz .Loop_dec1_7 .byte 102,15,56,223,209 pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 @@ -3453,7 +1561,9 @@ aesni_cbc_encrypt: .align 16 .Lcbc_decrypt_bulk: leaq (%rsp),%r11 +.cfi_def_cfa_register %r11 pushq %rbp +.cfi_offset %rbp,-16 subq $16,%rsp andq $-16,%rsp movq %rcx,%rbp @@ -3851,12 +1961,12 @@ aesni_cbc_encrypt: movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 -.Loop_dec1_17: +.Loop_dec1_8: .byte 102,15,56,222,209 decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_dec1_17 + jnz .Loop_dec1_8 .byte 102,15,56,223,209 xorps %xmm10,%xmm2 movaps %xmm11,%xmm10 @@ -3938,16 +2048,21 @@ aesni_cbc_encrypt: xorps %xmm0,%xmm0 pxor %xmm1,%xmm1 movq -8(%r11),%rbp +.cfi_restore %rbp leaq (%r11),%rsp +.cfi_def_cfa_register %rsp .Lcbc_ret: .byte 0xf3,0xc3 -.size aesni_cbc_encrypt,.-aesni_cbc_encrypt -.globl aesni_set_decrypt_key -.hidden aesni_set_decrypt_key -.type aesni_set_decrypt_key,@function +.cfi_endproc +.size aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt +.globl aes_hw_set_decrypt_key +.hidden aes_hw_set_decrypt_key +.type aes_hw_set_decrypt_key,@function .align 16 -aesni_set_decrypt_key: +aes_hw_set_decrypt_key: +.cfi_startproc .byte 0x48,0x83,0xEC,0x08 +.cfi_adjust_cfa_offset 8 call 
__aesni_set_encrypt_key shll $4,%esi testl %eax,%eax @@ -3980,16 +2095,23 @@ aesni_set_decrypt_key: pxor %xmm0,%xmm0 .Ldec_key_ret: addq $8,%rsp +.cfi_adjust_cfa_offset -8 .byte 0xf3,0xc3 +.cfi_endproc .LSEH_end_set_decrypt_key: -.size aesni_set_decrypt_key,.-aesni_set_decrypt_key -.globl aesni_set_encrypt_key -.hidden aesni_set_encrypt_key -.type aesni_set_encrypt_key,@function +.size aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key +.globl aes_hw_set_encrypt_key +.hidden aes_hw_set_encrypt_key +.type aes_hw_set_encrypt_key,@function .align 16 -aesni_set_encrypt_key: +aes_hw_set_encrypt_key: __aesni_set_encrypt_key: +.cfi_startproc +#ifdef BORINGSSL_DISPATCH_TEST + movb $1,BORINGSSL_function_hit+3(%rip) +#endif .byte 0x48,0x83,0xEC,0x08 +.cfi_adjust_cfa_offset 8 movq $-1,%rax testq %rdi,%rdi jz .Lenc_key_ret @@ -4283,7 +2405,9 @@ __aesni_set_encrypt_key: pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 addq $8,%rsp +.cfi_adjust_cfa_offset -8 .byte 0xf3,0xc3 +.cfi_endproc .LSEH_end_set_encrypt_key: .align 16 @@ -4354,7 +2478,7 @@ __aesni_set_encrypt_key: shufps $170,%xmm1,%xmm1 xorps %xmm1,%xmm2 .byte 0xf3,0xc3 -.size aesni_set_encrypt_key,.-aesni_set_encrypt_key +.size aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key .size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key .align 64 .Lbswap_mask: @@ -4379,3 +2503,4 @@ __aesni_set_encrypt_key: .byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S deleted file mode 100644 index 04b161c995..0000000000 --- a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S +++ /dev/null @@ -1,2503 +0,0 @@ -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -.text - -.extern asm_AES_encrypt -.hidden asm_AES_encrypt -.extern asm_AES_decrypt -.hidden asm_AES_decrypt - -.type _bsaes_encrypt8,@function -.align 64 -_bsaes_encrypt8: - leaq .LBS0(%rip),%r11 - - movdqa (%rax),%xmm8 - leaq 16(%rax),%rax - movdqa 80(%r11),%xmm7 - pxor %xmm8,%xmm15 - pxor %xmm8,%xmm0 - pxor %xmm8,%xmm1 - pxor %xmm8,%xmm2 -.byte 102,68,15,56,0,255 -.byte 102,15,56,0,199 - pxor %xmm8,%xmm3 - pxor %xmm8,%xmm4 -.byte 102,15,56,0,207 -.byte 102,15,56,0,215 - pxor %xmm8,%xmm5 - pxor %xmm8,%xmm6 -.byte 102,15,56,0,223 -.byte 102,15,56,0,231 -.byte 102,15,56,0,239 -.byte 102,15,56,0,247 -_bsaes_encrypt8_bitslice: - movdqa 0(%r11),%xmm7 - movdqa 16(%r11),%xmm8 - movdqa %xmm5,%xmm9 - psrlq $1,%xmm5 - movdqa %xmm3,%xmm10 - psrlq $1,%xmm3 - pxor %xmm6,%xmm5 - pxor %xmm4,%xmm3 - pand %xmm7,%xmm5 - pand %xmm7,%xmm3 - pxor %xmm5,%xmm6 - psllq $1,%xmm5 - pxor %xmm3,%xmm4 - psllq $1,%xmm3 - pxor %xmm9,%xmm5 - pxor %xmm10,%xmm3 - movdqa %xmm1,%xmm9 - psrlq $1,%xmm1 - movdqa %xmm15,%xmm10 - psrlq $1,%xmm15 - pxor %xmm2,%xmm1 - pxor %xmm0,%xmm15 - pand %xmm7,%xmm1 - pand %xmm7,%xmm15 - pxor %xmm1,%xmm2 - psllq $1,%xmm1 - pxor %xmm15,%xmm0 - psllq $1,%xmm15 - pxor %xmm9,%xmm1 - pxor %xmm10,%xmm15 - movdqa 32(%r11),%xmm7 - movdqa %xmm4,%xmm9 - psrlq $2,%xmm4 - movdqa %xmm3,%xmm10 - psrlq $2,%xmm3 - pxor %xmm6,%xmm4 - pxor %xmm5,%xmm3 - pand %xmm8,%xmm4 - pand %xmm8,%xmm3 - pxor %xmm4,%xmm6 - psllq $2,%xmm4 - pxor %xmm3,%xmm5 - psllq $2,%xmm3 - pxor %xmm9,%xmm4 - pxor %xmm10,%xmm3 - movdqa %xmm0,%xmm9 - psrlq $2,%xmm0 - movdqa %xmm15,%xmm10 
- psrlq $2,%xmm15 - pxor %xmm2,%xmm0 - pxor %xmm1,%xmm15 - pand %xmm8,%xmm0 - pand %xmm8,%xmm15 - pxor %xmm0,%xmm2 - psllq $2,%xmm0 - pxor %xmm15,%xmm1 - psllq $2,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa %xmm2,%xmm9 - psrlq $4,%xmm2 - movdqa %xmm1,%xmm10 - psrlq $4,%xmm1 - pxor %xmm6,%xmm2 - pxor %xmm5,%xmm1 - pand %xmm7,%xmm2 - pand %xmm7,%xmm1 - pxor %xmm2,%xmm6 - psllq $4,%xmm2 - pxor %xmm1,%xmm5 - psllq $4,%xmm1 - pxor %xmm9,%xmm2 - pxor %xmm10,%xmm1 - movdqa %xmm0,%xmm9 - psrlq $4,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $4,%xmm15 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pand %xmm7,%xmm0 - pand %xmm7,%xmm15 - pxor %xmm0,%xmm4 - psllq $4,%xmm0 - pxor %xmm15,%xmm3 - psllq $4,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - decl %r10d - jmp .Lenc_sbox -.align 16 -.Lenc_loop: - pxor 0(%rax),%xmm15 - pxor 16(%rax),%xmm0 - pxor 32(%rax),%xmm1 - pxor 48(%rax),%xmm2 -.byte 102,68,15,56,0,255 -.byte 102,15,56,0,199 - pxor 64(%rax),%xmm3 - pxor 80(%rax),%xmm4 -.byte 102,15,56,0,207 -.byte 102,15,56,0,215 - pxor 96(%rax),%xmm5 - pxor 112(%rax),%xmm6 -.byte 102,15,56,0,223 -.byte 102,15,56,0,231 -.byte 102,15,56,0,239 -.byte 102,15,56,0,247 - leaq 128(%rax),%rax -.Lenc_sbox: - pxor %xmm5,%xmm4 - pxor %xmm0,%xmm1 - pxor %xmm15,%xmm2 - pxor %xmm1,%xmm5 - pxor %xmm15,%xmm4 - - pxor %xmm2,%xmm5 - pxor %xmm6,%xmm2 - pxor %xmm4,%xmm6 - pxor %xmm3,%xmm2 - pxor %xmm4,%xmm3 - pxor %xmm0,%xmm2 - - pxor %xmm6,%xmm1 - pxor %xmm4,%xmm0 - movdqa %xmm6,%xmm10 - movdqa %xmm0,%xmm9 - movdqa %xmm4,%xmm8 - movdqa %xmm1,%xmm12 - movdqa %xmm5,%xmm11 - - pxor %xmm3,%xmm10 - pxor %xmm1,%xmm9 - pxor %xmm2,%xmm8 - movdqa %xmm10,%xmm13 - pxor %xmm3,%xmm12 - movdqa %xmm9,%xmm7 - pxor %xmm15,%xmm11 - movdqa %xmm10,%xmm14 - - por %xmm8,%xmm9 - por %xmm11,%xmm10 - pxor %xmm7,%xmm14 - pand %xmm11,%xmm13 - pxor %xmm8,%xmm11 - pand %xmm8,%xmm7 - pand %xmm11,%xmm14 - movdqa %xmm2,%xmm11 - pxor %xmm15,%xmm11 - pand %xmm11,%xmm12 - pxor %xmm12,%xmm10 - pxor %xmm12,%xmm9 - movdqa %xmm6,%xmm12 - movdqa %xmm4,%xmm11 - pxor %xmm0,%xmm12 - pxor %xmm5,%xmm11 - movdqa %xmm12,%xmm8 - pand %xmm11,%xmm12 - por %xmm11,%xmm8 - pxor %xmm12,%xmm7 - pxor %xmm14,%xmm10 - pxor %xmm13,%xmm9 - pxor %xmm14,%xmm8 - movdqa %xmm1,%xmm11 - pxor %xmm13,%xmm7 - movdqa %xmm3,%xmm12 - pxor %xmm13,%xmm8 - movdqa %xmm0,%xmm13 - pand %xmm2,%xmm11 - movdqa %xmm6,%xmm14 - pand %xmm15,%xmm12 - pand %xmm4,%xmm13 - por %xmm5,%xmm14 - pxor %xmm11,%xmm10 - pxor %xmm12,%xmm9 - pxor %xmm13,%xmm8 - pxor %xmm14,%xmm7 - - - - - - movdqa %xmm10,%xmm11 - pand %xmm8,%xmm10 - pxor %xmm9,%xmm11 - - movdqa %xmm7,%xmm13 - movdqa %xmm11,%xmm14 - pxor %xmm10,%xmm13 - pand %xmm13,%xmm14 - - movdqa %xmm8,%xmm12 - pxor %xmm9,%xmm14 - pxor %xmm7,%xmm12 - - pxor %xmm9,%xmm10 - - pand %xmm10,%xmm12 - - movdqa %xmm13,%xmm9 - pxor %xmm7,%xmm12 - - pxor %xmm12,%xmm9 - pxor %xmm12,%xmm8 - - pand %xmm7,%xmm9 - - pxor %xmm9,%xmm13 - pxor %xmm9,%xmm8 - - pand %xmm14,%xmm13 - - pxor %xmm11,%xmm13 - movdqa %xmm5,%xmm11 - movdqa %xmm4,%xmm7 - movdqa %xmm14,%xmm9 - pxor %xmm13,%xmm9 - pand %xmm5,%xmm9 - pxor %xmm4,%xmm5 - pand %xmm14,%xmm4 - pand %xmm13,%xmm5 - pxor %xmm4,%xmm5 - pxor %xmm9,%xmm4 - pxor %xmm15,%xmm11 - pxor %xmm2,%xmm7 - pxor %xmm12,%xmm14 - pxor %xmm8,%xmm13 - movdqa %xmm14,%xmm10 - movdqa %xmm12,%xmm9 - pxor %xmm13,%xmm10 - pxor %xmm8,%xmm9 - pand %xmm11,%xmm10 - pand %xmm15,%xmm9 - pxor %xmm7,%xmm11 - pxor %xmm2,%xmm15 - pand %xmm14,%xmm7 - pand %xmm12,%xmm2 - pand %xmm13,%xmm11 - pand %xmm8,%xmm15 - pxor %xmm11,%xmm7 - pxor %xmm2,%xmm15 - pxor %xmm10,%xmm11 - pxor 
%xmm9,%xmm2 - pxor %xmm11,%xmm5 - pxor %xmm11,%xmm15 - pxor %xmm7,%xmm4 - pxor %xmm7,%xmm2 - - movdqa %xmm6,%xmm11 - movdqa %xmm0,%xmm7 - pxor %xmm3,%xmm11 - pxor %xmm1,%xmm7 - movdqa %xmm14,%xmm10 - movdqa %xmm12,%xmm9 - pxor %xmm13,%xmm10 - pxor %xmm8,%xmm9 - pand %xmm11,%xmm10 - pand %xmm3,%xmm9 - pxor %xmm7,%xmm11 - pxor %xmm1,%xmm3 - pand %xmm14,%xmm7 - pand %xmm12,%xmm1 - pand %xmm13,%xmm11 - pand %xmm8,%xmm3 - pxor %xmm11,%xmm7 - pxor %xmm1,%xmm3 - pxor %xmm10,%xmm11 - pxor %xmm9,%xmm1 - pxor %xmm12,%xmm14 - pxor %xmm8,%xmm13 - movdqa %xmm14,%xmm10 - pxor %xmm13,%xmm10 - pand %xmm6,%xmm10 - pxor %xmm0,%xmm6 - pand %xmm14,%xmm0 - pand %xmm13,%xmm6 - pxor %xmm0,%xmm6 - pxor %xmm10,%xmm0 - pxor %xmm11,%xmm6 - pxor %xmm11,%xmm3 - pxor %xmm7,%xmm0 - pxor %xmm7,%xmm1 - pxor %xmm15,%xmm6 - pxor %xmm5,%xmm0 - pxor %xmm6,%xmm3 - pxor %xmm15,%xmm5 - pxor %xmm0,%xmm15 - - pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 - pxor %xmm2,%xmm1 - pxor %xmm4,%xmm2 - pxor %xmm4,%xmm3 - - pxor %xmm2,%xmm5 - decl %r10d - jl .Lenc_done - pshufd $0x93,%xmm15,%xmm7 - pshufd $0x93,%xmm0,%xmm8 - pxor %xmm7,%xmm15 - pshufd $0x93,%xmm3,%xmm9 - pxor %xmm8,%xmm0 - pshufd $0x93,%xmm5,%xmm10 - pxor %xmm9,%xmm3 - pshufd $0x93,%xmm2,%xmm11 - pxor %xmm10,%xmm5 - pshufd $0x93,%xmm6,%xmm12 - pxor %xmm11,%xmm2 - pshufd $0x93,%xmm1,%xmm13 - pxor %xmm12,%xmm6 - pshufd $0x93,%xmm4,%xmm14 - pxor %xmm13,%xmm1 - pxor %xmm14,%xmm4 - - pxor %xmm15,%xmm8 - pxor %xmm4,%xmm7 - pxor %xmm4,%xmm8 - pshufd $0x4E,%xmm15,%xmm15 - pxor %xmm0,%xmm9 - pshufd $0x4E,%xmm0,%xmm0 - pxor %xmm2,%xmm12 - pxor %xmm7,%xmm15 - pxor %xmm6,%xmm13 - pxor %xmm8,%xmm0 - pxor %xmm5,%xmm11 - pshufd $0x4E,%xmm2,%xmm7 - pxor %xmm1,%xmm14 - pshufd $0x4E,%xmm6,%xmm8 - pxor %xmm3,%xmm10 - pshufd $0x4E,%xmm5,%xmm2 - pxor %xmm4,%xmm10 - pshufd $0x4E,%xmm4,%xmm6 - pxor %xmm4,%xmm11 - pshufd $0x4E,%xmm1,%xmm5 - pxor %xmm11,%xmm7 - pshufd $0x4E,%xmm3,%xmm1 - pxor %xmm12,%xmm8 - pxor %xmm10,%xmm2 - pxor %xmm14,%xmm6 - pxor %xmm13,%xmm5 - movdqa %xmm7,%xmm3 - pxor %xmm9,%xmm1 - movdqa %xmm8,%xmm4 - movdqa 48(%r11),%xmm7 - jnz .Lenc_loop - movdqa 64(%r11),%xmm7 - jmp .Lenc_loop -.align 16 -.Lenc_done: - movdqa 0(%r11),%xmm7 - movdqa 16(%r11),%xmm8 - movdqa %xmm1,%xmm9 - psrlq $1,%xmm1 - movdqa %xmm2,%xmm10 - psrlq $1,%xmm2 - pxor %xmm4,%xmm1 - pxor %xmm6,%xmm2 - pand %xmm7,%xmm1 - pand %xmm7,%xmm2 - pxor %xmm1,%xmm4 - psllq $1,%xmm1 - pxor %xmm2,%xmm6 - psllq $1,%xmm2 - pxor %xmm9,%xmm1 - pxor %xmm10,%xmm2 - movdqa %xmm3,%xmm9 - psrlq $1,%xmm3 - movdqa %xmm15,%xmm10 - psrlq $1,%xmm15 - pxor %xmm5,%xmm3 - pxor %xmm0,%xmm15 - pand %xmm7,%xmm3 - pand %xmm7,%xmm15 - pxor %xmm3,%xmm5 - psllq $1,%xmm3 - pxor %xmm15,%xmm0 - psllq $1,%xmm15 - pxor %xmm9,%xmm3 - pxor %xmm10,%xmm15 - movdqa 32(%r11),%xmm7 - movdqa %xmm6,%xmm9 - psrlq $2,%xmm6 - movdqa %xmm2,%xmm10 - psrlq $2,%xmm2 - pxor %xmm4,%xmm6 - pxor %xmm1,%xmm2 - pand %xmm8,%xmm6 - pand %xmm8,%xmm2 - pxor %xmm6,%xmm4 - psllq $2,%xmm6 - pxor %xmm2,%xmm1 - psllq $2,%xmm2 - pxor %xmm9,%xmm6 - pxor %xmm10,%xmm2 - movdqa %xmm0,%xmm9 - psrlq $2,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $2,%xmm15 - pxor %xmm5,%xmm0 - pxor %xmm3,%xmm15 - pand %xmm8,%xmm0 - pand %xmm8,%xmm15 - pxor %xmm0,%xmm5 - psllq $2,%xmm0 - pxor %xmm15,%xmm3 - psllq $2,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa %xmm5,%xmm9 - psrlq $4,%xmm5 - movdqa %xmm3,%xmm10 - psrlq $4,%xmm3 - pxor %xmm4,%xmm5 - pxor %xmm1,%xmm3 - pand %xmm7,%xmm5 - pand %xmm7,%xmm3 - pxor %xmm5,%xmm4 - psllq $4,%xmm5 - pxor %xmm3,%xmm1 - psllq $4,%xmm3 - pxor %xmm9,%xmm5 - pxor %xmm10,%xmm3 - 
movdqa %xmm0,%xmm9 - psrlq $4,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $4,%xmm15 - pxor %xmm6,%xmm0 - pxor %xmm2,%xmm15 - pand %xmm7,%xmm0 - pand %xmm7,%xmm15 - pxor %xmm0,%xmm6 - psllq $4,%xmm0 - pxor %xmm15,%xmm2 - psllq $4,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa (%rax),%xmm7 - pxor %xmm7,%xmm3 - pxor %xmm7,%xmm5 - pxor %xmm7,%xmm2 - pxor %xmm7,%xmm6 - pxor %xmm7,%xmm1 - pxor %xmm7,%xmm4 - pxor %xmm7,%xmm15 - pxor %xmm7,%xmm0 - .byte 0xf3,0xc3 -.size _bsaes_encrypt8,.-_bsaes_encrypt8 - -.type _bsaes_decrypt8,@function -.align 64 -_bsaes_decrypt8: - leaq .LBS0(%rip),%r11 - - movdqa (%rax),%xmm8 - leaq 16(%rax),%rax - movdqa -48(%r11),%xmm7 - pxor %xmm8,%xmm15 - pxor %xmm8,%xmm0 - pxor %xmm8,%xmm1 - pxor %xmm8,%xmm2 -.byte 102,68,15,56,0,255 -.byte 102,15,56,0,199 - pxor %xmm8,%xmm3 - pxor %xmm8,%xmm4 -.byte 102,15,56,0,207 -.byte 102,15,56,0,215 - pxor %xmm8,%xmm5 - pxor %xmm8,%xmm6 -.byte 102,15,56,0,223 -.byte 102,15,56,0,231 -.byte 102,15,56,0,239 -.byte 102,15,56,0,247 - movdqa 0(%r11),%xmm7 - movdqa 16(%r11),%xmm8 - movdqa %xmm5,%xmm9 - psrlq $1,%xmm5 - movdqa %xmm3,%xmm10 - psrlq $1,%xmm3 - pxor %xmm6,%xmm5 - pxor %xmm4,%xmm3 - pand %xmm7,%xmm5 - pand %xmm7,%xmm3 - pxor %xmm5,%xmm6 - psllq $1,%xmm5 - pxor %xmm3,%xmm4 - psllq $1,%xmm3 - pxor %xmm9,%xmm5 - pxor %xmm10,%xmm3 - movdqa %xmm1,%xmm9 - psrlq $1,%xmm1 - movdqa %xmm15,%xmm10 - psrlq $1,%xmm15 - pxor %xmm2,%xmm1 - pxor %xmm0,%xmm15 - pand %xmm7,%xmm1 - pand %xmm7,%xmm15 - pxor %xmm1,%xmm2 - psllq $1,%xmm1 - pxor %xmm15,%xmm0 - psllq $1,%xmm15 - pxor %xmm9,%xmm1 - pxor %xmm10,%xmm15 - movdqa 32(%r11),%xmm7 - movdqa %xmm4,%xmm9 - psrlq $2,%xmm4 - movdqa %xmm3,%xmm10 - psrlq $2,%xmm3 - pxor %xmm6,%xmm4 - pxor %xmm5,%xmm3 - pand %xmm8,%xmm4 - pand %xmm8,%xmm3 - pxor %xmm4,%xmm6 - psllq $2,%xmm4 - pxor %xmm3,%xmm5 - psllq $2,%xmm3 - pxor %xmm9,%xmm4 - pxor %xmm10,%xmm3 - movdqa %xmm0,%xmm9 - psrlq $2,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $2,%xmm15 - pxor %xmm2,%xmm0 - pxor %xmm1,%xmm15 - pand %xmm8,%xmm0 - pand %xmm8,%xmm15 - pxor %xmm0,%xmm2 - psllq $2,%xmm0 - pxor %xmm15,%xmm1 - psllq $2,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa %xmm2,%xmm9 - psrlq $4,%xmm2 - movdqa %xmm1,%xmm10 - psrlq $4,%xmm1 - pxor %xmm6,%xmm2 - pxor %xmm5,%xmm1 - pand %xmm7,%xmm2 - pand %xmm7,%xmm1 - pxor %xmm2,%xmm6 - psllq $4,%xmm2 - pxor %xmm1,%xmm5 - psllq $4,%xmm1 - pxor %xmm9,%xmm2 - pxor %xmm10,%xmm1 - movdqa %xmm0,%xmm9 - psrlq $4,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $4,%xmm15 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pand %xmm7,%xmm0 - pand %xmm7,%xmm15 - pxor %xmm0,%xmm4 - psllq $4,%xmm0 - pxor %xmm15,%xmm3 - psllq $4,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - decl %r10d - jmp .Ldec_sbox -.align 16 -.Ldec_loop: - pxor 0(%rax),%xmm15 - pxor 16(%rax),%xmm0 - pxor 32(%rax),%xmm1 - pxor 48(%rax),%xmm2 -.byte 102,68,15,56,0,255 -.byte 102,15,56,0,199 - pxor 64(%rax),%xmm3 - pxor 80(%rax),%xmm4 -.byte 102,15,56,0,207 -.byte 102,15,56,0,215 - pxor 96(%rax),%xmm5 - pxor 112(%rax),%xmm6 -.byte 102,15,56,0,223 -.byte 102,15,56,0,231 -.byte 102,15,56,0,239 -.byte 102,15,56,0,247 - leaq 128(%rax),%rax -.Ldec_sbox: - pxor %xmm3,%xmm2 - - pxor %xmm6,%xmm3 - pxor %xmm6,%xmm1 - pxor %xmm3,%xmm5 - pxor %xmm5,%xmm6 - pxor %xmm6,%xmm0 - - pxor %xmm0,%xmm15 - pxor %xmm4,%xmm1 - pxor %xmm15,%xmm2 - pxor %xmm15,%xmm4 - pxor %xmm2,%xmm0 - movdqa %xmm2,%xmm10 - movdqa %xmm6,%xmm9 - movdqa %xmm0,%xmm8 - movdqa %xmm3,%xmm12 - movdqa %xmm4,%xmm11 - - pxor %xmm15,%xmm10 - pxor %xmm3,%xmm9 - pxor %xmm5,%xmm8 - movdqa %xmm10,%xmm13 - pxor 
%xmm15,%xmm12 - movdqa %xmm9,%xmm7 - pxor %xmm1,%xmm11 - movdqa %xmm10,%xmm14 - - por %xmm8,%xmm9 - por %xmm11,%xmm10 - pxor %xmm7,%xmm14 - pand %xmm11,%xmm13 - pxor %xmm8,%xmm11 - pand %xmm8,%xmm7 - pand %xmm11,%xmm14 - movdqa %xmm5,%xmm11 - pxor %xmm1,%xmm11 - pand %xmm11,%xmm12 - pxor %xmm12,%xmm10 - pxor %xmm12,%xmm9 - movdqa %xmm2,%xmm12 - movdqa %xmm0,%xmm11 - pxor %xmm6,%xmm12 - pxor %xmm4,%xmm11 - movdqa %xmm12,%xmm8 - pand %xmm11,%xmm12 - por %xmm11,%xmm8 - pxor %xmm12,%xmm7 - pxor %xmm14,%xmm10 - pxor %xmm13,%xmm9 - pxor %xmm14,%xmm8 - movdqa %xmm3,%xmm11 - pxor %xmm13,%xmm7 - movdqa %xmm15,%xmm12 - pxor %xmm13,%xmm8 - movdqa %xmm6,%xmm13 - pand %xmm5,%xmm11 - movdqa %xmm2,%xmm14 - pand %xmm1,%xmm12 - pand %xmm0,%xmm13 - por %xmm4,%xmm14 - pxor %xmm11,%xmm10 - pxor %xmm12,%xmm9 - pxor %xmm13,%xmm8 - pxor %xmm14,%xmm7 - - - - - - movdqa %xmm10,%xmm11 - pand %xmm8,%xmm10 - pxor %xmm9,%xmm11 - - movdqa %xmm7,%xmm13 - movdqa %xmm11,%xmm14 - pxor %xmm10,%xmm13 - pand %xmm13,%xmm14 - - movdqa %xmm8,%xmm12 - pxor %xmm9,%xmm14 - pxor %xmm7,%xmm12 - - pxor %xmm9,%xmm10 - - pand %xmm10,%xmm12 - - movdqa %xmm13,%xmm9 - pxor %xmm7,%xmm12 - - pxor %xmm12,%xmm9 - pxor %xmm12,%xmm8 - - pand %xmm7,%xmm9 - - pxor %xmm9,%xmm13 - pxor %xmm9,%xmm8 - - pand %xmm14,%xmm13 - - pxor %xmm11,%xmm13 - movdqa %xmm4,%xmm11 - movdqa %xmm0,%xmm7 - movdqa %xmm14,%xmm9 - pxor %xmm13,%xmm9 - pand %xmm4,%xmm9 - pxor %xmm0,%xmm4 - pand %xmm14,%xmm0 - pand %xmm13,%xmm4 - pxor %xmm0,%xmm4 - pxor %xmm9,%xmm0 - pxor %xmm1,%xmm11 - pxor %xmm5,%xmm7 - pxor %xmm12,%xmm14 - pxor %xmm8,%xmm13 - movdqa %xmm14,%xmm10 - movdqa %xmm12,%xmm9 - pxor %xmm13,%xmm10 - pxor %xmm8,%xmm9 - pand %xmm11,%xmm10 - pand %xmm1,%xmm9 - pxor %xmm7,%xmm11 - pxor %xmm5,%xmm1 - pand %xmm14,%xmm7 - pand %xmm12,%xmm5 - pand %xmm13,%xmm11 - pand %xmm8,%xmm1 - pxor %xmm11,%xmm7 - pxor %xmm5,%xmm1 - pxor %xmm10,%xmm11 - pxor %xmm9,%xmm5 - pxor %xmm11,%xmm4 - pxor %xmm11,%xmm1 - pxor %xmm7,%xmm0 - pxor %xmm7,%xmm5 - - movdqa %xmm2,%xmm11 - movdqa %xmm6,%xmm7 - pxor %xmm15,%xmm11 - pxor %xmm3,%xmm7 - movdqa %xmm14,%xmm10 - movdqa %xmm12,%xmm9 - pxor %xmm13,%xmm10 - pxor %xmm8,%xmm9 - pand %xmm11,%xmm10 - pand %xmm15,%xmm9 - pxor %xmm7,%xmm11 - pxor %xmm3,%xmm15 - pand %xmm14,%xmm7 - pand %xmm12,%xmm3 - pand %xmm13,%xmm11 - pand %xmm8,%xmm15 - pxor %xmm11,%xmm7 - pxor %xmm3,%xmm15 - pxor %xmm10,%xmm11 - pxor %xmm9,%xmm3 - pxor %xmm12,%xmm14 - pxor %xmm8,%xmm13 - movdqa %xmm14,%xmm10 - pxor %xmm13,%xmm10 - pand %xmm2,%xmm10 - pxor %xmm6,%xmm2 - pand %xmm14,%xmm6 - pand %xmm13,%xmm2 - pxor %xmm6,%xmm2 - pxor %xmm10,%xmm6 - pxor %xmm11,%xmm2 - pxor %xmm11,%xmm15 - pxor %xmm7,%xmm6 - pxor %xmm7,%xmm3 - pxor %xmm6,%xmm0 - pxor %xmm4,%xmm5 - - pxor %xmm0,%xmm3 - pxor %xmm6,%xmm1 - pxor %xmm6,%xmm4 - pxor %xmm1,%xmm3 - pxor %xmm15,%xmm6 - pxor %xmm4,%xmm3 - pxor %xmm5,%xmm2 - pxor %xmm0,%xmm5 - pxor %xmm3,%xmm2 - - pxor %xmm15,%xmm3 - pxor %xmm2,%xmm6 - decl %r10d - jl .Ldec_done - - pshufd $0x4E,%xmm15,%xmm7 - pshufd $0x4E,%xmm2,%xmm13 - pxor %xmm15,%xmm7 - pshufd $0x4E,%xmm4,%xmm14 - pxor %xmm2,%xmm13 - pshufd $0x4E,%xmm0,%xmm8 - pxor %xmm4,%xmm14 - pshufd $0x4E,%xmm5,%xmm9 - pxor %xmm0,%xmm8 - pshufd $0x4E,%xmm3,%xmm10 - pxor %xmm5,%xmm9 - pxor %xmm13,%xmm15 - pxor %xmm13,%xmm0 - pshufd $0x4E,%xmm1,%xmm11 - pxor %xmm3,%xmm10 - pxor %xmm7,%xmm5 - pxor %xmm8,%xmm3 - pshufd $0x4E,%xmm6,%xmm12 - pxor %xmm1,%xmm11 - pxor %xmm14,%xmm0 - pxor %xmm9,%xmm1 - pxor %xmm6,%xmm12 - - pxor %xmm14,%xmm5 - pxor %xmm13,%xmm3 - pxor %xmm13,%xmm1 - pxor %xmm10,%xmm6 - pxor 
%xmm11,%xmm2 - pxor %xmm14,%xmm1 - pxor %xmm14,%xmm6 - pxor %xmm12,%xmm4 - pshufd $0x93,%xmm15,%xmm7 - pshufd $0x93,%xmm0,%xmm8 - pxor %xmm7,%xmm15 - pshufd $0x93,%xmm5,%xmm9 - pxor %xmm8,%xmm0 - pshufd $0x93,%xmm3,%xmm10 - pxor %xmm9,%xmm5 - pshufd $0x93,%xmm1,%xmm11 - pxor %xmm10,%xmm3 - pshufd $0x93,%xmm6,%xmm12 - pxor %xmm11,%xmm1 - pshufd $0x93,%xmm2,%xmm13 - pxor %xmm12,%xmm6 - pshufd $0x93,%xmm4,%xmm14 - pxor %xmm13,%xmm2 - pxor %xmm14,%xmm4 - - pxor %xmm15,%xmm8 - pxor %xmm4,%xmm7 - pxor %xmm4,%xmm8 - pshufd $0x4E,%xmm15,%xmm15 - pxor %xmm0,%xmm9 - pshufd $0x4E,%xmm0,%xmm0 - pxor %xmm1,%xmm12 - pxor %xmm7,%xmm15 - pxor %xmm6,%xmm13 - pxor %xmm8,%xmm0 - pxor %xmm3,%xmm11 - pshufd $0x4E,%xmm1,%xmm7 - pxor %xmm2,%xmm14 - pshufd $0x4E,%xmm6,%xmm8 - pxor %xmm5,%xmm10 - pshufd $0x4E,%xmm3,%xmm1 - pxor %xmm4,%xmm10 - pshufd $0x4E,%xmm4,%xmm6 - pxor %xmm4,%xmm11 - pshufd $0x4E,%xmm2,%xmm3 - pxor %xmm11,%xmm7 - pshufd $0x4E,%xmm5,%xmm2 - pxor %xmm12,%xmm8 - pxor %xmm1,%xmm10 - pxor %xmm14,%xmm6 - pxor %xmm3,%xmm13 - movdqa %xmm7,%xmm3 - pxor %xmm9,%xmm2 - movdqa %xmm13,%xmm5 - movdqa %xmm8,%xmm4 - movdqa %xmm2,%xmm1 - movdqa %xmm10,%xmm2 - movdqa -16(%r11),%xmm7 - jnz .Ldec_loop - movdqa -32(%r11),%xmm7 - jmp .Ldec_loop -.align 16 -.Ldec_done: - movdqa 0(%r11),%xmm7 - movdqa 16(%r11),%xmm8 - movdqa %xmm2,%xmm9 - psrlq $1,%xmm2 - movdqa %xmm1,%xmm10 - psrlq $1,%xmm1 - pxor %xmm4,%xmm2 - pxor %xmm6,%xmm1 - pand %xmm7,%xmm2 - pand %xmm7,%xmm1 - pxor %xmm2,%xmm4 - psllq $1,%xmm2 - pxor %xmm1,%xmm6 - psllq $1,%xmm1 - pxor %xmm9,%xmm2 - pxor %xmm10,%xmm1 - movdqa %xmm5,%xmm9 - psrlq $1,%xmm5 - movdqa %xmm15,%xmm10 - psrlq $1,%xmm15 - pxor %xmm3,%xmm5 - pxor %xmm0,%xmm15 - pand %xmm7,%xmm5 - pand %xmm7,%xmm15 - pxor %xmm5,%xmm3 - psllq $1,%xmm5 - pxor %xmm15,%xmm0 - psllq $1,%xmm15 - pxor %xmm9,%xmm5 - pxor %xmm10,%xmm15 - movdqa 32(%r11),%xmm7 - movdqa %xmm6,%xmm9 - psrlq $2,%xmm6 - movdqa %xmm1,%xmm10 - psrlq $2,%xmm1 - pxor %xmm4,%xmm6 - pxor %xmm2,%xmm1 - pand %xmm8,%xmm6 - pand %xmm8,%xmm1 - pxor %xmm6,%xmm4 - psllq $2,%xmm6 - pxor %xmm1,%xmm2 - psllq $2,%xmm1 - pxor %xmm9,%xmm6 - pxor %xmm10,%xmm1 - movdqa %xmm0,%xmm9 - psrlq $2,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $2,%xmm15 - pxor %xmm3,%xmm0 - pxor %xmm5,%xmm15 - pand %xmm8,%xmm0 - pand %xmm8,%xmm15 - pxor %xmm0,%xmm3 - psllq $2,%xmm0 - pxor %xmm15,%xmm5 - psllq $2,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa %xmm3,%xmm9 - psrlq $4,%xmm3 - movdqa %xmm5,%xmm10 - psrlq $4,%xmm5 - pxor %xmm4,%xmm3 - pxor %xmm2,%xmm5 - pand %xmm7,%xmm3 - pand %xmm7,%xmm5 - pxor %xmm3,%xmm4 - psllq $4,%xmm3 - pxor %xmm5,%xmm2 - psllq $4,%xmm5 - pxor %xmm9,%xmm3 - pxor %xmm10,%xmm5 - movdqa %xmm0,%xmm9 - psrlq $4,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $4,%xmm15 - pxor %xmm6,%xmm0 - pxor %xmm1,%xmm15 - pand %xmm7,%xmm0 - pand %xmm7,%xmm15 - pxor %xmm0,%xmm6 - psllq $4,%xmm0 - pxor %xmm15,%xmm1 - psllq $4,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa (%rax),%xmm7 - pxor %xmm7,%xmm5 - pxor %xmm7,%xmm3 - pxor %xmm7,%xmm1 - pxor %xmm7,%xmm6 - pxor %xmm7,%xmm2 - pxor %xmm7,%xmm4 - pxor %xmm7,%xmm15 - pxor %xmm7,%xmm0 - .byte 0xf3,0xc3 -.size _bsaes_decrypt8,.-_bsaes_decrypt8 -.type _bsaes_key_convert,@function -.align 16 -_bsaes_key_convert: - leaq .Lmasks(%rip),%r11 - movdqu (%rcx),%xmm7 - leaq 16(%rcx),%rcx - movdqa 0(%r11),%xmm0 - movdqa 16(%r11),%xmm1 - movdqa 32(%r11),%xmm2 - movdqa 48(%r11),%xmm3 - movdqa 64(%r11),%xmm4 - pcmpeqd %xmm5,%xmm5 - - movdqu (%rcx),%xmm6 - movdqa %xmm7,(%rax) - leaq 16(%rax),%rax - decl %r10d - jmp .Lkey_loop 
-.align 16 -.Lkey_loop: -.byte 102,15,56,0,244 - - movdqa %xmm0,%xmm8 - movdqa %xmm1,%xmm9 - - pand %xmm6,%xmm8 - pand %xmm6,%xmm9 - movdqa %xmm2,%xmm10 - pcmpeqb %xmm0,%xmm8 - psllq $4,%xmm0 - movdqa %xmm3,%xmm11 - pcmpeqb %xmm1,%xmm9 - psllq $4,%xmm1 - - pand %xmm6,%xmm10 - pand %xmm6,%xmm11 - movdqa %xmm0,%xmm12 - pcmpeqb %xmm2,%xmm10 - psllq $4,%xmm2 - movdqa %xmm1,%xmm13 - pcmpeqb %xmm3,%xmm11 - psllq $4,%xmm3 - - movdqa %xmm2,%xmm14 - movdqa %xmm3,%xmm15 - pxor %xmm5,%xmm8 - pxor %xmm5,%xmm9 - - pand %xmm6,%xmm12 - pand %xmm6,%xmm13 - movdqa %xmm8,0(%rax) - pcmpeqb %xmm0,%xmm12 - psrlq $4,%xmm0 - movdqa %xmm9,16(%rax) - pcmpeqb %xmm1,%xmm13 - psrlq $4,%xmm1 - leaq 16(%rcx),%rcx - - pand %xmm6,%xmm14 - pand %xmm6,%xmm15 - movdqa %xmm10,32(%rax) - pcmpeqb %xmm2,%xmm14 - psrlq $4,%xmm2 - movdqa %xmm11,48(%rax) - pcmpeqb %xmm3,%xmm15 - psrlq $4,%xmm3 - movdqu (%rcx),%xmm6 - - pxor %xmm5,%xmm13 - pxor %xmm5,%xmm14 - movdqa %xmm12,64(%rax) - movdqa %xmm13,80(%rax) - movdqa %xmm14,96(%rax) - movdqa %xmm15,112(%rax) - leaq 128(%rax),%rax - decl %r10d - jnz .Lkey_loop - - movdqa 80(%r11),%xmm7 - - .byte 0xf3,0xc3 -.size _bsaes_key_convert,.-_bsaes_key_convert -.extern asm_AES_cbc_encrypt -.hidden asm_AES_cbc_encrypt -.globl bsaes_cbc_encrypt -.hidden bsaes_cbc_encrypt -.type bsaes_cbc_encrypt,@function -.align 16 -bsaes_cbc_encrypt: - cmpl $0,%r9d - jne asm_AES_cbc_encrypt - cmpq $128,%rdx - jb asm_AES_cbc_encrypt - - movq %rsp,%rax -.Lcbc_dec_prologue: - pushq %rbp - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - leaq -72(%rsp),%rsp - movq %rsp,%rbp - movl 240(%rcx),%eax - movq %rdi,%r12 - movq %rsi,%r13 - movq %rdx,%r14 - movq %rcx,%r15 - movq %r8,%rbx - shrq $4,%r14 - - movl %eax,%edx - shlq $7,%rax - subq $96,%rax - subq %rax,%rsp - - movq %rsp,%rax - movq %r15,%rcx - movl %edx,%r10d - call _bsaes_key_convert - pxor (%rsp),%xmm7 - movdqa %xmm6,(%rax) - movdqa %xmm7,(%rsp) - - movdqu (%rbx),%xmm14 - subq $8,%r14 -.Lcbc_dec_loop: - movdqu 0(%r12),%xmm15 - movdqu 16(%r12),%xmm0 - movdqu 32(%r12),%xmm1 - movdqu 48(%r12),%xmm2 - movdqu 64(%r12),%xmm3 - movdqu 80(%r12),%xmm4 - movq %rsp,%rax - movdqu 96(%r12),%xmm5 - movl %edx,%r10d - movdqu 112(%r12),%xmm6 - movdqa %xmm14,32(%rbp) - - call _bsaes_decrypt8 - - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm5 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm3 - movdqu 64(%r12),%xmm11 - pxor %xmm10,%xmm1 - movdqu 80(%r12),%xmm12 - pxor %xmm11,%xmm6 - movdqu 96(%r12),%xmm13 - pxor %xmm12,%xmm2 - movdqu 112(%r12),%xmm14 - pxor %xmm13,%xmm4 - movdqu %xmm15,0(%r13) - leaq 128(%r12),%r12 - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - movdqu %xmm1,64(%r13) - movdqu %xmm6,80(%r13) - movdqu %xmm2,96(%r13) - movdqu %xmm4,112(%r13) - leaq 128(%r13),%r13 - subq $8,%r14 - jnc .Lcbc_dec_loop - - addq $8,%r14 - jz .Lcbc_dec_done - - movdqu 0(%r12),%xmm15 - movq %rsp,%rax - movl %edx,%r10d - cmpq $2,%r14 - jb .Lcbc_dec_one - movdqu 16(%r12),%xmm0 - je .Lcbc_dec_two - movdqu 32(%r12),%xmm1 - cmpq $4,%r14 - jb .Lcbc_dec_three - movdqu 48(%r12),%xmm2 - je .Lcbc_dec_four - movdqu 64(%r12),%xmm3 - cmpq $6,%r14 - jb .Lcbc_dec_five - movdqu 80(%r12),%xmm4 - je .Lcbc_dec_six - movdqu 96(%r12),%xmm5 - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm5 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm3 - movdqu 64(%r12),%xmm11 - pxor 
%xmm10,%xmm1 - movdqu 80(%r12),%xmm12 - pxor %xmm11,%xmm6 - movdqu 96(%r12),%xmm14 - pxor %xmm12,%xmm2 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - movdqu %xmm1,64(%r13) - movdqu %xmm6,80(%r13) - movdqu %xmm2,96(%r13) - jmp .Lcbc_dec_done -.align 16 -.Lcbc_dec_six: - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm5 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm3 - movdqu 64(%r12),%xmm11 - pxor %xmm10,%xmm1 - movdqu 80(%r12),%xmm14 - pxor %xmm11,%xmm6 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - movdqu %xmm1,64(%r13) - movdqu %xmm6,80(%r13) - jmp .Lcbc_dec_done -.align 16 -.Lcbc_dec_five: - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm5 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm3 - movdqu 64(%r12),%xmm14 - pxor %xmm10,%xmm1 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - movdqu %xmm1,64(%r13) - jmp .Lcbc_dec_done -.align 16 -.Lcbc_dec_four: - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm5 - movdqu 48(%r12),%xmm14 - pxor %xmm9,%xmm3 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - jmp .Lcbc_dec_done -.align 16 -.Lcbc_dec_three: - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm14 - pxor %xmm8,%xmm5 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - jmp .Lcbc_dec_done -.align 16 -.Lcbc_dec_two: - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm14 - pxor %xmm7,%xmm0 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - jmp .Lcbc_dec_done -.align 16 -.Lcbc_dec_one: - leaq (%r12),%rdi - leaq 32(%rbp),%rsi - leaq (%r15),%rdx - call asm_AES_decrypt - pxor 32(%rbp),%xmm14 - movdqu %xmm14,(%r13) - movdqa %xmm15,%xmm14 - -.Lcbc_dec_done: - movdqu %xmm14,(%rbx) - leaq (%rsp),%rax - pxor %xmm0,%xmm0 -.Lcbc_dec_bzero: - movdqa %xmm0,0(%rax) - movdqa %xmm0,16(%rax) - leaq 32(%rax),%rax - cmpq %rax,%rbp - ja .Lcbc_dec_bzero - - leaq 120(%rbp),%rax - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbx - movq -8(%rax),%rbp - leaq (%rax),%rsp -.Lcbc_dec_epilogue: - .byte 0xf3,0xc3 -.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt - -.globl bsaes_ctr32_encrypt_blocks -.hidden bsaes_ctr32_encrypt_blocks -.type bsaes_ctr32_encrypt_blocks,@function -.align 16 -bsaes_ctr32_encrypt_blocks: - movq %rsp,%rax -.Lctr_enc_prologue: - pushq %rbp - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - leaq -72(%rsp),%rsp - movq %rsp,%rbp - movdqu (%r8),%xmm0 - movl 240(%rcx),%eax - movq %rdi,%r12 - movq %rsi,%r13 - movq %rdx,%r14 - movq %rcx,%r15 - movdqa %xmm0,32(%rbp) - cmpq $8,%rdx - jb .Lctr_enc_short - - movl %eax,%ebx - shlq $7,%rax - subq $96,%rax - subq %rax,%rsp - - movq %rsp,%rax - movq %r15,%rcx - movl %ebx,%r10d - call _bsaes_key_convert - pxor %xmm6,%xmm7 - movdqa %xmm7,(%rax) - - movdqa (%rsp),%xmm8 - leaq .LADD1(%rip),%r11 - movdqa 32(%rbp),%xmm15 - movdqa -32(%r11),%xmm7 
-.byte 102,68,15,56,0,199 -.byte 102,68,15,56,0,255 - movdqa %xmm8,(%rsp) - jmp .Lctr_enc_loop -.align 16 -.Lctr_enc_loop: - movdqa %xmm15,32(%rbp) - movdqa %xmm15,%xmm0 - movdqa %xmm15,%xmm1 - paddd 0(%r11),%xmm0 - movdqa %xmm15,%xmm2 - paddd 16(%r11),%xmm1 - movdqa %xmm15,%xmm3 - paddd 32(%r11),%xmm2 - movdqa %xmm15,%xmm4 - paddd 48(%r11),%xmm3 - movdqa %xmm15,%xmm5 - paddd 64(%r11),%xmm4 - movdqa %xmm15,%xmm6 - paddd 80(%r11),%xmm5 - paddd 96(%r11),%xmm6 - - - - movdqa (%rsp),%xmm8 - leaq 16(%rsp),%rax - movdqa -16(%r11),%xmm7 - pxor %xmm8,%xmm15 - pxor %xmm8,%xmm0 - pxor %xmm8,%xmm1 - pxor %xmm8,%xmm2 -.byte 102,68,15,56,0,255 -.byte 102,15,56,0,199 - pxor %xmm8,%xmm3 - pxor %xmm8,%xmm4 -.byte 102,15,56,0,207 -.byte 102,15,56,0,215 - pxor %xmm8,%xmm5 - pxor %xmm8,%xmm6 -.byte 102,15,56,0,223 -.byte 102,15,56,0,231 -.byte 102,15,56,0,239 -.byte 102,15,56,0,247 - leaq .LBS0(%rip),%r11 - movl %ebx,%r10d - - call _bsaes_encrypt8_bitslice - - subq $8,%r14 - jc .Lctr_enc_loop_done - - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - movdqu 32(%r12),%xmm9 - movdqu 48(%r12),%xmm10 - movdqu 64(%r12),%xmm11 - movdqu 80(%r12),%xmm12 - movdqu 96(%r12),%xmm13 - movdqu 112(%r12),%xmm14 - leaq 128(%r12),%r12 - pxor %xmm15,%xmm7 - movdqa 32(%rbp),%xmm15 - pxor %xmm8,%xmm0 - movdqu %xmm7,0(%r13) - pxor %xmm9,%xmm3 - movdqu %xmm0,16(%r13) - pxor %xmm10,%xmm5 - movdqu %xmm3,32(%r13) - pxor %xmm11,%xmm2 - movdqu %xmm5,48(%r13) - pxor %xmm12,%xmm6 - movdqu %xmm2,64(%r13) - pxor %xmm13,%xmm1 - movdqu %xmm6,80(%r13) - pxor %xmm14,%xmm4 - movdqu %xmm1,96(%r13) - leaq .LADD1(%rip),%r11 - movdqu %xmm4,112(%r13) - leaq 128(%r13),%r13 - paddd 112(%r11),%xmm15 - jnz .Lctr_enc_loop - - jmp .Lctr_enc_done -.align 16 -.Lctr_enc_loop_done: - addq $8,%r14 - movdqu 0(%r12),%xmm7 - pxor %xmm7,%xmm15 - movdqu %xmm15,0(%r13) - cmpq $2,%r14 - jb .Lctr_enc_done - movdqu 16(%r12),%xmm8 - pxor %xmm8,%xmm0 - movdqu %xmm0,16(%r13) - je .Lctr_enc_done - movdqu 32(%r12),%xmm9 - pxor %xmm9,%xmm3 - movdqu %xmm3,32(%r13) - cmpq $4,%r14 - jb .Lctr_enc_done - movdqu 48(%r12),%xmm10 - pxor %xmm10,%xmm5 - movdqu %xmm5,48(%r13) - je .Lctr_enc_done - movdqu 64(%r12),%xmm11 - pxor %xmm11,%xmm2 - movdqu %xmm2,64(%r13) - cmpq $6,%r14 - jb .Lctr_enc_done - movdqu 80(%r12),%xmm12 - pxor %xmm12,%xmm6 - movdqu %xmm6,80(%r13) - je .Lctr_enc_done - movdqu 96(%r12),%xmm13 - pxor %xmm13,%xmm1 - movdqu %xmm1,96(%r13) - jmp .Lctr_enc_done - -.align 16 -.Lctr_enc_short: - leaq 32(%rbp),%rdi - leaq 48(%rbp),%rsi - leaq (%r15),%rdx - call asm_AES_encrypt - movdqu (%r12),%xmm0 - leaq 16(%r12),%r12 - movl 44(%rbp),%eax - bswapl %eax - pxor 48(%rbp),%xmm0 - incl %eax - movdqu %xmm0,(%r13) - bswapl %eax - leaq 16(%r13),%r13 - movl %eax,44(%rsp) - decq %r14 - jnz .Lctr_enc_short - -.Lctr_enc_done: - leaq (%rsp),%rax - pxor %xmm0,%xmm0 -.Lctr_enc_bzero: - movdqa %xmm0,0(%rax) - movdqa %xmm0,16(%rax) - leaq 32(%rax),%rax - cmpq %rax,%rbp - ja .Lctr_enc_bzero - - leaq 120(%rbp),%rax - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbx - movq -8(%rax),%rbp - leaq (%rax),%rsp -.Lctr_enc_epilogue: - .byte 0xf3,0xc3 -.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks -.globl bsaes_xts_encrypt -.hidden bsaes_xts_encrypt -.type bsaes_xts_encrypt,@function -.align 16 -bsaes_xts_encrypt: - movq %rsp,%rax -.Lxts_enc_prologue: - pushq %rbp - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - leaq -72(%rsp),%rsp - movq %rsp,%rbp - movq %rdi,%r12 - movq %rsi,%r13 - movq %rdx,%r14 - movq %rcx,%r15 
- - leaq (%r9),%rdi - leaq 32(%rbp),%rsi - leaq (%r8),%rdx - call asm_AES_encrypt - - movl 240(%r15),%eax - movq %r14,%rbx - - movl %eax,%edx - shlq $7,%rax - subq $96,%rax - subq %rax,%rsp - - movq %rsp,%rax - movq %r15,%rcx - movl %edx,%r10d - call _bsaes_key_convert - pxor %xmm6,%xmm7 - movdqa %xmm7,(%rax) - - andq $-16,%r14 - subq $0x80,%rsp - movdqa 32(%rbp),%xmm6 - - pxor %xmm14,%xmm14 - movdqa .Lxts_magic(%rip),%xmm12 - pcmpgtd %xmm6,%xmm14 - - subq $0x80,%r14 - jc .Lxts_enc_short - jmp .Lxts_enc_loop - -.align 16 -.Lxts_enc_loop: - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm15 - movdqa %xmm6,0(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm0 - movdqa %xmm6,16(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 0(%r12),%xmm7 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm1 - movdqa %xmm6,32(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm15 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm2 - movdqa %xmm6,48(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm0 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm3 - movdqa %xmm6,64(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm4 - movdqa %xmm6,80(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 64(%r12),%xmm11 - pxor %xmm10,%xmm2 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm5 - movdqa %xmm6,96(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 80(%r12),%xmm12 - pxor %xmm11,%xmm3 - movdqu 96(%r12),%xmm13 - pxor %xmm12,%xmm4 - movdqu 112(%r12),%xmm14 - leaq 128(%r12),%r12 - movdqa %xmm6,112(%rsp) - pxor %xmm13,%xmm5 - leaq 128(%rsp),%rax - pxor %xmm14,%xmm6 - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm5 - movdqu %xmm3,32(%r13) - pxor 64(%rsp),%xmm2 - movdqu %xmm5,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm2,64(%r13) - pxor 96(%rsp),%xmm1 - movdqu %xmm6,80(%r13) - pxor 112(%rsp),%xmm4 - movdqu %xmm1,96(%r13) - movdqu %xmm4,112(%r13) - leaq 128(%r13),%r13 - - movdqa 112(%rsp),%xmm6 - pxor %xmm14,%xmm14 - movdqa .Lxts_magic(%rip),%xmm12 - pcmpgtd %xmm6,%xmm14 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - - subq $0x80,%r14 - jnc .Lxts_enc_loop - -.Lxts_enc_short: - addq $0x80,%r14 - jz .Lxts_enc_done - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm15 - movdqa %xmm6,0(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm0 - movdqa %xmm6,16(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 0(%r12),%xmm7 - cmpq $16,%r14 - je .Lxts_enc_1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm1 - movdqa %xmm6,32(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - 
pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 16(%r12),%xmm8 - cmpq $32,%r14 - je .Lxts_enc_2 - pxor %xmm7,%xmm15 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm2 - movdqa %xmm6,48(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 32(%r12),%xmm9 - cmpq $48,%r14 - je .Lxts_enc_3 - pxor %xmm8,%xmm0 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm3 - movdqa %xmm6,64(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 48(%r12),%xmm10 - cmpq $64,%r14 - je .Lxts_enc_4 - pxor %xmm9,%xmm1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm4 - movdqa %xmm6,80(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 64(%r12),%xmm11 - cmpq $80,%r14 - je .Lxts_enc_5 - pxor %xmm10,%xmm2 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm5 - movdqa %xmm6,96(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 80(%r12),%xmm12 - cmpq $96,%r14 - je .Lxts_enc_6 - pxor %xmm11,%xmm3 - movdqu 96(%r12),%xmm13 - pxor %xmm12,%xmm4 - movdqa %xmm6,112(%rsp) - leaq 112(%r12),%r12 - pxor %xmm13,%xmm5 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm5 - movdqu %xmm3,32(%r13) - pxor 64(%rsp),%xmm2 - movdqu %xmm5,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm2,64(%r13) - pxor 96(%rsp),%xmm1 - movdqu %xmm6,80(%r13) - movdqu %xmm1,96(%r13) - leaq 112(%r13),%r13 - - movdqa 112(%rsp),%xmm6 - jmp .Lxts_enc_done -.align 16 -.Lxts_enc_6: - pxor %xmm11,%xmm3 - leaq 96(%r12),%r12 - pxor %xmm12,%xmm4 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm5 - movdqu %xmm3,32(%r13) - pxor 64(%rsp),%xmm2 - movdqu %xmm5,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm2,64(%r13) - movdqu %xmm6,80(%r13) - leaq 96(%r13),%r13 - - movdqa 96(%rsp),%xmm6 - jmp .Lxts_enc_done -.align 16 -.Lxts_enc_5: - pxor %xmm10,%xmm2 - leaq 80(%r12),%r12 - pxor %xmm11,%xmm3 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm5 - movdqu %xmm3,32(%r13) - pxor 64(%rsp),%xmm2 - movdqu %xmm5,48(%r13) - movdqu %xmm2,64(%r13) - leaq 80(%r13),%r13 - - movdqa 80(%rsp),%xmm6 - jmp .Lxts_enc_done -.align 16 -.Lxts_enc_4: - pxor %xmm9,%xmm1 - leaq 64(%r12),%r12 - pxor %xmm10,%xmm2 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm5 - movdqu %xmm3,32(%r13) - movdqu %xmm5,48(%r13) - leaq 64(%r13),%r13 - - movdqa 64(%rsp),%xmm6 - jmp .Lxts_enc_done -.align 16 -.Lxts_enc_3: - pxor %xmm8,%xmm0 - leaq 48(%r12),%r12 - pxor %xmm9,%xmm1 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - movdqu %xmm3,32(%r13) - leaq 48(%r13),%r13 - - movdqa 48(%rsp),%xmm6 - jmp .Lxts_enc_done -.align 16 -.Lxts_enc_2: - pxor %xmm7,%xmm15 - leaq 32(%r12),%r12 - pxor %xmm8,%xmm0 - leaq 128(%rsp),%rax - movl %edx,%r10d - - 
call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - leaq 32(%r13),%r13 - - movdqa 32(%rsp),%xmm6 - jmp .Lxts_enc_done -.align 16 -.Lxts_enc_1: - pxor %xmm15,%xmm7 - leaq 16(%r12),%r12 - movdqa %xmm7,32(%rbp) - leaq 32(%rbp),%rdi - leaq 32(%rbp),%rsi - leaq (%r15),%rdx - call asm_AES_encrypt - pxor 32(%rbp),%xmm15 - - - - - - movdqu %xmm15,0(%r13) - leaq 16(%r13),%r13 - - movdqa 16(%rsp),%xmm6 - -.Lxts_enc_done: - andl $15,%ebx - jz .Lxts_enc_ret - movq %r13,%rdx - -.Lxts_enc_steal: - movzbl (%r12),%eax - movzbl -16(%rdx),%ecx - leaq 1(%r12),%r12 - movb %al,-16(%rdx) - movb %cl,0(%rdx) - leaq 1(%rdx),%rdx - subl $1,%ebx - jnz .Lxts_enc_steal - - movdqu -16(%r13),%xmm15 - leaq 32(%rbp),%rdi - pxor %xmm6,%xmm15 - leaq 32(%rbp),%rsi - movdqa %xmm15,32(%rbp) - leaq (%r15),%rdx - call asm_AES_encrypt - pxor 32(%rbp),%xmm6 - movdqu %xmm6,-16(%r13) - -.Lxts_enc_ret: - leaq (%rsp),%rax - pxor %xmm0,%xmm0 -.Lxts_enc_bzero: - movdqa %xmm0,0(%rax) - movdqa %xmm0,16(%rax) - leaq 32(%rax),%rax - cmpq %rax,%rbp - ja .Lxts_enc_bzero - - leaq 120(%rbp),%rax - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbx - movq -8(%rax),%rbp - leaq (%rax),%rsp -.Lxts_enc_epilogue: - .byte 0xf3,0xc3 -.size bsaes_xts_encrypt,.-bsaes_xts_encrypt - -.globl bsaes_xts_decrypt -.hidden bsaes_xts_decrypt -.type bsaes_xts_decrypt,@function -.align 16 -bsaes_xts_decrypt: - movq %rsp,%rax -.Lxts_dec_prologue: - pushq %rbp - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - leaq -72(%rsp),%rsp - movq %rsp,%rbp - movq %rdi,%r12 - movq %rsi,%r13 - movq %rdx,%r14 - movq %rcx,%r15 - - leaq (%r9),%rdi - leaq 32(%rbp),%rsi - leaq (%r8),%rdx - call asm_AES_encrypt - - movl 240(%r15),%eax - movq %r14,%rbx - - movl %eax,%edx - shlq $7,%rax - subq $96,%rax - subq %rax,%rsp - - movq %rsp,%rax - movq %r15,%rcx - movl %edx,%r10d - call _bsaes_key_convert - pxor (%rsp),%xmm7 - movdqa %xmm6,(%rax) - movdqa %xmm7,(%rsp) - - xorl %eax,%eax - andq $-16,%r14 - testl $15,%ebx - setnz %al - shlq $4,%rax - subq %rax,%r14 - - subq $0x80,%rsp - movdqa 32(%rbp),%xmm6 - - pxor %xmm14,%xmm14 - movdqa .Lxts_magic(%rip),%xmm12 - pcmpgtd %xmm6,%xmm14 - - subq $0x80,%r14 - jc .Lxts_dec_short - jmp .Lxts_dec_loop - -.align 16 -.Lxts_dec_loop: - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm15 - movdqa %xmm6,0(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm0 - movdqa %xmm6,16(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 0(%r12),%xmm7 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm1 - movdqa %xmm6,32(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm15 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm2 - movdqa %xmm6,48(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm0 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm3 - movdqa %xmm6,64(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm4 - movdqa %xmm6,80(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 
- pxor %xmm13,%xmm6 - movdqu 64(%r12),%xmm11 - pxor %xmm10,%xmm2 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm5 - movdqa %xmm6,96(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 80(%r12),%xmm12 - pxor %xmm11,%xmm3 - movdqu 96(%r12),%xmm13 - pxor %xmm12,%xmm4 - movdqu 112(%r12),%xmm14 - leaq 128(%r12),%r12 - movdqa %xmm6,112(%rsp) - pxor %xmm13,%xmm5 - leaq 128(%rsp),%rax - pxor %xmm14,%xmm6 - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm3 - movdqu %xmm5,32(%r13) - pxor 64(%rsp),%xmm1 - movdqu %xmm3,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm1,64(%r13) - pxor 96(%rsp),%xmm2 - movdqu %xmm6,80(%r13) - pxor 112(%rsp),%xmm4 - movdqu %xmm2,96(%r13) - movdqu %xmm4,112(%r13) - leaq 128(%r13),%r13 - - movdqa 112(%rsp),%xmm6 - pxor %xmm14,%xmm14 - movdqa .Lxts_magic(%rip),%xmm12 - pcmpgtd %xmm6,%xmm14 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - - subq $0x80,%r14 - jnc .Lxts_dec_loop - -.Lxts_dec_short: - addq $0x80,%r14 - jz .Lxts_dec_done - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm15 - movdqa %xmm6,0(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm0 - movdqa %xmm6,16(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 0(%r12),%xmm7 - cmpq $16,%r14 - je .Lxts_dec_1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm1 - movdqa %xmm6,32(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 16(%r12),%xmm8 - cmpq $32,%r14 - je .Lxts_dec_2 - pxor %xmm7,%xmm15 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm2 - movdqa %xmm6,48(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 32(%r12),%xmm9 - cmpq $48,%r14 - je .Lxts_dec_3 - pxor %xmm8,%xmm0 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm3 - movdqa %xmm6,64(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 48(%r12),%xmm10 - cmpq $64,%r14 - je .Lxts_dec_4 - pxor %xmm9,%xmm1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm4 - movdqa %xmm6,80(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 64(%r12),%xmm11 - cmpq $80,%r14 - je .Lxts_dec_5 - pxor %xmm10,%xmm2 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm5 - movdqa %xmm6,96(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 80(%r12),%xmm12 - cmpq $96,%r14 - je .Lxts_dec_6 - pxor %xmm11,%xmm3 - movdqu 96(%r12),%xmm13 - pxor %xmm12,%xmm4 - movdqa %xmm6,112(%rsp) - leaq 112(%r12),%r12 - pxor %xmm13,%xmm5 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm3 - movdqu %xmm5,32(%r13) - pxor 64(%rsp),%xmm1 - movdqu %xmm3,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm1,64(%r13) - pxor 96(%rsp),%xmm2 - movdqu %xmm6,80(%r13) - movdqu %xmm2,96(%r13) - leaq 112(%r13),%r13 - - movdqa 112(%rsp),%xmm6 - jmp .Lxts_dec_done -.align 16 -.Lxts_dec_6: - pxor %xmm11,%xmm3 
- leaq 96(%r12),%r12 - pxor %xmm12,%xmm4 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm3 - movdqu %xmm5,32(%r13) - pxor 64(%rsp),%xmm1 - movdqu %xmm3,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm1,64(%r13) - movdqu %xmm6,80(%r13) - leaq 96(%r13),%r13 - - movdqa 96(%rsp),%xmm6 - jmp .Lxts_dec_done -.align 16 -.Lxts_dec_5: - pxor %xmm10,%xmm2 - leaq 80(%r12),%r12 - pxor %xmm11,%xmm3 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm3 - movdqu %xmm5,32(%r13) - pxor 64(%rsp),%xmm1 - movdqu %xmm3,48(%r13) - movdqu %xmm1,64(%r13) - leaq 80(%r13),%r13 - - movdqa 80(%rsp),%xmm6 - jmp .Lxts_dec_done -.align 16 -.Lxts_dec_4: - pxor %xmm9,%xmm1 - leaq 64(%r12),%r12 - pxor %xmm10,%xmm2 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm3 - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - leaq 64(%r13),%r13 - - movdqa 64(%rsp),%xmm6 - jmp .Lxts_dec_done -.align 16 -.Lxts_dec_3: - pxor %xmm8,%xmm0 - leaq 48(%r12),%r12 - pxor %xmm9,%xmm1 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - leaq 48(%r13),%r13 - - movdqa 48(%rsp),%xmm6 - jmp .Lxts_dec_done -.align 16 -.Lxts_dec_2: - pxor %xmm7,%xmm15 - leaq 32(%r12),%r12 - pxor %xmm8,%xmm0 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - leaq 32(%r13),%r13 - - movdqa 32(%rsp),%xmm6 - jmp .Lxts_dec_done -.align 16 -.Lxts_dec_1: - pxor %xmm15,%xmm7 - leaq 16(%r12),%r12 - movdqa %xmm7,32(%rbp) - leaq 32(%rbp),%rdi - leaq 32(%rbp),%rsi - leaq (%r15),%rdx - call asm_AES_decrypt - pxor 32(%rbp),%xmm15 - - - - - - movdqu %xmm15,0(%r13) - leaq 16(%r13),%r13 - - movdqa 16(%rsp),%xmm6 - -.Lxts_dec_done: - andl $15,%ebx - jz .Lxts_dec_ret - - pxor %xmm14,%xmm14 - movdqa .Lxts_magic(%rip),%xmm12 - pcmpgtd %xmm6,%xmm14 - pshufd $0x13,%xmm14,%xmm13 - movdqa %xmm6,%xmm5 - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - movdqu (%r12),%xmm15 - pxor %xmm13,%xmm6 - - leaq 32(%rbp),%rdi - pxor %xmm6,%xmm15 - leaq 32(%rbp),%rsi - movdqa %xmm15,32(%rbp) - leaq (%r15),%rdx - call asm_AES_decrypt - pxor 32(%rbp),%xmm6 - movq %r13,%rdx - movdqu %xmm6,(%r13) - -.Lxts_dec_steal: - movzbl 16(%r12),%eax - movzbl (%rdx),%ecx - leaq 1(%r12),%r12 - movb %al,(%rdx) - movb %cl,16(%rdx) - leaq 1(%rdx),%rdx - subl $1,%ebx - jnz .Lxts_dec_steal - - movdqu (%r13),%xmm15 - leaq 32(%rbp),%rdi - pxor %xmm5,%xmm15 - leaq 32(%rbp),%rsi - movdqa %xmm15,32(%rbp) - leaq (%r15),%rdx - call asm_AES_decrypt - pxor 32(%rbp),%xmm5 - movdqu %xmm5,(%r13) - -.Lxts_dec_ret: - leaq (%rsp),%rax - pxor %xmm0,%xmm0 -.Lxts_dec_bzero: - movdqa %xmm0,0(%rax) - movdqa %xmm0,16(%rax) - leaq 32(%rax),%rax - cmpq %rax,%rbp - ja .Lxts_dec_bzero - - leaq 120(%rbp),%rax - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbx - movq -8(%rax),%rbp - leaq (%rax),%rsp -.Lxts_dec_epilogue: - .byte 0xf3,0xc3 -.size bsaes_xts_decrypt,.-bsaes_xts_decrypt -.type 
_bsaes_const,@object -.align 64 -_bsaes_const: -.LM0ISR: -.quad 0x0a0e0206070b0f03, 0x0004080c0d010509 -.LISRM0: -.quad 0x01040b0e0205080f, 0x0306090c00070a0d -.LISR: -.quad 0x0504070602010003, 0x0f0e0d0c080b0a09 -.LBS0: -.quad 0x5555555555555555, 0x5555555555555555 -.LBS1: -.quad 0x3333333333333333, 0x3333333333333333 -.LBS2: -.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f -.LSR: -.quad 0x0504070600030201, 0x0f0e0d0c0a09080b -.LSRM0: -.quad 0x0304090e00050a0f, 0x01060b0c0207080d -.LM0SR: -.quad 0x0a0e02060f03070b, 0x0004080c05090d01 -.LSWPUP: -.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 -.LSWPUPM0SR: -.quad 0x0a0d02060c03070b, 0x0004080f05090e01 -.LADD1: -.quad 0x0000000000000000, 0x0000000100000000 -.LADD2: -.quad 0x0000000000000000, 0x0000000200000000 -.LADD3: -.quad 0x0000000000000000, 0x0000000300000000 -.LADD4: -.quad 0x0000000000000000, 0x0000000400000000 -.LADD5: -.quad 0x0000000000000000, 0x0000000500000000 -.LADD6: -.quad 0x0000000000000000, 0x0000000600000000 -.LADD7: -.quad 0x0000000000000000, 0x0000000700000000 -.LADD8: -.quad 0x0000000000000000, 0x0000000800000000 -.Lxts_magic: -.long 0x87,0,1,0 -.Lmasks: -.quad 0x0101010101010101, 0x0101010101010101 -.quad 0x0202020202020202, 0x0202020202020202 -.quad 0x0404040404040404, 0x0404040404040404 -.quad 0x0808080808080808, 0x0808080808080808 -.LM0: -.quad 0x02060a0e03070b0f, 0x0004080c0105090d -.L63: -.quad 0x6363636363636363, 0x6363636363636363 -.byte 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,69,109,105,108,105,97,32,75,195,164,115,112,101,114,44,32,80,101,116,101,114,32,83,99,104,119,97,98,101,44,32,65,110,100,121,32,80,111,108,121,97,107,111,118,0 -.align 64 -.size _bsaes_const,.-_bsaes_const -#endif diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S new file mode 100644 index 0000000000..a44790b169 --- /dev/null +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S @@ -0,0 +1,427 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text + + + + + +.type gcm_gmult_ssse3, @function +.globl gcm_gmult_ssse3 +.hidden gcm_gmult_ssse3 +.align 16 +gcm_gmult_ssse3: +.cfi_startproc +.Lgmult_seh_begin: + movdqu (%rdi),%xmm0 + movdqa .Lreverse_bytes(%rip),%xmm10 + movdqa .Llow4_mask(%rip),%xmm2 + + +.byte 102,65,15,56,0,194 + + + movdqa %xmm2,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm0 + + + + + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + movq $5,%rax +.Loop_row_1: + movdqa (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz .Loop_row_1 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movq $5,%rax +.Loop_row_2: + movdqa (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz .Loop_row_2 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movq $6,%rax +.Loop_row_3: + movdqa (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz .Loop_row_3 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + +.byte 102,65,15,56,0,210 + movdqu %xmm2,(%rdi) + + + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + .byte 0xf3,0xc3 +.Lgmult_seh_end: +.cfi_endproc +.size gcm_gmult_ssse3,.-gcm_gmult_ssse3 + + + + + +.type gcm_ghash_ssse3, @function +.globl gcm_ghash_ssse3 +.hidden gcm_ghash_ssse3 +.align 16 +gcm_ghash_ssse3: +.Lghash_seh_begin: +.cfi_startproc + movdqu (%rdi),%xmm0 + movdqa .Lreverse_bytes(%rip),%xmm10 + movdqa .Llow4_mask(%rip),%xmm11 + + + andq $-16,%rcx + + + +.byte 102,65,15,56,0,194 + + + pxor %xmm3,%xmm3 +.Loop_ghash: + + movdqu (%rdx),%xmm1 +.byte 102,65,15,56,0,202 + pxor %xmm1,%xmm0 + + + movdqa %xmm11,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm11,%xmm0 + + + + + pxor %xmm2,%xmm2 + + movq $5,%rax +.Loop_row_4: + movdqa (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 
+.byte 102,15,56,0,233 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz .Loop_row_4 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movq $5,%rax +.Loop_row_5: + movdqa (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz .Loop_row_5 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movq $6,%rax +.Loop_row_6: + movdqa (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz .Loop_row_6 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movdqa %xmm2,%xmm0 + + + leaq -256(%rsi),%rsi + + + leaq 16(%rdx),%rdx + subq $16,%rcx + jnz .Loop_ghash + + +.byte 102,65,15,56,0,194 + movdqu %xmm0,(%rdi) + + + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + .byte 0xf3,0xc3 +.Lghash_seh_end: +.cfi_endproc +.size gcm_ghash_ssse3,.-gcm_ghash_ssse3 + +.align 16 + + +.Lreverse_bytes: +.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +.Llow4_mask: +.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f +#endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/ghash-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/ghash-x86_64.S index 64ef2c2db2..674e2dabed 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/ghash-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/ghash-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .extern OPENSSL_ia32cap_P .hidden OPENSSL_ia32cap_P @@ -8,13 +20,27 @@ .type gcm_gmult_4bit,@function .align 16 gcm_gmult_4bit: +.cfi_startproc pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 subq $280,%rsp +.cfi_adjust_cfa_offset 280 .Lgmult_prologue: movzbq 15(%rdi),%r8 @@ -92,23 +118,41 @@ gcm_gmult_4bit: movq %r9,(%rdi) leaq 280+48(%rsp),%rsi +.cfi_def_cfa %rsi,8 movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lgmult_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size gcm_gmult_4bit,.-gcm_gmult_4bit .globl gcm_ghash_4bit .hidden gcm_ghash_4bit .type gcm_ghash_4bit,@function .align 16 gcm_ghash_4bit: +.cfi_startproc pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 subq $280,%rsp +.cfi_adjust_cfa_offset 280 .Lghash_prologue: movq %rdx,%r14 movq %rcx,%r15 @@ -654,21 +698,31 @@ gcm_ghash_4bit: movq %r9,(%rdi) leaq 280+48(%rsp),%rsi +.cfi_def_cfa %rsi,8 movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq 0(%rsi),%rsp +.cfi_def_cfa_register %rsp .Lghash_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size gcm_ghash_4bit,.-gcm_ghash_4bit .globl gcm_init_clmul .hidden gcm_init_clmul .type gcm_init_clmul,@function .align 16 gcm_init_clmul: +.cfi_startproc .L_init_clmul: movdqu (%rsi),%xmm2 pshufd $78,%xmm2,%xmm2 @@ -820,12 +874,14 @@ gcm_init_clmul: .byte 102,15,58,15,227,8 movdqu %xmm4,80(%rdi) .byte 0xf3,0xc3 +.cfi_endproc .size gcm_init_clmul,.-gcm_init_clmul .globl gcm_gmult_clmul .hidden gcm_gmult_clmul .type gcm_gmult_clmul,@function .align 16 gcm_gmult_clmul: +.cfi_startproc .L_gmult_clmul: movdqu (%rdi),%xmm0 movdqa .Lbswap_mask(%rip),%xmm5 @@ -872,12 +928,14 @@ gcm_gmult_clmul: .byte 102,15,56,0,197 movdqu %xmm0,(%rdi) .byte 0xf3,0xc3 +.cfi_endproc .size gcm_gmult_clmul,.-gcm_gmult_clmul .globl gcm_ghash_clmul .hidden gcm_ghash_clmul .type gcm_ghash_clmul,@function .align 32 gcm_ghash_clmul: +.cfi_startproc .L_ghash_clmul: movdqa .Lbswap_mask(%rip),%xmm10 @@ -1257,12 +1315,14 @@ gcm_ghash_clmul: .byte 102,65,15,56,0,194 movdqu %xmm0,(%rdi) .byte 0xf3,0xc3 +.cfi_endproc .size gcm_ghash_clmul,.-gcm_ghash_clmul .globl gcm_init_avx .hidden gcm_init_avx .type gcm_init_avx,@function .align 32 gcm_init_avx: +.cfi_startproc vzeroupper vmovdqu (%rsi),%xmm2 @@ -1365,19 +1425,23 @@ gcm_init_avx: vzeroupper .byte 0xf3,0xc3 +.cfi_endproc .size gcm_init_avx,.-gcm_init_avx .globl gcm_gmult_avx .hidden gcm_gmult_avx .type gcm_gmult_avx,@function .align 32 gcm_gmult_avx: +.cfi_startproc jmp .L_gmult_clmul +.cfi_endproc .size gcm_gmult_avx,.-gcm_gmult_avx 
.globl gcm_ghash_avx .hidden gcm_ghash_avx .type gcm_ghash_avx,@function .align 32 gcm_ghash_avx: +.cfi_startproc vzeroupper vmovdqu (%rdi),%xmm10 @@ -1749,6 +1813,7 @@ gcm_ghash_avx: vmovdqu %xmm10,(%rdi) vzeroupper .byte 0xf3,0xc3 +.cfi_endproc .size gcm_ghash_avx,.-gcm_ghash_avx .align 64 .Lbswap_mask: @@ -1804,3 +1869,4 @@ gcm_ghash_avx: .byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/md5-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/md5-x86_64.S index 8af65047aa..04aaf057e6 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/md5-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/md5-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .align 16 @@ -6,11 +18,22 @@ .hidden md5_block_asm_data_order .type md5_block_asm_data_order,@function md5_block_asm_data_order: +.cfi_startproc pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset rbp,-16 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset rbx,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset r12,-32 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset r14,-40 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset r15,-48 .Lprologue: @@ -660,12 +683,20 @@ md5_block_asm_data_order: movl %edx,12(%rbp) movq (%rsp),%r15 +.cfi_restore r15 movq 8(%rsp),%r14 +.cfi_restore r14 movq 16(%rsp),%r12 +.cfi_restore r12 movq 24(%rsp),%rbx +.cfi_restore rbx movq 32(%rsp),%rbp +.cfi_restore rbp addq $40,%rsp +.cfi_adjust_cfa_offset -40 .Lepilogue: .byte 0xf3,0xc3 +.cfi_endproc .size md5_block_asm_data_order,.-md5_block_asm_data_order #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S index 6d21888f04..85f4899012 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .extern OPENSSL_ia32cap_P .hidden OPENSSL_ia32cap_P @@ -18,14 +30,26 @@ .quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe +.Lord: +.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000 +.LordK: +.quad 0xccd1c8aaee00bc4f + + .globl ecp_nistz256_neg .hidden ecp_nistz256_neg .type ecp_nistz256_neg,@function .align 32 ecp_nistz256_neg: +.cfi_startproc pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-24 +.Lneg_body: xorq %r8,%r8 xorq %r9,%r9 @@ -59,9 +83,15 @@ ecp_nistz256_neg: movq %r10,16(%rdi) movq %r11,24(%rdi) - popq %r13 - popq %r12 + movq 0(%rsp),%r13 +.cfi_restore %r13 + movq 8(%rsp),%r12 +.cfi_restore %r12 + leaq 16(%rsp),%rsp +.cfi_adjust_cfa_offset -16 +.Lneg_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ecp_nistz256_neg,.-ecp_nistz256_neg @@ -69,18 +99,1130 @@ ecp_nistz256_neg: +.globl ecp_nistz256_ord_mul_mont +.hidden ecp_nistz256_ord_mul_mont +.type ecp_nistz256_ord_mul_mont,@function +.align 32 +ecp_nistz256_ord_mul_mont: +.cfi_startproc + leaq OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx + cmpl $0x80100,%ecx + je .Lecp_nistz256_ord_mul_montx + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lord_mul_body: + + movq 0(%rdx),%rax + movq %rdx,%rbx + leaq .Lord(%rip),%r14 + movq .LordK(%rip),%r15 + + + movq %rax,%rcx + mulq 0(%rsi) + movq %rax,%r8 + movq %rcx,%rax + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r9 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + + movq %r8,%r13 + imulq %r15,%r8 + + movq %rdx,%r11 + mulq 24(%rsi) + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r12 + + + mulq 0(%r14) + movq %r8,%rbp + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rcx + + subq %r8,%r10 + sbbq $0,%r8 + + mulq 8(%r14) + addq %rcx,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %rbp,%rax + adcq %rdx,%r10 + movq %rbp,%rdx + adcq $0,%r8 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r11 + movq 8(%rbx),%rax + sbbq %rdx,%rbp + + addq %r8,%r11 + adcq %rbp,%r12 + adcq $0,%r13 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r9 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + + movq %r9,%rcx + imulq %r15,%r9 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + xorq %r8,%r8 + addq %rax,%r12 + movq %r9,%rax + adcq %rdx,%r13 + adcq $0,%r8 + + + mulq 0(%r14) + movq %r9,%rbp + addq %rax,%rcx + movq %r9,%rax + adcq %rdx,%rcx + + subq %r9,%r11 + sbbq $0,%r9 + + mulq 8(%r14) + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rbp,%rax + adcq %rdx,%r11 + movq %rbp,%rdx + adcq $0,%r9 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r12 + movq 16(%rbx),%rax + sbbq %rdx,%rbp + + addq 
%r9,%r12 + adcq %rbp,%r13 + adcq $0,%r8 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rcx,%rax + adcq $0,%rdx + + movq %r10,%rcx + imulq %r15,%r10 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r13 + adcq $0,%rdx + xorq %r9,%r9 + addq %rax,%r13 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + + mulq 0(%r14) + movq %r10,%rbp + addq %rax,%rcx + movq %r10,%rax + adcq %rdx,%rcx + + subq %r10,%r12 + sbbq $0,%r10 + + mulq 8(%r14) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq %rdx,%r12 + movq %rbp,%rdx + adcq $0,%r10 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r13 + movq 24(%rbx),%rax + sbbq %rdx,%rbp + + addq %r10,%r13 + adcq %rbp,%r8 + adcq $0,%r9 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %rcx,%rax + adcq $0,%rdx + + movq %r11,%rcx + imulq %r15,%r11 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r8 + adcq $0,%rdx + xorq %r10,%r10 + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + + mulq 0(%r14) + movq %r11,%rbp + addq %rax,%rcx + movq %r11,%rax + adcq %rdx,%rcx + + subq %r11,%r13 + sbbq $0,%r11 + + mulq 8(%r14) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq %rdx,%r13 + movq %rbp,%rdx + adcq $0,%r11 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r8 + sbbq %rdx,%rbp + + addq %r11,%r8 + adcq %rbp,%r9 + adcq $0,%r10 + + + movq %r12,%rsi + subq 0(%r14),%r12 + movq %r13,%r11 + sbbq 8(%r14),%r13 + movq %r8,%rcx + sbbq 16(%r14),%r8 + movq %r9,%rbp + sbbq 24(%r14),%r9 + sbbq $0,%r10 + + cmovcq %rsi,%r12 + cmovcq %r11,%r13 + cmovcq %rcx,%r8 + cmovcq %rbp,%r9 + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_mul_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont + + + + + + + +.globl ecp_nistz256_ord_sqr_mont +.hidden ecp_nistz256_ord_sqr_mont +.type ecp_nistz256_ord_sqr_mont,@function +.align 32 +ecp_nistz256_ord_sqr_mont: +.cfi_startproc + leaq OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx + cmpl $0x80100,%ecx + je .Lecp_nistz256_ord_sqr_montx + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lord_sqr_body: + + movq 0(%rsi),%r8 + movq 8(%rsi),%rax + movq 16(%rsi),%r14 + movq 24(%rsi),%r15 + leaq .Lord(%rip),%rsi + movq %rdx,%rbx + jmp .Loop_ord_sqr + +.align 32 +.Loop_ord_sqr: + + movq %rax,%rbp + mulq %r8 + movq %rax,%r9 +.byte 102,72,15,110,205 + movq %r14,%rax + movq %rdx,%r10 + + mulq %r8 + addq %rax,%r10 + movq %r15,%rax +.byte 
102,73,15,110,214 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r8 + addq %rax,%r11 + movq %r15,%rax +.byte 102,73,15,110,223 + adcq $0,%rdx + movq %rdx,%r12 + + + mulq %r14 + movq %rax,%r13 + movq %r14,%rax + movq %rdx,%r14 + + + mulq %rbp + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq %rbp + addq %rax,%r12 + adcq $0,%rdx + + addq %r15,%r12 + adcq %rdx,%r13 + adcq $0,%r14 + + + xorq %r15,%r15 + movq %r8,%rax + addq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + adcq %r14,%r14 + adcq $0,%r15 + + + mulq %rax + movq %rax,%r8 +.byte 102,72,15,126,200 + movq %rdx,%rbp + + mulq %rax + addq %rbp,%r9 + adcq %rax,%r10 +.byte 102,72,15,126,208 + adcq $0,%rdx + movq %rdx,%rbp + + mulq %rax + addq %rbp,%r11 + adcq %rax,%r12 +.byte 102,72,15,126,216 + adcq $0,%rdx + movq %rdx,%rbp + + movq %r8,%rcx + imulq 32(%rsi),%r8 + + mulq %rax + addq %rbp,%r13 + adcq %rax,%r14 + movq 0(%rsi),%rax + adcq %rdx,%r15 + + + mulq %r8 + movq %r8,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r8,%r10 + sbbq $0,%rbp + + mulq %r8 + addq %rcx,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %r8,%rax + adcq %rdx,%r10 + movq %r8,%rdx + adcq $0,%rbp + + movq %r9,%rcx + imulq 32(%rsi),%r9 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r11 + movq 0(%rsi),%rax + sbbq %rdx,%r8 + + addq %rbp,%r11 + adcq $0,%r8 + + + mulq %r9 + movq %r9,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r9,%r11 + sbbq $0,%rbp + + mulq %r9 + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %r9,%rax + adcq %rdx,%r11 + movq %r9,%rdx + adcq $0,%rbp + + movq %r10,%rcx + imulq 32(%rsi),%r10 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r8 + movq 0(%rsi),%rax + sbbq %rdx,%r9 + + addq %rbp,%r8 + adcq $0,%r9 + + + mulq %r10 + movq %r10,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r10,%r8 + sbbq $0,%rbp + + mulq %r10 + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r10,%rax + adcq %rdx,%r8 + movq %r10,%rdx + adcq $0,%rbp + + movq %r11,%rcx + imulq 32(%rsi),%r11 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r9 + movq 0(%rsi),%rax + sbbq %rdx,%r10 + + addq %rbp,%r9 + adcq $0,%r10 + + + mulq %r11 + movq %r11,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r11,%r9 + sbbq $0,%rbp + + mulq %r11 + addq %rcx,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + movq %r11,%rdx + adcq $0,%rbp + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r10 + sbbq %rdx,%r11 + + addq %rbp,%r10 + adcq $0,%r11 + + + xorq %rdx,%rdx + addq %r12,%r8 + adcq %r13,%r9 + movq %r8,%r12 + adcq %r14,%r10 + adcq %r15,%r11 + movq %r9,%rax + adcq $0,%rdx + + + subq 0(%rsi),%r8 + movq %r10,%r14 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r11,%r15 + sbbq 24(%rsi),%r11 + sbbq $0,%rdx + + cmovcq %r12,%r8 + cmovncq %r9,%rax + cmovncq %r10,%r14 + cmovncq %r11,%r15 + + decq %rbx + jnz .Loop_ord_sqr + + movq %r8,0(%rdi) + movq %rax,8(%rdi) + pxor %xmm1,%xmm1 + movq %r14,16(%rdi) + pxor %xmm2,%xmm2 + movq %r15,24(%rdi) + pxor %xmm3,%xmm3 + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_sqr_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont + +.type ecp_nistz256_ord_mul_montx,@function +.align 32 +ecp_nistz256_ord_mul_montx: +.cfi_startproc 
+.Lecp_nistz256_ord_mul_montx: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lord_mulx_body: + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + leaq -128(%rsi),%rsi + leaq .Lord-128(%rip),%r14 + movq .LordK(%rip),%r15 + + + mulxq %r9,%r8,%r9 + mulxq %r10,%rcx,%r10 + mulxq %r11,%rbp,%r11 + addq %rcx,%r9 + mulxq %r12,%rcx,%r12 + movq %r8,%rdx + mulxq %r15,%rdx,%rax + adcq %rbp,%r10 + adcq %rcx,%r11 + adcq $0,%r12 + + + xorq %r13,%r13 + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%r14),%rcx,%rbp + movq 8(%rbx),%rdx + adcxq %rcx,%r11 + adoxq %rbp,%r12 + adcxq %r8,%r12 + adoxq %r8,%r13 + adcq $0,%r13 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r9,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + adcxq %r8,%r13 + adoxq %r8,%r8 + adcq $0,%r8 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%r14),%rcx,%rbp + movq 16(%rbx),%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcxq %r9,%r13 + adoxq %r9,%r8 + adcq $0,%r8 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r10,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + adcxq %r9,%r8 + adoxq %r9,%r9 + adcq $0,%r9 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%r14),%rcx,%rbp + movq 24(%rbx),%rdx + adcxq %rcx,%r13 + adoxq %rbp,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcq $0,%r9 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r11,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r8 + adoxq %rbp,%r9 + + adcxq %r10,%r9 + adoxq %r10,%r10 + adcq $0,%r10 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%r14),%rcx,%rbp + leaq 128(%r14),%r14 + movq %r12,%rbx + adcxq %rcx,%r8 + adoxq %rbp,%r9 + movq %r13,%rdx + adcxq %r11,%r9 + adoxq %r11,%r10 + adcq $0,%r10 + + + + movq %r8,%rcx + subq 0(%r14),%r12 + sbbq 8(%r14),%r13 + sbbq 16(%r14),%r8 + movq %r9,%rbp + sbbq 24(%r14),%r9 + sbbq $0,%r10 + + cmovcq %rbx,%r12 + cmovcq %rdx,%r13 + cmovcq %rcx,%r8 + cmovcq %rbp,%r9 + + movq %r12,0(%rdi) 
+ movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_mulx_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx + +.type ecp_nistz256_ord_sqr_montx,@function +.align 32 +ecp_nistz256_ord_sqr_montx: +.cfi_startproc +.Lecp_nistz256_ord_sqr_montx: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lord_sqrx_body: + + movq %rdx,%rbx + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + leaq .Lord(%rip),%rsi + jmp .Loop_ord_sqrx + +.align 32 +.Loop_ord_sqrx: + mulxq %r14,%r9,%r10 + mulxq %r15,%rcx,%r11 + movq %rdx,%rax +.byte 102,73,15,110,206 + mulxq %r8,%rbp,%r12 + movq %r14,%rdx + addq %rcx,%r10 +.byte 102,73,15,110,215 + adcq %rbp,%r11 + adcq $0,%r12 + xorq %r13,%r13 + + mulxq %r15,%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq %r8,%rcx,%rbp + movq %r15,%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcq $0,%r13 + + mulxq %r8,%rcx,%r14 + movq %rax,%rdx +.byte 102,73,15,110,216 + xorq %r15,%r15 + adcxq %r9,%r9 + adoxq %rcx,%r13 + adcxq %r10,%r10 + adoxq %r15,%r14 + + + mulxq %rdx,%r8,%rbp +.byte 102,72,15,126,202 + adcxq %r11,%r11 + adoxq %rbp,%r9 + adcxq %r12,%r12 + mulxq %rdx,%rcx,%rax +.byte 102,72,15,126,210 + adcxq %r13,%r13 + adoxq %rcx,%r10 + adcxq %r14,%r14 + mulxq %rdx,%rcx,%rbp +.byte 0x67 +.byte 102,72,15,126,218 + adoxq %rax,%r11 + adcxq %r15,%r15 + adoxq %rcx,%r12 + adoxq %rbp,%r13 + mulxq %rdx,%rcx,%rax + adoxq %rcx,%r14 + adoxq %rax,%r15 + + + movq %r8,%rdx + mulxq 32(%rsi),%rdx,%rcx + + xorq %rax,%rax + mulxq 0(%rsi),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + mulxq 8(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + mulxq 16(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + mulxq 24(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r8 + adcxq %rax,%r8 + + + movq %r9,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adoxq %rcx,%r9 + adcxq %rbp,%r10 + mulxq 8(%rsi),%rcx,%rbp + adoxq %rcx,%r10 + adcxq %rbp,%r11 + mulxq 16(%rsi),%rcx,%rbp + adoxq %rcx,%r11 + adcxq %rbp,%r8 + mulxq 24(%rsi),%rcx,%rbp + adoxq %rcx,%r8 + adcxq %rbp,%r9 + adoxq %rax,%r9 + + + movq %r10,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + mulxq 8(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r8 + mulxq 16(%rsi),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + mulxq 24(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + adcxq %rax,%r10 + + + movq %r11,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adoxq %rcx,%r11 + adcxq %rbp,%r8 + mulxq 8(%rsi),%rcx,%rbp + adoxq %rcx,%r8 + adcxq %rbp,%r9 + mulxq 16(%rsi),%rcx,%rbp + adoxq %rcx,%r9 + adcxq %rbp,%r10 + mulxq 24(%rsi),%rcx,%rbp + adoxq %rcx,%r10 + adcxq %rbp,%r11 + adoxq %rax,%r11 + + + addq %r8,%r12 + adcq %r13,%r9 + movq %r12,%rdx + adcq %r14,%r10 + adcq %r15,%r11 + movq %r9,%r14 + adcq $0,%rax + + + subq 0(%rsi),%r12 + movq %r10,%r15 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq 
%r11,%r8 + sbbq 24(%rsi),%r11 + sbbq $0,%rax + + cmovncq %r12,%rdx + cmovncq %r9,%r14 + cmovncq %r10,%r15 + cmovncq %r11,%r8 + + decq %rbx + jnz .Loop_ord_sqrx + + movq %rdx,0(%rdi) + movq %r14,8(%rdi) + pxor %xmm1,%xmm1 + movq %r15,16(%rdi) + pxor %xmm2,%xmm2 + movq %r8,24(%rdi) + pxor %xmm3,%xmm3 + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_sqrx_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx + + + + + + .globl ecp_nistz256_mul_mont .hidden ecp_nistz256_mul_mont .type ecp_nistz256_mul_mont,@function .align 32 ecp_nistz256_mul_mont: +.cfi_startproc + leaq OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx .Lmul_mont: pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lmul_body: + cmpl $0x80100,%ecx + je .Lmul_montx movq %rdx,%rbx movq 0(%rdx),%rax movq 0(%rsi),%r9 @@ -89,19 +1231,43 @@ ecp_nistz256_mul_mont: movq 24(%rsi),%r12 call __ecp_nistz256_mul_montq + jmp .Lmul_mont_done + +.align 32 +.Lmul_montx: + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + leaq -128(%rsi),%rsi + + call __ecp_nistz256_mul_montx .Lmul_mont_done: - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lmul_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont .type __ecp_nistz256_mul_montq,@function .align 32 __ecp_nistz256_mul_montq: +.cfi_startproc movq %rax,%rbp @@ -313,6 +1479,7 @@ __ecp_nistz256_mul_montq: movq %r9,24(%rdi) .byte 0xf3,0xc3 +.cfi_endproc .size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq @@ -327,31 +1494,72 @@ __ecp_nistz256_mul_montq: .type ecp_nistz256_sqr_mont,@function .align 32 ecp_nistz256_sqr_mont: +.cfi_startproc + leaq OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lsqr_body: + cmpl $0x80100,%ecx + je .Lsqr_montx movq 0(%rsi),%rax movq 8(%rsi),%r14 movq 16(%rsi),%r15 movq 24(%rsi),%r8 call __ecp_nistz256_sqr_montq + jmp .Lsqr_mont_done + +.align 32 +.Lsqr_montx: + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + leaq -128(%rsi),%rsi + + call __ecp_nistz256_sqr_montx .Lsqr_mont_done: - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 
16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lsqr_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont .type __ecp_nistz256_sqr_montq,@function .align 32 __ecp_nistz256_sqr_montq: +.cfi_startproc movq %rax,%r13 mulq %r14 movq %rax,%r9 @@ -509,7 +1717,306 @@ __ecp_nistz256_sqr_montq: movq %r15,24(%rdi) .byte 0xf3,0xc3 +.cfi_endproc .size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq +.type __ecp_nistz256_mul_montx,@function +.align 32 +__ecp_nistz256_mul_montx: +.cfi_startproc + + + mulxq %r9,%r8,%r9 + mulxq %r10,%rcx,%r10 + movq $32,%r14 + xorq %r13,%r13 + mulxq %r11,%rbp,%r11 + movq .Lpoly+24(%rip),%r15 + adcq %rcx,%r9 + mulxq %r12,%rcx,%r12 + movq %r8,%rdx + adcq %rbp,%r10 + shlxq %r14,%r8,%rbp + adcq %rcx,%r11 + shrxq %r14,%r8,%rcx + adcq $0,%r12 + + + + addq %rbp,%r9 + adcq %rcx,%r10 + + mulxq %r15,%rcx,%rbp + movq 8(%rbx),%rdx + adcq %rcx,%r11 + adcq %rbp,%r12 + adcq $0,%r13 + xorq %r8,%r8 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r9,%rdx + adcxq %rcx,%r12 + shlxq %r14,%r9,%rcx + adoxq %rbp,%r13 + shrxq %r14,%r9,%rbp + + adcxq %r8,%r13 + adoxq %r8,%r8 + adcq $0,%r8 + + + + addq %rcx,%r10 + adcq %rbp,%r11 + + mulxq %r15,%rcx,%rbp + movq 16(%rbx),%rdx + adcq %rcx,%r12 + adcq %rbp,%r13 + adcq $0,%r8 + xorq %r9,%r9 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r10,%rdx + adcxq %rcx,%r13 + shlxq %r14,%r10,%rcx + adoxq %rbp,%r8 + shrxq %r14,%r10,%rbp + + adcxq %r9,%r8 + adoxq %r9,%r9 + adcq $0,%r9 + + + + addq %rcx,%r11 + adcq %rbp,%r12 + + mulxq %r15,%rcx,%rbp + movq 24(%rbx),%rdx + adcq %rcx,%r13 + adcq %rbp,%r8 + adcq $0,%r9 + xorq %r10,%r10 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r11,%rdx + adcxq %rcx,%r8 + shlxq %r14,%r11,%rcx + adoxq %rbp,%r9 + shrxq %r14,%r11,%rbp + + adcxq %r10,%r9 + adoxq %r10,%r10 + adcq $0,%r10 + + + + addq %rcx,%r12 + adcq %rbp,%r13 + + mulxq %r15,%rcx,%rbp + movq %r12,%rbx + movq .Lpoly+8(%rip),%r14 + adcq %rcx,%r8 + movq %r13,%rdx + adcq %rbp,%r9 + adcq $0,%r10 + + + + xorl %eax,%eax + movq %r8,%rcx + sbbq $-1,%r12 + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%rbp + sbbq %r15,%r9 + sbbq $0,%r10 + + cmovcq %rbx,%r12 + cmovcq %rdx,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %rbp,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx + +.type __ecp_nistz256_sqr_montx,@function +.align 32 +__ecp_nistz256_sqr_montx: +.cfi_startproc + mulxq %r14,%r9,%r10 + mulxq %r15,%rcx,%r11 + xorl %eax,%eax + adcq %rcx,%r10 + mulxq %r8,%rbp,%r12 + movq %r14,%rdx + adcq %rbp,%r11 + adcq $0,%r12 + xorq %r13,%r13 + + + mulxq %r15,%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq %r8,%rcx,%rbp + movq %r15,%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcq $0,%r13 + + 
+ mulxq %r8,%rcx,%r14 + movq 0+128(%rsi),%rdx + xorq %r15,%r15 + adcxq %r9,%r9 + adoxq %rcx,%r13 + adcxq %r10,%r10 + adoxq %r15,%r14 + + mulxq %rdx,%r8,%rbp + movq 8+128(%rsi),%rdx + adcxq %r11,%r11 + adoxq %rbp,%r9 + adcxq %r12,%r12 + mulxq %rdx,%rcx,%rax + movq 16+128(%rsi),%rdx + adcxq %r13,%r13 + adoxq %rcx,%r10 + adcxq %r14,%r14 +.byte 0x67 + mulxq %rdx,%rcx,%rbp + movq 24+128(%rsi),%rdx + adoxq %rax,%r11 + adcxq %r15,%r15 + adoxq %rcx,%r12 + movq $32,%rsi + adoxq %rbp,%r13 +.byte 0x67,0x67 + mulxq %rdx,%rcx,%rax + movq .Lpoly+24(%rip),%rdx + adoxq %rcx,%r14 + shlxq %rsi,%r8,%rcx + adoxq %rax,%r15 + shrxq %rsi,%r8,%rax + movq %rdx,%rbp + + + addq %rcx,%r9 + adcq %rax,%r10 + + mulxq %r8,%rcx,%r8 + adcq %rcx,%r11 + shlxq %rsi,%r9,%rcx + adcq $0,%r8 + shrxq %rsi,%r9,%rax + + + addq %rcx,%r10 + adcq %rax,%r11 + + mulxq %r9,%rcx,%r9 + adcq %rcx,%r8 + shlxq %rsi,%r10,%rcx + adcq $0,%r9 + shrxq %rsi,%r10,%rax + + + addq %rcx,%r11 + adcq %rax,%r8 + + mulxq %r10,%rcx,%r10 + adcq %rcx,%r9 + shlxq %rsi,%r11,%rcx + adcq $0,%r10 + shrxq %rsi,%r11,%rax + + + addq %rcx,%r8 + adcq %rax,%r9 + + mulxq %r11,%rcx,%r11 + adcq %rcx,%r10 + adcq $0,%r11 + + xorq %rdx,%rdx + addq %r8,%r12 + movq .Lpoly+8(%rip),%rsi + adcq %r9,%r13 + movq %r12,%r8 + adcq %r10,%r14 + adcq %r11,%r15 + movq %r13,%r9 + adcq $0,%rdx + + subq $-1,%r12 + movq %r14,%r10 + sbbq %rsi,%r13 + sbbq $0,%r14 + movq %r15,%r11 + sbbq %rbp,%r15 + sbbq $0,%rdx + + cmovcq %r8,%r12 + cmovcq %r9,%r13 + movq %r12,0(%rdi) + cmovcq %r10,%r14 + movq %r13,8(%rdi) + cmovcq %r11,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx .globl ecp_nistz256_select_w5 @@ -517,6 +2024,7 @@ __ecp_nistz256_sqr_montq: .type ecp_nistz256_select_w5,@function .align 32 ecp_nistz256_select_w5: +.cfi_startproc leaq OPENSSL_ia32cap_P(%rip),%rax movq 8(%rax),%rax testl $32,%eax @@ -572,6 +2080,8 @@ ecp_nistz256_select_w5: movdqu %xmm6,64(%rdi) movdqu %xmm7,80(%rdi) .byte 0xf3,0xc3 +.cfi_endproc +.LSEH_end_ecp_nistz256_select_w5: .size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5 @@ -581,6 +2091,7 @@ ecp_nistz256_select_w5: .type ecp_nistz256_select_w7,@function .align 32 ecp_nistz256_select_w7: +.cfi_startproc leaq OPENSSL_ia32cap_P(%rip),%rax movq 8(%rax),%rax testl $32,%eax @@ -625,12 +2136,15 @@ ecp_nistz256_select_w7: movdqu %xmm4,32(%rdi) movdqu %xmm5,48(%rdi) .byte 0xf3,0xc3 +.cfi_endproc +.LSEH_end_ecp_nistz256_select_w7: .size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 .type ecp_nistz256_avx2_select_w5,@function .align 32 ecp_nistz256_avx2_select_w5: +.cfi_startproc .Lavx2_select_w5: vzeroupper vmovdqa .LTwo(%rip),%ymm0 @@ -685,6 +2199,8 @@ ecp_nistz256_avx2_select_w5: vmovdqu %ymm4,64(%rdi) vzeroupper .byte 0xf3,0xc3 +.cfi_endproc +.LSEH_end_ecp_nistz256_avx2_select_w5: .size ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5 @@ -694,6 +2210,7 @@ ecp_nistz256_avx2_select_w5: .type ecp_nistz256_avx2_select_w7,@function .align 32 ecp_nistz256_avx2_select_w7: +.cfi_startproc .Lavx2_select_w7: vzeroupper vmovdqa .LThree(%rip),%ymm0 @@ -763,10 +2280,13 @@ ecp_nistz256_avx2_select_w7: vmovdqu %ymm3,32(%rdi) vzeroupper .byte 0xf3,0xc3 +.cfi_endproc +.LSEH_end_ecp_nistz256_avx2_select_w7: .size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7 .type __ecp_nistz256_add_toq,@function .align 32 __ecp_nistz256_add_toq: +.cfi_startproc xorq %r11,%r11 addq 0(%rbx),%r12 adcq 8(%rbx),%r13 @@ -794,11 +2314,13 @@ __ecp_nistz256_add_toq: movq %r9,24(%rdi) .byte 0xf3,0xc3 
+.cfi_endproc .size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq .type __ecp_nistz256_sub_fromq,@function .align 32 __ecp_nistz256_sub_fromq: +.cfi_startproc subq 0(%rbx),%r12 sbbq 8(%rbx),%r13 movq %r12,%rax @@ -825,11 +2347,13 @@ __ecp_nistz256_sub_fromq: movq %r9,24(%rdi) .byte 0xf3,0xc3 +.cfi_endproc .size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq .type __ecp_nistz256_subq,@function .align 32 __ecp_nistz256_subq: +.cfi_startproc subq %r12,%rax sbbq %r13,%rbp movq %rax,%r12 @@ -852,11 +2376,13 @@ __ecp_nistz256_subq: cmovnzq %r10,%r9 .byte 0xf3,0xc3 +.cfi_endproc .size __ecp_nistz256_subq,.-__ecp_nistz256_subq .type __ecp_nistz256_mul_by_2q,@function .align 32 __ecp_nistz256_mul_by_2q: +.cfi_startproc xorq %r11,%r11 addq %r12,%r12 adcq %r13,%r13 @@ -884,19 +2410,40 @@ __ecp_nistz256_mul_by_2q: movq %r9,24(%rdi) .byte 0xf3,0xc3 +.cfi_endproc .size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q .globl ecp_nistz256_point_double .hidden ecp_nistz256_point_double .type ecp_nistz256_point_double,@function .align 32 ecp_nistz256_point_double: +.cfi_startproc + leaq OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx + cmpl $0x80100,%ecx + je .Lpoint_doublex pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 subq $160+8,%rsp +.cfi_adjust_cfa_offset 32*5+8 +.Lpoint_doubleq_body: .Lpoint_double_shortcutq: movdqu 0(%rsi),%xmm0 @@ -1079,27 +2626,58 @@ ecp_nistz256_point_double: .byte 102,72,15,126,207 call __ecp_nistz256_sub_fromq - addq $160+8,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + leaq 160+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_doubleq_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ecp_nistz256_point_double,.-ecp_nistz256_point_double .globl ecp_nistz256_point_add .hidden ecp_nistz256_point_add .type ecp_nistz256_point_add,@function .align 32 ecp_nistz256_point_add: +.cfi_startproc + leaq OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx + cmpl $0x80100,%ecx + je .Lpoint_addx pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 subq $576+8,%rsp +.cfi_adjust_cfa_offset 32*18+8 +.Lpoint_addq_body: movdqu 0(%rsi),%xmm0 movdqu 16(%rsi),%xmm1 @@ -1245,15 +2823,22 @@ ecp_nistz256_point_add: orq %r8,%r12 orq %r9,%r12 -.byte 0x3e - jnz .Ladd_proceedq .byte 102,73,15,126,208 .byte 102,73,15,126,217 - testq %r8,%r8 + orq %r8,%r12 +.byte 0x3e jnz .Ladd_proceedq + + + testq %r9,%r9 jz .Ladd_doubleq + + + + + .byte 102,72,15,126,199 pxor %xmm0,%xmm0 movdqu %xmm0,0(%rdi) @@ -1269,7 +2854,9 @@ ecp_nistz256_point_add: .byte 102,72,15,126,206 .byte 102,72,15,126,199 addq $416,%rsp +.cfi_adjust_cfa_offset -416 jmp 
.Lpoint_double_shortcutq +.cfi_adjust_cfa_offset 416 .align 32 .Ladd_proceedq: @@ -1476,27 +3063,58 @@ ecp_nistz256_point_add: movdqu %xmm3,48(%rdi) .Ladd_doneq: - addq $576+8,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + leaq 576+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_addq_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ecp_nistz256_point_add,.-ecp_nistz256_point_add .globl ecp_nistz256_point_add_affine .hidden ecp_nistz256_point_add_affine .type ecp_nistz256_point_add_affine,@function .align 32 ecp_nistz256_point_add_affine: +.cfi_startproc + leaq OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx + cmpl $0x80100,%ecx + je .Lpoint_add_affinex pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 subq $480+8,%rsp +.cfi_adjust_cfa_offset 32*15+8 +.Ladd_affineq_body: movdqu 0(%rsi),%xmm0 movq %rdx,%rbx @@ -1779,13 +3397,1147 @@ ecp_nistz256_point_add_affine: movdqu %xmm2,32(%rdi) movdqu %xmm3,48(%rdi) - addq $480+8,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + leaq 480+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Ladd_affineq_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine +.type __ecp_nistz256_add_tox,@function +.align 32 +__ecp_nistz256_add_tox: +.cfi_startproc + xorq %r11,%r11 + adcq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + movq %r12,%rax + adcq 16(%rbx),%r8 + adcq 24(%rbx),%r9 + movq %r13,%rbp + adcq $0,%r11 + + xorq %r10,%r10 + sbbq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox + +.type __ecp_nistz256_sub_fromx,@function +.align 32 +__ecp_nistz256_sub_fromx: +.cfi_startproc + xorq %r11,%r11 + sbbq 0(%rbx),%r12 + sbbq 8(%rbx),%r13 + movq %r12,%rax + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + movq %r13,%rbp + sbbq $0,%r11 + + xorq %r10,%r10 + adcq $-1,%r12 + movq %r8,%rcx + adcq %r14,%r13 + adcq $0,%r8 + movq %r9,%r10 + adcq %r15,%r9 + + btq $0,%r11 + cmovncq %rax,%r12 + cmovncq %rbp,%r13 + movq %r12,0(%rdi) + cmovncq %rcx,%r8 + movq %r13,8(%rdi) + cmovncq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx + +.type __ecp_nistz256_subx,@function +.align 32 +__ecp_nistz256_subx: +.cfi_startproc + xorq %r11,%r11 + sbbq %r12,%rax + sbbq %r13,%rbp + movq %rax,%r12 + sbbq %r8,%rcx + sbbq %r9,%r10 + movq %rbp,%r13 
+ sbbq $0,%r11 + + xorq %r9,%r9 + adcq $-1,%rax + movq %rcx,%r8 + adcq %r14,%rbp + adcq $0,%rcx + movq %r10,%r9 + adcq %r15,%r10 + + btq $0,%r11 + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + cmovcq %rcx,%r8 + cmovcq %r10,%r9 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ecp_nistz256_subx,.-__ecp_nistz256_subx + +.type __ecp_nistz256_mul_by_2x,@function +.align 32 +__ecp_nistz256_mul_by_2x: +.cfi_startproc + xorq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + xorq %r10,%r10 + sbbq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x +.type ecp_nistz256_point_doublex,@function +.align 32 +ecp_nistz256_point_doublex: +.cfi_startproc +.Lpoint_doublex: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $160+8,%rsp +.cfi_adjust_cfa_offset 32*5+8 +.Lpoint_doublex_body: + +.Lpoint_double_shortcutx: + movdqu 0(%rsi),%xmm0 + movq %rsi,%rbx + movdqu 16(%rsi),%xmm1 + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r8 + movq 32+24(%rsi),%r9 + movq .Lpoly+8(%rip),%r14 + movq .Lpoly+24(%rip),%r15 + movdqa %xmm0,96(%rsp) + movdqa %xmm1,96+16(%rsp) + leaq 32(%rdi),%r10 + leaq 64(%rdi),%r11 +.byte 102,72,15,110,199 +.byte 102,73,15,110,202 +.byte 102,73,15,110,211 + + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + leaq 64-128(%rsi),%rsi + leaq 64(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 0(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 32(%rbx),%rdx + movq 64+0(%rbx),%r9 + movq 64+8(%rbx),%r10 + movq 64+16(%rbx),%r11 + movq 64+24(%rbx),%r12 + leaq 64-128(%rbx),%rsi + leaq 32(%rbx),%rbx +.byte 102,72,15,126,215 + call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_by_2x + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_tox + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 +.byte 102,72,15,126,207 + call __ecp_nistz256_sqr_montx + xorq %r9,%r9 + movq %r12,%rax + addq $-1,%r12 + movq %r13,%r10 + adcq %rsi,%r13 + movq %r14,%rcx + adcq $0,%r14 + movq %r15,%r8 + adcq %rbp,%r15 + adcq $0,%r9 + xorq %rsi,%rsi + testq $1,%rax + + cmovzq %rax,%r12 + cmovzq %r10,%r13 + cmovzq %rcx,%r14 + cmovzq %r8,%r15 + cmovzq %rsi,%r9 + + movq %r13,%rax + shrq $1,%r12 + shlq $63,%rax + movq %r14,%r10 + shrq $1,%r13 + orq %rax,%r12 + shlq $63,%r10 + movq %r15,%rcx + shrq $1,%r14 + orq %r10,%r13 + shlq $63,%rcx + movq %r12,0(%rdi) + shrq $1,%r15 + movq %r13,8(%rdi) + shlq 
$63,%r9 + orq %rcx,%r14 + orq %r9,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + movq 64(%rsp),%rdx + leaq 64(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + leaq 32(%rsp),%rbx + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_tox + + movq 96(%rsp),%rdx + leaq 96(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + movq 0+32(%rsp),%rdx + movq 8+32(%rsp),%r14 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r15 + movq 24+32(%rsp),%r8 +.byte 102,72,15,126,199 + call __ecp_nistz256_sqr_montx + + leaq 128(%rsp),%rbx + movq %r14,%r8 + movq %r15,%r9 + movq %rsi,%r14 + movq %rbp,%r15 + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 0(%rsp),%rdi + call __ecp_nistz256_subx + + movq 32(%rsp),%rdx + leaq 32(%rsp),%rbx + movq %r12,%r14 + xorl %ecx,%ecx + movq %r12,0+0(%rsp) + movq %r13,%r10 + movq %r13,0+8(%rsp) + cmovzq %r8,%r11 + movq %r8,0+16(%rsp) + leaq 0-128(%rsp),%rsi + cmovzq %r9,%r12 + movq %r9,0+24(%rsp) + movq %r14,%r9 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + +.byte 102,72,15,126,203 +.byte 102,72,15,126,207 + call __ecp_nistz256_sub_fromx + + leaq 160+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_doublex_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ecp_nistz256_point_doublex,.-ecp_nistz256_point_doublex +.type ecp_nistz256_point_addx,@function +.align 32 +ecp_nistz256_point_addx: +.cfi_startproc +.Lpoint_addx: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $576+8,%rsp +.cfi_adjust_cfa_offset 32*18+8 +.Lpoint_addx_body: + + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq %rsi,%rbx + movq %rdx,%rsi + movdqa %xmm0,384(%rsp) + movdqa %xmm1,384+16(%rsp) + movdqa %xmm2,416(%rsp) + movdqa %xmm3,416+16(%rsp) + movdqa %xmm4,448(%rsp) + movdqa %xmm5,448+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rsi),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rsi),%xmm3 + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,480(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,480+16(%rsp) + movdqu 64(%rsi),%xmm0 + movdqu 80(%rsi),%xmm1 + movdqa %xmm2,512(%rsp) + movdqa %xmm3,512+16(%rsp) + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + + leaq 64-128(%rsi),%rsi + movq %rdx,544+0(%rsp) + movq %r14,544+8(%rsp) + movq %r15,544+16(%rsp) + movq %r8,544+24(%rsp) + leaq 96(%rsp),%rdi + call 
__ecp_nistz256_sqr_montx + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm1,%xmm4 + por %xmm1,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + movq 64+0(%rbx),%rdx + movq 64+8(%rbx),%r14 + movq 64+16(%rbx),%r15 + movq 64+24(%rbx),%r8 +.byte 102,72,15,110,203 + + leaq 64-128(%rbx),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 544(%rsp),%rdx + leaq 544(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 416(%rsp),%rdx + leaq 416(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq -128+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 512(%rsp),%rdx + leaq 512(%rsp),%rbx + movq 0+256(%rsp),%r9 + movq 8+256(%rsp),%r10 + leaq -128+256(%rsp),%rsi + movq 16+256(%rsp),%r11 + movq 24+256(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 224(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + orq %r13,%r12 + movdqa %xmm4,%xmm2 + orq %r8,%r12 + orq %r9,%r12 + por %xmm5,%xmm2 +.byte 102,73,15,110,220 + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 480(%rsp),%rdx + leaq 480(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 160(%rsp),%rbx + leaq 0(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + orq %r13,%r12 + orq %r8,%r12 + orq %r9,%r12 + +.byte 102,73,15,126,208 +.byte 102,73,15,126,217 + orq %r8,%r12 +.byte 0x3e + jnz .Ladd_proceedx + + + + testq %r9,%r9 + jz .Ladd_doublex + + + + + + +.byte 102,72,15,126,199 + pxor %xmm0,%xmm0 + movdqu %xmm0,0(%rdi) + movdqu %xmm0,16(%rdi) + movdqu %xmm0,32(%rdi) + movdqu %xmm0,48(%rdi) + movdqu %xmm0,64(%rdi) + movdqu %xmm0,80(%rdi) + jmp .Ladd_donex + +.align 32 +.Ladd_doublex: +.byte 102,72,15,126,206 +.byte 102,72,15,126,199 + addq $416,%rsp +.cfi_adjust_cfa_offset -416 + jmp .Lpoint_double_shortcutx +.cfi_adjust_cfa_offset 416 + +.align 32 +.Ladd_proceedx: + movq 0+64(%rsp),%rdx + movq 8+64(%rsp),%r14 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 544(%rsp),%rdx + leaq 544(%rsp),%rbx + movq 0+352(%rsp),%r9 + movq 8+352(%rsp),%r10 + leaq -128+352(%rsp),%rsi + movq 16+352(%rsp),%r11 + movq 24+352(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 0(%rsp),%rdx + leaq 0(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + 
movq 24+32(%rsp),%r12 + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 160(%rsp),%rdx + leaq 160(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montx + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 96(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subx + + leaq 128(%rsp),%rbx + leaq 288(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 192+0(%rsp),%rax + movq 192+8(%rsp),%rbp + movq 192+16(%rsp),%rcx + movq 192+24(%rsp),%r10 + leaq 320(%rsp),%rdi + + call __ecp_nistz256_subx + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 128(%rsp),%rdx + leaq 128(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq -128+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 320(%rsp),%rdx + leaq 320(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 320(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 256(%rsp),%rbx + leaq 320(%rsp),%rdi + call __ecp_nistz256_sub_fromx + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 352(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 352+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 544(%rsp),%xmm2 + pand 544+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 480(%rsp),%xmm2 + pand 480+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 320(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 320+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 512(%rsp),%xmm2 + pand 512+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + +.Ladd_donex: + leaq 576+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_addx_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ecp_nistz256_point_addx,.-ecp_nistz256_point_addx +.type 
ecp_nistz256_point_add_affinex,@function +.align 32 +ecp_nistz256_point_add_affinex: +.cfi_startproc +.Lpoint_add_affinex: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $480+8,%rsp +.cfi_adjust_cfa_offset 32*15+8 +.Ladd_affinex_body: + + movdqu 0(%rsi),%xmm0 + movq %rdx,%rbx + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,320(%rsp) + movdqa %xmm1,320+16(%rsp) + movdqa %xmm2,352(%rsp) + movdqa %xmm3,352+16(%rsp) + movdqa %xmm4,384(%rsp) + movdqa %xmm5,384+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rbx),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rbx),%xmm1 + movdqu 32(%rbx),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rbx),%xmm3 + movdqa %xmm0,416(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,416+16(%rsp) + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + movdqa %xmm2,448(%rsp) + movdqa %xmm3,448+16(%rsp) + por %xmm2,%xmm3 + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm1,%xmm3 + + leaq 64-128(%rsi),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm3,%xmm4 + movq 0(%rbx),%rdx + + movq %r12,%r9 + por %xmm3,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + movq %r13,%r10 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + movq %r14,%r11 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + + leaq 32-128(%rsp),%rsi + movq %r15,%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 320(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 288(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 352(%rsp),%rbx + leaq 96(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+64(%rsp),%rdx + movq 8+64(%rsp),%r14 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 128(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 0+96(%rsp),%rdx + movq 8+96(%rsp),%r14 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r15 + movq 24+96(%rsp),%r8 + leaq 192(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 128(%rsp),%rdx + leaq 128(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 320(%rsp),%rdx + leaq 320(%rsp),%rbx + movq 0+128(%rsp),%r9 + movq 8+128(%rsp),%r10 + leaq -128+128(%rsp),%rsi + movq 16+128(%rsp),%r11 + movq 24+128(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 192(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq 
%r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subx + + leaq 160(%rsp),%rbx + leaq 224(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 64(%rsp),%rdi + + call __ecp_nistz256_subx + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 352(%rsp),%rdx + leaq 352(%rsp),%rbx + movq 0+160(%rsp),%r9 + movq 8+160(%rsp),%r10 + leaq -128+160(%rsp),%rsi + movq 16+160(%rsp),%r11 + movq 24+160(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 96(%rsp),%rdx + leaq 96(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 64(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 32(%rsp),%rbx + leaq 256(%rsp),%rdi + call __ecp_nistz256_sub_fromx + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand .LONE_mont(%rip),%xmm2 + pand .LONE_mont+16(%rip),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 224(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 224+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 320(%rsp),%xmm2 + pand 320+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 256(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 256+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 352(%rsp),%xmm2 + pand 352+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + + leaq 480+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Ladd_affinex_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ecp_nistz256_point_add_affinex,.-ecp_nistz256_point_add_affinex #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S new file mode 100644 index 0000000000..d072a83479 --- /dev/null +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S @@ -0,0 +1,343 @@ +# This file is generated from a 
similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text + +.type beeu_mod_inverse_vartime,@function +.hidden beeu_mod_inverse_vartime +.globl beeu_mod_inverse_vartime +.hidden beeu_mod_inverse_vartime +.align 32 +beeu_mod_inverse_vartime: +.cfi_startproc + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset rbp,-16 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset r12,-24 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset r13,-32 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset r14,-40 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset r15,-48 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset rbx,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 +.cfi_offset rsi,-64 + + subq $80,%rsp +.cfi_adjust_cfa_offset 80 + movq %rdi,0(%rsp) + + + movq $1,%r8 + xorq %r9,%r9 + xorq %r10,%r10 + xorq %r11,%r11 + xorq %rdi,%rdi + + xorq %r12,%r12 + xorq %r13,%r13 + xorq %r14,%r14 + xorq %r15,%r15 + xorq %rbp,%rbp + + + vmovdqu 0(%rsi),%xmm0 + vmovdqu 16(%rsi),%xmm1 + vmovdqu %xmm0,48(%rsp) + vmovdqu %xmm1,64(%rsp) + + vmovdqu 0(%rdx),%xmm0 + vmovdqu 16(%rdx),%xmm1 + vmovdqu %xmm0,16(%rsp) + vmovdqu %xmm1,32(%rsp) + +.Lbeeu_loop: + xorq %rbx,%rbx + orq 48(%rsp),%rbx + orq 56(%rsp),%rbx + orq 64(%rsp),%rbx + orq 72(%rsp),%rbx + jz .Lbeeu_loop_end + + + + + + + + + + + movq $1,%rcx + + +.Lbeeu_shift_loop_XB: + movq %rcx,%rbx + andq 48(%rsp),%rbx + jnz .Lbeeu_shift_loop_end_XB + + + movq $1,%rbx + andq %r8,%rbx + jz .Lshift1_0 + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq $0,%rdi + +.Lshift1_0: + shrdq $1,%r9,%r8 + shrdq $1,%r10,%r9 + shrdq $1,%r11,%r10 + shrdq $1,%rdi,%r11 + shrq $1,%rdi + + shlq $1,%rcx + + + + + + cmpq $0x8000000,%rcx + jne .Lbeeu_shift_loop_XB + +.Lbeeu_shift_loop_end_XB: + bsfq %rcx,%rcx + testq %rcx,%rcx + jz .Lbeeu_no_shift_XB + + + + movq 8+48(%rsp),%rax + movq 16+48(%rsp),%rbx + movq 24+48(%rsp),%rsi + + shrdq %cl,%rax,0+48(%rsp) + shrdq %cl,%rbx,8+48(%rsp) + shrdq %cl,%rsi,16+48(%rsp) + + shrq %cl,%rsi + movq %rsi,24+48(%rsp) + + +.Lbeeu_no_shift_XB: + + movq $1,%rcx + + +.Lbeeu_shift_loop_YA: + movq %rcx,%rbx + andq 16(%rsp),%rbx + jnz .Lbeeu_shift_loop_end_YA + + + movq $1,%rbx + andq %r12,%rbx + jz .Lshift1_1 + addq 0(%rdx),%r12 + adcq 8(%rdx),%r13 + adcq 16(%rdx),%r14 + adcq 24(%rdx),%r15 + adcq $0,%rbp + +.Lshift1_1: + shrdq $1,%r13,%r12 + shrdq $1,%r14,%r13 + shrdq $1,%r15,%r14 + shrdq $1,%rbp,%r15 + shrq $1,%rbp + + shlq $1,%rcx + + + + + + cmpq $0x8000000,%rcx + jne .Lbeeu_shift_loop_YA + +.Lbeeu_shift_loop_end_YA: + bsfq %rcx,%rcx + testq %rcx,%rcx + jz .Lbeeu_no_shift_YA + + + + movq 8+16(%rsp),%rax + movq 16+16(%rsp),%rbx + movq 24+16(%rsp),%rsi + + shrdq %cl,%rax,0+16(%rsp) + shrdq %cl,%rbx,8+16(%rsp) + shrdq %cl,%rsi,16+16(%rsp) + + shrq %cl,%rsi + movq %rsi,24+16(%rsp) + + +.Lbeeu_no_shift_YA: + + movq 48(%rsp),%rax + movq 56(%rsp),%rbx + movq 64(%rsp),%rsi + movq 72(%rsp),%rcx + subq 16(%rsp),%rax + sbbq 24(%rsp),%rbx + sbbq 32(%rsp),%rsi + sbbq 40(%rsp),%rcx + jnc .Lbeeu_B_bigger_than_A + + + movq 16(%rsp),%rax + movq 24(%rsp),%rbx + movq 32(%rsp),%rsi + movq 40(%rsp),%rcx + subq 48(%rsp),%rax + sbbq 56(%rsp),%rbx + sbbq 64(%rsp),%rsi + sbbq 72(%rsp),%rcx + movq %rax,16(%rsp) + movq %rbx,24(%rsp) + movq %rsi,32(%rsp) + movq %rcx,40(%rsp) + + + addq %r8,%r12 + 
adcq %r9,%r13 + adcq %r10,%r14 + adcq %r11,%r15 + adcq %rdi,%rbp + jmp .Lbeeu_loop + +.Lbeeu_B_bigger_than_A: + + movq %rax,48(%rsp) + movq %rbx,56(%rsp) + movq %rsi,64(%rsp) + movq %rcx,72(%rsp) + + + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rbp,%rdi + + jmp .Lbeeu_loop + +.Lbeeu_loop_end: + + + + + movq 16(%rsp),%rbx + subq $1,%rbx + orq 24(%rsp),%rbx + orq 32(%rsp),%rbx + orq 40(%rsp),%rbx + + jnz .Lbeeu_err + + + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + xorq %rdi,%rdi + +.Lbeeu_reduction_loop: + movq %r12,16(%rsp) + movq %r13,24(%rsp) + movq %r14,32(%rsp) + movq %r15,40(%rsp) + movq %rbp,48(%rsp) + + + subq %r8,%r12 + sbbq %r9,%r13 + sbbq %r10,%r14 + sbbq %r11,%r15 + sbbq $0,%rbp + + + cmovcq 16(%rsp),%r12 + cmovcq 24(%rsp),%r13 + cmovcq 32(%rsp),%r14 + cmovcq 40(%rsp),%r15 + jnc .Lbeeu_reduction_loop + + + subq %r12,%r8 + sbbq %r13,%r9 + sbbq %r14,%r10 + sbbq %r15,%r11 + +.Lbeeu_save: + + movq 0(%rsp),%rdi + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + + movq $1,%rax + jmp .Lbeeu_finish + +.Lbeeu_err: + + xorq %rax,%rax + +.Lbeeu_finish: + addq $80,%rsp +.cfi_adjust_cfa_offset -80 + popq %rsi +.cfi_adjust_cfa_offset -8 +.cfi_restore rsi + popq %rbx +.cfi_adjust_cfa_offset -8 +.cfi_restore rbx + popq %r15 +.cfi_adjust_cfa_offset -8 +.cfi_restore r15 + popq %r14 +.cfi_adjust_cfa_offset -8 +.cfi_restore r14 + popq %r13 +.cfi_adjust_cfa_offset -8 +.cfi_restore r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore r12 + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore rbp + .byte 0xf3,0xc3 +.cfi_endproc + +.size beeu_mod_inverse_vartime, .-beeu_mod_inverse_vartime +#endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S index 7c1eeb7211..18d66f6f7f 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text @@ -9,14 +21,15 @@ .type CRYPTO_rdrand,@function .align 16 CRYPTO_rdrand: +.cfi_startproc xorq %rax,%rax - - -.byte 0x48, 0x0f, 0xc7, 0xf1 +.byte 72,15,199,242 adcq %rax,%rax - movq %rcx,0(%rdi) + movq %rdx,0(%rdi) .byte 0xf3,0xc3 +.cfi_endproc +.size CRYPTO_rdrand,.-CRYPTO_rdrand @@ -27,13 +40,12 @@ CRYPTO_rdrand: .type CRYPTO_rdrand_multiple8_buf,@function .align 16 CRYPTO_rdrand_multiple8_buf: +.cfi_startproc testq %rsi,%rsi jz .Lout movq $8,%rdx .Lloop: - - -.byte 0x48, 0x0f, 0xc7, 0xf1 +.byte 72,15,199,241 jnc .Lerr movq %rcx,0(%rdi) addq %rdx,%rdi @@ -45,4 +57,7 @@ CRYPTO_rdrand_multiple8_buf: .Lerr: xorq %rax,%rax .byte 0xf3,0xc3 +.cfi_endproc +.size CRYPTO_rdrand_multiple8_buf,.-CRYPTO_rdrand_multiple8_buf #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rsaz-avx2.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rsaz-avx2.S index bc3440d55c..faccd484b0 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rsaz-avx2.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rsaz-avx2.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl rsaz_1024_sqr_avx2 @@ -77,7 +89,7 @@ rsaz_1024_sqr_avx2: vmovdqu 256-128(%rsi),%ymm8 leaq 192(%rsp),%rbx - vpbroadcastq .Land_mask(%rip),%ymm15 + vmovdqu .Land_mask(%rip),%ymm15 jmp .LOOP_GRANDE_SQR_1024 .align 32 @@ -829,10 +841,10 @@ rsaz_1024_mul_avx2: vpmuludq 192-128(%rcx),%ymm11,%ymm12 vpaddq %ymm12,%ymm6,%ymm6 vpmuludq 224-128(%rcx),%ymm11,%ymm13 - vpblendd $3,%ymm14,%ymm9,%ymm9 + vpblendd $3,%ymm14,%ymm9,%ymm12 vpaddq %ymm13,%ymm7,%ymm7 vpmuludq 256-128(%rcx),%ymm11,%ymm0 - vpaddq %ymm9,%ymm3,%ymm3 + vpaddq %ymm12,%ymm3,%ymm3 vpaddq %ymm0,%ymm8,%ymm8 movq %rbx,%rax @@ -845,7 +857,9 @@ rsaz_1024_mul_avx2: vmovdqu -8+64-128(%rsi),%ymm13 movq %r10,%rax + vpblendd $0xfc,%ymm14,%ymm9,%ymm9 imull %r8d,%eax + vpaddq %ymm9,%ymm4,%ymm4 andl $0x1fffffff,%eax imulq 16-128(%rsi),%rbx @@ -1074,7 +1088,6 @@ rsaz_1024_mul_avx2: decl %r14d jnz .Loop_mul_1024 - vpermq $0,%ymm15,%ymm15 vpaddq (%rsp),%ymm12,%ymm0 vpsrlq $29,%ymm0,%ymm12 @@ -1215,6 +1228,7 @@ rsaz_1024_mul_avx2: .type rsaz_1024_red2norm_avx2,@function .align 32 rsaz_1024_red2norm_avx2: +.cfi_startproc subq $-128,%rsi xorq %rax,%rax movq -128(%rsi),%r8 @@ -1406,6 +1420,7 @@ rsaz_1024_red2norm_avx2: movq %rax,120(%rdi) movq %r11,%rax .byte 0xf3,0xc3 +.cfi_endproc .size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2 .globl rsaz_1024_norm2red_avx2 @@ -1413,6 +1428,7 @@ rsaz_1024_red2norm_avx2: .type rsaz_1024_norm2red_avx2,@function .align 32 rsaz_1024_norm2red_avx2: +.cfi_startproc subq $-128,%rdi movq (%rsi),%r8 movl $0x1fffffff,%eax @@ -1565,12 +1581,14 @@ rsaz_1024_norm2red_avx2: movq %r8,176(%rdi) movq %r8,184(%rdi) .byte 0xf3,0xc3 +.cfi_endproc .size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2 .globl rsaz_1024_scatter5_avx2 .hidden rsaz_1024_scatter5_avx2 .type rsaz_1024_scatter5_avx2,@function .align 
32 rsaz_1024_scatter5_avx2: +.cfi_startproc vzeroupper vmovdqu .Lscatter_permd(%rip),%ymm5 shll $4,%edx @@ -1590,6 +1608,7 @@ rsaz_1024_scatter5_avx2: vzeroupper .byte 0xf3,0xc3 +.cfi_endproc .size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2 .globl rsaz_1024_gather5_avx2 @@ -1714,23 +1733,9 @@ rsaz_1024_gather5_avx2: .cfi_endproc .LSEH_end_rsaz_1024_gather5: .size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2 -.extern OPENSSL_ia32cap_P -.hidden OPENSSL_ia32cap_P -.globl rsaz_avx2_eligible -.hidden rsaz_avx2_eligible -.type rsaz_avx2_eligible,@function -.align 32 -rsaz_avx2_eligible: - leaq OPENSSL_ia32cap_P(%rip),%rax - movl 8(%rax),%eax - andl $32,%eax - shrl $5,%eax - .byte 0xf3,0xc3 -.size rsaz_avx2_eligible,.-rsaz_avx2_eligible - .align 64 .Land_mask: -.quad 0x1fffffff,0x1fffffff,0x1fffffff,-1 +.quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff .Lscatter_permd: .long 0,2,4,6,7,7,7,7 .Lgather_permd: @@ -1741,3 +1746,4 @@ rsaz_avx2_eligible: .long 4,4,4,4, 4,4,4,4 .align 64 #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha1-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha1-x86_64.S index 7f924dcc1e..a4ce81ff91 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha1-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha1-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .extern OPENSSL_ia32cap_P .hidden OPENSSL_ia32cap_P @@ -8,6 +20,7 @@ .type sha1_block_data_order,@function .align 16 sha1_block_data_order: +.cfi_startproc leaq OPENSSL_ia32cap_P(%rip),%r10 movl 0(%r10),%r9d movl 4(%r10),%r8d @@ -24,17 +37,24 @@ sha1_block_data_order: .align 16 .Lialu: movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 movq %rdi,%r8 subq $72,%rsp movq %rsi,%r9 andq $-64,%rsp movq %rdx,%r10 movq %rax,64(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xc0,0x00,0x06,0x23,0x08 .Lprologue: movl 0(%r8),%esi @@ -1229,25 +1249,40 @@ sha1_block_data_order: jnz .Lloop movq 64(%rsp),%rsi +.cfi_def_cfa %rsi,8 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue: .byte 0xf3,0xc3 +.cfi_endproc .size sha1_block_data_order,.-sha1_block_data_order .type sha1_block_data_order_ssse3,@function .align 16 sha1_block_data_order_ssse3: _ssse3_shortcut: +.cfi_startproc movq %rsp,%r11 +.cfi_def_cfa_register %r11 pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 leaq -64(%rsp),%rsp andq $-64,%rsp movq %rdi,%r8 @@ -2404,24 +2439,38 @@ _ssse3_shortcut: movl %edx,12(%r8) movl %ebp,16(%r8) movq -40(%r11),%r14 +.cfi_restore %r14 movq -32(%r11),%r13 +.cfi_restore %r13 movq -24(%r11),%r12 +.cfi_restore %r12 movq -16(%r11),%rbp +.cfi_restore %rbp movq -8(%r11),%rbx +.cfi_restore %rbx leaq (%r11),%rsp 
+.cfi_def_cfa_register %rsp .Lepilogue_ssse3: .byte 0xf3,0xc3 +.cfi_endproc .size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 .type sha1_block_data_order_avx,@function .align 16 sha1_block_data_order_avx: _avx_shortcut: +.cfi_startproc movq %rsp,%r11 +.cfi_def_cfa_register %r11 pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 leaq -64(%rsp),%rsp vzeroupper andq $-64,%rsp @@ -3518,13 +3567,20 @@ _avx_shortcut: movl %edx,12(%r8) movl %ebp,16(%r8) movq -40(%r11),%r14 +.cfi_restore %r14 movq -32(%r11),%r13 +.cfi_restore %r13 movq -24(%r11),%r12 +.cfi_restore %r12 movq -16(%r11),%rbp +.cfi_restore %rbp movq -8(%r11),%rbx +.cfi_restore %rbx leaq (%r11),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_avx: .byte 0xf3,0xc3 +.cfi_endproc .size sha1_block_data_order_avx,.-sha1_block_data_order_avx .align 64 K_XX_XX: @@ -3542,3 +3598,4 @@ K_XX_XX: .byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S index 62534be495..0bacd6a4a8 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .extern OPENSSL_ia32cap_P @@ -8,6 +20,7 @@ .type sha256_block_data_order,@function .align 16 sha256_block_data_order: +.cfi_startproc leaq OPENSSL_ia32cap_P(%rip),%r11 movl 0(%r11),%r9d movl 4(%r11),%r10d @@ -20,12 +33,19 @@ sha256_block_data_order: testl $512,%r10d jnz .Lssse3_shortcut movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 shlq $4,%rdx subq $64+32,%rsp leaq (%rsi,%rdx,4),%rdx @@ -33,7 +53,8 @@ sha256_block_data_order: movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movq %rax,64+24(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 .Lprologue: movl 0(%rdi),%eax @@ -1697,16 +1718,25 @@ sha256_block_data_order: movl %r11d,28(%rdi) jb .Lloop - movq 64+24(%rsp),%rsi + movq 88(%rsp),%rsi +.cfi_def_cfa %rsi,8 movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue: .byte 0xf3,0xc3 +.cfi_endproc .size sha256_block_data_order,.-sha256_block_data_order .align 64 .type K256,@object @@ -1754,14 +1784,22 @@ K256: .type sha256_block_data_order_ssse3,@function .align 64 sha256_block_data_order_ssse3: +.cfi_startproc .Lssse3_shortcut: movq %rsp,%rax 
+.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 shlq $4,%rdx subq $96,%rsp leaq (%rsi,%rdx,4),%rdx @@ -1769,7 +1807,8 @@ sha256_block_data_order_ssse3: movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movq %rax,64+24(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 .Lprologue_ssse3: movl 0(%rdi),%eax @@ -2835,28 +2874,45 @@ sha256_block_data_order_ssse3: movl %r11d,28(%rdi) jb .Lloop_ssse3 - movq 64+24(%rsp),%rsi + movq 88(%rsp),%rsi +.cfi_def_cfa %rsi,8 movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_ssse3: .byte 0xf3,0xc3 +.cfi_endproc .size sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3 .type sha256_block_data_order_avx,@function .align 64 sha256_block_data_order_avx: +.cfi_startproc .Lavx_shortcut: movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 shlq $4,%rdx subq $96,%rsp leaq (%rsi,%rdx,4),%rdx @@ -2864,7 +2920,8 @@ sha256_block_data_order_avx: movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movq %rax,64+24(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 .Lprologue_avx: vzeroupper @@ -3891,16 +3948,26 @@ sha256_block_data_order_avx: movl %r11d,28(%rdi) jb .Lloop_avx - movq 64+24(%rsp),%rsi + movq 88(%rsp),%rsi +.cfi_def_cfa %rsi,8 vzeroupper movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_avx: .byte 0xf3,0xc3 +.cfi_endproc .size sha256_block_data_order_avx,.-sha256_block_data_order_avx #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S index 1f1793bb0f..afc47f139b 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .extern OPENSSL_ia32cap_P @@ -8,24 +20,30 @@ .type sha512_block_data_order,@function .align 16 sha512_block_data_order: +.cfi_startproc leaq OPENSSL_ia32cap_P(%rip),%r11 movl 0(%r11),%r9d movl 4(%r11),%r10d movl 8(%r11),%r11d - testl $2048,%r10d - jnz .Lxop_shortcut andl $1073741824,%r9d andl $268435968,%r10d orl %r9d,%r10d cmpl $1342177792,%r10d je .Lavx_shortcut movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 shlq $4,%rdx subq $128+32,%rsp leaq (%rsi,%rdx,8),%rdx @@ -33,7 +51,8 @@ sha512_block_data_order: movq %rdi,128+0(%rsp) movq %rsi,128+8(%rsp) movq %rdx,128+16(%rsp) - movq %rax,128+24(%rsp) + movq %rax,152(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 .Lprologue: movq 0(%rdi),%rax @@ -1697,16 +1716,25 @@ sha512_block_data_order: movq %r11,56(%rdi) jb .Lloop - movq 128+24(%rsp),%rsi + movq 152(%rsp),%rsi +.cfi_def_cfa %rsi,8 movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue: .byte 0xf3,0xc3 +.cfi_endproc .size sha512_block_data_order,.-sha512_block_data_order .align 64 .type K512,@object @@ -1795,1100 +1823,25 @@ K512: .quad 0x0001020304050607,0x08090a0b0c0d0e0f .quad 0x0001020304050607,0x08090a0b0c0d0e0f .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.type sha512_block_data_order_xop,@function -.align 64 -sha512_block_data_order_xop: -.Lxop_shortcut: - movq %rsp,%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - shlq $4,%rdx - subq $160,%rsp - leaq (%rsi,%rdx,8),%rdx - andq $-64,%rsp - movq %rdi,128+0(%rsp) - movq %rsi,128+8(%rsp) - movq %rdx,128+16(%rsp) - movq %rax,128+24(%rsp) -.Lprologue_xop: - - vzeroupper - movq 0(%rdi),%rax - movq 8(%rdi),%rbx - movq 16(%rdi),%rcx - movq 24(%rdi),%rdx - movq 32(%rdi),%r8 - movq 40(%rdi),%r9 - movq 48(%rdi),%r10 - movq 56(%rdi),%r11 - jmp .Lloop_xop -.align 16 -.Lloop_xop: - vmovdqa K512+1280(%rip),%xmm11 - vmovdqu 0(%rsi),%xmm0 - leaq K512+128(%rip),%rbp - vmovdqu 16(%rsi),%xmm1 - vmovdqu 32(%rsi),%xmm2 - vpshufb %xmm11,%xmm0,%xmm0 - vmovdqu 48(%rsi),%xmm3 - vpshufb %xmm11,%xmm1,%xmm1 - vmovdqu 64(%rsi),%xmm4 - vpshufb %xmm11,%xmm2,%xmm2 - vmovdqu 80(%rsi),%xmm5 - vpshufb %xmm11,%xmm3,%xmm3 - vmovdqu 96(%rsi),%xmm6 - vpshufb %xmm11,%xmm4,%xmm4 - vmovdqu 112(%rsi),%xmm7 - vpshufb %xmm11,%xmm5,%xmm5 - vpaddq -128(%rbp),%xmm0,%xmm8 - vpshufb %xmm11,%xmm6,%xmm6 - vpaddq -96(%rbp),%xmm1,%xmm9 - vpshufb %xmm11,%xmm7,%xmm7 - vpaddq -64(%rbp),%xmm2,%xmm10 - vpaddq -32(%rbp),%xmm3,%xmm11 - vmovdqa %xmm8,0(%rsp) - vpaddq 0(%rbp),%xmm4,%xmm8 - vmovdqa %xmm9,16(%rsp) - vpaddq 32(%rbp),%xmm5,%xmm9 - vmovdqa %xmm10,32(%rsp) - vpaddq 64(%rbp),%xmm6,%xmm10 - vmovdqa %xmm11,48(%rsp) - vpaddq 96(%rbp),%xmm7,%xmm11 - vmovdqa %xmm8,64(%rsp) - movq %rax,%r14 - 
vmovdqa %xmm9,80(%rsp) - movq %rbx,%rdi - vmovdqa %xmm10,96(%rsp) - xorq %rcx,%rdi - vmovdqa %xmm11,112(%rsp) - movq %r8,%r13 - jmp .Lxop_00_47 - -.align 16 -.Lxop_00_47: - addq $256,%rbp - vpalignr $8,%xmm0,%xmm1,%xmm8 - rorq $23,%r13 - movq %r14,%rax - vpalignr $8,%xmm4,%xmm5,%xmm11 - movq %r9,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r8,%r13 - xorq %r10,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rax,%r14 - vpaddq %xmm11,%xmm0,%xmm0 - andq %r8,%r12 - xorq %r8,%r13 - addq 0(%rsp),%r11 - movq %rax,%r15 -.byte 143,72,120,195,209,7 - xorq %r10,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,223,3 - xorq %rax,%r14 - addq %r13,%r11 - vpxor %xmm10,%xmm8,%xmm8 - xorq %rbx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm7,%xmm10 - addq %r11,%rdx - addq %rdi,%r11 - vpaddq %xmm8,%xmm0,%xmm0 - movq %rdx,%r13 - addq %r11,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r11 - vpxor %xmm10,%xmm11,%xmm11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - vpaddq %xmm11,%xmm0,%xmm0 - addq 8(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - vpaddq -128(%rbp),%xmm0,%xmm10 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - vmovdqa %xmm10,0(%rsp) - vpalignr $8,%xmm1,%xmm2,%xmm8 - rorq $23,%r13 - movq %r14,%r10 - vpalignr $8,%xmm5,%xmm6,%xmm11 - movq %rdx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rcx,%r13 - xorq %r8,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r10,%r14 - vpaddq %xmm11,%xmm1,%xmm1 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 16(%rsp),%r9 - movq %r10,%r15 -.byte 143,72,120,195,209,7 - xorq %r8,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,216,3 - xorq %r10,%r14 - addq %r13,%r9 - vpxor %xmm10,%xmm8,%xmm8 - xorq %r11,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm0,%xmm10 - addq %r9,%rbx - addq %rdi,%r9 - vpaddq %xmm8,%xmm1,%xmm1 - movq %rbx,%r13 - addq %r9,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r9 - vpxor %xmm10,%xmm11,%xmm11 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - vpaddq %xmm11,%xmm1,%xmm1 - addq 24(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - vpaddq -96(%rbp),%xmm1,%xmm10 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - vmovdqa %xmm10,16(%rsp) - vpalignr $8,%xmm2,%xmm3,%xmm8 - rorq $23,%r13 - movq %r14,%r8 - vpalignr $8,%xmm6,%xmm7,%xmm11 - movq %rbx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rax,%r13 - xorq %rcx,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r8,%r14 - vpaddq %xmm11,%xmm2,%xmm2 - andq %rax,%r12 - xorq %rax,%r13 - addq 32(%rsp),%rdx - movq %r8,%r15 -.byte 143,72,120,195,209,7 - xorq %rcx,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,217,3 - xorq %r8,%r14 - addq %r13,%rdx - vpxor %xmm10,%xmm8,%xmm8 - xorq %r9,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm1,%xmm10 - addq %rdx,%r11 - addq %rdi,%rdx - 
vpaddq %xmm8,%xmm2,%xmm2 - movq %r11,%r13 - addq %rdx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rdx - vpxor %xmm10,%xmm11,%xmm11 - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - vpaddq %xmm11,%xmm2,%xmm2 - addq 40(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - vpaddq -64(%rbp),%xmm2,%xmm10 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - vmovdqa %xmm10,32(%rsp) - vpalignr $8,%xmm3,%xmm4,%xmm8 - rorq $23,%r13 - movq %r14,%rcx - vpalignr $8,%xmm7,%xmm0,%xmm11 - movq %r11,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r10,%r13 - xorq %rax,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rcx,%r14 - vpaddq %xmm11,%xmm3,%xmm3 - andq %r10,%r12 - xorq %r10,%r13 - addq 48(%rsp),%rbx - movq %rcx,%r15 -.byte 143,72,120,195,209,7 - xorq %rax,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,218,3 - xorq %rcx,%r14 - addq %r13,%rbx - vpxor %xmm10,%xmm8,%xmm8 - xorq %rdx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm2,%xmm10 - addq %rbx,%r9 - addq %rdi,%rbx - vpaddq %xmm8,%xmm3,%xmm3 - movq %r9,%r13 - addq %rbx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rbx - vpxor %xmm10,%xmm11,%xmm11 - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - vpaddq %xmm11,%xmm3,%xmm3 - addq 56(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - rorq $6,%r14 - vpaddq -32(%rbp),%xmm3,%xmm10 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - vmovdqa %xmm10,48(%rsp) - vpalignr $8,%xmm4,%xmm5,%xmm8 - rorq $23,%r13 - movq %r14,%rax - vpalignr $8,%xmm0,%xmm1,%xmm11 - movq %r9,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r8,%r13 - xorq %r10,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rax,%r14 - vpaddq %xmm11,%xmm4,%xmm4 - andq %r8,%r12 - xorq %r8,%r13 - addq 64(%rsp),%r11 - movq %rax,%r15 -.byte 143,72,120,195,209,7 - xorq %r10,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,219,3 - xorq %rax,%r14 - addq %r13,%r11 - vpxor %xmm10,%xmm8,%xmm8 - xorq %rbx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm3,%xmm10 - addq %r11,%rdx - addq %rdi,%r11 - vpaddq %xmm8,%xmm4,%xmm4 - movq %rdx,%r13 - addq %r11,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r11 - vpxor %xmm10,%xmm11,%xmm11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - vpaddq %xmm11,%xmm4,%xmm4 - addq 72(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - vpaddq 0(%rbp),%xmm4,%xmm10 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - vmovdqa %xmm10,64(%rsp) - vpalignr $8,%xmm5,%xmm6,%xmm8 - rorq $23,%r13 - movq %r14,%r10 - vpalignr $8,%xmm1,%xmm2,%xmm11 - movq %rdx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rcx,%r13 - xorq %r8,%r12 - 
vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r10,%r14 - vpaddq %xmm11,%xmm5,%xmm5 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 80(%rsp),%r9 - movq %r10,%r15 -.byte 143,72,120,195,209,7 - xorq %r8,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,220,3 - xorq %r10,%r14 - addq %r13,%r9 - vpxor %xmm10,%xmm8,%xmm8 - xorq %r11,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm4,%xmm10 - addq %r9,%rbx - addq %rdi,%r9 - vpaddq %xmm8,%xmm5,%xmm5 - movq %rbx,%r13 - addq %r9,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r9 - vpxor %xmm10,%xmm11,%xmm11 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - vpaddq %xmm11,%xmm5,%xmm5 - addq 88(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - vpaddq 32(%rbp),%xmm5,%xmm10 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - vmovdqa %xmm10,80(%rsp) - vpalignr $8,%xmm6,%xmm7,%xmm8 - rorq $23,%r13 - movq %r14,%r8 - vpalignr $8,%xmm2,%xmm3,%xmm11 - movq %rbx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rax,%r13 - xorq %rcx,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r8,%r14 - vpaddq %xmm11,%xmm6,%xmm6 - andq %rax,%r12 - xorq %rax,%r13 - addq 96(%rsp),%rdx - movq %r8,%r15 -.byte 143,72,120,195,209,7 - xorq %rcx,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,221,3 - xorq %r8,%r14 - addq %r13,%rdx - vpxor %xmm10,%xmm8,%xmm8 - xorq %r9,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm5,%xmm10 - addq %rdx,%r11 - addq %rdi,%rdx - vpaddq %xmm8,%xmm6,%xmm6 - movq %r11,%r13 - addq %rdx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rdx - vpxor %xmm10,%xmm11,%xmm11 - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - vpaddq %xmm11,%xmm6,%xmm6 - addq 104(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - vpaddq 64(%rbp),%xmm6,%xmm10 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - vmovdqa %xmm10,96(%rsp) - vpalignr $8,%xmm7,%xmm0,%xmm8 - rorq $23,%r13 - movq %r14,%rcx - vpalignr $8,%xmm3,%xmm4,%xmm11 - movq %r11,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r10,%r13 - xorq %rax,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rcx,%r14 - vpaddq %xmm11,%xmm7,%xmm7 - andq %r10,%r12 - xorq %r10,%r13 - addq 112(%rsp),%rbx - movq %rcx,%r15 -.byte 143,72,120,195,209,7 - xorq %rax,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,222,3 - xorq %rcx,%r14 - addq %r13,%rbx - vpxor %xmm10,%xmm8,%xmm8 - xorq %rdx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm6,%xmm10 - addq %rbx,%r9 - addq %rdi,%rbx - vpaddq %xmm8,%xmm7,%xmm7 - movq %r9,%r13 - addq %rbx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rbx - vpxor %xmm10,%xmm11,%xmm11 - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - vpaddq %xmm11,%xmm7,%xmm7 - addq 120(%rsp),%rax - movq %rbx,%rdi - xorq 
%r11,%r12 - rorq $6,%r14 - vpaddq 96(%rbp),%xmm7,%xmm10 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - vmovdqa %xmm10,112(%rsp) - cmpb $0,135(%rbp) - jne .Lxop_00_47 - rorq $23,%r13 - movq %r14,%rax - movq %r9,%r12 - rorq $5,%r14 - xorq %r8,%r13 - xorq %r10,%r12 - rorq $4,%r13 - xorq %rax,%r14 - andq %r8,%r12 - xorq %r8,%r13 - addq 0(%rsp),%r11 - movq %rax,%r15 - xorq %r10,%r12 - rorq $6,%r14 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi - xorq %rax,%r14 - addq %r13,%r11 - xorq %rbx,%rdi - rorq $28,%r14 - addq %r11,%rdx - addq %rdi,%r11 - movq %rdx,%r13 - addq %r11,%r14 - rorq $23,%r13 - movq %r14,%r11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - addq 8(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - rorq $23,%r13 - movq %r14,%r10 - movq %rdx,%r12 - rorq $5,%r14 - xorq %rcx,%r13 - xorq %r8,%r12 - rorq $4,%r13 - xorq %r10,%r14 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 16(%rsp),%r9 - movq %r10,%r15 - xorq %r8,%r12 - rorq $6,%r14 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi - xorq %r10,%r14 - addq %r13,%r9 - xorq %r11,%rdi - rorq $28,%r14 - addq %r9,%rbx - addq %rdi,%r9 - movq %rbx,%r13 - addq %r9,%r14 - rorq $23,%r13 - movq %r14,%r9 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - addq 24(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - rorq $23,%r13 - movq %r14,%r8 - movq %rbx,%r12 - rorq $5,%r14 - xorq %rax,%r13 - xorq %rcx,%r12 - rorq $4,%r13 - xorq %r8,%r14 - andq %rax,%r12 - xorq %rax,%r13 - addq 32(%rsp),%rdx - movq %r8,%r15 - xorq %rcx,%r12 - rorq $6,%r14 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi - xorq %r8,%r14 - addq %r13,%rdx - xorq %r9,%rdi - rorq $28,%r14 - addq %rdx,%r11 - addq %rdi,%rdx - movq %r11,%r13 - addq %rdx,%r14 - rorq $23,%r13 - movq %r14,%rdx - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - addq 40(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - rorq $23,%r13 - movq %r14,%rcx - movq %r11,%r12 - rorq $5,%r14 - xorq %r10,%r13 - xorq %rax,%r12 - rorq $4,%r13 - xorq %rcx,%r14 - andq %r10,%r12 - xorq %r10,%r13 - addq 48(%rsp),%rbx - movq %rcx,%r15 - xorq %rax,%r12 - rorq $6,%r14 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi - xorq %rcx,%r14 - addq %r13,%rbx - xorq %rdx,%rdi - rorq $28,%r14 - addq %rbx,%r9 - addq %rdi,%rbx - movq %r9,%r13 - addq %rbx,%r14 - rorq $23,%r13 - movq %r14,%rbx - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - addq 56(%rsp),%rax - movq %rbx,%rdi - xorq 
%r11,%r12 - rorq $6,%r14 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - rorq $23,%r13 - movq %r14,%rax - movq %r9,%r12 - rorq $5,%r14 - xorq %r8,%r13 - xorq %r10,%r12 - rorq $4,%r13 - xorq %rax,%r14 - andq %r8,%r12 - xorq %r8,%r13 - addq 64(%rsp),%r11 - movq %rax,%r15 - xorq %r10,%r12 - rorq $6,%r14 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi - xorq %rax,%r14 - addq %r13,%r11 - xorq %rbx,%rdi - rorq $28,%r14 - addq %r11,%rdx - addq %rdi,%r11 - movq %rdx,%r13 - addq %r11,%r14 - rorq $23,%r13 - movq %r14,%r11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - addq 72(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - rorq $23,%r13 - movq %r14,%r10 - movq %rdx,%r12 - rorq $5,%r14 - xorq %rcx,%r13 - xorq %r8,%r12 - rorq $4,%r13 - xorq %r10,%r14 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 80(%rsp),%r9 - movq %r10,%r15 - xorq %r8,%r12 - rorq $6,%r14 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi - xorq %r10,%r14 - addq %r13,%r9 - xorq %r11,%rdi - rorq $28,%r14 - addq %r9,%rbx - addq %rdi,%r9 - movq %rbx,%r13 - addq %r9,%r14 - rorq $23,%r13 - movq %r14,%r9 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - addq 88(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - rorq $23,%r13 - movq %r14,%r8 - movq %rbx,%r12 - rorq $5,%r14 - xorq %rax,%r13 - xorq %rcx,%r12 - rorq $4,%r13 - xorq %r8,%r14 - andq %rax,%r12 - xorq %rax,%r13 - addq 96(%rsp),%rdx - movq %r8,%r15 - xorq %rcx,%r12 - rorq $6,%r14 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi - xorq %r8,%r14 - addq %r13,%rdx - xorq %r9,%rdi - rorq $28,%r14 - addq %rdx,%r11 - addq %rdi,%rdx - movq %r11,%r13 - addq %rdx,%r14 - rorq $23,%r13 - movq %r14,%rdx - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - addq 104(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - rorq $23,%r13 - movq %r14,%rcx - movq %r11,%r12 - rorq $5,%r14 - xorq %r10,%r13 - xorq %rax,%r12 - rorq $4,%r13 - xorq %rcx,%r14 - andq %r10,%r12 - xorq %r10,%r13 - addq 112(%rsp),%rbx - movq %rcx,%r15 - xorq %rax,%r12 - rorq $6,%r14 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi - xorq %rcx,%r14 - addq %r13,%rbx - xorq %rdx,%rdi - rorq $28,%r14 - addq %rbx,%r9 - addq %rdi,%rbx - movq %r9,%r13 - addq %rbx,%r14 - rorq $23,%r13 - movq %r14,%rbx - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - addq 120(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - rorq $6,%r14 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - 
xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - movq 128+0(%rsp),%rdi - movq %r14,%rax - - addq 0(%rdi),%rax - leaq 128(%rsi),%rsi - addq 8(%rdi),%rbx - addq 16(%rdi),%rcx - addq 24(%rdi),%rdx - addq 32(%rdi),%r8 - addq 40(%rdi),%r9 - addq 48(%rdi),%r10 - addq 56(%rdi),%r11 - - cmpq 128+16(%rsp),%rsi - - movq %rax,0(%rdi) - movq %rbx,8(%rdi) - movq %rcx,16(%rdi) - movq %rdx,24(%rdi) - movq %r8,32(%rdi) - movq %r9,40(%rdi) - movq %r10,48(%rdi) - movq %r11,56(%rdi) - jb .Lloop_xop - - movq 128+24(%rsp),%rsi - vzeroupper - movq -48(%rsi),%r15 - movq -40(%rsi),%r14 - movq -32(%rsi),%r13 - movq -24(%rsi),%r12 - movq -16(%rsi),%rbp - movq -8(%rsi),%rbx - leaq (%rsi),%rsp -.Lepilogue_xop: - .byte 0xf3,0xc3 -.size sha512_block_data_order_xop,.-sha512_block_data_order_xop .type sha512_block_data_order_avx,@function .align 64 sha512_block_data_order_avx: +.cfi_startproc .Lavx_shortcut: movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 shlq $4,%rdx subq $160,%rsp leaq (%rsi,%rdx,8),%rdx @@ -2896,7 +1849,8 @@ sha512_block_data_order_avx: movq %rdi,128+0(%rsp) movq %rsi,128+8(%rsp) movq %rdx,128+16(%rsp) - movq %rax,128+24(%rsp) + movq %rax,152(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 .Lprologue_avx: vzeroupper @@ -4013,16 +2967,26 @@ sha512_block_data_order_avx: movq %r11,56(%rdi) jb .Lloop_avx - movq 128+24(%rsp),%rsi + movq 152(%rsp),%rsi +.cfi_def_cfa %rsi,8 vzeroupper movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_avx: .byte 0xf3,0xc3 +.cfi_endproc .size sha512_block_data_order_avx,.-sha512_block_data_order_avx #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S index f3a089de9c..27a34617a3 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text @@ -19,6 +31,7 @@ .type _vpaes_encrypt_core,@function .align 16 _vpaes_encrypt_core: +.cfi_startproc movq %rdx,%r9 movq $16,%r11 movl 240(%rdx),%eax @@ -99,6 +112,7 @@ _vpaes_encrypt_core: pxor %xmm4,%xmm0 .byte 102,15,56,0,193 .byte 0xf3,0xc3 +.cfi_endproc .size _vpaes_encrypt_core,.-_vpaes_encrypt_core @@ -106,9 +120,185 @@ _vpaes_encrypt_core: + + + + + + + + + + + + + + + + + + + + + + + + +.type _vpaes_encrypt_core_2x,@function +.align 16 +_vpaes_encrypt_core_2x: +.cfi_startproc + movq %rdx,%r9 + movq $16,%r11 + movl 240(%rdx),%eax + movdqa %xmm9,%xmm1 + movdqa %xmm9,%xmm7 + movdqa .Lk_ipt(%rip),%xmm2 + movdqa %xmm2,%xmm8 + pandn %xmm0,%xmm1 + pandn %xmm6,%xmm7 + movdqu (%r9),%xmm5 + + psrld $4,%xmm1 + psrld $4,%xmm7 + pand %xmm9,%xmm0 + pand %xmm9,%xmm6 +.byte 102,15,56,0,208 +.byte 102,68,15,56,0,198 + movdqa .Lk_ipt+16(%rip),%xmm0 + movdqa %xmm0,%xmm6 +.byte 102,15,56,0,193 +.byte 102,15,56,0,247 + pxor %xmm5,%xmm2 + pxor %xmm5,%xmm8 + addq $16,%r9 + pxor %xmm2,%xmm0 + pxor %xmm8,%xmm6 + leaq .Lk_mc_backward(%rip),%r10 + jmp .Lenc2x_entry + +.align 16 +.Lenc2x_loop: + + movdqa .Lk_sb1(%rip),%xmm4 + movdqa .Lk_sb1+16(%rip),%xmm0 + movdqa %xmm4,%xmm12 + movdqa %xmm0,%xmm6 +.byte 102,15,56,0,226 +.byte 102,69,15,56,0,224 +.byte 102,15,56,0,195 +.byte 102,65,15,56,0,243 + pxor %xmm5,%xmm4 + pxor %xmm5,%xmm12 + movdqa .Lk_sb2(%rip),%xmm5 + movdqa %xmm5,%xmm13 + pxor %xmm4,%xmm0 + pxor %xmm12,%xmm6 + movdqa -64(%r11,%r10,1),%xmm1 + +.byte 102,15,56,0,234 +.byte 102,69,15,56,0,232 + movdqa (%r11,%r10,1),%xmm4 + + movdqa .Lk_sb2+16(%rip),%xmm2 + movdqa %xmm2,%xmm8 +.byte 102,15,56,0,211 +.byte 102,69,15,56,0,195 + movdqa %xmm0,%xmm3 + movdqa %xmm6,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm13,%xmm8 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + addq $16,%r9 + pxor %xmm2,%xmm0 + pxor %xmm8,%xmm6 +.byte 102,15,56,0,220 +.byte 102,68,15,56,0,220 + addq $16,%r11 + pxor %xmm0,%xmm3 + pxor %xmm6,%xmm11 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + andq $0x30,%r11 + subq $1,%rax + pxor %xmm3,%xmm0 + pxor %xmm11,%xmm6 + +.Lenc2x_entry: + + movdqa %xmm9,%xmm1 + movdqa %xmm9,%xmm7 + movdqa .Lk_inv+16(%rip),%xmm5 + movdqa %xmm5,%xmm13 + pandn %xmm0,%xmm1 + pandn %xmm6,%xmm7 + psrld $4,%xmm1 + psrld $4,%xmm7 + pand %xmm9,%xmm0 + pand %xmm9,%xmm6 +.byte 102,15,56,0,232 +.byte 102,68,15,56,0,238 + movdqa %xmm10,%xmm3 + movdqa %xmm10,%xmm11 + pxor %xmm1,%xmm0 + pxor %xmm7,%xmm6 +.byte 102,15,56,0,217 +.byte 102,68,15,56,0,223 + movdqa %xmm10,%xmm4 + movdqa %xmm10,%xmm12 + pxor %xmm5,%xmm3 + pxor %xmm13,%xmm11 +.byte 102,15,56,0,224 +.byte 102,68,15,56,0,230 + movdqa %xmm10,%xmm2 + movdqa %xmm10,%xmm8 + pxor %xmm5,%xmm4 + pxor %xmm13,%xmm12 +.byte 102,15,56,0,211 +.byte 102,69,15,56,0,195 + movdqa %xmm10,%xmm3 + movdqa %xmm10,%xmm11 + pxor %xmm0,%xmm2 + pxor %xmm6,%xmm8 +.byte 102,15,56,0,220 +.byte 102,69,15,56,0,220 + movdqu (%r9),%xmm5 + + pxor %xmm1,%xmm3 + pxor %xmm7,%xmm11 + jnz .Lenc2x_loop + + + movdqa -96(%r10),%xmm4 + movdqa -80(%r10),%xmm0 + movdqa %xmm4,%xmm12 + movdqa %xmm0,%xmm6 +.byte 102,15,56,0,226 +.byte 102,69,15,56,0,224 + pxor %xmm5,%xmm4 + pxor %xmm5,%xmm12 +.byte 102,15,56,0,195 +.byte 102,65,15,56,0,243 + movdqa 64(%r11,%r10,1),%xmm1 + + pxor %xmm4,%xmm0 + pxor %xmm12,%xmm6 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + .byte 0xf3,0xc3 
+.cfi_endproc +.size _vpaes_encrypt_core_2x,.-_vpaes_encrypt_core_2x + + + + + + .type _vpaes_decrypt_core,@function .align 16 _vpaes_decrypt_core: +.cfi_startproc movq %rdx,%r9 movl 240(%rdx),%eax movdqa %xmm9,%xmm1 @@ -205,6 +395,7 @@ _vpaes_decrypt_core: pxor %xmm4,%xmm0 .byte 102,15,56,0,194 .byte 0xf3,0xc3 +.cfi_endproc .size _vpaes_decrypt_core,.-_vpaes_decrypt_core @@ -215,6 +406,7 @@ _vpaes_decrypt_core: .type _vpaes_schedule_core,@function .align 16 _vpaes_schedule_core: +.cfi_startproc @@ -381,6 +573,7 @@ _vpaes_schedule_core: pxor %xmm6,%xmm6 pxor %xmm7,%xmm7 .byte 0xf3,0xc3 +.cfi_endproc .size _vpaes_schedule_core,.-_vpaes_schedule_core @@ -400,6 +593,7 @@ _vpaes_schedule_core: .type _vpaes_schedule_192_smear,@function .align 16 _vpaes_schedule_192_smear: +.cfi_startproc pshufd $0x80,%xmm6,%xmm1 pshufd $0xFE,%xmm7,%xmm0 pxor %xmm1,%xmm6 @@ -408,6 +602,7 @@ _vpaes_schedule_192_smear: movdqa %xmm6,%xmm0 movhlps %xmm1,%xmm6 .byte 0xf3,0xc3 +.cfi_endproc .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear @@ -431,6 +626,7 @@ _vpaes_schedule_192_smear: .type _vpaes_schedule_round,@function .align 16 _vpaes_schedule_round: +.cfi_startproc pxor %xmm1,%xmm1 .byte 102,65,15,58,15,200,15 @@ -484,6 +680,7 @@ _vpaes_schedule_low_round: pxor %xmm7,%xmm0 movdqa %xmm0,%xmm7 .byte 0xf3,0xc3 +.cfi_endproc .size _vpaes_schedule_round,.-_vpaes_schedule_round @@ -498,6 +695,7 @@ _vpaes_schedule_low_round: .type _vpaes_schedule_transform,@function .align 16 _vpaes_schedule_transform: +.cfi_startproc movdqa %xmm9,%xmm1 pandn %xmm0,%xmm1 psrld $4,%xmm1 @@ -508,6 +706,7 @@ _vpaes_schedule_transform: .byte 102,15,56,0,193 pxor %xmm2,%xmm0 .byte 0xf3,0xc3 +.cfi_endproc .size _vpaes_schedule_transform,.-_vpaes_schedule_transform @@ -536,6 +735,7 @@ _vpaes_schedule_transform: .type _vpaes_schedule_mangle,@function .align 16 _vpaes_schedule_mangle: +.cfi_startproc movdqa %xmm0,%xmm4 movdqa .Lk_mc_forward(%rip),%xmm5 testq %rcx,%rcx @@ -600,6 +800,7 @@ _vpaes_schedule_mangle: andq $0x30,%r8 movdqu %xmm3,(%rdx) .byte 0xf3,0xc3 +.cfi_endproc .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle @@ -610,6 +811,13 @@ _vpaes_schedule_mangle: .type vpaes_set_encrypt_key,@function .align 16 vpaes_set_encrypt_key: +.cfi_startproc +#ifdef BORINGSSL_DISPATCH_TEST +.extern BORINGSSL_function_hit +.hidden BORINGSSL_function_hit + movb $1,BORINGSSL_function_hit+5(%rip) +#endif + movl %esi,%eax shrl $5,%eax addl $5,%eax @@ -620,6 +828,7 @@ vpaes_set_encrypt_key: call _vpaes_schedule_core xorl %eax,%eax .byte 0xf3,0xc3 +.cfi_endproc .size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key .globl vpaes_set_decrypt_key @@ -627,6 +836,7 @@ vpaes_set_encrypt_key: .type vpaes_set_decrypt_key,@function .align 16 vpaes_set_decrypt_key: +.cfi_startproc movl %esi,%eax shrl $5,%eax addl $5,%eax @@ -642,6 +852,7 @@ vpaes_set_decrypt_key: call _vpaes_schedule_core xorl %eax,%eax .byte 0xf3,0xc3 +.cfi_endproc .size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key .globl vpaes_encrypt @@ -649,11 +860,18 @@ vpaes_set_decrypt_key: .type vpaes_encrypt,@function .align 16 vpaes_encrypt: +.cfi_startproc +#ifdef BORINGSSL_DISPATCH_TEST +.extern BORINGSSL_function_hit +.hidden BORINGSSL_function_hit + movb $1,BORINGSSL_function_hit+4(%rip) +#endif movdqu (%rdi),%xmm0 call _vpaes_preheat call _vpaes_encrypt_core movdqu %xmm0,(%rsi) .byte 0xf3,0xc3 +.cfi_endproc .size vpaes_encrypt,.-vpaes_encrypt .globl vpaes_decrypt @@ -661,17 +879,20 @@ vpaes_encrypt: .type vpaes_decrypt,@function .align 16 vpaes_decrypt: +.cfi_startproc movdqu (%rdi),%xmm0 
call _vpaes_preheat call _vpaes_decrypt_core movdqu %xmm0,(%rsi) .byte 0xf3,0xc3 +.cfi_endproc .size vpaes_decrypt,.-vpaes_decrypt .globl vpaes_cbc_encrypt .hidden vpaes_cbc_encrypt .type vpaes_cbc_encrypt,@function .align 16 vpaes_cbc_encrypt: +.cfi_startproc xchgq %rcx,%rdx subq $16,%rcx jc .Lcbc_abort @@ -707,7 +928,71 @@ vpaes_cbc_encrypt: movdqu %xmm6,(%r8) .Lcbc_abort: .byte 0xf3,0xc3 +.cfi_endproc .size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt +.globl vpaes_ctr32_encrypt_blocks +.hidden vpaes_ctr32_encrypt_blocks +.type vpaes_ctr32_encrypt_blocks,@function +.align 16 +vpaes_ctr32_encrypt_blocks: +.cfi_startproc + + xchgq %rcx,%rdx + testq %rcx,%rcx + jz .Lctr32_abort + movdqu (%r8),%xmm0 + movdqa .Lctr_add_one(%rip),%xmm8 + subq %rdi,%rsi + call _vpaes_preheat + movdqa %xmm0,%xmm6 + pshufb .Lrev_ctr(%rip),%xmm6 + + testq $1,%rcx + jz .Lctr32_prep_loop + + + + movdqu (%rdi),%xmm7 + call _vpaes_encrypt_core + pxor %xmm7,%xmm0 + paddd %xmm8,%xmm6 + movdqu %xmm0,(%rsi,%rdi,1) + subq $1,%rcx + leaq 16(%rdi),%rdi + jz .Lctr32_done + +.Lctr32_prep_loop: + + + movdqa %xmm6,%xmm14 + movdqa %xmm6,%xmm15 + paddd %xmm8,%xmm15 + +.Lctr32_loop: + movdqa .Lrev_ctr(%rip),%xmm1 + movdqa %xmm14,%xmm0 + movdqa %xmm15,%xmm6 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + call _vpaes_encrypt_core_2x + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa .Lctr_add_two(%rip),%xmm3 + pxor %xmm1,%xmm0 + pxor %xmm2,%xmm6 + paddd %xmm3,%xmm14 + paddd %xmm3,%xmm15 + movdqu %xmm0,(%rsi,%rdi,1) + movdqu %xmm6,16(%rsi,%rdi,1) + subq $2,%rcx + leaq 32(%rdi),%rdi + jnz .Lctr32_loop + +.Lctr32_done: +.Lctr32_abort: + .byte 0xf3,0xc3 +.cfi_endproc +.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks @@ -717,6 +1002,7 @@ vpaes_cbc_encrypt: .type _vpaes_preheat,@function .align 16 _vpaes_preheat: +.cfi_startproc leaq .Lk_s0F(%rip),%r10 movdqa -32(%r10),%xmm10 movdqa -16(%r10),%xmm11 @@ -726,6 +1012,7 @@ _vpaes_preheat: movdqa 80(%r10),%xmm15 movdqa 96(%r10),%xmm14 .byte 0xf3,0xc3 +.cfi_endproc .size _vpaes_preheat,.-_vpaes_preheat @@ -828,7 +1115,19 @@ _vpaes_consts: .Lk_dsbo: .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C + + +.Lrev_ctr: +.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 + + +.Lctr_add_one: +.quad 0x0000000000000000, 0x0000000100000000 +.Lctr_add_two: +.quad 0x0000000000000000, 0x0000000200000000 + .byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 .align 64 .size _vpaes_consts,.-_vpaes_consts #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont.S index b32e2f0ef4..bdb4454212 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .extern OPENSSL_ia32cap_P @@ -17,6 +29,8 @@ bn_mul_mont: jnz .Lmul_enter cmpl $8,%r9d jb .Lmul_enter + leaq OPENSSL_ia32cap_P(%rip),%r11 + movl 8(%r11),%r11d cmpq %rsi,%rdx jne .Lmul4x_enter testl $7,%r9d @@ -208,31 +222,30 @@ bn_mul_mont: xorq %r14,%r14 movq (%rsp),%rax - leaq (%rsp),%rsi movq %r9,%r15 - jmp .Lsub + .align 16 -.Lsub: - sbbq (%rcx,%r14,8),%rax +.Lsub: sbbq (%rcx,%r14,8),%rax movq %rax,(%rdi,%r14,8) - movq 8(%rsi,%r14,8),%rax + movq 8(%rsp,%r14,8),%rax leaq 1(%r14),%r14 decq %r15 jnz .Lsub sbbq $0,%rax + movq $-1,%rbx + xorq %rax,%rbx xorq %r14,%r14 - andq %rax,%rsi - notq %rax - movq %rdi,%rcx - andq %rax,%rcx movq %r9,%r15 - orq %rcx,%rsi -.align 16 + .Lcopy: - movq (%rsi,%r14,8),%rax - movq %r14,(%rsp,%r14,8) - movq %rax,(%rdi,%r14,8) + movq (%rdi,%r14,8),%rcx + movq (%rsp,%r14,8),%rdx + andq %rbx,%rcx + andq %rax,%rdx + movq %r9,(%rsp,%r14,8) + orq %rcx,%rdx + movq %rdx,(%rdi,%r14,8) leaq 1(%r14),%r14 subq $1,%r15 jnz .Lcopy @@ -266,6 +279,9 @@ bn_mul4x_mont: movq %rsp,%rax .cfi_def_cfa_register %rax .Lmul4x_enter: + andl $0x80100,%r11d + cmpl $0x80100,%r11d + je .Lmulx4x_enter pushq %rbx .cfi_offset %rbx,-16 pushq %rbp @@ -603,7 +619,6 @@ bn_mul4x_mont: movq 16(%rsp,%r9,8),%rdi leaq -4(%r9),%r15 movq 0(%rsp),%rax - pxor %xmm0,%xmm0 movq 8(%rsp),%rdx shrq $2,%r15 leaq (%rsp),%rsi @@ -613,8 +628,7 @@ bn_mul4x_mont: movq 16(%rsi),%rbx movq 24(%rsi),%rbp sbbq 8(%rcx),%rdx - jmp .Lsub4x -.align 16 + .Lsub4x: movq %rax,0(%rdi,%r14,8) movq %rdx,8(%rdi,%r14,8) @@ -641,34 +655,35 @@ bn_mul4x_mont: sbbq $0,%rax movq %rbp,24(%rdi,%r14,8) - xorq %r14,%r14 - andq %rax,%rsi - notq %rax - movq %rdi,%rcx - andq %rax,%rcx - leaq -4(%r9),%r15 - orq %rcx,%rsi + pxor %xmm0,%xmm0 +.byte 102,72,15,110,224 + pcmpeqd %xmm5,%xmm5 + pshufd $0,%xmm4,%xmm4 + movq %r9,%r15 + pxor %xmm4,%xmm5 shrq $2,%r15 + xorl %eax,%eax - movdqu (%rsi),%xmm1 - movdqa %xmm0,(%rsp) - movdqu %xmm1,(%rdi) jmp .Lcopy4x .align 16 .Lcopy4x: - movdqu 16(%rsi,%r14,1),%xmm2 - movdqu 32(%rsi,%r14,1),%xmm1 - movdqa %xmm0,16(%rsp,%r14,1) - movdqu %xmm2,16(%rdi,%r14,1) - movdqa %xmm0,32(%rsp,%r14,1) - movdqu %xmm1,32(%rdi,%r14,1) - leaq 32(%r14),%r14 + movdqa (%rsp,%rax,1),%xmm1 + movdqu (%rdi,%rax,1),%xmm2 + pand %xmm4,%xmm1 + pand %xmm5,%xmm2 + movdqa 16(%rsp,%rax,1),%xmm3 + movdqa %xmm0,(%rsp,%rax,1) + por %xmm2,%xmm1 + movdqu 16(%rdi,%rax,1),%xmm2 + movdqu %xmm1,(%rdi,%rax,1) + pand %xmm4,%xmm3 + pand %xmm5,%xmm2 + movdqa %xmm0,16(%rsp,%rax,1) + por %xmm2,%xmm3 + movdqu %xmm3,16(%rdi,%rax,1) + leaq 32(%rax),%rax decq %r15 jnz .Lcopy4x - - movdqu 16(%rsi,%r14,1),%xmm2 - movdqa %xmm0,16(%rsp,%r14,1) - movdqu %xmm2,16(%rdi,%r14,1) movq 8(%rsp,%r9,8),%rsi .cfi_def_cfa %rsi, 8 movq $1,%rax @@ -690,6 +705,8 @@ bn_mul4x_mont: .byte 0xf3,0xc3 .cfi_endproc .size bn_mul4x_mont,.-bn_mul4x_mont +.extern bn_sqrx8x_internal +.hidden bn_sqrx8x_internal .extern bn_sqr8x_internal .hidden bn_sqr8x_internal @@ -774,6 +791,26 @@ bn_sqr8x_mont: pxor %xmm0,%xmm0 .byte 102,72,15,110,207 .byte 102,73,15,110,218 + leaq OPENSSL_ia32cap_P(%rip),%rax + movl 8(%rax),%eax + andl $0x80100,%eax + cmpl $0x80100,%eax + jne .Lsqr8x_nox + + call bn_sqrx8x_internal + + + + + leaq (%r8,%rcx,1),%rbx + movq %rcx,%r9 + movq %rcx,%rdx +.byte 102,72,15,126,207 + sarq $3+2,%rcx + jmp .Lsqr8x_sub + +.align 32 +.Lsqr8x_nox: call 
bn_sqr8x_internal @@ -861,6 +898,363 @@ bn_sqr8x_mont: .byte 0xf3,0xc3 .cfi_endproc .size bn_sqr8x_mont,.-bn_sqr8x_mont +.type bn_mulx4x_mont,@function +.align 32 +bn_mulx4x_mont: +.cfi_startproc + movq %rsp,%rax +.cfi_def_cfa_register %rax +.Lmulx4x_enter: + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 +.Lmulx4x_prologue: + + shll $3,%r9d + xorq %r10,%r10 + subq %r9,%r10 + movq (%r8),%r8 + leaq -72(%rsp,%r10,1),%rbp + andq $-128,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmulx4x_page_walk + jmp .Lmulx4x_page_walk_done + +.align 16 +.Lmulx4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmulx4x_page_walk +.Lmulx4x_page_walk_done: + + leaq (%rdx,%r9,1),%r10 + + + + + + + + + + + + + movq %r9,0(%rsp) + shrq $5,%r9 + movq %r10,16(%rsp) + subq $1,%r9 + movq %r8,24(%rsp) + movq %rdi,32(%rsp) + movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 + movq %r9,48(%rsp) + jmp .Lmulx4x_body + +.align 32 +.Lmulx4x_body: + leaq 8(%rdx),%rdi + movq (%rdx),%rdx + leaq 64+32(%rsp),%rbx + movq %rdx,%r9 + + mulxq 0(%rsi),%r8,%rax + mulxq 8(%rsi),%r11,%r14 + addq %rax,%r11 + movq %rdi,8(%rsp) + mulxq 16(%rsi),%r12,%r13 + adcq %r14,%r12 + adcq $0,%r13 + + movq %r8,%rdi + imulq 24(%rsp),%r8 + xorq %rbp,%rbp + + mulxq 24(%rsi),%rax,%r14 + movq %r8,%rdx + leaq 32(%rsi),%rsi + adcxq %rax,%r13 + adcxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%rdi + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 +.byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 + movq 48(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + adcxq %rax,%r12 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r12,-16(%rbx) + + jmp .Lmulx4x_1st + +.align 32 +.Lmulx4x_1st: + adcxq %rbp,%r15 + mulxq 0(%rsi),%r10,%rax + adcxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 +.byte 0x67,0x67 + movq %r8,%rdx + adcxq %rax,%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + movq %r11,-32(%rbx) + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz .Lmulx4x_1st + + movq 0(%rsp),%rax + movq 8(%rsp),%rdi + adcq %rbp,%r15 + addq %r15,%r14 + sbbq %r15,%r15 + movq %r14,-8(%rbx) + jmp .Lmulx4x_outer + +.align 32 +.Lmulx4x_outer: + movq (%rdi),%rdx + leaq 8(%rdi),%rdi + subq %rax,%rsi + movq %r15,(%rbx) + leaq 64+32(%rsp),%rbx + subq %rax,%rcx + + mulxq 0(%rsi),%r8,%r11 + xorl %ebp,%ebp + movq %rdx,%r9 + mulxq 8(%rsi),%r14,%r12 + adoxq -32(%rbx),%r8 + adcxq %r14,%r11 + mulxq 16(%rsi),%r15,%r13 + adoxq -24(%rbx),%r11 + adcxq %r15,%r12 + adoxq -16(%rbx),%r12 + adcxq %rbp,%r13 + adoxq %rbp,%r13 + + movq %rdi,8(%rsp) + movq %r8,%r15 + imulq 24(%rsp),%r8 + xorl %ebp,%ebp + + mulxq 24(%rsi),%rax,%r14 + movq %r8,%rdx + adcxq %rax,%r13 + adoxq -8(%rbx),%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + adoxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq 
%rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + leaq 32(%rcx),%rcx + adcxq %rax,%r12 + adoxq %rbp,%r15 + movq 48(%rsp),%rdi + movq %r12,-16(%rbx) + + jmp .Lmulx4x_inner + +.align 32 +.Lmulx4x_inner: + mulxq 0(%rsi),%r10,%rax + adcxq %rbp,%r15 + adoxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq 0(%rbx),%r10 + adoxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq 8(%rbx),%r11 + adoxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 + movq %r8,%rdx + adcxq 16(%rbx),%r12 + adoxq %rax,%r13 + adcxq 24(%rbx),%r13 + adoxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + adcxq %rbp,%r14 + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-32(%rbx) + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz .Lmulx4x_inner + + movq 0(%rsp),%rax + movq 8(%rsp),%rdi + adcq %rbp,%r15 + subq 0(%rbx),%rbp + adcq %r15,%r14 + sbbq %r15,%r15 + movq %r14,-8(%rbx) + + cmpq 16(%rsp),%rdi + jne .Lmulx4x_outer + + leaq 64(%rsp),%rbx + subq %rax,%rcx + negq %r15 + movq %rax,%rdx + shrq $3+2,%rax + movq 32(%rsp),%rdi + jmp .Lmulx4x_sub + +.align 32 +.Lmulx4x_sub: + movq 0(%rbx),%r11 + movq 8(%rbx),%r12 + movq 16(%rbx),%r13 + movq 24(%rbx),%r14 + leaq 32(%rbx),%rbx + sbbq 0(%rcx),%r11 + sbbq 8(%rcx),%r12 + sbbq 16(%rcx),%r13 + sbbq 24(%rcx),%r14 + leaq 32(%rcx),%rcx + movq %r11,0(%rdi) + movq %r12,8(%rdi) + movq %r13,16(%rdi) + movq %r14,24(%rdi) + leaq 32(%rdi),%rdi + decq %rax + jnz .Lmulx4x_sub + + sbbq $0,%r15 + leaq 64(%rsp),%rbx + subq %rdx,%rdi + +.byte 102,73,15,110,207 + pxor %xmm0,%xmm0 + pshufd $0,%xmm1,%xmm1 + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 + jmp .Lmulx4x_cond_copy + +.align 32 +.Lmulx4x_cond_copy: + movdqa 0(%rbx),%xmm2 + movdqa 16(%rbx),%xmm3 + leaq 32(%rbx),%rbx + movdqu 0(%rdi),%xmm4 + movdqu 16(%rdi),%xmm5 + leaq 32(%rdi),%rdi + movdqa %xmm0,-32(%rbx) + movdqa %xmm0,-16(%rbx) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-32(%rdi) + movdqu %xmm5,-16(%rdi) + subq $32,%rdx + jnz .Lmulx4x_cond_copy + + movq %rdx,(%rbx) + + movq $1,%rax + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmulx4x_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size bn_mulx4x_mont,.-bn_mulx4x_mont .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 16 #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont5.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont5.S index 208b1dca3e..c86b3b0a59 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont5.S 
+++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont5.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .extern OPENSSL_ia32cap_P @@ -15,6 +27,8 @@ bn_mul_mont_gather5: .cfi_def_cfa_register %rax testl $7,%r9d jnz .Lmul_enter + leaq OPENSSL_ia32cap_P(%rip),%r11 + movl 8(%r11),%r11d jmp .Lmul4x_enter .align 16 @@ -396,8 +410,7 @@ bn_mul_mont_gather5: movq %r9,%r15 jmp .Lsub .align 16 -.Lsub: - sbbq (%rcx,%r14,8),%rax +.Lsub: sbbq (%rcx,%r14,8),%rax movq %rax,(%rdi,%r14,8) movq 8(%rsi,%r14,8),%rax leaq 1(%r14),%r14 @@ -405,18 +418,19 @@ bn_mul_mont_gather5: jnz .Lsub sbbq $0,%rax + movq $-1,%rbx + xorq %rax,%rbx xorq %r14,%r14 - andq %rax,%rsi - notq %rax - movq %rdi,%rcx - andq %rax,%rcx movq %r9,%r15 - orq %rcx,%rsi -.align 16 + .Lcopy: - movq (%rsi,%r14,8),%rax + movq (%rdi,%r14,8),%rcx + movq (%rsp,%r14,8),%rdx + andq %rbx,%rcx + andq %rax,%rdx movq %r14,(%rsp,%r14,8) - movq %rax,(%rdi,%r14,8) + orq %rcx,%rdx + movq %rdx,(%rdi,%r14,8) leaq 1(%r14),%r14 subq $1,%r15 jnz .Lcopy @@ -451,6 +465,9 @@ bn_mul4x_mont_gather5: movq %rsp,%rax .cfi_def_cfa_register %rax .Lmul4x_enter: + andl $0x80108,%r11d + cmpl $0x80108,%r11d + je .Lmulx4x_enter pushq %rbx .cfi_offset %rbx,-16 pushq %rbp @@ -549,6 +566,7 @@ bn_mul4x_mont_gather5: .type mul4x_internal,@function .align 32 mul4x_internal: +.cfi_startproc shlq $5,%r9 movd 8(%rax),%xmm5 leaq .Linc(%rip),%rax @@ -1070,6 +1088,7 @@ mul4x_internal: movq 16(%rbp),%r14 movq 24(%rbp),%r15 jmp .Lsqr4x_sub_entry +.cfi_endproc .size mul4x_internal,.-mul4x_internal .globl bn_power5 .hidden bn_power5 @@ -1079,6 +1098,11 @@ bn_power5: .cfi_startproc movq %rsp,%rax .cfi_def_cfa_register %rax + leaq OPENSSL_ia32cap_P(%rip),%r11 + movl 8(%r11),%r11d + andl $0x80108,%r11d + cmpl $0x80108,%r11d + je .Lpowerx5_enter pushq %rbx .cfi_offset %rbx,-16 pushq %rbp @@ -1210,6 +1234,7 @@ bn_power5: .align 32 bn_sqr8x_internal: __bn_sqr8x_internal: +.cfi_startproc @@ -1984,10 +2009,12 @@ __bn_sqr8x_reduction: cmpq %rdx,%rdi jb .L8x_reduction_loop .byte 0xf3,0xc3 +.cfi_endproc .size bn_sqr8x_internal,.-bn_sqr8x_internal .type __bn_post4x_internal,@function .align 32 __bn_post4x_internal: +.cfi_startproc movq 0(%rbp),%r12 leaq (%rdi,%r9,1),%rbx movq %r9,%rcx @@ -2038,16 +2065,19 @@ __bn_post4x_internal: movq %r9,%r10 negq %r9 .byte 0xf3,0xc3 +.cfi_endproc .size __bn_post4x_internal,.-__bn_post4x_internal .globl bn_from_montgomery .hidden bn_from_montgomery .type bn_from_montgomery,@function .align 32 bn_from_montgomery: +.cfi_startproc testl $7,%r9d jz bn_from_mont8x xorl %eax,%eax .byte 0xf3,0xc3 +.cfi_endproc .size bn_from_montgomery,.-bn_from_montgomery .type bn_from_mont8x,@function @@ -2164,6 +2194,22 @@ bn_from_mont8x: .byte 0x67 movq %rcx,%rbp .byte 102,73,15,110,218 + leaq OPENSSL_ia32cap_P(%rip),%r11 + movl 8(%r11),%r11d + andl $0x80108,%r11d + cmpl $0x80108,%r11d + jne .Lfrom_mont_nox + + leaq (%rax,%r9,1),%rdi + call __bn_sqrx8x_reduction + call __bn_postx4x_internal + + pxor %xmm0,%xmm0 + leaq 48(%rsp),%rax + jmp .Lfrom_mont_zero + +.align 32 +.Lfrom_mont_nox: call __bn_sqr8x_reduction call __bn_post4x_internal @@ -2202,11 +2248,1356 @@ bn_from_mont8x: .byte 0xf3,0xc3 .cfi_endproc .size bn_from_mont8x,.-bn_from_mont8x 
+.type bn_mulx4x_mont_gather5,@function +.align 32 +bn_mulx4x_mont_gather5: +.cfi_startproc + movq %rsp,%rax +.cfi_def_cfa_register %rax +.Lmulx4x_enter: + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 +.Lmulx4x_prologue: + + shll $3,%r9d + leaq (%r9,%r9,2),%r10 + negq %r9 + movq (%r8),%r8 + + + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb .Lmulx4xsp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp .Lmulx4xsp_done + +.Lmulx4xsp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +.Lmulx4xsp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmulx4x_page_walk + jmp .Lmulx4x_page_walk_done + +.Lmulx4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmulx4x_page_walk +.Lmulx4x_page_walk_done: + + + + + + + + + + + + + + movq %r8,32(%rsp) + movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 +.Lmulx4x_body: + call mulx4x_internal + + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq $1,%rax + + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmulx4x_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 + +.type mulx4x_internal,@function +.align 32 +mulx4x_internal: +.cfi_startproc + movq %r9,8(%rsp) + movq %r9,%r10 + negq %r9 + shlq $5,%r9 + negq %r10 + leaq 128(%rdx,%r9,1),%r13 + shrq $5+5,%r9 + movd 8(%rax),%xmm5 + subq $1,%r9 + leaq .Linc(%rip),%rax + movq %r13,16+8(%rsp) + movq %r9,24+8(%rsp) + movq %rdi,56+8(%rsp) + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 88-112(%rsp,%r10,1),%r10 + leaq 128(%rdx),%rdi + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 +.byte 0x67 + movdqa %xmm1,%xmm2 +.byte 0x67 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 +.byte 0x67 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa 
%xmm1,320(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + + pand 64(%rdi),%xmm0 + pand 80(%rdi),%xmm1 + pand 96(%rdi),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%rdi),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -128(%rdi),%xmm4 + movdqa -112(%rdi),%xmm5 + movdqa -96(%rdi),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%rdi),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%rdi),%xmm4 + movdqa -48(%rdi),%xmm5 + movdqa -32(%rdi),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%rdi),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%rdi),%xmm4 + movdqa 16(%rdi),%xmm5 + movdqa 32(%rdi),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%rdi),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + pxor %xmm1,%xmm0 + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + leaq 256(%rdi),%rdi +.byte 102,72,15,126,194 + leaq 64+32+8(%rsp),%rbx + + movq %rdx,%r9 + mulxq 0(%rsi),%r8,%rax + mulxq 8(%rsi),%r11,%r12 + addq %rax,%r11 + mulxq 16(%rsi),%rax,%r13 + adcq %rax,%r12 + adcq $0,%r13 + mulxq 24(%rsi),%rax,%r14 + + movq %r8,%r15 + imulq 32+8(%rsp),%r8 + xorq %rbp,%rbp + movq %r8,%rdx + + movq %rdi,8+8(%rsp) + + leaq 32(%rsi),%rsi + adcxq %rax,%r13 + adcxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + movq 24+8(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + adcxq %rax,%r12 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r12,-16(%rbx) + jmp .Lmulx4x_1st + +.align 32 +.Lmulx4x_1st: + adcxq %rbp,%r15 + mulxq 0(%rsi),%r10,%rax + adcxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 +.byte 0x67,0x67 + movq %r8,%rdx + adcxq %rax,%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + movq %r11,-32(%rbx) + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz .Lmulx4x_1st + + movq 8(%rsp),%rax + adcq %rbp,%r15 + leaq (%rsi,%rax,1),%rsi + addq %r15,%r14 + movq 8+8(%rsp),%rdi + adcq %rbp,%rbp + movq %r14,-8(%rbx) + jmp .Lmulx4x_outer + +.align 32 +.Lmulx4x_outer: + leaq 16-256(%rbx),%r10 + pxor %xmm4,%xmm4 +.byte 0x67,0x67 + pxor %xmm5,%xmm5 + movdqa -128(%rdi),%xmm0 + movdqa -112(%rdi),%xmm1 + movdqa -96(%rdi),%xmm2 + pand 256(%r10),%xmm0 + movdqa -80(%rdi),%xmm3 + pand 272(%r10),%xmm1 + por %xmm0,%xmm4 + pand 288(%r10),%xmm2 + por %xmm1,%xmm5 + pand 304(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%rdi),%xmm0 + movdqa -48(%rdi),%xmm1 + movdqa -32(%rdi),%xmm2 + pand 320(%r10),%xmm0 + movdqa -16(%rdi),%xmm3 + pand 336(%r10),%xmm1 + por %xmm0,%xmm4 + pand 352(%r10),%xmm2 + por %xmm1,%xmm5 + pand 368(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%rdi),%xmm0 + movdqa 16(%rdi),%xmm1 + movdqa 32(%rdi),%xmm2 + pand 384(%r10),%xmm0 + movdqa 48(%rdi),%xmm3 + pand 
400(%r10),%xmm1 + por %xmm0,%xmm4 + pand 416(%r10),%xmm2 + por %xmm1,%xmm5 + pand 432(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%rdi),%xmm0 + movdqa 80(%rdi),%xmm1 + movdqa 96(%rdi),%xmm2 + pand 448(%r10),%xmm0 + movdqa 112(%rdi),%xmm3 + pand 464(%r10),%xmm1 + por %xmm0,%xmm4 + pand 480(%r10),%xmm2 + por %xmm1,%xmm5 + pand 496(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%rdi),%rdi +.byte 102,72,15,126,194 + + movq %rbp,(%rbx) + leaq 32(%rbx,%rax,1),%rbx + mulxq 0(%rsi),%r8,%r11 + xorq %rbp,%rbp + movq %rdx,%r9 + mulxq 8(%rsi),%r14,%r12 + adoxq -32(%rbx),%r8 + adcxq %r14,%r11 + mulxq 16(%rsi),%r15,%r13 + adoxq -24(%rbx),%r11 + adcxq %r15,%r12 + mulxq 24(%rsi),%rdx,%r14 + adoxq -16(%rbx),%r12 + adcxq %rdx,%r13 + leaq (%rcx,%rax,1),%rcx + leaq 32(%rsi),%rsi + adoxq -8(%rbx),%r13 + adcxq %rbp,%r14 + adoxq %rbp,%r14 + + movq %r8,%r15 + imulq 32+8(%rsp),%r8 + + movq %r8,%rdx + xorq %rbp,%rbp + movq %rdi,8+8(%rsp) + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq 24+8(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r12 + movq %r11,-24(%rbx) + adoxq %rbp,%r15 + movq %r12,-16(%rbx) + leaq 32(%rcx),%rcx + jmp .Lmulx4x_inner + +.align 32 +.Lmulx4x_inner: + mulxq 0(%rsi),%r10,%rax + adcxq %rbp,%r15 + adoxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq 0(%rbx),%r10 + adoxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq 8(%rbx),%r11 + adoxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 + movq %r8,%rdx + adcxq 16(%rbx),%r12 + adoxq %rax,%r13 + adcxq 24(%rbx),%r13 + adoxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + adcxq %rbp,%r14 + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + adoxq %r15,%r13 + movq %r11,-32(%rbx) + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + leaq 32(%rcx),%rcx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + movq %r13,-16(%rbx) + + decq %rdi + jnz .Lmulx4x_inner + + movq 0+8(%rsp),%rax + adcq %rbp,%r15 + subq 0(%rbx),%rdi + movq 8+8(%rsp),%rdi + movq 16+8(%rsp),%r10 + adcq %r15,%r14 + leaq (%rsi,%rax,1),%rsi + adcq %rbp,%rbp + movq %r14,-8(%rbx) + + cmpq %r10,%rdi + jb .Lmulx4x_outer + + movq -8(%rcx),%r10 + movq %rbp,%r8 + movq (%rcx,%rax,1),%r12 + leaq (%rcx,%rax,1),%rbp + movq %rax,%rcx + leaq (%rbx,%rax,1),%rdi + xorl %eax,%eax + xorq %r15,%r15 + subq %r14,%r10 + adcq %r15,%r15 + orq %r15,%r8 + sarq $3+2,%rcx + subq %r8,%rax + movq 56+8(%rsp),%rdx + decq %r12 + movq 8(%rbp),%r13 + xorq %r8,%r8 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp .Lsqrx4x_sub_entry +.cfi_endproc +.size mulx4x_internal,.-mulx4x_internal +.type bn_powerx5,@function +.align 32 +bn_powerx5: +.cfi_startproc + movq %rsp,%rax +.cfi_def_cfa_register %rax +.Lpowerx5_enter: + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 +.Lpowerx5_prologue: + + shll $3,%r9d + leaq (%r9,%r9,2),%r10 + negq %r9 + movq (%r8),%r8 + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb .Lpwrx_sp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp .Lpwrx_sp_done + 
+.align 32 +.Lpwrx_sp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +.Lpwrx_sp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lpwrx_page_walk + jmp .Lpwrx_page_walk_done + +.Lpwrx_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lpwrx_page_walk +.Lpwrx_page_walk_done: + + movq %r9,%r10 + negq %r9 + + + + + + + + + + + + + pxor %xmm0,%xmm0 +.byte 102,72,15,110,207 +.byte 102,72,15,110,209 +.byte 102,73,15,110,218 +.byte 102,72,15,110,226 + movq %r8,32(%rsp) + movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 +.Lpowerx5_body: + + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + + movq %r10,%r9 + movq %rsi,%rdi +.byte 102,72,15,126,209 +.byte 102,72,15,126,226 + movq 40(%rsp),%rax + + call mulx4x_internal + + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq $1,%rax + + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpowerx5_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size bn_powerx5,.-bn_powerx5 + +.globl bn_sqrx8x_internal +.hidden bn_sqrx8x_internal +.hidden bn_sqrx8x_internal +.type bn_sqrx8x_internal,@function +.align 32 +bn_sqrx8x_internal: +__bn_sqrx8x_internal: +.cfi_startproc + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + leaq 48+8(%rsp),%rdi + leaq (%rsi,%r9,1),%rbp + movq %r9,0+8(%rsp) + movq %rbp,8+8(%rsp) + jmp .Lsqr8x_zero_start + +.align 32 +.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 +.Lsqrx8x_zero: +.byte 0x3e + movdqa %xmm0,0(%rdi) + movdqa %xmm0,16(%rdi) + movdqa %xmm0,32(%rdi) + movdqa %xmm0,48(%rdi) +.Lsqr8x_zero_start: + movdqa %xmm0,64(%rdi) + movdqa %xmm0,80(%rdi) + movdqa %xmm0,96(%rdi) + movdqa %xmm0,112(%rdi) + leaq 128(%rdi),%rdi + subq $64,%r9 + jnz .Lsqrx8x_zero + + movq 0(%rsi),%rdx + + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r12,%r12 + xorq %r13,%r13 + xorq %r14,%r14 + xorq %r15,%r15 + leaq 48+8(%rsp),%rdi + xorq %rbp,%rbp + jmp .Lsqrx8x_outer_loop + +.align 32 +.Lsqrx8x_outer_loop: + mulxq 8(%rsi),%r8,%rax + adcxq %r9,%r8 + adoxq %rax,%r10 + mulxq 16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 +.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 + adcxq %r11,%r10 + adoxq %rax,%r12 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 + adcxq %r12,%r11 + adoxq %rax,%r13 + mulxq 40(%rsi),%r12,%rax + adcxq %r13,%r12 + adoxq %rax,%r14 + mulxq 48(%rsi),%r13,%rax + adcxq %r14,%r13 + adoxq %r15,%rax + mulxq 56(%rsi),%r14,%r15 + movq 8(%rsi),%rdx + adcxq %rax,%r14 + adoxq %rbp,%r15 + adcq 64(%rdi),%r15 + movq %r8,8(%rdi) + movq %r9,16(%rdi) + sbbq %rcx,%rcx + xorq %rbp,%rbp + + + mulxq 16(%rsi),%r8,%rbx + mulxq 24(%rsi),%r9,%rax + adcxq %r10,%r8 + adoxq %rbx,%r9 + mulxq 32(%rsi),%r10,%rbx + adcxq %r11,%r9 + adoxq %rax,%r10 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 + adcxq %r12,%r10 + adoxq %rbx,%r11 +.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 + adcxq %r13,%r11 + adoxq %r14,%r12 +.byte 
0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 + movq 16(%rsi),%rdx + adcxq %rax,%r12 + adoxq %rbx,%r13 + adcxq %r15,%r13 + adoxq %rbp,%r14 + adcxq %rbp,%r14 + + movq %r8,24(%rdi) + movq %r9,32(%rdi) + + mulxq 24(%rsi),%r8,%rbx + mulxq 32(%rsi),%r9,%rax + adcxq %r10,%r8 + adoxq %rbx,%r9 + mulxq 40(%rsi),%r10,%rbx + adcxq %r11,%r9 + adoxq %rax,%r10 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 + adcxq %r12,%r10 + adoxq %r13,%r11 +.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 +.byte 0x3e + movq 24(%rsi),%rdx + adcxq %rbx,%r11 + adoxq %rax,%r12 + adcxq %r14,%r12 + movq %r8,40(%rdi) + movq %r9,48(%rdi) + mulxq 32(%rsi),%r8,%rax + adoxq %rbp,%r13 + adcxq %rbp,%r13 + + mulxq 40(%rsi),%r9,%rbx + adcxq %r10,%r8 + adoxq %rax,%r9 + mulxq 48(%rsi),%r10,%rax + adcxq %r11,%r9 + adoxq %r12,%r10 + mulxq 56(%rsi),%r11,%r12 + movq 32(%rsi),%rdx + movq 40(%rsi),%r14 + adcxq %rbx,%r10 + adoxq %rax,%r11 + movq 48(%rsi),%r15 + adcxq %r13,%r11 + adoxq %rbp,%r12 + adcxq %rbp,%r12 + + movq %r8,56(%rdi) + movq %r9,64(%rdi) + + mulxq %r14,%r9,%rax + movq 56(%rsi),%r8 + adcxq %r10,%r9 + mulxq %r15,%r10,%rbx + adoxq %rax,%r10 + adcxq %r11,%r10 + mulxq %r8,%r11,%rax + movq %r14,%rdx + adoxq %rbx,%r11 + adcxq %r12,%r11 + + adcxq %rbp,%rax + + mulxq %r15,%r14,%rbx + mulxq %r8,%r12,%r13 + movq %r15,%rdx + leaq 64(%rsi),%rsi + adcxq %r14,%r11 + adoxq %rbx,%r12 + adcxq %rax,%r12 + adoxq %rbp,%r13 + +.byte 0x67,0x67 + mulxq %r8,%r8,%r14 + adcxq %r8,%r13 + adcxq %rbp,%r14 + + cmpq 8+8(%rsp),%rsi + je .Lsqrx8x_outer_break + + negq %rcx + movq $-8,%rcx + movq %rbp,%r15 + movq 64(%rdi),%r8 + adcxq 72(%rdi),%r9 + adcxq 80(%rdi),%r10 + adcxq 88(%rdi),%r11 + adcq 96(%rdi),%r12 + adcq 104(%rdi),%r13 + adcq 112(%rdi),%r14 + adcq 120(%rdi),%r15 + leaq (%rsi),%rbp + leaq 128(%rdi),%rdi + sbbq %rax,%rax + + movq -64(%rsi),%rdx + movq %rax,16+8(%rsp) + movq %rdi,24+8(%rsp) + + + xorl %eax,%eax + jmp .Lsqrx8x_loop + +.align 32 +.Lsqrx8x_loop: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rax,%rbx + adoxq %r9,%r8 + + mulxq 8(%rbp),%rax,%r9 + adcxq %rax,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rax,%r10 + adcxq %rax,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + movq %rbx,(%rdi,%rcx,8) + movl $0,%ebx + adcxq %rax,%r13 + adoxq %r15,%r14 + +.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 + movq 8(%rsi,%rcx,8),%rdx + adcxq %rax,%r14 + adoxq %rbx,%r15 + adcxq %rbx,%r15 + +.byte 0x67 + incq %rcx + jnz .Lsqrx8x_loop + + leaq 64(%rbp),%rbp + movq $-8,%rcx + cmpq 8+8(%rsp),%rbp + je .Lsqrx8x_break + + subq 16+8(%rsp),%rbx +.byte 0x66 + movq -64(%rsi),%rdx + adcxq 0(%rdi),%r8 + adcxq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi +.byte 0x67 + sbbq %rax,%rax + xorl %ebx,%ebx + movq %rax,16+8(%rsp) + jmp .Lsqrx8x_loop + +.align 32 +.Lsqrx8x_break: + xorq %rbp,%rbp + subq 16+8(%rsp),%rbx + adcxq %rbp,%r8 + movq 24+8(%rsp),%rcx + adcxq %rbp,%r9 + movq 0(%rsi),%rdx + adcq $0,%r10 + movq %r8,0(%rdi) + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + cmpq %rcx,%rdi + je .Lsqrx8x_outer_loop + + movq %r9,8(%rdi) + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + movq 
40(%rcx),%r13 + movq %r14,48(%rdi) + movq 48(%rcx),%r14 + movq %r15,56(%rdi) + movq 56(%rcx),%r15 + movq %rcx,%rdi + jmp .Lsqrx8x_outer_loop + +.align 32 +.Lsqrx8x_outer_break: + movq %r9,72(%rdi) +.byte 102,72,15,126,217 + movq %r10,80(%rdi) + movq %r11,88(%rdi) + movq %r12,96(%rdi) + movq %r13,104(%rdi) + movq %r14,112(%rdi) + leaq 48+8(%rsp),%rdi + movq (%rsi,%rcx,1),%rdx + + movq 8(%rdi),%r11 + xorq %r10,%r10 + movq 0+8(%rsp),%r9 + adoxq %r11,%r11 + movq 16(%rdi),%r12 + movq 24(%rdi),%r13 + + +.align 32 +.Lsqrx4x_shift_n_add: + mulxq %rdx,%rax,%rbx + adoxq %r12,%r12 + adcxq %r10,%rax +.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 +.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 + adoxq %r13,%r13 + adcxq %r11,%rbx + movq 40(%rdi),%r11 + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r10,%r10 + adcxq %r12,%rax + movq 16(%rsi,%rcx,1),%rdx + movq 48(%rdi),%r12 + adoxq %r11,%r11 + adcxq %r13,%rbx + movq 56(%rdi),%r13 + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r12,%r12 + adcxq %r10,%rax + movq 24(%rsi,%rcx,1),%rdx + leaq 32(%rcx),%rcx + movq 64(%rdi),%r10 + adoxq %r13,%r13 + adcxq %r11,%rbx + movq 72(%rdi),%r11 + movq %rax,32(%rdi) + movq %rbx,40(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r10,%r10 + adcxq %r12,%rax + jrcxz .Lsqrx4x_shift_n_add_break +.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 + adoxq %r11,%r11 + adcxq %r13,%rbx + movq 80(%rdi),%r12 + movq 88(%rdi),%r13 + movq %rax,48(%rdi) + movq %rbx,56(%rdi) + leaq 64(%rdi),%rdi + nop + jmp .Lsqrx4x_shift_n_add + +.align 32 +.Lsqrx4x_shift_n_add_break: + adcxq %r13,%rbx + movq %rax,48(%rdi) + movq %rbx,56(%rdi) + leaq 64(%rdi),%rdi +.byte 102,72,15,126,213 +__bn_sqrx8x_reduction: + xorl %eax,%eax + movq 32+8(%rsp),%rbx + movq 48+8(%rsp),%rdx + leaq -64(%rbp,%r9,1),%rcx + + movq %rcx,0+8(%rsp) + movq %rdi,8+8(%rsp) + + leaq 48+8(%rsp),%rdi + jmp .Lsqrx8x_reduction_loop + +.align 32 +.Lsqrx8x_reduction_loop: + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq %rdx,%r8 + imulq %rbx,%rdx + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq %rax,24+8(%rsp) + + leaq 64(%rdi),%rdi + xorq %rsi,%rsi + movq $-8,%rcx + jmp .Lsqrx8x_reduce + +.align 32 +.Lsqrx8x_reduce: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rbx,%rax + adoxq %r9,%r8 + + mulxq 8(%rbp),%rbx,%r9 + adcxq %rbx,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rbx,%r10 + adcxq %rbx,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rbx,%r11 + adcxq %rbx,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 + movq %rdx,%rax + movq %r8,%rdx + adcxq %rbx,%r11 + adoxq %r13,%r12 + + mulxq 32+8(%rsp),%rbx,%rdx + movq %rax,%rdx + movq %rax,64+48+8(%rsp,%rcx,8) + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + adcxq %rax,%r13 + adoxq %r15,%r14 + + mulxq 56(%rbp),%rax,%r15 + movq %rbx,%rdx + adcxq %rax,%r14 + adoxq %rsi,%r15 + adcxq %rsi,%r15 + +.byte 0x67,0x67,0x67 + incq %rcx + jnz .Lsqrx8x_reduce + + movq %rsi,%rax + cmpq 0+8(%rsp),%rbp + jae .Lsqrx8x_no_tail + + movq 48+8(%rsp),%rdx + addq 0(%rdi),%r8 + leaq 64(%rbp),%rbp + movq $-8,%rcx + adcxq 8(%rdi),%r9 + adcxq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi + sbbq %rax,%rax + + xorq %rsi,%rsi + movq %rax,16+8(%rsp) + jmp .Lsqrx8x_tail + +.align 32 +.Lsqrx8x_tail: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rax,%rbx + adoxq %r9,%r8 + + mulxq 8(%rbp),%rax,%r9 + adcxq %rax,%r8 + 
adoxq %r10,%r9 + + mulxq 16(%rbp),%rax,%r10 + adcxq %rax,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + adcxq %rax,%r13 + adoxq %r15,%r14 + + mulxq 56(%rbp),%rax,%r15 + movq 72+48+8(%rsp,%rcx,8),%rdx + adcxq %rax,%r14 + adoxq %rsi,%r15 + movq %rbx,(%rdi,%rcx,8) + movq %r8,%rbx + adcxq %rsi,%r15 + + incq %rcx + jnz .Lsqrx8x_tail + + cmpq 0+8(%rsp),%rbp + jae .Lsqrx8x_tail_done + + subq 16+8(%rsp),%rsi + movq 48+8(%rsp),%rdx + leaq 64(%rbp),%rbp + adcq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi + sbbq %rax,%rax + subq $8,%rcx + + xorq %rsi,%rsi + movq %rax,16+8(%rsp) + jmp .Lsqrx8x_tail + +.align 32 +.Lsqrx8x_tail_done: + xorq %rax,%rax + addq 24+8(%rsp),%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rax + + subq 16+8(%rsp),%rsi +.Lsqrx8x_no_tail: + adcq 0(%rdi),%r8 +.byte 102,72,15,126,217 + adcq 8(%rdi),%r9 + movq 56(%rbp),%rsi +.byte 102,72,15,126,213 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + adcq $0,%rax + + movq 32+8(%rsp),%rbx + movq 64(%rdi,%rcx,1),%rdx + + movq %r8,0(%rdi) + leaq 64(%rdi),%r8 + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + leaq 64(%rdi,%rcx,1),%rdi + cmpq 8+8(%rsp),%r8 + jb .Lsqrx8x_reduction_loop + .byte 0xf3,0xc3 +.cfi_endproc +.size bn_sqrx8x_internal,.-bn_sqrx8x_internal +.align 32 +.type __bn_postx4x_internal,@function +__bn_postx4x_internal: +.cfi_startproc + movq 0(%rbp),%r12 + movq %rcx,%r10 + movq %rcx,%r9 + negq %rax + sarq $3+2,%rcx + +.byte 102,72,15,126,202 +.byte 102,72,15,126,206 + decq %r12 + movq 8(%rbp),%r13 + xorq %r8,%r8 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp .Lsqrx4x_sub_entry + +.align 16 +.Lsqrx4x_sub: + movq 0(%rbp),%r12 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 +.Lsqrx4x_sub_entry: + andnq %rax,%r12,%r12 + leaq 32(%rbp),%rbp + andnq %rax,%r13,%r13 + andnq %rax,%r14,%r14 + andnq %rax,%r15,%r15 + + negq %r8 + adcq 0(%rdi),%r12 + adcq 8(%rdi),%r13 + adcq 16(%rdi),%r14 + adcq 24(%rdi),%r15 + movq %r12,0(%rdx) + leaq 32(%rdi),%rdi + movq %r13,8(%rdx) + sbbq %r8,%r8 + movq %r14,16(%rdx) + movq %r15,24(%rdx) + leaq 32(%rdx),%rdx + + incq %rcx + jnz .Lsqrx4x_sub + + negq %r9 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __bn_postx4x_internal,.-__bn_postx4x_internal .globl bn_scatter5 .hidden bn_scatter5 .type bn_scatter5,@function .align 16 bn_scatter5: +.cfi_startproc cmpl $0,%esi jz .Lscatter_epilogue leaq (%rdx,%rcx,8),%rdx @@ -2219,6 +3610,7 @@ bn_scatter5: jnz .Lscatter .Lscatter_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size bn_scatter5,.-bn_scatter5 .globl bn_gather5 @@ -2226,9 +3618,11 @@ bn_scatter5: .type bn_gather5,@function .align 32 bn_gather5: +.cfi_startproc .LSEH_begin_bn_gather5: .byte 0x4c,0x8d,0x14,0x24 +.cfi_def_cfa_register %r10 .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 leaq .Linc(%rip),%rax andq $-16,%rsp @@ -2382,8 +3776,10 @@ bn_gather5: jnz .Lgather leaq (%r10),%rsp +.cfi_def_cfa_register %rsp .byte 0xf3,0xc3 .LSEH_end_bn_gather5: +.cfi_endproc .size bn_gather5,.-bn_gather5 .align 64 .Linc: @@ 
-2391,3 +3787,4 @@ bn_gather5: .long 2,2, 2,2 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 #endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/test/trampoline-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/test/trampoline-x86_64.S new file mode 100644 index 0000000000..9f7c0d817c --- /dev/null +++ b/packager/third_party/boringssl/linux-x86_64/crypto/test/trampoline-x86_64.S @@ -0,0 +1,518 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text + + + + + + + + +.type abi_test_trampoline, @function +.globl abi_test_trampoline +.hidden abi_test_trampoline +.align 16 +abi_test_trampoline: +.Labi_test_trampoline_seh_begin: +.cfi_startproc + + + + + + + + + + subq $120,%rsp +.cfi_adjust_cfa_offset 120 +.Labi_test_trampoline_seh_prolog_alloc: + movq %r8,48(%rsp) + movq %rbx,64(%rsp) +.cfi_offset rbx, -64 +.Labi_test_trampoline_seh_prolog_rbx: + movq %rbp,72(%rsp) +.cfi_offset rbp, -56 +.Labi_test_trampoline_seh_prolog_rbp: + movq %r12,80(%rsp) +.cfi_offset r12, -48 +.Labi_test_trampoline_seh_prolog_r12: + movq %r13,88(%rsp) +.cfi_offset r13, -40 +.Labi_test_trampoline_seh_prolog_r13: + movq %r14,96(%rsp) +.cfi_offset r14, -32 +.Labi_test_trampoline_seh_prolog_r14: + movq %r15,104(%rsp) +.cfi_offset r15, -24 +.Labi_test_trampoline_seh_prolog_r15: +.Labi_test_trampoline_seh_prolog_end: + movq 0(%rsi),%rbx + movq 8(%rsi),%rbp + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + movq 32(%rsi),%r14 + movq 40(%rsi),%r15 + + movq %rdi,32(%rsp) + movq %rsi,40(%rsp) + + + + + movq %rdx,%r10 + movq %rcx,%r11 + decq %r11 + js .Largs_done + movq (%r10),%rdi + addq $8,%r10 + decq %r11 + js .Largs_done + movq (%r10),%rsi + addq $8,%r10 + decq %r11 + js .Largs_done + movq (%r10),%rdx + addq $8,%r10 + decq %r11 + js .Largs_done + movq (%r10),%rcx + addq $8,%r10 + decq %r11 + js .Largs_done + movq (%r10),%r8 + addq $8,%r10 + decq %r11 + js .Largs_done + movq (%r10),%r9 + addq $8,%r10 + leaq 0(%rsp),%rax +.Largs_loop: + decq %r11 + js .Largs_done + + + + + + + movq %r11,56(%rsp) + movq (%r10),%r11 + movq %r11,(%rax) + movq 56(%rsp),%r11 + + addq $8,%r10 + addq $8,%rax + jmp .Largs_loop + +.Largs_done: + movq 32(%rsp),%rax + movq 48(%rsp),%r10 + testq %r10,%r10 + jz .Lno_unwind + + + pushfq + orq $0x100,0(%rsp) + popfq + + + + nop +.globl abi_test_unwind_start +.hidden abi_test_unwind_start +abi_test_unwind_start: + + call *%rax +.globl abi_test_unwind_return +.hidden abi_test_unwind_return +abi_test_unwind_return: + + + + + pushfq + andq $-0x101,0(%rsp) + popfq +.globl abi_test_unwind_stop +.hidden abi_test_unwind_stop +abi_test_unwind_stop: + + jmp .Lcall_done + +.Lno_unwind: + call *%rax + +.Lcall_done: + + movq 40(%rsp),%rsi + movq %rbx,0(%rsi) + movq %rbp,8(%rsi) + movq %r12,16(%rsi) + movq %r13,24(%rsi) + movq %r14,32(%rsi) + movq %r15,40(%rsi) + movq 64(%rsp),%rbx +.cfi_restore rbx + movq 72(%rsp),%rbp +.cfi_restore rbp + movq 80(%rsp),%r12 +.cfi_restore r12 + 
movq 88(%rsp),%r13 +.cfi_restore r13 + movq 96(%rsp),%r14 +.cfi_restore r14 + movq 104(%rsp),%r15 +.cfi_restore r15 + addq $120,%rsp +.cfi_adjust_cfa_offset -120 + + + .byte 0xf3,0xc3 +.cfi_endproc +.Labi_test_trampoline_seh_end: +.size abi_test_trampoline,.-abi_test_trampoline +.type abi_test_clobber_rax, @function +.globl abi_test_clobber_rax +.hidden abi_test_clobber_rax +.align 16 +abi_test_clobber_rax: + xorq %rax,%rax + .byte 0xf3,0xc3 +.size abi_test_clobber_rax,.-abi_test_clobber_rax +.type abi_test_clobber_rbx, @function +.globl abi_test_clobber_rbx +.hidden abi_test_clobber_rbx +.align 16 +abi_test_clobber_rbx: + xorq %rbx,%rbx + .byte 0xf3,0xc3 +.size abi_test_clobber_rbx,.-abi_test_clobber_rbx +.type abi_test_clobber_rcx, @function +.globl abi_test_clobber_rcx +.hidden abi_test_clobber_rcx +.align 16 +abi_test_clobber_rcx: + xorq %rcx,%rcx + .byte 0xf3,0xc3 +.size abi_test_clobber_rcx,.-abi_test_clobber_rcx +.type abi_test_clobber_rdx, @function +.globl abi_test_clobber_rdx +.hidden abi_test_clobber_rdx +.align 16 +abi_test_clobber_rdx: + xorq %rdx,%rdx + .byte 0xf3,0xc3 +.size abi_test_clobber_rdx,.-abi_test_clobber_rdx +.type abi_test_clobber_rdi, @function +.globl abi_test_clobber_rdi +.hidden abi_test_clobber_rdi +.align 16 +abi_test_clobber_rdi: + xorq %rdi,%rdi + .byte 0xf3,0xc3 +.size abi_test_clobber_rdi,.-abi_test_clobber_rdi +.type abi_test_clobber_rsi, @function +.globl abi_test_clobber_rsi +.hidden abi_test_clobber_rsi +.align 16 +abi_test_clobber_rsi: + xorq %rsi,%rsi + .byte 0xf3,0xc3 +.size abi_test_clobber_rsi,.-abi_test_clobber_rsi +.type abi_test_clobber_rbp, @function +.globl abi_test_clobber_rbp +.hidden abi_test_clobber_rbp +.align 16 +abi_test_clobber_rbp: + xorq %rbp,%rbp + .byte 0xf3,0xc3 +.size abi_test_clobber_rbp,.-abi_test_clobber_rbp +.type abi_test_clobber_r8, @function +.globl abi_test_clobber_r8 +.hidden abi_test_clobber_r8 +.align 16 +abi_test_clobber_r8: + xorq %r8,%r8 + .byte 0xf3,0xc3 +.size abi_test_clobber_r8,.-abi_test_clobber_r8 +.type abi_test_clobber_r9, @function +.globl abi_test_clobber_r9 +.hidden abi_test_clobber_r9 +.align 16 +abi_test_clobber_r9: + xorq %r9,%r9 + .byte 0xf3,0xc3 +.size abi_test_clobber_r9,.-abi_test_clobber_r9 +.type abi_test_clobber_r10, @function +.globl abi_test_clobber_r10 +.hidden abi_test_clobber_r10 +.align 16 +abi_test_clobber_r10: + xorq %r10,%r10 + .byte 0xf3,0xc3 +.size abi_test_clobber_r10,.-abi_test_clobber_r10 +.type abi_test_clobber_r11, @function +.globl abi_test_clobber_r11 +.hidden abi_test_clobber_r11 +.align 16 +abi_test_clobber_r11: + xorq %r11,%r11 + .byte 0xf3,0xc3 +.size abi_test_clobber_r11,.-abi_test_clobber_r11 +.type abi_test_clobber_r12, @function +.globl abi_test_clobber_r12 +.hidden abi_test_clobber_r12 +.align 16 +abi_test_clobber_r12: + xorq %r12,%r12 + .byte 0xf3,0xc3 +.size abi_test_clobber_r12,.-abi_test_clobber_r12 +.type abi_test_clobber_r13, @function +.globl abi_test_clobber_r13 +.hidden abi_test_clobber_r13 +.align 16 +abi_test_clobber_r13: + xorq %r13,%r13 + .byte 0xf3,0xc3 +.size abi_test_clobber_r13,.-abi_test_clobber_r13 +.type abi_test_clobber_r14, @function +.globl abi_test_clobber_r14 +.hidden abi_test_clobber_r14 +.align 16 +abi_test_clobber_r14: + xorq %r14,%r14 + .byte 0xf3,0xc3 +.size abi_test_clobber_r14,.-abi_test_clobber_r14 +.type abi_test_clobber_r15, @function +.globl abi_test_clobber_r15 +.hidden abi_test_clobber_r15 +.align 16 +abi_test_clobber_r15: + xorq %r15,%r15 + .byte 0xf3,0xc3 +.size abi_test_clobber_r15,.-abi_test_clobber_r15 +.type 
abi_test_clobber_xmm0, @function +.globl abi_test_clobber_xmm0 +.hidden abi_test_clobber_xmm0 +.align 16 +abi_test_clobber_xmm0: + pxor %xmm0,%xmm0 + .byte 0xf3,0xc3 +.size abi_test_clobber_xmm0,.-abi_test_clobber_xmm0 +.type abi_test_clobber_xmm1, @function +.globl abi_test_clobber_xmm1 +.hidden abi_test_clobber_xmm1 +.align 16 +abi_test_clobber_xmm1: + pxor %xmm1,%xmm1 + .byte 0xf3,0xc3 +.size abi_test_clobber_xmm1,.-abi_test_clobber_xmm1 +.type abi_test_clobber_xmm2, @function +.globl abi_test_clobber_xmm2 +.hidden abi_test_clobber_xmm2 +.align 16 +abi_test_clobber_xmm2: + pxor %xmm2,%xmm2 + .byte 0xf3,0xc3 +.size abi_test_clobber_xmm2,.-abi_test_clobber_xmm2 +.type abi_test_clobber_xmm3, @function +.globl abi_test_clobber_xmm3 +.hidden abi_test_clobber_xmm3 +.align 16 +abi_test_clobber_xmm3: + pxor %xmm3,%xmm3 + .byte 0xf3,0xc3 +.size abi_test_clobber_xmm3,.-abi_test_clobber_xmm3 +.type abi_test_clobber_xmm4, @function +.globl abi_test_clobber_xmm4 +.hidden abi_test_clobber_xmm4 +.align 16 +abi_test_clobber_xmm4: + pxor %xmm4,%xmm4 + .byte 0xf3,0xc3 +.size abi_test_clobber_xmm4,.-abi_test_clobber_xmm4 +.type abi_test_clobber_xmm5, @function +.globl abi_test_clobber_xmm5 +.hidden abi_test_clobber_xmm5 +.align 16 +abi_test_clobber_xmm5: + pxor %xmm5,%xmm5 + .byte 0xf3,0xc3 +.size abi_test_clobber_xmm5,.-abi_test_clobber_xmm5 +.type abi_test_clobber_xmm6, @function +.globl abi_test_clobber_xmm6 +.hidden abi_test_clobber_xmm6 +.align 16 +abi_test_clobber_xmm6: + pxor %xmm6,%xmm6 + .byte 0xf3,0xc3 +.size abi_test_clobber_xmm6,.-abi_test_clobber_xmm6 +.type abi_test_clobber_xmm7, @function +.globl abi_test_clobber_xmm7 +.hidden abi_test_clobber_xmm7 +.align 16 +abi_test_clobber_xmm7: + pxor %xmm7,%xmm7 + .byte 0xf3,0xc3 +.size abi_test_clobber_xmm7,.-abi_test_clobber_xmm7 +.type abi_test_clobber_xmm8, @function +.globl abi_test_clobber_xmm8 +.hidden abi_test_clobber_xmm8 +.align 16 +abi_test_clobber_xmm8: + pxor %xmm8,%xmm8 + .byte 0xf3,0xc3 +.size abi_test_clobber_xmm8,.-abi_test_clobber_xmm8 +.type abi_test_clobber_xmm9, @function +.globl abi_test_clobber_xmm9 +.hidden abi_test_clobber_xmm9 +.align 16 +abi_test_clobber_xmm9: + pxor %xmm9,%xmm9 + .byte 0xf3,0xc3 +.size abi_test_clobber_xmm9,.-abi_test_clobber_xmm9 +.type abi_test_clobber_xmm10, @function +.globl abi_test_clobber_xmm10 +.hidden abi_test_clobber_xmm10 +.align 16 +abi_test_clobber_xmm10: + pxor %xmm10,%xmm10 + .byte 0xf3,0xc3 +.size abi_test_clobber_xmm10,.-abi_test_clobber_xmm10 +.type abi_test_clobber_xmm11, @function +.globl abi_test_clobber_xmm11 +.hidden abi_test_clobber_xmm11 +.align 16 +abi_test_clobber_xmm11: + pxor %xmm11,%xmm11 + .byte 0xf3,0xc3 +.size abi_test_clobber_xmm11,.-abi_test_clobber_xmm11 +.type abi_test_clobber_xmm12, @function +.globl abi_test_clobber_xmm12 +.hidden abi_test_clobber_xmm12 +.align 16 +abi_test_clobber_xmm12: + pxor %xmm12,%xmm12 + .byte 0xf3,0xc3 +.size abi_test_clobber_xmm12,.-abi_test_clobber_xmm12 +.type abi_test_clobber_xmm13, @function +.globl abi_test_clobber_xmm13 +.hidden abi_test_clobber_xmm13 +.align 16 +abi_test_clobber_xmm13: + pxor %xmm13,%xmm13 + .byte 0xf3,0xc3 +.size abi_test_clobber_xmm13,.-abi_test_clobber_xmm13 +.type abi_test_clobber_xmm14, @function +.globl abi_test_clobber_xmm14 +.hidden abi_test_clobber_xmm14 +.align 16 +abi_test_clobber_xmm14: + pxor %xmm14,%xmm14 + .byte 0xf3,0xc3 +.size abi_test_clobber_xmm14,.-abi_test_clobber_xmm14 +.type abi_test_clobber_xmm15, @function +.globl abi_test_clobber_xmm15 +.hidden abi_test_clobber_xmm15 +.align 16 
+abi_test_clobber_xmm15: + pxor %xmm15,%xmm15 + .byte 0xf3,0xc3 +.size abi_test_clobber_xmm15,.-abi_test_clobber_xmm15 + + + +.type abi_test_bad_unwind_wrong_register, @function +.globl abi_test_bad_unwind_wrong_register +.hidden abi_test_bad_unwind_wrong_register +.align 16 +abi_test_bad_unwind_wrong_register: +.cfi_startproc +.Labi_test_bad_unwind_wrong_register_seh_begin: + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-16 +.Labi_test_bad_unwind_wrong_register_seh_push_r13: + + + + nop + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + .byte 0xf3,0xc3 +.Labi_test_bad_unwind_wrong_register_seh_end: +.cfi_endproc +.size abi_test_bad_unwind_wrong_register,.-abi_test_bad_unwind_wrong_register + + + + +.type abi_test_bad_unwind_temporary, @function +.globl abi_test_bad_unwind_temporary +.hidden abi_test_bad_unwind_temporary +.align 16 +abi_test_bad_unwind_temporary: +.cfi_startproc +.Labi_test_bad_unwind_temporary_seh_begin: + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 +.Labi_test_bad_unwind_temporary_seh_push_r12: + + movq %r12,%rax + incq %rax + movq %rax,(%rsp) + + + + movq %r12,(%rsp) + + + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + .byte 0xf3,0xc3 +.Labi_test_bad_unwind_temporary_seh_end: +.cfi_endproc +.size abi_test_bad_unwind_temporary,.-abi_test_bad_unwind_temporary + + + + +.type abi_test_set_direction_flag, @function +.globl abi_test_get_and_clear_direction_flag +.hidden abi_test_get_and_clear_direction_flag +abi_test_get_and_clear_direction_flag: + pushfq + popq %rax + andq $0x400,%rax + shrq $10,%rax + cld + .byte 0xf3,0xc3 +.size abi_test_get_and_clear_direction_flag,.-abi_test_get_and_clear_direction_flag + + + +.type abi_test_set_direction_flag, @function +.globl abi_test_set_direction_flag +.hidden abi_test_set_direction_flag +abi_test_set_direction_flag: + std + .byte 0xf3,0xc3 +.size abi_test_set_direction_flag,.-abi_test_set_direction_flag +#endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/third_party/sike/asm/fp-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/third_party/sike/asm/fp-x86_64.S new file mode 100644 index 0000000000..07f708aa72 --- /dev/null +++ b/packager/third_party/boringssl/linux-x86_64/crypto/third_party/sike/asm/fp-x86_64.S @@ -0,0 +1,1871 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text + + +.Lp434x2: +.quad 0xFFFFFFFFFFFFFFFE +.quad 0xFFFFFFFFFFFFFFFF +.quad 0xFB82ECF5C5FFFFFF +.quad 0xF78CB8F062B15D47 +.quad 0xD9F8BFAD038A40AC +.quad 0x0004683E4E2EE688 + + +.Lp434p1: +.quad 0xFDC1767AE3000000 +.quad 0x7BC65C783158AEA3 +.quad 0x6CFC5FD681C52056 +.quad 0x0002341F27177344 + +.extern OPENSSL_ia32cap_P +.hidden OPENSSL_ia32cap_P +.hidden OPENSSL_ia32cap_P +.globl sike_fpadd +.hidden sike_fpadd +.type sike_fpadd,@function +sike_fpadd: +.cfi_startproc + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset r12, -16 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset r13, -24 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset r14, -32 + + xorq %rax,%rax + + movq 0(%rdi),%r8 + addq 0(%rsi),%r8 + movq 8(%rdi),%r9 + adcq 8(%rsi),%r9 + movq 16(%rdi),%r10 + adcq 16(%rsi),%r10 + movq 24(%rdi),%r11 + adcq 24(%rsi),%r11 + movq 32(%rdi),%r12 + adcq 32(%rsi),%r12 + movq 40(%rdi),%r13 + adcq 40(%rsi),%r13 + movq 48(%rdi),%r14 + adcq 48(%rsi),%r14 + + movq .Lp434x2(%rip),%rcx + subq %rcx,%r8 + movq 8+.Lp434x2(%rip),%rcx + sbbq %rcx,%r9 + sbbq %rcx,%r10 + movq 16+.Lp434x2(%rip),%rcx + sbbq %rcx,%r11 + movq 24+.Lp434x2(%rip),%rcx + sbbq %rcx,%r12 + movq 32+.Lp434x2(%rip),%rcx + sbbq %rcx,%r13 + movq 40+.Lp434x2(%rip),%rcx + sbbq %rcx,%r14 + + sbbq $0,%rax + + movq .Lp434x2(%rip),%rdi + andq %rax,%rdi + movq 8+.Lp434x2(%rip),%rsi + andq %rax,%rsi + movq 16+.Lp434x2(%rip),%rcx + andq %rax,%rcx + + addq %rdi,%r8 + movq %r8,0(%rdx) + adcq %rsi,%r9 + movq %r9,8(%rdx) + adcq %rsi,%r10 + movq %r10,16(%rdx) + adcq %rcx,%r11 + movq %r11,24(%rdx) + + setc %cl + movq 24+.Lp434x2(%rip),%r8 + andq %rax,%r8 + movq 32+.Lp434x2(%rip),%r9 + andq %rax,%r9 + movq 40+.Lp434x2(%rip),%r10 + andq %rax,%r10 + btq $0,%rcx + + adcq %r8,%r12 + movq %r12,32(%rdx) + adcq %r9,%r13 + movq %r13,40(%rdx) + adcq %r10,%r14 + movq %r14,48(%rdx) + + popq %r14 +.cfi_adjust_cfa_offset -8 + popq %r13 +.cfi_adjust_cfa_offset -8 + popq %r12 +.cfi_adjust_cfa_offset -8 + .byte 0xf3,0xc3 +.cfi_endproc +.globl sike_cswap_asm +.hidden sike_cswap_asm +.type sike_cswap_asm,@function +sike_cswap_asm: + + + movq %rdx,%xmm3 + + + + + + pshufd $68,%xmm3,%xmm3 + + movdqu 0(%rdi),%xmm0 + movdqu 0(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,0(%rdi) + movdqu %xmm1,0(%rsi) + + movdqu 16(%rdi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,16(%rdi) + movdqu %xmm1,16(%rsi) + + movdqu 32(%rdi),%xmm0 + movdqu 32(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,32(%rdi) + movdqu %xmm1,32(%rsi) + + movdqu 48(%rdi),%xmm0 + movdqu 48(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,48(%rdi) + movdqu %xmm1,48(%rsi) + + movdqu 64(%rdi),%xmm0 + movdqu 64(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,64(%rdi) + movdqu %xmm1,64(%rsi) + + movdqu 80(%rdi),%xmm0 + movdqu 80(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,80(%rdi) + movdqu %xmm1,80(%rsi) + 
+ movdqu 96(%rdi),%xmm0 + movdqu 96(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,96(%rdi) + movdqu %xmm1,96(%rsi) + + movdqu 112(%rdi),%xmm0 + movdqu 112(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,112(%rdi) + movdqu %xmm1,112(%rsi) + + movdqu 128(%rdi),%xmm0 + movdqu 128(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,128(%rdi) + movdqu %xmm1,128(%rsi) + + movdqu 144(%rdi),%xmm0 + movdqu 144(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,144(%rdi) + movdqu %xmm1,144(%rsi) + + movdqu 160(%rdi),%xmm0 + movdqu 160(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,160(%rdi) + movdqu %xmm1,160(%rsi) + + movdqu 176(%rdi),%xmm0 + movdqu 176(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,176(%rdi) + movdqu %xmm1,176(%rsi) + + movdqu 192(%rdi),%xmm0 + movdqu 192(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,192(%rdi) + movdqu %xmm1,192(%rsi) + + movdqu 208(%rdi),%xmm0 + movdqu 208(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,208(%rdi) + movdqu %xmm1,208(%rsi) + + .byte 0xf3,0xc3 +.globl sike_fpsub +.hidden sike_fpsub +.type sike_fpsub,@function +sike_fpsub: +.cfi_startproc + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset r12, -16 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset r13, -24 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset r14, -32 + + xorq %rax,%rax + + movq 0(%rdi),%r8 + subq 0(%rsi),%r8 + movq 8(%rdi),%r9 + sbbq 8(%rsi),%r9 + movq 16(%rdi),%r10 + sbbq 16(%rsi),%r10 + movq 24(%rdi),%r11 + sbbq 24(%rsi),%r11 + movq 32(%rdi),%r12 + sbbq 32(%rsi),%r12 + movq 40(%rdi),%r13 + sbbq 40(%rsi),%r13 + movq 48(%rdi),%r14 + sbbq 48(%rsi),%r14 + + sbbq $0x0,%rax + + movq .Lp434x2(%rip),%rdi + andq %rax,%rdi + movq 8+.Lp434x2(%rip),%rsi + andq %rax,%rsi + movq 16+.Lp434x2(%rip),%rcx + andq %rax,%rcx + + addq %rdi,%r8 + movq %r8,0(%rdx) + adcq %rsi,%r9 + movq %r9,8(%rdx) + adcq %rsi,%r10 + movq %r10,16(%rdx) + adcq %rcx,%r11 + movq %r11,24(%rdx) + + setc %cl + movq 24+.Lp434x2(%rip),%r8 + andq %rax,%r8 + movq 32+.Lp434x2(%rip),%r9 + andq %rax,%r9 + movq 40+.Lp434x2(%rip),%r10 + andq %rax,%r10 + btq $0x0,%rcx + + adcq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + + popq %r14 +.cfi_adjust_cfa_offset -8 + popq %r13 +.cfi_adjust_cfa_offset -8 + popq %r12 +.cfi_adjust_cfa_offset -8 + .byte 0xf3,0xc3 +.cfi_endproc +.globl sike_mpadd_asm +.hidden sike_mpadd_asm +.type sike_mpadd_asm,@function +sike_mpadd_asm: +.cfi_startproc + movq 0(%rdi),%r8; + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%rcx + addq 0(%rsi),%r8 + adcq 8(%rsi),%r9 + adcq 16(%rsi),%r10 + adcq 24(%rsi),%r11 + adcq 32(%rsi),%rcx + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %rcx,32(%rdx) + + movq 40(%rdi),%r8 + movq 48(%rdi),%r9 + adcq 40(%rsi),%r8 + adcq 48(%rsi),%r9 + movq %r8,40(%rdx) + movq %r9,48(%rdx) + .byte 0xf3,0xc3 +.cfi_endproc +.globl sike_mpsubx2_asm +.hidden sike_mpsubx2_asm 
+.type sike_mpsubx2_asm,@function +sike_mpsubx2_asm: +.cfi_startproc + xorq %rax,%rax + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%rcx + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %rcx,32(%rdx) + + movq 40(%rdi),%r8 + movq 48(%rdi),%r9 + movq 56(%rdi),%r10 + movq 64(%rdi),%r11 + movq 72(%rdi),%rcx + sbbq 40(%rsi),%r8 + sbbq 48(%rsi),%r9 + sbbq 56(%rsi),%r10 + sbbq 64(%rsi),%r11 + sbbq 72(%rsi),%rcx + movq %r8,40(%rdx) + movq %r9,48(%rdx) + movq %r10,56(%rdx) + movq %r11,64(%rdx) + movq %rcx,72(%rdx) + + movq 80(%rdi),%r8 + movq 88(%rdi),%r9 + movq 96(%rdi),%r10 + movq 104(%rdi),%r11 + sbbq 80(%rsi),%r8 + sbbq 88(%rsi),%r9 + sbbq 96(%rsi),%r10 + sbbq 104(%rsi),%r11 + sbbq $0x0,%rax + movq %r8,80(%rdx) + movq %r9,88(%rdx) + movq %r10,96(%rdx) + movq %r11,104(%rdx) + .byte 0xf3,0xc3 +.cfi_endproc +.globl sike_mpdblsubx2_asm +.hidden sike_mpdblsubx2_asm +.type sike_mpdblsubx2_asm,@function +sike_mpdblsubx2_asm: +.cfi_startproc + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset r12, -16 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset r13, -24 + + xorq %rax,%rax + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + movq 48(%rdx),%rcx + subq 0(%rdi),%r8 + sbbq 8(%rdi),%r9 + sbbq 16(%rdi),%r10 + sbbq 24(%rdi),%r11 + sbbq 32(%rdi),%r12 + sbbq 40(%rdi),%r13 + sbbq 48(%rdi),%rcx + adcq $0x0,%rax + + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%r12 + sbbq 40(%rsi),%r13 + sbbq 48(%rsi),%rcx + adcq $0x0,%rax + + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %rcx,48(%rdx) + + + movq 56(%rdx),%r8 + movq 64(%rdx),%r9 + movq 72(%rdx),%r10 + movq 80(%rdx),%r11 + movq 88(%rdx),%r12 + movq 96(%rdx),%r13 + movq 104(%rdx),%rcx + + subq %rax,%r8 + sbbq 56(%rdi),%r8 + sbbq 64(%rdi),%r9 + sbbq 72(%rdi),%r10 + sbbq 80(%rdi),%r11 + sbbq 88(%rdi),%r12 + sbbq 96(%rdi),%r13 + sbbq 104(%rdi),%rcx + + + subq 56(%rsi),%r8 + sbbq 64(%rsi),%r9 + sbbq 72(%rsi),%r10 + sbbq 80(%rsi),%r11 + sbbq 88(%rsi),%r12 + sbbq 96(%rsi),%r13 + sbbq 104(%rsi),%rcx + + + movq %r8,56(%rdx) + movq %r9,64(%rdx) + movq %r10,72(%rdx) + movq %r11,80(%rdx) + movq %r12,88(%rdx) + movq %r13,96(%rdx) + movq %rcx,104(%rdx) + + popq %r13 +.cfi_adjust_cfa_offset -8 + popq %r12 +.cfi_adjust_cfa_offset -8 + .byte 0xf3,0xc3 +.cfi_endproc + +.Lrdc_bdw: +.cfi_startproc + +.cfi_adjust_cfa_offset 32 +.cfi_offset r12, -16 +.cfi_offset r13, -24 +.cfi_offset r14, -32 +.cfi_offset r15, -40 + + xorq %rax,%rax + movq 0+0(%rdi),%rdx + mulxq 0+.Lp434p1(%rip),%r8,%r9 + mulxq 8+.Lp434p1(%rip),%r12,%r10 + mulxq 16+.Lp434p1(%rip),%r13,%r11 + + adoxq %r12,%r9 + adoxq %r13,%r10 + + mulxq 24+.Lp434p1(%rip),%r13,%r12 + adoxq %r13,%r11 + adoxq %rax,%r12 + + xorq %rax,%rax + movq 0+8(%rdi),%rdx + mulxq 0+.Lp434p1(%rip),%r13,%rcx + adcxq %r13,%r9 + adcxq %rcx,%r10 + + mulxq 8+.Lp434p1(%rip),%rcx,%r13 + adcxq %r13,%r11 + adoxq %rcx,%r10 + + mulxq 16+.Lp434p1(%rip),%rcx,%r13 + adcxq %r13,%r12 + adoxq %rcx,%r11 + + mulxq 24+.Lp434p1(%rip),%rcx,%r13 + adcxq %rax,%r13 + adoxq %rcx,%r12 + adoxq %rax,%r13 + + xorq %rcx,%rcx + addq 24(%rdi),%r8 + adcq 32(%rdi),%r9 + adcq 40(%rdi),%r10 + adcq 48(%rdi),%r11 + adcq 56(%rdi),%r12 + adcq 64(%rdi),%r13 + adcq 72(%rdi),%rcx + movq 
%r8,24(%rdi) + movq %r9,32(%rdi) + movq %r10,40(%rdi) + movq %r11,48(%rdi) + movq %r12,56(%rdi) + movq %r13,64(%rdi) + movq %rcx,72(%rdi) + movq 80(%rdi),%r8 + movq 88(%rdi),%r9 + movq 96(%rdi),%r10 + movq 104(%rdi),%r11 + adcq $0x0,%r8 + adcq $0x0,%r9 + adcq $0x0,%r10 + adcq $0x0,%r11 + movq %r8,80(%rdi) + movq %r9,88(%rdi) + movq %r10,96(%rdi) + movq %r11,104(%rdi) + + xorq %rax,%rax + movq 16+0(%rdi),%rdx + mulxq 0+.Lp434p1(%rip),%r8,%r9 + mulxq 8+.Lp434p1(%rip),%r12,%r10 + mulxq 16+.Lp434p1(%rip),%r13,%r11 + + adoxq %r12,%r9 + adoxq %r13,%r10 + + mulxq 24+.Lp434p1(%rip),%r13,%r12 + adoxq %r13,%r11 + adoxq %rax,%r12 + + xorq %rax,%rax + movq 16+8(%rdi),%rdx + mulxq 0+.Lp434p1(%rip),%r13,%rcx + adcxq %r13,%r9 + adcxq %rcx,%r10 + + mulxq 8+.Lp434p1(%rip),%rcx,%r13 + adcxq %r13,%r11 + adoxq %rcx,%r10 + + mulxq 16+.Lp434p1(%rip),%rcx,%r13 + adcxq %r13,%r12 + adoxq %rcx,%r11 + + mulxq 24+.Lp434p1(%rip),%rcx,%r13 + adcxq %rax,%r13 + adoxq %rcx,%r12 + adoxq %rax,%r13 + + xorq %rcx,%rcx + addq 40(%rdi),%r8 + adcq 48(%rdi),%r9 + adcq 56(%rdi),%r10 + adcq 64(%rdi),%r11 + adcq 72(%rdi),%r12 + adcq 80(%rdi),%r13 + adcq 88(%rdi),%rcx + movq %r8,40(%rdi) + movq %r9,48(%rdi) + movq %r10,56(%rdi) + movq %r11,64(%rdi) + movq %r12,72(%rdi) + movq %r13,80(%rdi) + movq %rcx,88(%rdi) + movq 96(%rdi),%r8 + movq 104(%rdi),%r9 + adcq $0x0,%r8 + adcq $0x0,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + + xorq %rax,%rax + movq 32+0(%rdi),%rdx + mulxq 0+.Lp434p1(%rip),%r8,%r9 + mulxq 8+.Lp434p1(%rip),%r12,%r10 + mulxq 16+.Lp434p1(%rip),%r13,%r11 + + adoxq %r12,%r9 + adoxq %r13,%r10 + + mulxq 24+.Lp434p1(%rip),%r13,%r12 + adoxq %r13,%r11 + adoxq %rax,%r12 + + xorq %rax,%rax + movq 32+8(%rdi),%rdx + mulxq 0+.Lp434p1(%rip),%r13,%rcx + adcxq %r13,%r9 + adcxq %rcx,%r10 + + mulxq 8+.Lp434p1(%rip),%rcx,%r13 + adcxq %r13,%r11 + adoxq %rcx,%r10 + + mulxq 16+.Lp434p1(%rip),%rcx,%r13 + adcxq %r13,%r12 + adoxq %rcx,%r11 + + mulxq 24+.Lp434p1(%rip),%rcx,%r13 + adcxq %rax,%r13 + adoxq %rcx,%r12 + adoxq %rax,%r13 + + xorq %rcx,%rcx + addq 56(%rdi),%r8 + adcq 64(%rdi),%r9 + adcq 72(%rdi),%r10 + adcq 80(%rdi),%r11 + adcq 88(%rdi),%r12 + adcq 96(%rdi),%r13 + adcq 104(%rdi),%rcx + movq %r8,0(%rsi) + movq %r9,8(%rsi) + movq %r10,72(%rdi) + movq %r11,80(%rdi) + movq %r12,88(%rdi) + movq %r13,96(%rdi) + movq %rcx,104(%rdi) + + xorq %rax,%rax + movq 48(%rdi),%rdx + mulxq 0+.Lp434p1(%rip),%r8,%r9 + mulxq 8+.Lp434p1(%rip),%r12,%r10 + mulxq 16+.Lp434p1(%rip),%r13,%r11 + + adoxq %r12,%r9 + adoxq %r13,%r10 + + mulxq 24+.Lp434p1(%rip),%r13,%r12 + adoxq %r13,%r11 + adoxq %rax,%r12 + + addq 72(%rdi),%r8 + adcq 80(%rdi),%r9 + adcq 88(%rdi),%r10 + adcq 96(%rdi),%r11 + adcq 104(%rdi),%r12 + movq %r8,16(%rsi) + movq %r9,24(%rsi) + movq %r10,32(%rsi) + movq %r11,40(%rsi) + movq %r12,48(%rsi) + + + popq %r15 +.cfi_adjust_cfa_offset -8 +.cfi_same_value r15 + popq %r14 +.cfi_adjust_cfa_offset -8 +.cfi_same_value r14 + popq %r13 +.cfi_adjust_cfa_offset -8 +.cfi_same_value r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_same_value r12 + .byte 0xf3,0xc3 +.cfi_endproc +.globl sike_fprdc +.hidden sike_fprdc +.type sike_fprdc,@function +sike_fprdc: +.cfi_startproc + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset r12, -16 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset r13, -24 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset r14, -32 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset r15, -40 + + + + leaq OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx + cmpl $0x80100,%ecx + je .Lrdc_bdw + + + + + movq 0+0(%rdi),%r14 + movq 
0+.Lp434p1(%rip),%rax + mulq %r14 + xorq %r10,%r10 + movq %rax,%r8 + movq %rdx,%r9 + + + movq 8+.Lp434p1(%rip),%rax + mulq %r14 + xorq %r11,%r11 + addq %rax,%r9 + adcq %rdx,%r10 + + + movq 0+8(%rdi),%rcx + movq 0+.Lp434p1(%rip),%rax + mulq %rcx + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r11 + + + xorq %r12,%r12 + movq 16+.Lp434p1(%rip),%rax + mulq %r14 + addq %rax,%r10 + adcq %rdx,%r11 + adcq $0x0,%r12 + + + movq 8+.Lp434p1(%rip),%rax + mulq %rcx + addq %rax,%r10 + adcq %rdx,%r11 + adcq $0x0,%r12 + + + movq 24+.Lp434p1(%rip),%rax + mulq %r14 + xorq %r13,%r13 + addq %rax,%r11 + adcq %rdx,%r12 + adcq $0x0,%r13 + + + movq 16+.Lp434p1(%rip),%rax + mulq %rcx + addq %rax,%r11 + adcq %rdx,%r12 + adcq $0x0,%r13 + + + movq 24+.Lp434p1(%rip),%rax + mulq %rcx + addq %rax,%r12 + adcq %rdx,%r13 + + + xorq %rcx,%rcx + addq 24(%rdi),%r8 + adcq 32(%rdi),%r9 + adcq 40(%rdi),%r10 + adcq 48(%rdi),%r11 + adcq 56(%rdi),%r12 + adcq 64(%rdi),%r13 + adcq 72(%rdi),%rcx + movq %r8,24(%rdi) + movq %r9,32(%rdi) + movq %r10,40(%rdi) + movq %r11,48(%rdi) + movq %r12,56(%rdi) + movq %r13,64(%rdi) + movq %rcx,72(%rdi) + movq 80(%rdi),%r8 + movq 88(%rdi),%r9 + movq 96(%rdi),%r10 + movq 104(%rdi),%r11 + adcq $0x0,%r8 + adcq $0x0,%r9 + adcq $0x0,%r10 + adcq $0x0,%r11 + movq %r8,80(%rdi) + movq %r9,88(%rdi) + movq %r10,96(%rdi) + movq %r11,104(%rdi) + + + movq 16+0(%rdi),%r14 + movq 0+.Lp434p1(%rip),%rax + mulq %r14 + xorq %r10,%r10 + movq %rax,%r8 + movq %rdx,%r9 + + + movq 8+.Lp434p1(%rip),%rax + mulq %r14 + xorq %r11,%r11 + addq %rax,%r9 + adcq %rdx,%r10 + + + movq 16+8(%rdi),%rcx + movq 0+.Lp434p1(%rip),%rax + mulq %rcx + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r11 + + + xorq %r12,%r12 + movq 16+.Lp434p1(%rip),%rax + mulq %r14 + addq %rax,%r10 + adcq %rdx,%r11 + adcq $0x0,%r12 + + + movq 8+.Lp434p1(%rip),%rax + mulq %rcx + addq %rax,%r10 + adcq %rdx,%r11 + adcq $0x0,%r12 + + + movq 24+.Lp434p1(%rip),%rax + mulq %r14 + xorq %r13,%r13 + addq %rax,%r11 + adcq %rdx,%r12 + adcq $0x0,%r13 + + + movq 16+.Lp434p1(%rip),%rax + mulq %rcx + addq %rax,%r11 + adcq %rdx,%r12 + adcq $0x0,%r13 + + + movq 24+.Lp434p1(%rip),%rax + mulq %rcx + addq %rax,%r12 + adcq %rdx,%r13 + + + xorq %rcx,%rcx + addq 40(%rdi),%r8 + adcq 48(%rdi),%r9 + adcq 56(%rdi),%r10 + adcq 64(%rdi),%r11 + adcq 72(%rdi),%r12 + adcq 80(%rdi),%r13 + adcq 88(%rdi),%rcx + movq %r8,40(%rdi) + movq %r9,48(%rdi) + movq %r10,56(%rdi) + movq %r11,64(%rdi) + movq %r12,72(%rdi) + movq %r13,80(%rdi) + movq %rcx,88(%rdi) + movq 96(%rdi),%r8 + movq 104(%rdi),%r9 + adcq $0x0,%r8 + adcq $0x0,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + + + movq 32+0(%rdi),%r14 + movq 0+.Lp434p1(%rip),%rax + mulq %r14 + xorq %r10,%r10 + movq %rax,%r8 + movq %rdx,%r9 + + + movq 8+.Lp434p1(%rip),%rax + mulq %r14 + xorq %r11,%r11 + addq %rax,%r9 + adcq %rdx,%r10 + + + movq 32+8(%rdi),%rcx + movq 0+.Lp434p1(%rip),%rax + mulq %rcx + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r11 + + + xorq %r12,%r12 + movq 16+.Lp434p1(%rip),%rax + mulq %r14 + addq %rax,%r10 + adcq %rdx,%r11 + adcq $0x0,%r12 + + + movq 8+.Lp434p1(%rip),%rax + mulq %rcx + addq %rax,%r10 + adcq %rdx,%r11 + adcq $0x0,%r12 + + + movq 24+.Lp434p1(%rip),%rax + mulq %r14 + xorq %r13,%r13 + addq %rax,%r11 + adcq %rdx,%r12 + adcq $0x0,%r13 + + + movq 16+.Lp434p1(%rip),%rax + mulq %rcx + addq %rax,%r11 + adcq %rdx,%r12 + adcq $0x0,%r13 + + + movq 24+.Lp434p1(%rip),%rax + mulq %rcx + addq %rax,%r12 + adcq %rdx,%r13 + + + xorq %rcx,%rcx + addq 56(%rdi),%r8 + adcq 64(%rdi),%r9 + adcq 72(%rdi),%r10 + adcq 80(%rdi),%r11 + adcq 88(%rdi),%r12 + 
adcq 96(%rdi),%r13 + adcq 104(%rdi),%rcx + movq %r8,0(%rsi) + movq %r9,8(%rsi) + movq %r10,72(%rdi) + movq %r11,80(%rdi) + movq %r12,88(%rdi) + movq %r13,96(%rdi) + movq %rcx,104(%rdi) + + movq 48(%rdi),%r13 + + xorq %r10,%r10 + movq 0+.Lp434p1(%rip),%rax + mulq %r13 + movq %rax,%r8 + movq %rdx,%r9 + + xorq %r11,%r11 + movq 8+.Lp434p1(%rip),%rax + mulq %r13 + addq %rax,%r9 + adcq %rdx,%r10 + + xorq %r12,%r12 + movq 16+.Lp434p1(%rip),%rax + mulq %r13 + addq %rax,%r10 + adcq %rdx,%r11 + + movq 24+.Lp434p1(%rip),%rax + mulq %r13 + addq %rax,%r11 + adcq %rdx,%r12 + + addq 72(%rdi),%r8 + adcq 80(%rdi),%r9 + adcq 88(%rdi),%r10 + adcq 96(%rdi),%r11 + adcq 104(%rdi),%r12 + movq %r8,16(%rsi) + movq %r9,24(%rsi) + movq %r10,32(%rsi) + movq %r11,40(%rsi) + movq %r12,48(%rsi) + + + popq %r15 +.cfi_adjust_cfa_offset -8 + popq %r14 +.cfi_adjust_cfa_offset -8 + popq %r13 +.cfi_adjust_cfa_offset -8 + popq %r12 +.cfi_adjust_cfa_offset -8 + .byte 0xf3,0xc3 +.cfi_endproc +.Lmul_bdw: +.cfi_startproc + +.cfi_adjust_cfa_offset 32 +.cfi_offset r12, -16 +.cfi_offset r13, -24 +.cfi_offset r14, -32 +.cfi_offset r15, -40 + + + movq %rdx,%rcx + xorq %rax,%rax + + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset rbx, -48 + pushq %rbp +.cfi_offset rbp, -56 +.cfi_adjust_cfa_offset 8 + subq $96,%rsp +.cfi_adjust_cfa_offset 96 + + addq 32(%rdi),%r8 + adcq 40(%rdi),%r9 + adcq 48(%rdi),%r10 + adcq $0x0,%r11 + sbbq $0x0,%rax + movq %r8,0(%rsp) + movq %r9,8(%rsp) + movq %r10,16(%rsp) + movq %r11,24(%rsp) + + + xorq %rbx,%rbx + movq 0(%rsi),%r12 + movq 8(%rsi),%r13 + movq 16(%rsi),%r14 + movq 24(%rsi),%r15 + addq 32(%rsi),%r12 + adcq 40(%rsi),%r13 + adcq 48(%rsi),%r14 + adcq $0x0,%r15 + sbbq $0x0,%rbx + movq %r12,32(%rsp) + movq %r13,40(%rsp) + movq %r14,48(%rsp) + movq %r15,56(%rsp) + + + andq %rax,%r12 + andq %rax,%r13 + andq %rax,%r14 + andq %rax,%r15 + + + andq %rbx,%r8 + andq %rbx,%r9 + andq %rbx,%r10 + andq %rbx,%r11 + + + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + movq %r8,64(%rsp) + movq %r9,72(%rsp) + movq %r10,80(%rsp) + movq %r11,88(%rsp) + + + movq 0+0(%rsp),%rdx + mulxq 32+0(%rsp),%r9,%r8 + movq %r9,0+0(%rsp) + mulxq 32+8(%rsp),%r10,%r9 + xorq %rax,%rax + adoxq %r10,%r8 + mulxq 32+16(%rsp),%r11,%r10 + adoxq %r11,%r9 + mulxq 32+24(%rsp),%r12,%r11 + adoxq %r12,%r10 + + movq 0+8(%rsp),%rdx + mulxq 32+0(%rsp),%r12,%r13 + adoxq %rax,%r11 + xorq %rax,%rax + mulxq 32+8(%rsp),%r15,%r14 + adoxq %r8,%r12 + movq %r12,0+8(%rsp) + adcxq %r15,%r13 + mulxq 32+16(%rsp),%rbx,%r15 + adcxq %rbx,%r14 + adoxq %r9,%r13 + mulxq 32+24(%rsp),%rbp,%rbx + adcxq %rbp,%r15 + adcxq %rax,%rbx + adoxq %r10,%r14 + + movq 0+16(%rsp),%rdx + mulxq 32+0(%rsp),%r8,%r9 + adoxq %r11,%r15 + adoxq %rax,%rbx + xorq %rax,%rax + mulxq 32+8(%rsp),%r11,%r10 + adoxq %r13,%r8 + movq %r8,0+16(%rsp) + adcxq %r11,%r9 + mulxq 32+16(%rsp),%r12,%r11 + adcxq %r12,%r10 + adoxq %r14,%r9 + mulxq 32+24(%rsp),%rbp,%r12 + adcxq %rbp,%r11 + adcxq %rax,%r12 + + adoxq %r15,%r10 + adoxq %rbx,%r11 + adoxq %rax,%r12 + + movq 0+24(%rsp),%rdx + mulxq 32+0(%rsp),%r8,%r13 + xorq %rax,%rax + mulxq 32+8(%rsp),%r15,%r14 + adcxq %r15,%r13 + adoxq %r8,%r9 + mulxq 32+16(%rsp),%rbx,%r15 + adcxq %rbx,%r14 + adoxq %r13,%r10 + mulxq 32+24(%rsp),%rbp,%rbx + adcxq %rbp,%r15 + adcxq %rax,%rbx + adoxq %r14,%r11 + adoxq %r15,%r12 + adoxq %rax,%rbx + movq %r9,0+24(%rsp) + movq %r10,0+32(%rsp) + movq %r11,0+40(%rsp) + movq %r12,0+48(%rsp) + movq %rbx,0+56(%rsp) + + + + movq 0+0(%rdi),%rdx + 
mulxq 0+0(%rsi),%r9,%r8 + movq %r9,0+0(%rcx) + mulxq 0+8(%rsi),%r10,%r9 + xorq %rax,%rax + adoxq %r10,%r8 + mulxq 0+16(%rsi),%r11,%r10 + adoxq %r11,%r9 + mulxq 0+24(%rsi),%r12,%r11 + adoxq %r12,%r10 + + movq 0+8(%rdi),%rdx + mulxq 0+0(%rsi),%r12,%r13 + adoxq %rax,%r11 + xorq %rax,%rax + mulxq 0+8(%rsi),%r15,%r14 + adoxq %r8,%r12 + movq %r12,0+8(%rcx) + adcxq %r15,%r13 + mulxq 0+16(%rsi),%rbx,%r15 + adcxq %rbx,%r14 + adoxq %r9,%r13 + mulxq 0+24(%rsi),%rbp,%rbx + adcxq %rbp,%r15 + adcxq %rax,%rbx + adoxq %r10,%r14 + + movq 0+16(%rdi),%rdx + mulxq 0+0(%rsi),%r8,%r9 + adoxq %r11,%r15 + adoxq %rax,%rbx + xorq %rax,%rax + mulxq 0+8(%rsi),%r11,%r10 + adoxq %r13,%r8 + movq %r8,0+16(%rcx) + adcxq %r11,%r9 + mulxq 0+16(%rsi),%r12,%r11 + adcxq %r12,%r10 + adoxq %r14,%r9 + mulxq 0+24(%rsi),%rbp,%r12 + adcxq %rbp,%r11 + adcxq %rax,%r12 + + adoxq %r15,%r10 + adoxq %rbx,%r11 + adoxq %rax,%r12 + + movq 0+24(%rdi),%rdx + mulxq 0+0(%rsi),%r8,%r13 + xorq %rax,%rax + mulxq 0+8(%rsi),%r15,%r14 + adcxq %r15,%r13 + adoxq %r8,%r9 + mulxq 0+16(%rsi),%rbx,%r15 + adcxq %rbx,%r14 + adoxq %r13,%r10 + mulxq 0+24(%rsi),%rbp,%rbx + adcxq %rbp,%r15 + adcxq %rax,%rbx + adoxq %r14,%r11 + adoxq %r15,%r12 + adoxq %rax,%rbx + movq %r9,0+24(%rcx) + movq %r10,0+32(%rcx) + movq %r11,0+40(%rcx) + movq %r12,0+48(%rcx) + movq %rbx,0+56(%rcx) + + + + movq 32+0(%rdi),%rdx + mulxq 32+0(%rsi),%r9,%r8 + movq %r9,64+0(%rcx) + mulxq 32+8(%rsi),%r10,%r9 + xorq %rax,%rax + adoxq %r10,%r8 + mulxq 32+16(%rsi),%r11,%r10 + adoxq %r11,%r9 + + movq 32+8(%rdi),%rdx + mulxq 32+0(%rsi),%r12,%r11 + adoxq %rax,%r10 + xorq %rax,%rax + + mulxq 32+8(%rsi),%r14,%r13 + adoxq %r8,%r12 + movq %r12,64+8(%rcx) + adcxq %r14,%r11 + + mulxq 32+16(%rsi),%r8,%r14 + adoxq %r9,%r11 + adcxq %r8,%r13 + adcxq %rax,%r14 + adoxq %r10,%r13 + + movq 32+16(%rdi),%rdx + mulxq 32+0(%rsi),%r8,%r9 + adoxq %rax,%r14 + xorq %rax,%rax + + mulxq 32+8(%rsi),%r10,%r12 + adoxq %r11,%r8 + movq %r8,64+16(%rcx) + adcxq %r13,%r9 + + mulxq 32+16(%rsi),%r11,%r8 + adcxq %r14,%r12 + adcxq %rax,%r8 + adoxq %r10,%r9 + adoxq %r12,%r11 + adoxq %rax,%r8 + movq %r9,64+24(%rcx) + movq %r11,64+32(%rcx) + movq %r8,64+40(%rcx) + + + + + movq 64(%rsp),%r8 + movq 72(%rsp),%r9 + movq 80(%rsp),%r10 + movq 88(%rsp),%r11 + + movq 32(%rsp),%rax + addq %rax,%r8 + movq 40(%rsp),%rax + adcq %rax,%r9 + movq 48(%rsp),%rax + adcq %rax,%r10 + movq 56(%rsp),%rax + adcq %rax,%r11 + + + movq 0(%rsp),%r12 + movq 8(%rsp),%r13 + movq 16(%rsp),%r14 + movq 24(%rsp),%r15 + subq 0(%rcx),%r12 + sbbq 8(%rcx),%r13 + sbbq 16(%rcx),%r14 + sbbq 24(%rcx),%r15 + sbbq 32(%rcx),%r8 + sbbq 40(%rcx),%r9 + sbbq 48(%rcx),%r10 + sbbq 56(%rcx),%r11 + + + subq 64(%rcx),%r12 + sbbq 72(%rcx),%r13 + sbbq 80(%rcx),%r14 + sbbq 88(%rcx),%r15 + sbbq 96(%rcx),%r8 + sbbq 104(%rcx),%r9 + sbbq $0x0,%r10 + sbbq $0x0,%r11 + + addq 32(%rcx),%r12 + movq %r12,32(%rcx) + adcq 40(%rcx),%r13 + movq %r13,40(%rcx) + adcq 48(%rcx),%r14 + movq %r14,48(%rcx) + adcq 56(%rcx),%r15 + movq %r15,56(%rcx) + adcq 64(%rcx),%r8 + movq %r8,64(%rcx) + adcq 72(%rcx),%r9 + movq %r9,72(%rcx) + adcq 80(%rcx),%r10 + movq %r10,80(%rcx) + adcq 88(%rcx),%r11 + movq %r11,88(%rcx) + movq 96(%rcx),%r12 + adcq $0x0,%r12 + movq %r12,96(%rcx) + movq 104(%rcx),%r13 + adcq $0x0,%r13 + movq %r13,104(%rcx) + + addq $96,%rsp +.cfi_adjust_cfa_offset -96 + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_same_value rbp + popq %rbx +.cfi_adjust_cfa_offset -8 +.cfi_same_value rbx + + + popq %r15 +.cfi_adjust_cfa_offset -8 +.cfi_same_value r15 + popq %r14 +.cfi_adjust_cfa_offset -8 +.cfi_same_value r14 + 
popq %r13 +.cfi_adjust_cfa_offset -8 +.cfi_same_value r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_same_value r12 + .byte 0xf3,0xc3 +.cfi_endproc + +.globl sike_mpmul +.hidden sike_mpmul +.type sike_mpmul,@function +sike_mpmul: +.cfi_startproc + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset r12, -16 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset r13, -24 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset r14, -32 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset r15, -40 + + + + leaq OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx + cmpl $0x80100,%ecx + je .Lmul_bdw + + + + movq %rdx,%rcx + + subq $112,%rsp +.cfi_adjust_cfa_offset 112 + + + xorq %rax,%rax + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + xorq %r11,%r11 + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + + sbbq $0,%rax + movq %rax,64(%rsp) + + movq %r8,0(%rcx) + movq %r9,8(%rcx) + movq %r10,16(%rcx) + movq %r11,24(%rcx) + + + xorq %rdx,%rdx + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + xorq %r15,%r15 + addq 0(%rsi),%r12 + adcq 8(%rsi),%r13 + adcq 16(%rsi),%r14 + adcq 24(%rsi),%r15 + sbbq $0x0,%rdx + + movq %rdx,72(%rsp) + + + movq (%rcx),%rax + mulq %r12 + movq %rax,(%rsp) + movq %rdx,%r8 + + xorq %r9,%r9 + movq (%rcx),%rax + mulq %r13 + addq %rax,%r8 + adcq %rdx,%r9 + + xorq %r10,%r10 + movq 8(%rcx),%rax + mulq %r12 + addq %rax,%r8 + movq %r8,8(%rsp) + adcq %rdx,%r9 + adcq $0x0,%r10 + + xorq %r8,%r8 + movq (%rcx),%rax + mulq %r14 + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq 16(%rcx),%rax + mulq %r12 + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq 8(%rcx),%rax + mulq %r13 + addq %rax,%r9 + movq %r9,16(%rsp) + adcq %rdx,%r10 + adcq $0x0,%r8 + + xorq %r9,%r9 + movq (%rcx),%rax + mulq %r15 + addq %rax,%r10 + adcq %rdx,%r8 + adcq $0x0,%r9 + + movq 24(%rcx),%rax + mulq %r12 + addq %rax,%r10 + adcq %rdx,%r8 + adcq $0x0,%r9 + + movq 8(%rcx),%rax + mulq %r14 + addq %rax,%r10 + adcq %rdx,%r8 + adcq $0x0,%r9 + + movq 16(%rcx),%rax + mulq %r13 + addq %rax,%r10 + movq %r10,24(%rsp) + adcq %rdx,%r8 + adcq $0x0,%r9 + + xorq %r10,%r10 + movq 8(%rcx),%rax + mulq %r15 + addq %rax,%r8 + adcq %rdx,%r9 + adcq $0x0,%r10 + + movq 24(%rcx),%rax + mulq %r13 + addq %rax,%r8 + adcq %rdx,%r9 + adcq $0x0,%r10 + + movq 16(%rcx),%rax + mulq %r14 + addq %rax,%r8 + movq %r8,32(%rsp) + adcq %rdx,%r9 + adcq $0x0,%r10 + + xorq %r11,%r11 + movq 16(%rcx),%rax + mulq %r15 + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r11 + + movq 24(%rcx),%rax + mulq %r14 + addq %rax,%r9 + movq %r9,40(%rsp) + adcq %rdx,%r10 + adcq $0x0,%r11 + + movq 24(%rcx),%rax + mulq %r15 + addq %rax,%r10 + movq %r10,48(%rsp) + adcq %rdx,%r11 + movq %r11,56(%rsp) + + + movq 64(%rsp),%rax + andq %rax,%r12 + andq %rax,%r13 + andq %rax,%r14 + andq %rax,%r15 + + + movq 72(%rsp),%rax + movq 0(%rcx),%r8 + andq %rax,%r8 + movq 8(%rcx),%r9 + andq %rax,%r9 + movq 16(%rcx),%r10 + andq %rax,%r10 + movq 24(%rcx),%r11 + andq %rax,%r11 + + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + adcq %r11,%r15 + + + movq 32(%rsp),%rax + addq %rax,%r12 + movq 40(%rsp),%rax + adcq %rax,%r13 + movq 48(%rsp),%rax + adcq %rax,%r14 + movq 56(%rsp),%rax + adcq %rax,%r15 + movq %r12,80(%rsp) + movq %r13,88(%rsp) + movq %r14,96(%rsp) + movq %r15,104(%rsp) + + + movq (%rdi),%r11 + movq (%rsi),%rax + mulq %r11 + xorq %r9,%r9 + movq %rax,(%rcx) + movq %rdx,%r8 + + movq 16(%rdi),%r14 + movq 8(%rsi),%rax + mulq %r11 + xorq %r10,%r10 + addq %rax,%r8 + adcq %rdx,%r9 + + movq 8(%rdi),%r12 + movq (%rsi),%rax 
+ mulq %r12 + addq %rax,%r8 + movq %r8,8(%rcx) + adcq %rdx,%r9 + adcq $0x0,%r10 + + xorq %r8,%r8 + movq 16(%rsi),%rax + mulq %r11 + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq (%rsi),%r13 + movq %r14,%rax + mulq %r13 + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq 8(%rsi),%rax + mulq %r12 + addq %rax,%r9 + movq %r9,16(%rcx) + adcq %rdx,%r10 + adcq $0x0,%r8 + + xorq %r9,%r9 + movq 24(%rsi),%rax + mulq %r11 + movq 24(%rdi),%r15 + addq %rax,%r10 + adcq %rdx,%r8 + adcq $0x0,%r9 + + movq %r15,%rax + mulq %r13 + addq %rax,%r10 + adcq %rdx,%r8 + adcq $0x0,%r9 + + movq 16(%rsi),%rax + mulq %r12 + addq %rax,%r10 + adcq %rdx,%r8 + adcq $0x0,%r9 + + movq 8(%rsi),%rax + mulq %r14 + addq %rax,%r10 + movq %r10,24(%rcx) + adcq %rdx,%r8 + adcq $0x0,%r9 + + xorq %r10,%r10 + movq 24(%rsi),%rax + mulq %r12 + addq %rax,%r8 + adcq %rdx,%r9 + adcq $0x0,%r10 + + movq 8(%rsi),%rax + mulq %r15 + addq %rax,%r8 + adcq %rdx,%r9 + adcq $0x0,%r10 + + movq 16(%rsi),%rax + mulq %r14 + addq %rax,%r8 + movq %r8,32(%rcx) + adcq %rdx,%r9 + adcq $0x0,%r10 + + xorq %r8,%r8 + movq 24(%rsi),%rax + mulq %r14 + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq 16(%rsi),%rax + mulq %r15 + addq %rax,%r9 + movq %r9,40(%rcx) + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq 24(%rsi),%rax + mulq %r15 + addq %rax,%r10 + movq %r10,48(%rcx) + adcq %rdx,%r8 + movq %r8,56(%rcx) + + + + movq 32(%rdi),%r11 + movq 32(%rsi),%rax + mulq %r11 + xorq %r9,%r9 + movq %rax,64(%rcx) + movq %rdx,%r8 + + movq 48(%rdi),%r14 + movq 40(%rsi),%rax + mulq %r11 + xorq %r10,%r10 + addq %rax,%r8 + adcq %rdx,%r9 + + movq 40(%rdi),%r12 + movq 32(%rsi),%rax + mulq %r12 + addq %rax,%r8 + movq %r8,72(%rcx) + adcq %rdx,%r9 + adcq $0x0,%r10 + + xorq %r8,%r8 + movq 48(%rsi),%rax + mulq %r11 + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq 32(%rsi),%r13 + movq %r14,%rax + mulq %r13 + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq 40(%rsi),%rax + mulq %r12 + addq %rax,%r9 + movq %r9,80(%rcx) + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq 48(%rsi),%rax + mulq %r12 + xorq %r12,%r12 + addq %rax,%r10 + adcq %rdx,%r8 + adcq $0x0,%r12 + + movq 40(%rsi),%rax + mulq %r14 + addq %rax,%r10 + adcq %rdx,%r8 + adcq $0x0,%r12 + movq %r10,88(%rcx) + + movq 48(%rsi),%rax + mulq %r14 + addq %rax,%r8 + adcq $0x0,%r12 + movq %r8,96(%rcx) + + addq %r12,%rdx + + + movq 0(%rsp),%r8 + subq 0(%rcx),%r8 + movq 8(%rsp),%r9 + sbbq 8(%rcx),%r9 + movq 16(%rsp),%r10 + sbbq 16(%rcx),%r10 + movq 24(%rsp),%r11 + sbbq 24(%rcx),%r11 + movq 80(%rsp),%r12 + sbbq 32(%rcx),%r12 + movq 88(%rsp),%r13 + sbbq 40(%rcx),%r13 + movq 96(%rsp),%r14 + sbbq 48(%rcx),%r14 + movq 104(%rsp),%r15 + sbbq 56(%rcx),%r15 + + + movq 64(%rcx),%rax + subq %rax,%r8 + movq 72(%rcx),%rax + sbbq %rax,%r9 + movq 80(%rcx),%rax + sbbq %rax,%r10 + movq 88(%rcx),%rax + sbbq %rax,%r11 + movq 96(%rcx),%rax + sbbq %rax,%r12 + sbbq %rdx,%r13 + sbbq $0x0,%r14 + sbbq $0x0,%r15 + + + addq 32(%rcx),%r8 + movq %r8,32(%rcx) + adcq 40(%rcx),%r9 + movq %r9,40(%rcx) + adcq 48(%rcx),%r10 + movq %r10,48(%rcx) + adcq 56(%rcx),%r11 + movq %r11,56(%rcx) + adcq 64(%rcx),%r12 + movq %r12,64(%rcx) + adcq 72(%rcx),%r13 + movq %r13,72(%rcx) + adcq 80(%rcx),%r14 + movq %r14,80(%rcx) + adcq 88(%rcx),%r15 + movq %r15,88(%rcx) + movq 96(%rcx),%r12 + adcq $0x0,%r12 + movq %r12,96(%rcx) + adcq $0x0,%rdx + movq %rdx,104(%rcx) + + addq $112,%rsp +.cfi_adjust_cfa_offset -112 + + + popq %r15 +.cfi_adjust_cfa_offset -8 + popq %r14 +.cfi_adjust_cfa_offset -8 + popq %r13 +.cfi_adjust_cfa_offset -8 + popq %r12 +.cfi_adjust_cfa_offset -8 + 
.byte 0xf3,0xc3 +.cfi_endproc +#endif +.section .note.GNU-stack,"",@progbits diff --git a/packager/third_party/boringssl/mac-x86/crypto/chacha/chacha-x86.S b/packager/third_party/boringssl/mac-x86/crypto/chacha/chacha-x86.S index e87467caf0..bc324888b6 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/chacha/chacha-x86.S +++ b/packager/third_party/boringssl/mac-x86/crypto/chacha/chacha-x86.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl _ChaCha20_ctr32 .private_extern _ChaCha20_ctr32 diff --git a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/aes-586.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/aes-586.S index 4046251d3a..3634f64d11 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/aes-586.S +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/aes-586.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .private_extern __x86_AES_encrypt_compact .align 4 @@ -961,11 +967,11 @@ LAES_Te: .long 16,32,64,128 .long 27,54,0,0 .long 0,0,0,0 -.globl _asm_AES_encrypt -.private_extern _asm_AES_encrypt +.globl _aes_nohw_encrypt +.private_extern _aes_nohw_encrypt .align 4 -_asm_AES_encrypt: -L_asm_AES_encrypt_begin: +_aes_nohw_encrypt: +L_aes_nohw_encrypt_begin: pushl %ebp pushl %ebx pushl %esi @@ -2145,11 +2151,11 @@ LAES_Td: .byte 200,235,187,60,131,83,153,97 .byte 23,43,4,126,186,119,214,38 .byte 225,105,20,99,85,33,12,125 -.globl _asm_AES_decrypt -.private_extern _asm_AES_decrypt +.globl _aes_nohw_decrypt +.private_extern _aes_nohw_decrypt .align 4 -_asm_AES_decrypt: -L_asm_AES_decrypt_begin: +_aes_nohw_decrypt: +L_aes_nohw_decrypt_begin: pushl %ebp pushl %ebx pushl %esi @@ -2209,11 +2215,11 @@ L011x86: popl %ebx popl %ebp ret -.globl _asm_AES_cbc_encrypt -.private_extern _asm_AES_cbc_encrypt +.globl _aes_nohw_cbc_encrypt +.private_extern _aes_nohw_cbc_encrypt .align 4 -_asm_AES_cbc_encrypt: -L_asm_AES_cbc_encrypt_begin: +_aes_nohw_cbc_encrypt: +L_aes_nohw_cbc_encrypt_begin: pushl %ebp pushl %ebx pushl %esi @@ -2970,18 +2976,18 @@ L045exit: popl %ebx popl %ebp ret -.globl _asm_AES_set_encrypt_key -.private_extern _asm_AES_set_encrypt_key +.globl _aes_nohw_set_encrypt_key +.private_extern _aes_nohw_set_encrypt_key .align 4 -_asm_AES_set_encrypt_key: -L_asm_AES_set_encrypt_key_begin: +_aes_nohw_set_encrypt_key: +L_aes_nohw_set_encrypt_key_begin: call __x86_AES_set_encrypt_key ret -.globl _asm_AES_set_decrypt_key -.private_extern _asm_AES_set_decrypt_key +.globl _aes_nohw_set_decrypt_key +.private_extern _aes_nohw_set_decrypt_key .align 4 -_asm_AES_set_decrypt_key: -L_asm_AES_set_decrypt_key_begin: +_aes_nohw_set_decrypt_key: +L_aes_nohw_set_decrypt_key_begin: call __x86_AES_set_encrypt_key cmpl $0,%eax je L054proceed diff --git a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/aesni-x86.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/aesni-x86.S index 3fe0e7543f..db7efffdf8 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/aesni-x86.S +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/aesni-x86.S @@ -1,10 +1,30 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text -.globl _aesni_encrypt -.private_extern _aesni_encrypt +#ifdef BORINGSSL_DISPATCH_TEST +#endif +.globl _aes_hw_encrypt +.private_extern _aes_hw_encrypt .align 4 -_aesni_encrypt: -L_aesni_encrypt_begin: +_aes_hw_encrypt: +L_aes_hw_encrypt_begin: +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call L000pic +L000pic: + popl %ebx + leal _BORINGSSL_function_hit+1-L000pic(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif movl 4(%esp),%eax movl 12(%esp),%edx movups (%eax),%xmm2 @@ -14,23 +34,23 @@ L_aesni_encrypt_begin: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L000enc1_loop_1: +L001enc1_loop_1: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L000enc1_loop_1 + jnz L001enc1_loop_1 .byte 102,15,56,221,209 pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 movups %xmm2,(%eax) pxor %xmm2,%xmm2 ret -.globl _aesni_decrypt -.private_extern _aesni_decrypt +.globl _aes_hw_decrypt +.private_extern _aes_hw_decrypt .align 4 -_aesni_decrypt: -L_aesni_decrypt_begin: +_aes_hw_decrypt: +L_aes_hw_decrypt_begin: movl 4(%esp),%eax movl 12(%esp),%edx movups (%eax),%xmm2 @@ -40,12 +60,12 @@ L_aesni_decrypt_begin: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L001dec1_loop_2: +L002dec1_loop_2: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L001dec1_loop_2 + jnz L002dec1_loop_2 .byte 102,15,56,223,209 pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 @@ -64,7 +84,7 @@ __aesni_encrypt2: leal 32(%edx,%ecx,1),%edx negl %ecx addl $16,%ecx -L002enc2_loop: +L003enc2_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 movups (%edx,%ecx,1),%xmm1 @@ -72,7 +92,7 @@ L002enc2_loop: .byte 102,15,56,220,208 .byte 102,15,56,220,216 movups -16(%edx,%ecx,1),%xmm0 - jnz L002enc2_loop + jnz L003enc2_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,221,208 @@ -90,7 +110,7 @@ __aesni_decrypt2: leal 32(%edx,%ecx,1),%edx negl %ecx addl $16,%ecx -L003dec2_loop: +L004dec2_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 movups (%edx,%ecx,1),%xmm1 @@ -98,7 +118,7 @@ L003dec2_loop: .byte 102,15,56,222,208 .byte 102,15,56,222,216 movups -16(%edx,%ecx,1),%xmm0 - jnz L003dec2_loop + jnz L004dec2_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,223,208 @@ -117,7 +137,7 @@ __aesni_encrypt3: leal 32(%edx,%ecx,1),%edx negl %ecx addl $16,%ecx -L004enc3_loop: +L005enc3_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -127,7 +147,7 @@ L004enc3_loop: .byte 102,15,56,220,216 .byte 102,15,56,220,224 movups -16(%edx,%ecx,1),%xmm0 - jnz L004enc3_loop + jnz L005enc3_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -148,7 +168,7 @@ __aesni_decrypt3: leal 32(%edx,%ecx,1),%edx negl %ecx addl $16,%ecx -L005dec3_loop: +L006dec3_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -158,7 +178,7 @@ L005dec3_loop: .byte 102,15,56,222,216 .byte 102,15,56,222,224 movups -16(%edx,%ecx,1),%xmm0 - jnz L005dec3_loop + jnz L006dec3_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -181,7 +201,7 @@ __aesni_encrypt4: negl %ecx .byte 15,31,64,0 addl $16,%ecx -L006enc4_loop: +L007enc4_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -193,7 +213,7 @@ L006enc4_loop: .byte 102,15,56,220,224 .byte 102,15,56,220,232 movups -16(%edx,%ecx,1),%xmm0 - jnz L006enc4_loop + jnz L007enc4_loop .byte 102,15,56,220,209 
.byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -218,7 +238,7 @@ __aesni_decrypt4: negl %ecx .byte 15,31,64,0 addl $16,%ecx -L007dec4_loop: +L008dec4_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -230,7 +250,7 @@ L007dec4_loop: .byte 102,15,56,222,224 .byte 102,15,56,222,232 movups -16(%edx,%ecx,1),%xmm0 - jnz L007dec4_loop + jnz L008dec4_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -259,13 +279,13 @@ __aesni_encrypt6: pxor %xmm0,%xmm7 movups (%edx,%ecx,1),%xmm0 addl $16,%ecx - jmp L008_aesni_encrypt6_inner + jmp L009_aesni_encrypt6_inner .align 4,0x90 -L009enc6_loop: +L010enc6_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 -L008_aesni_encrypt6_inner: +L009_aesni_encrypt6_inner: .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 @@ -279,7 +299,7 @@ L_aesni_encrypt6_enter: .byte 102,15,56,220,240 .byte 102,15,56,220,248 movups -16(%edx,%ecx,1),%xmm0 - jnz L009enc6_loop + jnz L010enc6_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -312,13 +332,13 @@ __aesni_decrypt6: pxor %xmm0,%xmm7 movups (%edx,%ecx,1),%xmm0 addl $16,%ecx - jmp L010_aesni_decrypt6_inner + jmp L011_aesni_decrypt6_inner .align 4,0x90 -L011dec6_loop: +L012dec6_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 -L010_aesni_decrypt6_inner: +L011_aesni_decrypt6_inner: .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 @@ -332,7 +352,7 @@ L_aesni_decrypt6_enter: .byte 102,15,56,222,240 .byte 102,15,56,222,248 movups -16(%edx,%ecx,1),%xmm0 - jnz L011dec6_loop + jnz L012dec6_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -346,11 +366,11 @@ L_aesni_decrypt6_enter: .byte 102,15,56,223,240 .byte 102,15,56,223,248 ret -.globl _aesni_ecb_encrypt -.private_extern _aesni_ecb_encrypt +.globl _aes_hw_ecb_encrypt +.private_extern _aes_hw_ecb_encrypt .align 4 -_aesni_ecb_encrypt: -L_aesni_ecb_encrypt_begin: +_aes_hw_ecb_encrypt: +L_aes_hw_ecb_encrypt_begin: pushl %ebp pushl %ebx pushl %esi @@ -361,14 +381,14 @@ L_aesni_ecb_encrypt_begin: movl 32(%esp),%edx movl 36(%esp),%ebx andl $-16,%eax - jz L012ecb_ret + jz L013ecb_ret movl 240(%edx),%ecx testl %ebx,%ebx - jz L013ecb_decrypt + jz L014ecb_decrypt movl %edx,%ebp movl %ecx,%ebx cmpl $96,%eax - jb L014ecb_enc_tail + jb L015ecb_enc_tail movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -377,9 +397,9 @@ L_aesni_ecb_encrypt_begin: movdqu 80(%esi),%xmm7 leal 96(%esi),%esi subl $96,%eax - jmp L015ecb_enc_loop6_enter + jmp L016ecb_enc_loop6_enter .align 4,0x90 -L016ecb_enc_loop6: +L017ecb_enc_loop6: movups %xmm2,(%edi) movdqu (%esi),%xmm2 movups %xmm3,16(%edi) @@ -394,12 +414,12 @@ L016ecb_enc_loop6: leal 96(%edi),%edi movdqu 80(%esi),%xmm7 leal 96(%esi),%esi -L015ecb_enc_loop6_enter: +L016ecb_enc_loop6_enter: call __aesni_encrypt6 movl %ebp,%edx movl %ebx,%ecx subl $96,%eax - jnc L016ecb_enc_loop6 + jnc L017ecb_enc_loop6 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) @@ -408,18 +428,18 @@ L015ecb_enc_loop6_enter: movups %xmm7,80(%edi) leal 96(%edi),%edi addl $96,%eax - jz L012ecb_ret -L014ecb_enc_tail: + jz L013ecb_ret +L015ecb_enc_tail: movups (%esi),%xmm2 cmpl $32,%eax - jb L017ecb_enc_one + jb L018ecb_enc_one movups 16(%esi),%xmm3 - je L018ecb_enc_two + je L019ecb_enc_two movups 32(%esi),%xmm4 cmpl $64,%eax - jb L019ecb_enc_three + jb L020ecb_enc_three movups 48(%esi),%xmm5 - je L020ecb_enc_four + je L021ecb_enc_four movups 
64(%esi),%xmm6 xorps %xmm7,%xmm7 call __aesni_encrypt6 @@ -428,49 +448,49 @@ L014ecb_enc_tail: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp L012ecb_ret + jmp L013ecb_ret .align 4,0x90 -L017ecb_enc_one: +L018ecb_enc_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L021enc1_loop_3: +L022enc1_loop_3: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L021enc1_loop_3 + jnz L022enc1_loop_3 .byte 102,15,56,221,209 movups %xmm2,(%edi) - jmp L012ecb_ret + jmp L013ecb_ret .align 4,0x90 -L018ecb_enc_two: +L019ecb_enc_two: call __aesni_encrypt2 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp L012ecb_ret + jmp L013ecb_ret .align 4,0x90 -L019ecb_enc_three: +L020ecb_enc_three: call __aesni_encrypt3 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp L012ecb_ret + jmp L013ecb_ret .align 4,0x90 -L020ecb_enc_four: +L021ecb_enc_four: call __aesni_encrypt4 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) movups %xmm5,48(%edi) - jmp L012ecb_ret + jmp L013ecb_ret .align 4,0x90 -L013ecb_decrypt: +L014ecb_decrypt: movl %edx,%ebp movl %ecx,%ebx cmpl $96,%eax - jb L022ecb_dec_tail + jb L023ecb_dec_tail movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -479,9 +499,9 @@ L013ecb_decrypt: movdqu 80(%esi),%xmm7 leal 96(%esi),%esi subl $96,%eax - jmp L023ecb_dec_loop6_enter + jmp L024ecb_dec_loop6_enter .align 4,0x90 -L024ecb_dec_loop6: +L025ecb_dec_loop6: movups %xmm2,(%edi) movdqu (%esi),%xmm2 movups %xmm3,16(%edi) @@ -496,12 +516,12 @@ L024ecb_dec_loop6: leal 96(%edi),%edi movdqu 80(%esi),%xmm7 leal 96(%esi),%esi -L023ecb_dec_loop6_enter: +L024ecb_dec_loop6_enter: call __aesni_decrypt6 movl %ebp,%edx movl %ebx,%ecx subl $96,%eax - jnc L024ecb_dec_loop6 + jnc L025ecb_dec_loop6 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) @@ -510,18 +530,18 @@ L023ecb_dec_loop6_enter: movups %xmm7,80(%edi) leal 96(%edi),%edi addl $96,%eax - jz L012ecb_ret -L022ecb_dec_tail: + jz L013ecb_ret +L023ecb_dec_tail: movups (%esi),%xmm2 cmpl $32,%eax - jb L025ecb_dec_one + jb L026ecb_dec_one movups 16(%esi),%xmm3 - je L026ecb_dec_two + je L027ecb_dec_two movups 32(%esi),%xmm4 cmpl $64,%eax - jb L027ecb_dec_three + jb L028ecb_dec_three movups 48(%esi),%xmm5 - je L028ecb_dec_four + je L029ecb_dec_four movups 64(%esi),%xmm6 xorps %xmm7,%xmm7 call __aesni_decrypt6 @@ -530,43 +550,43 @@ L022ecb_dec_tail: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp L012ecb_ret + jmp L013ecb_ret .align 4,0x90 -L025ecb_dec_one: +L026ecb_dec_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L029dec1_loop_4: +L030dec1_loop_4: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L029dec1_loop_4 + jnz L030dec1_loop_4 .byte 102,15,56,223,209 movups %xmm2,(%edi) - jmp L012ecb_ret + jmp L013ecb_ret .align 4,0x90 -L026ecb_dec_two: +L027ecb_dec_two: call __aesni_decrypt2 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp L012ecb_ret + jmp L013ecb_ret .align 4,0x90 -L027ecb_dec_three: +L028ecb_dec_three: call __aesni_decrypt3 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp L012ecb_ret + jmp L013ecb_ret .align 4,0x90 -L028ecb_dec_four: +L029ecb_dec_four: call __aesni_decrypt4 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) movups %xmm5,48(%edi) -L012ecb_ret: +L013ecb_ret: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 @@ -580,11 +600,11 @@ L012ecb_ret: popl %ebx popl %ebp ret -.globl 
_aesni_ccm64_encrypt_blocks -.private_extern _aesni_ccm64_encrypt_blocks +.globl _aes_hw_ccm64_encrypt_blocks +.private_extern _aes_hw_ccm64_encrypt_blocks .align 4 -_aesni_ccm64_encrypt_blocks: -L_aesni_ccm64_encrypt_blocks_begin: +_aes_hw_ccm64_encrypt_blocks: +L_aes_hw_ccm64_encrypt_blocks_begin: pushl %ebp pushl %ebx pushl %esi @@ -620,7 +640,7 @@ L_aesni_ccm64_encrypt_blocks_begin: leal 32(%edx,%ecx,1),%edx subl %ecx,%ebx .byte 102,15,56,0,253 -L030ccm64_enc_outer: +L031ccm64_enc_outer: movups (%ebp),%xmm0 movl %ebx,%ecx movups (%esi),%xmm6 @@ -629,7 +649,7 @@ L030ccm64_enc_outer: xorps %xmm6,%xmm0 xorps %xmm0,%xmm3 movups 32(%ebp),%xmm0 -L031ccm64_enc2_loop: +L032ccm64_enc2_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 movups (%edx,%ecx,1),%xmm1 @@ -637,7 +657,7 @@ L031ccm64_enc2_loop: .byte 102,15,56,220,208 .byte 102,15,56,220,216 movups -16(%edx,%ecx,1),%xmm0 - jnz L031ccm64_enc2_loop + jnz L032ccm64_enc2_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 paddq 16(%esp),%xmm7 @@ -650,7 +670,7 @@ L031ccm64_enc2_loop: movups %xmm6,(%edi) .byte 102,15,56,0,213 leal 16(%edi),%edi - jnz L030ccm64_enc_outer + jnz L031ccm64_enc_outer movl 48(%esp),%esp movl 40(%esp),%edi movups %xmm3,(%edi) @@ -667,11 +687,11 @@ L031ccm64_enc2_loop: popl %ebx popl %ebp ret -.globl _aesni_ccm64_decrypt_blocks -.private_extern _aesni_ccm64_decrypt_blocks +.globl _aes_hw_ccm64_decrypt_blocks +.private_extern _aes_hw_ccm64_decrypt_blocks .align 4 -_aesni_ccm64_decrypt_blocks: -L_aesni_ccm64_decrypt_blocks_begin: +_aes_hw_ccm64_decrypt_blocks: +L_aes_hw_ccm64_decrypt_blocks_begin: pushl %ebp pushl %ebx pushl %esi @@ -708,12 +728,12 @@ L_aesni_ccm64_decrypt_blocks_begin: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L032enc1_loop_5: +L033enc1_loop_5: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L032enc1_loop_5 + jnz L033enc1_loop_5 .byte 102,15,56,221,209 shll $4,%ebx movl $16,%ecx @@ -723,16 +743,16 @@ L032enc1_loop_5: subl %ebx,%ecx leal 32(%ebp,%ebx,1),%edx movl %ecx,%ebx - jmp L033ccm64_dec_outer + jmp L034ccm64_dec_outer .align 4,0x90 -L033ccm64_dec_outer: +L034ccm64_dec_outer: xorps %xmm2,%xmm6 movdqa %xmm7,%xmm2 movups %xmm6,(%edi) leal 16(%edi),%edi .byte 102,15,56,0,213 subl $1,%eax - jz L034ccm64_dec_break + jz L035ccm64_dec_break movups (%ebp),%xmm0 movl %ebx,%ecx movups 16(%ebp),%xmm1 @@ -740,7 +760,7 @@ L033ccm64_dec_outer: xorps %xmm0,%xmm2 xorps %xmm6,%xmm3 movups 32(%ebp),%xmm0 -L035ccm64_dec2_loop: +L036ccm64_dec2_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 movups (%edx,%ecx,1),%xmm1 @@ -748,7 +768,7 @@ L035ccm64_dec2_loop: .byte 102,15,56,220,208 .byte 102,15,56,220,216 movups -16(%edx,%ecx,1),%xmm0 - jnz L035ccm64_dec2_loop + jnz L036ccm64_dec2_loop movups (%esi),%xmm6 paddq 16(%esp),%xmm7 .byte 102,15,56,220,209 @@ -756,9 +776,9 @@ L035ccm64_dec2_loop: .byte 102,15,56,221,208 .byte 102,15,56,221,216 leal 16(%esi),%esi - jmp L033ccm64_dec_outer + jmp L034ccm64_dec_outer .align 4,0x90 -L034ccm64_dec_break: +L035ccm64_dec_break: movl 240(%ebp),%ecx movl %ebp,%edx movups (%edx),%xmm0 @@ -766,12 +786,12 @@ L034ccm64_dec_break: xorps %xmm0,%xmm6 leal 32(%edx),%edx xorps %xmm6,%xmm3 -L036enc1_loop_6: +L037enc1_loop_6: .byte 102,15,56,220,217 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L036enc1_loop_6 + jnz L037enc1_loop_6 .byte 102,15,56,221,217 movl 48(%esp),%esp movl 40(%esp),%edi @@ -789,15 +809,27 @@ L036enc1_loop_6: popl %ebx popl %ebp ret -.globl _aesni_ctr32_encrypt_blocks -.private_extern 
_aesni_ctr32_encrypt_blocks +.globl _aes_hw_ctr32_encrypt_blocks +.private_extern _aes_hw_ctr32_encrypt_blocks .align 4 -_aesni_ctr32_encrypt_blocks: -L_aesni_ctr32_encrypt_blocks_begin: +_aes_hw_ctr32_encrypt_blocks: +L_aes_hw_ctr32_encrypt_blocks_begin: pushl %ebp pushl %ebx pushl %esi pushl %edi +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call L038pic +L038pic: + popl %ebx + leal _BORINGSSL_function_hit+0-L038pic(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif movl 20(%esp),%esi movl 24(%esp),%edi movl 28(%esp),%eax @@ -808,7 +840,7 @@ L_aesni_ctr32_encrypt_blocks_begin: andl $-16,%esp movl %ebp,80(%esp) cmpl $1,%eax - je L037ctr32_one_shortcut + je L039ctr32_one_shortcut movdqu (%ebx),%xmm7 movl $202182159,(%esp) movl $134810123,4(%esp) @@ -846,7 +878,7 @@ L_aesni_ctr32_encrypt_blocks_begin: pshufd $192,%xmm0,%xmm2 pshufd $128,%xmm0,%xmm3 cmpl $6,%eax - jb L038ctr32_tail + jb L040ctr32_tail pxor %xmm6,%xmm7 shll $4,%ecx movl $16,%ebx @@ -855,9 +887,9 @@ L_aesni_ctr32_encrypt_blocks_begin: subl %ecx,%ebx leal 32(%edx,%ecx,1),%edx subl $6,%eax - jmp L039ctr32_loop6 + jmp L041ctr32_loop6 .align 4,0x90 -L039ctr32_loop6: +L041ctr32_loop6: pshufd $64,%xmm0,%xmm4 movdqa 32(%esp),%xmm0 pshufd $192,%xmm1,%xmm5 @@ -911,27 +943,27 @@ L039ctr32_loop6: leal 96(%edi),%edi pshufd $128,%xmm0,%xmm3 subl $6,%eax - jnc L039ctr32_loop6 + jnc L041ctr32_loop6 addl $6,%eax - jz L040ctr32_ret + jz L042ctr32_ret movdqu (%ebp),%xmm7 movl %ebp,%edx pxor 32(%esp),%xmm7 movl 240(%ebp),%ecx -L038ctr32_tail: +L040ctr32_tail: por %xmm7,%xmm2 cmpl $2,%eax - jb L041ctr32_one + jb L043ctr32_one pshufd $64,%xmm0,%xmm4 por %xmm7,%xmm3 - je L042ctr32_two + je L044ctr32_two pshufd $192,%xmm1,%xmm5 por %xmm7,%xmm4 cmpl $4,%eax - jb L043ctr32_three + jb L045ctr32_three pshufd $128,%xmm1,%xmm6 por %xmm7,%xmm5 - je L044ctr32_four + je L046ctr32_four por %xmm7,%xmm6 call __aesni_encrypt6 movups (%esi),%xmm1 @@ -949,29 +981,29 @@ L038ctr32_tail: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp L040ctr32_ret + jmp L042ctr32_ret .align 4,0x90 -L037ctr32_one_shortcut: +L039ctr32_one_shortcut: movups (%ebx),%xmm2 movl 240(%edx),%ecx -L041ctr32_one: +L043ctr32_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L045enc1_loop_7: +L047enc1_loop_7: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L045enc1_loop_7 + jnz L047enc1_loop_7 .byte 102,15,56,221,209 movups (%esi),%xmm6 xorps %xmm2,%xmm6 movups %xmm6,(%edi) - jmp L040ctr32_ret + jmp L042ctr32_ret .align 4,0x90 -L042ctr32_two: +L044ctr32_two: call __aesni_encrypt2 movups (%esi),%xmm5 movups 16(%esi),%xmm6 @@ -979,9 +1011,9 @@ L042ctr32_two: xorps %xmm6,%xmm3 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp L040ctr32_ret + jmp L042ctr32_ret .align 4,0x90 -L043ctr32_three: +L045ctr32_three: call __aesni_encrypt3 movups (%esi),%xmm5 movups 16(%esi),%xmm6 @@ -992,9 +1024,9 @@ L043ctr32_three: xorps %xmm7,%xmm4 movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp L040ctr32_ret + jmp L042ctr32_ret .align 4,0x90 -L044ctr32_four: +L046ctr32_four: call __aesni_encrypt4 movups (%esi),%xmm6 movups 16(%esi),%xmm7 @@ -1008,7 +1040,7 @@ L044ctr32_four: xorps %xmm0,%xmm5 movups %xmm4,32(%edi) movups %xmm5,48(%edi) -L040ctr32_ret: +L042ctr32_ret: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 @@ -1026,11 +1058,11 @@ L040ctr32_ret: popl %ebx popl %ebp ret -.globl _aesni_xts_encrypt -.private_extern _aesni_xts_encrypt +.globl _aes_hw_xts_encrypt +.private_extern 
_aes_hw_xts_encrypt .align 4 -_aesni_xts_encrypt: -L_aesni_xts_encrypt_begin: +_aes_hw_xts_encrypt: +L_aes_hw_xts_encrypt_begin: pushl %ebp pushl %ebx pushl %esi @@ -1043,12 +1075,12 @@ L_aesni_xts_encrypt_begin: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L046enc1_loop_8: +L048enc1_loop_8: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L046enc1_loop_8 + jnz L048enc1_loop_8 .byte 102,15,56,221,209 movl 20(%esp),%esi movl 24(%esp),%edi @@ -1072,14 +1104,14 @@ L046enc1_loop_8: movl %edx,%ebp movl %ecx,%ebx subl $96,%eax - jc L047xts_enc_short + jc L049xts_enc_short shll $4,%ecx movl $16,%ebx subl %ecx,%ebx leal 32(%edx,%ecx,1),%edx - jmp L048xts_enc_loop6 + jmp L050xts_enc_loop6 .align 4,0x90 -L048xts_enc_loop6: +L050xts_enc_loop6: pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,(%esp) @@ -1168,23 +1200,23 @@ L048xts_enc_loop6: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 subl $96,%eax - jnc L048xts_enc_loop6 + jnc L050xts_enc_loop6 movl 240(%ebp),%ecx movl %ebp,%edx movl %ecx,%ebx -L047xts_enc_short: +L049xts_enc_short: addl $96,%eax - jz L049xts_enc_done6x + jz L051xts_enc_done6x movdqa %xmm1,%xmm5 cmpl $32,%eax - jb L050xts_enc_one + jb L052xts_enc_one pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 - je L051xts_enc_two + je L053xts_enc_two pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm6 @@ -1193,7 +1225,7 @@ L047xts_enc_short: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 cmpl $64,%eax - jb L052xts_enc_three + jb L054xts_enc_three pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm7 @@ -1203,7 +1235,7 @@ L047xts_enc_short: pxor %xmm2,%xmm1 movdqa %xmm5,(%esp) movdqa %xmm6,16(%esp) - je L053xts_enc_four + je L055xts_enc_four movdqa %xmm7,32(%esp) pshufd $19,%xmm0,%xmm7 movdqa %xmm1,48(%esp) @@ -1235,9 +1267,9 @@ L047xts_enc_short: movups %xmm5,48(%edi) movups %xmm6,64(%edi) leal 80(%edi),%edi - jmp L054xts_enc_done + jmp L056xts_enc_done .align 4,0x90 -L050xts_enc_one: +L052xts_enc_one: movups (%esi),%xmm2 leal 16(%esi),%esi xorps %xmm5,%xmm2 @@ -1245,20 +1277,20 @@ L050xts_enc_one: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L055enc1_loop_9: +L057enc1_loop_9: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L055enc1_loop_9 + jnz L057enc1_loop_9 .byte 102,15,56,221,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) leal 16(%edi),%edi movdqa %xmm5,%xmm1 - jmp L054xts_enc_done + jmp L056xts_enc_done .align 4,0x90 -L051xts_enc_two: +L053xts_enc_two: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1272,9 +1304,9 @@ L051xts_enc_two: movups %xmm3,16(%edi) leal 32(%edi),%edi movdqa %xmm6,%xmm1 - jmp L054xts_enc_done + jmp L056xts_enc_done .align 4,0x90 -L052xts_enc_three: +L054xts_enc_three: movaps %xmm1,%xmm7 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1292,9 +1324,9 @@ L052xts_enc_three: movups %xmm4,32(%edi) leal 48(%edi),%edi movdqa %xmm7,%xmm1 - jmp L054xts_enc_done + jmp L056xts_enc_done .align 4,0x90 -L053xts_enc_four: +L055xts_enc_four: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1316,28 +1348,28 @@ L053xts_enc_four: movups %xmm5,48(%edi) leal 64(%edi),%edi movdqa %xmm6,%xmm1 - jmp L054xts_enc_done + jmp L056xts_enc_done .align 4,0x90 -L049xts_enc_done6x: +L051xts_enc_done6x: movl 112(%esp),%eax andl $15,%eax - jz L056xts_enc_ret + jz L058xts_enc_ret movdqa %xmm1,%xmm5 movl %eax,112(%esp) - jmp L057xts_enc_steal + jmp L059xts_enc_steal .align 4,0x90 -L054xts_enc_done: +L056xts_enc_done: movl 
112(%esp),%eax pxor %xmm0,%xmm0 andl $15,%eax - jz L056xts_enc_ret + jz L058xts_enc_ret pcmpgtd %xmm1,%xmm0 movl %eax,112(%esp) pshufd $19,%xmm0,%xmm5 paddq %xmm1,%xmm1 pand 96(%esp),%xmm5 pxor %xmm1,%xmm5 -L057xts_enc_steal: +L059xts_enc_steal: movzbl (%esi),%ecx movzbl -16(%edi),%edx leal 1(%esi),%esi @@ -1345,7 +1377,7 @@ L057xts_enc_steal: movb %dl,(%edi) leal 1(%edi),%edi subl $1,%eax - jnz L057xts_enc_steal + jnz L059xts_enc_steal subl 112(%esp),%edi movl %ebp,%edx movl %ebx,%ecx @@ -1355,16 +1387,16 @@ L057xts_enc_steal: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L058enc1_loop_10: +L060enc1_loop_10: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L058enc1_loop_10 + jnz L060enc1_loop_10 .byte 102,15,56,221,209 xorps %xmm5,%xmm2 movups %xmm2,-16(%edi) -L056xts_enc_ret: +L058xts_enc_ret: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 @@ -1385,11 +1417,11 @@ L056xts_enc_ret: popl %ebx popl %ebp ret -.globl _aesni_xts_decrypt -.private_extern _aesni_xts_decrypt +.globl _aes_hw_xts_decrypt +.private_extern _aes_hw_xts_decrypt .align 4 -_aesni_xts_decrypt: -L_aesni_xts_decrypt_begin: +_aes_hw_xts_decrypt: +L_aes_hw_xts_decrypt_begin: pushl %ebp pushl %ebx pushl %esi @@ -1402,12 +1434,12 @@ L_aesni_xts_decrypt_begin: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L059enc1_loop_11: +L061enc1_loop_11: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L059enc1_loop_11 + jnz L061enc1_loop_11 .byte 102,15,56,221,209 movl 20(%esp),%esi movl 24(%esp),%edi @@ -1436,14 +1468,14 @@ L059enc1_loop_11: pcmpgtd %xmm1,%xmm0 andl $-16,%eax subl $96,%eax - jc L060xts_dec_short + jc L062xts_dec_short shll $4,%ecx movl $16,%ebx subl %ecx,%ebx leal 32(%edx,%ecx,1),%edx - jmp L061xts_dec_loop6 + jmp L063xts_dec_loop6 .align 4,0x90 -L061xts_dec_loop6: +L063xts_dec_loop6: pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,(%esp) @@ -1532,23 +1564,23 @@ L061xts_dec_loop6: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 subl $96,%eax - jnc L061xts_dec_loop6 + jnc L063xts_dec_loop6 movl 240(%ebp),%ecx movl %ebp,%edx movl %ecx,%ebx -L060xts_dec_short: +L062xts_dec_short: addl $96,%eax - jz L062xts_dec_done6x + jz L064xts_dec_done6x movdqa %xmm1,%xmm5 cmpl $32,%eax - jb L063xts_dec_one + jb L065xts_dec_one pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 - je L064xts_dec_two + je L066xts_dec_two pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm6 @@ -1557,7 +1589,7 @@ L060xts_dec_short: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 cmpl $64,%eax - jb L065xts_dec_three + jb L067xts_dec_three pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm7 @@ -1567,7 +1599,7 @@ L060xts_dec_short: pxor %xmm2,%xmm1 movdqa %xmm5,(%esp) movdqa %xmm6,16(%esp) - je L066xts_dec_four + je L068xts_dec_four movdqa %xmm7,32(%esp) pshufd $19,%xmm0,%xmm7 movdqa %xmm1,48(%esp) @@ -1599,9 +1631,9 @@ L060xts_dec_short: movups %xmm5,48(%edi) movups %xmm6,64(%edi) leal 80(%edi),%edi - jmp L067xts_dec_done + jmp L069xts_dec_done .align 4,0x90 -L063xts_dec_one: +L065xts_dec_one: movups (%esi),%xmm2 leal 16(%esi),%esi xorps %xmm5,%xmm2 @@ -1609,20 +1641,20 @@ L063xts_dec_one: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L068dec1_loop_12: +L070dec1_loop_12: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L068dec1_loop_12 + jnz L070dec1_loop_12 .byte 102,15,56,223,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) leal 16(%edi),%edi movdqa %xmm5,%xmm1 - jmp 
L067xts_dec_done + jmp L069xts_dec_done .align 4,0x90 -L064xts_dec_two: +L066xts_dec_two: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1636,9 +1668,9 @@ L064xts_dec_two: movups %xmm3,16(%edi) leal 32(%edi),%edi movdqa %xmm6,%xmm1 - jmp L067xts_dec_done + jmp L069xts_dec_done .align 4,0x90 -L065xts_dec_three: +L067xts_dec_three: movaps %xmm1,%xmm7 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1656,9 +1688,9 @@ L065xts_dec_three: movups %xmm4,32(%edi) leal 48(%edi),%edi movdqa %xmm7,%xmm1 - jmp L067xts_dec_done + jmp L069xts_dec_done .align 4,0x90 -L066xts_dec_four: +L068xts_dec_four: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1680,20 +1712,20 @@ L066xts_dec_four: movups %xmm5,48(%edi) leal 64(%edi),%edi movdqa %xmm6,%xmm1 - jmp L067xts_dec_done + jmp L069xts_dec_done .align 4,0x90 -L062xts_dec_done6x: +L064xts_dec_done6x: movl 112(%esp),%eax andl $15,%eax - jz L069xts_dec_ret + jz L071xts_dec_ret movl %eax,112(%esp) - jmp L070xts_dec_only_one_more + jmp L072xts_dec_only_one_more .align 4,0x90 -L067xts_dec_done: +L069xts_dec_done: movl 112(%esp),%eax pxor %xmm0,%xmm0 andl $15,%eax - jz L069xts_dec_ret + jz L071xts_dec_ret pcmpgtd %xmm1,%xmm0 movl %eax,112(%esp) pshufd $19,%xmm0,%xmm2 @@ -1703,7 +1735,7 @@ L067xts_dec_done: pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 -L070xts_dec_only_one_more: +L072xts_dec_only_one_more: pshufd $19,%xmm0,%xmm5 movdqa %xmm1,%xmm6 paddq %xmm1,%xmm1 @@ -1717,16 +1749,16 @@ L070xts_dec_only_one_more: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L071dec1_loop_13: +L073dec1_loop_13: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L071dec1_loop_13 + jnz L073dec1_loop_13 .byte 102,15,56,223,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) -L072xts_dec_steal: +L074xts_dec_steal: movzbl 16(%esi),%ecx movzbl (%edi),%edx leal 1(%esi),%esi @@ -1734,7 +1766,7 @@ L072xts_dec_steal: movb %dl,16(%edi) leal 1(%edi),%edi subl $1,%eax - jnz L072xts_dec_steal + jnz L074xts_dec_steal subl 112(%esp),%edi movl %ebp,%edx movl %ebx,%ecx @@ -1744,16 +1776,16 @@ L072xts_dec_steal: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L073dec1_loop_14: +L075dec1_loop_14: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L073dec1_loop_14 + jnz L075dec1_loop_14 .byte 102,15,56,223,209 xorps %xmm6,%xmm2 movups %xmm2,(%edi) -L069xts_dec_ret: +L071xts_dec_ret: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 @@ -1774,11 +1806,11 @@ L069xts_dec_ret: popl %ebx popl %ebp ret -.globl _aesni_cbc_encrypt -.private_extern _aesni_cbc_encrypt +.globl _aes_hw_cbc_encrypt +.private_extern _aes_hw_cbc_encrypt .align 4 -_aesni_cbc_encrypt: -L_aesni_cbc_encrypt_begin: +_aes_hw_cbc_encrypt: +L_aes_hw_cbc_encrypt_begin: pushl %ebp pushl %ebx pushl %esi @@ -1792,7 +1824,7 @@ L_aesni_cbc_encrypt_begin: movl 32(%esp),%edx movl 36(%esp),%ebp testl %eax,%eax - jz L074cbc_abort + jz L076cbc_abort cmpl $0,40(%esp) xchgl %esp,%ebx movups (%ebp),%xmm7 @@ -1800,14 +1832,14 @@ L_aesni_cbc_encrypt_begin: movl %edx,%ebp movl %ebx,16(%esp) movl %ecx,%ebx - je L075cbc_decrypt + je L077cbc_decrypt movaps %xmm7,%xmm2 cmpl $16,%eax - jb L076cbc_enc_tail + jb L078cbc_enc_tail subl $16,%eax - jmp L077cbc_enc_loop + jmp L079cbc_enc_loop .align 4,0x90 -L077cbc_enc_loop: +L079cbc_enc_loop: movups (%esi),%xmm7 leal 16(%esi),%esi movups (%edx),%xmm0 @@ -1815,25 +1847,25 @@ L077cbc_enc_loop: xorps %xmm0,%xmm7 leal 32(%edx),%edx xorps %xmm7,%xmm2 -L078enc1_loop_15: +L080enc1_loop_15: .byte 102,15,56,220,209 
decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L078enc1_loop_15 + jnz L080enc1_loop_15 .byte 102,15,56,221,209 movl %ebx,%ecx movl %ebp,%edx movups %xmm2,(%edi) leal 16(%edi),%edi subl $16,%eax - jnc L077cbc_enc_loop + jnc L079cbc_enc_loop addl $16,%eax - jnz L076cbc_enc_tail + jnz L078cbc_enc_tail movaps %xmm2,%xmm7 pxor %xmm2,%xmm2 - jmp L079cbc_ret -L076cbc_enc_tail: + jmp L081cbc_ret +L078cbc_enc_tail: movl %eax,%ecx .long 2767451785 movl $16,%ecx @@ -1844,20 +1876,20 @@ L076cbc_enc_tail: movl %ebx,%ecx movl %edi,%esi movl %ebp,%edx - jmp L077cbc_enc_loop + jmp L079cbc_enc_loop .align 4,0x90 -L075cbc_decrypt: +L077cbc_decrypt: cmpl $80,%eax - jbe L080cbc_dec_tail + jbe L082cbc_dec_tail movaps %xmm7,(%esp) subl $80,%eax - jmp L081cbc_dec_loop6_enter + jmp L083cbc_dec_loop6_enter .align 4,0x90 -L082cbc_dec_loop6: +L084cbc_dec_loop6: movaps %xmm0,(%esp) movups %xmm7,(%edi) leal 16(%edi),%edi -L081cbc_dec_loop6_enter: +L083cbc_dec_loop6_enter: movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -1887,28 +1919,28 @@ L081cbc_dec_loop6_enter: movups %xmm6,64(%edi) leal 80(%edi),%edi subl $96,%eax - ja L082cbc_dec_loop6 + ja L084cbc_dec_loop6 movaps %xmm7,%xmm2 movaps %xmm0,%xmm7 addl $80,%eax - jle L083cbc_dec_clear_tail_collected + jle L085cbc_dec_clear_tail_collected movups %xmm2,(%edi) leal 16(%edi),%edi -L080cbc_dec_tail: +L082cbc_dec_tail: movups (%esi),%xmm2 movaps %xmm2,%xmm6 cmpl $16,%eax - jbe L084cbc_dec_one + jbe L086cbc_dec_one movups 16(%esi),%xmm3 movaps %xmm3,%xmm5 cmpl $32,%eax - jbe L085cbc_dec_two + jbe L087cbc_dec_two movups 32(%esi),%xmm4 cmpl $48,%eax - jbe L086cbc_dec_three + jbe L088cbc_dec_three movups 48(%esi),%xmm5 cmpl $64,%eax - jbe L087cbc_dec_four + jbe L089cbc_dec_four movups 64(%esi),%xmm6 movaps %xmm7,(%esp) movups (%esi),%xmm2 @@ -1935,26 +1967,26 @@ L080cbc_dec_tail: movaps %xmm6,%xmm2 pxor %xmm6,%xmm6 subl $80,%eax - jmp L088cbc_dec_tail_collected + jmp L090cbc_dec_tail_collected .align 4,0x90 -L084cbc_dec_one: +L086cbc_dec_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L089dec1_loop_16: +L091dec1_loop_16: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L089dec1_loop_16 + jnz L091dec1_loop_16 .byte 102,15,56,223,209 xorps %xmm7,%xmm2 movaps %xmm6,%xmm7 subl $16,%eax - jmp L088cbc_dec_tail_collected + jmp L090cbc_dec_tail_collected .align 4,0x90 -L085cbc_dec_two: +L087cbc_dec_two: call __aesni_decrypt2 xorps %xmm7,%xmm2 xorps %xmm6,%xmm3 @@ -1964,9 +1996,9 @@ L085cbc_dec_two: leal 16(%edi),%edi movaps %xmm5,%xmm7 subl $32,%eax - jmp L088cbc_dec_tail_collected + jmp L090cbc_dec_tail_collected .align 4,0x90 -L086cbc_dec_three: +L088cbc_dec_three: call __aesni_decrypt3 xorps %xmm7,%xmm2 xorps %xmm6,%xmm3 @@ -1979,9 +2011,9 @@ L086cbc_dec_three: leal 32(%edi),%edi movups 32(%esi),%xmm7 subl $48,%eax - jmp L088cbc_dec_tail_collected + jmp L090cbc_dec_tail_collected .align 4,0x90 -L087cbc_dec_four: +L089cbc_dec_four: call __aesni_decrypt4 movups 16(%esi),%xmm1 movups 32(%esi),%xmm0 @@ -1999,21 +2031,21 @@ L087cbc_dec_four: movaps %xmm5,%xmm2 pxor %xmm5,%xmm5 subl $64,%eax - jmp L088cbc_dec_tail_collected + jmp L090cbc_dec_tail_collected .align 4,0x90 -L083cbc_dec_clear_tail_collected: +L085cbc_dec_clear_tail_collected: pxor %xmm3,%xmm3 pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 pxor %xmm6,%xmm6 -L088cbc_dec_tail_collected: +L090cbc_dec_tail_collected: andl $15,%eax - jnz L090cbc_dec_tail_partial + jnz L092cbc_dec_tail_partial movups %xmm2,(%edi) pxor %xmm0,%xmm0 - jmp L079cbc_ret + 
jmp L081cbc_ret .align 4,0x90 -L090cbc_dec_tail_partial: +L092cbc_dec_tail_partial: movaps %xmm2,(%esp) pxor %xmm0,%xmm0 movl $16,%ecx @@ -2021,14 +2053,14 @@ L090cbc_dec_tail_partial: subl %eax,%ecx .long 2767451785 movdqa %xmm2,(%esp) -L079cbc_ret: +L081cbc_ret: movl 16(%esp),%esp movl 36(%esp),%ebp pxor %xmm2,%xmm2 pxor %xmm1,%xmm1 movups %xmm7,(%ebp) pxor %xmm7,%xmm7 -L074cbc_abort: +L076cbc_abort: popl %edi popl %esi popl %ebx @@ -2040,13 +2072,13 @@ __aesni_set_encrypt_key: pushl %ebp pushl %ebx testl %eax,%eax - jz L091bad_pointer + jz L093bad_pointer testl %edx,%edx - jz L091bad_pointer - call L092pic -L092pic: + jz L093bad_pointer + call L094pic +L094pic: popl %ebx - leal Lkey_const-L092pic(%ebx),%ebx + leal Lkey_const-L094pic(%ebx),%ebx movl L_OPENSSL_ia32cap_P$non_lazy_ptr-Lkey_const(%ebx),%ebp movups (%eax),%xmm0 xorps %xmm4,%xmm4 @@ -2054,45 +2086,45 @@ L092pic: leal 16(%edx),%edx andl $268437504,%ebp cmpl $256,%ecx - je L09314rounds + je L09514rounds cmpl $192,%ecx - je L09412rounds + je L09612rounds cmpl $128,%ecx - jne L095bad_keybits + jne L097bad_keybits .align 4,0x90 -L09610rounds: +L09810rounds: cmpl $268435456,%ebp - je L09710rounds_alt + je L09910rounds_alt movl $9,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,200,1 - call L098key_128_cold + call L100key_128_cold .byte 102,15,58,223,200,2 - call L099key_128 + call L101key_128 .byte 102,15,58,223,200,4 - call L099key_128 + call L101key_128 .byte 102,15,58,223,200,8 - call L099key_128 + call L101key_128 .byte 102,15,58,223,200,16 - call L099key_128 + call L101key_128 .byte 102,15,58,223,200,32 - call L099key_128 + call L101key_128 .byte 102,15,58,223,200,64 - call L099key_128 + call L101key_128 .byte 102,15,58,223,200,128 - call L099key_128 + call L101key_128 .byte 102,15,58,223,200,27 - call L099key_128 + call L101key_128 .byte 102,15,58,223,200,54 - call L099key_128 + call L101key_128 movups %xmm0,(%edx) movl %ecx,80(%edx) - jmp L100good_key + jmp L102good_key .align 4,0x90 -L099key_128: +L101key_128: movups %xmm0,(%edx) leal 16(%edx),%edx -L098key_128_cold: +L100key_128_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -2101,13 +2133,13 @@ L098key_128_cold: xorps %xmm1,%xmm0 ret .align 4,0x90 -L09710rounds_alt: +L09910rounds_alt: movdqa (%ebx),%xmm5 movl $8,%ecx movdqa 32(%ebx),%xmm4 movdqa %xmm0,%xmm2 movdqu %xmm0,-16(%edx) -L101loop_key128: +L103loop_key128: .byte 102,15,56,0,197 .byte 102,15,56,221,196 pslld $1,%xmm4 @@ -2123,7 +2155,7 @@ L101loop_key128: movdqu %xmm0,-16(%edx) movdqa %xmm0,%xmm2 decl %ecx - jnz L101loop_key128 + jnz L103loop_key128 movdqa 48(%ebx),%xmm4 .byte 102,15,56,0,197 .byte 102,15,56,221,196 @@ -2151,41 +2183,41 @@ L101loop_key128: movdqu %xmm0,16(%edx) movl $9,%ecx movl %ecx,96(%edx) - jmp L100good_key + jmp L102good_key .align 4,0x90 -L09412rounds: +L09612rounds: movq 16(%eax),%xmm2 cmpl $268435456,%ebp - je L10212rounds_alt + je L10412rounds_alt movl $11,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,202,1 - call L103key_192a_cold + call L105key_192a_cold .byte 102,15,58,223,202,2 - call L104key_192b + call L106key_192b .byte 102,15,58,223,202,4 - call L105key_192a + call L107key_192a .byte 102,15,58,223,202,8 - call L104key_192b + call L106key_192b .byte 102,15,58,223,202,16 - call L105key_192a + call L107key_192a .byte 102,15,58,223,202,32 - call L104key_192b + call L106key_192b .byte 102,15,58,223,202,64 - call L105key_192a + call L107key_192a .byte 102,15,58,223,202,128 - call L104key_192b + call L106key_192b movups %xmm0,(%edx) movl %ecx,48(%edx) - jmp 
L100good_key + jmp L102good_key .align 4,0x90 -L105key_192a: +L107key_192a: movups %xmm0,(%edx) leal 16(%edx),%edx .align 4,0x90 -L103key_192a_cold: +L105key_192a_cold: movaps %xmm2,%xmm5 -L106key_192b_warm: +L108key_192b_warm: shufps $16,%xmm0,%xmm4 movdqa %xmm2,%xmm3 xorps %xmm4,%xmm0 @@ -2199,21 +2231,21 @@ L106key_192b_warm: pxor %xmm3,%xmm2 ret .align 4,0x90 -L104key_192b: +L106key_192b: movaps %xmm0,%xmm3 shufps $68,%xmm0,%xmm5 movups %xmm5,(%edx) shufps $78,%xmm2,%xmm3 movups %xmm3,16(%edx) leal 32(%edx),%edx - jmp L106key_192b_warm + jmp L108key_192b_warm .align 4,0x90 -L10212rounds_alt: +L10412rounds_alt: movdqa 16(%ebx),%xmm5 movdqa 32(%ebx),%xmm4 movl $8,%ecx movdqu %xmm0,-16(%edx) -L107loop_key192: +L109loop_key192: movq %xmm2,(%edx) movdqa %xmm2,%xmm1 .byte 102,15,56,0,213 @@ -2235,54 +2267,54 @@ L107loop_key192: pxor %xmm3,%xmm2 movdqu %xmm0,-16(%edx) decl %ecx - jnz L107loop_key192 + jnz L109loop_key192 movl $11,%ecx movl %ecx,32(%edx) - jmp L100good_key + jmp L102good_key .align 4,0x90 -L09314rounds: +L09514rounds: movups 16(%eax),%xmm2 leal 16(%edx),%edx cmpl $268435456,%ebp - je L10814rounds_alt + je L11014rounds_alt movl $13,%ecx movups %xmm0,-32(%edx) movups %xmm2,-16(%edx) .byte 102,15,58,223,202,1 - call L109key_256a_cold + call L111key_256a_cold .byte 102,15,58,223,200,1 - call L110key_256b + call L112key_256b .byte 102,15,58,223,202,2 - call L111key_256a + call L113key_256a .byte 102,15,58,223,200,2 - call L110key_256b + call L112key_256b .byte 102,15,58,223,202,4 - call L111key_256a + call L113key_256a .byte 102,15,58,223,200,4 - call L110key_256b + call L112key_256b .byte 102,15,58,223,202,8 - call L111key_256a + call L113key_256a .byte 102,15,58,223,200,8 - call L110key_256b + call L112key_256b .byte 102,15,58,223,202,16 - call L111key_256a + call L113key_256a .byte 102,15,58,223,200,16 - call L110key_256b + call L112key_256b .byte 102,15,58,223,202,32 - call L111key_256a + call L113key_256a .byte 102,15,58,223,200,32 - call L110key_256b + call L112key_256b .byte 102,15,58,223,202,64 - call L111key_256a + call L113key_256a movups %xmm0,(%edx) movl %ecx,16(%edx) xorl %eax,%eax - jmp L100good_key + jmp L102good_key .align 4,0x90 -L111key_256a: +L113key_256a: movups %xmm2,(%edx) leal 16(%edx),%edx -L109key_256a_cold: +L111key_256a_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -2291,7 +2323,7 @@ L109key_256a_cold: xorps %xmm1,%xmm0 ret .align 4,0x90 -L110key_256b: +L112key_256b: movups %xmm0,(%edx) leal 16(%edx),%edx shufps $16,%xmm2,%xmm4 @@ -2302,14 +2334,14 @@ L110key_256b: xorps %xmm1,%xmm2 ret .align 4,0x90 -L10814rounds_alt: +L11014rounds_alt: movdqa (%ebx),%xmm5 movdqa 32(%ebx),%xmm4 movl $7,%ecx movdqu %xmm0,-32(%edx) movdqa %xmm2,%xmm1 movdqu %xmm2,-16(%edx) -L112loop_key256: +L114loop_key256: .byte 102,15,56,0,213 .byte 102,15,56,221,212 movdqa %xmm0,%xmm3 @@ -2323,7 +2355,7 @@ L112loop_key256: pxor %xmm2,%xmm0 movdqu %xmm0,(%edx) decl %ecx - jz L113done_key256 + jz L115done_key256 pshufd $255,%xmm0,%xmm2 pxor %xmm3,%xmm3 .byte 102,15,56,221,211 @@ -2338,11 +2370,11 @@ L112loop_key256: movdqu %xmm2,16(%edx) leal 32(%edx),%edx movdqa %xmm2,%xmm1 - jmp L112loop_key256 -L113done_key256: + jmp L114loop_key256 +L115done_key256: movl $13,%ecx movl %ecx,16(%edx) -L100good_key: +L102good_key: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 @@ -2354,33 +2386,45 @@ L100good_key: popl %ebp ret .align 2,0x90 -L091bad_pointer: +L093bad_pointer: movl $-1,%eax popl %ebx popl %ebp ret .align 2,0x90 -L095bad_keybits: +L097bad_keybits: pxor 
%xmm0,%xmm0 movl $-2,%eax popl %ebx popl %ebp ret -.globl _aesni_set_encrypt_key -.private_extern _aesni_set_encrypt_key +.globl _aes_hw_set_encrypt_key +.private_extern _aes_hw_set_encrypt_key .align 4 -_aesni_set_encrypt_key: -L_aesni_set_encrypt_key_begin: +_aes_hw_set_encrypt_key: +L_aes_hw_set_encrypt_key_begin: +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call L116pic +L116pic: + popl %ebx + leal _BORINGSSL_function_hit+3-L116pic(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif movl 4(%esp),%eax movl 8(%esp),%ecx movl 12(%esp),%edx call __aesni_set_encrypt_key ret -.globl _aesni_set_decrypt_key -.private_extern _aesni_set_decrypt_key +.globl _aes_hw_set_decrypt_key +.private_extern _aes_hw_set_decrypt_key .align 4 -_aesni_set_decrypt_key: -L_aesni_set_decrypt_key_begin: +_aes_hw_set_decrypt_key: +L_aes_hw_set_decrypt_key_begin: movl 4(%esp),%eax movl 8(%esp),%ecx movl 12(%esp),%edx @@ -2388,7 +2432,7 @@ L_aesni_set_decrypt_key_begin: movl 12(%esp),%edx shll $4,%ecx testl %eax,%eax - jnz L114dec_key_ret + jnz L117dec_key_ret leal 16(%edx,%ecx,1),%eax movups (%edx),%xmm0 movups (%eax),%xmm1 @@ -2396,7 +2440,7 @@ L_aesni_set_decrypt_key_begin: movups %xmm1,(%edx) leal 16(%edx),%edx leal -16(%eax),%eax -L115dec_key_inverse: +L118dec_key_inverse: movups (%edx),%xmm0 movups (%eax),%xmm1 .byte 102,15,56,219,192 @@ -2406,14 +2450,14 @@ L115dec_key_inverse: movups %xmm0,16(%eax) movups %xmm1,-16(%edx) cmpl %edx,%eax - ja L115dec_key_inverse + ja L118dec_key_inverse movups (%edx),%xmm0 .byte 102,15,56,219,192 movups %xmm0,(%edx) pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 xorl %eax,%eax -L114dec_key_ret: +L117dec_key_ret: ret .align 6,0x90 Lkey_const: diff --git a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/bn-586.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/bn-586.S index d1be040546..7d0462b51f 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/bn-586.S +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/bn-586.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl _bn_mul_add_words .private_extern _bn_mul_add_words diff --git a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/co-586.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/co-586.S index 858ba3743e..578ca70b0c 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/co-586.S +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/co-586.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl _bn_mul_comba8 .private_extern _bn_mul_comba8 diff --git a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/ghash-ssse3-x86.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/ghash-ssse3-x86.S new file mode 100644 index 0000000000..f059e2839a --- /dev/null +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/ghash-ssse3-x86.S @@ -0,0 +1,289 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text +.globl _gcm_gmult_ssse3 +.private_extern _gcm_gmult_ssse3 +.align 4 +_gcm_gmult_ssse3: +L_gcm_gmult_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%edi + movl 24(%esp),%esi + movdqu (%edi),%xmm0 + call L000pic_point +L000pic_point: + popl %eax + movdqa Lreverse_bytes-L000pic_point(%eax),%xmm7 + movdqa Llow4_mask-L000pic_point(%eax),%xmm2 +.byte 102,15,56,0,199 + movdqa %xmm2,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm0 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + movl $5,%eax +L001loop_row_1: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz L001loop_row_1 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movl $5,%eax +L002loop_row_2: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz L002loop_row_2 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movl $6,%eax +L003loop_row_3: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz L003loop_row_3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 +.byte 102,15,56,0,215 + movdqu %xmm2,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _gcm_ghash_ssse3 +.private_extern _gcm_ghash_ssse3 +.align 4 +_gcm_ghash_ssse3: +L_gcm_ghash_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%edx + movl 32(%esp),%ecx + movdqu (%edi),%xmm0 + call L004pic_point +L004pic_point: + popl %ebx + movdqa Lreverse_bytes-L004pic_point(%ebx),%xmm7 + andl $-16,%ecx +.byte 102,15,56,0,199 + pxor %xmm3,%xmm3 +L005loop_ghash: + movdqa Llow4_mask-L004pic_point(%ebx),%xmm2 + movdqu (%edx),%xmm1 +.byte 102,15,56,0,207 + pxor %xmm1,%xmm0 + movdqa %xmm2,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm0 + pxor %xmm2,%xmm2 + movl $5,%eax +L006loop_row_4: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq 
$60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz L006loop_row_4 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movl $5,%eax +L007loop_row_5: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz L007loop_row_5 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movl $6,%eax +L008loop_row_6: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz L008loop_row_6 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movdqa %xmm2,%xmm0 + leal -256(%esi),%esi + leal 16(%edx),%edx + subl $16,%ecx + jnz L005loop_ghash +.byte 102,15,56,0,199 + movdqu %xmm0,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 4,0x90 +Lreverse_bytes: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.align 4,0x90 +Llow4_mask: +.long 252645135,252645135,252645135,252645135 +#endif diff --git a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/ghash-x86.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/ghash-x86.S index 320cd42b1a..e13bf3e858 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/ghash-x86.S +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/ghash-x86.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl _gcm_gmult_4bit_mmx .private_extern _gcm_gmult_4bit_mmx diff --git a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/md5-586.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/md5-586.S index 795e42e5c8..391acbd123 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/md5-586.S +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/md5-586.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl _md5_block_asm_data_order .private_extern _md5_block_asm_data_order diff --git a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/sha1-586.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/sha1-586.S index efb6f52e32..89c5d168e5 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/sha1-586.S +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/sha1-586.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl _sha1_block_data_order .private_extern _sha1_block_data_order diff --git a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/sha256-586.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/sha256-586.S index 7f15397e15..a974488943 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/sha256-586.S +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/sha256-586.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl _sha256_block_data_order .private_extern _sha256_block_data_order diff --git a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/sha512-586.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/sha512-586.S index f65cb1086a..a08e6ef5d7 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/sha512-586.S +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/sha512-586.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl _sha512_block_data_order .private_extern _sha512_block_data_order diff --git a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/vpaes-x86.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/vpaes-x86.S index f49e9f0a81..6b5a88b304 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/vpaes-x86.S +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/vpaes-x86.S @@ -1,5 +1,13 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text +#ifdef BORINGSSL_DISPATCH_TEST +#endif .align 6,0x90 L_vpaes_consts: .long 218628480,235210255,168496130,67568393 @@ -460,6 +468,18 @@ L_vpaes_set_encrypt_key_begin: pushl %ebx pushl %esi pushl %edi +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call L016pic +L016pic: + popl %ebx + leal _BORINGSSL_function_hit+5-L016pic(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif movl 20(%esp),%esi leal -56(%esp),%ebx movl 24(%esp),%eax @@ -473,9 +493,9 @@ L_vpaes_set_encrypt_key_begin: movl %ebx,240(%edx) movl $48,%ecx movl $0,%edi - leal L_vpaes_consts+0x30-L016pic_point,%ebp + leal L_vpaes_consts+0x30-L017pic_point,%ebp call __vpaes_schedule_core -L016pic_point: +L017pic_point: movl 48(%esp),%esp xorl %eax,%eax popl %edi @@ -510,9 +530,9 @@ L_vpaes_set_decrypt_key_begin: shrl $1,%ecx andl $32,%ecx xorl $32,%ecx - leal L_vpaes_consts+0x30-L017pic_point,%ebp + leal L_vpaes_consts+0x30-L018pic_point,%ebp call __vpaes_schedule_core -L017pic_point: +L018pic_point: movl 48(%esp),%esp xorl %eax,%eax popl %edi @@ -529,9 +549,21 @@ L_vpaes_encrypt_begin: pushl %ebx pushl %esi pushl %edi - leal L_vpaes_consts+0x30-L018pic_point,%ebp +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call L019pic +L019pic: + popl %ebx + leal _BORINGSSL_function_hit+4-L019pic(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + leal L_vpaes_consts+0x30-L020pic_point,%ebp call __vpaes_preheat -L018pic_point: +L020pic_point: movl 20(%esp),%esi leal -56(%esp),%ebx movl 24(%esp),%edi @@ -557,9 +589,9 @@ L_vpaes_decrypt_begin: pushl %ebx pushl %esi pushl %edi - leal L_vpaes_consts+0x30-L019pic_point,%ebp + leal L_vpaes_consts+0x30-L021pic_point,%ebp call __vpaes_preheat -L019pic_point: +L021pic_point: movl 20(%esp),%esi leal -56(%esp),%ebx movl 24(%esp),%edi @@ -590,7 +622,7 @@ L_vpaes_cbc_encrypt_begin: movl 28(%esp),%eax movl 32(%esp),%edx subl $16,%eax - jc L020cbc_abort + jc L022cbc_abort leal -56(%esp),%ebx movl 36(%esp),%ebp andl $-16,%ebx @@ -603,14 +635,14 @@ L_vpaes_cbc_encrypt_begin: movl %edx,4(%esp) movl %ebp,8(%esp) movl %eax,%edi - leal L_vpaes_consts+0x30-L021pic_point,%ebp + leal L_vpaes_consts+0x30-L023pic_point,%ebp call __vpaes_preheat -L021pic_point: +L023pic_point: cmpl $0,%ecx - je L022cbc_dec_loop - jmp L023cbc_enc_loop + je L024cbc_dec_loop + jmp L025cbc_enc_loop .align 4,0x90 -L023cbc_enc_loop: +L025cbc_enc_loop: movdqu (%esi),%xmm0 pxor %xmm1,%xmm0 call __vpaes_encrypt_core @@ -620,10 +652,10 @@ L023cbc_enc_loop: movdqu %xmm0,(%ebx,%esi,1) leal 16(%esi),%esi subl $16,%edi - jnc L023cbc_enc_loop - jmp L024cbc_done + jnc L025cbc_enc_loop + jmp L026cbc_done .align 4,0x90 -L022cbc_dec_loop: +L024cbc_dec_loop: movdqu (%esi),%xmm0 movdqa %xmm1,16(%esp) movdqa %xmm0,32(%esp) @@ -635,12 +667,12 @@ L022cbc_dec_loop: movdqu %xmm0,(%ebx,%esi,1) leal 16(%esi),%esi subl $16,%edi - jnc L022cbc_dec_loop -L024cbc_done: + jnc L024cbc_dec_loop +L026cbc_done: movl 8(%esp),%ebx movl 48(%esp),%esp movdqu %xmm1,(%ebx) -L020cbc_abort: +L022cbc_abort: popl %edi popl %esi popl %ebx diff --git a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/x86-mont.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/x86-mont.S index e7353ae252..3ef8774ed5 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/x86-mont.S +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/x86-mont.S @@ -1,4 +1,10 @@ +# This file is generated from a similarly-named Perl 
script in the BoringSSL +# source tree. Do not edit by hand. + #if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl _bn_mul_mont .private_extern _bn_mul_mont @@ -445,16 +451,18 @@ L017sub: leal 1(%edx),%edx jge L017sub sbbl $0,%eax - andl %eax,%esi - notl %eax - movl %edi,%ebp - andl %eax,%ebp - orl %ebp,%esi + movl $-1,%edx + xorl %eax,%edx + jmp L018copy .align 4,0x90 L018copy: - movl (%esi,%ebx,4),%eax - movl %eax,(%edi,%ebx,4) + movl 32(%esp,%ebx,4),%esi + movl (%edi,%ebx,4),%ebp movl %ecx,32(%esp,%ebx,4) + andl %eax,%esi + andl %edx,%ebp + orl %esi,%ebp + movl %ebp,(%edi,%ebx,4) decl %ebx jge L018copy movl 24(%esp),%esp diff --git a/packager/third_party/boringssl/mac-x86/crypto/test/trampoline-x86.S b/packager/third_party/boringssl/mac-x86/crypto/test/trampoline-x86.S new file mode 100644 index 0000000000..601f2f0151 --- /dev/null +++ b/packager/third_party/boringssl/mac-x86/crypto/test/trampoline-x86.S @@ -0,0 +1,169 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__i386__) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text +.globl _abi_test_trampoline +.private_extern _abi_test_trampoline +.align 4 +_abi_test_trampoline: +L_abi_test_trampoline_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 24(%esp),%ecx + movl (%ecx),%esi + movl 4(%ecx),%edi + movl 8(%ecx),%ebx + movl 12(%ecx),%ebp + subl $44,%esp + movl 72(%esp),%eax + xorl %ecx,%ecx +L000loop: + cmpl 76(%esp),%ecx + jae L001loop_done + movl (%eax,%ecx,4),%edx + movl %edx,(%esp,%ecx,4) + addl $1,%ecx + jmp L000loop +L001loop_done: + call *64(%esp) + addl $44,%esp + movl 24(%esp),%ecx + movl %esi,(%ecx) + movl %edi,4(%ecx) + movl %ebx,8(%ecx) + movl %ebp,12(%ecx) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _abi_test_get_and_clear_direction_flag +.private_extern _abi_test_get_and_clear_direction_flag +.align 4 +_abi_test_get_and_clear_direction_flag: +L_abi_test_get_and_clear_direction_flag_begin: + pushfl + popl %eax + andl $1024,%eax + shrl $10,%eax + cld + ret +.globl _abi_test_set_direction_flag +.private_extern _abi_test_set_direction_flag +.align 4 +_abi_test_set_direction_flag: +L_abi_test_set_direction_flag_begin: + std + ret +.globl _abi_test_clobber_eax +.private_extern _abi_test_clobber_eax +.align 4 +_abi_test_clobber_eax: +L_abi_test_clobber_eax_begin: + xorl %eax,%eax + ret +.globl _abi_test_clobber_ebx +.private_extern _abi_test_clobber_ebx +.align 4 +_abi_test_clobber_ebx: +L_abi_test_clobber_ebx_begin: + xorl %ebx,%ebx + ret +.globl _abi_test_clobber_ecx +.private_extern _abi_test_clobber_ecx +.align 4 +_abi_test_clobber_ecx: +L_abi_test_clobber_ecx_begin: + xorl %ecx,%ecx + ret +.globl _abi_test_clobber_edx +.private_extern _abi_test_clobber_edx +.align 4 +_abi_test_clobber_edx: +L_abi_test_clobber_edx_begin: + xorl %edx,%edx + ret +.globl _abi_test_clobber_edi +.private_extern _abi_test_clobber_edi +.align 4 +_abi_test_clobber_edi: +L_abi_test_clobber_edi_begin: + xorl %edi,%edi + ret +.globl _abi_test_clobber_esi +.private_extern _abi_test_clobber_esi +.align 4 +_abi_test_clobber_esi: +L_abi_test_clobber_esi_begin: + xorl %esi,%esi + ret +.globl _abi_test_clobber_ebp +.private_extern _abi_test_clobber_ebp +.align 4 +_abi_test_clobber_ebp: +L_abi_test_clobber_ebp_begin: + xorl %ebp,%ebp + ret +.globl _abi_test_clobber_xmm0 +.private_extern _abi_test_clobber_xmm0 +.align 4 +_abi_test_clobber_xmm0: +L_abi_test_clobber_xmm0_begin: + pxor %xmm0,%xmm0 + ret +.globl 
_abi_test_clobber_xmm1 +.private_extern _abi_test_clobber_xmm1 +.align 4 +_abi_test_clobber_xmm1: +L_abi_test_clobber_xmm1_begin: + pxor %xmm1,%xmm1 + ret +.globl _abi_test_clobber_xmm2 +.private_extern _abi_test_clobber_xmm2 +.align 4 +_abi_test_clobber_xmm2: +L_abi_test_clobber_xmm2_begin: + pxor %xmm2,%xmm2 + ret +.globl _abi_test_clobber_xmm3 +.private_extern _abi_test_clobber_xmm3 +.align 4 +_abi_test_clobber_xmm3: +L_abi_test_clobber_xmm3_begin: + pxor %xmm3,%xmm3 + ret +.globl _abi_test_clobber_xmm4 +.private_extern _abi_test_clobber_xmm4 +.align 4 +_abi_test_clobber_xmm4: +L_abi_test_clobber_xmm4_begin: + pxor %xmm4,%xmm4 + ret +.globl _abi_test_clobber_xmm5 +.private_extern _abi_test_clobber_xmm5 +.align 4 +_abi_test_clobber_xmm5: +L_abi_test_clobber_xmm5_begin: + pxor %xmm5,%xmm5 + ret +.globl _abi_test_clobber_xmm6 +.private_extern _abi_test_clobber_xmm6 +.align 4 +_abi_test_clobber_xmm6: +L_abi_test_clobber_xmm6_begin: + pxor %xmm6,%xmm6 + ret +.globl _abi_test_clobber_xmm7 +.private_extern _abi_test_clobber_xmm7 +.align 4 +_abi_test_clobber_xmm7: +L_abi_test_clobber_xmm7_begin: + pxor %xmm7,%xmm7 + ret +#endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/chacha/chacha-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/chacha/chacha-x86_64.S index 30edc7b5e2..10b1ad9520 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/chacha/chacha-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/chacha/chacha-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text @@ -37,6 +49,7 @@ L$sixteen: .p2align 6 _ChaCha20_ctr32: + cmpq $0,%rdx je L$no_data movq _OPENSSL_ia32cap_P+4(%rip),%r10 @@ -44,12 +57,19 @@ _ChaCha20_ctr32: jnz L$ChaCha20_ssse3 pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $64+24,%rsp + L$ctr32_body: @@ -290,20 +310,30 @@ L$oop_tail: L$done: leaq 64+24+48(%rsp),%rsi movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$no_data: .byte 0xf3,0xc3 + .p2align 5 ChaCha20_ssse3: L$ChaCha20_ssse3: + movq %rsp,%r9 + cmpq $128,%rdx ja L$ChaCha20_4x @@ -429,14 +459,18 @@ L$oop_tail_ssse3: L$done_ssse3: leaq (%r9),%rsp + L$ssse3_epilogue: .byte 0xf3,0xc3 + .p2align 5 ChaCha20_4x: L$ChaCha20_4x: + movq %rsp,%r9 + movq %r10,%r11 shrq $32,%r10 testq $32,%r10 @@ -977,14 +1011,18 @@ L$oop_tail4x: L$done4x: leaq (%r9),%rsp + L$4x_epilogue: .byte 0xf3,0xc3 + .p2align 5 ChaCha20_8x: L$ChaCha20_8x: + movq %rsp,%r9 + subq $0x280+8,%rsp andq $-32,%rsp vzeroupper @@ -1579,7 +1617,9 @@ L$oop_tail8x: L$done8x: vzeroall leaq (%r9),%rsp + L$8x_epilogue: .byte 0xf3,0xc3 + #endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S index c8a5262c8d..0c921b37b5 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .data .p2align 4 diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S index c90447ac45..e50227ae38 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aes-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aes-x86_64.S index c7c4829fa0..8875d0abbb 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aes-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aes-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .p2align 4 @@ -156,6 +168,7 @@ L$enc_loop: .p2align 4 _x86_64_AES_encrypt_compact: + leaq 128(%r14),%r8 movl 0-128(%r8),%edi movl 32-128(%r8),%ebp @@ -326,21 +339,30 @@ L$enc_compact_done: xorl 12(%r15),%edx .byte 0xf3,0xc3 -.p2align 4 -.globl _asm_AES_encrypt -.private_extern _asm_AES_encrypt -.private_extern _asm_AES_encrypt -_asm_AES_encrypt: +.p2align 4 +.globl _aes_nohw_encrypt +.private_extern _aes_nohw_encrypt + +.private_extern _aes_nohw_encrypt +_aes_nohw_encrypt: + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + leaq -63(%rdx),%rcx andq $-64,%rsp subq %rsp,%rcx @@ -351,6 +373,7 @@ _asm_AES_encrypt: movq %rsi,16(%rsp) movq %rax,24(%rsp) + L$enc_prologue: movq %rdx,%r15 @@ -377,22 +400,31 @@ L$enc_prologue: movq 16(%rsp),%r9 movq 24(%rsp),%rsi + movl %eax,0(%r9) movl %ebx,4(%r9) movl %ecx,8(%r9) movl %edx,12(%r9) movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$enc_epilogue: .byte 0xf3,0xc3 + .p2align 4 _x86_64_AES_decrypt: xorl 0(%r15),%eax @@ -550,6 +582,7 @@ L$dec_loop: .p2align 4 _x86_64_AES_decrypt_compact: + leaq 128(%r14),%r8 movl 0-128(%r8),%edi movl 32-128(%r8),%ebp @@ -772,21 +805,30 @@ L$dec_compact_done: xorl 12(%r15),%edx .byte 0xf3,0xc3 -.p2align 4 -.globl _asm_AES_decrypt -.private_extern _asm_AES_decrypt -.private_extern _asm_AES_decrypt -_asm_AES_decrypt: +.p2align 4 +.globl _aes_nohw_decrypt +.private_extern _aes_nohw_decrypt + +.private_extern _aes_nohw_decrypt +_aes_nohw_decrypt: + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + leaq -63(%rdx),%rcx andq $-64,%rsp subq %rsp,%rcx @@ -797,6 +839,7 @@ _asm_AES_decrypt: movq 
%rsi,16(%rsp) movq %rax,24(%rsp) + L$dec_prologue: movq %rdx,%r15 @@ -825,47 +868,69 @@ L$dec_prologue: movq 16(%rsp),%r9 movq 24(%rsp),%rsi + movl %eax,0(%r9) movl %ebx,4(%r9) movl %ecx,8(%r9) movl %edx,12(%r9) movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$dec_epilogue: .byte 0xf3,0xc3 -.p2align 4 -.globl _asm_AES_set_encrypt_key -.private_extern _asm_AES_set_encrypt_key -_asm_AES_set_encrypt_key: +.p2align 4 +.globl _aes_nohw_set_encrypt_key +.private_extern _aes_nohw_set_encrypt_key + +_aes_nohw_set_encrypt_key: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $8,%rsp + L$enc_key_prologue: call _x86_64_AES_set_encrypt_key movq 40(%rsp),%rbp + movq 48(%rsp),%rbx + addq $56,%rsp + L$enc_key_epilogue: .byte 0xf3,0xc3 + .p2align 4 _x86_64_AES_set_encrypt_key: + movl %esi,%ecx movq %rdi,%rsi movq %rdx,%rdi @@ -1102,18 +1167,27 @@ L$badpointer: L$exit: .byte 0xf3,0xc3 -.p2align 4 -.globl _asm_AES_set_decrypt_key -.private_extern _asm_AES_set_decrypt_key -_asm_AES_set_decrypt_key: +.p2align 4 +.globl _aes_nohw_set_decrypt_key +.private_extern _aes_nohw_set_decrypt_key + +_aes_nohw_set_decrypt_key: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rdx + L$dec_key_prologue: call _x86_64_AES_set_encrypt_key @@ -1281,31 +1355,49 @@ L$permute: xorq %rax,%rax L$abort: movq 8(%rsp),%r15 + movq 16(%rsp),%r14 + movq 24(%rsp),%r13 + movq 32(%rsp),%r12 + movq 40(%rsp),%rbp + movq 48(%rsp),%rbx + addq $56,%rsp + L$dec_key_epilogue: .byte 0xf3,0xc3 + .p2align 4 -.globl _asm_AES_cbc_encrypt -.private_extern _asm_AES_cbc_encrypt +.globl _aes_nohw_cbc_encrypt +.private_extern _aes_nohw_cbc_encrypt -.private_extern _asm_AES_cbc_encrypt -_asm_AES_cbc_encrypt: +.private_extern _aes_nohw_cbc_encrypt +_aes_nohw_cbc_encrypt: + cmpq $0,%rdx je L$cbc_epilogue pushfq + + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + L$cbc_prologue: cld @@ -1316,6 +1408,7 @@ L$cbc_prologue: cmpq $0,%r9 cmoveq %r10,%r14 + leaq _OPENSSL_ia32cap_P(%rip),%r10 movl (%r10),%r10d cmpq $512,%rdx @@ -1352,7 +1445,9 @@ L$cbc_te_ok: xchgq %rsp,%r15 + movq %r15,16(%rsp) + L$cbc_fast_body: movq %rdi,24(%rsp) movq %rsi,32(%rsp) @@ -1551,6 +1646,7 @@ L$cbc_fast_cleanup: .p2align 4 L$cbc_slow_prologue: + leaq -88(%rsp),%rbp andq $-64,%rbp @@ -1562,7 +1658,9 @@ L$cbc_slow_prologue: xchgq %rsp,%rbp + movq %rbp,16(%rsp) + L$cbc_slow_body: @@ -1734,18 +1832,30 @@ L$cbc_slow_dec_partial: .p2align 4 L$cbc_exit: movq 16(%rsp),%rsi + movq (%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp + L$cbc_popfq: popfq + + + L$cbc_epilogue: .byte 0xf3,0xc3 + .p2align 6 L$AES_Te: .long 0xa56363c6,0xa56363c6 diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S index 2513904cf1..b08a2fbbf9 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text @@ -544,6 +556,10 @@ L$handle_ctr32_2: .p2align 5 _aesni_gcm_encrypt: +#ifdef BORINGSSL_DISPATCH_TEST + + movb $1,_BORINGSSL_function_hit+2(%rip) +#endif xorq %r10,%r10 diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aesni-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aesni-x86_64.S index 4ee0dc49c2..58e072ee1b 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aesni-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aesni-x86_64.S @@ -1,11 +1,28 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text -.globl _aesni_encrypt -.private_extern _aesni_encrypt +.globl _aes_hw_encrypt +.private_extern _aes_hw_encrypt .p2align 4 -_aesni_encrypt: +_aes_hw_encrypt: + +#ifdef BORINGSSL_DISPATCH_TEST + + movb $1,_BORINGSSL_function_hit+1(%rip) +#endif movups (%rdi),%xmm2 movl 240(%rdx),%eax movups (%rdx),%xmm0 @@ -26,11 +43,13 @@ L$oop_enc1_1: .byte 0xf3,0xc3 -.globl _aesni_decrypt -.private_extern _aesni_decrypt + +.globl _aes_hw_decrypt +.private_extern _aes_hw_decrypt .p2align 4 -_aesni_decrypt: +_aes_hw_decrypt: + movups (%rdi),%xmm2 movl 240(%rdx),%eax movups (%rdx),%xmm0 @@ -51,8 +70,10 @@ L$oop_dec1_2: .byte 0xf3,0xc3 + .p2align 4 _aesni_encrypt2: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -80,8 +101,10 @@ L$enc_loop2: .byte 0xf3,0xc3 + .p2align 4 _aesni_decrypt2: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -109,8 +132,10 @@ L$dec_loop2: .byte 0xf3,0xc3 + .p2align 4 _aesni_encrypt3: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -143,8 +168,10 @@ L$enc_loop3: .byte 0xf3,0xc3 + .p2align 4 _aesni_decrypt3: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -177,8 +204,10 @@ L$dec_loop3: .byte 0xf3,0xc3 + .p2align 4 _aesni_encrypt4: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -217,8 +246,10 @@ L$enc_loop4: .byte 0xf3,0xc3 + .p2align 4 _aesni_decrypt4: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -257,8 +288,10 @@ L$dec_loop4: .byte 0xf3,0xc3 + .p2align 4 _aesni_encrypt6: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -311,8 +344,10 @@ L$enc_loop6_enter: .byte 0xf3,0xc3 + .p2align 4 _aesni_decrypt6: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -365,8 +400,10 @@ L$dec_loop6_enter: .byte 0xf3,0xc3 + .p2align 4 _aesni_encrypt8: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -429,8 +466,10 @@ L$enc_loop8_enter: .byte 0xf3,0xc3 + .p2align 4 _aesni_decrypt8: + movups (%rcx),%xmm0 shll $4,%eax movups 16(%rcx),%xmm1 @@ -492,11 +531,13 @@ L$dec_loop8_enter: .byte 102,68,15,56,223,200 .byte 0xf3,0xc3 -.globl _aesni_ecb_encrypt -.private_extern _aesni_ecb_encrypt + +.globl _aes_hw_ecb_encrypt +.private_extern _aes_hw_ecb_encrypt .p2align 4 -_aesni_ecb_encrypt: +_aes_hw_ecb_encrypt: + andq $-16,%rdx jz L$ecb_ret @@ -835,173 +876,16 @@ L$ecb_ret: pxor %xmm1,%xmm1 .byte 0xf3,0xc3 -.globl _aesni_ccm64_encrypt_blocks -.private_extern 
_aesni_ccm64_encrypt_blocks + +.globl _aes_hw_ctr32_encrypt_blocks +.private_extern _aes_hw_ctr32_encrypt_blocks .p2align 4 -_aesni_ccm64_encrypt_blocks: - movl 240(%rcx),%eax - movdqu (%r8),%xmm6 - movdqa L$increment64(%rip),%xmm9 - movdqa L$bswap_mask(%rip),%xmm7 +_aes_hw_ctr32_encrypt_blocks: - shll $4,%eax - movl $16,%r10d - leaq 0(%rcx),%r11 - movdqu (%r9),%xmm3 - movdqa %xmm6,%xmm2 - leaq 32(%rcx,%rax,1),%rcx -.byte 102,15,56,0,247 - subq %rax,%r10 - jmp L$ccm64_enc_outer -.p2align 4 -L$ccm64_enc_outer: - movups (%r11),%xmm0 - movq %r10,%rax - movups (%rdi),%xmm8 - - xorps %xmm0,%xmm2 - movups 16(%r11),%xmm1 - xorps %xmm8,%xmm0 - xorps %xmm0,%xmm3 - movups 32(%r11),%xmm0 - -L$ccm64_enc2_loop: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 - movups -16(%rcx,%rax,1),%xmm0 - jnz L$ccm64_enc2_loop -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - paddq %xmm9,%xmm6 - decq %rdx -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 - - leaq 16(%rdi),%rdi - xorps %xmm2,%xmm8 - movdqa %xmm6,%xmm2 - movups %xmm8,(%rsi) -.byte 102,15,56,0,215 - leaq 16(%rsi),%rsi - jnz L$ccm64_enc_outer - - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - movups %xmm3,(%r9) - pxor %xmm3,%xmm3 - pxor %xmm8,%xmm8 - pxor %xmm6,%xmm6 - .byte 0xf3,0xc3 - -.globl _aesni_ccm64_decrypt_blocks -.private_extern _aesni_ccm64_decrypt_blocks - -.p2align 4 -_aesni_ccm64_decrypt_blocks: - movl 240(%rcx),%eax - movups (%r8),%xmm6 - movdqu (%r9),%xmm3 - movdqa L$increment64(%rip),%xmm9 - movdqa L$bswap_mask(%rip),%xmm7 - - movaps %xmm6,%xmm2 - movl %eax,%r10d - movq %rcx,%r11 -.byte 102,15,56,0,247 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -L$oop_enc1_5: -.byte 102,15,56,220,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz L$oop_enc1_5 -.byte 102,15,56,221,209 - shll $4,%r10d - movl $16,%eax - movups (%rdi),%xmm8 - paddq %xmm9,%xmm6 - leaq 16(%rdi),%rdi - subq %r10,%rax - leaq 32(%r11,%r10,1),%rcx - movq %rax,%r10 - jmp L$ccm64_dec_outer -.p2align 4 -L$ccm64_dec_outer: - xorps %xmm2,%xmm8 - movdqa %xmm6,%xmm2 - movups %xmm8,(%rsi) - leaq 16(%rsi),%rsi -.byte 102,15,56,0,215 - - subq $1,%rdx - jz L$ccm64_dec_break - - movups (%r11),%xmm0 - movq %r10,%rax - movups 16(%r11),%xmm1 - xorps %xmm0,%xmm8 - xorps %xmm0,%xmm2 - xorps %xmm8,%xmm3 - movups 32(%r11),%xmm0 - jmp L$ccm64_dec2_loop -.p2align 4 -L$ccm64_dec2_loop: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 - movups -16(%rcx,%rax,1),%xmm0 - jnz L$ccm64_dec2_loop - movups (%rdi),%xmm8 - paddq %xmm9,%xmm6 -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 - leaq 16(%rdi),%rdi - jmp L$ccm64_dec_outer - -.p2align 4 -L$ccm64_dec_break: - - movl 240(%r11),%eax - movups (%r11),%xmm0 - movups 16(%r11),%xmm1 - xorps %xmm0,%xmm8 - leaq 32(%r11),%r11 - xorps %xmm8,%xmm3 -L$oop_enc1_6: -.byte 102,15,56,220,217 - decl %eax - movups (%r11),%xmm1 - leaq 16(%r11),%r11 - jnz L$oop_enc1_6 -.byte 102,15,56,221,217 - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - movups %xmm3,(%r9) - pxor %xmm3,%xmm3 - pxor %xmm8,%xmm8 - pxor %xmm6,%xmm6 - .byte 0xf3,0xc3 - -.globl _aesni_ctr32_encrypt_blocks -.private_extern _aesni_ctr32_encrypt_blocks - -.p2align 4 -_aesni_ctr32_encrypt_blocks: +#ifdef BORINGSSL_DISPATCH_TEST + movb $1,_BORINGSSL_function_hit(%rip) +#endif cmpq $1,%rdx jne 
L$ctr32_bulk @@ -1014,12 +898,12 @@ _aesni_ctr32_encrypt_blocks: movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 -L$oop_enc1_7: +L$oop_enc1_5: .byte 102,15,56,220,209 decl %edx movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz L$oop_enc1_7 + jnz L$oop_enc1_5 .byte 102,15,56,221,209 pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 @@ -1032,7 +916,9 @@ L$oop_enc1_7: .p2align 4 L$ctr32_bulk: leaq (%rsp),%r11 + pushq %rbp + subq $128,%rsp andq $-16,%rsp @@ -1567,1798 +1453,19 @@ L$ctr32_done: movaps %xmm0,112(%rsp) pxor %xmm15,%xmm15 movq -8(%r11),%rbp + leaq (%r11),%rsp + L$ctr32_epilogue: .byte 0xf3,0xc3 -.globl _aesni_xts_encrypt -.private_extern _aesni_xts_encrypt + +.globl _aes_hw_cbc_encrypt +.private_extern _aes_hw_cbc_encrypt .p2align 4 -_aesni_xts_encrypt: - leaq (%rsp),%r11 - pushq %rbp - subq $112,%rsp - andq $-16,%rsp - movups (%r9),%xmm2 - movl 240(%r8),%eax - movl 240(%rcx),%r10d - movups (%r8),%xmm0 - movups 16(%r8),%xmm1 - leaq 32(%r8),%r8 - xorps %xmm0,%xmm2 -L$oop_enc1_8: -.byte 102,15,56,220,209 - decl %eax - movups (%r8),%xmm1 - leaq 16(%r8),%r8 - jnz L$oop_enc1_8 -.byte 102,15,56,221,209 - movups (%rcx),%xmm0 - movq %rcx,%rbp - movl %r10d,%eax - shll $4,%r10d - movq %rdx,%r9 - andq $-16,%rdx +_aes_hw_cbc_encrypt: - movups 16(%rcx,%r10,1),%xmm1 - - movdqa L$xts_magic(%rip),%xmm8 - movdqa %xmm2,%xmm15 - pshufd $0x5f,%xmm2,%xmm9 - pxor %xmm0,%xmm1 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm10 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm10 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm11 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm11 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm12 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm12 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm13 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm13 - pxor %xmm14,%xmm15 - movdqa %xmm15,%xmm14 - psrad $31,%xmm9 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pxor %xmm0,%xmm14 - pxor %xmm9,%xmm15 - movaps %xmm1,96(%rsp) - - subq $96,%rdx - jc L$xts_enc_short - - movl $16+96,%eax - leaq 32(%rbp,%r10,1),%rcx - subq %r10,%rax - movups 16(%rbp),%xmm1 - movq %rax,%r10 - leaq L$xts_magic(%rip),%r8 - jmp L$xts_enc_grandloop - -.p2align 5 -L$xts_enc_grandloop: - movdqu 0(%rdi),%xmm2 - movdqa %xmm0,%xmm8 - movdqu 16(%rdi),%xmm3 - pxor %xmm10,%xmm2 - movdqu 32(%rdi),%xmm4 - pxor %xmm11,%xmm3 -.byte 102,15,56,220,209 - movdqu 48(%rdi),%xmm5 - pxor %xmm12,%xmm4 -.byte 102,15,56,220,217 - movdqu 64(%rdi),%xmm6 - pxor %xmm13,%xmm5 -.byte 102,15,56,220,225 - movdqu 80(%rdi),%xmm7 - pxor %xmm15,%xmm8 - movdqa 96(%rsp),%xmm9 - pxor %xmm14,%xmm6 -.byte 102,15,56,220,233 - movups 32(%rbp),%xmm0 - leaq 96(%rdi),%rdi - pxor %xmm8,%xmm7 - - pxor %xmm9,%xmm10 -.byte 102,15,56,220,241 - pxor %xmm9,%xmm11 - movdqa %xmm10,0(%rsp) -.byte 102,15,56,220,249 - movups 48(%rbp),%xmm1 - pxor %xmm9,%xmm12 - -.byte 102,15,56,220,208 - pxor %xmm9,%xmm13 - movdqa %xmm11,16(%rsp) -.byte 102,15,56,220,216 - pxor %xmm9,%xmm14 - movdqa %xmm12,32(%rsp) -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - pxor %xmm9,%xmm8 - movdqa %xmm14,64(%rsp) -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 - movups 64(%rbp),%xmm0 - movdqa %xmm8,80(%rsp) - pshufd $0x5f,%xmm15,%xmm9 - jmp L$xts_enc_loop6 -.p2align 5 -L$xts_enc_loop6: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 
102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups -64(%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 - movups -80(%rcx,%rax,1),%xmm0 - jnz L$xts_enc_loop6 - - movdqa (%r8),%xmm8 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 -.byte 102,15,56,220,209 - paddq %xmm15,%xmm15 - psrad $31,%xmm14 -.byte 102,15,56,220,217 - pand %xmm8,%xmm14 - movups (%rbp),%xmm10 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 - pxor %xmm14,%xmm15 - movaps %xmm10,%xmm11 -.byte 102,15,56,220,249 - movups -64(%rcx),%xmm1 - - movdqa %xmm9,%xmm14 -.byte 102,15,56,220,208 - paddd %xmm9,%xmm9 - pxor %xmm15,%xmm10 -.byte 102,15,56,220,216 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - pand %xmm8,%xmm14 - movaps %xmm11,%xmm12 -.byte 102,15,56,220,240 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 -.byte 102,15,56,220,248 - movups -48(%rcx),%xmm0 - - paddd %xmm9,%xmm9 -.byte 102,15,56,220,209 - pxor %xmm15,%xmm11 - psrad $31,%xmm14 -.byte 102,15,56,220,217 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movdqa %xmm13,48(%rsp) - pxor %xmm14,%xmm15 -.byte 102,15,56,220,241 - movaps %xmm12,%xmm13 - movdqa %xmm9,%xmm14 -.byte 102,15,56,220,249 - movups -32(%rcx),%xmm1 - - paddd %xmm9,%xmm9 -.byte 102,15,56,220,208 - pxor %xmm15,%xmm12 - psrad $31,%xmm14 -.byte 102,15,56,220,216 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 -.byte 102,15,56,220,240 - pxor %xmm14,%xmm15 - movaps %xmm13,%xmm14 -.byte 102,15,56,220,248 - - movdqa %xmm9,%xmm0 - paddd %xmm9,%xmm9 -.byte 102,15,56,220,209 - pxor %xmm15,%xmm13 - psrad $31,%xmm0 -.byte 102,15,56,220,217 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm0 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - pxor %xmm0,%xmm15 - movups (%rbp),%xmm0 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups 16(%rbp),%xmm1 - - pxor %xmm15,%xmm14 -.byte 102,15,56,221,84,36,0 - psrad $31,%xmm9 - paddq %xmm15,%xmm15 -.byte 102,15,56,221,92,36,16 -.byte 102,15,56,221,100,36,32 - pand %xmm8,%xmm9 - movq %r10,%rax -.byte 102,15,56,221,108,36,48 -.byte 102,15,56,221,116,36,64 -.byte 102,15,56,221,124,36,80 - pxor %xmm9,%xmm15 - - leaq 96(%rsi),%rsi - movups %xmm2,-96(%rsi) - movups %xmm3,-80(%rsi) - movups %xmm4,-64(%rsi) - movups %xmm5,-48(%rsi) - movups %xmm6,-32(%rsi) - movups %xmm7,-16(%rsi) - subq $96,%rdx - jnc L$xts_enc_grandloop - - movl $16+96,%eax - subl %r10d,%eax - movq %rbp,%rcx - shrl $4,%eax - -L$xts_enc_short: - - movl %eax,%r10d - pxor %xmm0,%xmm10 - addq $96,%rdx - jz L$xts_enc_done - - pxor %xmm0,%xmm11 - cmpq $0x20,%rdx - jb L$xts_enc_one - pxor %xmm0,%xmm12 - je L$xts_enc_two - - pxor %xmm0,%xmm13 - cmpq $0x40,%rdx - jb L$xts_enc_three - pxor %xmm0,%xmm14 - je L$xts_enc_four - - movdqu (%rdi),%xmm2 - movdqu 16(%rdi),%xmm3 - movdqu 32(%rdi),%xmm4 - pxor %xmm10,%xmm2 - movdqu 48(%rdi),%xmm5 - pxor %xmm11,%xmm3 - movdqu 64(%rdi),%xmm6 - leaq 80(%rdi),%rdi - pxor %xmm12,%xmm4 - pxor %xmm13,%xmm5 - pxor %xmm14,%xmm6 - pxor %xmm7,%xmm7 - - call _aesni_encrypt6 - - xorps %xmm10,%xmm2 - movdqa %xmm15,%xmm10 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - movdqu %xmm2,(%rsi) - xorps %xmm13,%xmm5 - movdqu %xmm3,16(%rsi) - xorps %xmm14,%xmm6 - movdqu %xmm4,32(%rsi) - movdqu %xmm5,48(%rsi) - movdqu %xmm6,64(%rsi) - leaq 80(%rsi),%rsi - jmp L$xts_enc_done - -.p2align 4 -L$xts_enc_one: - 
movups (%rdi),%xmm2 - leaq 16(%rdi),%rdi - xorps %xmm10,%xmm2 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -L$oop_enc1_9: -.byte 102,15,56,220,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz L$oop_enc1_9 -.byte 102,15,56,221,209 - xorps %xmm10,%xmm2 - movdqa %xmm11,%xmm10 - movups %xmm2,(%rsi) - leaq 16(%rsi),%rsi - jmp L$xts_enc_done - -.p2align 4 -L$xts_enc_two: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - leaq 32(%rdi),%rdi - xorps %xmm10,%xmm2 - xorps %xmm11,%xmm3 - - call _aesni_encrypt2 - - xorps %xmm10,%xmm2 - movdqa %xmm12,%xmm10 - xorps %xmm11,%xmm3 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - leaq 32(%rsi),%rsi - jmp L$xts_enc_done - -.p2align 4 -L$xts_enc_three: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - movups 32(%rdi),%xmm4 - leaq 48(%rdi),%rdi - xorps %xmm10,%xmm2 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - - call _aesni_encrypt3 - - xorps %xmm10,%xmm2 - movdqa %xmm13,%xmm10 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - leaq 48(%rsi),%rsi - jmp L$xts_enc_done - -.p2align 4 -L$xts_enc_four: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - movups 32(%rdi),%xmm4 - xorps %xmm10,%xmm2 - movups 48(%rdi),%xmm5 - leaq 64(%rdi),%rdi - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - xorps %xmm13,%xmm5 - - call _aesni_encrypt4 - - pxor %xmm10,%xmm2 - movdqa %xmm14,%xmm10 - pxor %xmm11,%xmm3 - pxor %xmm12,%xmm4 - movdqu %xmm2,(%rsi) - pxor %xmm13,%xmm5 - movdqu %xmm3,16(%rsi) - movdqu %xmm4,32(%rsi) - movdqu %xmm5,48(%rsi) - leaq 64(%rsi),%rsi - jmp L$xts_enc_done - -.p2align 4 -L$xts_enc_done: - andq $15,%r9 - jz L$xts_enc_ret - movq %r9,%rdx - -L$xts_enc_steal: - movzbl (%rdi),%eax - movzbl -16(%rsi),%ecx - leaq 1(%rdi),%rdi - movb %al,-16(%rsi) - movb %cl,0(%rsi) - leaq 1(%rsi),%rsi - subq $1,%rdx - jnz L$xts_enc_steal - - subq %r9,%rsi - movq %rbp,%rcx - movl %r10d,%eax - - movups -16(%rsi),%xmm2 - xorps %xmm10,%xmm2 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -L$oop_enc1_10: -.byte 102,15,56,220,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz L$oop_enc1_10 -.byte 102,15,56,221,209 - xorps %xmm10,%xmm2 - movups %xmm2,-16(%rsi) - -L$xts_enc_ret: - xorps %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - movaps %xmm0,0(%rsp) - pxor %xmm8,%xmm8 - movaps %xmm0,16(%rsp) - pxor %xmm9,%xmm9 - movaps %xmm0,32(%rsp) - pxor %xmm10,%xmm10 - movaps %xmm0,48(%rsp) - pxor %xmm11,%xmm11 - movaps %xmm0,64(%rsp) - pxor %xmm12,%xmm12 - movaps %xmm0,80(%rsp) - pxor %xmm13,%xmm13 - movaps %xmm0,96(%rsp) - pxor %xmm14,%xmm14 - pxor %xmm15,%xmm15 - movq -8(%r11),%rbp - leaq (%r11),%rsp -L$xts_enc_epilogue: - .byte 0xf3,0xc3 - -.globl _aesni_xts_decrypt -.private_extern _aesni_xts_decrypt - -.p2align 4 -_aesni_xts_decrypt: - leaq (%rsp),%r11 - pushq %rbp - subq $112,%rsp - andq $-16,%rsp - movups (%r9),%xmm2 - movl 240(%r8),%eax - movl 240(%rcx),%r10d - movups (%r8),%xmm0 - movups 16(%r8),%xmm1 - leaq 32(%r8),%r8 - xorps %xmm0,%xmm2 -L$oop_enc1_11: -.byte 102,15,56,220,209 - decl %eax - movups (%r8),%xmm1 - leaq 16(%r8),%r8 - jnz L$oop_enc1_11 -.byte 102,15,56,221,209 - xorl %eax,%eax - testq $15,%rdx - setnz %al - shlq $4,%rax - subq %rax,%rdx - - movups (%rcx),%xmm0 - movq %rcx,%rbp - movl %r10d,%eax - shll $4,%r10d - movq %rdx,%r9 - andq $-16,%rdx - - movups 16(%rcx,%r10,1),%xmm1 - - movdqa L$xts_magic(%rip),%xmm8 - 
movdqa %xmm2,%xmm15 - pshufd $0x5f,%xmm2,%xmm9 - pxor %xmm0,%xmm1 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm10 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm10 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm11 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm11 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm12 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm12 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 - movdqa %xmm15,%xmm13 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 - pxor %xmm0,%xmm13 - pxor %xmm14,%xmm15 - movdqa %xmm15,%xmm14 - psrad $31,%xmm9 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pxor %xmm0,%xmm14 - pxor %xmm9,%xmm15 - movaps %xmm1,96(%rsp) - - subq $96,%rdx - jc L$xts_dec_short - - movl $16+96,%eax - leaq 32(%rbp,%r10,1),%rcx - subq %r10,%rax - movups 16(%rbp),%xmm1 - movq %rax,%r10 - leaq L$xts_magic(%rip),%r8 - jmp L$xts_dec_grandloop - -.p2align 5 -L$xts_dec_grandloop: - movdqu 0(%rdi),%xmm2 - movdqa %xmm0,%xmm8 - movdqu 16(%rdi),%xmm3 - pxor %xmm10,%xmm2 - movdqu 32(%rdi),%xmm4 - pxor %xmm11,%xmm3 -.byte 102,15,56,222,209 - movdqu 48(%rdi),%xmm5 - pxor %xmm12,%xmm4 -.byte 102,15,56,222,217 - movdqu 64(%rdi),%xmm6 - pxor %xmm13,%xmm5 -.byte 102,15,56,222,225 - movdqu 80(%rdi),%xmm7 - pxor %xmm15,%xmm8 - movdqa 96(%rsp),%xmm9 - pxor %xmm14,%xmm6 -.byte 102,15,56,222,233 - movups 32(%rbp),%xmm0 - leaq 96(%rdi),%rdi - pxor %xmm8,%xmm7 - - pxor %xmm9,%xmm10 -.byte 102,15,56,222,241 - pxor %xmm9,%xmm11 - movdqa %xmm10,0(%rsp) -.byte 102,15,56,222,249 - movups 48(%rbp),%xmm1 - pxor %xmm9,%xmm12 - -.byte 102,15,56,222,208 - pxor %xmm9,%xmm13 - movdqa %xmm11,16(%rsp) -.byte 102,15,56,222,216 - pxor %xmm9,%xmm14 - movdqa %xmm12,32(%rsp) -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 - pxor %xmm9,%xmm8 - movdqa %xmm14,64(%rsp) -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 - movups 64(%rbp),%xmm0 - movdqa %xmm8,80(%rsp) - pshufd $0x5f,%xmm15,%xmm9 - jmp L$xts_dec_loop6 -.p2align 5 -L$xts_dec_loop6: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups -64(%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 - movups -80(%rcx,%rax,1),%xmm0 - jnz L$xts_dec_loop6 - - movdqa (%r8),%xmm8 - movdqa %xmm9,%xmm14 - paddd %xmm9,%xmm9 -.byte 102,15,56,222,209 - paddq %xmm15,%xmm15 - psrad $31,%xmm14 -.byte 102,15,56,222,217 - pand %xmm8,%xmm14 - movups (%rbp),%xmm10 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 - pxor %xmm14,%xmm15 - movaps %xmm10,%xmm11 -.byte 102,15,56,222,249 - movups -64(%rcx),%xmm1 - - movdqa %xmm9,%xmm14 -.byte 102,15,56,222,208 - paddd %xmm9,%xmm9 - pxor %xmm15,%xmm10 -.byte 102,15,56,222,216 - psrad $31,%xmm14 - paddq %xmm15,%xmm15 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 - pand %xmm8,%xmm14 - movaps %xmm11,%xmm12 -.byte 102,15,56,222,240 - pxor %xmm14,%xmm15 - movdqa %xmm9,%xmm14 -.byte 102,15,56,222,248 - movups -48(%rcx),%xmm0 - - paddd %xmm9,%xmm9 -.byte 102,15,56,222,209 - pxor %xmm15,%xmm11 - psrad $31,%xmm14 -.byte 102,15,56,222,217 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - movdqa %xmm13,48(%rsp) - pxor %xmm14,%xmm15 
-.byte 102,15,56,222,241 - movaps %xmm12,%xmm13 - movdqa %xmm9,%xmm14 -.byte 102,15,56,222,249 - movups -32(%rcx),%xmm1 - - paddd %xmm9,%xmm9 -.byte 102,15,56,222,208 - pxor %xmm15,%xmm12 - psrad $31,%xmm14 -.byte 102,15,56,222,216 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm14 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 - pxor %xmm14,%xmm15 - movaps %xmm13,%xmm14 -.byte 102,15,56,222,248 - - movdqa %xmm9,%xmm0 - paddd %xmm9,%xmm9 -.byte 102,15,56,222,209 - pxor %xmm15,%xmm13 - psrad $31,%xmm0 -.byte 102,15,56,222,217 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm0 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - pxor %xmm0,%xmm15 - movups (%rbp),%xmm0 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups 16(%rbp),%xmm1 - - pxor %xmm15,%xmm14 -.byte 102,15,56,223,84,36,0 - psrad $31,%xmm9 - paddq %xmm15,%xmm15 -.byte 102,15,56,223,92,36,16 -.byte 102,15,56,223,100,36,32 - pand %xmm8,%xmm9 - movq %r10,%rax -.byte 102,15,56,223,108,36,48 -.byte 102,15,56,223,116,36,64 -.byte 102,15,56,223,124,36,80 - pxor %xmm9,%xmm15 - - leaq 96(%rsi),%rsi - movups %xmm2,-96(%rsi) - movups %xmm3,-80(%rsi) - movups %xmm4,-64(%rsi) - movups %xmm5,-48(%rsi) - movups %xmm6,-32(%rsi) - movups %xmm7,-16(%rsi) - subq $96,%rdx - jnc L$xts_dec_grandloop - - movl $16+96,%eax - subl %r10d,%eax - movq %rbp,%rcx - shrl $4,%eax - -L$xts_dec_short: - - movl %eax,%r10d - pxor %xmm0,%xmm10 - pxor %xmm0,%xmm11 - addq $96,%rdx - jz L$xts_dec_done - - pxor %xmm0,%xmm12 - cmpq $0x20,%rdx - jb L$xts_dec_one - pxor %xmm0,%xmm13 - je L$xts_dec_two - - pxor %xmm0,%xmm14 - cmpq $0x40,%rdx - jb L$xts_dec_three - je L$xts_dec_four - - movdqu (%rdi),%xmm2 - movdqu 16(%rdi),%xmm3 - movdqu 32(%rdi),%xmm4 - pxor %xmm10,%xmm2 - movdqu 48(%rdi),%xmm5 - pxor %xmm11,%xmm3 - movdqu 64(%rdi),%xmm6 - leaq 80(%rdi),%rdi - pxor %xmm12,%xmm4 - pxor %xmm13,%xmm5 - pxor %xmm14,%xmm6 - - call _aesni_decrypt6 - - xorps %xmm10,%xmm2 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - movdqu %xmm2,(%rsi) - xorps %xmm13,%xmm5 - movdqu %xmm3,16(%rsi) - xorps %xmm14,%xmm6 - movdqu %xmm4,32(%rsi) - pxor %xmm14,%xmm14 - movdqu %xmm5,48(%rsi) - pcmpgtd %xmm15,%xmm14 - movdqu %xmm6,64(%rsi) - leaq 80(%rsi),%rsi - pshufd $0x13,%xmm14,%xmm11 - andq $15,%r9 - jz L$xts_dec_ret - - movdqa %xmm15,%xmm10 - paddq %xmm15,%xmm15 - pand %xmm8,%xmm11 - pxor %xmm15,%xmm11 - jmp L$xts_dec_done2 - -.p2align 4 -L$xts_dec_one: - movups (%rdi),%xmm2 - leaq 16(%rdi),%rdi - xorps %xmm10,%xmm2 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -L$oop_dec1_12: -.byte 102,15,56,222,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz L$oop_dec1_12 -.byte 102,15,56,223,209 - xorps %xmm10,%xmm2 - movdqa %xmm11,%xmm10 - movups %xmm2,(%rsi) - movdqa %xmm12,%xmm11 - leaq 16(%rsi),%rsi - jmp L$xts_dec_done - -.p2align 4 -L$xts_dec_two: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - leaq 32(%rdi),%rdi - xorps %xmm10,%xmm2 - xorps %xmm11,%xmm3 - - call _aesni_decrypt2 - - xorps %xmm10,%xmm2 - movdqa %xmm12,%xmm10 - xorps %xmm11,%xmm3 - movdqa %xmm13,%xmm11 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - leaq 32(%rsi),%rsi - jmp L$xts_dec_done - -.p2align 4 -L$xts_dec_three: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - movups 32(%rdi),%xmm4 - leaq 48(%rdi),%rdi - xorps %xmm10,%xmm2 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - - call _aesni_decrypt3 - - xorps %xmm10,%xmm2 - movdqa %xmm13,%xmm10 - xorps %xmm11,%xmm3 - movdqa %xmm14,%xmm11 - xorps %xmm12,%xmm4 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups 
%xmm4,32(%rsi) - leaq 48(%rsi),%rsi - jmp L$xts_dec_done - -.p2align 4 -L$xts_dec_four: - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - movups 32(%rdi),%xmm4 - xorps %xmm10,%xmm2 - movups 48(%rdi),%xmm5 - leaq 64(%rdi),%rdi - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - xorps %xmm13,%xmm5 - - call _aesni_decrypt4 - - pxor %xmm10,%xmm2 - movdqa %xmm14,%xmm10 - pxor %xmm11,%xmm3 - movdqa %xmm15,%xmm11 - pxor %xmm12,%xmm4 - movdqu %xmm2,(%rsi) - pxor %xmm13,%xmm5 - movdqu %xmm3,16(%rsi) - movdqu %xmm4,32(%rsi) - movdqu %xmm5,48(%rsi) - leaq 64(%rsi),%rsi - jmp L$xts_dec_done - -.p2align 4 -L$xts_dec_done: - andq $15,%r9 - jz L$xts_dec_ret -L$xts_dec_done2: - movq %r9,%rdx - movq %rbp,%rcx - movl %r10d,%eax - - movups (%rdi),%xmm2 - xorps %xmm11,%xmm2 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -L$oop_dec1_13: -.byte 102,15,56,222,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz L$oop_dec1_13 -.byte 102,15,56,223,209 - xorps %xmm11,%xmm2 - movups %xmm2,(%rsi) - -L$xts_dec_steal: - movzbl 16(%rdi),%eax - movzbl (%rsi),%ecx - leaq 1(%rdi),%rdi - movb %al,(%rsi) - movb %cl,16(%rsi) - leaq 1(%rsi),%rsi - subq $1,%rdx - jnz L$xts_dec_steal - - subq %r9,%rsi - movq %rbp,%rcx - movl %r10d,%eax - - movups (%rsi),%xmm2 - xorps %xmm10,%xmm2 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -L$oop_dec1_14: -.byte 102,15,56,222,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz L$oop_dec1_14 -.byte 102,15,56,223,209 - xorps %xmm10,%xmm2 - movups %xmm2,(%rsi) - -L$xts_dec_ret: - xorps %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - movaps %xmm0,0(%rsp) - pxor %xmm8,%xmm8 - movaps %xmm0,16(%rsp) - pxor %xmm9,%xmm9 - movaps %xmm0,32(%rsp) - pxor %xmm10,%xmm10 - movaps %xmm0,48(%rsp) - pxor %xmm11,%xmm11 - movaps %xmm0,64(%rsp) - pxor %xmm12,%xmm12 - movaps %xmm0,80(%rsp) - pxor %xmm13,%xmm13 - movaps %xmm0,96(%rsp) - pxor %xmm14,%xmm14 - pxor %xmm15,%xmm15 - movq -8(%r11),%rbp - leaq (%r11),%rsp -L$xts_dec_epilogue: - .byte 0xf3,0xc3 - -.globl _aesni_ocb_encrypt -.private_extern _aesni_ocb_encrypt - -.p2align 5 -_aesni_ocb_encrypt: - leaq (%rsp),%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - movq 8(%rax),%rbx - movq 8+8(%rax),%rbp - - movl 240(%rcx),%r10d - movq %rcx,%r11 - shll $4,%r10d - movups (%rcx),%xmm9 - movups 16(%rcx,%r10,1),%xmm1 - - movdqu (%r9),%xmm15 - pxor %xmm1,%xmm9 - pxor %xmm1,%xmm15 - - movl $16+32,%eax - leaq 32(%r11,%r10,1),%rcx - movups 16(%r11),%xmm1 - subq %r10,%rax - movq %rax,%r10 - - movdqu (%rbx),%xmm10 - movdqu (%rbp),%xmm8 - - testq $1,%r8 - jnz L$ocb_enc_odd - - bsfq %r8,%r12 - addq $1,%r8 - shlq $4,%r12 - movdqu (%rbx,%r12,1),%xmm7 - movdqu (%rdi),%xmm2 - leaq 16(%rdi),%rdi - - call __ocb_encrypt1 - - movdqa %xmm7,%xmm15 - movups %xmm2,(%rsi) - leaq 16(%rsi),%rsi - subq $1,%rdx - jz L$ocb_enc_done - -L$ocb_enc_odd: - leaq 1(%r8),%r12 - leaq 3(%r8),%r13 - leaq 5(%r8),%r14 - leaq 6(%r8),%r8 - bsfq %r12,%r12 - bsfq %r13,%r13 - bsfq %r14,%r14 - shlq $4,%r12 - shlq $4,%r13 - shlq $4,%r14 - - subq $6,%rdx - jc L$ocb_enc_short - jmp L$ocb_enc_grandloop - -.p2align 5 -L$ocb_enc_grandloop: - movdqu 0(%rdi),%xmm2 - movdqu 16(%rdi),%xmm3 - movdqu 32(%rdi),%xmm4 - movdqu 48(%rdi),%xmm5 - movdqu 64(%rdi),%xmm6 - movdqu 80(%rdi),%xmm7 - leaq 96(%rdi),%rdi - - call __ocb_encrypt6 - - movups %xmm2,0(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups 
%xmm5,48(%rsi) - movups %xmm6,64(%rsi) - movups %xmm7,80(%rsi) - leaq 96(%rsi),%rsi - subq $6,%rdx - jnc L$ocb_enc_grandloop - -L$ocb_enc_short: - addq $6,%rdx - jz L$ocb_enc_done - - movdqu 0(%rdi),%xmm2 - cmpq $2,%rdx - jb L$ocb_enc_one - movdqu 16(%rdi),%xmm3 - je L$ocb_enc_two - - movdqu 32(%rdi),%xmm4 - cmpq $4,%rdx - jb L$ocb_enc_three - movdqu 48(%rdi),%xmm5 - je L$ocb_enc_four - - movdqu 64(%rdi),%xmm6 - pxor %xmm7,%xmm7 - - call __ocb_encrypt6 - - movdqa %xmm14,%xmm15 - movups %xmm2,0(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - movups %xmm6,64(%rsi) - - jmp L$ocb_enc_done - -.p2align 4 -L$ocb_enc_one: - movdqa %xmm10,%xmm7 - - call __ocb_encrypt1 - - movdqa %xmm7,%xmm15 - movups %xmm2,0(%rsi) - jmp L$ocb_enc_done - -.p2align 4 -L$ocb_enc_two: - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - - call __ocb_encrypt4 - - movdqa %xmm11,%xmm15 - movups %xmm2,0(%rsi) - movups %xmm3,16(%rsi) - - jmp L$ocb_enc_done - -.p2align 4 -L$ocb_enc_three: - pxor %xmm5,%xmm5 - - call __ocb_encrypt4 - - movdqa %xmm12,%xmm15 - movups %xmm2,0(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - - jmp L$ocb_enc_done - -.p2align 4 -L$ocb_enc_four: - call __ocb_encrypt4 - - movdqa %xmm13,%xmm15 - movups %xmm2,0(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - -L$ocb_enc_done: - pxor %xmm0,%xmm15 - movdqu %xmm8,(%rbp) - movdqu %xmm15,(%r9) - - xorps %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - pxor %xmm8,%xmm8 - pxor %xmm9,%xmm9 - pxor %xmm10,%xmm10 - pxor %xmm11,%xmm11 - pxor %xmm12,%xmm12 - pxor %xmm13,%xmm13 - pxor %xmm14,%xmm14 - pxor %xmm15,%xmm15 - leaq 40(%rsp),%rax - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -L$ocb_enc_epilogue: - .byte 0xf3,0xc3 - - - -.p2align 5 -__ocb_encrypt6: - pxor %xmm9,%xmm15 - movdqu (%rbx,%r12,1),%xmm11 - movdqa %xmm10,%xmm12 - movdqu (%rbx,%r13,1),%xmm13 - movdqa %xmm10,%xmm14 - pxor %xmm15,%xmm10 - movdqu (%rbx,%r14,1),%xmm15 - pxor %xmm10,%xmm11 - pxor %xmm2,%xmm8 - pxor %xmm10,%xmm2 - pxor %xmm11,%xmm12 - pxor %xmm3,%xmm8 - pxor %xmm11,%xmm3 - pxor %xmm12,%xmm13 - pxor %xmm4,%xmm8 - pxor %xmm12,%xmm4 - pxor %xmm13,%xmm14 - pxor %xmm5,%xmm8 - pxor %xmm13,%xmm5 - pxor %xmm14,%xmm15 - pxor %xmm6,%xmm8 - pxor %xmm14,%xmm6 - pxor %xmm7,%xmm8 - pxor %xmm15,%xmm7 - movups 32(%r11),%xmm0 - - leaq 1(%r8),%r12 - leaq 3(%r8),%r13 - leaq 5(%r8),%r14 - addq $6,%r8 - pxor %xmm9,%xmm10 - bsfq %r12,%r12 - bsfq %r13,%r13 - bsfq %r14,%r14 - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - pxor %xmm9,%xmm11 - pxor %xmm9,%xmm12 -.byte 102,15,56,220,241 - pxor %xmm9,%xmm13 - pxor %xmm9,%xmm14 -.byte 102,15,56,220,249 - movups 48(%r11),%xmm1 - pxor %xmm9,%xmm15 - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 - movups 64(%r11),%xmm0 - shlq $4,%r12 - shlq $4,%r13 - jmp L$ocb_enc_loop6 - -.p2align 5 -L$ocb_enc_loop6: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 - movups -16(%rcx,%rax,1),%xmm0 - jnz 
L$ocb_enc_loop6 - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups 16(%r11),%xmm1 - shlq $4,%r14 - -.byte 102,65,15,56,221,210 - movdqu (%rbx),%xmm10 - movq %r10,%rax -.byte 102,65,15,56,221,219 -.byte 102,65,15,56,221,228 -.byte 102,65,15,56,221,237 -.byte 102,65,15,56,221,246 -.byte 102,65,15,56,221,255 - .byte 0xf3,0xc3 - - - -.p2align 5 -__ocb_encrypt4: - pxor %xmm9,%xmm15 - movdqu (%rbx,%r12,1),%xmm11 - movdqa %xmm10,%xmm12 - movdqu (%rbx,%r13,1),%xmm13 - pxor %xmm15,%xmm10 - pxor %xmm10,%xmm11 - pxor %xmm2,%xmm8 - pxor %xmm10,%xmm2 - pxor %xmm11,%xmm12 - pxor %xmm3,%xmm8 - pxor %xmm11,%xmm3 - pxor %xmm12,%xmm13 - pxor %xmm4,%xmm8 - pxor %xmm12,%xmm4 - pxor %xmm5,%xmm8 - pxor %xmm13,%xmm5 - movups 32(%r11),%xmm0 - - pxor %xmm9,%xmm10 - pxor %xmm9,%xmm11 - pxor %xmm9,%xmm12 - pxor %xmm9,%xmm13 - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movups 48(%r11),%xmm1 - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - movups 64(%r11),%xmm0 - jmp L$ocb_enc_loop4 - -.p2align 5 -L$ocb_enc_loop4: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - movups -16(%rcx,%rax,1),%xmm0 - jnz L$ocb_enc_loop4 - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movups 16(%r11),%xmm1 - movq %r10,%rax - -.byte 102,65,15,56,221,210 -.byte 102,65,15,56,221,219 -.byte 102,65,15,56,221,228 -.byte 102,65,15,56,221,237 - .byte 0xf3,0xc3 - - - -.p2align 5 -__ocb_encrypt1: - pxor %xmm15,%xmm7 - pxor %xmm9,%xmm7 - pxor %xmm2,%xmm8 - pxor %xmm7,%xmm2 - movups 32(%r11),%xmm0 - -.byte 102,15,56,220,209 - movups 48(%r11),%xmm1 - pxor %xmm9,%xmm7 - -.byte 102,15,56,220,208 - movups 64(%r11),%xmm0 - jmp L$ocb_enc_loop1 - -.p2align 5 -L$ocb_enc_loop1: -.byte 102,15,56,220,209 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,220,208 - movups -16(%rcx,%rax,1),%xmm0 - jnz L$ocb_enc_loop1 - -.byte 102,15,56,220,209 - movups 16(%r11),%xmm1 - movq %r10,%rax - -.byte 102,15,56,221,215 - .byte 0xf3,0xc3 - - -.globl _aesni_ocb_decrypt -.private_extern _aesni_ocb_decrypt - -.p2align 5 -_aesni_ocb_decrypt: - leaq (%rsp),%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - movq 8(%rax),%rbx - movq 8+8(%rax),%rbp - - movl 240(%rcx),%r10d - movq %rcx,%r11 - shll $4,%r10d - movups (%rcx),%xmm9 - movups 16(%rcx,%r10,1),%xmm1 - - movdqu (%r9),%xmm15 - pxor %xmm1,%xmm9 - pxor %xmm1,%xmm15 - - movl $16+32,%eax - leaq 32(%r11,%r10,1),%rcx - movups 16(%r11),%xmm1 - subq %r10,%rax - movq %rax,%r10 - - movdqu (%rbx),%xmm10 - movdqu (%rbp),%xmm8 - - testq $1,%r8 - jnz L$ocb_dec_odd - - bsfq %r8,%r12 - addq $1,%r8 - shlq $4,%r12 - movdqu (%rbx,%r12,1),%xmm7 - movdqu (%rdi),%xmm2 - leaq 16(%rdi),%rdi - - call __ocb_decrypt1 - - movdqa %xmm7,%xmm15 - movups %xmm2,(%rsi) - xorps %xmm2,%xmm8 - leaq 16(%rsi),%rsi - subq $1,%rdx - jz L$ocb_dec_done - -L$ocb_dec_odd: - leaq 1(%r8),%r12 - leaq 3(%r8),%r13 - leaq 5(%r8),%r14 - leaq 6(%r8),%r8 - bsfq %r12,%r12 - bsfq %r13,%r13 - bsfq %r14,%r14 - shlq $4,%r12 - shlq $4,%r13 - shlq $4,%r14 - - subq $6,%rdx - jc L$ocb_dec_short - jmp L$ocb_dec_grandloop - -.p2align 5 -L$ocb_dec_grandloop: - movdqu 0(%rdi),%xmm2 - movdqu 
16(%rdi),%xmm3 - movdqu 32(%rdi),%xmm4 - movdqu 48(%rdi),%xmm5 - movdqu 64(%rdi),%xmm6 - movdqu 80(%rdi),%xmm7 - leaq 96(%rdi),%rdi - - call __ocb_decrypt6 - - movups %xmm2,0(%rsi) - pxor %xmm2,%xmm8 - movups %xmm3,16(%rsi) - pxor %xmm3,%xmm8 - movups %xmm4,32(%rsi) - pxor %xmm4,%xmm8 - movups %xmm5,48(%rsi) - pxor %xmm5,%xmm8 - movups %xmm6,64(%rsi) - pxor %xmm6,%xmm8 - movups %xmm7,80(%rsi) - pxor %xmm7,%xmm8 - leaq 96(%rsi),%rsi - subq $6,%rdx - jnc L$ocb_dec_grandloop - -L$ocb_dec_short: - addq $6,%rdx - jz L$ocb_dec_done - - movdqu 0(%rdi),%xmm2 - cmpq $2,%rdx - jb L$ocb_dec_one - movdqu 16(%rdi),%xmm3 - je L$ocb_dec_two - - movdqu 32(%rdi),%xmm4 - cmpq $4,%rdx - jb L$ocb_dec_three - movdqu 48(%rdi),%xmm5 - je L$ocb_dec_four - - movdqu 64(%rdi),%xmm6 - pxor %xmm7,%xmm7 - - call __ocb_decrypt6 - - movdqa %xmm14,%xmm15 - movups %xmm2,0(%rsi) - pxor %xmm2,%xmm8 - movups %xmm3,16(%rsi) - pxor %xmm3,%xmm8 - movups %xmm4,32(%rsi) - pxor %xmm4,%xmm8 - movups %xmm5,48(%rsi) - pxor %xmm5,%xmm8 - movups %xmm6,64(%rsi) - pxor %xmm6,%xmm8 - - jmp L$ocb_dec_done - -.p2align 4 -L$ocb_dec_one: - movdqa %xmm10,%xmm7 - - call __ocb_decrypt1 - - movdqa %xmm7,%xmm15 - movups %xmm2,0(%rsi) - xorps %xmm2,%xmm8 - jmp L$ocb_dec_done - -.p2align 4 -L$ocb_dec_two: - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - - call __ocb_decrypt4 - - movdqa %xmm11,%xmm15 - movups %xmm2,0(%rsi) - xorps %xmm2,%xmm8 - movups %xmm3,16(%rsi) - xorps %xmm3,%xmm8 - - jmp L$ocb_dec_done - -.p2align 4 -L$ocb_dec_three: - pxor %xmm5,%xmm5 - - call __ocb_decrypt4 - - movdqa %xmm12,%xmm15 - movups %xmm2,0(%rsi) - xorps %xmm2,%xmm8 - movups %xmm3,16(%rsi) - xorps %xmm3,%xmm8 - movups %xmm4,32(%rsi) - xorps %xmm4,%xmm8 - - jmp L$ocb_dec_done - -.p2align 4 -L$ocb_dec_four: - call __ocb_decrypt4 - - movdqa %xmm13,%xmm15 - movups %xmm2,0(%rsi) - pxor %xmm2,%xmm8 - movups %xmm3,16(%rsi) - pxor %xmm3,%xmm8 - movups %xmm4,32(%rsi) - pxor %xmm4,%xmm8 - movups %xmm5,48(%rsi) - pxor %xmm5,%xmm8 - -L$ocb_dec_done: - pxor %xmm0,%xmm15 - movdqu %xmm8,(%rbp) - movdqu %xmm15,(%r9) - - xorps %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - pxor %xmm8,%xmm8 - pxor %xmm9,%xmm9 - pxor %xmm10,%xmm10 - pxor %xmm11,%xmm11 - pxor %xmm12,%xmm12 - pxor %xmm13,%xmm13 - pxor %xmm14,%xmm14 - pxor %xmm15,%xmm15 - leaq 40(%rsp),%rax - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -L$ocb_dec_epilogue: - .byte 0xf3,0xc3 - - - -.p2align 5 -__ocb_decrypt6: - pxor %xmm9,%xmm15 - movdqu (%rbx,%r12,1),%xmm11 - movdqa %xmm10,%xmm12 - movdqu (%rbx,%r13,1),%xmm13 - movdqa %xmm10,%xmm14 - pxor %xmm15,%xmm10 - movdqu (%rbx,%r14,1),%xmm15 - pxor %xmm10,%xmm11 - pxor %xmm10,%xmm2 - pxor %xmm11,%xmm12 - pxor %xmm11,%xmm3 - pxor %xmm12,%xmm13 - pxor %xmm12,%xmm4 - pxor %xmm13,%xmm14 - pxor %xmm13,%xmm5 - pxor %xmm14,%xmm15 - pxor %xmm14,%xmm6 - pxor %xmm15,%xmm7 - movups 32(%r11),%xmm0 - - leaq 1(%r8),%r12 - leaq 3(%r8),%r13 - leaq 5(%r8),%r14 - addq $6,%r8 - pxor %xmm9,%xmm10 - bsfq %r12,%r12 - bsfq %r13,%r13 - bsfq %r14,%r14 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - pxor %xmm9,%xmm11 - pxor %xmm9,%xmm12 -.byte 102,15,56,222,241 - pxor %xmm9,%xmm13 - pxor %xmm9,%xmm14 -.byte 102,15,56,222,249 - movups 48(%r11),%xmm1 - pxor %xmm9,%xmm15 - -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 
102,15,56,222,240 -.byte 102,15,56,222,248 - movups 64(%r11),%xmm0 - shlq $4,%r12 - shlq $4,%r13 - jmp L$ocb_dec_loop6 - -.p2align 5 -L$ocb_dec_loop6: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 - movups -16(%rcx,%rax,1),%xmm0 - jnz L$ocb_dec_loop6 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups 16(%r11),%xmm1 - shlq $4,%r14 - -.byte 102,65,15,56,223,210 - movdqu (%rbx),%xmm10 - movq %r10,%rax -.byte 102,65,15,56,223,219 -.byte 102,65,15,56,223,228 -.byte 102,65,15,56,223,237 -.byte 102,65,15,56,223,246 -.byte 102,65,15,56,223,255 - .byte 0xf3,0xc3 - - - -.p2align 5 -__ocb_decrypt4: - pxor %xmm9,%xmm15 - movdqu (%rbx,%r12,1),%xmm11 - movdqa %xmm10,%xmm12 - movdqu (%rbx,%r13,1),%xmm13 - pxor %xmm15,%xmm10 - pxor %xmm10,%xmm11 - pxor %xmm10,%xmm2 - pxor %xmm11,%xmm12 - pxor %xmm11,%xmm3 - pxor %xmm12,%xmm13 - pxor %xmm12,%xmm4 - pxor %xmm13,%xmm5 - movups 32(%r11),%xmm0 - - pxor %xmm9,%xmm10 - pxor %xmm9,%xmm11 - pxor %xmm9,%xmm12 - pxor %xmm9,%xmm13 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - movups 48(%r11),%xmm1 - -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 - movups 64(%r11),%xmm0 - jmp L$ocb_dec_loop4 - -.p2align 5 -L$ocb_dec_loop4: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 - movups -16(%rcx,%rax,1),%xmm0 - jnz L$ocb_dec_loop4 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - movups 16(%r11),%xmm1 - movq %r10,%rax - -.byte 102,65,15,56,223,210 -.byte 102,65,15,56,223,219 -.byte 102,65,15,56,223,228 -.byte 102,65,15,56,223,237 - .byte 0xf3,0xc3 - - - -.p2align 5 -__ocb_decrypt1: - pxor %xmm15,%xmm7 - pxor %xmm9,%xmm7 - pxor %xmm7,%xmm2 - movups 32(%r11),%xmm0 - -.byte 102,15,56,222,209 - movups 48(%r11),%xmm1 - pxor %xmm9,%xmm7 - -.byte 102,15,56,222,208 - movups 64(%r11),%xmm0 - jmp L$ocb_dec_loop1 - -.p2align 5 -L$ocb_dec_loop1: -.byte 102,15,56,222,209 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax - -.byte 102,15,56,222,208 - movups -16(%rcx,%rax,1),%xmm0 - jnz L$ocb_dec_loop1 - -.byte 102,15,56,222,209 - movups 16(%r11),%xmm1 - movq %r10,%rax - -.byte 102,15,56,223,215 - .byte 0xf3,0xc3 - -.globl _aesni_cbc_encrypt -.private_extern _aesni_cbc_encrypt - -.p2align 4 -_aesni_cbc_encrypt: testq %rdx,%rdx jz L$cbc_ret @@ -3383,12 +1490,12 @@ L$cbc_enc_loop: xorps %xmm0,%xmm3 leaq 32(%rcx),%rcx xorps %xmm3,%xmm2 -L$oop_enc1_15: +L$oop_enc1_6: .byte 102,15,56,220,209 decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz L$oop_enc1_15 + jnz L$oop_enc1_6 .byte 102,15,56,221,209 movl %r10d,%eax movq %r11,%rcx @@ -3434,12 +1541,12 @@ L$cbc_decrypt: movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 -L$oop_dec1_16: +L$oop_dec1_7: .byte 102,15,56,222,209 decl %r10d movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz L$oop_dec1_16 + jnz L$oop_dec1_7 .byte 102,15,56,223,209 pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 @@ -3452,7 
+1559,9 @@ L$oop_dec1_16: .p2align 4 L$cbc_decrypt_bulk: leaq (%rsp),%r11 + pushq %rbp + subq $16,%rsp andq $-16,%rsp movq %rcx,%rbp @@ -3850,12 +1959,12 @@ L$cbc_dec_one: movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 -L$oop_dec1_17: +L$oop_dec1_8: .byte 102,15,56,222,209 decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz L$oop_dec1_17 + jnz L$oop_dec1_8 .byte 102,15,56,223,209 xorps %xmm10,%xmm2 movaps %xmm11,%xmm10 @@ -3937,16 +2046,21 @@ L$cbc_dec_ret: xorps %xmm0,%xmm0 pxor %xmm1,%xmm1 movq -8(%r11),%rbp + leaq (%r11),%rsp + L$cbc_ret: .byte 0xf3,0xc3 -.globl _aesni_set_decrypt_key -.private_extern _aesni_set_decrypt_key + +.globl _aes_hw_set_decrypt_key +.private_extern _aes_hw_set_decrypt_key .p2align 4 -_aesni_set_decrypt_key: +_aes_hw_set_decrypt_key: + .byte 0x48,0x83,0xEC,0x08 + call __aesni_set_encrypt_key shll $4,%esi testl %eax,%eax @@ -3979,16 +2093,23 @@ L$dec_key_inverse: pxor %xmm0,%xmm0 L$dec_key_ret: addq $8,%rsp + .byte 0xf3,0xc3 + L$SEH_end_set_decrypt_key: -.globl _aesni_set_encrypt_key -.private_extern _aesni_set_encrypt_key +.globl _aes_hw_set_encrypt_key +.private_extern _aes_hw_set_encrypt_key .p2align 4 -_aesni_set_encrypt_key: +_aes_hw_set_encrypt_key: __aesni_set_encrypt_key: + +#ifdef BORINGSSL_DISPATCH_TEST + movb $1,_BORINGSSL_function_hit+3(%rip) +#endif .byte 0x48,0x83,0xEC,0x08 + movq $-1,%rax testq %rdi,%rdi jz L$enc_key_ret @@ -4282,7 +2403,9 @@ L$enc_key_ret: pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 addq $8,%rsp + .byte 0xf3,0xc3 + L$SEH_end_set_encrypt_key: .p2align 4 diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S deleted file mode 100644 index 195abd3b5c..0000000000 --- a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S +++ /dev/null @@ -1,2500 +0,0 @@ -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -.text - - - - - -.p2align 6 -_bsaes_encrypt8: - leaq L$BS0(%rip),%r11 - - movdqa (%rax),%xmm8 - leaq 16(%rax),%rax - movdqa 80(%r11),%xmm7 - pxor %xmm8,%xmm15 - pxor %xmm8,%xmm0 - pxor %xmm8,%xmm1 - pxor %xmm8,%xmm2 -.byte 102,68,15,56,0,255 -.byte 102,15,56,0,199 - pxor %xmm8,%xmm3 - pxor %xmm8,%xmm4 -.byte 102,15,56,0,207 -.byte 102,15,56,0,215 - pxor %xmm8,%xmm5 - pxor %xmm8,%xmm6 -.byte 102,15,56,0,223 -.byte 102,15,56,0,231 -.byte 102,15,56,0,239 -.byte 102,15,56,0,247 -_bsaes_encrypt8_bitslice: - movdqa 0(%r11),%xmm7 - movdqa 16(%r11),%xmm8 - movdqa %xmm5,%xmm9 - psrlq $1,%xmm5 - movdqa %xmm3,%xmm10 - psrlq $1,%xmm3 - pxor %xmm6,%xmm5 - pxor %xmm4,%xmm3 - pand %xmm7,%xmm5 - pand %xmm7,%xmm3 - pxor %xmm5,%xmm6 - psllq $1,%xmm5 - pxor %xmm3,%xmm4 - psllq $1,%xmm3 - pxor %xmm9,%xmm5 - pxor %xmm10,%xmm3 - movdqa %xmm1,%xmm9 - psrlq $1,%xmm1 - movdqa %xmm15,%xmm10 - psrlq $1,%xmm15 - pxor %xmm2,%xmm1 - pxor %xmm0,%xmm15 - pand %xmm7,%xmm1 - pand %xmm7,%xmm15 - pxor %xmm1,%xmm2 - psllq $1,%xmm1 - pxor %xmm15,%xmm0 - psllq $1,%xmm15 - pxor %xmm9,%xmm1 - pxor %xmm10,%xmm15 - movdqa 32(%r11),%xmm7 - movdqa %xmm4,%xmm9 - psrlq $2,%xmm4 - movdqa %xmm3,%xmm10 - psrlq $2,%xmm3 - pxor %xmm6,%xmm4 - pxor %xmm5,%xmm3 - pand %xmm8,%xmm4 - pand %xmm8,%xmm3 - pxor %xmm4,%xmm6 - psllq $2,%xmm4 - pxor %xmm3,%xmm5 - psllq $2,%xmm3 - pxor %xmm9,%xmm4 - pxor %xmm10,%xmm3 - movdqa %xmm0,%xmm9 - psrlq $2,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $2,%xmm15 - pxor %xmm2,%xmm0 - pxor %xmm1,%xmm15 - pand %xmm8,%xmm0 - pand %xmm8,%xmm15 - pxor %xmm0,%xmm2 - psllq $2,%xmm0 - pxor %xmm15,%xmm1 - psllq $2,%xmm15 - pxor %xmm9,%xmm0 
- pxor %xmm10,%xmm15 - movdqa %xmm2,%xmm9 - psrlq $4,%xmm2 - movdqa %xmm1,%xmm10 - psrlq $4,%xmm1 - pxor %xmm6,%xmm2 - pxor %xmm5,%xmm1 - pand %xmm7,%xmm2 - pand %xmm7,%xmm1 - pxor %xmm2,%xmm6 - psllq $4,%xmm2 - pxor %xmm1,%xmm5 - psllq $4,%xmm1 - pxor %xmm9,%xmm2 - pxor %xmm10,%xmm1 - movdqa %xmm0,%xmm9 - psrlq $4,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $4,%xmm15 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pand %xmm7,%xmm0 - pand %xmm7,%xmm15 - pxor %xmm0,%xmm4 - psllq $4,%xmm0 - pxor %xmm15,%xmm3 - psllq $4,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - decl %r10d - jmp L$enc_sbox -.p2align 4 -L$enc_loop: - pxor 0(%rax),%xmm15 - pxor 16(%rax),%xmm0 - pxor 32(%rax),%xmm1 - pxor 48(%rax),%xmm2 -.byte 102,68,15,56,0,255 -.byte 102,15,56,0,199 - pxor 64(%rax),%xmm3 - pxor 80(%rax),%xmm4 -.byte 102,15,56,0,207 -.byte 102,15,56,0,215 - pxor 96(%rax),%xmm5 - pxor 112(%rax),%xmm6 -.byte 102,15,56,0,223 -.byte 102,15,56,0,231 -.byte 102,15,56,0,239 -.byte 102,15,56,0,247 - leaq 128(%rax),%rax -L$enc_sbox: - pxor %xmm5,%xmm4 - pxor %xmm0,%xmm1 - pxor %xmm15,%xmm2 - pxor %xmm1,%xmm5 - pxor %xmm15,%xmm4 - - pxor %xmm2,%xmm5 - pxor %xmm6,%xmm2 - pxor %xmm4,%xmm6 - pxor %xmm3,%xmm2 - pxor %xmm4,%xmm3 - pxor %xmm0,%xmm2 - - pxor %xmm6,%xmm1 - pxor %xmm4,%xmm0 - movdqa %xmm6,%xmm10 - movdqa %xmm0,%xmm9 - movdqa %xmm4,%xmm8 - movdqa %xmm1,%xmm12 - movdqa %xmm5,%xmm11 - - pxor %xmm3,%xmm10 - pxor %xmm1,%xmm9 - pxor %xmm2,%xmm8 - movdqa %xmm10,%xmm13 - pxor %xmm3,%xmm12 - movdqa %xmm9,%xmm7 - pxor %xmm15,%xmm11 - movdqa %xmm10,%xmm14 - - por %xmm8,%xmm9 - por %xmm11,%xmm10 - pxor %xmm7,%xmm14 - pand %xmm11,%xmm13 - pxor %xmm8,%xmm11 - pand %xmm8,%xmm7 - pand %xmm11,%xmm14 - movdqa %xmm2,%xmm11 - pxor %xmm15,%xmm11 - pand %xmm11,%xmm12 - pxor %xmm12,%xmm10 - pxor %xmm12,%xmm9 - movdqa %xmm6,%xmm12 - movdqa %xmm4,%xmm11 - pxor %xmm0,%xmm12 - pxor %xmm5,%xmm11 - movdqa %xmm12,%xmm8 - pand %xmm11,%xmm12 - por %xmm11,%xmm8 - pxor %xmm12,%xmm7 - pxor %xmm14,%xmm10 - pxor %xmm13,%xmm9 - pxor %xmm14,%xmm8 - movdqa %xmm1,%xmm11 - pxor %xmm13,%xmm7 - movdqa %xmm3,%xmm12 - pxor %xmm13,%xmm8 - movdqa %xmm0,%xmm13 - pand %xmm2,%xmm11 - movdqa %xmm6,%xmm14 - pand %xmm15,%xmm12 - pand %xmm4,%xmm13 - por %xmm5,%xmm14 - pxor %xmm11,%xmm10 - pxor %xmm12,%xmm9 - pxor %xmm13,%xmm8 - pxor %xmm14,%xmm7 - - - - - - movdqa %xmm10,%xmm11 - pand %xmm8,%xmm10 - pxor %xmm9,%xmm11 - - movdqa %xmm7,%xmm13 - movdqa %xmm11,%xmm14 - pxor %xmm10,%xmm13 - pand %xmm13,%xmm14 - - movdqa %xmm8,%xmm12 - pxor %xmm9,%xmm14 - pxor %xmm7,%xmm12 - - pxor %xmm9,%xmm10 - - pand %xmm10,%xmm12 - - movdqa %xmm13,%xmm9 - pxor %xmm7,%xmm12 - - pxor %xmm12,%xmm9 - pxor %xmm12,%xmm8 - - pand %xmm7,%xmm9 - - pxor %xmm9,%xmm13 - pxor %xmm9,%xmm8 - - pand %xmm14,%xmm13 - - pxor %xmm11,%xmm13 - movdqa %xmm5,%xmm11 - movdqa %xmm4,%xmm7 - movdqa %xmm14,%xmm9 - pxor %xmm13,%xmm9 - pand %xmm5,%xmm9 - pxor %xmm4,%xmm5 - pand %xmm14,%xmm4 - pand %xmm13,%xmm5 - pxor %xmm4,%xmm5 - pxor %xmm9,%xmm4 - pxor %xmm15,%xmm11 - pxor %xmm2,%xmm7 - pxor %xmm12,%xmm14 - pxor %xmm8,%xmm13 - movdqa %xmm14,%xmm10 - movdqa %xmm12,%xmm9 - pxor %xmm13,%xmm10 - pxor %xmm8,%xmm9 - pand %xmm11,%xmm10 - pand %xmm15,%xmm9 - pxor %xmm7,%xmm11 - pxor %xmm2,%xmm15 - pand %xmm14,%xmm7 - pand %xmm12,%xmm2 - pand %xmm13,%xmm11 - pand %xmm8,%xmm15 - pxor %xmm11,%xmm7 - pxor %xmm2,%xmm15 - pxor %xmm10,%xmm11 - pxor %xmm9,%xmm2 - pxor %xmm11,%xmm5 - pxor %xmm11,%xmm15 - pxor %xmm7,%xmm4 - pxor %xmm7,%xmm2 - - movdqa %xmm6,%xmm11 - movdqa %xmm0,%xmm7 - pxor %xmm3,%xmm11 - pxor %xmm1,%xmm7 - movdqa 
%xmm14,%xmm10 - movdqa %xmm12,%xmm9 - pxor %xmm13,%xmm10 - pxor %xmm8,%xmm9 - pand %xmm11,%xmm10 - pand %xmm3,%xmm9 - pxor %xmm7,%xmm11 - pxor %xmm1,%xmm3 - pand %xmm14,%xmm7 - pand %xmm12,%xmm1 - pand %xmm13,%xmm11 - pand %xmm8,%xmm3 - pxor %xmm11,%xmm7 - pxor %xmm1,%xmm3 - pxor %xmm10,%xmm11 - pxor %xmm9,%xmm1 - pxor %xmm12,%xmm14 - pxor %xmm8,%xmm13 - movdqa %xmm14,%xmm10 - pxor %xmm13,%xmm10 - pand %xmm6,%xmm10 - pxor %xmm0,%xmm6 - pand %xmm14,%xmm0 - pand %xmm13,%xmm6 - pxor %xmm0,%xmm6 - pxor %xmm10,%xmm0 - pxor %xmm11,%xmm6 - pxor %xmm11,%xmm3 - pxor %xmm7,%xmm0 - pxor %xmm7,%xmm1 - pxor %xmm15,%xmm6 - pxor %xmm5,%xmm0 - pxor %xmm6,%xmm3 - pxor %xmm15,%xmm5 - pxor %xmm0,%xmm15 - - pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 - pxor %xmm2,%xmm1 - pxor %xmm4,%xmm2 - pxor %xmm4,%xmm3 - - pxor %xmm2,%xmm5 - decl %r10d - jl L$enc_done - pshufd $0x93,%xmm15,%xmm7 - pshufd $0x93,%xmm0,%xmm8 - pxor %xmm7,%xmm15 - pshufd $0x93,%xmm3,%xmm9 - pxor %xmm8,%xmm0 - pshufd $0x93,%xmm5,%xmm10 - pxor %xmm9,%xmm3 - pshufd $0x93,%xmm2,%xmm11 - pxor %xmm10,%xmm5 - pshufd $0x93,%xmm6,%xmm12 - pxor %xmm11,%xmm2 - pshufd $0x93,%xmm1,%xmm13 - pxor %xmm12,%xmm6 - pshufd $0x93,%xmm4,%xmm14 - pxor %xmm13,%xmm1 - pxor %xmm14,%xmm4 - - pxor %xmm15,%xmm8 - pxor %xmm4,%xmm7 - pxor %xmm4,%xmm8 - pshufd $0x4E,%xmm15,%xmm15 - pxor %xmm0,%xmm9 - pshufd $0x4E,%xmm0,%xmm0 - pxor %xmm2,%xmm12 - pxor %xmm7,%xmm15 - pxor %xmm6,%xmm13 - pxor %xmm8,%xmm0 - pxor %xmm5,%xmm11 - pshufd $0x4E,%xmm2,%xmm7 - pxor %xmm1,%xmm14 - pshufd $0x4E,%xmm6,%xmm8 - pxor %xmm3,%xmm10 - pshufd $0x4E,%xmm5,%xmm2 - pxor %xmm4,%xmm10 - pshufd $0x4E,%xmm4,%xmm6 - pxor %xmm4,%xmm11 - pshufd $0x4E,%xmm1,%xmm5 - pxor %xmm11,%xmm7 - pshufd $0x4E,%xmm3,%xmm1 - pxor %xmm12,%xmm8 - pxor %xmm10,%xmm2 - pxor %xmm14,%xmm6 - pxor %xmm13,%xmm5 - movdqa %xmm7,%xmm3 - pxor %xmm9,%xmm1 - movdqa %xmm8,%xmm4 - movdqa 48(%r11),%xmm7 - jnz L$enc_loop - movdqa 64(%r11),%xmm7 - jmp L$enc_loop -.p2align 4 -L$enc_done: - movdqa 0(%r11),%xmm7 - movdqa 16(%r11),%xmm8 - movdqa %xmm1,%xmm9 - psrlq $1,%xmm1 - movdqa %xmm2,%xmm10 - psrlq $1,%xmm2 - pxor %xmm4,%xmm1 - pxor %xmm6,%xmm2 - pand %xmm7,%xmm1 - pand %xmm7,%xmm2 - pxor %xmm1,%xmm4 - psllq $1,%xmm1 - pxor %xmm2,%xmm6 - psllq $1,%xmm2 - pxor %xmm9,%xmm1 - pxor %xmm10,%xmm2 - movdqa %xmm3,%xmm9 - psrlq $1,%xmm3 - movdqa %xmm15,%xmm10 - psrlq $1,%xmm15 - pxor %xmm5,%xmm3 - pxor %xmm0,%xmm15 - pand %xmm7,%xmm3 - pand %xmm7,%xmm15 - pxor %xmm3,%xmm5 - psllq $1,%xmm3 - pxor %xmm15,%xmm0 - psllq $1,%xmm15 - pxor %xmm9,%xmm3 - pxor %xmm10,%xmm15 - movdqa 32(%r11),%xmm7 - movdqa %xmm6,%xmm9 - psrlq $2,%xmm6 - movdqa %xmm2,%xmm10 - psrlq $2,%xmm2 - pxor %xmm4,%xmm6 - pxor %xmm1,%xmm2 - pand %xmm8,%xmm6 - pand %xmm8,%xmm2 - pxor %xmm6,%xmm4 - psllq $2,%xmm6 - pxor %xmm2,%xmm1 - psllq $2,%xmm2 - pxor %xmm9,%xmm6 - pxor %xmm10,%xmm2 - movdqa %xmm0,%xmm9 - psrlq $2,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $2,%xmm15 - pxor %xmm5,%xmm0 - pxor %xmm3,%xmm15 - pand %xmm8,%xmm0 - pand %xmm8,%xmm15 - pxor %xmm0,%xmm5 - psllq $2,%xmm0 - pxor %xmm15,%xmm3 - psllq $2,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa %xmm5,%xmm9 - psrlq $4,%xmm5 - movdqa %xmm3,%xmm10 - psrlq $4,%xmm3 - pxor %xmm4,%xmm5 - pxor %xmm1,%xmm3 - pand %xmm7,%xmm5 - pand %xmm7,%xmm3 - pxor %xmm5,%xmm4 - psllq $4,%xmm5 - pxor %xmm3,%xmm1 - psllq $4,%xmm3 - pxor %xmm9,%xmm5 - pxor %xmm10,%xmm3 - movdqa %xmm0,%xmm9 - psrlq $4,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $4,%xmm15 - pxor %xmm6,%xmm0 - pxor %xmm2,%xmm15 - pand %xmm7,%xmm0 - pand %xmm7,%xmm15 - pxor %xmm0,%xmm6 - psllq 
$4,%xmm0 - pxor %xmm15,%xmm2 - psllq $4,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa (%rax),%xmm7 - pxor %xmm7,%xmm3 - pxor %xmm7,%xmm5 - pxor %xmm7,%xmm2 - pxor %xmm7,%xmm6 - pxor %xmm7,%xmm1 - pxor %xmm7,%xmm4 - pxor %xmm7,%xmm15 - pxor %xmm7,%xmm0 - .byte 0xf3,0xc3 - - - -.p2align 6 -_bsaes_decrypt8: - leaq L$BS0(%rip),%r11 - - movdqa (%rax),%xmm8 - leaq 16(%rax),%rax - movdqa -48(%r11),%xmm7 - pxor %xmm8,%xmm15 - pxor %xmm8,%xmm0 - pxor %xmm8,%xmm1 - pxor %xmm8,%xmm2 -.byte 102,68,15,56,0,255 -.byte 102,15,56,0,199 - pxor %xmm8,%xmm3 - pxor %xmm8,%xmm4 -.byte 102,15,56,0,207 -.byte 102,15,56,0,215 - pxor %xmm8,%xmm5 - pxor %xmm8,%xmm6 -.byte 102,15,56,0,223 -.byte 102,15,56,0,231 -.byte 102,15,56,0,239 -.byte 102,15,56,0,247 - movdqa 0(%r11),%xmm7 - movdqa 16(%r11),%xmm8 - movdqa %xmm5,%xmm9 - psrlq $1,%xmm5 - movdqa %xmm3,%xmm10 - psrlq $1,%xmm3 - pxor %xmm6,%xmm5 - pxor %xmm4,%xmm3 - pand %xmm7,%xmm5 - pand %xmm7,%xmm3 - pxor %xmm5,%xmm6 - psllq $1,%xmm5 - pxor %xmm3,%xmm4 - psllq $1,%xmm3 - pxor %xmm9,%xmm5 - pxor %xmm10,%xmm3 - movdqa %xmm1,%xmm9 - psrlq $1,%xmm1 - movdqa %xmm15,%xmm10 - psrlq $1,%xmm15 - pxor %xmm2,%xmm1 - pxor %xmm0,%xmm15 - pand %xmm7,%xmm1 - pand %xmm7,%xmm15 - pxor %xmm1,%xmm2 - psllq $1,%xmm1 - pxor %xmm15,%xmm0 - psllq $1,%xmm15 - pxor %xmm9,%xmm1 - pxor %xmm10,%xmm15 - movdqa 32(%r11),%xmm7 - movdqa %xmm4,%xmm9 - psrlq $2,%xmm4 - movdqa %xmm3,%xmm10 - psrlq $2,%xmm3 - pxor %xmm6,%xmm4 - pxor %xmm5,%xmm3 - pand %xmm8,%xmm4 - pand %xmm8,%xmm3 - pxor %xmm4,%xmm6 - psllq $2,%xmm4 - pxor %xmm3,%xmm5 - psllq $2,%xmm3 - pxor %xmm9,%xmm4 - pxor %xmm10,%xmm3 - movdqa %xmm0,%xmm9 - psrlq $2,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $2,%xmm15 - pxor %xmm2,%xmm0 - pxor %xmm1,%xmm15 - pand %xmm8,%xmm0 - pand %xmm8,%xmm15 - pxor %xmm0,%xmm2 - psllq $2,%xmm0 - pxor %xmm15,%xmm1 - psllq $2,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa %xmm2,%xmm9 - psrlq $4,%xmm2 - movdqa %xmm1,%xmm10 - psrlq $4,%xmm1 - pxor %xmm6,%xmm2 - pxor %xmm5,%xmm1 - pand %xmm7,%xmm2 - pand %xmm7,%xmm1 - pxor %xmm2,%xmm6 - psllq $4,%xmm2 - pxor %xmm1,%xmm5 - psllq $4,%xmm1 - pxor %xmm9,%xmm2 - pxor %xmm10,%xmm1 - movdqa %xmm0,%xmm9 - psrlq $4,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $4,%xmm15 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pand %xmm7,%xmm0 - pand %xmm7,%xmm15 - pxor %xmm0,%xmm4 - psllq $4,%xmm0 - pxor %xmm15,%xmm3 - psllq $4,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - decl %r10d - jmp L$dec_sbox -.p2align 4 -L$dec_loop: - pxor 0(%rax),%xmm15 - pxor 16(%rax),%xmm0 - pxor 32(%rax),%xmm1 - pxor 48(%rax),%xmm2 -.byte 102,68,15,56,0,255 -.byte 102,15,56,0,199 - pxor 64(%rax),%xmm3 - pxor 80(%rax),%xmm4 -.byte 102,15,56,0,207 -.byte 102,15,56,0,215 - pxor 96(%rax),%xmm5 - pxor 112(%rax),%xmm6 -.byte 102,15,56,0,223 -.byte 102,15,56,0,231 -.byte 102,15,56,0,239 -.byte 102,15,56,0,247 - leaq 128(%rax),%rax -L$dec_sbox: - pxor %xmm3,%xmm2 - - pxor %xmm6,%xmm3 - pxor %xmm6,%xmm1 - pxor %xmm3,%xmm5 - pxor %xmm5,%xmm6 - pxor %xmm6,%xmm0 - - pxor %xmm0,%xmm15 - pxor %xmm4,%xmm1 - pxor %xmm15,%xmm2 - pxor %xmm15,%xmm4 - pxor %xmm2,%xmm0 - movdqa %xmm2,%xmm10 - movdqa %xmm6,%xmm9 - movdqa %xmm0,%xmm8 - movdqa %xmm3,%xmm12 - movdqa %xmm4,%xmm11 - - pxor %xmm15,%xmm10 - pxor %xmm3,%xmm9 - pxor %xmm5,%xmm8 - movdqa %xmm10,%xmm13 - pxor %xmm15,%xmm12 - movdqa %xmm9,%xmm7 - pxor %xmm1,%xmm11 - movdqa %xmm10,%xmm14 - - por %xmm8,%xmm9 - por %xmm11,%xmm10 - pxor %xmm7,%xmm14 - pand %xmm11,%xmm13 - pxor %xmm8,%xmm11 - pand %xmm8,%xmm7 - pand %xmm11,%xmm14 - movdqa %xmm5,%xmm11 - pxor 
%xmm1,%xmm11 - pand %xmm11,%xmm12 - pxor %xmm12,%xmm10 - pxor %xmm12,%xmm9 - movdqa %xmm2,%xmm12 - movdqa %xmm0,%xmm11 - pxor %xmm6,%xmm12 - pxor %xmm4,%xmm11 - movdqa %xmm12,%xmm8 - pand %xmm11,%xmm12 - por %xmm11,%xmm8 - pxor %xmm12,%xmm7 - pxor %xmm14,%xmm10 - pxor %xmm13,%xmm9 - pxor %xmm14,%xmm8 - movdqa %xmm3,%xmm11 - pxor %xmm13,%xmm7 - movdqa %xmm15,%xmm12 - pxor %xmm13,%xmm8 - movdqa %xmm6,%xmm13 - pand %xmm5,%xmm11 - movdqa %xmm2,%xmm14 - pand %xmm1,%xmm12 - pand %xmm0,%xmm13 - por %xmm4,%xmm14 - pxor %xmm11,%xmm10 - pxor %xmm12,%xmm9 - pxor %xmm13,%xmm8 - pxor %xmm14,%xmm7 - - - - - - movdqa %xmm10,%xmm11 - pand %xmm8,%xmm10 - pxor %xmm9,%xmm11 - - movdqa %xmm7,%xmm13 - movdqa %xmm11,%xmm14 - pxor %xmm10,%xmm13 - pand %xmm13,%xmm14 - - movdqa %xmm8,%xmm12 - pxor %xmm9,%xmm14 - pxor %xmm7,%xmm12 - - pxor %xmm9,%xmm10 - - pand %xmm10,%xmm12 - - movdqa %xmm13,%xmm9 - pxor %xmm7,%xmm12 - - pxor %xmm12,%xmm9 - pxor %xmm12,%xmm8 - - pand %xmm7,%xmm9 - - pxor %xmm9,%xmm13 - pxor %xmm9,%xmm8 - - pand %xmm14,%xmm13 - - pxor %xmm11,%xmm13 - movdqa %xmm4,%xmm11 - movdqa %xmm0,%xmm7 - movdqa %xmm14,%xmm9 - pxor %xmm13,%xmm9 - pand %xmm4,%xmm9 - pxor %xmm0,%xmm4 - pand %xmm14,%xmm0 - pand %xmm13,%xmm4 - pxor %xmm0,%xmm4 - pxor %xmm9,%xmm0 - pxor %xmm1,%xmm11 - pxor %xmm5,%xmm7 - pxor %xmm12,%xmm14 - pxor %xmm8,%xmm13 - movdqa %xmm14,%xmm10 - movdqa %xmm12,%xmm9 - pxor %xmm13,%xmm10 - pxor %xmm8,%xmm9 - pand %xmm11,%xmm10 - pand %xmm1,%xmm9 - pxor %xmm7,%xmm11 - pxor %xmm5,%xmm1 - pand %xmm14,%xmm7 - pand %xmm12,%xmm5 - pand %xmm13,%xmm11 - pand %xmm8,%xmm1 - pxor %xmm11,%xmm7 - pxor %xmm5,%xmm1 - pxor %xmm10,%xmm11 - pxor %xmm9,%xmm5 - pxor %xmm11,%xmm4 - pxor %xmm11,%xmm1 - pxor %xmm7,%xmm0 - pxor %xmm7,%xmm5 - - movdqa %xmm2,%xmm11 - movdqa %xmm6,%xmm7 - pxor %xmm15,%xmm11 - pxor %xmm3,%xmm7 - movdqa %xmm14,%xmm10 - movdqa %xmm12,%xmm9 - pxor %xmm13,%xmm10 - pxor %xmm8,%xmm9 - pand %xmm11,%xmm10 - pand %xmm15,%xmm9 - pxor %xmm7,%xmm11 - pxor %xmm3,%xmm15 - pand %xmm14,%xmm7 - pand %xmm12,%xmm3 - pand %xmm13,%xmm11 - pand %xmm8,%xmm15 - pxor %xmm11,%xmm7 - pxor %xmm3,%xmm15 - pxor %xmm10,%xmm11 - pxor %xmm9,%xmm3 - pxor %xmm12,%xmm14 - pxor %xmm8,%xmm13 - movdqa %xmm14,%xmm10 - pxor %xmm13,%xmm10 - pand %xmm2,%xmm10 - pxor %xmm6,%xmm2 - pand %xmm14,%xmm6 - pand %xmm13,%xmm2 - pxor %xmm6,%xmm2 - pxor %xmm10,%xmm6 - pxor %xmm11,%xmm2 - pxor %xmm11,%xmm15 - pxor %xmm7,%xmm6 - pxor %xmm7,%xmm3 - pxor %xmm6,%xmm0 - pxor %xmm4,%xmm5 - - pxor %xmm0,%xmm3 - pxor %xmm6,%xmm1 - pxor %xmm6,%xmm4 - pxor %xmm1,%xmm3 - pxor %xmm15,%xmm6 - pxor %xmm4,%xmm3 - pxor %xmm5,%xmm2 - pxor %xmm0,%xmm5 - pxor %xmm3,%xmm2 - - pxor %xmm15,%xmm3 - pxor %xmm2,%xmm6 - decl %r10d - jl L$dec_done - - pshufd $0x4E,%xmm15,%xmm7 - pshufd $0x4E,%xmm2,%xmm13 - pxor %xmm15,%xmm7 - pshufd $0x4E,%xmm4,%xmm14 - pxor %xmm2,%xmm13 - pshufd $0x4E,%xmm0,%xmm8 - pxor %xmm4,%xmm14 - pshufd $0x4E,%xmm5,%xmm9 - pxor %xmm0,%xmm8 - pshufd $0x4E,%xmm3,%xmm10 - pxor %xmm5,%xmm9 - pxor %xmm13,%xmm15 - pxor %xmm13,%xmm0 - pshufd $0x4E,%xmm1,%xmm11 - pxor %xmm3,%xmm10 - pxor %xmm7,%xmm5 - pxor %xmm8,%xmm3 - pshufd $0x4E,%xmm6,%xmm12 - pxor %xmm1,%xmm11 - pxor %xmm14,%xmm0 - pxor %xmm9,%xmm1 - pxor %xmm6,%xmm12 - - pxor %xmm14,%xmm5 - pxor %xmm13,%xmm3 - pxor %xmm13,%xmm1 - pxor %xmm10,%xmm6 - pxor %xmm11,%xmm2 - pxor %xmm14,%xmm1 - pxor %xmm14,%xmm6 - pxor %xmm12,%xmm4 - pshufd $0x93,%xmm15,%xmm7 - pshufd $0x93,%xmm0,%xmm8 - pxor %xmm7,%xmm15 - pshufd $0x93,%xmm5,%xmm9 - pxor %xmm8,%xmm0 - pshufd $0x93,%xmm3,%xmm10 - pxor %xmm9,%xmm5 - pshufd 
$0x93,%xmm1,%xmm11 - pxor %xmm10,%xmm3 - pshufd $0x93,%xmm6,%xmm12 - pxor %xmm11,%xmm1 - pshufd $0x93,%xmm2,%xmm13 - pxor %xmm12,%xmm6 - pshufd $0x93,%xmm4,%xmm14 - pxor %xmm13,%xmm2 - pxor %xmm14,%xmm4 - - pxor %xmm15,%xmm8 - pxor %xmm4,%xmm7 - pxor %xmm4,%xmm8 - pshufd $0x4E,%xmm15,%xmm15 - pxor %xmm0,%xmm9 - pshufd $0x4E,%xmm0,%xmm0 - pxor %xmm1,%xmm12 - pxor %xmm7,%xmm15 - pxor %xmm6,%xmm13 - pxor %xmm8,%xmm0 - pxor %xmm3,%xmm11 - pshufd $0x4E,%xmm1,%xmm7 - pxor %xmm2,%xmm14 - pshufd $0x4E,%xmm6,%xmm8 - pxor %xmm5,%xmm10 - pshufd $0x4E,%xmm3,%xmm1 - pxor %xmm4,%xmm10 - pshufd $0x4E,%xmm4,%xmm6 - pxor %xmm4,%xmm11 - pshufd $0x4E,%xmm2,%xmm3 - pxor %xmm11,%xmm7 - pshufd $0x4E,%xmm5,%xmm2 - pxor %xmm12,%xmm8 - pxor %xmm1,%xmm10 - pxor %xmm14,%xmm6 - pxor %xmm3,%xmm13 - movdqa %xmm7,%xmm3 - pxor %xmm9,%xmm2 - movdqa %xmm13,%xmm5 - movdqa %xmm8,%xmm4 - movdqa %xmm2,%xmm1 - movdqa %xmm10,%xmm2 - movdqa -16(%r11),%xmm7 - jnz L$dec_loop - movdqa -32(%r11),%xmm7 - jmp L$dec_loop -.p2align 4 -L$dec_done: - movdqa 0(%r11),%xmm7 - movdqa 16(%r11),%xmm8 - movdqa %xmm2,%xmm9 - psrlq $1,%xmm2 - movdqa %xmm1,%xmm10 - psrlq $1,%xmm1 - pxor %xmm4,%xmm2 - pxor %xmm6,%xmm1 - pand %xmm7,%xmm2 - pand %xmm7,%xmm1 - pxor %xmm2,%xmm4 - psllq $1,%xmm2 - pxor %xmm1,%xmm6 - psllq $1,%xmm1 - pxor %xmm9,%xmm2 - pxor %xmm10,%xmm1 - movdqa %xmm5,%xmm9 - psrlq $1,%xmm5 - movdqa %xmm15,%xmm10 - psrlq $1,%xmm15 - pxor %xmm3,%xmm5 - pxor %xmm0,%xmm15 - pand %xmm7,%xmm5 - pand %xmm7,%xmm15 - pxor %xmm5,%xmm3 - psllq $1,%xmm5 - pxor %xmm15,%xmm0 - psllq $1,%xmm15 - pxor %xmm9,%xmm5 - pxor %xmm10,%xmm15 - movdqa 32(%r11),%xmm7 - movdqa %xmm6,%xmm9 - psrlq $2,%xmm6 - movdqa %xmm1,%xmm10 - psrlq $2,%xmm1 - pxor %xmm4,%xmm6 - pxor %xmm2,%xmm1 - pand %xmm8,%xmm6 - pand %xmm8,%xmm1 - pxor %xmm6,%xmm4 - psllq $2,%xmm6 - pxor %xmm1,%xmm2 - psllq $2,%xmm1 - pxor %xmm9,%xmm6 - pxor %xmm10,%xmm1 - movdqa %xmm0,%xmm9 - psrlq $2,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $2,%xmm15 - pxor %xmm3,%xmm0 - pxor %xmm5,%xmm15 - pand %xmm8,%xmm0 - pand %xmm8,%xmm15 - pxor %xmm0,%xmm3 - psllq $2,%xmm0 - pxor %xmm15,%xmm5 - psllq $2,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa %xmm3,%xmm9 - psrlq $4,%xmm3 - movdqa %xmm5,%xmm10 - psrlq $4,%xmm5 - pxor %xmm4,%xmm3 - pxor %xmm2,%xmm5 - pand %xmm7,%xmm3 - pand %xmm7,%xmm5 - pxor %xmm3,%xmm4 - psllq $4,%xmm3 - pxor %xmm5,%xmm2 - psllq $4,%xmm5 - pxor %xmm9,%xmm3 - pxor %xmm10,%xmm5 - movdqa %xmm0,%xmm9 - psrlq $4,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $4,%xmm15 - pxor %xmm6,%xmm0 - pxor %xmm1,%xmm15 - pand %xmm7,%xmm0 - pand %xmm7,%xmm15 - pxor %xmm0,%xmm6 - psllq $4,%xmm0 - pxor %xmm15,%xmm1 - psllq $4,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa (%rax),%xmm7 - pxor %xmm7,%xmm5 - pxor %xmm7,%xmm3 - pxor %xmm7,%xmm1 - pxor %xmm7,%xmm6 - pxor %xmm7,%xmm2 - pxor %xmm7,%xmm4 - pxor %xmm7,%xmm15 - pxor %xmm7,%xmm0 - .byte 0xf3,0xc3 - - -.p2align 4 -_bsaes_key_convert: - leaq L$masks(%rip),%r11 - movdqu (%rcx),%xmm7 - leaq 16(%rcx),%rcx - movdqa 0(%r11),%xmm0 - movdqa 16(%r11),%xmm1 - movdqa 32(%r11),%xmm2 - movdqa 48(%r11),%xmm3 - movdqa 64(%r11),%xmm4 - pcmpeqd %xmm5,%xmm5 - - movdqu (%rcx),%xmm6 - movdqa %xmm7,(%rax) - leaq 16(%rax),%rax - decl %r10d - jmp L$key_loop -.p2align 4 -L$key_loop: -.byte 102,15,56,0,244 - - movdqa %xmm0,%xmm8 - movdqa %xmm1,%xmm9 - - pand %xmm6,%xmm8 - pand %xmm6,%xmm9 - movdqa %xmm2,%xmm10 - pcmpeqb %xmm0,%xmm8 - psllq $4,%xmm0 - movdqa %xmm3,%xmm11 - pcmpeqb %xmm1,%xmm9 - psllq $4,%xmm1 - - pand %xmm6,%xmm10 - pand %xmm6,%xmm11 - movdqa %xmm0,%xmm12 - pcmpeqb 
%xmm2,%xmm10 - psllq $4,%xmm2 - movdqa %xmm1,%xmm13 - pcmpeqb %xmm3,%xmm11 - psllq $4,%xmm3 - - movdqa %xmm2,%xmm14 - movdqa %xmm3,%xmm15 - pxor %xmm5,%xmm8 - pxor %xmm5,%xmm9 - - pand %xmm6,%xmm12 - pand %xmm6,%xmm13 - movdqa %xmm8,0(%rax) - pcmpeqb %xmm0,%xmm12 - psrlq $4,%xmm0 - movdqa %xmm9,16(%rax) - pcmpeqb %xmm1,%xmm13 - psrlq $4,%xmm1 - leaq 16(%rcx),%rcx - - pand %xmm6,%xmm14 - pand %xmm6,%xmm15 - movdqa %xmm10,32(%rax) - pcmpeqb %xmm2,%xmm14 - psrlq $4,%xmm2 - movdqa %xmm11,48(%rax) - pcmpeqb %xmm3,%xmm15 - psrlq $4,%xmm3 - movdqu (%rcx),%xmm6 - - pxor %xmm5,%xmm13 - pxor %xmm5,%xmm14 - movdqa %xmm12,64(%rax) - movdqa %xmm13,80(%rax) - movdqa %xmm14,96(%rax) - movdqa %xmm15,112(%rax) - leaq 128(%rax),%rax - decl %r10d - jnz L$key_loop - - movdqa 80(%r11),%xmm7 - - .byte 0xf3,0xc3 - - -.globl _bsaes_cbc_encrypt -.private_extern _bsaes_cbc_encrypt - -.p2align 4 -_bsaes_cbc_encrypt: - cmpl $0,%r9d - jne _asm_AES_cbc_encrypt - cmpq $128,%rdx - jb _asm_AES_cbc_encrypt - - movq %rsp,%rax -L$cbc_dec_prologue: - pushq %rbp - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - leaq -72(%rsp),%rsp - movq %rsp,%rbp - movl 240(%rcx),%eax - movq %rdi,%r12 - movq %rsi,%r13 - movq %rdx,%r14 - movq %rcx,%r15 - movq %r8,%rbx - shrq $4,%r14 - - movl %eax,%edx - shlq $7,%rax - subq $96,%rax - subq %rax,%rsp - - movq %rsp,%rax - movq %r15,%rcx - movl %edx,%r10d - call _bsaes_key_convert - pxor (%rsp),%xmm7 - movdqa %xmm6,(%rax) - movdqa %xmm7,(%rsp) - - movdqu (%rbx),%xmm14 - subq $8,%r14 -L$cbc_dec_loop: - movdqu 0(%r12),%xmm15 - movdqu 16(%r12),%xmm0 - movdqu 32(%r12),%xmm1 - movdqu 48(%r12),%xmm2 - movdqu 64(%r12),%xmm3 - movdqu 80(%r12),%xmm4 - movq %rsp,%rax - movdqu 96(%r12),%xmm5 - movl %edx,%r10d - movdqu 112(%r12),%xmm6 - movdqa %xmm14,32(%rbp) - - call _bsaes_decrypt8 - - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm5 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm3 - movdqu 64(%r12),%xmm11 - pxor %xmm10,%xmm1 - movdqu 80(%r12),%xmm12 - pxor %xmm11,%xmm6 - movdqu 96(%r12),%xmm13 - pxor %xmm12,%xmm2 - movdqu 112(%r12),%xmm14 - pxor %xmm13,%xmm4 - movdqu %xmm15,0(%r13) - leaq 128(%r12),%r12 - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - movdqu %xmm1,64(%r13) - movdqu %xmm6,80(%r13) - movdqu %xmm2,96(%r13) - movdqu %xmm4,112(%r13) - leaq 128(%r13),%r13 - subq $8,%r14 - jnc L$cbc_dec_loop - - addq $8,%r14 - jz L$cbc_dec_done - - movdqu 0(%r12),%xmm15 - movq %rsp,%rax - movl %edx,%r10d - cmpq $2,%r14 - jb L$cbc_dec_one - movdqu 16(%r12),%xmm0 - je L$cbc_dec_two - movdqu 32(%r12),%xmm1 - cmpq $4,%r14 - jb L$cbc_dec_three - movdqu 48(%r12),%xmm2 - je L$cbc_dec_four - movdqu 64(%r12),%xmm3 - cmpq $6,%r14 - jb L$cbc_dec_five - movdqu 80(%r12),%xmm4 - je L$cbc_dec_six - movdqu 96(%r12),%xmm5 - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm5 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm3 - movdqu 64(%r12),%xmm11 - pxor %xmm10,%xmm1 - movdqu 80(%r12),%xmm12 - pxor %xmm11,%xmm6 - movdqu 96(%r12),%xmm14 - pxor %xmm12,%xmm2 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - movdqu %xmm1,64(%r13) - movdqu %xmm6,80(%r13) - movdqu %xmm2,96(%r13) - jmp L$cbc_dec_done -.p2align 4 -L$cbc_dec_six: - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor 
%xmm7,%xmm0 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm5 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm3 - movdqu 64(%r12),%xmm11 - pxor %xmm10,%xmm1 - movdqu 80(%r12),%xmm14 - pxor %xmm11,%xmm6 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - movdqu %xmm1,64(%r13) - movdqu %xmm6,80(%r13) - jmp L$cbc_dec_done -.p2align 4 -L$cbc_dec_five: - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm5 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm3 - movdqu 64(%r12),%xmm14 - pxor %xmm10,%xmm1 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - movdqu %xmm1,64(%r13) - jmp L$cbc_dec_done -.p2align 4 -L$cbc_dec_four: - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm5 - movdqu 48(%r12),%xmm14 - pxor %xmm9,%xmm3 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - jmp L$cbc_dec_done -.p2align 4 -L$cbc_dec_three: - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm14 - pxor %xmm8,%xmm5 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - jmp L$cbc_dec_done -.p2align 4 -L$cbc_dec_two: - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm14 - pxor %xmm7,%xmm0 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - jmp L$cbc_dec_done -.p2align 4 -L$cbc_dec_one: - leaq (%r12),%rdi - leaq 32(%rbp),%rsi - leaq (%r15),%rdx - call _asm_AES_decrypt - pxor 32(%rbp),%xmm14 - movdqu %xmm14,(%r13) - movdqa %xmm15,%xmm14 - -L$cbc_dec_done: - movdqu %xmm14,(%rbx) - leaq (%rsp),%rax - pxor %xmm0,%xmm0 -L$cbc_dec_bzero: - movdqa %xmm0,0(%rax) - movdqa %xmm0,16(%rax) - leaq 32(%rax),%rax - cmpq %rax,%rbp - ja L$cbc_dec_bzero - - leaq 120(%rbp),%rax - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbx - movq -8(%rax),%rbp - leaq (%rax),%rsp -L$cbc_dec_epilogue: - .byte 0xf3,0xc3 - - -.globl _bsaes_ctr32_encrypt_blocks -.private_extern _bsaes_ctr32_encrypt_blocks - -.p2align 4 -_bsaes_ctr32_encrypt_blocks: - movq %rsp,%rax -L$ctr_enc_prologue: - pushq %rbp - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - leaq -72(%rsp),%rsp - movq %rsp,%rbp - movdqu (%r8),%xmm0 - movl 240(%rcx),%eax - movq %rdi,%r12 - movq %rsi,%r13 - movq %rdx,%r14 - movq %rcx,%r15 - movdqa %xmm0,32(%rbp) - cmpq $8,%rdx - jb L$ctr_enc_short - - movl %eax,%ebx - shlq $7,%rax - subq $96,%rax - subq %rax,%rsp - - movq %rsp,%rax - movq %r15,%rcx - movl %ebx,%r10d - call _bsaes_key_convert - pxor %xmm6,%xmm7 - movdqa %xmm7,(%rax) - - movdqa (%rsp),%xmm8 - leaq L$ADD1(%rip),%r11 - movdqa 32(%rbp),%xmm15 - movdqa -32(%r11),%xmm7 -.byte 102,68,15,56,0,199 -.byte 102,68,15,56,0,255 - movdqa %xmm8,(%rsp) - jmp L$ctr_enc_loop -.p2align 4 -L$ctr_enc_loop: - movdqa %xmm15,32(%rbp) - movdqa %xmm15,%xmm0 - movdqa %xmm15,%xmm1 - paddd 0(%r11),%xmm0 - movdqa %xmm15,%xmm2 - paddd 16(%r11),%xmm1 - movdqa %xmm15,%xmm3 - paddd 32(%r11),%xmm2 - movdqa %xmm15,%xmm4 - paddd 48(%r11),%xmm3 - movdqa %xmm15,%xmm5 - paddd 64(%r11),%xmm4 - movdqa %xmm15,%xmm6 - paddd 80(%r11),%xmm5 - paddd 96(%r11),%xmm6 - - - - movdqa (%rsp),%xmm8 - leaq 16(%rsp),%rax - 
movdqa -16(%r11),%xmm7 - pxor %xmm8,%xmm15 - pxor %xmm8,%xmm0 - pxor %xmm8,%xmm1 - pxor %xmm8,%xmm2 -.byte 102,68,15,56,0,255 -.byte 102,15,56,0,199 - pxor %xmm8,%xmm3 - pxor %xmm8,%xmm4 -.byte 102,15,56,0,207 -.byte 102,15,56,0,215 - pxor %xmm8,%xmm5 - pxor %xmm8,%xmm6 -.byte 102,15,56,0,223 -.byte 102,15,56,0,231 -.byte 102,15,56,0,239 -.byte 102,15,56,0,247 - leaq L$BS0(%rip),%r11 - movl %ebx,%r10d - - call _bsaes_encrypt8_bitslice - - subq $8,%r14 - jc L$ctr_enc_loop_done - - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - movdqu 32(%r12),%xmm9 - movdqu 48(%r12),%xmm10 - movdqu 64(%r12),%xmm11 - movdqu 80(%r12),%xmm12 - movdqu 96(%r12),%xmm13 - movdqu 112(%r12),%xmm14 - leaq 128(%r12),%r12 - pxor %xmm15,%xmm7 - movdqa 32(%rbp),%xmm15 - pxor %xmm8,%xmm0 - movdqu %xmm7,0(%r13) - pxor %xmm9,%xmm3 - movdqu %xmm0,16(%r13) - pxor %xmm10,%xmm5 - movdqu %xmm3,32(%r13) - pxor %xmm11,%xmm2 - movdqu %xmm5,48(%r13) - pxor %xmm12,%xmm6 - movdqu %xmm2,64(%r13) - pxor %xmm13,%xmm1 - movdqu %xmm6,80(%r13) - pxor %xmm14,%xmm4 - movdqu %xmm1,96(%r13) - leaq L$ADD1(%rip),%r11 - movdqu %xmm4,112(%r13) - leaq 128(%r13),%r13 - paddd 112(%r11),%xmm15 - jnz L$ctr_enc_loop - - jmp L$ctr_enc_done -.p2align 4 -L$ctr_enc_loop_done: - addq $8,%r14 - movdqu 0(%r12),%xmm7 - pxor %xmm7,%xmm15 - movdqu %xmm15,0(%r13) - cmpq $2,%r14 - jb L$ctr_enc_done - movdqu 16(%r12),%xmm8 - pxor %xmm8,%xmm0 - movdqu %xmm0,16(%r13) - je L$ctr_enc_done - movdqu 32(%r12),%xmm9 - pxor %xmm9,%xmm3 - movdqu %xmm3,32(%r13) - cmpq $4,%r14 - jb L$ctr_enc_done - movdqu 48(%r12),%xmm10 - pxor %xmm10,%xmm5 - movdqu %xmm5,48(%r13) - je L$ctr_enc_done - movdqu 64(%r12),%xmm11 - pxor %xmm11,%xmm2 - movdqu %xmm2,64(%r13) - cmpq $6,%r14 - jb L$ctr_enc_done - movdqu 80(%r12),%xmm12 - pxor %xmm12,%xmm6 - movdqu %xmm6,80(%r13) - je L$ctr_enc_done - movdqu 96(%r12),%xmm13 - pxor %xmm13,%xmm1 - movdqu %xmm1,96(%r13) - jmp L$ctr_enc_done - -.p2align 4 -L$ctr_enc_short: - leaq 32(%rbp),%rdi - leaq 48(%rbp),%rsi - leaq (%r15),%rdx - call _asm_AES_encrypt - movdqu (%r12),%xmm0 - leaq 16(%r12),%r12 - movl 44(%rbp),%eax - bswapl %eax - pxor 48(%rbp),%xmm0 - incl %eax - movdqu %xmm0,(%r13) - bswapl %eax - leaq 16(%r13),%r13 - movl %eax,44(%rsp) - decq %r14 - jnz L$ctr_enc_short - -L$ctr_enc_done: - leaq (%rsp),%rax - pxor %xmm0,%xmm0 -L$ctr_enc_bzero: - movdqa %xmm0,0(%rax) - movdqa %xmm0,16(%rax) - leaq 32(%rax),%rax - cmpq %rax,%rbp - ja L$ctr_enc_bzero - - leaq 120(%rbp),%rax - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbx - movq -8(%rax),%rbp - leaq (%rax),%rsp -L$ctr_enc_epilogue: - .byte 0xf3,0xc3 - -.globl _bsaes_xts_encrypt -.private_extern _bsaes_xts_encrypt - -.p2align 4 -_bsaes_xts_encrypt: - movq %rsp,%rax -L$xts_enc_prologue: - pushq %rbp - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - leaq -72(%rsp),%rsp - movq %rsp,%rbp - movq %rdi,%r12 - movq %rsi,%r13 - movq %rdx,%r14 - movq %rcx,%r15 - - leaq (%r9),%rdi - leaq 32(%rbp),%rsi - leaq (%r8),%rdx - call _asm_AES_encrypt - - movl 240(%r15),%eax - movq %r14,%rbx - - movl %eax,%edx - shlq $7,%rax - subq $96,%rax - subq %rax,%rsp - - movq %rsp,%rax - movq %r15,%rcx - movl %edx,%r10d - call _bsaes_key_convert - pxor %xmm6,%xmm7 - movdqa %xmm7,(%rax) - - andq $-16,%r14 - subq $0x80,%rsp - movdqa 32(%rbp),%xmm6 - - pxor %xmm14,%xmm14 - movdqa L$xts_magic(%rip),%xmm12 - pcmpgtd %xmm6,%xmm14 - - subq $0x80,%r14 - jc L$xts_enc_short - jmp L$xts_enc_loop - -.p2align 4 -L$xts_enc_loop: - pshufd $0x13,%xmm14,%xmm13 - pxor 
%xmm14,%xmm14 - movdqa %xmm6,%xmm15 - movdqa %xmm6,0(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm0 - movdqa %xmm6,16(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 0(%r12),%xmm7 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm1 - movdqa %xmm6,32(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm15 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm2 - movdqa %xmm6,48(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm0 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm3 - movdqa %xmm6,64(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm4 - movdqa %xmm6,80(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 64(%r12),%xmm11 - pxor %xmm10,%xmm2 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm5 - movdqa %xmm6,96(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 80(%r12),%xmm12 - pxor %xmm11,%xmm3 - movdqu 96(%r12),%xmm13 - pxor %xmm12,%xmm4 - movdqu 112(%r12),%xmm14 - leaq 128(%r12),%r12 - movdqa %xmm6,112(%rsp) - pxor %xmm13,%xmm5 - leaq 128(%rsp),%rax - pxor %xmm14,%xmm6 - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm5 - movdqu %xmm3,32(%r13) - pxor 64(%rsp),%xmm2 - movdqu %xmm5,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm2,64(%r13) - pxor 96(%rsp),%xmm1 - movdqu %xmm6,80(%r13) - pxor 112(%rsp),%xmm4 - movdqu %xmm1,96(%r13) - movdqu %xmm4,112(%r13) - leaq 128(%r13),%r13 - - movdqa 112(%rsp),%xmm6 - pxor %xmm14,%xmm14 - movdqa L$xts_magic(%rip),%xmm12 - pcmpgtd %xmm6,%xmm14 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - - subq $0x80,%r14 - jnc L$xts_enc_loop - -L$xts_enc_short: - addq $0x80,%r14 - jz L$xts_enc_done - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm15 - movdqa %xmm6,0(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm0 - movdqa %xmm6,16(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 0(%r12),%xmm7 - cmpq $16,%r14 - je L$xts_enc_1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm1 - movdqa %xmm6,32(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 16(%r12),%xmm8 - cmpq $32,%r14 - je L$xts_enc_2 - pxor %xmm7,%xmm15 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm2 - movdqa %xmm6,48(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 32(%r12),%xmm9 - cmpq $48,%r14 - je L$xts_enc_3 - pxor %xmm8,%xmm0 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm3 - movdqa %xmm6,64(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 48(%r12),%xmm10 - cmpq 
$64,%r14 - je L$xts_enc_4 - pxor %xmm9,%xmm1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm4 - movdqa %xmm6,80(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 64(%r12),%xmm11 - cmpq $80,%r14 - je L$xts_enc_5 - pxor %xmm10,%xmm2 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm5 - movdqa %xmm6,96(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 80(%r12),%xmm12 - cmpq $96,%r14 - je L$xts_enc_6 - pxor %xmm11,%xmm3 - movdqu 96(%r12),%xmm13 - pxor %xmm12,%xmm4 - movdqa %xmm6,112(%rsp) - leaq 112(%r12),%r12 - pxor %xmm13,%xmm5 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm5 - movdqu %xmm3,32(%r13) - pxor 64(%rsp),%xmm2 - movdqu %xmm5,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm2,64(%r13) - pxor 96(%rsp),%xmm1 - movdqu %xmm6,80(%r13) - movdqu %xmm1,96(%r13) - leaq 112(%r13),%r13 - - movdqa 112(%rsp),%xmm6 - jmp L$xts_enc_done -.p2align 4 -L$xts_enc_6: - pxor %xmm11,%xmm3 - leaq 96(%r12),%r12 - pxor %xmm12,%xmm4 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm5 - movdqu %xmm3,32(%r13) - pxor 64(%rsp),%xmm2 - movdqu %xmm5,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm2,64(%r13) - movdqu %xmm6,80(%r13) - leaq 96(%r13),%r13 - - movdqa 96(%rsp),%xmm6 - jmp L$xts_enc_done -.p2align 4 -L$xts_enc_5: - pxor %xmm10,%xmm2 - leaq 80(%r12),%r12 - pxor %xmm11,%xmm3 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm5 - movdqu %xmm3,32(%r13) - pxor 64(%rsp),%xmm2 - movdqu %xmm5,48(%r13) - movdqu %xmm2,64(%r13) - leaq 80(%r13),%r13 - - movdqa 80(%rsp),%xmm6 - jmp L$xts_enc_done -.p2align 4 -L$xts_enc_4: - pxor %xmm9,%xmm1 - leaq 64(%r12),%r12 - pxor %xmm10,%xmm2 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm5 - movdqu %xmm3,32(%r13) - movdqu %xmm5,48(%r13) - leaq 64(%r13),%r13 - - movdqa 64(%rsp),%xmm6 - jmp L$xts_enc_done -.p2align 4 -L$xts_enc_3: - pxor %xmm8,%xmm0 - leaq 48(%r12),%r12 - pxor %xmm9,%xmm1 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - movdqu %xmm3,32(%r13) - leaq 48(%r13),%r13 - - movdqa 48(%rsp),%xmm6 - jmp L$xts_enc_done -.p2align 4 -L$xts_enc_2: - pxor %xmm7,%xmm15 - leaq 32(%r12),%r12 - pxor %xmm8,%xmm0 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - leaq 32(%r13),%r13 - - movdqa 32(%rsp),%xmm6 - jmp L$xts_enc_done -.p2align 4 -L$xts_enc_1: - pxor %xmm15,%xmm7 - leaq 16(%r12),%r12 - movdqa %xmm7,32(%rbp) - leaq 32(%rbp),%rdi - leaq 32(%rbp),%rsi - leaq (%r15),%rdx - call _asm_AES_encrypt - pxor 32(%rbp),%xmm15 - - - - - - movdqu %xmm15,0(%r13) - leaq 16(%r13),%r13 - - movdqa 16(%rsp),%xmm6 - -L$xts_enc_done: - andl $15,%ebx - jz L$xts_enc_ret - movq %r13,%rdx - -L$xts_enc_steal: - movzbl (%r12),%eax - 
movzbl -16(%rdx),%ecx - leaq 1(%r12),%r12 - movb %al,-16(%rdx) - movb %cl,0(%rdx) - leaq 1(%rdx),%rdx - subl $1,%ebx - jnz L$xts_enc_steal - - movdqu -16(%r13),%xmm15 - leaq 32(%rbp),%rdi - pxor %xmm6,%xmm15 - leaq 32(%rbp),%rsi - movdqa %xmm15,32(%rbp) - leaq (%r15),%rdx - call _asm_AES_encrypt - pxor 32(%rbp),%xmm6 - movdqu %xmm6,-16(%r13) - -L$xts_enc_ret: - leaq (%rsp),%rax - pxor %xmm0,%xmm0 -L$xts_enc_bzero: - movdqa %xmm0,0(%rax) - movdqa %xmm0,16(%rax) - leaq 32(%rax),%rax - cmpq %rax,%rbp - ja L$xts_enc_bzero - - leaq 120(%rbp),%rax - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbx - movq -8(%rax),%rbp - leaq (%rax),%rsp -L$xts_enc_epilogue: - .byte 0xf3,0xc3 - - -.globl _bsaes_xts_decrypt -.private_extern _bsaes_xts_decrypt - -.p2align 4 -_bsaes_xts_decrypt: - movq %rsp,%rax -L$xts_dec_prologue: - pushq %rbp - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - leaq -72(%rsp),%rsp - movq %rsp,%rbp - movq %rdi,%r12 - movq %rsi,%r13 - movq %rdx,%r14 - movq %rcx,%r15 - - leaq (%r9),%rdi - leaq 32(%rbp),%rsi - leaq (%r8),%rdx - call _asm_AES_encrypt - - movl 240(%r15),%eax - movq %r14,%rbx - - movl %eax,%edx - shlq $7,%rax - subq $96,%rax - subq %rax,%rsp - - movq %rsp,%rax - movq %r15,%rcx - movl %edx,%r10d - call _bsaes_key_convert - pxor (%rsp),%xmm7 - movdqa %xmm6,(%rax) - movdqa %xmm7,(%rsp) - - xorl %eax,%eax - andq $-16,%r14 - testl $15,%ebx - setnz %al - shlq $4,%rax - subq %rax,%r14 - - subq $0x80,%rsp - movdqa 32(%rbp),%xmm6 - - pxor %xmm14,%xmm14 - movdqa L$xts_magic(%rip),%xmm12 - pcmpgtd %xmm6,%xmm14 - - subq $0x80,%r14 - jc L$xts_dec_short - jmp L$xts_dec_loop - -.p2align 4 -L$xts_dec_loop: - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm15 - movdqa %xmm6,0(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm0 - movdqa %xmm6,16(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 0(%r12),%xmm7 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm1 - movdqa %xmm6,32(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm15 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm2 - movdqa %xmm6,48(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm0 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm3 - movdqa %xmm6,64(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm4 - movdqa %xmm6,80(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 64(%r12),%xmm11 - pxor %xmm10,%xmm2 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm5 - movdqa %xmm6,96(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 80(%r12),%xmm12 - pxor %xmm11,%xmm3 - movdqu 96(%r12),%xmm13 - pxor %xmm12,%xmm4 - movdqu 112(%r12),%xmm14 - leaq 128(%r12),%r12 - movdqa %xmm6,112(%rsp) - pxor %xmm13,%xmm5 - leaq 128(%rsp),%rax - pxor %xmm14,%xmm6 - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - pxor 
48(%rsp),%xmm3 - movdqu %xmm5,32(%r13) - pxor 64(%rsp),%xmm1 - movdqu %xmm3,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm1,64(%r13) - pxor 96(%rsp),%xmm2 - movdqu %xmm6,80(%r13) - pxor 112(%rsp),%xmm4 - movdqu %xmm2,96(%r13) - movdqu %xmm4,112(%r13) - leaq 128(%r13),%r13 - - movdqa 112(%rsp),%xmm6 - pxor %xmm14,%xmm14 - movdqa L$xts_magic(%rip),%xmm12 - pcmpgtd %xmm6,%xmm14 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - - subq $0x80,%r14 - jnc L$xts_dec_loop - -L$xts_dec_short: - addq $0x80,%r14 - jz L$xts_dec_done - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm15 - movdqa %xmm6,0(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm0 - movdqa %xmm6,16(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 0(%r12),%xmm7 - cmpq $16,%r14 - je L$xts_dec_1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm1 - movdqa %xmm6,32(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 16(%r12),%xmm8 - cmpq $32,%r14 - je L$xts_dec_2 - pxor %xmm7,%xmm15 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm2 - movdqa %xmm6,48(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 32(%r12),%xmm9 - cmpq $48,%r14 - je L$xts_dec_3 - pxor %xmm8,%xmm0 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm3 - movdqa %xmm6,64(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 48(%r12),%xmm10 - cmpq $64,%r14 - je L$xts_dec_4 - pxor %xmm9,%xmm1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm4 - movdqa %xmm6,80(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 64(%r12),%xmm11 - cmpq $80,%r14 - je L$xts_dec_5 - pxor %xmm10,%xmm2 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm5 - movdqa %xmm6,96(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 80(%r12),%xmm12 - cmpq $96,%r14 - je L$xts_dec_6 - pxor %xmm11,%xmm3 - movdqu 96(%r12),%xmm13 - pxor %xmm12,%xmm4 - movdqa %xmm6,112(%rsp) - leaq 112(%r12),%r12 - pxor %xmm13,%xmm5 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm3 - movdqu %xmm5,32(%r13) - pxor 64(%rsp),%xmm1 - movdqu %xmm3,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm1,64(%r13) - pxor 96(%rsp),%xmm2 - movdqu %xmm6,80(%r13) - movdqu %xmm2,96(%r13) - leaq 112(%r13),%r13 - - movdqa 112(%rsp),%xmm6 - jmp L$xts_dec_done -.p2align 4 -L$xts_dec_6: - pxor %xmm11,%xmm3 - leaq 96(%r12),%r12 - pxor %xmm12,%xmm4 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm3 - movdqu %xmm5,32(%r13) - pxor 64(%rsp),%xmm1 - movdqu %xmm3,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm1,64(%r13) - movdqu %xmm6,80(%r13) - leaq 96(%r13),%r13 - - movdqa 96(%rsp),%xmm6 - jmp L$xts_dec_done -.p2align 4 -L$xts_dec_5: - pxor %xmm10,%xmm2 - leaq 80(%r12),%r12 - pxor %xmm11,%xmm3 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - 
pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm3 - movdqu %xmm5,32(%r13) - pxor 64(%rsp),%xmm1 - movdqu %xmm3,48(%r13) - movdqu %xmm1,64(%r13) - leaq 80(%r13),%r13 - - movdqa 80(%rsp),%xmm6 - jmp L$xts_dec_done -.p2align 4 -L$xts_dec_4: - pxor %xmm9,%xmm1 - leaq 64(%r12),%r12 - pxor %xmm10,%xmm2 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm3 - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - leaq 64(%r13),%r13 - - movdqa 64(%rsp),%xmm6 - jmp L$xts_dec_done -.p2align 4 -L$xts_dec_3: - pxor %xmm8,%xmm0 - leaq 48(%r12),%r12 - pxor %xmm9,%xmm1 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - leaq 48(%r13),%r13 - - movdqa 48(%rsp),%xmm6 - jmp L$xts_dec_done -.p2align 4 -L$xts_dec_2: - pxor %xmm7,%xmm15 - leaq 32(%r12),%r12 - pxor %xmm8,%xmm0 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - leaq 32(%r13),%r13 - - movdqa 32(%rsp),%xmm6 - jmp L$xts_dec_done -.p2align 4 -L$xts_dec_1: - pxor %xmm15,%xmm7 - leaq 16(%r12),%r12 - movdqa %xmm7,32(%rbp) - leaq 32(%rbp),%rdi - leaq 32(%rbp),%rsi - leaq (%r15),%rdx - call _asm_AES_decrypt - pxor 32(%rbp),%xmm15 - - - - - - movdqu %xmm15,0(%r13) - leaq 16(%r13),%r13 - - movdqa 16(%rsp),%xmm6 - -L$xts_dec_done: - andl $15,%ebx - jz L$xts_dec_ret - - pxor %xmm14,%xmm14 - movdqa L$xts_magic(%rip),%xmm12 - pcmpgtd %xmm6,%xmm14 - pshufd $0x13,%xmm14,%xmm13 - movdqa %xmm6,%xmm5 - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - movdqu (%r12),%xmm15 - pxor %xmm13,%xmm6 - - leaq 32(%rbp),%rdi - pxor %xmm6,%xmm15 - leaq 32(%rbp),%rsi - movdqa %xmm15,32(%rbp) - leaq (%r15),%rdx - call _asm_AES_decrypt - pxor 32(%rbp),%xmm6 - movq %r13,%rdx - movdqu %xmm6,(%r13) - -L$xts_dec_steal: - movzbl 16(%r12),%eax - movzbl (%rdx),%ecx - leaq 1(%r12),%r12 - movb %al,(%rdx) - movb %cl,16(%rdx) - leaq 1(%rdx),%rdx - subl $1,%ebx - jnz L$xts_dec_steal - - movdqu (%r13),%xmm15 - leaq 32(%rbp),%rdi - pxor %xmm5,%xmm15 - leaq 32(%rbp),%rsi - movdqa %xmm15,32(%rbp) - leaq (%r15),%rdx - call _asm_AES_decrypt - pxor 32(%rbp),%xmm5 - movdqu %xmm5,(%r13) - -L$xts_dec_ret: - leaq (%rsp),%rax - pxor %xmm0,%xmm0 -L$xts_dec_bzero: - movdqa %xmm0,0(%rax) - movdqa %xmm0,16(%rax) - leaq 32(%rax),%rax - cmpq %rax,%rbp - ja L$xts_dec_bzero - - leaq 120(%rbp),%rax - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbx - movq -8(%rax),%rbp - leaq (%rax),%rsp -L$xts_dec_epilogue: - .byte 0xf3,0xc3 - - -.p2align 6 -_bsaes_const: -L$M0ISR: -.quad 0x0a0e0206070b0f03, 0x0004080c0d010509 -L$ISRM0: -.quad 0x01040b0e0205080f, 0x0306090c00070a0d -L$ISR: -.quad 0x0504070602010003, 0x0f0e0d0c080b0a09 -L$BS0: -.quad 0x5555555555555555, 0x5555555555555555 -L$BS1: -.quad 0x3333333333333333, 0x3333333333333333 -L$BS2: -.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f -L$SR: -.quad 0x0504070600030201, 0x0f0e0d0c0a09080b -L$SRM0: -.quad 0x0304090e00050a0f, 0x01060b0c0207080d -L$M0SR: -.quad 0x0a0e02060f03070b, 0x0004080c05090d01 -L$SWPUP: -.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 -L$SWPUPM0SR: -.quad 0x0a0d02060c03070b, 0x0004080f05090e01 -L$ADD1: -.quad 0x0000000000000000, 0x0000000100000000 
-L$ADD2: -.quad 0x0000000000000000, 0x0000000200000000 -L$ADD3: -.quad 0x0000000000000000, 0x0000000300000000 -L$ADD4: -.quad 0x0000000000000000, 0x0000000400000000 -L$ADD5: -.quad 0x0000000000000000, 0x0000000500000000 -L$ADD6: -.quad 0x0000000000000000, 0x0000000600000000 -L$ADD7: -.quad 0x0000000000000000, 0x0000000700000000 -L$ADD8: -.quad 0x0000000000000000, 0x0000000800000000 -L$xts_magic: -.long 0x87,0,1,0 -L$masks: -.quad 0x0101010101010101, 0x0101010101010101 -.quad 0x0202020202020202, 0x0202020202020202 -.quad 0x0404040404040404, 0x0404040404040404 -.quad 0x0808080808080808, 0x0808080808080808 -L$M0: -.quad 0x02060a0e03070b0f, 0x0004080c0105090d -L$63: -.quad 0x6363636363636363, 0x6363636363636363 -.byte 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,69,109,105,108,105,97,32,75,195,164,115,112,101,114,44,32,80,101,116,101,114,32,83,99,104,119,97,98,101,44,32,65,110,100,121,32,80,111,108,121,97,107,111,118,0 -.p2align 6 - -#endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S new file mode 100644 index 0000000000..1b9129f2dd --- /dev/null +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S @@ -0,0 +1,426 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text + + + + + + +.globl _gcm_gmult_ssse3 +.private_extern _gcm_gmult_ssse3 +.p2align 4 +_gcm_gmult_ssse3: + +L$gmult_seh_begin: + movdqu (%rdi),%xmm0 + movdqa L$reverse_bytes(%rip),%xmm10 + movdqa L$low4_mask(%rip),%xmm2 + + +.byte 102,65,15,56,0,194 + + + movdqa %xmm2,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm0 + + + + + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + movq $5,%rax +L$oop_row_1: + movdqa (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz L$oop_row_1 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movq $5,%rax +L$oop_row_2: + movdqa (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz L$oop_row_2 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movq $6,%rax +L$oop_row_3: + movdqa (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 +.byte 
102,15,56,0,224 +.byte 102,15,56,0,233 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz L$oop_row_3 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + +.byte 102,65,15,56,0,210 + movdqu %xmm2,(%rdi) + + + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + .byte 0xf3,0xc3 +L$gmult_seh_end: + + + + + + + + +.globl _gcm_ghash_ssse3 +.private_extern _gcm_ghash_ssse3 +.p2align 4 +_gcm_ghash_ssse3: +L$ghash_seh_begin: + + movdqu (%rdi),%xmm0 + movdqa L$reverse_bytes(%rip),%xmm10 + movdqa L$low4_mask(%rip),%xmm11 + + + andq $-16,%rcx + + + +.byte 102,65,15,56,0,194 + + + pxor %xmm3,%xmm3 +L$oop_ghash: + + movdqu (%rdx),%xmm1 +.byte 102,65,15,56,0,202 + pxor %xmm1,%xmm0 + + + movdqa %xmm11,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm11,%xmm0 + + + + + pxor %xmm2,%xmm2 + + movq $5,%rax +L$oop_row_4: + movdqa (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz L$oop_row_4 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movq $5,%rax +L$oop_row_5: + movdqa (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz L$oop_row_5 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movq $6,%rax +L$oop_row_6: + movdqa (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz L$oop_row_6 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movdqa %xmm2,%xmm0 + + + leaq -256(%rsi),%rsi + + + leaq 16(%rdx),%rdx + subq $16,%rcx + jnz L$oop_ghash + + +.byte 102,65,15,56,0,194 + movdqu %xmm0,(%rdi) + + + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + .byte 0xf3,0xc3 +L$ghash_seh_end: + + + +.p2align 4 + + +L$reverse_bytes: +.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +L$low4_mask: +.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f +#endif diff --git 
a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/ghash-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/ghash-x86_64.S index 78b88cc28d..d7dcf5d61f 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/ghash-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/ghash-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text @@ -7,13 +19,21 @@ .p2align 4 _gcm_gmult_4bit: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $280,%rsp + L$gmult_prologue: movzbq 15(%rdi),%r8 @@ -91,23 +111,35 @@ L$break1: movq %r9,(%rdi) leaq 280+48(%rsp),%rsi + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$gmult_epilogue: .byte 0xf3,0xc3 + .globl _gcm_ghash_4bit .private_extern _gcm_ghash_4bit .p2align 4 _gcm_ghash_4bit: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $280,%rsp + L$ghash_prologue: movq %rdx,%r14 movq %rcx,%r15 @@ -653,21 +685,31 @@ L$outer_loop: movq %r9,(%rdi) leaq 280+48(%rsp),%rsi + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq 0(%rsi),%rsp + L$ghash_epilogue: .byte 0xf3,0xc3 + .globl _gcm_init_clmul .private_extern _gcm_init_clmul .p2align 4 _gcm_init_clmul: + L$_init_clmul: movdqu (%rsi),%xmm2 pshufd $78,%xmm2,%xmm2 @@ -820,11 +862,13 @@ L$_init_clmul: movdqu %xmm4,80(%rdi) .byte 0xf3,0xc3 + .globl _gcm_gmult_clmul .private_extern _gcm_gmult_clmul .p2align 4 _gcm_gmult_clmul: + L$_gmult_clmul: movdqu (%rdi),%xmm0 movdqa L$bswap_mask(%rip),%xmm5 @@ -872,11 +916,13 @@ L$_gmult_clmul: movdqu %xmm0,(%rdi) .byte 0xf3,0xc3 + .globl _gcm_ghash_clmul .private_extern _gcm_ghash_clmul .p2align 5 _gcm_ghash_clmul: + L$_ghash_clmul: movdqa L$bswap_mask(%rip),%xmm10 @@ -1257,11 +1303,13 @@ L$done: movdqu %xmm0,(%rdi) .byte 0xf3,0xc3 + .globl _gcm_init_avx .private_extern _gcm_init_avx .p2align 5 _gcm_init_avx: + vzeroupper vmovdqu (%rsi),%xmm2 @@ -1365,18 +1413,22 @@ L$init_start_avx: vzeroupper .byte 0xf3,0xc3 + .globl _gcm_gmult_avx .private_extern _gcm_gmult_avx .p2align 5 _gcm_gmult_avx: + jmp L$_gmult_clmul + .globl _gcm_ghash_avx .private_extern _gcm_ghash_avx .p2align 5 _gcm_ghash_avx: + vzeroupper vmovdqu (%rdi),%xmm10 @@ -1749,6 +1801,7 @@ L$tail_no_xor_avx: vzeroupper .byte 0xf3,0xc3 + .p2align 6 L$bswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/md5-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/md5-x86_64.S index 776c116046..cfb4180da3 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/md5-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/md5-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .p2align 4 @@ -6,11 +18,17 @@ .private_extern _md5_block_asm_data_order _md5_block_asm_data_order: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r14 + pushq %r15 + L$prologue: @@ -660,12 +678,19 @@ L$end: movl %edx,12(%rbp) movq (%rsp),%r15 + movq 8(%rsp),%r14 + movq 16(%rsp),%r12 + movq 24(%rsp),%rbx + movq 32(%rsp),%rbp + addq $40,%rsp + L$epilogue: .byte 0xf3,0xc3 + #endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/p256-x86_64-asm.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/p256-x86_64-asm.S index f7875772ad..1f4a93132f 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/p256-x86_64-asm.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/p256-x86_64-asm.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text @@ -17,15 +29,25 @@ L$ONE_mont: .quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe +L$ord: +.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000 +L$ordK: +.quad 0xccd1c8aaee00bc4f + + .globl _ecp_nistz256_neg .private_extern _ecp_nistz256_neg .p2align 5 _ecp_nistz256_neg: + pushq %r12 + pushq %r13 +L$neg_body: + xorq %r8,%r8 xorq %r9,%r9 xorq %r10,%r10 @@ -58,8 +80,13 @@ _ecp_nistz256_neg: movq %r10,16(%rdi) movq %r11,24(%rdi) - popq %r13 - popq %r12 + movq 0(%rsp),%r13 + + movq 8(%rsp),%r12 + + leaq 16(%rsp),%rsp + +L$neg_epilogue: .byte 0xf3,0xc3 @@ -68,18 +95,1101 @@ _ecp_nistz256_neg: + +.globl _ecp_nistz256_ord_mul_mont +.private_extern _ecp_nistz256_ord_mul_mont + +.p2align 5 +_ecp_nistz256_ord_mul_mont: + + leaq _OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx + cmpl $0x80100,%ecx + je L$ecp_nistz256_ord_mul_montx + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$ord_mul_body: + + movq 0(%rdx),%rax + movq %rdx,%rbx + leaq L$ord(%rip),%r14 + movq L$ordK(%rip),%r15 + + + movq %rax,%rcx + mulq 0(%rsi) + movq %rax,%r8 + movq %rcx,%rax + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r9 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + + movq %r8,%r13 + imulq %r15,%r8 + + movq %rdx,%r11 + mulq 24(%rsi) + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r12 + + + mulq 0(%r14) + movq %r8,%rbp + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rcx + + subq %r8,%r10 + sbbq $0,%r8 + + mulq 8(%r14) + addq %rcx,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %rbp,%rax + adcq %rdx,%r10 + movq %rbp,%rdx + adcq $0,%r8 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r11 + movq 8(%rbx),%rax + sbbq %rdx,%rbp + + addq %r8,%r11 + adcq %rbp,%r12 + adcq $0,%r13 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r9 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq 
%rcx,%rax + adcq $0,%rdx + + movq %r9,%rcx + imulq %r15,%r9 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + xorq %r8,%r8 + addq %rax,%r12 + movq %r9,%rax + adcq %rdx,%r13 + adcq $0,%r8 + + + mulq 0(%r14) + movq %r9,%rbp + addq %rax,%rcx + movq %r9,%rax + adcq %rdx,%rcx + + subq %r9,%r11 + sbbq $0,%r9 + + mulq 8(%r14) + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rbp,%rax + adcq %rdx,%r11 + movq %rbp,%rdx + adcq $0,%r9 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r12 + movq 16(%rbx),%rax + sbbq %rdx,%rbp + + addq %r9,%r12 + adcq %rbp,%r13 + adcq $0,%r8 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rcx,%rax + adcq $0,%rdx + + movq %r10,%rcx + imulq %r15,%r10 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r13 + adcq $0,%rdx + xorq %r9,%r9 + addq %rax,%r13 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + + mulq 0(%r14) + movq %r10,%rbp + addq %rax,%rcx + movq %r10,%rax + adcq %rdx,%rcx + + subq %r10,%r12 + sbbq $0,%r10 + + mulq 8(%r14) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq %rdx,%r12 + movq %rbp,%rdx + adcq $0,%r10 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r13 + movq 24(%rbx),%rax + sbbq %rdx,%rbp + + addq %r10,%r13 + adcq %rbp,%r8 + adcq $0,%r9 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %rcx,%rax + adcq $0,%rdx + + movq %r11,%rcx + imulq %r15,%r11 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r8 + adcq $0,%rdx + xorq %r10,%r10 + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + + mulq 0(%r14) + movq %r11,%rbp + addq %rax,%rcx + movq %r11,%rax + adcq %rdx,%rcx + + subq %r11,%r13 + sbbq $0,%r11 + + mulq 8(%r14) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq %rdx,%r13 + movq %rbp,%rdx + adcq $0,%r11 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r8 + sbbq %rdx,%rbp + + addq %r11,%r8 + adcq %rbp,%r9 + adcq $0,%r10 + + + movq %r12,%rsi + subq 0(%r14),%r12 + movq %r13,%r11 + sbbq 8(%r14),%r13 + movq %r8,%rcx + sbbq 16(%r14),%r8 + movq %r9,%rbp + sbbq 24(%r14),%r9 + sbbq $0,%r10 + + cmovcq %rsi,%r12 + cmovcq %r11,%r13 + cmovcq %rcx,%r8 + cmovcq %rbp,%r9 + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$ord_mul_epilogue: + .byte 0xf3,0xc3 + + + + + + + + + +.globl _ecp_nistz256_ord_sqr_mont +.private_extern _ecp_nistz256_ord_sqr_mont + +.p2align 5 +_ecp_nistz256_ord_sqr_mont: + + leaq _OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx + cmpl $0x80100,%ecx + je L$ecp_nistz256_ord_sqr_montx + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$ord_sqr_body: + + movq 0(%rsi),%r8 + movq 8(%rsi),%rax + movq 16(%rsi),%r14 + movq 24(%rsi),%r15 + leaq L$ord(%rip),%rsi + movq %rdx,%rbx + jmp L$oop_ord_sqr + +.p2align 5 +L$oop_ord_sqr: + + movq %rax,%rbp + mulq %r8 + movq %rax,%r9 +.byte 102,72,15,110,205 + movq %r14,%rax + movq %rdx,%r10 + + mulq %r8 + addq 
%rax,%r10 + movq %r15,%rax +.byte 102,73,15,110,214 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r8 + addq %rax,%r11 + movq %r15,%rax +.byte 102,73,15,110,223 + adcq $0,%rdx + movq %rdx,%r12 + + + mulq %r14 + movq %rax,%r13 + movq %r14,%rax + movq %rdx,%r14 + + + mulq %rbp + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq %rbp + addq %rax,%r12 + adcq $0,%rdx + + addq %r15,%r12 + adcq %rdx,%r13 + adcq $0,%r14 + + + xorq %r15,%r15 + movq %r8,%rax + addq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + adcq %r14,%r14 + adcq $0,%r15 + + + mulq %rax + movq %rax,%r8 +.byte 102,72,15,126,200 + movq %rdx,%rbp + + mulq %rax + addq %rbp,%r9 + adcq %rax,%r10 +.byte 102,72,15,126,208 + adcq $0,%rdx + movq %rdx,%rbp + + mulq %rax + addq %rbp,%r11 + adcq %rax,%r12 +.byte 102,72,15,126,216 + adcq $0,%rdx + movq %rdx,%rbp + + movq %r8,%rcx + imulq 32(%rsi),%r8 + + mulq %rax + addq %rbp,%r13 + adcq %rax,%r14 + movq 0(%rsi),%rax + adcq %rdx,%r15 + + + mulq %r8 + movq %r8,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r8,%r10 + sbbq $0,%rbp + + mulq %r8 + addq %rcx,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %r8,%rax + adcq %rdx,%r10 + movq %r8,%rdx + adcq $0,%rbp + + movq %r9,%rcx + imulq 32(%rsi),%r9 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r11 + movq 0(%rsi),%rax + sbbq %rdx,%r8 + + addq %rbp,%r11 + adcq $0,%r8 + + + mulq %r9 + movq %r9,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r9,%r11 + sbbq $0,%rbp + + mulq %r9 + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %r9,%rax + adcq %rdx,%r11 + movq %r9,%rdx + adcq $0,%rbp + + movq %r10,%rcx + imulq 32(%rsi),%r10 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r8 + movq 0(%rsi),%rax + sbbq %rdx,%r9 + + addq %rbp,%r8 + adcq $0,%r9 + + + mulq %r10 + movq %r10,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r10,%r8 + sbbq $0,%rbp + + mulq %r10 + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r10,%rax + adcq %rdx,%r8 + movq %r10,%rdx + adcq $0,%rbp + + movq %r11,%rcx + imulq 32(%rsi),%r11 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r9 + movq 0(%rsi),%rax + sbbq %rdx,%r10 + + addq %rbp,%r9 + adcq $0,%r10 + + + mulq %r11 + movq %r11,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r11,%r9 + sbbq $0,%rbp + + mulq %r11 + addq %rcx,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + movq %r11,%rdx + adcq $0,%rbp + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r10 + sbbq %rdx,%r11 + + addq %rbp,%r10 + adcq $0,%r11 + + + xorq %rdx,%rdx + addq %r12,%r8 + adcq %r13,%r9 + movq %r8,%r12 + adcq %r14,%r10 + adcq %r15,%r11 + movq %r9,%rax + adcq $0,%rdx + + + subq 0(%rsi),%r8 + movq %r10,%r14 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r11,%r15 + sbbq 24(%rsi),%r11 + sbbq $0,%rdx + + cmovcq %r12,%r8 + cmovncq %r9,%rax + cmovncq %r10,%r14 + cmovncq %r11,%r15 + + decq %rbx + jnz L$oop_ord_sqr + + movq %r8,0(%rdi) + movq %rax,8(%rdi) + pxor %xmm1,%xmm1 + movq %r14,16(%rdi) + pxor %xmm2,%xmm2 + movq %r15,24(%rdi) + pxor %xmm3,%xmm3 + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$ord_sqr_epilogue: + .byte 0xf3,0xc3 + + + + +.p2align 5 +ecp_nistz256_ord_mul_montx: + +L$ecp_nistz256_ord_mul_montx: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$ord_mulx_body: + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 
16(%rsi),%r11 + movq 24(%rsi),%r12 + leaq -128(%rsi),%rsi + leaq L$ord-128(%rip),%r14 + movq L$ordK(%rip),%r15 + + + mulxq %r9,%r8,%r9 + mulxq %r10,%rcx,%r10 + mulxq %r11,%rbp,%r11 + addq %rcx,%r9 + mulxq %r12,%rcx,%r12 + movq %r8,%rdx + mulxq %r15,%rdx,%rax + adcq %rbp,%r10 + adcq %rcx,%r11 + adcq $0,%r12 + + + xorq %r13,%r13 + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%r14),%rcx,%rbp + movq 8(%rbx),%rdx + adcxq %rcx,%r11 + adoxq %rbp,%r12 + adcxq %r8,%r12 + adoxq %r8,%r13 + adcq $0,%r13 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r9,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + adcxq %r8,%r13 + adoxq %r8,%r8 + adcq $0,%r8 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%r14),%rcx,%rbp + movq 16(%rbx),%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcxq %r9,%r13 + adoxq %r9,%r8 + adcq $0,%r8 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r10,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + adcxq %r9,%r8 + adoxq %r9,%r9 + adcq $0,%r9 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%r14),%rcx,%rbp + movq 24(%rbx),%rdx + adcxq %rcx,%r13 + adoxq %rbp,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcq $0,%r9 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r11,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r8 + adoxq %rbp,%r9 + + adcxq %r10,%r9 + adoxq %r10,%r10 + adcq $0,%r10 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%r14),%rcx,%rbp + leaq 128(%r14),%r14 + movq %r12,%rbx + adcxq %rcx,%r8 + adoxq %rbp,%r9 + movq %r13,%rdx + adcxq %r11,%r9 + adoxq %r11,%r10 + adcq $0,%r10 + + + + movq %r8,%rcx + subq 0(%r14),%r12 + sbbq 8(%r14),%r13 + sbbq 16(%r14),%r8 + movq %r9,%rbp + sbbq 24(%r14),%r9 + sbbq $0,%r10 + + cmovcq %rbx,%r12 + cmovcq %rdx,%r13 + cmovcq %rcx,%r8 + cmovcq %rbp,%r9 + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$ord_mulx_epilogue: + .byte 0xf3,0xc3 + + + + +.p2align 5 +ecp_nistz256_ord_sqr_montx: + +L$ecp_nistz256_ord_sqr_montx: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$ord_sqrx_body: + + movq %rdx,%rbx + movq 0(%rsi),%rdx + movq 
8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + leaq L$ord(%rip),%rsi + jmp L$oop_ord_sqrx + +.p2align 5 +L$oop_ord_sqrx: + mulxq %r14,%r9,%r10 + mulxq %r15,%rcx,%r11 + movq %rdx,%rax +.byte 102,73,15,110,206 + mulxq %r8,%rbp,%r12 + movq %r14,%rdx + addq %rcx,%r10 +.byte 102,73,15,110,215 + adcq %rbp,%r11 + adcq $0,%r12 + xorq %r13,%r13 + + mulxq %r15,%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq %r8,%rcx,%rbp + movq %r15,%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcq $0,%r13 + + mulxq %r8,%rcx,%r14 + movq %rax,%rdx +.byte 102,73,15,110,216 + xorq %r15,%r15 + adcxq %r9,%r9 + adoxq %rcx,%r13 + adcxq %r10,%r10 + adoxq %r15,%r14 + + + mulxq %rdx,%r8,%rbp +.byte 102,72,15,126,202 + adcxq %r11,%r11 + adoxq %rbp,%r9 + adcxq %r12,%r12 + mulxq %rdx,%rcx,%rax +.byte 102,72,15,126,210 + adcxq %r13,%r13 + adoxq %rcx,%r10 + adcxq %r14,%r14 + mulxq %rdx,%rcx,%rbp +.byte 0x67 +.byte 102,72,15,126,218 + adoxq %rax,%r11 + adcxq %r15,%r15 + adoxq %rcx,%r12 + adoxq %rbp,%r13 + mulxq %rdx,%rcx,%rax + adoxq %rcx,%r14 + adoxq %rax,%r15 + + + movq %r8,%rdx + mulxq 32(%rsi),%rdx,%rcx + + xorq %rax,%rax + mulxq 0(%rsi),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + mulxq 8(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + mulxq 16(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + mulxq 24(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r8 + adcxq %rax,%r8 + + + movq %r9,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adoxq %rcx,%r9 + adcxq %rbp,%r10 + mulxq 8(%rsi),%rcx,%rbp + adoxq %rcx,%r10 + adcxq %rbp,%r11 + mulxq 16(%rsi),%rcx,%rbp + adoxq %rcx,%r11 + adcxq %rbp,%r8 + mulxq 24(%rsi),%rcx,%rbp + adoxq %rcx,%r8 + adcxq %rbp,%r9 + adoxq %rax,%r9 + + + movq %r10,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + mulxq 8(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r8 + mulxq 16(%rsi),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + mulxq 24(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + adcxq %rax,%r10 + + + movq %r11,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adoxq %rcx,%r11 + adcxq %rbp,%r8 + mulxq 8(%rsi),%rcx,%rbp + adoxq %rcx,%r8 + adcxq %rbp,%r9 + mulxq 16(%rsi),%rcx,%rbp + adoxq %rcx,%r9 + adcxq %rbp,%r10 + mulxq 24(%rsi),%rcx,%rbp + adoxq %rcx,%r10 + adcxq %rbp,%r11 + adoxq %rax,%r11 + + + addq %r8,%r12 + adcq %r13,%r9 + movq %r12,%rdx + adcq %r14,%r10 + adcq %r15,%r11 + movq %r9,%r14 + adcq $0,%rax + + + subq 0(%rsi),%r12 + movq %r10,%r15 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r11,%r8 + sbbq 24(%rsi),%r11 + sbbq $0,%rax + + cmovncq %r12,%rdx + cmovncq %r9,%r14 + cmovncq %r10,%r15 + cmovncq %r11,%r8 + + decq %rbx + jnz L$oop_ord_sqrx + + movq %rdx,0(%rdi) + movq %r14,8(%rdi) + pxor %xmm1,%xmm1 + movq %r15,16(%rdi) + pxor %xmm2,%xmm2 + movq %r8,24(%rdi) + pxor %xmm3,%xmm3 + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$ord_sqrx_epilogue: + .byte 0xf3,0xc3 + + + + + + + + .globl _ecp_nistz256_mul_mont .private_extern _ecp_nistz256_mul_mont .p2align 5 _ecp_nistz256_mul_mont: + + leaq _OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx L$mul_mont: pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +L$mul_body: + cmpl $0x80100,%ecx + je L$mul_montx movq %rdx,%rbx movq 0(%rdx),%rax movq 0(%rsi),%r9 @@ -88,21 +1198,45 @@ L$mul_mont: movq 24(%rsi),%r12 call __ecp_nistz256_mul_montq + jmp L$mul_mont_done + +.p2align 5 +L$mul_montx: + 
movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + leaq -128(%rsi),%rsi + + call __ecp_nistz256_mul_montx L$mul_mont_done: - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$mul_epilogue: .byte 0xf3,0xc3 + .p2align 5 __ecp_nistz256_mul_montq: + movq %rax,%rbp mulq %r9 movq L$poly+8(%rip),%r14 @@ -321,36 +1455,72 @@ __ecp_nistz256_mul_montq: + .globl _ecp_nistz256_sqr_mont .private_extern _ecp_nistz256_sqr_mont .p2align 5 _ecp_nistz256_sqr_mont: + + leaq _OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +L$sqr_body: + cmpl $0x80100,%ecx + je L$sqr_montx movq 0(%rsi),%rax movq 8(%rsi),%r14 movq 16(%rsi),%r15 movq 24(%rsi),%r8 call __ecp_nistz256_sqr_montq + jmp L$sqr_mont_done + +.p2align 5 +L$sqr_montx: + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + leaq -128(%rsi),%rsi + + call __ecp_nistz256_sqr_montx L$sqr_mont_done: - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$sqr_epilogue: .byte 0xf3,0xc3 + .p2align 5 __ecp_nistz256_sqr_montq: + movq %rax,%r13 mulq %r14 movq %rax,%r9 @@ -511,11 +1681,311 @@ __ecp_nistz256_sqr_montq: +.p2align 5 +__ecp_nistz256_mul_montx: + + + + mulxq %r9,%r8,%r9 + mulxq %r10,%rcx,%r10 + movq $32,%r14 + xorq %r13,%r13 + mulxq %r11,%rbp,%r11 + movq L$poly+24(%rip),%r15 + adcq %rcx,%r9 + mulxq %r12,%rcx,%r12 + movq %r8,%rdx + adcq %rbp,%r10 + shlxq %r14,%r8,%rbp + adcq %rcx,%r11 + shrxq %r14,%r8,%rcx + adcq $0,%r12 + + + + addq %rbp,%r9 + adcq %rcx,%r10 + + mulxq %r15,%rcx,%rbp + movq 8(%rbx),%rdx + adcq %rcx,%r11 + adcq %rbp,%r12 + adcq $0,%r13 + xorq %r8,%r8 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r9,%rdx + adcxq %rcx,%r12 + shlxq %r14,%r9,%rcx + adoxq %rbp,%r13 + shrxq %r14,%r9,%rbp + + adcxq %r8,%r13 + adoxq %r8,%r8 + adcq $0,%r8 + + + + addq %rcx,%r10 + adcq %rbp,%r11 + + mulxq %r15,%rcx,%rbp + movq 16(%rbx),%rdx + adcq %rcx,%r12 + adcq %rbp,%r13 + adcq $0,%r8 + xorq %r9,%r9 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r10,%rdx + adcxq %rcx,%r13 + shlxq %r14,%r10,%rcx + adoxq %rbp,%r8 + shrxq %r14,%r10,%rbp + + adcxq %r9,%r8 + adoxq %r9,%r9 + adcq $0,%r9 + + + + addq %rcx,%r11 + adcq %rbp,%r12 + + mulxq %r15,%rcx,%rbp + movq 24(%rbx),%rdx + adcq %rcx,%r13 + adcq %rbp,%r8 + adcq $0,%r9 + xorq %r10,%r10 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r11,%rdx + adcxq %rcx,%r8 + shlxq %r14,%r11,%rcx + adoxq %rbp,%r9 + shrxq %r14,%r11,%rbp + + adcxq %r10,%r9 + adoxq %r10,%r10 + adcq $0,%r10 + + + + addq 
%rcx,%r12 + adcq %rbp,%r13 + + mulxq %r15,%rcx,%rbp + movq %r12,%rbx + movq L$poly+8(%rip),%r14 + adcq %rcx,%r8 + movq %r13,%rdx + adcq %rbp,%r9 + adcq $0,%r10 + + + + xorl %eax,%eax + movq %r8,%rcx + sbbq $-1,%r12 + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%rbp + sbbq %r15,%r9 + sbbq $0,%r10 + + cmovcq %rbx,%r12 + cmovcq %rdx,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %rbp,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 + + + + +.p2align 5 +__ecp_nistz256_sqr_montx: + + mulxq %r14,%r9,%r10 + mulxq %r15,%rcx,%r11 + xorl %eax,%eax + adcq %rcx,%r10 + mulxq %r8,%rbp,%r12 + movq %r14,%rdx + adcq %rbp,%r11 + adcq $0,%r12 + xorq %r13,%r13 + + + mulxq %r15,%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq %r8,%rcx,%rbp + movq %r15,%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcq $0,%r13 + + + mulxq %r8,%rcx,%r14 + movq 0+128(%rsi),%rdx + xorq %r15,%r15 + adcxq %r9,%r9 + adoxq %rcx,%r13 + adcxq %r10,%r10 + adoxq %r15,%r14 + + mulxq %rdx,%r8,%rbp + movq 8+128(%rsi),%rdx + adcxq %r11,%r11 + adoxq %rbp,%r9 + adcxq %r12,%r12 + mulxq %rdx,%rcx,%rax + movq 16+128(%rsi),%rdx + adcxq %r13,%r13 + adoxq %rcx,%r10 + adcxq %r14,%r14 +.byte 0x67 + mulxq %rdx,%rcx,%rbp + movq 24+128(%rsi),%rdx + adoxq %rax,%r11 + adcxq %r15,%r15 + adoxq %rcx,%r12 + movq $32,%rsi + adoxq %rbp,%r13 +.byte 0x67,0x67 + mulxq %rdx,%rcx,%rax + movq L$poly+24(%rip),%rdx + adoxq %rcx,%r14 + shlxq %rsi,%r8,%rcx + adoxq %rax,%r15 + shrxq %rsi,%r8,%rax + movq %rdx,%rbp + + + addq %rcx,%r9 + adcq %rax,%r10 + + mulxq %r8,%rcx,%r8 + adcq %rcx,%r11 + shlxq %rsi,%r9,%rcx + adcq $0,%r8 + shrxq %rsi,%r9,%rax + + + addq %rcx,%r10 + adcq %rax,%r11 + + mulxq %r9,%rcx,%r9 + adcq %rcx,%r8 + shlxq %rsi,%r10,%rcx + adcq $0,%r9 + shrxq %rsi,%r10,%rax + + + addq %rcx,%r11 + adcq %rax,%r8 + + mulxq %r10,%rcx,%r10 + adcq %rcx,%r9 + shlxq %rsi,%r11,%rcx + adcq $0,%r10 + shrxq %rsi,%r11,%rax + + + addq %rcx,%r8 + adcq %rax,%r9 + + mulxq %r11,%rcx,%r11 + adcq %rcx,%r10 + adcq $0,%r11 + + xorq %rdx,%rdx + addq %r8,%r12 + movq L$poly+8(%rip),%rsi + adcq %r9,%r13 + movq %r12,%r8 + adcq %r10,%r14 + adcq %r11,%r15 + movq %r13,%r9 + adcq $0,%rdx + + subq $-1,%r12 + movq %r14,%r10 + sbbq %rsi,%r13 + sbbq $0,%r14 + movq %r15,%r11 + sbbq %rbp,%r15 + sbbq $0,%rdx + + cmovcq %r8,%r12 + cmovcq %r9,%r13 + movq %r12,0(%rdi) + cmovcq %r10,%r14 + movq %r13,8(%rdi) + cmovcq %r11,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + + .byte 0xf3,0xc3 + + + + .globl _ecp_nistz256_select_w5 .private_extern _ecp_nistz256_select_w5 .p2align 5 _ecp_nistz256_select_w5: + leaq _OPENSSL_ia32cap_P(%rip),%rax movq 8(%rax),%rax testl $32,%eax @@ -572,6 +2042,8 @@ L$select_loop_sse_w5: movdqu %xmm7,80(%rdi) .byte 0xf3,0xc3 +L$SEH_end_ecp_nistz256_select_w5: + @@ -580,6 +2052,7 @@ L$select_loop_sse_w5: .p2align 5 _ecp_nistz256_select_w7: + leaq _OPENSSL_ia32cap_P(%rip),%rax movq 8(%rax),%rax testl $32,%eax @@ -625,11 +2098,14 @@ L$select_loop_sse_w7: movdqu %xmm5,48(%rdi) .byte 0xf3,0xc3 +L$SEH_end_ecp_nistz256_select_w7: + .p2align 5 ecp_nistz256_avx2_select_w5: + L$avx2_select_w5: vzeroupper vmovdqa L$Two(%rip),%ymm0 @@ -685,6 +2161,8 @@ L$select_loop_avx2_w5: vzeroupper .byte 0xf3,0xc3 +L$SEH_end_ecp_nistz256_avx2_select_w5: + @@ -693,6 +2171,7 @@ L$select_loop_avx2_w5: .p2align 5 _ecp_nistz256_avx2_select_w7: + L$avx2_select_w7: vzeroupper vmovdqa L$Three(%rip),%ymm0 @@ -763,9 +2242,12 @@ L$select_loop_avx2_w7: vzeroupper .byte 0xf3,0xc3 +L$SEH_end_ecp_nistz256_avx2_select_w7: + .p2align 5 __ecp_nistz256_add_toq: + xorq %r11,%r11 addq 
0(%rbx),%r12 adcq 8(%rbx),%r13 @@ -796,8 +2278,10 @@ __ecp_nistz256_add_toq: + .p2align 5 __ecp_nistz256_sub_fromq: + subq 0(%rbx),%r12 sbbq 8(%rbx),%r13 movq %r12,%rax @@ -827,8 +2311,10 @@ __ecp_nistz256_sub_fromq: + .p2align 5 __ecp_nistz256_subq: + subq %r12,%rax sbbq %r13,%rbp movq %rax,%r12 @@ -854,8 +2340,10 @@ __ecp_nistz256_subq: + .p2align 5 __ecp_nistz256_mul_by_2q: + xorq %r11,%r11 addq %r12,%r12 adcq %r13,%r13 @@ -884,19 +2372,34 @@ __ecp_nistz256_mul_by_2q: .byte 0xf3,0xc3 + .globl _ecp_nistz256_point_double .private_extern _ecp_nistz256_point_double .p2align 5 _ecp_nistz256_point_double: + + leaq _OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx + cmpl $0x80100,%ecx + je L$point_doublex pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $160+8,%rsp +L$point_doubleq_body: + L$point_double_shortcutq: movdqu 0(%rsi),%xmm0 movq %rsi,%rbx @@ -1078,28 +2581,53 @@ L$point_double_shortcutq: .byte 102,72,15,126,207 call __ecp_nistz256_sub_fromq - addq $160+8,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + leaq 160+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$point_doubleq_epilogue: .byte 0xf3,0xc3 + .globl _ecp_nistz256_point_add .private_extern _ecp_nistz256_point_add .p2align 5 _ecp_nistz256_point_add: + + leaq _OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx + cmpl $0x80100,%ecx + je L$point_addx pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $576+8,%rsp +L$point_addq_body: + movdqu 0(%rsi),%xmm0 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 @@ -1244,15 +2772,22 @@ _ecp_nistz256_point_add: orq %r8,%r12 orq %r9,%r12 -.byte 0x3e - jnz L$add_proceedq .byte 102,73,15,126,208 .byte 102,73,15,126,217 - testq %r8,%r8 + orq %r8,%r12 +.byte 0x3e jnz L$add_proceedq + + + testq %r9,%r9 jz L$add_doubleq + + + + + .byte 102,72,15,126,199 pxor %xmm0,%xmm0 movdqu %xmm0,0(%rdi) @@ -1268,8 +2803,10 @@ L$add_doubleq: .byte 102,72,15,126,206 .byte 102,72,15,126,199 addq $416,%rsp + jmp L$point_double_shortcutq + .p2align 5 L$add_proceedq: movq 0+64(%rsp),%rax @@ -1475,28 +3012,53 @@ L$add_proceedq: movdqu %xmm3,48(%rdi) L$add_doneq: - addq $576+8,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + leaq 576+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$point_addq_epilogue: .byte 0xf3,0xc3 + .globl _ecp_nistz256_point_add_affine .private_extern _ecp_nistz256_point_add_affine .p2align 5 _ecp_nistz256_point_add_affine: + + leaq _OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx + cmpl $0x80100,%ecx + je L$point_add_affinex pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $480+8,%rsp +L$add_affineq_body: + movdqu 0(%rsi),%xmm0 movq %rdx,%rbx movdqu 16(%rsi),%xmm1 @@ -1778,13 +3340,1128 @@ _ecp_nistz256_point_add_affine: movdqu %xmm2,32(%rdi) movdqu %xmm3,48(%rdi) - addq $480+8,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp + leaq 480+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$add_affineq_epilogue: .byte 0xf3,0xc3 + + +.p2align 5 
+__ecp_nistz256_add_tox: + + xorq %r11,%r11 + adcq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + movq %r12,%rax + adcq 16(%rbx),%r8 + adcq 24(%rbx),%r9 + movq %r13,%rbp + adcq $0,%r11 + + xorq %r10,%r10 + sbbq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 + + + + +.p2align 5 +__ecp_nistz256_sub_fromx: + + xorq %r11,%r11 + sbbq 0(%rbx),%r12 + sbbq 8(%rbx),%r13 + movq %r12,%rax + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + movq %r13,%rbp + sbbq $0,%r11 + + xorq %r10,%r10 + adcq $-1,%r12 + movq %r8,%rcx + adcq %r14,%r13 + adcq $0,%r8 + movq %r9,%r10 + adcq %r15,%r9 + + btq $0,%r11 + cmovncq %rax,%r12 + cmovncq %rbp,%r13 + movq %r12,0(%rdi) + cmovncq %rcx,%r8 + movq %r13,8(%rdi) + cmovncq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 + + + + +.p2align 5 +__ecp_nistz256_subx: + + xorq %r11,%r11 + sbbq %r12,%rax + sbbq %r13,%rbp + movq %rax,%r12 + sbbq %r8,%rcx + sbbq %r9,%r10 + movq %rbp,%r13 + sbbq $0,%r11 + + xorq %r9,%r9 + adcq $-1,%rax + movq %rcx,%r8 + adcq %r14,%rbp + adcq $0,%rcx + movq %r10,%r9 + adcq %r15,%r10 + + btq $0,%r11 + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + cmovcq %rcx,%r8 + cmovcq %r10,%r9 + + .byte 0xf3,0xc3 + + + + +.p2align 5 +__ecp_nistz256_mul_by_2x: + + xorq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + xorq %r10,%r10 + sbbq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 + + + +.p2align 5 +ecp_nistz256_point_doublex: + +L$point_doublex: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $160+8,%rsp + +L$point_doublex_body: + +L$point_double_shortcutx: + movdqu 0(%rsi),%xmm0 + movq %rsi,%rbx + movdqu 16(%rsi),%xmm1 + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r8 + movq 32+24(%rsi),%r9 + movq L$poly+8(%rip),%r14 + movq L$poly+24(%rip),%r15 + movdqa %xmm0,96(%rsp) + movdqa %xmm1,96+16(%rsp) + leaq 32(%rdi),%r10 + leaq 64(%rdi),%r11 +.byte 102,72,15,110,199 +.byte 102,73,15,110,202 +.byte 102,73,15,110,211 + + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + leaq 64-128(%rsi),%rsi + leaq 64(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 0(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 32(%rbx),%rdx + movq 64+0(%rbx),%r9 + movq 64+8(%rbx),%r10 + movq 64+16(%rbx),%r11 + movq 64+24(%rbx),%r12 + leaq 64-128(%rbx),%rsi + leaq 32(%rbx),%rbx +.byte 102,72,15,126,215 + call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_by_2x + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_tox + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 
24+0(%rsp),%r8 +.byte 102,72,15,126,207 + call __ecp_nistz256_sqr_montx + xorq %r9,%r9 + movq %r12,%rax + addq $-1,%r12 + movq %r13,%r10 + adcq %rsi,%r13 + movq %r14,%rcx + adcq $0,%r14 + movq %r15,%r8 + adcq %rbp,%r15 + adcq $0,%r9 + xorq %rsi,%rsi + testq $1,%rax + + cmovzq %rax,%r12 + cmovzq %r10,%r13 + cmovzq %rcx,%r14 + cmovzq %r8,%r15 + cmovzq %rsi,%r9 + + movq %r13,%rax + shrq $1,%r12 + shlq $63,%rax + movq %r14,%r10 + shrq $1,%r13 + orq %rax,%r12 + shlq $63,%r10 + movq %r15,%rcx + shrq $1,%r14 + orq %r10,%r13 + shlq $63,%rcx + movq %r12,0(%rdi) + shrq $1,%r15 + movq %r13,8(%rdi) + shlq $63,%r9 + orq %rcx,%r14 + orq %r9,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + movq 64(%rsp),%rdx + leaq 64(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + leaq 32(%rsp),%rbx + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_tox + + movq 96(%rsp),%rdx + leaq 96(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + movq 0+32(%rsp),%rdx + movq 8+32(%rsp),%r14 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r15 + movq 24+32(%rsp),%r8 +.byte 102,72,15,126,199 + call __ecp_nistz256_sqr_montx + + leaq 128(%rsp),%rbx + movq %r14,%r8 + movq %r15,%r9 + movq %rsi,%r14 + movq %rbp,%r15 + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 0(%rsp),%rdi + call __ecp_nistz256_subx + + movq 32(%rsp),%rdx + leaq 32(%rsp),%rbx + movq %r12,%r14 + xorl %ecx,%ecx + movq %r12,0+0(%rsp) + movq %r13,%r10 + movq %r13,0+8(%rsp) + cmovzq %r8,%r11 + movq %r8,0+16(%rsp) + leaq 0-128(%rsp),%rsi + cmovzq %r9,%r12 + movq %r9,0+24(%rsp) + movq %r14,%r9 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + +.byte 102,72,15,126,203 +.byte 102,72,15,126,207 + call __ecp_nistz256_sub_fromx + + leaq 160+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$point_doublex_epilogue: + .byte 0xf3,0xc3 + + + +.p2align 5 +ecp_nistz256_point_addx: + +L$point_addx: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $576+8,%rsp + +L$point_addx_body: + + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq %rsi,%rbx + movq %rdx,%rsi + movdqa %xmm0,384(%rsp) + movdqa %xmm1,384+16(%rsp) + movdqa %xmm2,416(%rsp) + movdqa %xmm3,416+16(%rsp) + movdqa %xmm4,448(%rsp) + movdqa %xmm5,448+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rsi),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rsi),%xmm3 + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,480(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,480+16(%rsp) + movdqu 64(%rsi),%xmm0 + movdqu 80(%rsi),%xmm1 + movdqa %xmm2,512(%rsp) + movdqa %xmm3,512+16(%rsp) + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + + leaq 64-128(%rsi),%rsi + movq %rdx,544+0(%rsp) + movq %r14,544+8(%rsp) + movq %r15,544+16(%rsp) + movq %r8,544+24(%rsp) + leaq 96(%rsp),%rdi + 
call __ecp_nistz256_sqr_montx + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm1,%xmm4 + por %xmm1,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + movq 64+0(%rbx),%rdx + movq 64+8(%rbx),%r14 + movq 64+16(%rbx),%r15 + movq 64+24(%rbx),%r8 +.byte 102,72,15,110,203 + + leaq 64-128(%rbx),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 544(%rsp),%rdx + leaq 544(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 416(%rsp),%rdx + leaq 416(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq -128+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 512(%rsp),%rdx + leaq 512(%rsp),%rbx + movq 0+256(%rsp),%r9 + movq 8+256(%rsp),%r10 + leaq -128+256(%rsp),%rsi + movq 16+256(%rsp),%r11 + movq 24+256(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 224(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + orq %r13,%r12 + movdqa %xmm4,%xmm2 + orq %r8,%r12 + orq %r9,%r12 + por %xmm5,%xmm2 +.byte 102,73,15,110,220 + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 480(%rsp),%rdx + leaq 480(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 160(%rsp),%rbx + leaq 0(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + orq %r13,%r12 + orq %r8,%r12 + orq %r9,%r12 + +.byte 102,73,15,126,208 +.byte 102,73,15,126,217 + orq %r8,%r12 +.byte 0x3e + jnz L$add_proceedx + + + + testq %r9,%r9 + jz L$add_doublex + + + + + + +.byte 102,72,15,126,199 + pxor %xmm0,%xmm0 + movdqu %xmm0,0(%rdi) + movdqu %xmm0,16(%rdi) + movdqu %xmm0,32(%rdi) + movdqu %xmm0,48(%rdi) + movdqu %xmm0,64(%rdi) + movdqu %xmm0,80(%rdi) + jmp L$add_donex + +.p2align 5 +L$add_doublex: +.byte 102,72,15,126,206 +.byte 102,72,15,126,199 + addq $416,%rsp + + jmp L$point_double_shortcutx + + +.p2align 5 +L$add_proceedx: + movq 0+64(%rsp),%rdx + movq 8+64(%rsp),%r14 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 544(%rsp),%rdx + leaq 544(%rsp),%rbx + movq 0+352(%rsp),%r9 + movq 8+352(%rsp),%r10 + leaq -128+352(%rsp),%rsi + movq 16+352(%rsp),%r11 + movq 24+352(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 0(%rsp),%rdx + leaq 0(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 128(%rsp),%rdi + 
call __ecp_nistz256_mul_montx + + movq 160(%rsp),%rdx + leaq 160(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montx + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 96(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subx + + leaq 128(%rsp),%rbx + leaq 288(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 192+0(%rsp),%rax + movq 192+8(%rsp),%rbp + movq 192+16(%rsp),%rcx + movq 192+24(%rsp),%r10 + leaq 320(%rsp),%rdi + + call __ecp_nistz256_subx + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 128(%rsp),%rdx + leaq 128(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq -128+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 320(%rsp),%rdx + leaq 320(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 320(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 256(%rsp),%rbx + leaq 320(%rsp),%rdi + call __ecp_nistz256_sub_fromx + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 352(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 352+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 544(%rsp),%xmm2 + pand 544+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 480(%rsp),%xmm2 + pand 480+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 320(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 320+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 512(%rsp),%xmm2 + pand 512+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + +L$add_donex: + leaq 576+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$point_addx_epilogue: + .byte 0xf3,0xc3 + + + +.p2align 5 +ecp_nistz256_point_add_affinex: + +L$point_add_affinex: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $480+8,%rsp + +L$add_affinex_body: + + movdqu 0(%rsi),%xmm0 + movq %rdx,%rbx + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + 
movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,320(%rsp) + movdqa %xmm1,320+16(%rsp) + movdqa %xmm2,352(%rsp) + movdqa %xmm3,352+16(%rsp) + movdqa %xmm4,384(%rsp) + movdqa %xmm5,384+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rbx),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rbx),%xmm1 + movdqu 32(%rbx),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rbx),%xmm3 + movdqa %xmm0,416(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,416+16(%rsp) + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + movdqa %xmm2,448(%rsp) + movdqa %xmm3,448+16(%rsp) + por %xmm2,%xmm3 + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm1,%xmm3 + + leaq 64-128(%rsi),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm3,%xmm4 + movq 0(%rbx),%rdx + + movq %r12,%r9 + por %xmm3,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + movq %r13,%r10 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + movq %r14,%r11 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + + leaq 32-128(%rsp),%rsi + movq %r15,%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 320(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 288(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 352(%rsp),%rbx + leaq 96(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+64(%rsp),%rdx + movq 8+64(%rsp),%r14 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 128(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 0+96(%rsp),%rdx + movq 8+96(%rsp),%r14 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r15 + movq 24+96(%rsp),%r8 + leaq 192(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 128(%rsp),%rdx + leaq 128(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 320(%rsp),%rdx + leaq 320(%rsp),%rbx + movq 0+128(%rsp),%r9 + movq 8+128(%rsp),%r10 + leaq -128+128(%rsp),%rsi + movq 16+128(%rsp),%r11 + movq 24+128(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 192(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subx + + leaq 160(%rsp),%rbx + leaq 224(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 64(%rsp),%rdi + + call __ecp_nistz256_subx + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 
352(%rsp),%rdx + leaq 352(%rsp),%rbx + movq 0+160(%rsp),%r9 + movq 8+160(%rsp),%r10 + leaq -128+160(%rsp),%rsi + movq 16+160(%rsp),%r11 + movq 24+160(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 96(%rsp),%rdx + leaq 96(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 64(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 32(%rsp),%rbx + leaq 256(%rsp),%rdi + call __ecp_nistz256_sub_fromx + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand L$ONE_mont(%rip),%xmm2 + pand L$ONE_mont+16(%rip),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 224(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 224+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 320(%rsp),%xmm2 + pand 320+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 256(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 256+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 352(%rsp),%xmm2 + pand 352+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + + leaq 480+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$add_affinex_epilogue: + .byte 0xf3,0xc3 + + #endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S new file mode 100644 index 0000000000..66fcfa3305 --- /dev/null +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S @@ -0,0 +1,328 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text + + +.private_extern _beeu_mod_inverse_vartime +.globl _beeu_mod_inverse_vartime +.private_extern _beeu_mod_inverse_vartime +.p2align 5 +_beeu_mod_inverse_vartime: + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rbx + + pushq %rsi + + + subq $80,%rsp + + movq %rdi,0(%rsp) + + + movq $1,%r8 + xorq %r9,%r9 + xorq %r10,%r10 + xorq %r11,%r11 + xorq %rdi,%rdi + + xorq %r12,%r12 + xorq %r13,%r13 + xorq %r14,%r14 + xorq %r15,%r15 + xorq %rbp,%rbp + + + vmovdqu 0(%rsi),%xmm0 + vmovdqu 16(%rsi),%xmm1 + vmovdqu %xmm0,48(%rsp) + vmovdqu %xmm1,64(%rsp) + + vmovdqu 0(%rdx),%xmm0 + vmovdqu 16(%rdx),%xmm1 + vmovdqu %xmm0,16(%rsp) + vmovdqu %xmm1,32(%rsp) + +L$beeu_loop: + xorq %rbx,%rbx + orq 48(%rsp),%rbx + orq 56(%rsp),%rbx + orq 64(%rsp),%rbx + orq 72(%rsp),%rbx + jz L$beeu_loop_end + + + + + + + + + + + movq $1,%rcx + + +L$beeu_shift_loop_XB: + movq %rcx,%rbx + andq 48(%rsp),%rbx + jnz L$beeu_shift_loop_end_XB + + + movq $1,%rbx + andq %r8,%rbx + jz L$shift1_0 + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq $0,%rdi + +L$shift1_0: + shrdq $1,%r9,%r8 + shrdq $1,%r10,%r9 + shrdq $1,%r11,%r10 + shrdq $1,%rdi,%r11 + shrq $1,%rdi + + shlq $1,%rcx + + + + + + cmpq $0x8000000,%rcx + jne L$beeu_shift_loop_XB + +L$beeu_shift_loop_end_XB: + bsfq %rcx,%rcx + testq %rcx,%rcx + jz L$beeu_no_shift_XB + + + + movq 8+48(%rsp),%rax + movq 16+48(%rsp),%rbx + movq 24+48(%rsp),%rsi + + shrdq %cl,%rax,0+48(%rsp) + shrdq %cl,%rbx,8+48(%rsp) + shrdq %cl,%rsi,16+48(%rsp) + + shrq %cl,%rsi + movq %rsi,24+48(%rsp) + + +L$beeu_no_shift_XB: + + movq $1,%rcx + + +L$beeu_shift_loop_YA: + movq %rcx,%rbx + andq 16(%rsp),%rbx + jnz L$beeu_shift_loop_end_YA + + + movq $1,%rbx + andq %r12,%rbx + jz L$shift1_1 + addq 0(%rdx),%r12 + adcq 8(%rdx),%r13 + adcq 16(%rdx),%r14 + adcq 24(%rdx),%r15 + adcq $0,%rbp + +L$shift1_1: + shrdq $1,%r13,%r12 + shrdq $1,%r14,%r13 + shrdq $1,%r15,%r14 + shrdq $1,%rbp,%r15 + shrq $1,%rbp + + shlq $1,%rcx + + + + + + cmpq $0x8000000,%rcx + jne L$beeu_shift_loop_YA + +L$beeu_shift_loop_end_YA: + bsfq %rcx,%rcx + testq %rcx,%rcx + jz L$beeu_no_shift_YA + + + + movq 8+16(%rsp),%rax + movq 16+16(%rsp),%rbx + movq 24+16(%rsp),%rsi + + shrdq %cl,%rax,0+16(%rsp) + shrdq %cl,%rbx,8+16(%rsp) + shrdq %cl,%rsi,16+16(%rsp) + + shrq %cl,%rsi + movq %rsi,24+16(%rsp) + + +L$beeu_no_shift_YA: + + movq 48(%rsp),%rax + movq 56(%rsp),%rbx + movq 64(%rsp),%rsi + movq 72(%rsp),%rcx + subq 16(%rsp),%rax + sbbq 24(%rsp),%rbx + sbbq 32(%rsp),%rsi + sbbq 40(%rsp),%rcx + jnc L$beeu_B_bigger_than_A + + + movq 16(%rsp),%rax + movq 24(%rsp),%rbx + movq 32(%rsp),%rsi + movq 40(%rsp),%rcx + subq 48(%rsp),%rax + sbbq 56(%rsp),%rbx + sbbq 64(%rsp),%rsi + sbbq 72(%rsp),%rcx + movq %rax,16(%rsp) + movq %rbx,24(%rsp) + movq %rsi,32(%rsp) + movq %rcx,40(%rsp) + + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + adcq %r11,%r15 + adcq %rdi,%rbp + jmp L$beeu_loop + +L$beeu_B_bigger_than_A: + + movq %rax,48(%rsp) + movq %rbx,56(%rsp) + movq %rsi,64(%rsp) + movq %rcx,72(%rsp) + + + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rbp,%rdi + + jmp L$beeu_loop + +L$beeu_loop_end: + + + + + movq 16(%rsp),%rbx + subq $1,%rbx + orq 24(%rsp),%rbx + orq 32(%rsp),%rbx + orq 40(%rsp),%rbx + + jnz L$beeu_err + + + + + 
movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + xorq %rdi,%rdi + +L$beeu_reduction_loop: + movq %r12,16(%rsp) + movq %r13,24(%rsp) + movq %r14,32(%rsp) + movq %r15,40(%rsp) + movq %rbp,48(%rsp) + + + subq %r8,%r12 + sbbq %r9,%r13 + sbbq %r10,%r14 + sbbq %r11,%r15 + sbbq $0,%rbp + + + cmovcq 16(%rsp),%r12 + cmovcq 24(%rsp),%r13 + cmovcq 32(%rsp),%r14 + cmovcq 40(%rsp),%r15 + jnc L$beeu_reduction_loop + + + subq %r12,%r8 + sbbq %r13,%r9 + sbbq %r14,%r10 + sbbq %r15,%r11 + +L$beeu_save: + + movq 0(%rsp),%rdi + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + + movq $1,%rax + jmp L$beeu_finish + +L$beeu_err: + + xorq %rax,%rax + +L$beeu_finish: + addq $80,%rsp + + popq %rsi + + popq %rbx + + popq %r15 + + popq %r14 + + popq %r13 + + popq %r12 + + popq %rbp + + .byte 0xf3,0xc3 + + + +#endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/rdrand-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/rdrand-x86_64.S index b259286f6e..f6f2be7ae1 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/rdrand-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/rdrand-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text @@ -9,31 +21,31 @@ .p2align 4 _CRYPTO_rdrand: + xorq %rax,%rax - - -.byte 0x48, 0x0f, 0xc7, 0xf1 +.byte 72,15,199,242 adcq %rax,%rax - movq %rcx,0(%rdi) + movq %rdx,0(%rdi) .byte 0xf3,0xc3 + + .globl _CRYPTO_rdrand_multiple8_buf .private_extern _CRYPTO_rdrand_multiple8_buf .p2align 4 _CRYPTO_rdrand_multiple8_buf: + testq %rsi,%rsi jz L$out movq $8,%rdx L$loop: - - -.byte 0x48, 0x0f, 0xc7, 0xf1 +.byte 72,15,199,241 jnc L$err movq %rcx,0(%rdi) addq %rdx,%rdi @@ -45,4 +57,6 @@ L$out: L$err: xorq %rax,%rax .byte 0xf3,0xc3 + + #endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/rsaz-avx2.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/rsaz-avx2.S index 6eb7afc510..e9cae78c5d 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/rsaz-avx2.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/rsaz-avx2.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text .globl _rsaz_1024_sqr_avx2 @@ -77,7 +89,7 @@ L$sqr_1024_no_n_copy: vmovdqu 256-128(%rsi),%ymm8 leaq 192(%rsp),%rbx - vpbroadcastq L$and_mask(%rip),%ymm15 + vmovdqu L$and_mask(%rip),%ymm15 jmp L$OOP_GRANDE_SQR_1024 .p2align 5 @@ -829,10 +841,10 @@ L$oop_mul_1024: vpmuludq 192-128(%rcx),%ymm11,%ymm12 vpaddq %ymm12,%ymm6,%ymm6 vpmuludq 224-128(%rcx),%ymm11,%ymm13 - vpblendd $3,%ymm14,%ymm9,%ymm9 + vpblendd $3,%ymm14,%ymm9,%ymm12 vpaddq %ymm13,%ymm7,%ymm7 vpmuludq 256-128(%rcx),%ymm11,%ymm0 - vpaddq %ymm9,%ymm3,%ymm3 + vpaddq %ymm12,%ymm3,%ymm3 vpaddq %ymm0,%ymm8,%ymm8 movq %rbx,%rax @@ -845,7 +857,9 @@ L$oop_mul_1024: vmovdqu -8+64-128(%rsi),%ymm13 movq %r10,%rax + vpblendd $0xfc,%ymm14,%ymm9,%ymm9 imull %r8d,%eax + vpaddq %ymm9,%ymm4,%ymm4 andl $0x1fffffff,%eax imulq 16-128(%rsi),%rbx @@ -1074,7 +1088,6 @@ L$oop_mul_1024: decl %r14d jnz L$oop_mul_1024 - vpermq $0,%ymm15,%ymm15 vpaddq (%rsp),%ymm12,%ymm0 vpsrlq $29,%ymm0,%ymm12 @@ -1215,6 +1228,7 @@ L$mul_1024_epilogue: .p2align 5 _rsaz_1024_red2norm_avx2: + subq $-128,%rsi xorq %rax,%rax movq -128(%rsi),%r8 @@ -1408,11 +1422,13 @@ _rsaz_1024_red2norm_avx2: .byte 0xf3,0xc3 + .globl _rsaz_1024_norm2red_avx2 .private_extern _rsaz_1024_norm2red_avx2 .p2align 5 _rsaz_1024_norm2red_avx2: + subq $-128,%rdi movq (%rsi),%r8 movl $0x1fffffff,%eax @@ -1566,11 +1582,13 @@ _rsaz_1024_norm2red_avx2: movq %r8,184(%rdi) .byte 0xf3,0xc3 + .globl _rsaz_1024_scatter5_avx2 .private_extern _rsaz_1024_scatter5_avx2 .p2align 5 _rsaz_1024_scatter5_avx2: + vzeroupper vmovdqu L$scatter_permd(%rip),%ymm5 shll $4,%edx @@ -1592,6 +1610,7 @@ L$oop_scatter_1024: .byte 0xf3,0xc3 + .globl _rsaz_1024_gather5_avx2 .private_extern _rsaz_1024_gather5_avx2 @@ -1714,22 +1733,9 @@ L$oop_gather_1024: L$SEH_end_rsaz_1024_gather5: - -.globl _rsaz_avx2_eligible -.private_extern _rsaz_avx2_eligible - -.p2align 5 -_rsaz_avx2_eligible: - leaq _OPENSSL_ia32cap_P(%rip),%rax - movl 8(%rax),%eax - andl $32,%eax - shrl $5,%eax - .byte 0xf3,0xc3 - - .p2align 6 L$and_mask: -.quad 0x1fffffff,0x1fffffff,0x1fffffff,-1 +.quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff L$scatter_permd: .long 0,2,4,6,7,7,7,7 L$gather_permd: diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha1-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha1-x86_64.S index c22431c89f..ace121e359 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha1-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha1-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text @@ -7,6 +19,7 @@ .p2align 4 _sha1_block_data_order: + leaq _OPENSSL_ia32cap_P(%rip),%r10 movl 0(%r10),%r9d movl 4(%r10),%r8d @@ -23,17 +36,24 @@ _sha1_block_data_order: .p2align 4 L$ialu: movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + movq %rdi,%r8 subq $72,%rsp movq %rsi,%r9 andq $-64,%rsp movq %rdx,%r10 movq %rax,64(%rsp) + L$prologue: movl 0(%r8),%esi @@ -1228,25 +1248,40 @@ L$loop: jnz L$loop movq 64(%rsp),%rsi + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$epilogue: .byte 0xf3,0xc3 + .p2align 4 sha1_block_data_order_ssse3: _ssse3_shortcut: + movq %rsp,%r11 + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + leaq -64(%rsp),%rsp andq $-64,%rsp movq %rdi,%r8 @@ -2403,24 +2438,38 @@ L$done_ssse3: movl %edx,12(%r8) movl %ebp,16(%r8) movq -40(%r11),%r14 + movq -32(%r11),%r13 + movq -24(%r11),%r12 + movq -16(%r11),%rbp + movq -8(%r11),%rbx + leaq (%r11),%rsp + L$epilogue_ssse3: .byte 0xf3,0xc3 + .p2align 4 sha1_block_data_order_avx: _avx_shortcut: + movq %rsp,%r11 + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + leaq -64(%rsp),%rsp vzeroupper andq $-64,%rsp @@ -3517,14 +3566,21 @@ L$done_avx: movl %edx,12(%r8) movl %ebp,16(%r8) movq -40(%r11),%r14 + movq -32(%r11),%r13 + movq -24(%r11),%r12 + movq -16(%r11),%rbp + movq -8(%r11),%rbx + leaq (%r11),%rsp + L$epilogue_avx: .byte 0xf3,0xc3 + .p2align 6 K_XX_XX: .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha256-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha256-x86_64.S index ac6559e074..5e46e81c16 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha256-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha256-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text @@ -7,6 +19,7 @@ .p2align 4 _sha256_block_data_order: + leaq _OPENSSL_ia32cap_P(%rip),%r11 movl 0(%r11),%r9d movl 4(%r11),%r10d @@ -19,12 +32,19 @@ _sha256_block_data_order: testl $512,%r10d jnz L$ssse3_shortcut movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + shlq $4,%rdx subq $64+32,%rsp leaq (%rsi,%rdx,4),%rdx @@ -32,7 +52,8 @@ _sha256_block_data_order: movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movq %rax,64+24(%rsp) + movq %rax,88(%rsp) + L$prologue: movl 0(%rdi),%eax @@ -1696,17 +1717,26 @@ L$rounds_16_xx: movl %r11d,28(%rdi) jb L$loop - movq 64+24(%rsp),%rsi + movq 88(%rsp),%rsi + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$epilogue: .byte 0xf3,0xc3 + .p2align 6 K256: @@ -1753,14 +1783,22 @@ K256: .p2align 6 sha256_block_data_order_ssse3: + L$ssse3_shortcut: movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + shlq $4,%rdx subq $96,%rsp leaq (%rsi,%rdx,4),%rdx @@ -1768,7 +1806,8 @@ L$ssse3_shortcut: movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movq %rax,64+24(%rsp) + movq %rax,88(%rsp) + L$prologue_ssse3: movl 0(%rdi),%eax @@ -2834,28 +2873,45 @@ L$ssse3_00_47: movl %r11d,28(%rdi) jb L$loop_ssse3 - movq 64+24(%rsp),%rsi + movq 88(%rsp),%rsi + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$epilogue_ssse3: .byte 0xf3,0xc3 + .p2align 6 sha256_block_data_order_avx: + L$avx_shortcut: movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + shlq $4,%rdx subq $96,%rsp leaq (%rsi,%rdx,4),%rdx @@ -2863,7 +2919,8 @@ L$avx_shortcut: movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movq %rax,64+24(%rsp) + movq %rax,88(%rsp) + L$prologue_avx: vzeroupper @@ -3890,16 +3947,25 @@ L$avx_00_47: movl %r11d,28(%rdi) jb L$loop_avx - movq 64+24(%rsp),%rsi + movq 88(%rsp),%rsi + vzeroupper movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$epilogue_avx: .byte 0xf3,0xc3 + #endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha512-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha512-x86_64.S index 0b738e6f45..c550e794ac 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha512-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha512-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text @@ -7,24 +19,30 @@ .p2align 4 _sha512_block_data_order: + leaq _OPENSSL_ia32cap_P(%rip),%r11 movl 0(%r11),%r9d movl 4(%r11),%r10d movl 8(%r11),%r11d - testl $2048,%r10d - jnz L$xop_shortcut andl $1073741824,%r9d andl $268435968,%r10d orl %r9d,%r10d cmpl $1342177792,%r10d je L$avx_shortcut movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + shlq $4,%rdx subq $128+32,%rsp leaq (%rsi,%rdx,8),%rdx @@ -32,7 +50,8 @@ _sha512_block_data_order: movq %rdi,128+0(%rsp) movq %rsi,128+8(%rsp) movq %rdx,128+16(%rsp) - movq %rax,128+24(%rsp) + movq %rax,152(%rsp) + L$prologue: movq 0(%rdi),%rax @@ -1696,17 +1715,26 @@ L$rounds_16_xx: movq %r11,56(%rdi) jb L$loop - movq 128+24(%rsp),%rsi + movq 152(%rsp),%rsi + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$epilogue: .byte 0xf3,0xc3 + .p2align 6 K512: @@ -1795,1099 +1823,24 @@ K512: .quad 0x0001020304050607,0x08090a0b0c0d0e0f .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.p2align 6 -sha512_block_data_order_xop: -L$xop_shortcut: - movq %rsp,%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - shlq $4,%rdx - subq $160,%rsp - leaq (%rsi,%rdx,8),%rdx - andq $-64,%rsp - movq %rdi,128+0(%rsp) - movq %rsi,128+8(%rsp) - movq %rdx,128+16(%rsp) - movq %rax,128+24(%rsp) -L$prologue_xop: - - vzeroupper - movq 0(%rdi),%rax - movq 8(%rdi),%rbx - movq 16(%rdi),%rcx - movq 24(%rdi),%rdx - movq 32(%rdi),%r8 - movq 40(%rdi),%r9 - movq 48(%rdi),%r10 - movq 56(%rdi),%r11 - jmp L$loop_xop -.p2align 4 -L$loop_xop: - vmovdqa K512+1280(%rip),%xmm11 - vmovdqu 0(%rsi),%xmm0 - leaq K512+128(%rip),%rbp - vmovdqu 16(%rsi),%xmm1 - vmovdqu 32(%rsi),%xmm2 - vpshufb %xmm11,%xmm0,%xmm0 - vmovdqu 48(%rsi),%xmm3 - vpshufb %xmm11,%xmm1,%xmm1 - vmovdqu 64(%rsi),%xmm4 - vpshufb %xmm11,%xmm2,%xmm2 - vmovdqu 80(%rsi),%xmm5 - vpshufb %xmm11,%xmm3,%xmm3 - vmovdqu 96(%rsi),%xmm6 - vpshufb %xmm11,%xmm4,%xmm4 - vmovdqu 112(%rsi),%xmm7 - vpshufb %xmm11,%xmm5,%xmm5 - vpaddq -128(%rbp),%xmm0,%xmm8 - vpshufb %xmm11,%xmm6,%xmm6 - vpaddq -96(%rbp),%xmm1,%xmm9 - vpshufb %xmm11,%xmm7,%xmm7 - vpaddq -64(%rbp),%xmm2,%xmm10 - vpaddq -32(%rbp),%xmm3,%xmm11 - vmovdqa %xmm8,0(%rsp) - vpaddq 0(%rbp),%xmm4,%xmm8 - vmovdqa %xmm9,16(%rsp) - vpaddq 32(%rbp),%xmm5,%xmm9 - vmovdqa %xmm10,32(%rsp) - vpaddq 64(%rbp),%xmm6,%xmm10 - vmovdqa %xmm11,48(%rsp) - vpaddq 96(%rbp),%xmm7,%xmm11 - vmovdqa %xmm8,64(%rsp) - movq %rax,%r14 - vmovdqa %xmm9,80(%rsp) - movq %rbx,%rdi - vmovdqa %xmm10,96(%rsp) - xorq %rcx,%rdi - vmovdqa %xmm11,112(%rsp) - movq %r8,%r13 - jmp L$xop_00_47 - -.p2align 4 -L$xop_00_47: - addq $256,%rbp - vpalignr $8,%xmm0,%xmm1,%xmm8 - rorq $23,%r13 - movq %r14,%rax - vpalignr $8,%xmm4,%xmm5,%xmm11 - movq %r9,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r8,%r13 - xorq %r10,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rax,%r14 - vpaddq %xmm11,%xmm0,%xmm0 - andq %r8,%r12 - xorq %r8,%r13 - addq 0(%rsp),%r11 - movq %rax,%r15 -.byte 143,72,120,195,209,7 - xorq %r10,%r12 - rorq $6,%r14 - 
vpxor %xmm9,%xmm8,%xmm8 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,223,3 - xorq %rax,%r14 - addq %r13,%r11 - vpxor %xmm10,%xmm8,%xmm8 - xorq %rbx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm7,%xmm10 - addq %r11,%rdx - addq %rdi,%r11 - vpaddq %xmm8,%xmm0,%xmm0 - movq %rdx,%r13 - addq %r11,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r11 - vpxor %xmm10,%xmm11,%xmm11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - vpaddq %xmm11,%xmm0,%xmm0 - addq 8(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - vpaddq -128(%rbp),%xmm0,%xmm10 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - vmovdqa %xmm10,0(%rsp) - vpalignr $8,%xmm1,%xmm2,%xmm8 - rorq $23,%r13 - movq %r14,%r10 - vpalignr $8,%xmm5,%xmm6,%xmm11 - movq %rdx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rcx,%r13 - xorq %r8,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r10,%r14 - vpaddq %xmm11,%xmm1,%xmm1 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 16(%rsp),%r9 - movq %r10,%r15 -.byte 143,72,120,195,209,7 - xorq %r8,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,216,3 - xorq %r10,%r14 - addq %r13,%r9 - vpxor %xmm10,%xmm8,%xmm8 - xorq %r11,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm0,%xmm10 - addq %r9,%rbx - addq %rdi,%r9 - vpaddq %xmm8,%xmm1,%xmm1 - movq %rbx,%r13 - addq %r9,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r9 - vpxor %xmm10,%xmm11,%xmm11 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - vpaddq %xmm11,%xmm1,%xmm1 - addq 24(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - vpaddq -96(%rbp),%xmm1,%xmm10 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - vmovdqa %xmm10,16(%rsp) - vpalignr $8,%xmm2,%xmm3,%xmm8 - rorq $23,%r13 - movq %r14,%r8 - vpalignr $8,%xmm6,%xmm7,%xmm11 - movq %rbx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rax,%r13 - xorq %rcx,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r8,%r14 - vpaddq %xmm11,%xmm2,%xmm2 - andq %rax,%r12 - xorq %rax,%r13 - addq 32(%rsp),%rdx - movq %r8,%r15 -.byte 143,72,120,195,209,7 - xorq %rcx,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,217,3 - xorq %r8,%r14 - addq %r13,%rdx - vpxor %xmm10,%xmm8,%xmm8 - xorq %r9,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm1,%xmm10 - addq %rdx,%r11 - addq %rdi,%rdx - vpaddq %xmm8,%xmm2,%xmm2 - movq %r11,%r13 - addq %rdx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rdx - vpxor %xmm10,%xmm11,%xmm11 - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - vpaddq %xmm11,%xmm2,%xmm2 - addq 40(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - vpaddq -64(%rbp),%xmm2,%xmm10 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq 
%r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - vmovdqa %xmm10,32(%rsp) - vpalignr $8,%xmm3,%xmm4,%xmm8 - rorq $23,%r13 - movq %r14,%rcx - vpalignr $8,%xmm7,%xmm0,%xmm11 - movq %r11,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r10,%r13 - xorq %rax,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rcx,%r14 - vpaddq %xmm11,%xmm3,%xmm3 - andq %r10,%r12 - xorq %r10,%r13 - addq 48(%rsp),%rbx - movq %rcx,%r15 -.byte 143,72,120,195,209,7 - xorq %rax,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,218,3 - xorq %rcx,%r14 - addq %r13,%rbx - vpxor %xmm10,%xmm8,%xmm8 - xorq %rdx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm2,%xmm10 - addq %rbx,%r9 - addq %rdi,%rbx - vpaddq %xmm8,%xmm3,%xmm3 - movq %r9,%r13 - addq %rbx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rbx - vpxor %xmm10,%xmm11,%xmm11 - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - vpaddq %xmm11,%xmm3,%xmm3 - addq 56(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - rorq $6,%r14 - vpaddq -32(%rbp),%xmm3,%xmm10 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - vmovdqa %xmm10,48(%rsp) - vpalignr $8,%xmm4,%xmm5,%xmm8 - rorq $23,%r13 - movq %r14,%rax - vpalignr $8,%xmm0,%xmm1,%xmm11 - movq %r9,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r8,%r13 - xorq %r10,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rax,%r14 - vpaddq %xmm11,%xmm4,%xmm4 - andq %r8,%r12 - xorq %r8,%r13 - addq 64(%rsp),%r11 - movq %rax,%r15 -.byte 143,72,120,195,209,7 - xorq %r10,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,219,3 - xorq %rax,%r14 - addq %r13,%r11 - vpxor %xmm10,%xmm8,%xmm8 - xorq %rbx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm3,%xmm10 - addq %r11,%rdx - addq %rdi,%r11 - vpaddq %xmm8,%xmm4,%xmm4 - movq %rdx,%r13 - addq %r11,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r11 - vpxor %xmm10,%xmm11,%xmm11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - vpaddq %xmm11,%xmm4,%xmm4 - addq 72(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - vpaddq 0(%rbp),%xmm4,%xmm10 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - vmovdqa %xmm10,64(%rsp) - vpalignr $8,%xmm5,%xmm6,%xmm8 - rorq $23,%r13 - movq %r14,%r10 - vpalignr $8,%xmm1,%xmm2,%xmm11 - movq %rdx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rcx,%r13 - xorq %r8,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r10,%r14 - vpaddq %xmm11,%xmm5,%xmm5 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 80(%rsp),%r9 - movq %r10,%r15 -.byte 143,72,120,195,209,7 - xorq %r8,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,220,3 - xorq %r10,%r14 - addq %r13,%r9 - vpxor %xmm10,%xmm8,%xmm8 - xorq %r11,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm4,%xmm10 - addq %r9,%rbx - addq %rdi,%r9 - vpaddq %xmm8,%xmm5,%xmm5 - movq %rbx,%r13 - addq %r9,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq 
%r14,%r9 - vpxor %xmm10,%xmm11,%xmm11 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - vpaddq %xmm11,%xmm5,%xmm5 - addq 88(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - vpaddq 32(%rbp),%xmm5,%xmm10 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - vmovdqa %xmm10,80(%rsp) - vpalignr $8,%xmm6,%xmm7,%xmm8 - rorq $23,%r13 - movq %r14,%r8 - vpalignr $8,%xmm2,%xmm3,%xmm11 - movq %rbx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rax,%r13 - xorq %rcx,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r8,%r14 - vpaddq %xmm11,%xmm6,%xmm6 - andq %rax,%r12 - xorq %rax,%r13 - addq 96(%rsp),%rdx - movq %r8,%r15 -.byte 143,72,120,195,209,7 - xorq %rcx,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,221,3 - xorq %r8,%r14 - addq %r13,%rdx - vpxor %xmm10,%xmm8,%xmm8 - xorq %r9,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm5,%xmm10 - addq %rdx,%r11 - addq %rdi,%rdx - vpaddq %xmm8,%xmm6,%xmm6 - movq %r11,%r13 - addq %rdx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rdx - vpxor %xmm10,%xmm11,%xmm11 - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - vpaddq %xmm11,%xmm6,%xmm6 - addq 104(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - vpaddq 64(%rbp),%xmm6,%xmm10 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - vmovdqa %xmm10,96(%rsp) - vpalignr $8,%xmm7,%xmm0,%xmm8 - rorq $23,%r13 - movq %r14,%rcx - vpalignr $8,%xmm3,%xmm4,%xmm11 - movq %r11,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r10,%r13 - xorq %rax,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rcx,%r14 - vpaddq %xmm11,%xmm7,%xmm7 - andq %r10,%r12 - xorq %r10,%r13 - addq 112(%rsp),%rbx - movq %rcx,%r15 -.byte 143,72,120,195,209,7 - xorq %rax,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,222,3 - xorq %rcx,%r14 - addq %r13,%rbx - vpxor %xmm10,%xmm8,%xmm8 - xorq %rdx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm6,%xmm10 - addq %rbx,%r9 - addq %rdi,%rbx - vpaddq %xmm8,%xmm7,%xmm7 - movq %r9,%r13 - addq %rbx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rbx - vpxor %xmm10,%xmm11,%xmm11 - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - vpaddq %xmm11,%xmm7,%xmm7 - addq 120(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - rorq $6,%r14 - vpaddq 96(%rbp),%xmm7,%xmm10 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - vmovdqa %xmm10,112(%rsp) - cmpb $0,135(%rbp) - jne L$xop_00_47 - rorq $23,%r13 - movq %r14,%rax - movq %r9,%r12 - rorq $5,%r14 - xorq %r8,%r13 - xorq %r10,%r12 - rorq $4,%r13 - xorq %rax,%r14 - andq %r8,%r12 - xorq %r8,%r13 - addq 0(%rsp),%r11 - movq %rax,%r15 - xorq %r10,%r12 - rorq $6,%r14 - xorq %rbx,%r15 - addq %r12,%r11 - rorq 
$14,%r13 - andq %r15,%rdi - xorq %rax,%r14 - addq %r13,%r11 - xorq %rbx,%rdi - rorq $28,%r14 - addq %r11,%rdx - addq %rdi,%r11 - movq %rdx,%r13 - addq %r11,%r14 - rorq $23,%r13 - movq %r14,%r11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - addq 8(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - rorq $23,%r13 - movq %r14,%r10 - movq %rdx,%r12 - rorq $5,%r14 - xorq %rcx,%r13 - xorq %r8,%r12 - rorq $4,%r13 - xorq %r10,%r14 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 16(%rsp),%r9 - movq %r10,%r15 - xorq %r8,%r12 - rorq $6,%r14 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi - xorq %r10,%r14 - addq %r13,%r9 - xorq %r11,%rdi - rorq $28,%r14 - addq %r9,%rbx - addq %rdi,%r9 - movq %rbx,%r13 - addq %r9,%r14 - rorq $23,%r13 - movq %r14,%r9 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - addq 24(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - rorq $23,%r13 - movq %r14,%r8 - movq %rbx,%r12 - rorq $5,%r14 - xorq %rax,%r13 - xorq %rcx,%r12 - rorq $4,%r13 - xorq %r8,%r14 - andq %rax,%r12 - xorq %rax,%r13 - addq 32(%rsp),%rdx - movq %r8,%r15 - xorq %rcx,%r12 - rorq $6,%r14 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi - xorq %r8,%r14 - addq %r13,%rdx - xorq %r9,%rdi - rorq $28,%r14 - addq %rdx,%r11 - addq %rdi,%rdx - movq %r11,%r13 - addq %rdx,%r14 - rorq $23,%r13 - movq %r14,%rdx - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - addq 40(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - rorq $23,%r13 - movq %r14,%rcx - movq %r11,%r12 - rorq $5,%r14 - xorq %r10,%r13 - xorq %rax,%r12 - rorq $4,%r13 - xorq %rcx,%r14 - andq %r10,%r12 - xorq %r10,%r13 - addq 48(%rsp),%rbx - movq %rcx,%r15 - xorq %rax,%r12 - rorq $6,%r14 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi - xorq %rcx,%r14 - addq %r13,%rbx - xorq %rdx,%rdi - rorq $28,%r14 - addq %rbx,%r9 - addq %rdi,%rbx - movq %r9,%r13 - addq %rbx,%r14 - rorq $23,%r13 - movq %r14,%rbx - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - addq 56(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - rorq $6,%r14 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - rorq $23,%r13 - movq %r14,%rax - movq %r9,%r12 - rorq $5,%r14 - xorq %r8,%r13 - xorq %r10,%r12 - rorq $4,%r13 - xorq %rax,%r14 - andq %r8,%r12 - xorq %r8,%r13 - addq 64(%rsp),%r11 - movq %rax,%r15 - xorq %r10,%r12 - rorq $6,%r14 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi - xorq %rax,%r14 - addq %r13,%r11 - xorq %rbx,%rdi - rorq $28,%r14 - addq 
%r11,%rdx - addq %rdi,%r11 - movq %rdx,%r13 - addq %r11,%r14 - rorq $23,%r13 - movq %r14,%r11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - addq 72(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - rorq $23,%r13 - movq %r14,%r10 - movq %rdx,%r12 - rorq $5,%r14 - xorq %rcx,%r13 - xorq %r8,%r12 - rorq $4,%r13 - xorq %r10,%r14 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 80(%rsp),%r9 - movq %r10,%r15 - xorq %r8,%r12 - rorq $6,%r14 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi - xorq %r10,%r14 - addq %r13,%r9 - xorq %r11,%rdi - rorq $28,%r14 - addq %r9,%rbx - addq %rdi,%r9 - movq %rbx,%r13 - addq %r9,%r14 - rorq $23,%r13 - movq %r14,%r9 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - addq 88(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - rorq $23,%r13 - movq %r14,%r8 - movq %rbx,%r12 - rorq $5,%r14 - xorq %rax,%r13 - xorq %rcx,%r12 - rorq $4,%r13 - xorq %r8,%r14 - andq %rax,%r12 - xorq %rax,%r13 - addq 96(%rsp),%rdx - movq %r8,%r15 - xorq %rcx,%r12 - rorq $6,%r14 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi - xorq %r8,%r14 - addq %r13,%rdx - xorq %r9,%rdi - rorq $28,%r14 - addq %rdx,%r11 - addq %rdi,%rdx - movq %r11,%r13 - addq %rdx,%r14 - rorq $23,%r13 - movq %r14,%rdx - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - addq 104(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - rorq $23,%r13 - movq %r14,%rcx - movq %r11,%r12 - rorq $5,%r14 - xorq %r10,%r13 - xorq %rax,%r12 - rorq $4,%r13 - xorq %rcx,%r14 - andq %r10,%r12 - xorq %r10,%r13 - addq 112(%rsp),%rbx - movq %rcx,%r15 - xorq %rax,%r12 - rorq $6,%r14 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi - xorq %rcx,%r14 - addq %r13,%rbx - xorq %rdx,%rdi - rorq $28,%r14 - addq %rbx,%r9 - addq %rdi,%rbx - movq %r9,%r13 - addq %rbx,%r14 - rorq $23,%r13 - movq %r14,%rbx - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - addq 120(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - rorq $6,%r14 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - movq 128+0(%rsp),%rdi - movq %r14,%rax - - addq 0(%rdi),%rax - leaq 128(%rsi),%rsi - addq 8(%rdi),%rbx - addq 16(%rdi),%rcx - addq 24(%rdi),%rdx - addq 32(%rdi),%r8 - addq 40(%rdi),%r9 - addq 48(%rdi),%r10 - addq 56(%rdi),%r11 - - cmpq 128+16(%rsp),%rsi - - movq %rax,0(%rdi) - movq %rbx,8(%rdi) - movq %rcx,16(%rdi) - movq %rdx,24(%rdi) - movq %r8,32(%rdi) - movq %r9,40(%rdi) - movq %r10,48(%rdi) - movq %r11,56(%rdi) - jb L$loop_xop - - movq 128+24(%rsp),%rsi - 
vzeroupper - movq -48(%rsi),%r15 - movq -40(%rsi),%r14 - movq -32(%rsi),%r13 - movq -24(%rsi),%r12 - movq -16(%rsi),%rbp - movq -8(%rsi),%rbx - leaq (%rsi),%rsp -L$epilogue_xop: - .byte 0xf3,0xc3 - - .p2align 6 sha512_block_data_order_avx: + L$avx_shortcut: movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + shlq $4,%rdx subq $160,%rsp leaq (%rsi,%rdx,8),%rdx @@ -2895,7 +1848,8 @@ L$avx_shortcut: movq %rdi,128+0(%rsp) movq %rsi,128+8(%rsp) movq %rdx,128+16(%rsp) - movq %rax,128+24(%rsp) + movq %rax,152(%rsp) + L$prologue_avx: vzeroupper @@ -4012,16 +2966,25 @@ L$avx_00_47: movq %r11,56(%rdi) jb L$loop_avx - movq 128+24(%rsp),%rsi + movq 152(%rsp),%rsi + vzeroupper movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$epilogue_avx: .byte 0xf3,0xc3 + #endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S index 867df68b4b..cd52d67e60 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text @@ -19,6 +31,7 @@ .p2align 4 _vpaes_encrypt_core: + movq %rdx,%r9 movq $16,%r11 movl 240(%rdx),%eax @@ -107,8 +120,185 @@ L$enc_entry: + + + + + + + + + + + + + + + + + + + + + + + + + +.p2align 4 +_vpaes_encrypt_core_2x: + + movq %rdx,%r9 + movq $16,%r11 + movl 240(%rdx),%eax + movdqa %xmm9,%xmm1 + movdqa %xmm9,%xmm7 + movdqa L$k_ipt(%rip),%xmm2 + movdqa %xmm2,%xmm8 + pandn %xmm0,%xmm1 + pandn %xmm6,%xmm7 + movdqu (%r9),%xmm5 + + psrld $4,%xmm1 + psrld $4,%xmm7 + pand %xmm9,%xmm0 + pand %xmm9,%xmm6 +.byte 102,15,56,0,208 +.byte 102,68,15,56,0,198 + movdqa L$k_ipt+16(%rip),%xmm0 + movdqa %xmm0,%xmm6 +.byte 102,15,56,0,193 +.byte 102,15,56,0,247 + pxor %xmm5,%xmm2 + pxor %xmm5,%xmm8 + addq $16,%r9 + pxor %xmm2,%xmm0 + pxor %xmm8,%xmm6 + leaq L$k_mc_backward(%rip),%r10 + jmp L$enc2x_entry + +.p2align 4 +L$enc2x_loop: + + movdqa L$k_sb1(%rip),%xmm4 + movdqa L$k_sb1+16(%rip),%xmm0 + movdqa %xmm4,%xmm12 + movdqa %xmm0,%xmm6 +.byte 102,15,56,0,226 +.byte 102,69,15,56,0,224 +.byte 102,15,56,0,195 +.byte 102,65,15,56,0,243 + pxor %xmm5,%xmm4 + pxor %xmm5,%xmm12 + movdqa L$k_sb2(%rip),%xmm5 + movdqa %xmm5,%xmm13 + pxor %xmm4,%xmm0 + pxor %xmm12,%xmm6 + movdqa -64(%r11,%r10,1),%xmm1 + +.byte 102,15,56,0,234 +.byte 102,69,15,56,0,232 + movdqa (%r11,%r10,1),%xmm4 + + movdqa L$k_sb2+16(%rip),%xmm2 + movdqa %xmm2,%xmm8 +.byte 102,15,56,0,211 +.byte 102,69,15,56,0,195 + movdqa %xmm0,%xmm3 + movdqa %xmm6,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm13,%xmm8 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + addq $16,%r9 + pxor %xmm2,%xmm0 + pxor %xmm8,%xmm6 +.byte 102,15,56,0,220 +.byte 102,68,15,56,0,220 + addq $16,%r11 + pxor %xmm0,%xmm3 + pxor %xmm6,%xmm11 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + andq $0x30,%r11 + subq $1,%rax + pxor %xmm3,%xmm0 + pxor %xmm11,%xmm6 + +L$enc2x_entry: + + movdqa %xmm9,%xmm1 + movdqa %xmm9,%xmm7 + movdqa L$k_inv+16(%rip),%xmm5 + movdqa %xmm5,%xmm13 + pandn 
%xmm0,%xmm1 + pandn %xmm6,%xmm7 + psrld $4,%xmm1 + psrld $4,%xmm7 + pand %xmm9,%xmm0 + pand %xmm9,%xmm6 +.byte 102,15,56,0,232 +.byte 102,68,15,56,0,238 + movdqa %xmm10,%xmm3 + movdqa %xmm10,%xmm11 + pxor %xmm1,%xmm0 + pxor %xmm7,%xmm6 +.byte 102,15,56,0,217 +.byte 102,68,15,56,0,223 + movdqa %xmm10,%xmm4 + movdqa %xmm10,%xmm12 + pxor %xmm5,%xmm3 + pxor %xmm13,%xmm11 +.byte 102,15,56,0,224 +.byte 102,68,15,56,0,230 + movdqa %xmm10,%xmm2 + movdqa %xmm10,%xmm8 + pxor %xmm5,%xmm4 + pxor %xmm13,%xmm12 +.byte 102,15,56,0,211 +.byte 102,69,15,56,0,195 + movdqa %xmm10,%xmm3 + movdqa %xmm10,%xmm11 + pxor %xmm0,%xmm2 + pxor %xmm6,%xmm8 +.byte 102,15,56,0,220 +.byte 102,69,15,56,0,220 + movdqu (%r9),%xmm5 + + pxor %xmm1,%xmm3 + pxor %xmm7,%xmm11 + jnz L$enc2x_loop + + + movdqa -96(%r10),%xmm4 + movdqa -80(%r10),%xmm0 + movdqa %xmm4,%xmm12 + movdqa %xmm0,%xmm6 +.byte 102,15,56,0,226 +.byte 102,69,15,56,0,224 + pxor %xmm5,%xmm4 + pxor %xmm5,%xmm12 +.byte 102,15,56,0,195 +.byte 102,65,15,56,0,243 + movdqa 64(%r11,%r10,1),%xmm1 + + pxor %xmm4,%xmm0 + pxor %xmm12,%xmm6 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + .byte 0xf3,0xc3 + + + + + + + + + .p2align 4 _vpaes_decrypt_core: + movq %rdx,%r9 movl 240(%rdx),%eax movdqa %xmm9,%xmm1 @@ -213,6 +403,7 @@ L$dec_entry: + .p2align 4 _vpaes_schedule_core: @@ -220,6 +411,7 @@ _vpaes_schedule_core: + call _vpaes_preheat movdqa L$k_rcon(%rip),%xmm8 movdqu (%rdi),%xmm0 @@ -398,8 +590,10 @@ L$schedule_mangle_last_dec: + .p2align 4 _vpaes_schedule_192_smear: + pshufd $0x80,%xmm6,%xmm1 pshufd $0xFE,%xmm7,%xmm0 pxor %xmm1,%xmm6 @@ -427,11 +621,13 @@ _vpaes_schedule_192_smear: + .p2align 4 _vpaes_schedule_round: + pxor %xmm1,%xmm1 .byte 102,65,15,58,15,200,15 .byte 102,69,15,58,15,192,15 @@ -496,8 +692,10 @@ _vpaes_schedule_low_round: + .p2align 4 _vpaes_schedule_transform: + movdqa %xmm9,%xmm1 pandn %xmm0,%xmm1 psrld $4,%xmm1 @@ -532,10 +730,12 @@ _vpaes_schedule_transform: + .p2align 4 _vpaes_schedule_mangle: + movdqa %xmm0,%xmm4 movdqa L$k_mc_forward(%rip),%xmm5 testq %rcx,%rcx @@ -605,11 +805,18 @@ L$schedule_mangle_both: + .globl _vpaes_set_encrypt_key .private_extern _vpaes_set_encrypt_key .p2align 4 _vpaes_set_encrypt_key: + +#ifdef BORINGSSL_DISPATCH_TEST + + movb $1,_BORINGSSL_function_hit+5(%rip) +#endif + movl %esi,%eax shrl $5,%eax addl $5,%eax @@ -622,11 +829,13 @@ _vpaes_set_encrypt_key: .byte 0xf3,0xc3 + .globl _vpaes_set_decrypt_key .private_extern _vpaes_set_decrypt_key .p2align 4 _vpaes_set_decrypt_key: + movl %esi,%eax shrl $5,%eax addl $5,%eax @@ -644,11 +853,17 @@ _vpaes_set_decrypt_key: .byte 0xf3,0xc3 + .globl _vpaes_encrypt .private_extern _vpaes_encrypt .p2align 4 _vpaes_encrypt: + +#ifdef BORINGSSL_DISPATCH_TEST + + movb $1,_BORINGSSL_function_hit+4(%rip) +#endif movdqu (%rdi),%xmm0 call _vpaes_preheat call _vpaes_encrypt_core @@ -656,22 +871,26 @@ _vpaes_encrypt: .byte 0xf3,0xc3 + .globl _vpaes_decrypt .private_extern _vpaes_decrypt .p2align 4 _vpaes_decrypt: + movdqu (%rdi),%xmm0 call _vpaes_preheat call _vpaes_decrypt_core movdqu %xmm0,(%rsi) .byte 0xf3,0xc3 + .globl _vpaes_cbc_encrypt .private_extern _vpaes_cbc_encrypt .p2align 4 _vpaes_cbc_encrypt: + xchgq %rcx,%rdx subq $16,%rcx jc L$cbc_abort @@ -709,6 +928,70 @@ L$cbc_abort: .byte 0xf3,0xc3 +.globl _vpaes_ctr32_encrypt_blocks +.private_extern _vpaes_ctr32_encrypt_blocks + +.p2align 4 +_vpaes_ctr32_encrypt_blocks: + + + xchgq %rcx,%rdx + testq %rcx,%rcx + jz L$ctr32_abort + movdqu (%r8),%xmm0 + movdqa L$ctr_add_one(%rip),%xmm8 + subq %rdi,%rsi + call _vpaes_preheat + movdqa 
%xmm0,%xmm6 + pshufb L$rev_ctr(%rip),%xmm6 + + testq $1,%rcx + jz L$ctr32_prep_loop + + + + movdqu (%rdi),%xmm7 + call _vpaes_encrypt_core + pxor %xmm7,%xmm0 + paddd %xmm8,%xmm6 + movdqu %xmm0,(%rsi,%rdi,1) + subq $1,%rcx + leaq 16(%rdi),%rdi + jz L$ctr32_done + +L$ctr32_prep_loop: + + + movdqa %xmm6,%xmm14 + movdqa %xmm6,%xmm15 + paddd %xmm8,%xmm15 + +L$ctr32_loop: + movdqa L$rev_ctr(%rip),%xmm1 + movdqa %xmm14,%xmm0 + movdqa %xmm15,%xmm6 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + call _vpaes_encrypt_core_2x + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa L$ctr_add_two(%rip),%xmm3 + pxor %xmm1,%xmm0 + pxor %xmm2,%xmm6 + paddd %xmm3,%xmm14 + paddd %xmm3,%xmm15 + movdqu %xmm0,(%rsi,%rdi,1) + movdqu %xmm6,16(%rsi,%rdi,1) + subq $2,%rcx + leaq 32(%rdi),%rdi + jnz L$ctr32_loop + +L$ctr32_done: +L$ctr32_abort: + .byte 0xf3,0xc3 + + + @@ -717,6 +1000,7 @@ L$cbc_abort: .p2align 4 _vpaes_preheat: + leaq L$k_s0F(%rip),%r10 movdqa -32(%r10),%xmm10 movdqa -16(%r10),%xmm11 @@ -733,6 +1017,7 @@ _vpaes_preheat: + .p2align 6 _vpaes_consts: L$k_inv: @@ -828,6 +1113,17 @@ L$k_dsbe: L$k_dsbo: .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C + + +L$rev_ctr: +.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 + + +L$ctr_add_one: +.quad 0x0000000000000000, 0x0000000100000000 +L$ctr_add_two: +.quad 0x0000000000000000, 0x0000000200000000 + .byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 .p2align 6 diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/x86_64-mont.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/x86_64-mont.S index 4904417a20..8d6444cb6f 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/x86_64-mont.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/x86_64-mont.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text @@ -16,6 +28,8 @@ _bn_mul_mont: jnz L$mul_enter cmpl $8,%r9d jb L$mul_enter + leaq _OPENSSL_ia32cap_P(%rip),%r11 + movl 8(%r11),%r11d cmpq %rsi,%rdx jne L$mul4x_enter testl $7,%r9d @@ -207,31 +221,30 @@ L$inner_enter: xorq %r14,%r14 movq (%rsp),%rax - leaq (%rsp),%rsi movq %r9,%r15 - jmp L$sub + .p2align 4 -L$sub: - sbbq (%rcx,%r14,8),%rax +L$sub: sbbq (%rcx,%r14,8),%rax movq %rax,(%rdi,%r14,8) - movq 8(%rsi,%r14,8),%rax + movq 8(%rsp,%r14,8),%rax leaq 1(%r14),%r14 decq %r15 jnz L$sub sbbq $0,%rax + movq $-1,%rbx + xorq %rax,%rbx xorq %r14,%r14 - andq %rax,%rsi - notq %rax - movq %rdi,%rcx - andq %rax,%rcx movq %r9,%r15 - orq %rcx,%rsi -.p2align 4 + L$copy: - movq (%rsi,%r14,8),%rax - movq %r14,(%rsp,%r14,8) - movq %rax,(%rdi,%r14,8) + movq (%rdi,%r14,8),%rcx + movq (%rsp,%r14,8),%rdx + andq %rbx,%rcx + andq %rax,%rdx + movq %r9,(%rsp,%r14,8) + orq %rcx,%rdx + movq %rdx,(%rdi,%r14,8) leaq 1(%r14),%r14 subq $1,%r15 jnz L$copy @@ -265,6 +278,9 @@ bn_mul4x_mont: movq %rsp,%rax L$mul4x_enter: + andl $0x80100,%r11d + cmpl $0x80100,%r11d + je L$mulx4x_enter pushq %rbx pushq %rbp @@ -602,7 +618,6 @@ L$inner4x: movq 16(%rsp,%r9,8),%rdi leaq -4(%r9),%r15 movq 0(%rsp),%rax - pxor %xmm0,%xmm0 movq 8(%rsp),%rdx shrq $2,%r15 leaq (%rsp),%rsi @@ -612,8 +627,7 @@ L$inner4x: movq 16(%rsi),%rbx movq 24(%rsi),%rbp sbbq 8(%rcx),%rdx - jmp L$sub4x -.p2align 4 + L$sub4x: movq %rax,0(%rdi,%r14,8) movq %rdx,8(%rdi,%r14,8) @@ -640,34 +654,35 @@ L$sub4x: sbbq $0,%rax movq %rbp,24(%rdi,%r14,8) - xorq %r14,%r14 - andq %rax,%rsi - notq %rax - movq %rdi,%rcx - andq %rax,%rcx - leaq -4(%r9),%r15 - orq %rcx,%rsi + pxor %xmm0,%xmm0 +.byte 102,72,15,110,224 + pcmpeqd %xmm5,%xmm5 + pshufd $0,%xmm4,%xmm4 + movq %r9,%r15 + pxor %xmm4,%xmm5 shrq $2,%r15 + xorl %eax,%eax - movdqu (%rsi),%xmm1 - movdqa %xmm0,(%rsp) - movdqu %xmm1,(%rdi) jmp L$copy4x .p2align 4 L$copy4x: - movdqu 16(%rsi,%r14,1),%xmm2 - movdqu 32(%rsi,%r14,1),%xmm1 - movdqa %xmm0,16(%rsp,%r14,1) - movdqu %xmm2,16(%rdi,%r14,1) - movdqa %xmm0,32(%rsp,%r14,1) - movdqu %xmm1,32(%rdi,%r14,1) - leaq 32(%r14),%r14 + movdqa (%rsp,%rax,1),%xmm1 + movdqu (%rdi,%rax,1),%xmm2 + pand %xmm4,%xmm1 + pand %xmm5,%xmm2 + movdqa 16(%rsp,%rax,1),%xmm3 + movdqa %xmm0,(%rsp,%rax,1) + por %xmm2,%xmm1 + movdqu 16(%rdi,%rax,1),%xmm2 + movdqu %xmm1,(%rdi,%rax,1) + pand %xmm4,%xmm3 + pand %xmm5,%xmm2 + movdqa %xmm0,16(%rsp,%rax,1) + por %xmm2,%xmm3 + movdqu %xmm3,16(%rdi,%rax,1) + leaq 32(%rax),%rax decq %r15 jnz L$copy4x - - movdqu 16(%rsi,%r14,1),%xmm2 - movdqa %xmm0,16(%rsp,%r14,1) - movdqu %xmm2,16(%rdi,%r14,1) movq 8(%rsp,%r9,8),%rsi movq $1,%rax @@ -692,6 +707,7 @@ L$mul4x_epilogue: + .p2align 5 bn_sqr8x_mont: @@ -772,6 +788,26 @@ L$sqr8x_body: pxor %xmm0,%xmm0 .byte 102,72,15,110,207 .byte 102,73,15,110,218 + leaq _OPENSSL_ia32cap_P(%rip),%rax + movl 8(%rax),%eax + andl $0x80100,%eax + cmpl $0x80100,%eax + jne L$sqr8x_nox + + call _bn_sqrx8x_internal + + + + + leaq (%r8,%rcx,1),%rbx + movq %rcx,%r9 + movq %rcx,%rdx +.byte 102,72,15,126,207 + sarq $3+2,%rcx + jmp L$sqr8x_sub + +.p2align 5 +L$sqr8x_nox: call _bn_sqr8x_internal @@ -859,6 +895,362 @@ L$sqr8x_epilogue: .byte 0xf3,0xc3 + +.p2align 5 +bn_mulx4x_mont: + + movq %rsp,%rax + +L$mulx4x_enter: + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + 
+L$mulx4x_prologue: + + shll $3,%r9d + xorq %r10,%r10 + subq %r9,%r10 + movq (%r8),%r8 + leaq -72(%rsp,%r10,1),%rbp + andq $-128,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$mulx4x_page_walk + jmp L$mulx4x_page_walk_done + +.p2align 4 +L$mulx4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$mulx4x_page_walk +L$mulx4x_page_walk_done: + + leaq (%rdx,%r9,1),%r10 + + + + + + + + + + + + + movq %r9,0(%rsp) + shrq $5,%r9 + movq %r10,16(%rsp) + subq $1,%r9 + movq %r8,24(%rsp) + movq %rdi,32(%rsp) + movq %rax,40(%rsp) + + movq %r9,48(%rsp) + jmp L$mulx4x_body + +.p2align 5 +L$mulx4x_body: + leaq 8(%rdx),%rdi + movq (%rdx),%rdx + leaq 64+32(%rsp),%rbx + movq %rdx,%r9 + + mulxq 0(%rsi),%r8,%rax + mulxq 8(%rsi),%r11,%r14 + addq %rax,%r11 + movq %rdi,8(%rsp) + mulxq 16(%rsi),%r12,%r13 + adcq %r14,%r12 + adcq $0,%r13 + + movq %r8,%rdi + imulq 24(%rsp),%r8 + xorq %rbp,%rbp + + mulxq 24(%rsi),%rax,%r14 + movq %r8,%rdx + leaq 32(%rsi),%rsi + adcxq %rax,%r13 + adcxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%rdi + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 +.byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 + movq 48(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + adcxq %rax,%r12 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r12,-16(%rbx) + + jmp L$mulx4x_1st + +.p2align 5 +L$mulx4x_1st: + adcxq %rbp,%r15 + mulxq 0(%rsi),%r10,%rax + adcxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 +.byte 0x67,0x67 + movq %r8,%rdx + adcxq %rax,%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + movq %r11,-32(%rbx) + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz L$mulx4x_1st + + movq 0(%rsp),%rax + movq 8(%rsp),%rdi + adcq %rbp,%r15 + addq %r15,%r14 + sbbq %r15,%r15 + movq %r14,-8(%rbx) + jmp L$mulx4x_outer + +.p2align 5 +L$mulx4x_outer: + movq (%rdi),%rdx + leaq 8(%rdi),%rdi + subq %rax,%rsi + movq %r15,(%rbx) + leaq 64+32(%rsp),%rbx + subq %rax,%rcx + + mulxq 0(%rsi),%r8,%r11 + xorl %ebp,%ebp + movq %rdx,%r9 + mulxq 8(%rsi),%r14,%r12 + adoxq -32(%rbx),%r8 + adcxq %r14,%r11 + mulxq 16(%rsi),%r15,%r13 + adoxq -24(%rbx),%r11 + adcxq %r15,%r12 + adoxq -16(%rbx),%r12 + adcxq %rbp,%r13 + adoxq %rbp,%r13 + + movq %rdi,8(%rsp) + movq %r8,%r15 + imulq 24(%rsp),%r8 + xorl %ebp,%ebp + + mulxq 24(%rsi),%rax,%r14 + movq %r8,%rdx + adcxq %rax,%r13 + adoxq -8(%rbx),%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + adoxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + leaq 32(%rcx),%rcx + adcxq %rax,%r12 + adoxq %rbp,%r15 + movq 48(%rsp),%rdi + movq %r12,-16(%rbx) + + jmp L$mulx4x_inner + +.p2align 5 +L$mulx4x_inner: + mulxq 0(%rsi),%r10,%rax + adcxq %rbp,%r15 + adoxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq 0(%rbx),%r10 + adoxq 
%rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq 8(%rbx),%r11 + adoxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 + movq %r8,%rdx + adcxq 16(%rbx),%r12 + adoxq %rax,%r13 + adcxq 24(%rbx),%r13 + adoxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + adcxq %rbp,%r14 + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-32(%rbx) + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz L$mulx4x_inner + + movq 0(%rsp),%rax + movq 8(%rsp),%rdi + adcq %rbp,%r15 + subq 0(%rbx),%rbp + adcq %r15,%r14 + sbbq %r15,%r15 + movq %r14,-8(%rbx) + + cmpq 16(%rsp),%rdi + jne L$mulx4x_outer + + leaq 64(%rsp),%rbx + subq %rax,%rcx + negq %r15 + movq %rax,%rdx + shrq $3+2,%rax + movq 32(%rsp),%rdi + jmp L$mulx4x_sub + +.p2align 5 +L$mulx4x_sub: + movq 0(%rbx),%r11 + movq 8(%rbx),%r12 + movq 16(%rbx),%r13 + movq 24(%rbx),%r14 + leaq 32(%rbx),%rbx + sbbq 0(%rcx),%r11 + sbbq 8(%rcx),%r12 + sbbq 16(%rcx),%r13 + sbbq 24(%rcx),%r14 + leaq 32(%rcx),%rcx + movq %r11,0(%rdi) + movq %r12,8(%rdi) + movq %r13,16(%rdi) + movq %r14,24(%rdi) + leaq 32(%rdi),%rdi + decq %rax + jnz L$mulx4x_sub + + sbbq $0,%r15 + leaq 64(%rsp),%rbx + subq %rdx,%rdi + +.byte 102,73,15,110,207 + pxor %xmm0,%xmm0 + pshufd $0,%xmm1,%xmm1 + movq 40(%rsp),%rsi + + jmp L$mulx4x_cond_copy + +.p2align 5 +L$mulx4x_cond_copy: + movdqa 0(%rbx),%xmm2 + movdqa 16(%rbx),%xmm3 + leaq 32(%rbx),%rbx + movdqu 0(%rdi),%xmm4 + movdqu 16(%rdi),%xmm5 + leaq 32(%rdi),%rdi + movdqa %xmm0,-32(%rbx) + movdqa %xmm0,-16(%rbx) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-32(%rdi) + movdqu %xmm5,-16(%rdi) + subq $32,%rdx + jnz L$mulx4x_cond_copy + + movq %rdx,(%rbx) + + movq $1,%rax + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$mulx4x_epilogue: + .byte 0xf3,0xc3 + + .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .p2align 4 #endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/x86_64-mont5.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/x86_64-mont5.S index abc65f1192..4bd36feae4 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/x86_64-mont5.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/x86_64-mont5.S @@ -1,4 +1,16 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif .text @@ -14,6 +26,8 @@ _bn_mul_mont_gather5: testl $7,%r9d jnz L$mul_enter + leaq _OPENSSL_ia32cap_P(%rip),%r11 + movl 8(%r11),%r11d jmp L$mul4x_enter .p2align 4 @@ -395,8 +409,7 @@ L$inner_enter: movq %r9,%r15 jmp L$sub .p2align 4 -L$sub: - sbbq (%rcx,%r14,8),%rax +L$sub: sbbq (%rcx,%r14,8),%rax movq %rax,(%rdi,%r14,8) movq 8(%rsi,%r14,8),%rax leaq 1(%r14),%r14 @@ -404,18 +417,19 @@ L$sub: jnz L$sub sbbq $0,%rax + movq $-1,%rbx + xorq %rax,%rbx xorq %r14,%r14 - andq %rax,%rsi - notq %rax - movq %rdi,%rcx - andq %rax,%rcx movq %r9,%r15 - orq %rcx,%rsi -.p2align 4 + L$copy: - movq (%rsi,%r14,8),%rax + movq (%rdi,%r14,8),%rcx + movq (%rsp,%r14,8),%rdx + andq %rbx,%rcx + andq %rax,%rdx movq %r14,(%rsp,%r14,8) - movq %rax,(%rdi,%r14,8) + orq %rcx,%rdx + movq %rdx,(%rdi,%r14,8) leaq 1(%r14),%r14 subq $1,%r15 jnz L$copy @@ -450,6 +464,9 @@ bn_mul4x_mont_gather5: movq %rsp,%rax L$mul4x_enter: + andl $0x80108,%r11d + cmpl $0x80108,%r11d + je L$mulx4x_enter pushq %rbx pushq %rbp @@ -548,6 +565,7 @@ L$mul4x_epilogue: .p2align 5 mul4x_internal: + shlq $5,%r9 movd 8(%rax),%xmm5 leaq L$inc(%rip),%rax @@ -1070,6 +1088,7 @@ L$inner4x: movq 24(%rbp),%r15 jmp L$sqr4x_sub_entry + .globl _bn_power5 .private_extern _bn_power5 @@ -1078,6 +1097,11 @@ _bn_power5: movq %rsp,%rax + leaq _OPENSSL_ia32cap_P(%rip),%r11 + movl 8(%r11),%r11d + andl $0x80108,%r11d + cmpl $0x80108,%r11d + je L$powerx5_enter pushq %rbx pushq %rbp @@ -1280,6 +1304,7 @@ __bn_sqr8x_internal: + leaq 32(%r10),%rbp @@ -1985,8 +2010,10 @@ L$8x_no_tail: .byte 0xf3,0xc3 + .p2align 5 __bn_post4x_internal: + movq 0(%rbp),%r12 leaq (%rdi,%r9,1),%rbx movq %r9,%rcx @@ -2038,11 +2065,13 @@ L$sqr4x_sub_entry: negq %r9 .byte 0xf3,0xc3 + .globl _bn_from_montgomery .private_extern _bn_from_montgomery .p2align 5 _bn_from_montgomery: + testl $7,%r9d jz bn_from_mont8x xorl %eax,%eax @@ -2050,6 +2079,7 @@ _bn_from_montgomery: + .p2align 5 bn_from_mont8x: @@ -2163,6 +2193,22 @@ L$mul_by_1: .byte 0x67 movq %rcx,%rbp .byte 102,73,15,110,218 + leaq _OPENSSL_ia32cap_P(%rip),%r11 + movl 8(%r11),%r11d + andl $0x80108,%r11d + cmpl $0x80108,%r11d + jne L$from_mont_nox + + leaq (%rax,%r9,1),%rdi + call __bn_sqrx8x_reduction + call __bn_postx4x_internal + + pxor %xmm0,%xmm0 + leaq 48(%rsp),%rax + jmp L$from_mont_zero + +.p2align 5 +L$from_mont_nox: call __bn_sqr8x_reduction call __bn_post4x_internal @@ -2201,11 +2247,1356 @@ L$from_epilogue: .byte 0xf3,0xc3 + +.p2align 5 +bn_mulx4x_mont_gather5: + + movq %rsp,%rax + +L$mulx4x_enter: + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$mulx4x_prologue: + + shll $3,%r9d + leaq (%r9,%r9,2),%r10 + negq %r9 + movq (%r8),%r8 + + + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb L$mulx4xsp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp L$mulx4xsp_done + +L$mulx4xsp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +L$mulx4xsp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$mulx4x_page_walk + jmp L$mulx4x_page_walk_done + +L$mulx4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + 
ja L$mulx4x_page_walk +L$mulx4x_page_walk_done: + + + + + + + + + + + + + + movq %r8,32(%rsp) + movq %rax,40(%rsp) + +L$mulx4x_body: + call mulx4x_internal + + movq 40(%rsp),%rsi + + movq $1,%rax + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$mulx4x_epilogue: + .byte 0xf3,0xc3 + + + + +.p2align 5 +mulx4x_internal: + + movq %r9,8(%rsp) + movq %r9,%r10 + negq %r9 + shlq $5,%r9 + negq %r10 + leaq 128(%rdx,%r9,1),%r13 + shrq $5+5,%r9 + movd 8(%rax),%xmm5 + subq $1,%r9 + leaq L$inc(%rip),%rax + movq %r13,16+8(%rsp) + movq %r9,24+8(%rsp) + movq %rdi,56+8(%rsp) + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 88-112(%rsp,%r10,1),%r10 + leaq 128(%rdx),%rdi + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 +.byte 0x67 + movdqa %xmm1,%xmm2 +.byte 0x67 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 +.byte 0x67 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,320(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + + pand 64(%rdi),%xmm0 + pand 80(%rdi),%xmm1 + pand 96(%rdi),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%rdi),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -128(%rdi),%xmm4 + movdqa -112(%rdi),%xmm5 + movdqa -96(%rdi),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%rdi),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%rdi),%xmm4 + movdqa -48(%rdi),%xmm5 + movdqa -32(%rdi),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%rdi),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%rdi),%xmm4 + movdqa 16(%rdi),%xmm5 + movdqa 32(%rdi),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%rdi),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + pxor %xmm1,%xmm0 + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + leaq 256(%rdi),%rdi +.byte 102,72,15,126,194 + leaq 64+32+8(%rsp),%rbx + + movq %rdx,%r9 + mulxq 0(%rsi),%r8,%rax + mulxq 8(%rsi),%r11,%r12 + addq %rax,%r11 + mulxq 16(%rsi),%rax,%r13 + adcq %rax,%r12 + adcq $0,%r13 + mulxq 24(%rsi),%rax,%r14 + + movq %r8,%r15 + imulq 32+8(%rsp),%r8 + xorq %rbp,%rbp + movq %r8,%rdx + + movq 
%rdi,8+8(%rsp) + + leaq 32(%rsi),%rsi + adcxq %rax,%r13 + adcxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + movq 24+8(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + adcxq %rax,%r12 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r12,-16(%rbx) + jmp L$mulx4x_1st + +.p2align 5 +L$mulx4x_1st: + adcxq %rbp,%r15 + mulxq 0(%rsi),%r10,%rax + adcxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 +.byte 0x67,0x67 + movq %r8,%rdx + adcxq %rax,%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + movq %r11,-32(%rbx) + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz L$mulx4x_1st + + movq 8(%rsp),%rax + adcq %rbp,%r15 + leaq (%rsi,%rax,1),%rsi + addq %r15,%r14 + movq 8+8(%rsp),%rdi + adcq %rbp,%rbp + movq %r14,-8(%rbx) + jmp L$mulx4x_outer + +.p2align 5 +L$mulx4x_outer: + leaq 16-256(%rbx),%r10 + pxor %xmm4,%xmm4 +.byte 0x67,0x67 + pxor %xmm5,%xmm5 + movdqa -128(%rdi),%xmm0 + movdqa -112(%rdi),%xmm1 + movdqa -96(%rdi),%xmm2 + pand 256(%r10),%xmm0 + movdqa -80(%rdi),%xmm3 + pand 272(%r10),%xmm1 + por %xmm0,%xmm4 + pand 288(%r10),%xmm2 + por %xmm1,%xmm5 + pand 304(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%rdi),%xmm0 + movdqa -48(%rdi),%xmm1 + movdqa -32(%rdi),%xmm2 + pand 320(%r10),%xmm0 + movdqa -16(%rdi),%xmm3 + pand 336(%r10),%xmm1 + por %xmm0,%xmm4 + pand 352(%r10),%xmm2 + por %xmm1,%xmm5 + pand 368(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%rdi),%xmm0 + movdqa 16(%rdi),%xmm1 + movdqa 32(%rdi),%xmm2 + pand 384(%r10),%xmm0 + movdqa 48(%rdi),%xmm3 + pand 400(%r10),%xmm1 + por %xmm0,%xmm4 + pand 416(%r10),%xmm2 + por %xmm1,%xmm5 + pand 432(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%rdi),%xmm0 + movdqa 80(%rdi),%xmm1 + movdqa 96(%rdi),%xmm2 + pand 448(%r10),%xmm0 + movdqa 112(%rdi),%xmm3 + pand 464(%r10),%xmm1 + por %xmm0,%xmm4 + pand 480(%r10),%xmm2 + por %xmm1,%xmm5 + pand 496(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%rdi),%rdi +.byte 102,72,15,126,194 + + movq %rbp,(%rbx) + leaq 32(%rbx,%rax,1),%rbx + mulxq 0(%rsi),%r8,%r11 + xorq %rbp,%rbp + movq %rdx,%r9 + mulxq 8(%rsi),%r14,%r12 + adoxq -32(%rbx),%r8 + adcxq %r14,%r11 + mulxq 16(%rsi),%r15,%r13 + adoxq -24(%rbx),%r11 + adcxq %r15,%r12 + mulxq 24(%rsi),%rdx,%r14 + adoxq -16(%rbx),%r12 + adcxq %rdx,%r13 + leaq (%rcx,%rax,1),%rcx + leaq 32(%rsi),%rsi + adoxq -8(%rbx),%r13 + adcxq %rbp,%r14 + adoxq %rbp,%r14 + + movq %r8,%r15 + imulq 32+8(%rsp),%r8 + + movq %r8,%rdx + xorq %rbp,%rbp + movq %rdi,8+8(%rsp) + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq 24+8(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r12 + movq %r11,-24(%rbx) + adoxq %rbp,%r15 + movq %r12,-16(%rbx) + leaq 32(%rcx),%rcx + jmp 
L$mulx4x_inner + +.p2align 5 +L$mulx4x_inner: + mulxq 0(%rsi),%r10,%rax + adcxq %rbp,%r15 + adoxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq 0(%rbx),%r10 + adoxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq 8(%rbx),%r11 + adoxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 + movq %r8,%rdx + adcxq 16(%rbx),%r12 + adoxq %rax,%r13 + adcxq 24(%rbx),%r13 + adoxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + adcxq %rbp,%r14 + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + adoxq %r15,%r13 + movq %r11,-32(%rbx) + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + leaq 32(%rcx),%rcx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + movq %r13,-16(%rbx) + + decq %rdi + jnz L$mulx4x_inner + + movq 0+8(%rsp),%rax + adcq %rbp,%r15 + subq 0(%rbx),%rdi + movq 8+8(%rsp),%rdi + movq 16+8(%rsp),%r10 + adcq %r15,%r14 + leaq (%rsi,%rax,1),%rsi + adcq %rbp,%rbp + movq %r14,-8(%rbx) + + cmpq %r10,%rdi + jb L$mulx4x_outer + + movq -8(%rcx),%r10 + movq %rbp,%r8 + movq (%rcx,%rax,1),%r12 + leaq (%rcx,%rax,1),%rbp + movq %rax,%rcx + leaq (%rbx,%rax,1),%rdi + xorl %eax,%eax + xorq %r15,%r15 + subq %r14,%r10 + adcq %r15,%r15 + orq %r15,%r8 + sarq $3+2,%rcx + subq %r8,%rax + movq 56+8(%rsp),%rdx + decq %r12 + movq 8(%rbp),%r13 + xorq %r8,%r8 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp L$sqrx4x_sub_entry + + + +.p2align 5 +bn_powerx5: + + movq %rsp,%rax + +L$powerx5_enter: + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$powerx5_prologue: + + shll $3,%r9d + leaq (%r9,%r9,2),%r10 + negq %r9 + movq (%r8),%r8 + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb L$pwrx_sp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp L$pwrx_sp_done + +.p2align 5 +L$pwrx_sp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +L$pwrx_sp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$pwrx_page_walk + jmp L$pwrx_page_walk_done + +L$pwrx_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$pwrx_page_walk +L$pwrx_page_walk_done: + + movq %r9,%r10 + negq %r9 + + + + + + + + + + + + + pxor %xmm0,%xmm0 +.byte 102,72,15,110,207 +.byte 102,72,15,110,209 +.byte 102,73,15,110,218 +.byte 102,72,15,110,226 + movq %r8,32(%rsp) + movq %rax,40(%rsp) + +L$powerx5_body: + + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + + movq %r10,%r9 + movq %rsi,%rdi +.byte 102,72,15,126,209 +.byte 102,72,15,126,226 + movq 40(%rsp),%rax + + call mulx4x_internal + + movq 40(%rsp),%rsi + + movq $1,%rax + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$powerx5_epilogue: + .byte 0xf3,0xc3 + + + +.globl _bn_sqrx8x_internal +.private_extern _bn_sqrx8x_internal +.private_extern _bn_sqrx8x_internal + +.p2align 5 +_bn_sqrx8x_internal: +__bn_sqrx8x_internal: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + leaq 
48+8(%rsp),%rdi + leaq (%rsi,%r9,1),%rbp + movq %r9,0+8(%rsp) + movq %rbp,8+8(%rsp) + jmp L$sqr8x_zero_start + +.p2align 5 +.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 +L$sqrx8x_zero: +.byte 0x3e + movdqa %xmm0,0(%rdi) + movdqa %xmm0,16(%rdi) + movdqa %xmm0,32(%rdi) + movdqa %xmm0,48(%rdi) +L$sqr8x_zero_start: + movdqa %xmm0,64(%rdi) + movdqa %xmm0,80(%rdi) + movdqa %xmm0,96(%rdi) + movdqa %xmm0,112(%rdi) + leaq 128(%rdi),%rdi + subq $64,%r9 + jnz L$sqrx8x_zero + + movq 0(%rsi),%rdx + + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r12,%r12 + xorq %r13,%r13 + xorq %r14,%r14 + xorq %r15,%r15 + leaq 48+8(%rsp),%rdi + xorq %rbp,%rbp + jmp L$sqrx8x_outer_loop + +.p2align 5 +L$sqrx8x_outer_loop: + mulxq 8(%rsi),%r8,%rax + adcxq %r9,%r8 + adoxq %rax,%r10 + mulxq 16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 +.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 + adcxq %r11,%r10 + adoxq %rax,%r12 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 + adcxq %r12,%r11 + adoxq %rax,%r13 + mulxq 40(%rsi),%r12,%rax + adcxq %r13,%r12 + adoxq %rax,%r14 + mulxq 48(%rsi),%r13,%rax + adcxq %r14,%r13 + adoxq %r15,%rax + mulxq 56(%rsi),%r14,%r15 + movq 8(%rsi),%rdx + adcxq %rax,%r14 + adoxq %rbp,%r15 + adcq 64(%rdi),%r15 + movq %r8,8(%rdi) + movq %r9,16(%rdi) + sbbq %rcx,%rcx + xorq %rbp,%rbp + + + mulxq 16(%rsi),%r8,%rbx + mulxq 24(%rsi),%r9,%rax + adcxq %r10,%r8 + adoxq %rbx,%r9 + mulxq 32(%rsi),%r10,%rbx + adcxq %r11,%r9 + adoxq %rax,%r10 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 + adcxq %r12,%r10 + adoxq %rbx,%r11 +.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 + adcxq %r13,%r11 + adoxq %r14,%r12 +.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 + movq 16(%rsi),%rdx + adcxq %rax,%r12 + adoxq %rbx,%r13 + adcxq %r15,%r13 + adoxq %rbp,%r14 + adcxq %rbp,%r14 + + movq %r8,24(%rdi) + movq %r9,32(%rdi) + + mulxq 24(%rsi),%r8,%rbx + mulxq 32(%rsi),%r9,%rax + adcxq %r10,%r8 + adoxq %rbx,%r9 + mulxq 40(%rsi),%r10,%rbx + adcxq %r11,%r9 + adoxq %rax,%r10 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 + adcxq %r12,%r10 + adoxq %r13,%r11 +.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 +.byte 0x3e + movq 24(%rsi),%rdx + adcxq %rbx,%r11 + adoxq %rax,%r12 + adcxq %r14,%r12 + movq %r8,40(%rdi) + movq %r9,48(%rdi) + mulxq 32(%rsi),%r8,%rax + adoxq %rbp,%r13 + adcxq %rbp,%r13 + + mulxq 40(%rsi),%r9,%rbx + adcxq %r10,%r8 + adoxq %rax,%r9 + mulxq 48(%rsi),%r10,%rax + adcxq %r11,%r9 + adoxq %r12,%r10 + mulxq 56(%rsi),%r11,%r12 + movq 32(%rsi),%rdx + movq 40(%rsi),%r14 + adcxq %rbx,%r10 + adoxq %rax,%r11 + movq 48(%rsi),%r15 + adcxq %r13,%r11 + adoxq %rbp,%r12 + adcxq %rbp,%r12 + + movq %r8,56(%rdi) + movq %r9,64(%rdi) + + mulxq %r14,%r9,%rax + movq 56(%rsi),%r8 + adcxq %r10,%r9 + mulxq %r15,%r10,%rbx + adoxq %rax,%r10 + adcxq %r11,%r10 + mulxq %r8,%r11,%rax + movq %r14,%rdx + adoxq %rbx,%r11 + adcxq %r12,%r11 + + adcxq %rbp,%rax + + mulxq %r15,%r14,%rbx + mulxq %r8,%r12,%r13 + movq %r15,%rdx + leaq 64(%rsi),%rsi + adcxq %r14,%r11 + adoxq %rbx,%r12 + adcxq %rax,%r12 + adoxq %rbp,%r13 + +.byte 0x67,0x67 + mulxq %r8,%r8,%r14 + adcxq %r8,%r13 + adcxq %rbp,%r14 + + cmpq 8+8(%rsp),%rsi + je L$sqrx8x_outer_break + + negq %rcx + movq $-8,%rcx + movq %rbp,%r15 + movq 64(%rdi),%r8 + adcxq 72(%rdi),%r9 + adcxq 80(%rdi),%r10 + adcxq 88(%rdi),%r11 + adcq 96(%rdi),%r12 + adcq 104(%rdi),%r13 + adcq 112(%rdi),%r14 + adcq 120(%rdi),%r15 + leaq (%rsi),%rbp + leaq 128(%rdi),%rdi + sbbq %rax,%rax + + movq -64(%rsi),%rdx + movq %rax,16+8(%rsp) + movq %rdi,24+8(%rsp) + + + xorl %eax,%eax + jmp 
L$sqrx8x_loop + +.p2align 5 +L$sqrx8x_loop: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rax,%rbx + adoxq %r9,%r8 + + mulxq 8(%rbp),%rax,%r9 + adcxq %rax,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rax,%r10 + adcxq %rax,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + movq %rbx,(%rdi,%rcx,8) + movl $0,%ebx + adcxq %rax,%r13 + adoxq %r15,%r14 + +.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 + movq 8(%rsi,%rcx,8),%rdx + adcxq %rax,%r14 + adoxq %rbx,%r15 + adcxq %rbx,%r15 + +.byte 0x67 + incq %rcx + jnz L$sqrx8x_loop + + leaq 64(%rbp),%rbp + movq $-8,%rcx + cmpq 8+8(%rsp),%rbp + je L$sqrx8x_break + + subq 16+8(%rsp),%rbx +.byte 0x66 + movq -64(%rsi),%rdx + adcxq 0(%rdi),%r8 + adcxq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi +.byte 0x67 + sbbq %rax,%rax + xorl %ebx,%ebx + movq %rax,16+8(%rsp) + jmp L$sqrx8x_loop + +.p2align 5 +L$sqrx8x_break: + xorq %rbp,%rbp + subq 16+8(%rsp),%rbx + adcxq %rbp,%r8 + movq 24+8(%rsp),%rcx + adcxq %rbp,%r9 + movq 0(%rsi),%rdx + adcq $0,%r10 + movq %r8,0(%rdi) + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + cmpq %rcx,%rdi + je L$sqrx8x_outer_loop + + movq %r9,8(%rdi) + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + movq 40(%rcx),%r13 + movq %r14,48(%rdi) + movq 48(%rcx),%r14 + movq %r15,56(%rdi) + movq 56(%rcx),%r15 + movq %rcx,%rdi + jmp L$sqrx8x_outer_loop + +.p2align 5 +L$sqrx8x_outer_break: + movq %r9,72(%rdi) +.byte 102,72,15,126,217 + movq %r10,80(%rdi) + movq %r11,88(%rdi) + movq %r12,96(%rdi) + movq %r13,104(%rdi) + movq %r14,112(%rdi) + leaq 48+8(%rsp),%rdi + movq (%rsi,%rcx,1),%rdx + + movq 8(%rdi),%r11 + xorq %r10,%r10 + movq 0+8(%rsp),%r9 + adoxq %r11,%r11 + movq 16(%rdi),%r12 + movq 24(%rdi),%r13 + + +.p2align 5 +L$sqrx4x_shift_n_add: + mulxq %rdx,%rax,%rbx + adoxq %r12,%r12 + adcxq %r10,%rax +.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 +.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 + adoxq %r13,%r13 + adcxq %r11,%rbx + movq 40(%rdi),%r11 + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r10,%r10 + adcxq %r12,%rax + movq 16(%rsi,%rcx,1),%rdx + movq 48(%rdi),%r12 + adoxq %r11,%r11 + adcxq %r13,%rbx + movq 56(%rdi),%r13 + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r12,%r12 + adcxq %r10,%rax + movq 24(%rsi,%rcx,1),%rdx + leaq 32(%rcx),%rcx + movq 64(%rdi),%r10 + adoxq %r13,%r13 + adcxq %r11,%rbx + movq 72(%rdi),%r11 + movq %rax,32(%rdi) + movq %rbx,40(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r10,%r10 + adcxq %r12,%rax + jrcxz L$sqrx4x_shift_n_add_break +.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 + adoxq %r11,%r11 + adcxq %r13,%rbx + movq 80(%rdi),%r12 + movq 88(%rdi),%r13 + movq %rax,48(%rdi) + movq %rbx,56(%rdi) + leaq 64(%rdi),%rdi + nop + jmp L$sqrx4x_shift_n_add + +.p2align 5 +L$sqrx4x_shift_n_add_break: + adcxq %r13,%rbx + movq %rax,48(%rdi) + movq %rbx,56(%rdi) + leaq 64(%rdi),%rdi +.byte 102,72,15,126,213 +__bn_sqrx8x_reduction: + xorl %eax,%eax + movq 32+8(%rsp),%rbx + movq 48+8(%rsp),%rdx + leaq -64(%rbp,%r9,1),%rcx + + movq %rcx,0+8(%rsp) + movq %rdi,8+8(%rsp) + + leaq 48+8(%rsp),%rdi + jmp 
L$sqrx8x_reduction_loop + +.p2align 5 +L$sqrx8x_reduction_loop: + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq %rdx,%r8 + imulq %rbx,%rdx + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq %rax,24+8(%rsp) + + leaq 64(%rdi),%rdi + xorq %rsi,%rsi + movq $-8,%rcx + jmp L$sqrx8x_reduce + +.p2align 5 +L$sqrx8x_reduce: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rbx,%rax + adoxq %r9,%r8 + + mulxq 8(%rbp),%rbx,%r9 + adcxq %rbx,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rbx,%r10 + adcxq %rbx,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rbx,%r11 + adcxq %rbx,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 + movq %rdx,%rax + movq %r8,%rdx + adcxq %rbx,%r11 + adoxq %r13,%r12 + + mulxq 32+8(%rsp),%rbx,%rdx + movq %rax,%rdx + movq %rax,64+48+8(%rsp,%rcx,8) + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + adcxq %rax,%r13 + adoxq %r15,%r14 + + mulxq 56(%rbp),%rax,%r15 + movq %rbx,%rdx + adcxq %rax,%r14 + adoxq %rsi,%r15 + adcxq %rsi,%r15 + +.byte 0x67,0x67,0x67 + incq %rcx + jnz L$sqrx8x_reduce + + movq %rsi,%rax + cmpq 0+8(%rsp),%rbp + jae L$sqrx8x_no_tail + + movq 48+8(%rsp),%rdx + addq 0(%rdi),%r8 + leaq 64(%rbp),%rbp + movq $-8,%rcx + adcxq 8(%rdi),%r9 + adcxq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi + sbbq %rax,%rax + + xorq %rsi,%rsi + movq %rax,16+8(%rsp) + jmp L$sqrx8x_tail + +.p2align 5 +L$sqrx8x_tail: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rax,%rbx + adoxq %r9,%r8 + + mulxq 8(%rbp),%rax,%r9 + adcxq %rax,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rax,%r10 + adcxq %rax,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + adcxq %rax,%r13 + adoxq %r15,%r14 + + mulxq 56(%rbp),%rax,%r15 + movq 72+48+8(%rsp,%rcx,8),%rdx + adcxq %rax,%r14 + adoxq %rsi,%r15 + movq %rbx,(%rdi,%rcx,8) + movq %r8,%rbx + adcxq %rsi,%r15 + + incq %rcx + jnz L$sqrx8x_tail + + cmpq 0+8(%rsp),%rbp + jae L$sqrx8x_tail_done + + subq 16+8(%rsp),%rsi + movq 48+8(%rsp),%rdx + leaq 64(%rbp),%rbp + adcq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi + sbbq %rax,%rax + subq $8,%rcx + + xorq %rsi,%rsi + movq %rax,16+8(%rsp) + jmp L$sqrx8x_tail + +.p2align 5 +L$sqrx8x_tail_done: + xorq %rax,%rax + addq 24+8(%rsp),%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rax + + subq 16+8(%rsp),%rsi +L$sqrx8x_no_tail: + adcq 0(%rdi),%r8 +.byte 102,72,15,126,217 + adcq 8(%rdi),%r9 + movq 56(%rbp),%rsi +.byte 102,72,15,126,213 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + adcq $0,%rax + + movq 32+8(%rsp),%rbx + movq 64(%rdi,%rcx,1),%rdx + + movq %r8,0(%rdi) + leaq 64(%rdi),%r8 + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + leaq 64(%rdi,%rcx,1),%rdi + cmpq 8+8(%rsp),%r8 + jb L$sqrx8x_reduction_loop + .byte 0xf3,0xc3 + + +.p2align 5 + +__bn_postx4x_internal: + + movq 0(%rbp),%r12 + movq %rcx,%r10 + movq %rcx,%r9 + 
negq %rax + sarq $3+2,%rcx + +.byte 102,72,15,126,202 +.byte 102,72,15,126,206 + decq %r12 + movq 8(%rbp),%r13 + xorq %r8,%r8 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp L$sqrx4x_sub_entry + +.p2align 4 +L$sqrx4x_sub: + movq 0(%rbp),%r12 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 +L$sqrx4x_sub_entry: + andnq %rax,%r12,%r12 + leaq 32(%rbp),%rbp + andnq %rax,%r13,%r13 + andnq %rax,%r14,%r14 + andnq %rax,%r15,%r15 + + negq %r8 + adcq 0(%rdi),%r12 + adcq 8(%rdi),%r13 + adcq 16(%rdi),%r14 + adcq 24(%rdi),%r15 + movq %r12,0(%rdx) + leaq 32(%rdi),%rdi + movq %r13,8(%rdx) + sbbq %r8,%r8 + movq %r14,16(%rdx) + movq %r15,24(%rdx) + leaq 32(%rdx),%rdx + + incq %rcx + jnz L$sqrx4x_sub + + negq %r9 + + .byte 0xf3,0xc3 + + .globl _bn_scatter5 .private_extern _bn_scatter5 .p2align 4 _bn_scatter5: + cmpl $0,%esi jz L$scatter_epilogue leaq (%rdx,%rcx,8),%rdx @@ -2220,14 +3611,17 @@ L$scatter_epilogue: .byte 0xf3,0xc3 + .globl _bn_gather5 .private_extern _bn_gather5 .p2align 5 _bn_gather5: + L$SEH_begin_bn_gather5: .byte 0x4c,0x8d,0x14,0x24 + .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 leaq L$inc(%rip),%rax andq $-16,%rsp @@ -2381,9 +3775,11 @@ L$gather: jnz L$gather leaq (%r10),%rsp + .byte 0xf3,0xc3 L$SEH_end_bn_gather5: + .p2align 6 L$inc: .long 0,0, 1,1 diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/test/trampoline-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/test/trampoline-x86_64.S new file mode 100644 index 0000000000..863e6b0452 --- /dev/null +++ b/packager/third_party/boringssl/mac-x86_64/crypto/test/trampoline-x86_64.S @@ -0,0 +1,513 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text + + + + + + + + + +.globl _abi_test_trampoline +.private_extern _abi_test_trampoline +.p2align 4 +_abi_test_trampoline: +L$abi_test_trampoline_seh_begin: + + + + + + + + + + + subq $120,%rsp + +L$abi_test_trampoline_seh_prolog_alloc: + movq %r8,48(%rsp) + movq %rbx,64(%rsp) + +L$abi_test_trampoline_seh_prolog_rbx: + movq %rbp,72(%rsp) + +L$abi_test_trampoline_seh_prolog_rbp: + movq %r12,80(%rsp) + +L$abi_test_trampoline_seh_prolog_r12: + movq %r13,88(%rsp) + +L$abi_test_trampoline_seh_prolog_r13: + movq %r14,96(%rsp) + +L$abi_test_trampoline_seh_prolog_r14: + movq %r15,104(%rsp) + +L$abi_test_trampoline_seh_prolog_r15: +L$abi_test_trampoline_seh_prolog_end: + movq 0(%rsi),%rbx + movq 8(%rsi),%rbp + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + movq 32(%rsi),%r14 + movq 40(%rsi),%r15 + + movq %rdi,32(%rsp) + movq %rsi,40(%rsp) + + + + + movq %rdx,%r10 + movq %rcx,%r11 + decq %r11 + js L$args_done + movq (%r10),%rdi + addq $8,%r10 + decq %r11 + js L$args_done + movq (%r10),%rsi + addq $8,%r10 + decq %r11 + js L$args_done + movq (%r10),%rdx + addq $8,%r10 + decq %r11 + js L$args_done + movq (%r10),%rcx + addq $8,%r10 + decq %r11 + js L$args_done + movq (%r10),%r8 + addq $8,%r10 + decq %r11 + js L$args_done + movq (%r10),%r9 + addq $8,%r10 + leaq 0(%rsp),%rax +L$args_loop: + decq %r11 + js L$args_done + + + + + + + movq %r11,56(%rsp) + movq (%r10),%r11 + movq %r11,(%rax) + movq 56(%rsp),%r11 + + addq $8,%r10 + addq $8,%rax + jmp L$args_loop + +L$args_done: + movq 32(%rsp),%rax + movq 48(%rsp),%r10 + testq %r10,%r10 + jz L$no_unwind + + + pushfq + orq $0x100,0(%rsp) + 
popfq + + + + nop +.globl _abi_test_unwind_start +.private_extern _abi_test_unwind_start +_abi_test_unwind_start: + + call *%rax +.globl _abi_test_unwind_return +.private_extern _abi_test_unwind_return +_abi_test_unwind_return: + + + + + pushfq + andq $-0x101,0(%rsp) + popfq +.globl _abi_test_unwind_stop +.private_extern _abi_test_unwind_stop +_abi_test_unwind_stop: + + jmp L$call_done + +L$no_unwind: + call *%rax + +L$call_done: + + movq 40(%rsp),%rsi + movq %rbx,0(%rsi) + movq %rbp,8(%rsi) + movq %r12,16(%rsi) + movq %r13,24(%rsi) + movq %r14,32(%rsi) + movq %r15,40(%rsi) + movq 64(%rsp),%rbx + + movq 72(%rsp),%rbp + + movq 80(%rsp),%r12 + + movq 88(%rsp),%r13 + + movq 96(%rsp),%r14 + + movq 104(%rsp),%r15 + + addq $120,%rsp + + + + .byte 0xf3,0xc3 + +L$abi_test_trampoline_seh_end: + + +.globl _abi_test_clobber_rax +.private_extern _abi_test_clobber_rax +.p2align 4 +_abi_test_clobber_rax: + xorq %rax,%rax + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_rbx +.private_extern _abi_test_clobber_rbx +.p2align 4 +_abi_test_clobber_rbx: + xorq %rbx,%rbx + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_rcx +.private_extern _abi_test_clobber_rcx +.p2align 4 +_abi_test_clobber_rcx: + xorq %rcx,%rcx + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_rdx +.private_extern _abi_test_clobber_rdx +.p2align 4 +_abi_test_clobber_rdx: + xorq %rdx,%rdx + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_rdi +.private_extern _abi_test_clobber_rdi +.p2align 4 +_abi_test_clobber_rdi: + xorq %rdi,%rdi + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_rsi +.private_extern _abi_test_clobber_rsi +.p2align 4 +_abi_test_clobber_rsi: + xorq %rsi,%rsi + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_rbp +.private_extern _abi_test_clobber_rbp +.p2align 4 +_abi_test_clobber_rbp: + xorq %rbp,%rbp + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_r8 +.private_extern _abi_test_clobber_r8 +.p2align 4 +_abi_test_clobber_r8: + xorq %r8,%r8 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_r9 +.private_extern _abi_test_clobber_r9 +.p2align 4 +_abi_test_clobber_r9: + xorq %r9,%r9 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_r10 +.private_extern _abi_test_clobber_r10 +.p2align 4 +_abi_test_clobber_r10: + xorq %r10,%r10 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_r11 +.private_extern _abi_test_clobber_r11 +.p2align 4 +_abi_test_clobber_r11: + xorq %r11,%r11 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_r12 +.private_extern _abi_test_clobber_r12 +.p2align 4 +_abi_test_clobber_r12: + xorq %r12,%r12 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_r13 +.private_extern _abi_test_clobber_r13 +.p2align 4 +_abi_test_clobber_r13: + xorq %r13,%r13 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_r14 +.private_extern _abi_test_clobber_r14 +.p2align 4 +_abi_test_clobber_r14: + xorq %r14,%r14 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_r15 +.private_extern _abi_test_clobber_r15 +.p2align 4 +_abi_test_clobber_r15: + xorq %r15,%r15 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_xmm0 +.private_extern _abi_test_clobber_xmm0 +.p2align 4 +_abi_test_clobber_xmm0: + pxor %xmm0,%xmm0 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_xmm1 +.private_extern _abi_test_clobber_xmm1 +.p2align 4 +_abi_test_clobber_xmm1: + pxor %xmm1,%xmm1 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_xmm2 +.private_extern _abi_test_clobber_xmm2 +.p2align 4 +_abi_test_clobber_xmm2: + pxor %xmm2,%xmm2 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_xmm3 +.private_extern _abi_test_clobber_xmm3 +.p2align 4 +_abi_test_clobber_xmm3: + pxor %xmm3,%xmm3 + .byte 0xf3,0xc3 + + +.globl 
_abi_test_clobber_xmm4 +.private_extern _abi_test_clobber_xmm4 +.p2align 4 +_abi_test_clobber_xmm4: + pxor %xmm4,%xmm4 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_xmm5 +.private_extern _abi_test_clobber_xmm5 +.p2align 4 +_abi_test_clobber_xmm5: + pxor %xmm5,%xmm5 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_xmm6 +.private_extern _abi_test_clobber_xmm6 +.p2align 4 +_abi_test_clobber_xmm6: + pxor %xmm6,%xmm6 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_xmm7 +.private_extern _abi_test_clobber_xmm7 +.p2align 4 +_abi_test_clobber_xmm7: + pxor %xmm7,%xmm7 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_xmm8 +.private_extern _abi_test_clobber_xmm8 +.p2align 4 +_abi_test_clobber_xmm8: + pxor %xmm8,%xmm8 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_xmm9 +.private_extern _abi_test_clobber_xmm9 +.p2align 4 +_abi_test_clobber_xmm9: + pxor %xmm9,%xmm9 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_xmm10 +.private_extern _abi_test_clobber_xmm10 +.p2align 4 +_abi_test_clobber_xmm10: + pxor %xmm10,%xmm10 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_xmm11 +.private_extern _abi_test_clobber_xmm11 +.p2align 4 +_abi_test_clobber_xmm11: + pxor %xmm11,%xmm11 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_xmm12 +.private_extern _abi_test_clobber_xmm12 +.p2align 4 +_abi_test_clobber_xmm12: + pxor %xmm12,%xmm12 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_xmm13 +.private_extern _abi_test_clobber_xmm13 +.p2align 4 +_abi_test_clobber_xmm13: + pxor %xmm13,%xmm13 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_xmm14 +.private_extern _abi_test_clobber_xmm14 +.p2align 4 +_abi_test_clobber_xmm14: + pxor %xmm14,%xmm14 + .byte 0xf3,0xc3 + + +.globl _abi_test_clobber_xmm15 +.private_extern _abi_test_clobber_xmm15 +.p2align 4 +_abi_test_clobber_xmm15: + pxor %xmm15,%xmm15 + .byte 0xf3,0xc3 + + + + + +.globl _abi_test_bad_unwind_wrong_register +.private_extern _abi_test_bad_unwind_wrong_register +.p2align 4 +_abi_test_bad_unwind_wrong_register: + +L$abi_test_bad_unwind_wrong_register_seh_begin: + pushq %r12 + +L$abi_test_bad_unwind_wrong_register_seh_push_r13: + + + + nop + popq %r12 + + .byte 0xf3,0xc3 +L$abi_test_bad_unwind_wrong_register_seh_end: + + + + + + + +.globl _abi_test_bad_unwind_temporary +.private_extern _abi_test_bad_unwind_temporary +.p2align 4 +_abi_test_bad_unwind_temporary: + +L$abi_test_bad_unwind_temporary_seh_begin: + pushq %r12 + +L$abi_test_bad_unwind_temporary_seh_push_r12: + + movq %r12,%rax + incq %rax + movq %rax,(%rsp) + + + + movq %r12,(%rsp) + + + popq %r12 + + .byte 0xf3,0xc3 +L$abi_test_bad_unwind_temporary_seh_end: + + + + + + + +.globl _abi_test_get_and_clear_direction_flag +.private_extern _abi_test_get_and_clear_direction_flag +_abi_test_get_and_clear_direction_flag: + pushfq + popq %rax + andq $0x400,%rax + shrq $10,%rax + cld + .byte 0xf3,0xc3 + + + + + +.globl _abi_test_set_direction_flag +.private_extern _abi_test_set_direction_flag +_abi_test_set_direction_flag: + std + .byte 0xf3,0xc3 + +#endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/third_party/sike/asm/fp-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/third_party/sike/asm/fp-x86_64.S new file mode 100644 index 0000000000..f1e7ea4f63 --- /dev/null +++ b/packager/third_party/boringssl/mac-x86_64/crypto/third_party/sike/asm/fp-x86_64.S @@ -0,0 +1,1869 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text + + +L$p434x2: +.quad 0xFFFFFFFFFFFFFFFE +.quad 0xFFFFFFFFFFFFFFFF +.quad 0xFB82ECF5C5FFFFFF +.quad 0xF78CB8F062B15D47 +.quad 0xD9F8BFAD038A40AC +.quad 0x0004683E4E2EE688 + + +L$p434p1: +.quad 0xFDC1767AE3000000 +.quad 0x7BC65C783158AEA3 +.quad 0x6CFC5FD681C52056 +.quad 0x0002341F27177344 + + +.private_extern _OPENSSL_ia32cap_P +.globl _sike_fpadd +.private_extern _sike_fpadd + +_sike_fpadd: + + pushq %r12 + + + pushq %r13 + + + pushq %r14 + + + + xorq %rax,%rax + + movq 0(%rdi),%r8 + addq 0(%rsi),%r8 + movq 8(%rdi),%r9 + adcq 8(%rsi),%r9 + movq 16(%rdi),%r10 + adcq 16(%rsi),%r10 + movq 24(%rdi),%r11 + adcq 24(%rsi),%r11 + movq 32(%rdi),%r12 + adcq 32(%rsi),%r12 + movq 40(%rdi),%r13 + adcq 40(%rsi),%r13 + movq 48(%rdi),%r14 + adcq 48(%rsi),%r14 + + movq L$p434x2(%rip),%rcx + subq %rcx,%r8 + movq 8+L$p434x2(%rip),%rcx + sbbq %rcx,%r9 + sbbq %rcx,%r10 + movq 16+L$p434x2(%rip),%rcx + sbbq %rcx,%r11 + movq 24+L$p434x2(%rip),%rcx + sbbq %rcx,%r12 + movq 32+L$p434x2(%rip),%rcx + sbbq %rcx,%r13 + movq 40+L$p434x2(%rip),%rcx + sbbq %rcx,%r14 + + sbbq $0,%rax + + movq L$p434x2(%rip),%rdi + andq %rax,%rdi + movq 8+L$p434x2(%rip),%rsi + andq %rax,%rsi + movq 16+L$p434x2(%rip),%rcx + andq %rax,%rcx + + addq %rdi,%r8 + movq %r8,0(%rdx) + adcq %rsi,%r9 + movq %r9,8(%rdx) + adcq %rsi,%r10 + movq %r10,16(%rdx) + adcq %rcx,%r11 + movq %r11,24(%rdx) + + setc %cl + movq 24+L$p434x2(%rip),%r8 + andq %rax,%r8 + movq 32+L$p434x2(%rip),%r9 + andq %rax,%r9 + movq 40+L$p434x2(%rip),%r10 + andq %rax,%r10 + btq $0,%rcx + + adcq %r8,%r12 + movq %r12,32(%rdx) + adcq %r9,%r13 + movq %r13,40(%rdx) + adcq %r10,%r14 + movq %r14,48(%rdx) + + popq %r14 + + popq %r13 + + popq %r12 + + .byte 0xf3,0xc3 + +.globl _sike_cswap_asm +.private_extern _sike_cswap_asm + +_sike_cswap_asm: + + + movq %rdx,%xmm3 + + + + + + pshufd $68,%xmm3,%xmm3 + + movdqu 0(%rdi),%xmm0 + movdqu 0(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,0(%rdi) + movdqu %xmm1,0(%rsi) + + movdqu 16(%rdi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,16(%rdi) + movdqu %xmm1,16(%rsi) + + movdqu 32(%rdi),%xmm0 + movdqu 32(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,32(%rdi) + movdqu %xmm1,32(%rsi) + + movdqu 48(%rdi),%xmm0 + movdqu 48(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,48(%rdi) + movdqu %xmm1,48(%rsi) + + movdqu 64(%rdi),%xmm0 + movdqu 64(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,64(%rdi) + movdqu %xmm1,64(%rsi) + + movdqu 80(%rdi),%xmm0 + movdqu 80(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,80(%rdi) + movdqu %xmm1,80(%rsi) + + movdqu 96(%rdi),%xmm0 + movdqu 96(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,96(%rdi) + movdqu %xmm1,96(%rsi) + + movdqu 112(%rdi),%xmm0 + movdqu 112(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor 
%xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,112(%rdi) + movdqu %xmm1,112(%rsi) + + movdqu 128(%rdi),%xmm0 + movdqu 128(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,128(%rdi) + movdqu %xmm1,128(%rsi) + + movdqu 144(%rdi),%xmm0 + movdqu 144(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,144(%rdi) + movdqu %xmm1,144(%rsi) + + movdqu 160(%rdi),%xmm0 + movdqu 160(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,160(%rdi) + movdqu %xmm1,160(%rsi) + + movdqu 176(%rdi),%xmm0 + movdqu 176(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,176(%rdi) + movdqu %xmm1,176(%rsi) + + movdqu 192(%rdi),%xmm0 + movdqu 192(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,192(%rdi) + movdqu %xmm1,192(%rsi) + + movdqu 208(%rdi),%xmm0 + movdqu 208(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,208(%rdi) + movdqu %xmm1,208(%rsi) + + .byte 0xf3,0xc3 +.globl _sike_fpsub +.private_extern _sike_fpsub + +_sike_fpsub: + + pushq %r12 + + + pushq %r13 + + + pushq %r14 + + + + xorq %rax,%rax + + movq 0(%rdi),%r8 + subq 0(%rsi),%r8 + movq 8(%rdi),%r9 + sbbq 8(%rsi),%r9 + movq 16(%rdi),%r10 + sbbq 16(%rsi),%r10 + movq 24(%rdi),%r11 + sbbq 24(%rsi),%r11 + movq 32(%rdi),%r12 + sbbq 32(%rsi),%r12 + movq 40(%rdi),%r13 + sbbq 40(%rsi),%r13 + movq 48(%rdi),%r14 + sbbq 48(%rsi),%r14 + + sbbq $0x0,%rax + + movq L$p434x2(%rip),%rdi + andq %rax,%rdi + movq 8+L$p434x2(%rip),%rsi + andq %rax,%rsi + movq 16+L$p434x2(%rip),%rcx + andq %rax,%rcx + + addq %rdi,%r8 + movq %r8,0(%rdx) + adcq %rsi,%r9 + movq %r9,8(%rdx) + adcq %rsi,%r10 + movq %r10,16(%rdx) + adcq %rcx,%r11 + movq %r11,24(%rdx) + + setc %cl + movq 24+L$p434x2(%rip),%r8 + andq %rax,%r8 + movq 32+L$p434x2(%rip),%r9 + andq %rax,%r9 + movq 40+L$p434x2(%rip),%r10 + andq %rax,%r10 + btq $0x0,%rcx + + adcq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + + popq %r14 + + popq %r13 + + popq %r12 + + .byte 0xf3,0xc3 + +.globl _sike_mpadd_asm +.private_extern _sike_mpadd_asm + +_sike_mpadd_asm: + + movq 0(%rdi),%r8; + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%rcx + addq 0(%rsi),%r8 + adcq 8(%rsi),%r9 + adcq 16(%rsi),%r10 + adcq 24(%rsi),%r11 + adcq 32(%rsi),%rcx + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %rcx,32(%rdx) + + movq 40(%rdi),%r8 + movq 48(%rdi),%r9 + adcq 40(%rsi),%r8 + adcq 48(%rsi),%r9 + movq %r8,40(%rdx) + movq %r9,48(%rdx) + .byte 0xf3,0xc3 + +.globl _sike_mpsubx2_asm +.private_extern _sike_mpsubx2_asm + +_sike_mpsubx2_asm: + + xorq %rax,%rax + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%rcx + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %rcx,32(%rdx) + + movq 40(%rdi),%r8 + movq 48(%rdi),%r9 + movq 56(%rdi),%r10 + movq 64(%rdi),%r11 + movq 72(%rdi),%rcx + sbbq 40(%rsi),%r8 + sbbq 48(%rsi),%r9 + sbbq 56(%rsi),%r10 + sbbq 64(%rsi),%r11 + sbbq 72(%rsi),%rcx + movq %r8,40(%rdx) + movq %r9,48(%rdx) + 
movq %r10,56(%rdx) + movq %r11,64(%rdx) + movq %rcx,72(%rdx) + + movq 80(%rdi),%r8 + movq 88(%rdi),%r9 + movq 96(%rdi),%r10 + movq 104(%rdi),%r11 + sbbq 80(%rsi),%r8 + sbbq 88(%rsi),%r9 + sbbq 96(%rsi),%r10 + sbbq 104(%rsi),%r11 + sbbq $0x0,%rax + movq %r8,80(%rdx) + movq %r9,88(%rdx) + movq %r10,96(%rdx) + movq %r11,104(%rdx) + .byte 0xf3,0xc3 + +.globl _sike_mpdblsubx2_asm +.private_extern _sike_mpdblsubx2_asm + +_sike_mpdblsubx2_asm: + + pushq %r12 + + + pushq %r13 + + + + xorq %rax,%rax + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + movq 48(%rdx),%rcx + subq 0(%rdi),%r8 + sbbq 8(%rdi),%r9 + sbbq 16(%rdi),%r10 + sbbq 24(%rdi),%r11 + sbbq 32(%rdi),%r12 + sbbq 40(%rdi),%r13 + sbbq 48(%rdi),%rcx + adcq $0x0,%rax + + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%r12 + sbbq 40(%rsi),%r13 + sbbq 48(%rsi),%rcx + adcq $0x0,%rax + + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %rcx,48(%rdx) + + + movq 56(%rdx),%r8 + movq 64(%rdx),%r9 + movq 72(%rdx),%r10 + movq 80(%rdx),%r11 + movq 88(%rdx),%r12 + movq 96(%rdx),%r13 + movq 104(%rdx),%rcx + + subq %rax,%r8 + sbbq 56(%rdi),%r8 + sbbq 64(%rdi),%r9 + sbbq 72(%rdi),%r10 + sbbq 80(%rdi),%r11 + sbbq 88(%rdi),%r12 + sbbq 96(%rdi),%r13 + sbbq 104(%rdi),%rcx + + + subq 56(%rsi),%r8 + sbbq 64(%rsi),%r9 + sbbq 72(%rsi),%r10 + sbbq 80(%rsi),%r11 + sbbq 88(%rsi),%r12 + sbbq 96(%rsi),%r13 + sbbq 104(%rsi),%rcx + + + movq %r8,56(%rdx) + movq %r9,64(%rdx) + movq %r10,72(%rdx) + movq %r11,80(%rdx) + movq %r12,88(%rdx) + movq %r13,96(%rdx) + movq %rcx,104(%rdx) + + popq %r13 + + popq %r12 + + .byte 0xf3,0xc3 + + +L$rdc_bdw: + + + + + + + + + xorq %rax,%rax + movq 0+0(%rdi),%rdx + mulxq 0+L$p434p1(%rip),%r8,%r9 + mulxq 8+L$p434p1(%rip),%r12,%r10 + mulxq 16+L$p434p1(%rip),%r13,%r11 + + adoxq %r12,%r9 + adoxq %r13,%r10 + + mulxq 24+L$p434p1(%rip),%r13,%r12 + adoxq %r13,%r11 + adoxq %rax,%r12 + + xorq %rax,%rax + movq 0+8(%rdi),%rdx + mulxq 0+L$p434p1(%rip),%r13,%rcx + adcxq %r13,%r9 + adcxq %rcx,%r10 + + mulxq 8+L$p434p1(%rip),%rcx,%r13 + adcxq %r13,%r11 + adoxq %rcx,%r10 + + mulxq 16+L$p434p1(%rip),%rcx,%r13 + adcxq %r13,%r12 + adoxq %rcx,%r11 + + mulxq 24+L$p434p1(%rip),%rcx,%r13 + adcxq %rax,%r13 + adoxq %rcx,%r12 + adoxq %rax,%r13 + + xorq %rcx,%rcx + addq 24(%rdi),%r8 + adcq 32(%rdi),%r9 + adcq 40(%rdi),%r10 + adcq 48(%rdi),%r11 + adcq 56(%rdi),%r12 + adcq 64(%rdi),%r13 + adcq 72(%rdi),%rcx + movq %r8,24(%rdi) + movq %r9,32(%rdi) + movq %r10,40(%rdi) + movq %r11,48(%rdi) + movq %r12,56(%rdi) + movq %r13,64(%rdi) + movq %rcx,72(%rdi) + movq 80(%rdi),%r8 + movq 88(%rdi),%r9 + movq 96(%rdi),%r10 + movq 104(%rdi),%r11 + adcq $0x0,%r8 + adcq $0x0,%r9 + adcq $0x0,%r10 + adcq $0x0,%r11 + movq %r8,80(%rdi) + movq %r9,88(%rdi) + movq %r10,96(%rdi) + movq %r11,104(%rdi) + + xorq %rax,%rax + movq 16+0(%rdi),%rdx + mulxq 0+L$p434p1(%rip),%r8,%r9 + mulxq 8+L$p434p1(%rip),%r12,%r10 + mulxq 16+L$p434p1(%rip),%r13,%r11 + + adoxq %r12,%r9 + adoxq %r13,%r10 + + mulxq 24+L$p434p1(%rip),%r13,%r12 + adoxq %r13,%r11 + adoxq %rax,%r12 + + xorq %rax,%rax + movq 16+8(%rdi),%rdx + mulxq 0+L$p434p1(%rip),%r13,%rcx + adcxq %r13,%r9 + adcxq %rcx,%r10 + + mulxq 8+L$p434p1(%rip),%rcx,%r13 + adcxq %r13,%r11 + adoxq %rcx,%r10 + + mulxq 16+L$p434p1(%rip),%rcx,%r13 + adcxq %r13,%r12 + adoxq %rcx,%r11 + + mulxq 24+L$p434p1(%rip),%rcx,%r13 + adcxq %rax,%r13 + adoxq %rcx,%r12 + adoxq 
%rax,%r13 + + xorq %rcx,%rcx + addq 40(%rdi),%r8 + adcq 48(%rdi),%r9 + adcq 56(%rdi),%r10 + adcq 64(%rdi),%r11 + adcq 72(%rdi),%r12 + adcq 80(%rdi),%r13 + adcq 88(%rdi),%rcx + movq %r8,40(%rdi) + movq %r9,48(%rdi) + movq %r10,56(%rdi) + movq %r11,64(%rdi) + movq %r12,72(%rdi) + movq %r13,80(%rdi) + movq %rcx,88(%rdi) + movq 96(%rdi),%r8 + movq 104(%rdi),%r9 + adcq $0x0,%r8 + adcq $0x0,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + + xorq %rax,%rax + movq 32+0(%rdi),%rdx + mulxq 0+L$p434p1(%rip),%r8,%r9 + mulxq 8+L$p434p1(%rip),%r12,%r10 + mulxq 16+L$p434p1(%rip),%r13,%r11 + + adoxq %r12,%r9 + adoxq %r13,%r10 + + mulxq 24+L$p434p1(%rip),%r13,%r12 + adoxq %r13,%r11 + adoxq %rax,%r12 + + xorq %rax,%rax + movq 32+8(%rdi),%rdx + mulxq 0+L$p434p1(%rip),%r13,%rcx + adcxq %r13,%r9 + adcxq %rcx,%r10 + + mulxq 8+L$p434p1(%rip),%rcx,%r13 + adcxq %r13,%r11 + adoxq %rcx,%r10 + + mulxq 16+L$p434p1(%rip),%rcx,%r13 + adcxq %r13,%r12 + adoxq %rcx,%r11 + + mulxq 24+L$p434p1(%rip),%rcx,%r13 + adcxq %rax,%r13 + adoxq %rcx,%r12 + adoxq %rax,%r13 + + xorq %rcx,%rcx + addq 56(%rdi),%r8 + adcq 64(%rdi),%r9 + adcq 72(%rdi),%r10 + adcq 80(%rdi),%r11 + adcq 88(%rdi),%r12 + adcq 96(%rdi),%r13 + adcq 104(%rdi),%rcx + movq %r8,0(%rsi) + movq %r9,8(%rsi) + movq %r10,72(%rdi) + movq %r11,80(%rdi) + movq %r12,88(%rdi) + movq %r13,96(%rdi) + movq %rcx,104(%rdi) + + xorq %rax,%rax + movq 48(%rdi),%rdx + mulxq 0+L$p434p1(%rip),%r8,%r9 + mulxq 8+L$p434p1(%rip),%r12,%r10 + mulxq 16+L$p434p1(%rip),%r13,%r11 + + adoxq %r12,%r9 + adoxq %r13,%r10 + + mulxq 24+L$p434p1(%rip),%r13,%r12 + adoxq %r13,%r11 + adoxq %rax,%r12 + + addq 72(%rdi),%r8 + adcq 80(%rdi),%r9 + adcq 88(%rdi),%r10 + adcq 96(%rdi),%r11 + adcq 104(%rdi),%r12 + movq %r8,16(%rsi) + movq %r9,24(%rsi) + movq %r10,32(%rsi) + movq %r11,40(%rsi) + movq %r12,48(%rsi) + + + popq %r15 + + + popq %r14 + + + popq %r13 + + + popq %r12 + + + .byte 0xf3,0xc3 + +.globl _sike_fprdc +.private_extern _sike_fprdc + +_sike_fprdc: + + pushq %r12 + + + pushq %r13 + + + pushq %r14 + + + pushq %r15 + + + + + + leaq _OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx + cmpl $0x80100,%ecx + je L$rdc_bdw + + + + + movq 0+0(%rdi),%r14 + movq 0+L$p434p1(%rip),%rax + mulq %r14 + xorq %r10,%r10 + movq %rax,%r8 + movq %rdx,%r9 + + + movq 8+L$p434p1(%rip),%rax + mulq %r14 + xorq %r11,%r11 + addq %rax,%r9 + adcq %rdx,%r10 + + + movq 0+8(%rdi),%rcx + movq 0+L$p434p1(%rip),%rax + mulq %rcx + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r11 + + + xorq %r12,%r12 + movq 16+L$p434p1(%rip),%rax + mulq %r14 + addq %rax,%r10 + adcq %rdx,%r11 + adcq $0x0,%r12 + + + movq 8+L$p434p1(%rip),%rax + mulq %rcx + addq %rax,%r10 + adcq %rdx,%r11 + adcq $0x0,%r12 + + + movq 24+L$p434p1(%rip),%rax + mulq %r14 + xorq %r13,%r13 + addq %rax,%r11 + adcq %rdx,%r12 + adcq $0x0,%r13 + + + movq 16+L$p434p1(%rip),%rax + mulq %rcx + addq %rax,%r11 + adcq %rdx,%r12 + adcq $0x0,%r13 + + + movq 24+L$p434p1(%rip),%rax + mulq %rcx + addq %rax,%r12 + adcq %rdx,%r13 + + + xorq %rcx,%rcx + addq 24(%rdi),%r8 + adcq 32(%rdi),%r9 + adcq 40(%rdi),%r10 + adcq 48(%rdi),%r11 + adcq 56(%rdi),%r12 + adcq 64(%rdi),%r13 + adcq 72(%rdi),%rcx + movq %r8,24(%rdi) + movq %r9,32(%rdi) + movq %r10,40(%rdi) + movq %r11,48(%rdi) + movq %r12,56(%rdi) + movq %r13,64(%rdi) + movq %rcx,72(%rdi) + movq 80(%rdi),%r8 + movq 88(%rdi),%r9 + movq 96(%rdi),%r10 + movq 104(%rdi),%r11 + adcq $0x0,%r8 + adcq $0x0,%r9 + adcq $0x0,%r10 + adcq $0x0,%r11 + movq %r8,80(%rdi) + movq %r9,88(%rdi) + movq %r10,96(%rdi) + movq %r11,104(%rdi) + + + movq 
16+0(%rdi),%r14 + movq 0+L$p434p1(%rip),%rax + mulq %r14 + xorq %r10,%r10 + movq %rax,%r8 + movq %rdx,%r9 + + + movq 8+L$p434p1(%rip),%rax + mulq %r14 + xorq %r11,%r11 + addq %rax,%r9 + adcq %rdx,%r10 + + + movq 16+8(%rdi),%rcx + movq 0+L$p434p1(%rip),%rax + mulq %rcx + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r11 + + + xorq %r12,%r12 + movq 16+L$p434p1(%rip),%rax + mulq %r14 + addq %rax,%r10 + adcq %rdx,%r11 + adcq $0x0,%r12 + + + movq 8+L$p434p1(%rip),%rax + mulq %rcx + addq %rax,%r10 + adcq %rdx,%r11 + adcq $0x0,%r12 + + + movq 24+L$p434p1(%rip),%rax + mulq %r14 + xorq %r13,%r13 + addq %rax,%r11 + adcq %rdx,%r12 + adcq $0x0,%r13 + + + movq 16+L$p434p1(%rip),%rax + mulq %rcx + addq %rax,%r11 + adcq %rdx,%r12 + adcq $0x0,%r13 + + + movq 24+L$p434p1(%rip),%rax + mulq %rcx + addq %rax,%r12 + adcq %rdx,%r13 + + + xorq %rcx,%rcx + addq 40(%rdi),%r8 + adcq 48(%rdi),%r9 + adcq 56(%rdi),%r10 + adcq 64(%rdi),%r11 + adcq 72(%rdi),%r12 + adcq 80(%rdi),%r13 + adcq 88(%rdi),%rcx + movq %r8,40(%rdi) + movq %r9,48(%rdi) + movq %r10,56(%rdi) + movq %r11,64(%rdi) + movq %r12,72(%rdi) + movq %r13,80(%rdi) + movq %rcx,88(%rdi) + movq 96(%rdi),%r8 + movq 104(%rdi),%r9 + adcq $0x0,%r8 + adcq $0x0,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + + + movq 32+0(%rdi),%r14 + movq 0+L$p434p1(%rip),%rax + mulq %r14 + xorq %r10,%r10 + movq %rax,%r8 + movq %rdx,%r9 + + + movq 8+L$p434p1(%rip),%rax + mulq %r14 + xorq %r11,%r11 + addq %rax,%r9 + adcq %rdx,%r10 + + + movq 32+8(%rdi),%rcx + movq 0+L$p434p1(%rip),%rax + mulq %rcx + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r11 + + + xorq %r12,%r12 + movq 16+L$p434p1(%rip),%rax + mulq %r14 + addq %rax,%r10 + adcq %rdx,%r11 + adcq $0x0,%r12 + + + movq 8+L$p434p1(%rip),%rax + mulq %rcx + addq %rax,%r10 + adcq %rdx,%r11 + adcq $0x0,%r12 + + + movq 24+L$p434p1(%rip),%rax + mulq %r14 + xorq %r13,%r13 + addq %rax,%r11 + adcq %rdx,%r12 + adcq $0x0,%r13 + + + movq 16+L$p434p1(%rip),%rax + mulq %rcx + addq %rax,%r11 + adcq %rdx,%r12 + adcq $0x0,%r13 + + + movq 24+L$p434p1(%rip),%rax + mulq %rcx + addq %rax,%r12 + adcq %rdx,%r13 + + + xorq %rcx,%rcx + addq 56(%rdi),%r8 + adcq 64(%rdi),%r9 + adcq 72(%rdi),%r10 + adcq 80(%rdi),%r11 + adcq 88(%rdi),%r12 + adcq 96(%rdi),%r13 + adcq 104(%rdi),%rcx + movq %r8,0(%rsi) + movq %r9,8(%rsi) + movq %r10,72(%rdi) + movq %r11,80(%rdi) + movq %r12,88(%rdi) + movq %r13,96(%rdi) + movq %rcx,104(%rdi) + + movq 48(%rdi),%r13 + + xorq %r10,%r10 + movq 0+L$p434p1(%rip),%rax + mulq %r13 + movq %rax,%r8 + movq %rdx,%r9 + + xorq %r11,%r11 + movq 8+L$p434p1(%rip),%rax + mulq %r13 + addq %rax,%r9 + adcq %rdx,%r10 + + xorq %r12,%r12 + movq 16+L$p434p1(%rip),%rax + mulq %r13 + addq %rax,%r10 + adcq %rdx,%r11 + + movq 24+L$p434p1(%rip),%rax + mulq %r13 + addq %rax,%r11 + adcq %rdx,%r12 + + addq 72(%rdi),%r8 + adcq 80(%rdi),%r9 + adcq 88(%rdi),%r10 + adcq 96(%rdi),%r11 + adcq 104(%rdi),%r12 + movq %r8,16(%rsi) + movq %r9,24(%rsi) + movq %r10,32(%rsi) + movq %r11,40(%rsi) + movq %r12,48(%rsi) + + + popq %r15 + + popq %r14 + + popq %r13 + + popq %r12 + + .byte 0xf3,0xc3 + +L$mul_bdw: + + + + + + + + + + movq %rdx,%rcx + xorq %rax,%rax + + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + + pushq %rbx + + + pushq %rbp + + + subq $96,%rsp + + + addq 32(%rdi),%r8 + adcq 40(%rdi),%r9 + adcq 48(%rdi),%r10 + adcq $0x0,%r11 + sbbq $0x0,%rax + movq %r8,0(%rsp) + movq %r9,8(%rsp) + movq %r10,16(%rsp) + movq %r11,24(%rsp) + + + xorq %rbx,%rbx + movq 0(%rsi),%r12 + movq 8(%rsi),%r13 + movq 16(%rsi),%r14 + movq 24(%rsi),%r15 + addq 
32(%rsi),%r12 + adcq 40(%rsi),%r13 + adcq 48(%rsi),%r14 + adcq $0x0,%r15 + sbbq $0x0,%rbx + movq %r12,32(%rsp) + movq %r13,40(%rsp) + movq %r14,48(%rsp) + movq %r15,56(%rsp) + + + andq %rax,%r12 + andq %rax,%r13 + andq %rax,%r14 + andq %rax,%r15 + + + andq %rbx,%r8 + andq %rbx,%r9 + andq %rbx,%r10 + andq %rbx,%r11 + + + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + movq %r8,64(%rsp) + movq %r9,72(%rsp) + movq %r10,80(%rsp) + movq %r11,88(%rsp) + + + movq 0+0(%rsp),%rdx + mulxq 32+0(%rsp),%r9,%r8 + movq %r9,0+0(%rsp) + mulxq 32+8(%rsp),%r10,%r9 + xorq %rax,%rax + adoxq %r10,%r8 + mulxq 32+16(%rsp),%r11,%r10 + adoxq %r11,%r9 + mulxq 32+24(%rsp),%r12,%r11 + adoxq %r12,%r10 + + movq 0+8(%rsp),%rdx + mulxq 32+0(%rsp),%r12,%r13 + adoxq %rax,%r11 + xorq %rax,%rax + mulxq 32+8(%rsp),%r15,%r14 + adoxq %r8,%r12 + movq %r12,0+8(%rsp) + adcxq %r15,%r13 + mulxq 32+16(%rsp),%rbx,%r15 + adcxq %rbx,%r14 + adoxq %r9,%r13 + mulxq 32+24(%rsp),%rbp,%rbx + adcxq %rbp,%r15 + adcxq %rax,%rbx + adoxq %r10,%r14 + + movq 0+16(%rsp),%rdx + mulxq 32+0(%rsp),%r8,%r9 + adoxq %r11,%r15 + adoxq %rax,%rbx + xorq %rax,%rax + mulxq 32+8(%rsp),%r11,%r10 + adoxq %r13,%r8 + movq %r8,0+16(%rsp) + adcxq %r11,%r9 + mulxq 32+16(%rsp),%r12,%r11 + adcxq %r12,%r10 + adoxq %r14,%r9 + mulxq 32+24(%rsp),%rbp,%r12 + adcxq %rbp,%r11 + adcxq %rax,%r12 + + adoxq %r15,%r10 + adoxq %rbx,%r11 + adoxq %rax,%r12 + + movq 0+24(%rsp),%rdx + mulxq 32+0(%rsp),%r8,%r13 + xorq %rax,%rax + mulxq 32+8(%rsp),%r15,%r14 + adcxq %r15,%r13 + adoxq %r8,%r9 + mulxq 32+16(%rsp),%rbx,%r15 + adcxq %rbx,%r14 + adoxq %r13,%r10 + mulxq 32+24(%rsp),%rbp,%rbx + adcxq %rbp,%r15 + adcxq %rax,%rbx + adoxq %r14,%r11 + adoxq %r15,%r12 + adoxq %rax,%rbx + movq %r9,0+24(%rsp) + movq %r10,0+32(%rsp) + movq %r11,0+40(%rsp) + movq %r12,0+48(%rsp) + movq %rbx,0+56(%rsp) + + + + movq 0+0(%rdi),%rdx + mulxq 0+0(%rsi),%r9,%r8 + movq %r9,0+0(%rcx) + mulxq 0+8(%rsi),%r10,%r9 + xorq %rax,%rax + adoxq %r10,%r8 + mulxq 0+16(%rsi),%r11,%r10 + adoxq %r11,%r9 + mulxq 0+24(%rsi),%r12,%r11 + adoxq %r12,%r10 + + movq 0+8(%rdi),%rdx + mulxq 0+0(%rsi),%r12,%r13 + adoxq %rax,%r11 + xorq %rax,%rax + mulxq 0+8(%rsi),%r15,%r14 + adoxq %r8,%r12 + movq %r12,0+8(%rcx) + adcxq %r15,%r13 + mulxq 0+16(%rsi),%rbx,%r15 + adcxq %rbx,%r14 + adoxq %r9,%r13 + mulxq 0+24(%rsi),%rbp,%rbx + adcxq %rbp,%r15 + adcxq %rax,%rbx + adoxq %r10,%r14 + + movq 0+16(%rdi),%rdx + mulxq 0+0(%rsi),%r8,%r9 + adoxq %r11,%r15 + adoxq %rax,%rbx + xorq %rax,%rax + mulxq 0+8(%rsi),%r11,%r10 + adoxq %r13,%r8 + movq %r8,0+16(%rcx) + adcxq %r11,%r9 + mulxq 0+16(%rsi),%r12,%r11 + adcxq %r12,%r10 + adoxq %r14,%r9 + mulxq 0+24(%rsi),%rbp,%r12 + adcxq %rbp,%r11 + adcxq %rax,%r12 + + adoxq %r15,%r10 + adoxq %rbx,%r11 + adoxq %rax,%r12 + + movq 0+24(%rdi),%rdx + mulxq 0+0(%rsi),%r8,%r13 + xorq %rax,%rax + mulxq 0+8(%rsi),%r15,%r14 + adcxq %r15,%r13 + adoxq %r8,%r9 + mulxq 0+16(%rsi),%rbx,%r15 + adcxq %rbx,%r14 + adoxq %r13,%r10 + mulxq 0+24(%rsi),%rbp,%rbx + adcxq %rbp,%r15 + adcxq %rax,%rbx + adoxq %r14,%r11 + adoxq %r15,%r12 + adoxq %rax,%rbx + movq %r9,0+24(%rcx) + movq %r10,0+32(%rcx) + movq %r11,0+40(%rcx) + movq %r12,0+48(%rcx) + movq %rbx,0+56(%rcx) + + + + movq 32+0(%rdi),%rdx + mulxq 32+0(%rsi),%r9,%r8 + movq %r9,64+0(%rcx) + mulxq 32+8(%rsi),%r10,%r9 + xorq %rax,%rax + adoxq %r10,%r8 + mulxq 32+16(%rsi),%r11,%r10 + adoxq %r11,%r9 + + movq 32+8(%rdi),%rdx + mulxq 32+0(%rsi),%r12,%r11 + adoxq %rax,%r10 + xorq %rax,%rax + + mulxq 32+8(%rsi),%r14,%r13 + adoxq %r8,%r12 + movq %r12,64+8(%rcx) + adcxq %r14,%r11 + + 
mulxq 32+16(%rsi),%r8,%r14 + adoxq %r9,%r11 + adcxq %r8,%r13 + adcxq %rax,%r14 + adoxq %r10,%r13 + + movq 32+16(%rdi),%rdx + mulxq 32+0(%rsi),%r8,%r9 + adoxq %rax,%r14 + xorq %rax,%rax + + mulxq 32+8(%rsi),%r10,%r12 + adoxq %r11,%r8 + movq %r8,64+16(%rcx) + adcxq %r13,%r9 + + mulxq 32+16(%rsi),%r11,%r8 + adcxq %r14,%r12 + adcxq %rax,%r8 + adoxq %r10,%r9 + adoxq %r12,%r11 + adoxq %rax,%r8 + movq %r9,64+24(%rcx) + movq %r11,64+32(%rcx) + movq %r8,64+40(%rcx) + + + + + movq 64(%rsp),%r8 + movq 72(%rsp),%r9 + movq 80(%rsp),%r10 + movq 88(%rsp),%r11 + + movq 32(%rsp),%rax + addq %rax,%r8 + movq 40(%rsp),%rax + adcq %rax,%r9 + movq 48(%rsp),%rax + adcq %rax,%r10 + movq 56(%rsp),%rax + adcq %rax,%r11 + + + movq 0(%rsp),%r12 + movq 8(%rsp),%r13 + movq 16(%rsp),%r14 + movq 24(%rsp),%r15 + subq 0(%rcx),%r12 + sbbq 8(%rcx),%r13 + sbbq 16(%rcx),%r14 + sbbq 24(%rcx),%r15 + sbbq 32(%rcx),%r8 + sbbq 40(%rcx),%r9 + sbbq 48(%rcx),%r10 + sbbq 56(%rcx),%r11 + + + subq 64(%rcx),%r12 + sbbq 72(%rcx),%r13 + sbbq 80(%rcx),%r14 + sbbq 88(%rcx),%r15 + sbbq 96(%rcx),%r8 + sbbq 104(%rcx),%r9 + sbbq $0x0,%r10 + sbbq $0x0,%r11 + + addq 32(%rcx),%r12 + movq %r12,32(%rcx) + adcq 40(%rcx),%r13 + movq %r13,40(%rcx) + adcq 48(%rcx),%r14 + movq %r14,48(%rcx) + adcq 56(%rcx),%r15 + movq %r15,56(%rcx) + adcq 64(%rcx),%r8 + movq %r8,64(%rcx) + adcq 72(%rcx),%r9 + movq %r9,72(%rcx) + adcq 80(%rcx),%r10 + movq %r10,80(%rcx) + adcq 88(%rcx),%r11 + movq %r11,88(%rcx) + movq 96(%rcx),%r12 + adcq $0x0,%r12 + movq %r12,96(%rcx) + movq 104(%rcx),%r13 + adcq $0x0,%r13 + movq %r13,104(%rcx) + + addq $96,%rsp + + popq %rbp + + + popq %rbx + + + + + popq %r15 + + + popq %r14 + + + popq %r13 + + + popq %r12 + + + .byte 0xf3,0xc3 + + +.globl _sike_mpmul +.private_extern _sike_mpmul + +_sike_mpmul: + + pushq %r12 + + + pushq %r13 + + + pushq %r14 + + + pushq %r15 + + + + + + leaq _OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx + cmpl $0x80100,%ecx + je L$mul_bdw + + + + movq %rdx,%rcx + + subq $112,%rsp + + + + xorq %rax,%rax + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + xorq %r11,%r11 + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + + sbbq $0,%rax + movq %rax,64(%rsp) + + movq %r8,0(%rcx) + movq %r9,8(%rcx) + movq %r10,16(%rcx) + movq %r11,24(%rcx) + + + xorq %rdx,%rdx + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + xorq %r15,%r15 + addq 0(%rsi),%r12 + adcq 8(%rsi),%r13 + adcq 16(%rsi),%r14 + adcq 24(%rsi),%r15 + sbbq $0x0,%rdx + + movq %rdx,72(%rsp) + + + movq (%rcx),%rax + mulq %r12 + movq %rax,(%rsp) + movq %rdx,%r8 + + xorq %r9,%r9 + movq (%rcx),%rax + mulq %r13 + addq %rax,%r8 + adcq %rdx,%r9 + + xorq %r10,%r10 + movq 8(%rcx),%rax + mulq %r12 + addq %rax,%r8 + movq %r8,8(%rsp) + adcq %rdx,%r9 + adcq $0x0,%r10 + + xorq %r8,%r8 + movq (%rcx),%rax + mulq %r14 + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq 16(%rcx),%rax + mulq %r12 + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq 8(%rcx),%rax + mulq %r13 + addq %rax,%r9 + movq %r9,16(%rsp) + adcq %rdx,%r10 + adcq $0x0,%r8 + + xorq %r9,%r9 + movq (%rcx),%rax + mulq %r15 + addq %rax,%r10 + adcq %rdx,%r8 + adcq $0x0,%r9 + + movq 24(%rcx),%rax + mulq %r12 + addq %rax,%r10 + adcq %rdx,%r8 + adcq $0x0,%r9 + + movq 8(%rcx),%rax + mulq %r14 + addq %rax,%r10 + adcq %rdx,%r8 + adcq $0x0,%r9 + + movq 16(%rcx),%rax + mulq %r13 + addq %rax,%r10 + movq %r10,24(%rsp) + adcq %rdx,%r8 + adcq $0x0,%r9 + + xorq %r10,%r10 + movq 8(%rcx),%rax + mulq %r15 + addq %rax,%r8 + adcq %rdx,%r9 + adcq $0x0,%r10 
+ + movq 24(%rcx),%rax + mulq %r13 + addq %rax,%r8 + adcq %rdx,%r9 + adcq $0x0,%r10 + + movq 16(%rcx),%rax + mulq %r14 + addq %rax,%r8 + movq %r8,32(%rsp) + adcq %rdx,%r9 + adcq $0x0,%r10 + + xorq %r11,%r11 + movq 16(%rcx),%rax + mulq %r15 + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r11 + + movq 24(%rcx),%rax + mulq %r14 + addq %rax,%r9 + movq %r9,40(%rsp) + adcq %rdx,%r10 + adcq $0x0,%r11 + + movq 24(%rcx),%rax + mulq %r15 + addq %rax,%r10 + movq %r10,48(%rsp) + adcq %rdx,%r11 + movq %r11,56(%rsp) + + + movq 64(%rsp),%rax + andq %rax,%r12 + andq %rax,%r13 + andq %rax,%r14 + andq %rax,%r15 + + + movq 72(%rsp),%rax + movq 0(%rcx),%r8 + andq %rax,%r8 + movq 8(%rcx),%r9 + andq %rax,%r9 + movq 16(%rcx),%r10 + andq %rax,%r10 + movq 24(%rcx),%r11 + andq %rax,%r11 + + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + adcq %r11,%r15 + + + movq 32(%rsp),%rax + addq %rax,%r12 + movq 40(%rsp),%rax + adcq %rax,%r13 + movq 48(%rsp),%rax + adcq %rax,%r14 + movq 56(%rsp),%rax + adcq %rax,%r15 + movq %r12,80(%rsp) + movq %r13,88(%rsp) + movq %r14,96(%rsp) + movq %r15,104(%rsp) + + + movq (%rdi),%r11 + movq (%rsi),%rax + mulq %r11 + xorq %r9,%r9 + movq %rax,(%rcx) + movq %rdx,%r8 + + movq 16(%rdi),%r14 + movq 8(%rsi),%rax + mulq %r11 + xorq %r10,%r10 + addq %rax,%r8 + adcq %rdx,%r9 + + movq 8(%rdi),%r12 + movq (%rsi),%rax + mulq %r12 + addq %rax,%r8 + movq %r8,8(%rcx) + adcq %rdx,%r9 + adcq $0x0,%r10 + + xorq %r8,%r8 + movq 16(%rsi),%rax + mulq %r11 + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq (%rsi),%r13 + movq %r14,%rax + mulq %r13 + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq 8(%rsi),%rax + mulq %r12 + addq %rax,%r9 + movq %r9,16(%rcx) + adcq %rdx,%r10 + adcq $0x0,%r8 + + xorq %r9,%r9 + movq 24(%rsi),%rax + mulq %r11 + movq 24(%rdi),%r15 + addq %rax,%r10 + adcq %rdx,%r8 + adcq $0x0,%r9 + + movq %r15,%rax + mulq %r13 + addq %rax,%r10 + adcq %rdx,%r8 + adcq $0x0,%r9 + + movq 16(%rsi),%rax + mulq %r12 + addq %rax,%r10 + adcq %rdx,%r8 + adcq $0x0,%r9 + + movq 8(%rsi),%rax + mulq %r14 + addq %rax,%r10 + movq %r10,24(%rcx) + adcq %rdx,%r8 + adcq $0x0,%r9 + + xorq %r10,%r10 + movq 24(%rsi),%rax + mulq %r12 + addq %rax,%r8 + adcq %rdx,%r9 + adcq $0x0,%r10 + + movq 8(%rsi),%rax + mulq %r15 + addq %rax,%r8 + adcq %rdx,%r9 + adcq $0x0,%r10 + + movq 16(%rsi),%rax + mulq %r14 + addq %rax,%r8 + movq %r8,32(%rcx) + adcq %rdx,%r9 + adcq $0x0,%r10 + + xorq %r8,%r8 + movq 24(%rsi),%rax + mulq %r14 + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq 16(%rsi),%rax + mulq %r15 + addq %rax,%r9 + movq %r9,40(%rcx) + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq 24(%rsi),%rax + mulq %r15 + addq %rax,%r10 + movq %r10,48(%rcx) + adcq %rdx,%r8 + movq %r8,56(%rcx) + + + + movq 32(%rdi),%r11 + movq 32(%rsi),%rax + mulq %r11 + xorq %r9,%r9 + movq %rax,64(%rcx) + movq %rdx,%r8 + + movq 48(%rdi),%r14 + movq 40(%rsi),%rax + mulq %r11 + xorq %r10,%r10 + addq %rax,%r8 + adcq %rdx,%r9 + + movq 40(%rdi),%r12 + movq 32(%rsi),%rax + mulq %r12 + addq %rax,%r8 + movq %r8,72(%rcx) + adcq %rdx,%r9 + adcq $0x0,%r10 + + xorq %r8,%r8 + movq 48(%rsi),%rax + mulq %r11 + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq 32(%rsi),%r13 + movq %r14,%rax + mulq %r13 + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq 40(%rsi),%rax + mulq %r12 + addq %rax,%r9 + movq %r9,80(%rcx) + adcq %rdx,%r10 + adcq $0x0,%r8 + + movq 48(%rsi),%rax + mulq %r12 + xorq %r12,%r12 + addq %rax,%r10 + adcq %rdx,%r8 + adcq $0x0,%r12 + + movq 40(%rsi),%rax + mulq %r14 + addq %rax,%r10 + adcq %rdx,%r8 + adcq $0x0,%r12 + movq %r10,88(%rcx) + 
+ movq 48(%rsi),%rax + mulq %r14 + addq %rax,%r8 + adcq $0x0,%r12 + movq %r8,96(%rcx) + + addq %r12,%rdx + + + movq 0(%rsp),%r8 + subq 0(%rcx),%r8 + movq 8(%rsp),%r9 + sbbq 8(%rcx),%r9 + movq 16(%rsp),%r10 + sbbq 16(%rcx),%r10 + movq 24(%rsp),%r11 + sbbq 24(%rcx),%r11 + movq 80(%rsp),%r12 + sbbq 32(%rcx),%r12 + movq 88(%rsp),%r13 + sbbq 40(%rcx),%r13 + movq 96(%rsp),%r14 + sbbq 48(%rcx),%r14 + movq 104(%rsp),%r15 + sbbq 56(%rcx),%r15 + + + movq 64(%rcx),%rax + subq %rax,%r8 + movq 72(%rcx),%rax + sbbq %rax,%r9 + movq 80(%rcx),%rax + sbbq %rax,%r10 + movq 88(%rcx),%rax + sbbq %rax,%r11 + movq 96(%rcx),%rax + sbbq %rax,%r12 + sbbq %rdx,%r13 + sbbq $0x0,%r14 + sbbq $0x0,%r15 + + + addq 32(%rcx),%r8 + movq %r8,32(%rcx) + adcq 40(%rcx),%r9 + movq %r9,40(%rcx) + adcq 48(%rcx),%r10 + movq %r10,48(%rcx) + adcq 56(%rcx),%r11 + movq %r11,56(%rcx) + adcq 64(%rcx),%r12 + movq %r12,64(%rcx) + adcq 72(%rcx),%r13 + movq %r13,72(%rcx) + adcq 80(%rcx),%r14 + movq %r14,80(%rcx) + adcq 88(%rcx),%r15 + movq %r15,88(%rcx) + movq 96(%rcx),%r12 + adcq $0x0,%r12 + movq %r12,96(%rcx) + adcq $0x0,%rdx + movq %rdx,104(%rcx) + + addq $112,%rsp + + + + popq %r15 + + popq %r14 + + popq %r13 + + popq %r12 + + .byte 0xf3,0xc3 + +#endif diff --git a/packager/third_party/boringssl/roll_boringssl.py b/packager/third_party/boringssl/roll_boringssl.py index 41c2ed1e50..f1f009c2f7 100755 --- a/packager/third_party/boringssl/roll_boringssl.py +++ b/packager/third_party/boringssl/roll_boringssl.py @@ -30,7 +30,6 @@ GENERATED_FILES = [ 'BUILD.generated.gni', 'BUILD.generated_tests.gni', 'boringssl.gypi', - 'boringssl_tests.gypi', 'err_data.c', ] @@ -91,10 +90,12 @@ def main(): # Clear the old generated files. for (osname, arch, _, _, _) in generate_build_files.OS_ARCH_COMBOS: path = os.path.join(BORINGSSL_PATH, osname + '-' + arch) - shutil.rmtree(path) + if os.path.exists(path): + shutil.rmtree(path) for file in GENERATED_FILES: path = os.path.join(BORINGSSL_PATH, file) - os.unlink(path) + if os.path.exists(path): + os.unlink(path) # Generate new ones. subprocess.check_call(['python', diff --git a/packager/third_party/boringssl/win-x86/crypto/chacha/chacha-x86.asm b/packager/third_party/boringssl/win-x86/crypto/chacha/chacha-x86.asm index 3ba31a2b35..7b59adf1db 100644 --- a/packager/third_party/boringssl/win-x86/crypto/chacha/chacha-x86.asm +++ b/packager/third_party/boringssl/win-x86/crypto/chacha/chacha-x86.asm @@ -1,3 +1,9 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif %ifidn __OUTPUT_FORMAT__,obj section code use32 class=code align=64 %elifidn __OUTPUT_FORMAT__,win32 diff --git a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/aes-586.asm b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/aes-586.asm index 42ca0267e7..c3a47d88f2 100644 --- a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/aes-586.asm +++ b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/aes-586.asm @@ -1,3 +1,9 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif %ifidn __OUTPUT_FORMAT__,obj section code use32 class=code align=64 %elifidn __OUTPUT_FORMAT__,win32 @@ -972,10 +978,10 @@ dd 1,2,4,8 dd 16,32,64,128 dd 27,54,0,0 dd 0,0,0,0 -global _asm_AES_encrypt +global _aes_nohw_encrypt align 16 -_asm_AES_encrypt: -L$_asm_AES_encrypt_begin: +_aes_nohw_encrypt: +L$_aes_nohw_encrypt_begin: push ebp push ebx push esi @@ -2152,10 +2158,10 @@ db 160,224,59,77,174,42,245,176 db 200,235,187,60,131,83,153,97 db 23,43,4,126,186,119,214,38 db 225,105,20,99,85,33,12,125 -global _asm_AES_decrypt +global _aes_nohw_decrypt align 16 -_asm_AES_decrypt: -L$_asm_AES_decrypt_begin: +_aes_nohw_decrypt: +L$_aes_nohw_decrypt_begin: push ebp push ebx push esi @@ -2215,10 +2221,10 @@ L$011x86: pop ebx pop ebp ret -global _asm_AES_cbc_encrypt +global _aes_nohw_cbc_encrypt align 16 -_asm_AES_cbc_encrypt: -L$_asm_AES_cbc_encrypt_begin: +_aes_nohw_cbc_encrypt: +L$_aes_nohw_cbc_encrypt_begin: push ebp push ebx push esi @@ -2974,16 +2980,16 @@ L$045exit: pop ebx pop ebp ret -global _asm_AES_set_encrypt_key +global _aes_nohw_set_encrypt_key align 16 -_asm_AES_set_encrypt_key: -L$_asm_AES_set_encrypt_key_begin: +_aes_nohw_set_encrypt_key: +L$_aes_nohw_set_encrypt_key_begin: call __x86_AES_set_encrypt_key ret -global _asm_AES_set_decrypt_key +global _aes_nohw_set_decrypt_key align 16 -_asm_AES_set_decrypt_key: -L$_asm_AES_set_decrypt_key_begin: +_aes_nohw_set_decrypt_key: +L$_aes_nohw_set_decrypt_key_begin: call __x86_AES_set_encrypt_key cmp eax,0 je NEAR L$054proceed diff --git a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/aesni-x86.asm b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/aesni-x86.asm index a9a595653f..0272fce460 100644 --- a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/aesni-x86.asm +++ b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/aesni-x86.asm @@ -1,3 +1,9 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif %ifidn __OUTPUT_FORMAT__,obj section code use32 class=code align=64 %elifidn __OUTPUT_FORMAT__,win32 @@ -15,10 +21,25 @@ section .text code align=64 section .text code %endif ;extern _OPENSSL_ia32cap_P -global _aesni_encrypt +%ifdef BORINGSSL_DISPATCH_TEST +extern _BORINGSSL_function_hit +%endif +global _aes_hw_encrypt align 16 -_aesni_encrypt: -L$_aesni_encrypt_begin: +_aes_hw_encrypt: +L$_aes_hw_encrypt_begin: +%ifdef BORINGSSL_DISPATCH_TEST + push ebx + push edx + call L$000pic +L$000pic: + pop ebx + lea ebx,[(_BORINGSSL_function_hit+1-L$000pic)+ebx] + mov edx,1 + mov BYTE [ebx],dl + pop edx + pop ebx +%endif mov eax,DWORD [4+esp] mov edx,DWORD [12+esp] movups xmm2,[eax] @@ -28,22 +49,22 @@ L$_aesni_encrypt_begin: movups xmm1,[16+edx] lea edx,[32+edx] xorps xmm2,xmm0 -L$000enc1_loop_1: +L$001enc1_loop_1: db 102,15,56,220,209 dec ecx movups xmm1,[edx] lea edx,[16+edx] - jnz NEAR L$000enc1_loop_1 + jnz NEAR L$001enc1_loop_1 db 102,15,56,221,209 pxor xmm0,xmm0 pxor xmm1,xmm1 movups [eax],xmm2 pxor xmm2,xmm2 ret -global _aesni_decrypt +global _aes_hw_decrypt align 16 -_aesni_decrypt: -L$_aesni_decrypt_begin: +_aes_hw_decrypt: +L$_aes_hw_decrypt_begin: mov eax,DWORD [4+esp] mov edx,DWORD [12+esp] movups xmm2,[eax] @@ -53,12 +74,12 @@ L$_aesni_decrypt_begin: movups xmm1,[16+edx] lea edx,[32+edx] xorps xmm2,xmm0 -L$001dec1_loop_2: +L$002dec1_loop_2: db 102,15,56,222,209 dec ecx movups xmm1,[edx] lea edx,[16+edx] - jnz NEAR L$001dec1_loop_2 + jnz NEAR L$002dec1_loop_2 db 102,15,56,223,209 pxor xmm0,xmm0 pxor xmm1,xmm1 @@ -76,7 +97,7 @@ __aesni_encrypt2: lea edx,[32+ecx*1+edx] neg ecx add ecx,16 -L$002enc2_loop: +L$003enc2_loop: db 102,15,56,220,209 db 102,15,56,220,217 movups xmm1,[ecx*1+edx] @@ -84,7 +105,7 @@ db 102,15,56,220,217 db 102,15,56,220,208 db 102,15,56,220,216 movups xmm0,[ecx*1+edx-16] - jnz NEAR L$002enc2_loop + jnz NEAR L$003enc2_loop db 102,15,56,220,209 db 102,15,56,220,217 db 102,15,56,221,208 @@ -101,7 +122,7 @@ __aesni_decrypt2: lea edx,[32+ecx*1+edx] neg ecx add ecx,16 -L$003dec2_loop: +L$004dec2_loop: db 102,15,56,222,209 db 102,15,56,222,217 movups xmm1,[ecx*1+edx] @@ -109,7 +130,7 @@ db 102,15,56,222,217 db 102,15,56,222,208 db 102,15,56,222,216 movups xmm0,[ecx*1+edx-16] - jnz NEAR L$003dec2_loop + jnz NEAR L$004dec2_loop db 102,15,56,222,209 db 102,15,56,222,217 db 102,15,56,223,208 @@ -127,7 +148,7 @@ __aesni_encrypt3: lea edx,[32+ecx*1+edx] neg ecx add ecx,16 -L$004enc3_loop: +L$005enc3_loop: db 102,15,56,220,209 db 102,15,56,220,217 db 102,15,56,220,225 @@ -137,7 +158,7 @@ db 102,15,56,220,208 db 102,15,56,220,216 db 102,15,56,220,224 movups xmm0,[ecx*1+edx-16] - jnz NEAR L$004enc3_loop + jnz NEAR L$005enc3_loop db 102,15,56,220,209 db 102,15,56,220,217 db 102,15,56,220,225 @@ -157,7 +178,7 @@ __aesni_decrypt3: lea edx,[32+ecx*1+edx] neg ecx add ecx,16 -L$005dec3_loop: +L$006dec3_loop: db 102,15,56,222,209 db 102,15,56,222,217 db 102,15,56,222,225 @@ -167,7 +188,7 @@ db 102,15,56,222,208 db 102,15,56,222,216 db 102,15,56,222,224 movups xmm0,[ecx*1+edx-16] - jnz NEAR L$005dec3_loop + jnz NEAR L$006dec3_loop db 102,15,56,222,209 db 102,15,56,222,217 db 102,15,56,222,225 @@ -189,7 +210,7 @@ __aesni_encrypt4: neg ecx db 15,31,64,0 add ecx,16 -L$006enc4_loop: +L$007enc4_loop: db 102,15,56,220,209 db 102,15,56,220,217 db 102,15,56,220,225 @@ -201,7 +222,7 @@ db 102,15,56,220,216 db 102,15,56,220,224 db 102,15,56,220,232 movups xmm0,[ecx*1+edx-16] - jnz NEAR L$006enc4_loop + jnz NEAR 
L$007enc4_loop db 102,15,56,220,209 db 102,15,56,220,217 db 102,15,56,220,225 @@ -225,7 +246,7 @@ __aesni_decrypt4: neg ecx db 15,31,64,0 add ecx,16 -L$007dec4_loop: +L$008dec4_loop: db 102,15,56,222,209 db 102,15,56,222,217 db 102,15,56,222,225 @@ -237,7 +258,7 @@ db 102,15,56,222,216 db 102,15,56,222,224 db 102,15,56,222,232 movups xmm0,[ecx*1+edx-16] - jnz NEAR L$007dec4_loop + jnz NEAR L$008dec4_loop db 102,15,56,222,209 db 102,15,56,222,217 db 102,15,56,222,225 @@ -265,13 +286,13 @@ db 102,15,56,220,225 pxor xmm7,xmm0 movups xmm0,[ecx*1+edx] add ecx,16 - jmp NEAR L$008_aesni_encrypt6_inner + jmp NEAR L$009_aesni_encrypt6_inner align 16 -L$009enc6_loop: +L$010enc6_loop: db 102,15,56,220,209 db 102,15,56,220,217 db 102,15,56,220,225 -L$008_aesni_encrypt6_inner: +L$009_aesni_encrypt6_inner: db 102,15,56,220,233 db 102,15,56,220,241 db 102,15,56,220,249 @@ -285,7 +306,7 @@ db 102,15,56,220,232 db 102,15,56,220,240 db 102,15,56,220,248 movups xmm0,[ecx*1+edx-16] - jnz NEAR L$009enc6_loop + jnz NEAR L$010enc6_loop db 102,15,56,220,209 db 102,15,56,220,217 db 102,15,56,220,225 @@ -317,13 +338,13 @@ db 102,15,56,222,225 pxor xmm7,xmm0 movups xmm0,[ecx*1+edx] add ecx,16 - jmp NEAR L$010_aesni_decrypt6_inner + jmp NEAR L$011_aesni_decrypt6_inner align 16 -L$011dec6_loop: +L$012dec6_loop: db 102,15,56,222,209 db 102,15,56,222,217 db 102,15,56,222,225 -L$010_aesni_decrypt6_inner: +L$011_aesni_decrypt6_inner: db 102,15,56,222,233 db 102,15,56,222,241 db 102,15,56,222,249 @@ -337,7 +358,7 @@ db 102,15,56,222,232 db 102,15,56,222,240 db 102,15,56,222,248 movups xmm0,[ecx*1+edx-16] - jnz NEAR L$011dec6_loop + jnz NEAR L$012dec6_loop db 102,15,56,222,209 db 102,15,56,222,217 db 102,15,56,222,225 @@ -351,10 +372,10 @@ db 102,15,56,223,232 db 102,15,56,223,240 db 102,15,56,223,248 ret -global _aesni_ecb_encrypt +global _aes_hw_ecb_encrypt align 16 -_aesni_ecb_encrypt: -L$_aesni_ecb_encrypt_begin: +_aes_hw_ecb_encrypt: +L$_aes_hw_ecb_encrypt_begin: push ebp push ebx push esi @@ -365,14 +386,14 @@ L$_aesni_ecb_encrypt_begin: mov edx,DWORD [32+esp] mov ebx,DWORD [36+esp] and eax,-16 - jz NEAR L$012ecb_ret + jz NEAR L$013ecb_ret mov ecx,DWORD [240+edx] test ebx,ebx - jz NEAR L$013ecb_decrypt + jz NEAR L$014ecb_decrypt mov ebp,edx mov ebx,ecx cmp eax,96 - jb NEAR L$014ecb_enc_tail + jb NEAR L$015ecb_enc_tail movdqu xmm2,[esi] movdqu xmm3,[16+esi] movdqu xmm4,[32+esi] @@ -381,9 +402,9 @@ L$_aesni_ecb_encrypt_begin: movdqu xmm7,[80+esi] lea esi,[96+esi] sub eax,96 - jmp NEAR L$015ecb_enc_loop6_enter + jmp NEAR L$016ecb_enc_loop6_enter align 16 -L$016ecb_enc_loop6: +L$017ecb_enc_loop6: movups [edi],xmm2 movdqu xmm2,[esi] movups [16+edi],xmm3 @@ -398,12 +419,12 @@ L$016ecb_enc_loop6: lea edi,[96+edi] movdqu xmm7,[80+esi] lea esi,[96+esi] -L$015ecb_enc_loop6_enter: +L$016ecb_enc_loop6_enter: call __aesni_encrypt6 mov edx,ebp mov ecx,ebx sub eax,96 - jnc NEAR L$016ecb_enc_loop6 + jnc NEAR L$017ecb_enc_loop6 movups [edi],xmm2 movups [16+edi],xmm3 movups [32+edi],xmm4 @@ -412,18 +433,18 @@ L$015ecb_enc_loop6_enter: movups [80+edi],xmm7 lea edi,[96+edi] add eax,96 - jz NEAR L$012ecb_ret -L$014ecb_enc_tail: + jz NEAR L$013ecb_ret +L$015ecb_enc_tail: movups xmm2,[esi] cmp eax,32 - jb NEAR L$017ecb_enc_one + jb NEAR L$018ecb_enc_one movups xmm3,[16+esi] - je NEAR L$018ecb_enc_two + je NEAR L$019ecb_enc_two movups xmm4,[32+esi] cmp eax,64 - jb NEAR L$019ecb_enc_three + jb NEAR L$020ecb_enc_three movups xmm5,[48+esi] - je NEAR L$020ecb_enc_four + je NEAR L$021ecb_enc_four movups xmm6,[64+esi] xorps xmm7,xmm7 call 
__aesni_encrypt6 @@ -432,49 +453,49 @@ L$014ecb_enc_tail: movups [32+edi],xmm4 movups [48+edi],xmm5 movups [64+edi],xmm6 - jmp NEAR L$012ecb_ret + jmp NEAR L$013ecb_ret align 16 -L$017ecb_enc_one: +L$018ecb_enc_one: movups xmm0,[edx] movups xmm1,[16+edx] lea edx,[32+edx] xorps xmm2,xmm0 -L$021enc1_loop_3: +L$022enc1_loop_3: db 102,15,56,220,209 dec ecx movups xmm1,[edx] lea edx,[16+edx] - jnz NEAR L$021enc1_loop_3 + jnz NEAR L$022enc1_loop_3 db 102,15,56,221,209 movups [edi],xmm2 - jmp NEAR L$012ecb_ret + jmp NEAR L$013ecb_ret align 16 -L$018ecb_enc_two: +L$019ecb_enc_two: call __aesni_encrypt2 movups [edi],xmm2 movups [16+edi],xmm3 - jmp NEAR L$012ecb_ret + jmp NEAR L$013ecb_ret align 16 -L$019ecb_enc_three: +L$020ecb_enc_three: call __aesni_encrypt3 movups [edi],xmm2 movups [16+edi],xmm3 movups [32+edi],xmm4 - jmp NEAR L$012ecb_ret + jmp NEAR L$013ecb_ret align 16 -L$020ecb_enc_four: +L$021ecb_enc_four: call __aesni_encrypt4 movups [edi],xmm2 movups [16+edi],xmm3 movups [32+edi],xmm4 movups [48+edi],xmm5 - jmp NEAR L$012ecb_ret + jmp NEAR L$013ecb_ret align 16 -L$013ecb_decrypt: +L$014ecb_decrypt: mov ebp,edx mov ebx,ecx cmp eax,96 - jb NEAR L$022ecb_dec_tail + jb NEAR L$023ecb_dec_tail movdqu xmm2,[esi] movdqu xmm3,[16+esi] movdqu xmm4,[32+esi] @@ -483,9 +504,9 @@ L$013ecb_decrypt: movdqu xmm7,[80+esi] lea esi,[96+esi] sub eax,96 - jmp NEAR L$023ecb_dec_loop6_enter + jmp NEAR L$024ecb_dec_loop6_enter align 16 -L$024ecb_dec_loop6: +L$025ecb_dec_loop6: movups [edi],xmm2 movdqu xmm2,[esi] movups [16+edi],xmm3 @@ -500,12 +521,12 @@ L$024ecb_dec_loop6: lea edi,[96+edi] movdqu xmm7,[80+esi] lea esi,[96+esi] -L$023ecb_dec_loop6_enter: +L$024ecb_dec_loop6_enter: call __aesni_decrypt6 mov edx,ebp mov ecx,ebx sub eax,96 - jnc NEAR L$024ecb_dec_loop6 + jnc NEAR L$025ecb_dec_loop6 movups [edi],xmm2 movups [16+edi],xmm3 movups [32+edi],xmm4 @@ -514,18 +535,18 @@ L$023ecb_dec_loop6_enter: movups [80+edi],xmm7 lea edi,[96+edi] add eax,96 - jz NEAR L$012ecb_ret -L$022ecb_dec_tail: + jz NEAR L$013ecb_ret +L$023ecb_dec_tail: movups xmm2,[esi] cmp eax,32 - jb NEAR L$025ecb_dec_one + jb NEAR L$026ecb_dec_one movups xmm3,[16+esi] - je NEAR L$026ecb_dec_two + je NEAR L$027ecb_dec_two movups xmm4,[32+esi] cmp eax,64 - jb NEAR L$027ecb_dec_three + jb NEAR L$028ecb_dec_three movups xmm5,[48+esi] - je NEAR L$028ecb_dec_four + je NEAR L$029ecb_dec_four movups xmm6,[64+esi] xorps xmm7,xmm7 call __aesni_decrypt6 @@ -534,43 +555,43 @@ L$022ecb_dec_tail: movups [32+edi],xmm4 movups [48+edi],xmm5 movups [64+edi],xmm6 - jmp NEAR L$012ecb_ret + jmp NEAR L$013ecb_ret align 16 -L$025ecb_dec_one: +L$026ecb_dec_one: movups xmm0,[edx] movups xmm1,[16+edx] lea edx,[32+edx] xorps xmm2,xmm0 -L$029dec1_loop_4: +L$030dec1_loop_4: db 102,15,56,222,209 dec ecx movups xmm1,[edx] lea edx,[16+edx] - jnz NEAR L$029dec1_loop_4 + jnz NEAR L$030dec1_loop_4 db 102,15,56,223,209 movups [edi],xmm2 - jmp NEAR L$012ecb_ret + jmp NEAR L$013ecb_ret align 16 -L$026ecb_dec_two: +L$027ecb_dec_two: call __aesni_decrypt2 movups [edi],xmm2 movups [16+edi],xmm3 - jmp NEAR L$012ecb_ret + jmp NEAR L$013ecb_ret align 16 -L$027ecb_dec_three: +L$028ecb_dec_three: call __aesni_decrypt3 movups [edi],xmm2 movups [16+edi],xmm3 movups [32+edi],xmm4 - jmp NEAR L$012ecb_ret + jmp NEAR L$013ecb_ret align 16 -L$028ecb_dec_four: +L$029ecb_dec_four: call __aesni_decrypt4 movups [edi],xmm2 movups [16+edi],xmm3 movups [32+edi],xmm4 movups [48+edi],xmm5 -L$012ecb_ret: +L$013ecb_ret: pxor xmm0,xmm0 pxor xmm1,xmm1 pxor xmm2,xmm2 @@ -584,10 +605,10 @@ L$012ecb_ret: pop ebx 
pop ebp ret -global _aesni_ccm64_encrypt_blocks +global _aes_hw_ccm64_encrypt_blocks align 16 -_aesni_ccm64_encrypt_blocks: -L$_aesni_ccm64_encrypt_blocks_begin: +_aes_hw_ccm64_encrypt_blocks: +L$_aes_hw_ccm64_encrypt_blocks_begin: push ebp push ebx push esi @@ -623,7 +644,7 @@ L$_aesni_ccm64_encrypt_blocks_begin: lea edx,[32+ecx*1+edx] sub ebx,ecx db 102,15,56,0,253 -L$030ccm64_enc_outer: +L$031ccm64_enc_outer: movups xmm0,[ebp] mov ecx,ebx movups xmm6,[esi] @@ -632,7 +653,7 @@ L$030ccm64_enc_outer: xorps xmm0,xmm6 xorps xmm3,xmm0 movups xmm0,[32+ebp] -L$031ccm64_enc2_loop: +L$032ccm64_enc2_loop: db 102,15,56,220,209 db 102,15,56,220,217 movups xmm1,[ecx*1+edx] @@ -640,7 +661,7 @@ db 102,15,56,220,217 db 102,15,56,220,208 db 102,15,56,220,216 movups xmm0,[ecx*1+edx-16] - jnz NEAR L$031ccm64_enc2_loop + jnz NEAR L$032ccm64_enc2_loop db 102,15,56,220,209 db 102,15,56,220,217 paddq xmm7,[16+esp] @@ -653,7 +674,7 @@ db 102,15,56,221,216 movups [edi],xmm6 db 102,15,56,0,213 lea edi,[16+edi] - jnz NEAR L$030ccm64_enc_outer + jnz NEAR L$031ccm64_enc_outer mov esp,DWORD [48+esp] mov edi,DWORD [40+esp] movups [edi],xmm3 @@ -670,10 +691,10 @@ db 102,15,56,0,213 pop ebx pop ebp ret -global _aesni_ccm64_decrypt_blocks +global _aes_hw_ccm64_decrypt_blocks align 16 -_aesni_ccm64_decrypt_blocks: -L$_aesni_ccm64_decrypt_blocks_begin: +_aes_hw_ccm64_decrypt_blocks: +L$_aes_hw_ccm64_decrypt_blocks_begin: push ebp push ebx push esi @@ -710,12 +731,12 @@ db 102,15,56,0,253 movups xmm1,[16+edx] lea edx,[32+edx] xorps xmm2,xmm0 -L$032enc1_loop_5: +L$033enc1_loop_5: db 102,15,56,220,209 dec ecx movups xmm1,[edx] lea edx,[16+edx] - jnz NEAR L$032enc1_loop_5 + jnz NEAR L$033enc1_loop_5 db 102,15,56,221,209 shl ebx,4 mov ecx,16 @@ -725,16 +746,16 @@ db 102,15,56,221,209 sub ecx,ebx lea edx,[32+ebx*1+ebp] mov ebx,ecx - jmp NEAR L$033ccm64_dec_outer + jmp NEAR L$034ccm64_dec_outer align 16 -L$033ccm64_dec_outer: +L$034ccm64_dec_outer: xorps xmm6,xmm2 movdqa xmm2,xmm7 movups [edi],xmm6 lea edi,[16+edi] db 102,15,56,0,213 sub eax,1 - jz NEAR L$034ccm64_dec_break + jz NEAR L$035ccm64_dec_break movups xmm0,[ebp] mov ecx,ebx movups xmm1,[16+ebp] @@ -742,7 +763,7 @@ db 102,15,56,0,213 xorps xmm2,xmm0 xorps xmm3,xmm6 movups xmm0,[32+ebp] -L$035ccm64_dec2_loop: +L$036ccm64_dec2_loop: db 102,15,56,220,209 db 102,15,56,220,217 movups xmm1,[ecx*1+edx] @@ -750,7 +771,7 @@ db 102,15,56,220,217 db 102,15,56,220,208 db 102,15,56,220,216 movups xmm0,[ecx*1+edx-16] - jnz NEAR L$035ccm64_dec2_loop + jnz NEAR L$036ccm64_dec2_loop movups xmm6,[esi] paddq xmm7,[16+esp] db 102,15,56,220,209 @@ -758,9 +779,9 @@ db 102,15,56,220,217 db 102,15,56,221,208 db 102,15,56,221,216 lea esi,[16+esi] - jmp NEAR L$033ccm64_dec_outer + jmp NEAR L$034ccm64_dec_outer align 16 -L$034ccm64_dec_break: +L$035ccm64_dec_break: mov ecx,DWORD [240+ebp] mov edx,ebp movups xmm0,[edx] @@ -768,12 +789,12 @@ L$034ccm64_dec_break: xorps xmm6,xmm0 lea edx,[32+edx] xorps xmm3,xmm6 -L$036enc1_loop_6: +L$037enc1_loop_6: db 102,15,56,220,217 dec ecx movups xmm1,[edx] lea edx,[16+edx] - jnz NEAR L$036enc1_loop_6 + jnz NEAR L$037enc1_loop_6 db 102,15,56,221,217 mov esp,DWORD [48+esp] mov edi,DWORD [40+esp] @@ -791,14 +812,26 @@ db 102,15,56,221,217 pop ebx pop ebp ret -global _aesni_ctr32_encrypt_blocks +global _aes_hw_ctr32_encrypt_blocks align 16 -_aesni_ctr32_encrypt_blocks: -L$_aesni_ctr32_encrypt_blocks_begin: +_aes_hw_ctr32_encrypt_blocks: +L$_aes_hw_ctr32_encrypt_blocks_begin: push ebp push ebx push esi push edi +%ifdef BORINGSSL_DISPATCH_TEST + push ebx + push edx + 
call L$038pic +L$038pic: + pop ebx + lea ebx,[(_BORINGSSL_function_hit+0-L$038pic)+ebx] + mov edx,1 + mov BYTE [ebx],dl + pop edx + pop ebx +%endif mov esi,DWORD [20+esp] mov edi,DWORD [24+esp] mov eax,DWORD [28+esp] @@ -809,7 +842,7 @@ L$_aesni_ctr32_encrypt_blocks_begin: and esp,-16 mov DWORD [80+esp],ebp cmp eax,1 - je NEAR L$037ctr32_one_shortcut + je NEAR L$039ctr32_one_shortcut movdqu xmm7,[ebx] mov DWORD [esp],202182159 mov DWORD [4+esp],134810123 @@ -847,7 +880,7 @@ db 102,15,56,0,202 pshufd xmm2,xmm0,192 pshufd xmm3,xmm0,128 cmp eax,6 - jb NEAR L$038ctr32_tail + jb NEAR L$040ctr32_tail pxor xmm7,xmm6 shl ecx,4 mov ebx,16 @@ -856,9 +889,9 @@ db 102,15,56,0,202 sub ebx,ecx lea edx,[32+ecx*1+edx] sub eax,6 - jmp NEAR L$039ctr32_loop6 + jmp NEAR L$041ctr32_loop6 align 16 -L$039ctr32_loop6: +L$041ctr32_loop6: pshufd xmm4,xmm0,64 movdqa xmm0,[32+esp] pshufd xmm5,xmm1,192 @@ -912,27 +945,27 @@ db 102,15,56,0,202 lea edi,[96+edi] pshufd xmm3,xmm0,128 sub eax,6 - jnc NEAR L$039ctr32_loop6 + jnc NEAR L$041ctr32_loop6 add eax,6 - jz NEAR L$040ctr32_ret + jz NEAR L$042ctr32_ret movdqu xmm7,[ebp] mov edx,ebp pxor xmm7,[32+esp] mov ecx,DWORD [240+ebp] -L$038ctr32_tail: +L$040ctr32_tail: por xmm2,xmm7 cmp eax,2 - jb NEAR L$041ctr32_one + jb NEAR L$043ctr32_one pshufd xmm4,xmm0,64 por xmm3,xmm7 - je NEAR L$042ctr32_two + je NEAR L$044ctr32_two pshufd xmm5,xmm1,192 por xmm4,xmm7 cmp eax,4 - jb NEAR L$043ctr32_three + jb NEAR L$045ctr32_three pshufd xmm6,xmm1,128 por xmm5,xmm7 - je NEAR L$044ctr32_four + je NEAR L$046ctr32_four por xmm6,xmm7 call __aesni_encrypt6 movups xmm1,[esi] @@ -950,29 +983,29 @@ L$038ctr32_tail: movups [32+edi],xmm4 movups [48+edi],xmm5 movups [64+edi],xmm6 - jmp NEAR L$040ctr32_ret + jmp NEAR L$042ctr32_ret align 16 -L$037ctr32_one_shortcut: +L$039ctr32_one_shortcut: movups xmm2,[ebx] mov ecx,DWORD [240+edx] -L$041ctr32_one: +L$043ctr32_one: movups xmm0,[edx] movups xmm1,[16+edx] lea edx,[32+edx] xorps xmm2,xmm0 -L$045enc1_loop_7: +L$047enc1_loop_7: db 102,15,56,220,209 dec ecx movups xmm1,[edx] lea edx,[16+edx] - jnz NEAR L$045enc1_loop_7 + jnz NEAR L$047enc1_loop_7 db 102,15,56,221,209 movups xmm6,[esi] xorps xmm6,xmm2 movups [edi],xmm6 - jmp NEAR L$040ctr32_ret + jmp NEAR L$042ctr32_ret align 16 -L$042ctr32_two: +L$044ctr32_two: call __aesni_encrypt2 movups xmm5,[esi] movups xmm6,[16+esi] @@ -980,9 +1013,9 @@ L$042ctr32_two: xorps xmm3,xmm6 movups [edi],xmm2 movups [16+edi],xmm3 - jmp NEAR L$040ctr32_ret + jmp NEAR L$042ctr32_ret align 16 -L$043ctr32_three: +L$045ctr32_three: call __aesni_encrypt3 movups xmm5,[esi] movups xmm6,[16+esi] @@ -993,9 +1026,9 @@ L$043ctr32_three: xorps xmm4,xmm7 movups [16+edi],xmm3 movups [32+edi],xmm4 - jmp NEAR L$040ctr32_ret + jmp NEAR L$042ctr32_ret align 16 -L$044ctr32_four: +L$046ctr32_four: call __aesni_encrypt4 movups xmm6,[esi] movups xmm7,[16+esi] @@ -1009,7 +1042,7 @@ L$044ctr32_four: xorps xmm5,xmm0 movups [32+edi],xmm4 movups [48+edi],xmm5 -L$040ctr32_ret: +L$042ctr32_ret: pxor xmm0,xmm0 pxor xmm1,xmm1 pxor xmm2,xmm2 @@ -1027,10 +1060,10 @@ L$040ctr32_ret: pop ebx pop ebp ret -global _aesni_xts_encrypt +global _aes_hw_xts_encrypt align 16 -_aesni_xts_encrypt: -L$_aesni_xts_encrypt_begin: +_aes_hw_xts_encrypt: +L$_aes_hw_xts_encrypt_begin: push ebp push ebx push esi @@ -1043,12 +1076,12 @@ L$_aesni_xts_encrypt_begin: movups xmm1,[16+edx] lea edx,[32+edx] xorps xmm2,xmm0 -L$046enc1_loop_8: +L$048enc1_loop_8: db 102,15,56,220,209 dec ecx movups xmm1,[edx] lea edx,[16+edx] - jnz NEAR L$046enc1_loop_8 + jnz NEAR L$048enc1_loop_8 db 
102,15,56,221,209 mov esi,DWORD [20+esp] mov edi,DWORD [24+esp] @@ -1072,14 +1105,14 @@ db 102,15,56,221,209 mov ebp,edx mov ebx,ecx sub eax,96 - jc NEAR L$047xts_enc_short + jc NEAR L$049xts_enc_short shl ecx,4 mov ebx,16 sub ebx,ecx lea edx,[32+ecx*1+edx] - jmp NEAR L$048xts_enc_loop6 + jmp NEAR L$050xts_enc_loop6 align 16 -L$048xts_enc_loop6: +L$050xts_enc_loop6: pshufd xmm2,xmm0,19 pxor xmm0,xmm0 movdqa [esp],xmm1 @@ -1168,23 +1201,23 @@ db 102,15,56,220,249 pcmpgtd xmm0,xmm1 pxor xmm1,xmm2 sub eax,96 - jnc NEAR L$048xts_enc_loop6 + jnc NEAR L$050xts_enc_loop6 mov ecx,DWORD [240+ebp] mov edx,ebp mov ebx,ecx -L$047xts_enc_short: +L$049xts_enc_short: add eax,96 - jz NEAR L$049xts_enc_done6x + jz NEAR L$051xts_enc_done6x movdqa xmm5,xmm1 cmp eax,32 - jb NEAR L$050xts_enc_one + jb NEAR L$052xts_enc_one pshufd xmm2,xmm0,19 pxor xmm0,xmm0 paddq xmm1,xmm1 pand xmm2,xmm3 pcmpgtd xmm0,xmm1 pxor xmm1,xmm2 - je NEAR L$051xts_enc_two + je NEAR L$053xts_enc_two pshufd xmm2,xmm0,19 pxor xmm0,xmm0 movdqa xmm6,xmm1 @@ -1193,7 +1226,7 @@ L$047xts_enc_short: pcmpgtd xmm0,xmm1 pxor xmm1,xmm2 cmp eax,64 - jb NEAR L$052xts_enc_three + jb NEAR L$054xts_enc_three pshufd xmm2,xmm0,19 pxor xmm0,xmm0 movdqa xmm7,xmm1 @@ -1203,7 +1236,7 @@ L$047xts_enc_short: pxor xmm1,xmm2 movdqa [esp],xmm5 movdqa [16+esp],xmm6 - je NEAR L$053xts_enc_four + je NEAR L$055xts_enc_four movdqa [32+esp],xmm7 pshufd xmm7,xmm0,19 movdqa [48+esp],xmm1 @@ -1235,9 +1268,9 @@ L$047xts_enc_short: movups [48+edi],xmm5 movups [64+edi],xmm6 lea edi,[80+edi] - jmp NEAR L$054xts_enc_done + jmp NEAR L$056xts_enc_done align 16 -L$050xts_enc_one: +L$052xts_enc_one: movups xmm2,[esi] lea esi,[16+esi] xorps xmm2,xmm5 @@ -1245,20 +1278,20 @@ L$050xts_enc_one: movups xmm1,[16+edx] lea edx,[32+edx] xorps xmm2,xmm0 -L$055enc1_loop_9: +L$057enc1_loop_9: db 102,15,56,220,209 dec ecx movups xmm1,[edx] lea edx,[16+edx] - jnz NEAR L$055enc1_loop_9 + jnz NEAR L$057enc1_loop_9 db 102,15,56,221,209 xorps xmm2,xmm5 movups [edi],xmm2 lea edi,[16+edi] movdqa xmm1,xmm5 - jmp NEAR L$054xts_enc_done + jmp NEAR L$056xts_enc_done align 16 -L$051xts_enc_two: +L$053xts_enc_two: movaps xmm6,xmm1 movups xmm2,[esi] movups xmm3,[16+esi] @@ -1272,9 +1305,9 @@ L$051xts_enc_two: movups [16+edi],xmm3 lea edi,[32+edi] movdqa xmm1,xmm6 - jmp NEAR L$054xts_enc_done + jmp NEAR L$056xts_enc_done align 16 -L$052xts_enc_three: +L$054xts_enc_three: movaps xmm7,xmm1 movups xmm2,[esi] movups xmm3,[16+esi] @@ -1292,9 +1325,9 @@ L$052xts_enc_three: movups [32+edi],xmm4 lea edi,[48+edi] movdqa xmm1,xmm7 - jmp NEAR L$054xts_enc_done + jmp NEAR L$056xts_enc_done align 16 -L$053xts_enc_four: +L$055xts_enc_four: movaps xmm6,xmm1 movups xmm2,[esi] movups xmm3,[16+esi] @@ -1316,28 +1349,28 @@ L$053xts_enc_four: movups [48+edi],xmm5 lea edi,[64+edi] movdqa xmm1,xmm6 - jmp NEAR L$054xts_enc_done + jmp NEAR L$056xts_enc_done align 16 -L$049xts_enc_done6x: +L$051xts_enc_done6x: mov eax,DWORD [112+esp] and eax,15 - jz NEAR L$056xts_enc_ret + jz NEAR L$058xts_enc_ret movdqa xmm5,xmm1 mov DWORD [112+esp],eax - jmp NEAR L$057xts_enc_steal + jmp NEAR L$059xts_enc_steal align 16 -L$054xts_enc_done: +L$056xts_enc_done: mov eax,DWORD [112+esp] pxor xmm0,xmm0 and eax,15 - jz NEAR L$056xts_enc_ret + jz NEAR L$058xts_enc_ret pcmpgtd xmm0,xmm1 mov DWORD [112+esp],eax pshufd xmm5,xmm0,19 paddq xmm1,xmm1 pand xmm5,[96+esp] pxor xmm5,xmm1 -L$057xts_enc_steal: +L$059xts_enc_steal: movzx ecx,BYTE [esi] movzx edx,BYTE [edi-16] lea esi,[1+esi] @@ -1345,7 +1378,7 @@ L$057xts_enc_steal: mov BYTE [edi],dl lea edi,[1+edi] 
sub eax,1 - jnz NEAR L$057xts_enc_steal + jnz NEAR L$059xts_enc_steal sub edi,DWORD [112+esp] mov edx,ebp mov ecx,ebx @@ -1355,16 +1388,16 @@ L$057xts_enc_steal: movups xmm1,[16+edx] lea edx,[32+edx] xorps xmm2,xmm0 -L$058enc1_loop_10: +L$060enc1_loop_10: db 102,15,56,220,209 dec ecx movups xmm1,[edx] lea edx,[16+edx] - jnz NEAR L$058enc1_loop_10 + jnz NEAR L$060enc1_loop_10 db 102,15,56,221,209 xorps xmm2,xmm5 movups [edi-16],xmm2 -L$056xts_enc_ret: +L$058xts_enc_ret: pxor xmm0,xmm0 pxor xmm1,xmm1 pxor xmm2,xmm2 @@ -1385,10 +1418,10 @@ L$056xts_enc_ret: pop ebx pop ebp ret -global _aesni_xts_decrypt +global _aes_hw_xts_decrypt align 16 -_aesni_xts_decrypt: -L$_aesni_xts_decrypt_begin: +_aes_hw_xts_decrypt: +L$_aes_hw_xts_decrypt_begin: push ebp push ebx push esi @@ -1401,12 +1434,12 @@ L$_aesni_xts_decrypt_begin: movups xmm1,[16+edx] lea edx,[32+edx] xorps xmm2,xmm0 -L$059enc1_loop_11: +L$061enc1_loop_11: db 102,15,56,220,209 dec ecx movups xmm1,[edx] lea edx,[16+edx] - jnz NEAR L$059enc1_loop_11 + jnz NEAR L$061enc1_loop_11 db 102,15,56,221,209 mov esi,DWORD [20+esp] mov edi,DWORD [24+esp] @@ -1435,14 +1468,14 @@ db 102,15,56,221,209 pcmpgtd xmm0,xmm1 and eax,-16 sub eax,96 - jc NEAR L$060xts_dec_short + jc NEAR L$062xts_dec_short shl ecx,4 mov ebx,16 sub ebx,ecx lea edx,[32+ecx*1+edx] - jmp NEAR L$061xts_dec_loop6 + jmp NEAR L$063xts_dec_loop6 align 16 -L$061xts_dec_loop6: +L$063xts_dec_loop6: pshufd xmm2,xmm0,19 pxor xmm0,xmm0 movdqa [esp],xmm1 @@ -1531,23 +1564,23 @@ db 102,15,56,222,249 pcmpgtd xmm0,xmm1 pxor xmm1,xmm2 sub eax,96 - jnc NEAR L$061xts_dec_loop6 + jnc NEAR L$063xts_dec_loop6 mov ecx,DWORD [240+ebp] mov edx,ebp mov ebx,ecx -L$060xts_dec_short: +L$062xts_dec_short: add eax,96 - jz NEAR L$062xts_dec_done6x + jz NEAR L$064xts_dec_done6x movdqa xmm5,xmm1 cmp eax,32 - jb NEAR L$063xts_dec_one + jb NEAR L$065xts_dec_one pshufd xmm2,xmm0,19 pxor xmm0,xmm0 paddq xmm1,xmm1 pand xmm2,xmm3 pcmpgtd xmm0,xmm1 pxor xmm1,xmm2 - je NEAR L$064xts_dec_two + je NEAR L$066xts_dec_two pshufd xmm2,xmm0,19 pxor xmm0,xmm0 movdqa xmm6,xmm1 @@ -1556,7 +1589,7 @@ L$060xts_dec_short: pcmpgtd xmm0,xmm1 pxor xmm1,xmm2 cmp eax,64 - jb NEAR L$065xts_dec_three + jb NEAR L$067xts_dec_three pshufd xmm2,xmm0,19 pxor xmm0,xmm0 movdqa xmm7,xmm1 @@ -1566,7 +1599,7 @@ L$060xts_dec_short: pxor xmm1,xmm2 movdqa [esp],xmm5 movdqa [16+esp],xmm6 - je NEAR L$066xts_dec_four + je NEAR L$068xts_dec_four movdqa [32+esp],xmm7 pshufd xmm7,xmm0,19 movdqa [48+esp],xmm1 @@ -1598,9 +1631,9 @@ L$060xts_dec_short: movups [48+edi],xmm5 movups [64+edi],xmm6 lea edi,[80+edi] - jmp NEAR L$067xts_dec_done + jmp NEAR L$069xts_dec_done align 16 -L$063xts_dec_one: +L$065xts_dec_one: movups xmm2,[esi] lea esi,[16+esi] xorps xmm2,xmm5 @@ -1608,20 +1641,20 @@ L$063xts_dec_one: movups xmm1,[16+edx] lea edx,[32+edx] xorps xmm2,xmm0 -L$068dec1_loop_12: +L$070dec1_loop_12: db 102,15,56,222,209 dec ecx movups xmm1,[edx] lea edx,[16+edx] - jnz NEAR L$068dec1_loop_12 + jnz NEAR L$070dec1_loop_12 db 102,15,56,223,209 xorps xmm2,xmm5 movups [edi],xmm2 lea edi,[16+edi] movdqa xmm1,xmm5 - jmp NEAR L$067xts_dec_done + jmp NEAR L$069xts_dec_done align 16 -L$064xts_dec_two: +L$066xts_dec_two: movaps xmm6,xmm1 movups xmm2,[esi] movups xmm3,[16+esi] @@ -1635,9 +1668,9 @@ L$064xts_dec_two: movups [16+edi],xmm3 lea edi,[32+edi] movdqa xmm1,xmm6 - jmp NEAR L$067xts_dec_done + jmp NEAR L$069xts_dec_done align 16 -L$065xts_dec_three: +L$067xts_dec_three: movaps xmm7,xmm1 movups xmm2,[esi] movups xmm3,[16+esi] @@ -1655,9 +1688,9 @@ L$065xts_dec_three: movups 
[32+edi],xmm4 lea edi,[48+edi] movdqa xmm1,xmm7 - jmp NEAR L$067xts_dec_done + jmp NEAR L$069xts_dec_done align 16 -L$066xts_dec_four: +L$068xts_dec_four: movaps xmm6,xmm1 movups xmm2,[esi] movups xmm3,[16+esi] @@ -1679,20 +1712,20 @@ L$066xts_dec_four: movups [48+edi],xmm5 lea edi,[64+edi] movdqa xmm1,xmm6 - jmp NEAR L$067xts_dec_done + jmp NEAR L$069xts_dec_done align 16 -L$062xts_dec_done6x: +L$064xts_dec_done6x: mov eax,DWORD [112+esp] and eax,15 - jz NEAR L$069xts_dec_ret + jz NEAR L$071xts_dec_ret mov DWORD [112+esp],eax - jmp NEAR L$070xts_dec_only_one_more + jmp NEAR L$072xts_dec_only_one_more align 16 -L$067xts_dec_done: +L$069xts_dec_done: mov eax,DWORD [112+esp] pxor xmm0,xmm0 and eax,15 - jz NEAR L$069xts_dec_ret + jz NEAR L$071xts_dec_ret pcmpgtd xmm0,xmm1 mov DWORD [112+esp],eax pshufd xmm2,xmm0,19 @@ -1702,7 +1735,7 @@ L$067xts_dec_done: pand xmm2,xmm3 pcmpgtd xmm0,xmm1 pxor xmm1,xmm2 -L$070xts_dec_only_one_more: +L$072xts_dec_only_one_more: pshufd xmm5,xmm0,19 movdqa xmm6,xmm1 paddq xmm1,xmm1 @@ -1716,16 +1749,16 @@ L$070xts_dec_only_one_more: movups xmm1,[16+edx] lea edx,[32+edx] xorps xmm2,xmm0 -L$071dec1_loop_13: +L$073dec1_loop_13: db 102,15,56,222,209 dec ecx movups xmm1,[edx] lea edx,[16+edx] - jnz NEAR L$071dec1_loop_13 + jnz NEAR L$073dec1_loop_13 db 102,15,56,223,209 xorps xmm2,xmm5 movups [edi],xmm2 -L$072xts_dec_steal: +L$074xts_dec_steal: movzx ecx,BYTE [16+esi] movzx edx,BYTE [edi] lea esi,[1+esi] @@ -1733,7 +1766,7 @@ L$072xts_dec_steal: mov BYTE [16+edi],dl lea edi,[1+edi] sub eax,1 - jnz NEAR L$072xts_dec_steal + jnz NEAR L$074xts_dec_steal sub edi,DWORD [112+esp] mov edx,ebp mov ecx,ebx @@ -1743,16 +1776,16 @@ L$072xts_dec_steal: movups xmm1,[16+edx] lea edx,[32+edx] xorps xmm2,xmm0 -L$073dec1_loop_14: +L$075dec1_loop_14: db 102,15,56,222,209 dec ecx movups xmm1,[edx] lea edx,[16+edx] - jnz NEAR L$073dec1_loop_14 + jnz NEAR L$075dec1_loop_14 db 102,15,56,223,209 xorps xmm2,xmm6 movups [edi],xmm2 -L$069xts_dec_ret: +L$071xts_dec_ret: pxor xmm0,xmm0 pxor xmm1,xmm1 pxor xmm2,xmm2 @@ -1773,10 +1806,10 @@ L$069xts_dec_ret: pop ebx pop ebp ret -global _aesni_cbc_encrypt +global _aes_hw_cbc_encrypt align 16 -_aesni_cbc_encrypt: -L$_aesni_cbc_encrypt_begin: +_aes_hw_cbc_encrypt: +L$_aes_hw_cbc_encrypt_begin: push ebp push ebx push esi @@ -1790,7 +1823,7 @@ L$_aesni_cbc_encrypt_begin: mov edx,DWORD [32+esp] mov ebp,DWORD [36+esp] test eax,eax - jz NEAR L$074cbc_abort + jz NEAR L$076cbc_abort cmp DWORD [40+esp],0 xchg ebx,esp movups xmm7,[ebp] @@ -1798,14 +1831,14 @@ L$_aesni_cbc_encrypt_begin: mov ebp,edx mov DWORD [16+esp],ebx mov ebx,ecx - je NEAR L$075cbc_decrypt + je NEAR L$077cbc_decrypt movaps xmm2,xmm7 cmp eax,16 - jb NEAR L$076cbc_enc_tail + jb NEAR L$078cbc_enc_tail sub eax,16 - jmp NEAR L$077cbc_enc_loop + jmp NEAR L$079cbc_enc_loop align 16 -L$077cbc_enc_loop: +L$079cbc_enc_loop: movups xmm7,[esi] lea esi,[16+esi] movups xmm0,[edx] @@ -1813,25 +1846,25 @@ L$077cbc_enc_loop: xorps xmm7,xmm0 lea edx,[32+edx] xorps xmm2,xmm7 -L$078enc1_loop_15: +L$080enc1_loop_15: db 102,15,56,220,209 dec ecx movups xmm1,[edx] lea edx,[16+edx] - jnz NEAR L$078enc1_loop_15 + jnz NEAR L$080enc1_loop_15 db 102,15,56,221,209 mov ecx,ebx mov edx,ebp movups [edi],xmm2 lea edi,[16+edi] sub eax,16 - jnc NEAR L$077cbc_enc_loop + jnc NEAR L$079cbc_enc_loop add eax,16 - jnz NEAR L$076cbc_enc_tail + jnz NEAR L$078cbc_enc_tail movaps xmm7,xmm2 pxor xmm2,xmm2 - jmp NEAR L$079cbc_ret -L$076cbc_enc_tail: + jmp NEAR L$081cbc_ret +L$078cbc_enc_tail: mov ecx,eax dd 2767451785 mov ecx,16 @@ 
-1842,20 +1875,20 @@ dd 2868115081 mov ecx,ebx mov esi,edi mov edx,ebp - jmp NEAR L$077cbc_enc_loop + jmp NEAR L$079cbc_enc_loop align 16 -L$075cbc_decrypt: +L$077cbc_decrypt: cmp eax,80 - jbe NEAR L$080cbc_dec_tail + jbe NEAR L$082cbc_dec_tail movaps [esp],xmm7 sub eax,80 - jmp NEAR L$081cbc_dec_loop6_enter + jmp NEAR L$083cbc_dec_loop6_enter align 16 -L$082cbc_dec_loop6: +L$084cbc_dec_loop6: movaps [esp],xmm0 movups [edi],xmm7 lea edi,[16+edi] -L$081cbc_dec_loop6_enter: +L$083cbc_dec_loop6_enter: movdqu xmm2,[esi] movdqu xmm3,[16+esi] movdqu xmm4,[32+esi] @@ -1885,28 +1918,28 @@ L$081cbc_dec_loop6_enter: movups [64+edi],xmm6 lea edi,[80+edi] sub eax,96 - ja NEAR L$082cbc_dec_loop6 + ja NEAR L$084cbc_dec_loop6 movaps xmm2,xmm7 movaps xmm7,xmm0 add eax,80 - jle NEAR L$083cbc_dec_clear_tail_collected + jle NEAR L$085cbc_dec_clear_tail_collected movups [edi],xmm2 lea edi,[16+edi] -L$080cbc_dec_tail: +L$082cbc_dec_tail: movups xmm2,[esi] movaps xmm6,xmm2 cmp eax,16 - jbe NEAR L$084cbc_dec_one + jbe NEAR L$086cbc_dec_one movups xmm3,[16+esi] movaps xmm5,xmm3 cmp eax,32 - jbe NEAR L$085cbc_dec_two + jbe NEAR L$087cbc_dec_two movups xmm4,[32+esi] cmp eax,48 - jbe NEAR L$086cbc_dec_three + jbe NEAR L$088cbc_dec_three movups xmm5,[48+esi] cmp eax,64 - jbe NEAR L$087cbc_dec_four + jbe NEAR L$089cbc_dec_four movups xmm6,[64+esi] movaps [esp],xmm7 movups xmm2,[esi] @@ -1933,26 +1966,26 @@ L$080cbc_dec_tail: movaps xmm2,xmm6 pxor xmm6,xmm6 sub eax,80 - jmp NEAR L$088cbc_dec_tail_collected + jmp NEAR L$090cbc_dec_tail_collected align 16 -L$084cbc_dec_one: +L$086cbc_dec_one: movups xmm0,[edx] movups xmm1,[16+edx] lea edx,[32+edx] xorps xmm2,xmm0 -L$089dec1_loop_16: +L$091dec1_loop_16: db 102,15,56,222,209 dec ecx movups xmm1,[edx] lea edx,[16+edx] - jnz NEAR L$089dec1_loop_16 + jnz NEAR L$091dec1_loop_16 db 102,15,56,223,209 xorps xmm2,xmm7 movaps xmm7,xmm6 sub eax,16 - jmp NEAR L$088cbc_dec_tail_collected + jmp NEAR L$090cbc_dec_tail_collected align 16 -L$085cbc_dec_two: +L$087cbc_dec_two: call __aesni_decrypt2 xorps xmm2,xmm7 xorps xmm3,xmm6 @@ -1962,9 +1995,9 @@ L$085cbc_dec_two: lea edi,[16+edi] movaps xmm7,xmm5 sub eax,32 - jmp NEAR L$088cbc_dec_tail_collected + jmp NEAR L$090cbc_dec_tail_collected align 16 -L$086cbc_dec_three: +L$088cbc_dec_three: call __aesni_decrypt3 xorps xmm2,xmm7 xorps xmm3,xmm6 @@ -1977,9 +2010,9 @@ L$086cbc_dec_three: lea edi,[32+edi] movups xmm7,[32+esi] sub eax,48 - jmp NEAR L$088cbc_dec_tail_collected + jmp NEAR L$090cbc_dec_tail_collected align 16 -L$087cbc_dec_four: +L$089cbc_dec_four: call __aesni_decrypt4 movups xmm1,[16+esi] movups xmm0,[32+esi] @@ -1997,21 +2030,21 @@ L$087cbc_dec_four: movaps xmm2,xmm5 pxor xmm5,xmm5 sub eax,64 - jmp NEAR L$088cbc_dec_tail_collected + jmp NEAR L$090cbc_dec_tail_collected align 16 -L$083cbc_dec_clear_tail_collected: +L$085cbc_dec_clear_tail_collected: pxor xmm3,xmm3 pxor xmm4,xmm4 pxor xmm5,xmm5 pxor xmm6,xmm6 -L$088cbc_dec_tail_collected: +L$090cbc_dec_tail_collected: and eax,15 - jnz NEAR L$090cbc_dec_tail_partial + jnz NEAR L$092cbc_dec_tail_partial movups [edi],xmm2 pxor xmm0,xmm0 - jmp NEAR L$079cbc_ret + jmp NEAR L$081cbc_ret align 16 -L$090cbc_dec_tail_partial: +L$092cbc_dec_tail_partial: movaps [esp],xmm2 pxor xmm0,xmm0 mov ecx,16 @@ -2019,14 +2052,14 @@ L$090cbc_dec_tail_partial: sub ecx,eax dd 2767451785 movdqa [esp],xmm2 -L$079cbc_ret: +L$081cbc_ret: mov esp,DWORD [16+esp] mov ebp,DWORD [36+esp] pxor xmm2,xmm2 pxor xmm1,xmm1 movups [ebp],xmm7 pxor xmm7,xmm7 -L$074cbc_abort: +L$076cbc_abort: pop edi pop esi pop ebx @@ 
-2037,13 +2070,13 @@ __aesni_set_encrypt_key: push ebp push ebx test eax,eax - jz NEAR L$091bad_pointer + jz NEAR L$093bad_pointer test edx,edx - jz NEAR L$091bad_pointer - call L$092pic -L$092pic: + jz NEAR L$093bad_pointer + call L$094pic +L$094pic: pop ebx - lea ebx,[(L$key_const-L$092pic)+ebx] + lea ebx,[(L$key_const-L$094pic)+ebx] lea ebp,[_OPENSSL_ia32cap_P] movups xmm0,[eax] xorps xmm4,xmm4 @@ -2051,45 +2084,45 @@ L$092pic: lea edx,[16+edx] and ebp,268437504 cmp ecx,256 - je NEAR L$09314rounds + je NEAR L$09514rounds cmp ecx,192 - je NEAR L$09412rounds + je NEAR L$09612rounds cmp ecx,128 - jne NEAR L$095bad_keybits + jne NEAR L$097bad_keybits align 16 -L$09610rounds: +L$09810rounds: cmp ebp,268435456 - je NEAR L$09710rounds_alt + je NEAR L$09910rounds_alt mov ecx,9 movups [edx-16],xmm0 db 102,15,58,223,200,1 - call L$098key_128_cold + call L$100key_128_cold db 102,15,58,223,200,2 - call L$099key_128 + call L$101key_128 db 102,15,58,223,200,4 - call L$099key_128 + call L$101key_128 db 102,15,58,223,200,8 - call L$099key_128 + call L$101key_128 db 102,15,58,223,200,16 - call L$099key_128 + call L$101key_128 db 102,15,58,223,200,32 - call L$099key_128 + call L$101key_128 db 102,15,58,223,200,64 - call L$099key_128 + call L$101key_128 db 102,15,58,223,200,128 - call L$099key_128 + call L$101key_128 db 102,15,58,223,200,27 - call L$099key_128 + call L$101key_128 db 102,15,58,223,200,54 - call L$099key_128 + call L$101key_128 movups [edx],xmm0 mov DWORD [80+edx],ecx - jmp NEAR L$100good_key + jmp NEAR L$102good_key align 16 -L$099key_128: +L$101key_128: movups [edx],xmm0 lea edx,[16+edx] -L$098key_128_cold: +L$100key_128_cold: shufps xmm4,xmm0,16 xorps xmm0,xmm4 shufps xmm4,xmm0,140 @@ -2098,13 +2131,13 @@ L$098key_128_cold: xorps xmm0,xmm1 ret align 16 -L$09710rounds_alt: +L$09910rounds_alt: movdqa xmm5,[ebx] mov ecx,8 movdqa xmm4,[32+ebx] movdqa xmm2,xmm0 movdqu [edx-16],xmm0 -L$101loop_key128: +L$103loop_key128: db 102,15,56,0,197 db 102,15,56,221,196 pslld xmm4,1 @@ -2120,7 +2153,7 @@ db 102,15,56,221,196 movdqu [edx-16],xmm0 movdqa xmm2,xmm0 dec ecx - jnz NEAR L$101loop_key128 + jnz NEAR L$103loop_key128 movdqa xmm4,[48+ebx] db 102,15,56,0,197 db 102,15,56,221,196 @@ -2148,41 +2181,41 @@ db 102,15,56,221,196 movdqu [16+edx],xmm0 mov ecx,9 mov DWORD [96+edx],ecx - jmp NEAR L$100good_key + jmp NEAR L$102good_key align 16 -L$09412rounds: +L$09612rounds: movq xmm2,[16+eax] cmp ebp,268435456 - je NEAR L$10212rounds_alt + je NEAR L$10412rounds_alt mov ecx,11 movups [edx-16],xmm0 db 102,15,58,223,202,1 - call L$103key_192a_cold + call L$105key_192a_cold db 102,15,58,223,202,2 - call L$104key_192b + call L$106key_192b db 102,15,58,223,202,4 - call L$105key_192a + call L$107key_192a db 102,15,58,223,202,8 - call L$104key_192b + call L$106key_192b db 102,15,58,223,202,16 - call L$105key_192a + call L$107key_192a db 102,15,58,223,202,32 - call L$104key_192b + call L$106key_192b db 102,15,58,223,202,64 - call L$105key_192a + call L$107key_192a db 102,15,58,223,202,128 - call L$104key_192b + call L$106key_192b movups [edx],xmm0 mov DWORD [48+edx],ecx - jmp NEAR L$100good_key + jmp NEAR L$102good_key align 16 -L$105key_192a: +L$107key_192a: movups [edx],xmm0 lea edx,[16+edx] align 16 -L$103key_192a_cold: +L$105key_192a_cold: movaps xmm5,xmm2 -L$106key_192b_warm: +L$108key_192b_warm: shufps xmm4,xmm0,16 movdqa xmm3,xmm2 xorps xmm0,xmm4 @@ -2196,21 +2229,21 @@ L$106key_192b_warm: pxor xmm2,xmm3 ret align 16 -L$104key_192b: +L$106key_192b: movaps xmm3,xmm0 shufps xmm5,xmm0,68 movups [edx],xmm5 shufps 
xmm3,xmm2,78 movups [16+edx],xmm3 lea edx,[32+edx] - jmp NEAR L$106key_192b_warm + jmp NEAR L$108key_192b_warm align 16 -L$10212rounds_alt: +L$10412rounds_alt: movdqa xmm5,[16+ebx] movdqa xmm4,[32+ebx] mov ecx,8 movdqu [edx-16],xmm0 -L$107loop_key192: +L$109loop_key192: movq [edx],xmm2 movdqa xmm1,xmm2 db 102,15,56,0,213 @@ -2232,54 +2265,54 @@ db 102,15,56,221,212 pxor xmm2,xmm3 movdqu [edx-16],xmm0 dec ecx - jnz NEAR L$107loop_key192 + jnz NEAR L$109loop_key192 mov ecx,11 mov DWORD [32+edx],ecx - jmp NEAR L$100good_key + jmp NEAR L$102good_key align 16 -L$09314rounds: +L$09514rounds: movups xmm2,[16+eax] lea edx,[16+edx] cmp ebp,268435456 - je NEAR L$10814rounds_alt + je NEAR L$11014rounds_alt mov ecx,13 movups [edx-32],xmm0 movups [edx-16],xmm2 db 102,15,58,223,202,1 - call L$109key_256a_cold + call L$111key_256a_cold db 102,15,58,223,200,1 - call L$110key_256b + call L$112key_256b db 102,15,58,223,202,2 - call L$111key_256a + call L$113key_256a db 102,15,58,223,200,2 - call L$110key_256b + call L$112key_256b db 102,15,58,223,202,4 - call L$111key_256a + call L$113key_256a db 102,15,58,223,200,4 - call L$110key_256b + call L$112key_256b db 102,15,58,223,202,8 - call L$111key_256a + call L$113key_256a db 102,15,58,223,200,8 - call L$110key_256b + call L$112key_256b db 102,15,58,223,202,16 - call L$111key_256a + call L$113key_256a db 102,15,58,223,200,16 - call L$110key_256b + call L$112key_256b db 102,15,58,223,202,32 - call L$111key_256a + call L$113key_256a db 102,15,58,223,200,32 - call L$110key_256b + call L$112key_256b db 102,15,58,223,202,64 - call L$111key_256a + call L$113key_256a movups [edx],xmm0 mov DWORD [16+edx],ecx xor eax,eax - jmp NEAR L$100good_key + jmp NEAR L$102good_key align 16 -L$111key_256a: +L$113key_256a: movups [edx],xmm2 lea edx,[16+edx] -L$109key_256a_cold: +L$111key_256a_cold: shufps xmm4,xmm0,16 xorps xmm0,xmm4 shufps xmm4,xmm0,140 @@ -2288,7 +2321,7 @@ L$109key_256a_cold: xorps xmm0,xmm1 ret align 16 -L$110key_256b: +L$112key_256b: movups [edx],xmm0 lea edx,[16+edx] shufps xmm4,xmm2,16 @@ -2299,14 +2332,14 @@ L$110key_256b: xorps xmm2,xmm1 ret align 16 -L$10814rounds_alt: +L$11014rounds_alt: movdqa xmm5,[ebx] movdqa xmm4,[32+ebx] mov ecx,7 movdqu [edx-32],xmm0 movdqa xmm1,xmm2 movdqu [edx-16],xmm2 -L$112loop_key256: +L$114loop_key256: db 102,15,56,0,213 db 102,15,56,221,212 movdqa xmm3,xmm0 @@ -2320,7 +2353,7 @@ db 102,15,56,221,212 pxor xmm0,xmm2 movdqu [edx],xmm0 dec ecx - jz NEAR L$113done_key256 + jz NEAR L$115done_key256 pshufd xmm2,xmm0,255 pxor xmm3,xmm3 db 102,15,56,221,211 @@ -2335,11 +2368,11 @@ db 102,15,56,221,211 movdqu [16+edx],xmm2 lea edx,[32+edx] movdqa xmm1,xmm2 - jmp NEAR L$112loop_key256 -L$113done_key256: + jmp NEAR L$114loop_key256 +L$115done_key256: mov ecx,13 mov DWORD [16+edx],ecx -L$100good_key: +L$102good_key: pxor xmm0,xmm0 pxor xmm1,xmm1 pxor xmm2,xmm2 @@ -2351,31 +2384,43 @@ L$100good_key: pop ebp ret align 4 -L$091bad_pointer: +L$093bad_pointer: mov eax,-1 pop ebx pop ebp ret align 4 -L$095bad_keybits: +L$097bad_keybits: pxor xmm0,xmm0 mov eax,-2 pop ebx pop ebp ret -global _aesni_set_encrypt_key +global _aes_hw_set_encrypt_key align 16 -_aesni_set_encrypt_key: -L$_aesni_set_encrypt_key_begin: +_aes_hw_set_encrypt_key: +L$_aes_hw_set_encrypt_key_begin: +%ifdef BORINGSSL_DISPATCH_TEST + push ebx + push edx + call L$116pic +L$116pic: + pop ebx + lea ebx,[(_BORINGSSL_function_hit+3-L$116pic)+ebx] + mov edx,1 + mov BYTE [ebx],dl + pop edx + pop ebx +%endif mov eax,DWORD [4+esp] mov ecx,DWORD [8+esp] mov edx,DWORD [12+esp] call 
__aesni_set_encrypt_key ret -global _aesni_set_decrypt_key +global _aes_hw_set_decrypt_key align 16 -_aesni_set_decrypt_key: -L$_aesni_set_decrypt_key_begin: +_aes_hw_set_decrypt_key: +L$_aes_hw_set_decrypt_key_begin: mov eax,DWORD [4+esp] mov ecx,DWORD [8+esp] mov edx,DWORD [12+esp] @@ -2383,7 +2428,7 @@ L$_aesni_set_decrypt_key_begin: mov edx,DWORD [12+esp] shl ecx,4 test eax,eax - jnz NEAR L$114dec_key_ret + jnz NEAR L$117dec_key_ret lea eax,[16+ecx*1+edx] movups xmm0,[edx] movups xmm1,[eax] @@ -2391,7 +2436,7 @@ L$_aesni_set_decrypt_key_begin: movups [edx],xmm1 lea edx,[16+edx] lea eax,[eax-16] -L$115dec_key_inverse: +L$118dec_key_inverse: movups xmm0,[edx] movups xmm1,[eax] db 102,15,56,219,192 @@ -2401,14 +2446,14 @@ db 102,15,56,219,201 movups [16+eax],xmm0 movups [edx-16],xmm1 cmp eax,edx - ja NEAR L$115dec_key_inverse + ja NEAR L$118dec_key_inverse movups xmm0,[edx] db 102,15,56,219,192 movups [edx],xmm0 pxor xmm0,xmm0 pxor xmm1,xmm1 xor eax,eax -L$114dec_key_ret: +L$117dec_key_ret: ret align 64 L$key_const: diff --git a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/bn-586.asm b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/bn-586.asm index b222040aca..a87f86d12f 100644 --- a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/bn-586.asm +++ b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/bn-586.asm @@ -1,3 +1,9 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif %ifidn __OUTPUT_FORMAT__,obj section code use32 class=code align=64 %elifidn __OUTPUT_FORMAT__,win32 diff --git a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/co-586.asm b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/co-586.asm index 5780dc841b..b6784bf928 100644 --- a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/co-586.asm +++ b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/co-586.asm @@ -1,3 +1,9 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif %ifidn __OUTPUT_FORMAT__,obj section code use32 class=code align=64 %elifidn __OUTPUT_FORMAT__,win32 diff --git a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/ghash-ssse3-x86.asm b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/ghash-ssse3-x86.asm new file mode 100644 index 0000000000..1d07be0aea --- /dev/null +++ b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/ghash-ssse3-x86.asm @@ -0,0 +1,300 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +%ifdef __YASM_VERSION_ID__ +%if __YASM_VERSION_ID__ < 01010000h +%error yasm version 1.1.0 or later needed. +%endif +; Yasm automatically includes .00 and complains about redefining it. 
+; https://www.tortall.net/projects/yasm/manual/html/objfmt-win32-safeseh.html +%else +$@feat.00 equ 1 +%endif +section .text code align=64 +%else +section .text code +%endif +global _gcm_gmult_ssse3 +align 16 +_gcm_gmult_ssse3: +L$_gcm_gmult_ssse3_begin: + push ebp + push ebx + push esi + push edi + mov edi,DWORD [20+esp] + mov esi,DWORD [24+esp] + movdqu xmm0,[edi] + call L$000pic_point +L$000pic_point: + pop eax + movdqa xmm7,[(L$reverse_bytes-L$000pic_point)+eax] + movdqa xmm2,[(L$low4_mask-L$000pic_point)+eax] +db 102,15,56,0,199 + movdqa xmm1,xmm2 + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm2 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + mov eax,5 +L$001loop_row_1: + movdqa xmm4,[esi] + lea esi,[16+esi] + movdqa xmm6,xmm2 +db 102,15,58,15,243,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + movdqa xmm5,xmm4 +db 102,15,56,0,224 +db 102,15,56,0,233 + pxor xmm2,xmm5 + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + sub eax,1 + jnz NEAR L$001loop_row_1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + mov eax,5 +L$002loop_row_2: + movdqa xmm4,[esi] + lea esi,[16+esi] + movdqa xmm6,xmm2 +db 102,15,58,15,243,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + movdqa xmm5,xmm4 +db 102,15,56,0,224 +db 102,15,56,0,233 + pxor xmm2,xmm5 + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + sub eax,1 + jnz NEAR L$002loop_row_2 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + mov eax,6 +L$003loop_row_3: + movdqa xmm4,[esi] + lea esi,[16+esi] + movdqa xmm6,xmm2 +db 102,15,58,15,243,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + movdqa xmm5,xmm4 +db 102,15,56,0,224 +db 102,15,56,0,233 + pxor xmm2,xmm5 + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + sub eax,1 + jnz NEAR L$003loop_row_3 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 +db 102,15,56,0,215 + movdqu [edi],xmm2 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + pop edi + pop esi + pop ebx + pop ebp + ret +global _gcm_ghash_ssse3 +align 16 +_gcm_ghash_ssse3: +L$_gcm_ghash_ssse3_begin: + push ebp + push ebx + push esi + push edi + mov edi,DWORD [20+esp] + mov esi,DWORD [24+esp] + mov edx,DWORD [28+esp] + mov ecx,DWORD [32+esp] + movdqu xmm0,[edi] + call L$004pic_point +L$004pic_point: + pop ebx + movdqa xmm7,[(L$reverse_bytes-L$004pic_point)+ebx] + and ecx,-16 +db 102,15,56,0,199 + pxor xmm3,xmm3 +L$005loop_ghash: + movdqa xmm2,[(L$low4_mask-L$004pic_point)+ebx] + movdqu xmm1,[edx] +db 102,15,56,0,207 + pxor xmm0,xmm1 + movdqa xmm1,xmm2 + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm2 + pxor xmm2,xmm2 + mov eax,5 +L$006loop_row_4: + movdqa xmm4,[esi] + lea esi,[16+esi] + movdqa xmm6,xmm2 +db 102,15,58,15,243,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + movdqa xmm5,xmm4 +db 102,15,56,0,224 +db 102,15,56,0,233 + pxor xmm2,xmm5 + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + sub eax,1 + jnz NEAR L$006loop_row_4 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + 
psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + mov eax,5 +L$007loop_row_5: + movdqa xmm4,[esi] + lea esi,[16+esi] + movdqa xmm6,xmm2 +db 102,15,58,15,243,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + movdqa xmm5,xmm4 +db 102,15,56,0,224 +db 102,15,56,0,233 + pxor xmm2,xmm5 + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + sub eax,1 + jnz NEAR L$007loop_row_5 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + mov eax,6 +L$008loop_row_6: + movdqa xmm4,[esi] + lea esi,[16+esi] + movdqa xmm6,xmm2 +db 102,15,58,15,243,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + movdqa xmm5,xmm4 +db 102,15,56,0,224 +db 102,15,56,0,233 + pxor xmm2,xmm5 + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + sub eax,1 + jnz NEAR L$008loop_row_6 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + movdqa xmm0,xmm2 + lea esi,[esi-256] + lea edx,[16+edx] + sub ecx,16 + jnz NEAR L$005loop_ghash +db 102,15,56,0,199 + movdqu [edi],xmm0 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + pop edi + pop esi + pop ebx + pop ebp + ret +align 16 +L$reverse_bytes: +db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +align 16 +L$low4_mask: +dd 252645135,252645135,252645135,252645135 diff --git a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/ghash-x86.asm b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/ghash-x86.asm index 1d350d6a7f..753c472f04 100644 --- a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/ghash-x86.asm +++ b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/ghash-x86.asm @@ -1,3 +1,9 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif %ifidn __OUTPUT_FORMAT__,obj section code use32 class=code align=64 %elifidn __OUTPUT_FORMAT__,win32 diff --git a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/md5-586.asm b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/md5-586.asm index 67ee21651f..c051923082 100644 --- a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/md5-586.asm +++ b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/md5-586.asm @@ -1,3 +1,9 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif %ifidn __OUTPUT_FORMAT__,obj section code use32 class=code align=64 %elifidn __OUTPUT_FORMAT__,win32 diff --git a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/sha1-586.asm b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/sha1-586.asm index cee8c62626..0afe894e52 100644 --- a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/sha1-586.asm +++ b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/sha1-586.asm @@ -1,3 +1,9 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif %ifidn __OUTPUT_FORMAT__,obj section code use32 class=code align=64 %elifidn __OUTPUT_FORMAT__,win32 diff --git a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/sha256-586.asm b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/sha256-586.asm index 3e7cfcca07..b5dc26ba71 100644 --- a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/sha256-586.asm +++ b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/sha256-586.asm @@ -1,3 +1,9 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif %ifidn __OUTPUT_FORMAT__,obj section code use32 class=code align=64 %elifidn __OUTPUT_FORMAT__,win32 diff --git a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/sha512-586.asm b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/sha512-586.asm index 88ed0b380d..3e6b0680bc 100644 --- a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/sha512-586.asm +++ b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/sha512-586.asm @@ -1,3 +1,9 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif %ifidn __OUTPUT_FORMAT__,obj section code use32 class=code align=64 %elifidn __OUTPUT_FORMAT__,win32 diff --git a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/vpaes-x86.asm b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/vpaes-x86.asm index b08b05637d..81b8b8330f 100644 --- a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/vpaes-x86.asm +++ b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/vpaes-x86.asm @@ -1,3 +1,9 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif %ifidn __OUTPUT_FORMAT__,obj section code use32 class=code align=64 %elifidn __OUTPUT_FORMAT__,win32 @@ -14,6 +20,9 @@ section .text code align=64 %else section .text code %endif +%ifdef BORINGSSL_DISPATCH_TEST +extern _BORINGSSL_function_hit +%endif align 64 L$_vpaes_consts: dd 218628480,235210255,168496130,67568393 @@ -465,6 +474,18 @@ L$_vpaes_set_encrypt_key_begin: push ebx push esi push edi +%ifdef BORINGSSL_DISPATCH_TEST + push ebx + push edx + call L$016pic +L$016pic: + pop ebx + lea ebx,[(_BORINGSSL_function_hit+5-L$016pic)+ebx] + mov edx,1 + mov BYTE [ebx],dl + pop edx + pop ebx +%endif mov esi,DWORD [20+esp] lea ebx,[esp-56] mov eax,DWORD [24+esp] @@ -478,9 +499,9 @@ L$_vpaes_set_encrypt_key_begin: mov DWORD [240+edx],ebx mov ecx,48 mov edi,0 - lea ebp,[(L$_vpaes_consts+0x30-L$016pic_point)] + lea ebp,[(L$_vpaes_consts+0x30-L$017pic_point)] call __vpaes_schedule_core -L$016pic_point: +L$017pic_point: mov esp,DWORD [48+esp] xor eax,eax pop edi @@ -514,9 +535,9 @@ L$_vpaes_set_decrypt_key_begin: shr ecx,1 and ecx,32 xor ecx,32 - lea ebp,[(L$_vpaes_consts+0x30-L$017pic_point)] + lea ebp,[(L$_vpaes_consts+0x30-L$018pic_point)] call __vpaes_schedule_core -L$017pic_point: +L$018pic_point: mov esp,DWORD [48+esp] xor eax,eax pop edi @@ -532,9 +553,21 @@ L$_vpaes_encrypt_begin: push ebx push esi push edi - lea ebp,[(L$_vpaes_consts+0x30-L$018pic_point)] +%ifdef BORINGSSL_DISPATCH_TEST + push ebx + push edx + call L$019pic +L$019pic: + pop ebx + lea ebx,[(_BORINGSSL_function_hit+4-L$019pic)+ebx] + mov edx,1 + mov BYTE [ebx],dl + pop edx + pop ebx +%endif + lea ebp,[(L$_vpaes_consts+0x30-L$020pic_point)] call __vpaes_preheat -L$018pic_point: +L$020pic_point: mov esi,DWORD [20+esp] lea ebx,[esp-56] mov edi,DWORD [24+esp] @@ -559,9 +592,9 @@ L$_vpaes_decrypt_begin: push ebx push esi push edi - lea ebp,[(L$_vpaes_consts+0x30-L$019pic_point)] + lea ebp,[(L$_vpaes_consts+0x30-L$021pic_point)] call __vpaes_preheat -L$019pic_point: +L$021pic_point: mov esi,DWORD [20+esp] lea ebx,[esp-56] mov edi,DWORD [24+esp] @@ -591,7 +624,7 @@ L$_vpaes_cbc_encrypt_begin: mov eax,DWORD [28+esp] mov edx,DWORD [32+esp] sub eax,16 - jc NEAR L$020cbc_abort + jc NEAR L$022cbc_abort lea ebx,[esp-56] mov ebp,DWORD [36+esp] and ebx,-16 @@ -604,14 +637,14 @@ L$_vpaes_cbc_encrypt_begin: mov DWORD [4+esp],edx mov DWORD [8+esp],ebp mov edi,eax - lea ebp,[(L$_vpaes_consts+0x30-L$021pic_point)] + lea ebp,[(L$_vpaes_consts+0x30-L$023pic_point)] call __vpaes_preheat -L$021pic_point: +L$023pic_point: cmp ecx,0 - je NEAR L$022cbc_dec_loop - jmp NEAR L$023cbc_enc_loop + je NEAR L$024cbc_dec_loop + jmp NEAR L$025cbc_enc_loop align 16 -L$023cbc_enc_loop: +L$025cbc_enc_loop: movdqu xmm0,[esi] pxor xmm0,xmm1 call __vpaes_encrypt_core @@ -621,10 +654,10 @@ L$023cbc_enc_loop: movdqu [esi*1+ebx],xmm0 lea esi,[16+esi] sub edi,16 - jnc NEAR L$023cbc_enc_loop - jmp NEAR L$024cbc_done + jnc NEAR L$025cbc_enc_loop + jmp NEAR L$026cbc_done align 16 -L$022cbc_dec_loop: +L$024cbc_dec_loop: movdqu xmm0,[esi] movdqa [16+esp],xmm1 movdqa [32+esp],xmm0 @@ -636,12 +669,12 @@ L$022cbc_dec_loop: movdqu [esi*1+ebx],xmm0 lea esi,[16+esi] sub edi,16 - jnc NEAR L$022cbc_dec_loop -L$024cbc_done: + jnc NEAR L$024cbc_dec_loop +L$026cbc_done: mov ebx,DWORD [8+esp] mov esp,DWORD [48+esp] movdqu [ebx],xmm1 -L$020cbc_abort: +L$022cbc_abort: pop edi pop esi pop ebx diff --git a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/x86-mont.asm 
b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/x86-mont.asm index b1a4d59429..6a15ed944b 100644 --- a/packager/third_party/boringssl/win-x86/crypto/fipsmodule/x86-mont.asm +++ b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/x86-mont.asm @@ -1,3 +1,9 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif %ifidn __OUTPUT_FORMAT__,obj section code use32 class=code align=64 %elifidn __OUTPUT_FORMAT__,win32 @@ -456,16 +462,18 @@ L$016sub: lea edx,[1+edx] jge NEAR L$016sub sbb eax,0 - and esi,eax - not eax - mov ebp,edi - and ebp,eax - or esi,ebp + mov edx,-1 + xor edx,eax + jmp NEAR L$017copy align 16 L$017copy: - mov eax,DWORD [ebx*4+esi] - mov DWORD [ebx*4+edi],eax + mov esi,DWORD [32+ebx*4+esp] + mov ebp,DWORD [ebx*4+edi] mov DWORD [32+ebx*4+esp],ecx + and esi,eax + and ebp,edx + or ebp,esi + mov DWORD [ebx*4+edi],ebp dec ebx jge NEAR L$017copy mov esp,DWORD [24+esp] diff --git a/packager/third_party/boringssl/win-x86/crypto/test/trampoline-x86.asm b/packager/third_party/boringssl/win-x86/crypto/test/trampoline-x86.asm new file mode 100644 index 0000000000..e5c7d3f7fa --- /dev/null +++ b/packager/third_party/boringssl/win-x86/crypto/test/trampoline-x86.asm @@ -0,0 +1,164 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +%ifdef __YASM_VERSION_ID__ +%if __YASM_VERSION_ID__ < 01010000h +%error yasm version 1.1.0 or later needed. +%endif +; Yasm automatically includes .00 and complains about redefining it. 
+; https://www.tortall.net/projects/yasm/manual/html/objfmt-win32-safeseh.html +%else +$@feat.00 equ 1 +%endif +section .text code align=64 +%else +section .text code +%endif +global _abi_test_trampoline +align 16 +_abi_test_trampoline: +L$_abi_test_trampoline_begin: + push ebp + push ebx + push esi + push edi + mov ecx,DWORD [24+esp] + mov esi,DWORD [ecx] + mov edi,DWORD [4+ecx] + mov ebx,DWORD [8+ecx] + mov ebp,DWORD [12+ecx] + sub esp,44 + mov eax,DWORD [72+esp] + xor ecx,ecx +L$000loop: + cmp ecx,DWORD [76+esp] + jae NEAR L$001loop_done + mov edx,DWORD [ecx*4+eax] + mov DWORD [ecx*4+esp],edx + add ecx,1 + jmp NEAR L$000loop +L$001loop_done: + call DWORD [64+esp] + add esp,44 + mov ecx,DWORD [24+esp] + mov DWORD [ecx],esi + mov DWORD [4+ecx],edi + mov DWORD [8+ecx],ebx + mov DWORD [12+ecx],ebp + pop edi + pop esi + pop ebx + pop ebp + ret +global _abi_test_get_and_clear_direction_flag +align 16 +_abi_test_get_and_clear_direction_flag: +L$_abi_test_get_and_clear_direction_flag_begin: + pushfd + pop eax + and eax,1024 + shr eax,10 + cld + ret +global _abi_test_set_direction_flag +align 16 +_abi_test_set_direction_flag: +L$_abi_test_set_direction_flag_begin: + std + ret +global _abi_test_clobber_eax +align 16 +_abi_test_clobber_eax: +L$_abi_test_clobber_eax_begin: + xor eax,eax + ret +global _abi_test_clobber_ebx +align 16 +_abi_test_clobber_ebx: +L$_abi_test_clobber_ebx_begin: + xor ebx,ebx + ret +global _abi_test_clobber_ecx +align 16 +_abi_test_clobber_ecx: +L$_abi_test_clobber_ecx_begin: + xor ecx,ecx + ret +global _abi_test_clobber_edx +align 16 +_abi_test_clobber_edx: +L$_abi_test_clobber_edx_begin: + xor edx,edx + ret +global _abi_test_clobber_edi +align 16 +_abi_test_clobber_edi: +L$_abi_test_clobber_edi_begin: + xor edi,edi + ret +global _abi_test_clobber_esi +align 16 +_abi_test_clobber_esi: +L$_abi_test_clobber_esi_begin: + xor esi,esi + ret +global _abi_test_clobber_ebp +align 16 +_abi_test_clobber_ebp: +L$_abi_test_clobber_ebp_begin: + xor ebp,ebp + ret +global _abi_test_clobber_xmm0 +align 16 +_abi_test_clobber_xmm0: +L$_abi_test_clobber_xmm0_begin: + pxor xmm0,xmm0 + ret +global _abi_test_clobber_xmm1 +align 16 +_abi_test_clobber_xmm1: +L$_abi_test_clobber_xmm1_begin: + pxor xmm1,xmm1 + ret +global _abi_test_clobber_xmm2 +align 16 +_abi_test_clobber_xmm2: +L$_abi_test_clobber_xmm2_begin: + pxor xmm2,xmm2 + ret +global _abi_test_clobber_xmm3 +align 16 +_abi_test_clobber_xmm3: +L$_abi_test_clobber_xmm3_begin: + pxor xmm3,xmm3 + ret +global _abi_test_clobber_xmm4 +align 16 +_abi_test_clobber_xmm4: +L$_abi_test_clobber_xmm4_begin: + pxor xmm4,xmm4 + ret +global _abi_test_clobber_xmm5 +align 16 +_abi_test_clobber_xmm5: +L$_abi_test_clobber_xmm5_begin: + pxor xmm5,xmm5 + ret +global _abi_test_clobber_xmm6 +align 16 +_abi_test_clobber_xmm6: +L$_abi_test_clobber_xmm6_begin: + pxor xmm6,xmm6 + ret +global _abi_test_clobber_xmm7 +align 16 +_abi_test_clobber_xmm7: +L$_abi_test_clobber_xmm7_begin: + pxor xmm7,xmm7 + ret diff --git a/packager/third_party/boringssl/win-x86_64/crypto/chacha/chacha-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/chacha/chacha-x86_64.asm index cb36246891..a3c29381e3 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/chacha/chacha-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/chacha/chacha-x86_64.asm @@ -1,7 +1,14 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif section .text code align=64 @@ -55,6 +62,7 @@ $L$SEH_begin_ChaCha20_ctr32: mov r8,QWORD[40+rsp] + cmp rdx,0 je NEAR $L$no_data mov r10,QWORD[((OPENSSL_ia32cap_P+4))] @@ -62,12 +70,19 @@ $L$SEH_begin_ChaCha20_ctr32: jnz NEAR $L$ChaCha20_ssse3 push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + sub rsp,64+24 + $L$ctr32_body: @@ -308,16 +323,24 @@ $L$oop_tail: $L$done: lea rsi,[((64+24+48))+rsp] mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$no_data: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ChaCha20_ctr32: ALIGN 32 @@ -334,7 +357,9 @@ $L$SEH_begin_ChaCha20_ssse3: $L$ChaCha20_ssse3: + mov r9,rsp + cmp rdx,128 ja NEAR $L$ChaCha20_4x @@ -465,10 +490,12 @@ $L$done_ssse3: movaps xmm6,XMMWORD[((-40))+r9] movaps xmm7,XMMWORD[((-24))+r9] lea rsp,[r9] + $L$ssse3_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ChaCha20_ssse3: ALIGN 32 @@ -485,7 +512,9 @@ $L$SEH_begin_ChaCha20_4x: $L$ChaCha20_4x: + mov r9,rsp + mov r11,r10 shr r10,32 test r10,32 @@ -1047,10 +1076,12 @@ $L$done4x: movaps xmm14,XMMWORD[((-40))+r9] movaps xmm15,XMMWORD[((-24))+r9] lea rsp,[r9] + $L$4x_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ChaCha20_4x: ALIGN 32 @@ -1067,7 +1098,9 @@ $L$SEH_begin_ChaCha20_8x: $L$ChaCha20_8x: + mov r9,rsp + sub rsp,0x280+168 and rsp,-32 movaps XMMWORD[(-168)+r9],xmm6 @@ -1683,10 +1716,12 @@ $L$done8x: movaps xmm14,XMMWORD[((-40))+r9] movaps xmm15,XMMWORD[((-24))+r9] lea rsp,[r9] + $L$8x_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ChaCha20_8x: EXTERN __imp_RtlVirtualUnwind diff --git a/packager/third_party/boringssl/win-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.asm index 56dc2060a4..e711826b14 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.asm @@ -1,7 +1,14 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif section .data data align=8 diff --git a/packager/third_party/boringssl/win-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.asm index ab8cf92b72..b1159ae098 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.asm @@ -1,7 +1,16 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif +section .text code align=64 + global dummy_chacha20_poly1305_asm dummy_chacha20_poly1305_asm: diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/aes-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/aes-x86_64.asm index f6a4edfa0f..329185ee67 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/aes-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/aes-x86_64.asm @@ -1,7 +1,14 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif section .text code align=64 @@ -160,6 +167,7 @@ DB 0xf3,0xc3 ALIGN 16 _x86_64_AES_encrypt_compact: + lea r8,[128+r14] mov edi,DWORD[((0-128))+r8] mov ebp,DWORD[((32-128))+r8] @@ -330,29 +338,38 @@ $L$enc_compact_done: xor edx,DWORD[12+r15] DB 0xf3,0xc3 + ALIGN 16 -global asm_AES_encrypt +global aes_nohw_encrypt -asm_AES_encrypt: +aes_nohw_encrypt: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp -$L$SEH_begin_asm_AES_encrypt: +$L$SEH_begin_aes_nohw_encrypt: mov rdi,rcx mov rsi,rdx mov rdx,r8 + mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + lea rcx,[((-63))+rdx] and rsp,-64 sub rcx,rsp @@ -363,6 +380,7 @@ $L$SEH_begin_asm_AES_encrypt: mov QWORD[16+rsp],rsi mov QWORD[24+rsp],rax + $L$enc_prologue: mov r15,rdx @@ -389,23 +407,32 @@ $L$enc_prologue: mov r9,QWORD[16+rsp] mov rsi,QWORD[24+rsp] + mov DWORD[r9],eax mov DWORD[4+r9],ebx mov DWORD[8+r9],ecx mov DWORD[12+r9],edx mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$enc_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret -$L$SEH_end_asm_AES_encrypt: + +$L$SEH_end_aes_nohw_encrypt: ALIGN 16 _x86_64_AES_decrypt: @@ -564,6 +591,7 @@ DB 0xf3,0xc3 ALIGN 16 _x86_64_AES_decrypt_compact: + lea r8,[128+r14] mov edi,DWORD[((0-128))+r8] mov ebp,DWORD[((32-128))+r8] @@ -786,29 +814,38 @@ $L$dec_compact_done: xor edx,DWORD[12+r15] DB 0xf3,0xc3 + ALIGN 16 -global asm_AES_decrypt +global aes_nohw_decrypt -asm_AES_decrypt: +aes_nohw_decrypt: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp -$L$SEH_begin_asm_AES_decrypt: +$L$SEH_begin_aes_nohw_decrypt: mov rdi,rcx mov rsi,rdx mov rdx,r8 + mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + lea rcx,[((-63))+rdx] and rsp,-64 sub rcx,rsp @@ -819,6 +856,7 @@ $L$SEH_begin_asm_AES_decrypt: mov QWORD[16+rsp],rsi mov QWORD[24+rsp],rax + $L$dec_prologue: mov r15,rdx @@ -847,59 +885,81 @@ $L$dec_prologue: mov r9,QWORD[16+rsp] mov rsi,QWORD[24+rsp] + mov DWORD[r9],eax mov DWORD[4+r9],ebx mov DWORD[8+r9],ecx mov DWORD[12+r9],edx mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$dec_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret -$L$SEH_end_asm_AES_decrypt: -ALIGN 16 -global asm_AES_set_encrypt_key -asm_AES_set_encrypt_key: +$L$SEH_end_aes_nohw_decrypt: +ALIGN 16 +global aes_nohw_set_encrypt_key + 
+aes_nohw_set_encrypt_key: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp -$L$SEH_begin_asm_AES_set_encrypt_key: +$L$SEH_begin_aes_nohw_set_encrypt_key: mov rdi,rcx mov rsi,rdx mov rdx,r8 + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + sub rsp,8 + $L$enc_key_prologue: call _x86_64_AES_set_encrypt_key mov rbp,QWORD[40+rsp] + mov rbx,QWORD[48+rsp] + add rsp,56 + $L$enc_key_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret -$L$SEH_end_asm_AES_set_encrypt_key: + +$L$SEH_end_aes_nohw_set_encrypt_key: ALIGN 16 _x86_64_AES_set_encrypt_key: + mov ecx,esi mov rsi,rdi mov rdi,rdx @@ -1136,26 +1196,35 @@ $L$badpointer: $L$exit: DB 0xf3,0xc3 -ALIGN 16 -global asm_AES_set_decrypt_key -asm_AES_set_decrypt_key: +ALIGN 16 +global aes_nohw_set_decrypt_key + +aes_nohw_set_decrypt_key: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp -$L$SEH_begin_asm_AES_set_decrypt_key: +$L$SEH_begin_aes_nohw_set_decrypt_key: mov rdi,rcx mov rsi,rdx mov rdx,r8 + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + push rdx + $L$dec_key_prologue: call _x86_64_AES_set_encrypt_key @@ -1323,27 +1392,35 @@ $L$permute: xor rax,rax $L$abort: mov r15,QWORD[8+rsp] + mov r14,QWORD[16+rsp] + mov r13,QWORD[24+rsp] + mov r12,QWORD[32+rsp] + mov rbp,QWORD[40+rsp] + mov rbx,QWORD[48+rsp] + add rsp,56 + $L$dec_key_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret -$L$SEH_end_asm_AES_set_decrypt_key: + +$L$SEH_end_aes_nohw_set_decrypt_key: ALIGN 16 -global asm_AES_cbc_encrypt +global aes_nohw_cbc_encrypt EXTERN OPENSSL_ia32cap_P -asm_AES_cbc_encrypt: +aes_nohw_cbc_encrypt: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp -$L$SEH_begin_asm_AES_cbc_encrypt: +$L$SEH_begin_aes_nohw_cbc_encrypt: mov rdi,rcx mov rsi,rdx mov rdx,r8 @@ -1352,15 +1429,25 @@ $L$SEH_begin_asm_AES_cbc_encrypt: mov r9,QWORD[48+rsp] + cmp rdx,0 je NEAR $L$cbc_epilogue pushfq + + + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + $L$cbc_prologue: cld @@ -1371,6 +1458,7 @@ $L$cbc_prologue: cmp r9,0 cmove r14,r10 + lea r10,[OPENSSL_ia32cap_P] mov r10d,DWORD[r10] cmp rdx,512 @@ -1407,7 +1495,9 @@ $L$cbc_te_ok: xchg r15,rsp + mov QWORD[16+rsp],r15 + $L$cbc_fast_body: mov QWORD[24+rsp],rdi mov QWORD[32+rsp],rsi @@ -1606,6 +1696,7 @@ $L$cbc_fast_cleanup: ALIGN 16 $L$cbc_slow_prologue: + lea rbp,[((-88))+rsp] and rbp,-64 @@ -1617,7 +1708,9 @@ $L$cbc_slow_prologue: xchg rbp,rsp + mov QWORD[16+rsp],rbp + $L$cbc_slow_body: @@ -1789,20 +1882,32 @@ $L$cbc_slow_dec_partial: ALIGN 16 $L$cbc_exit: mov rsi,QWORD[16+rsp] + mov r15,QWORD[rsi] + mov r14,QWORD[8+rsi] + mov r13,QWORD[16+rsi] + mov r12,QWORD[24+rsi] + mov rbp,QWORD[32+rsi] + mov rbx,QWORD[40+rsi] + lea rsp,[48+rsi] + $L$cbc_popfq: popfq + + + $L$cbc_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret -$L$SEH_end_asm_AES_cbc_encrypt: + +$L$SEH_end_aes_nohw_cbc_encrypt: ALIGN 64 $L$AES_Te: DD 0xa56363c6,0xa56363c6 @@ -2814,44 +2919,44 @@ $L$common_seh_exit: section .pdata rdata align=4 ALIGN 4 - DD $L$SEH_begin_asm_AES_encrypt wrt ..imagebase - DD $L$SEH_end_asm_AES_encrypt wrt ..imagebase - DD $L$SEH_info_asm_AES_encrypt wrt ..imagebase + DD $L$SEH_begin_aes_nohw_encrypt wrt ..imagebase + DD $L$SEH_end_aes_nohw_encrypt wrt ..imagebase + DD $L$SEH_info_aes_nohw_encrypt wrt ..imagebase - DD $L$SEH_begin_asm_AES_decrypt wrt ..imagebase - DD $L$SEH_end_asm_AES_decrypt wrt ..imagebase - DD 
$L$SEH_info_asm_AES_decrypt wrt ..imagebase + DD $L$SEH_begin_aes_nohw_decrypt wrt ..imagebase + DD $L$SEH_end_aes_nohw_decrypt wrt ..imagebase + DD $L$SEH_info_aes_nohw_decrypt wrt ..imagebase - DD $L$SEH_begin_asm_AES_set_encrypt_key wrt ..imagebase - DD $L$SEH_end_asm_AES_set_encrypt_key wrt ..imagebase - DD $L$SEH_info_asm_AES_set_encrypt_key wrt ..imagebase + DD $L$SEH_begin_aes_nohw_set_encrypt_key wrt ..imagebase + DD $L$SEH_end_aes_nohw_set_encrypt_key wrt ..imagebase + DD $L$SEH_info_aes_nohw_set_encrypt_key wrt ..imagebase - DD $L$SEH_begin_asm_AES_set_decrypt_key wrt ..imagebase - DD $L$SEH_end_asm_AES_set_decrypt_key wrt ..imagebase - DD $L$SEH_info_asm_AES_set_decrypt_key wrt ..imagebase + DD $L$SEH_begin_aes_nohw_set_decrypt_key wrt ..imagebase + DD $L$SEH_end_aes_nohw_set_decrypt_key wrt ..imagebase + DD $L$SEH_info_aes_nohw_set_decrypt_key wrt ..imagebase - DD $L$SEH_begin_asm_AES_cbc_encrypt wrt ..imagebase - DD $L$SEH_end_asm_AES_cbc_encrypt wrt ..imagebase - DD $L$SEH_info_asm_AES_cbc_encrypt wrt ..imagebase + DD $L$SEH_begin_aes_nohw_cbc_encrypt wrt ..imagebase + DD $L$SEH_end_aes_nohw_cbc_encrypt wrt ..imagebase + DD $L$SEH_info_aes_nohw_cbc_encrypt wrt ..imagebase section .xdata rdata align=8 ALIGN 8 -$L$SEH_info_asm_AES_encrypt: +$L$SEH_info_aes_nohw_encrypt: DB 9,0,0,0 DD block_se_handler wrt ..imagebase DD $L$enc_prologue wrt ..imagebase,$L$enc_epilogue wrt ..imagebase -$L$SEH_info_asm_AES_decrypt: +$L$SEH_info_aes_nohw_decrypt: DB 9,0,0,0 DD block_se_handler wrt ..imagebase DD $L$dec_prologue wrt ..imagebase,$L$dec_epilogue wrt ..imagebase -$L$SEH_info_asm_AES_set_encrypt_key: +$L$SEH_info_aes_nohw_set_encrypt_key: DB 9,0,0,0 DD key_se_handler wrt ..imagebase DD $L$enc_key_prologue wrt ..imagebase,$L$enc_key_epilogue wrt ..imagebase -$L$SEH_info_asm_AES_set_decrypt_key: +$L$SEH_info_aes_nohw_set_decrypt_key: DB 9,0,0,0 DD key_se_handler wrt ..imagebase DD $L$dec_key_prologue wrt ..imagebase,$L$dec_key_epilogue wrt ..imagebase -$L$SEH_info_asm_AES_cbc_encrypt: +$L$SEH_info_aes_nohw_cbc_encrypt: DB 9,0,0,0 DD cbc_se_handler wrt ..imagebase diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.asm index 63bcd48dcb..2b51a26849 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.asm @@ -1,7 +1,14 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif section .text code align=64 @@ -594,6 +601,10 @@ $L$SEH_begin_aesni_gcm_encrypt: +%ifdef BORINGSSL_DISPATCH_TEST +EXTERN BORINGSSL_function_hit + mov BYTE[((BORINGSSL_function_hit+2))],1 +%endif xor r10,r10 diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/aesni-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/aesni-x86_64.asm index 13e9c5e5b6..342c1523ee 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/aesni-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/aesni-x86_64.asm @@ -1,14 +1,26 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif section .text code align=64 EXTERN OPENSSL_ia32cap_P -global aesni_encrypt +global aes_hw_encrypt ALIGN 16 -aesni_encrypt: +aes_hw_encrypt: + +%ifdef BORINGSSL_DISPATCH_TEST +EXTERN BORINGSSL_function_hit + mov BYTE[((BORINGSSL_function_hit+1))],1 +%endif movups xmm2,XMMWORD[rcx] mov eax,DWORD[240+r8] movups xmm0,XMMWORD[r8] @@ -29,10 +41,12 @@ DB 102,15,56,221,209 DB 0F3h,0C3h ;repret -global aesni_decrypt + +global aes_hw_decrypt ALIGN 16 -aesni_decrypt: +aes_hw_decrypt: + movups xmm2,XMMWORD[rcx] mov eax,DWORD[240+r8] movups xmm0,XMMWORD[r8] @@ -53,8 +67,10 @@ DB 102,15,56,223,209 DB 0F3h,0C3h ;repret + ALIGN 16 _aesni_encrypt2: + movups xmm0,XMMWORD[rcx] shl eax,4 movups xmm1,XMMWORD[16+rcx] @@ -82,8 +98,10 @@ DB 102,15,56,221,216 DB 0F3h,0C3h ;repret + ALIGN 16 _aesni_decrypt2: + movups xmm0,XMMWORD[rcx] shl eax,4 movups xmm1,XMMWORD[16+rcx] @@ -111,8 +129,10 @@ DB 102,15,56,223,216 DB 0F3h,0C3h ;repret + ALIGN 16 _aesni_encrypt3: + movups xmm0,XMMWORD[rcx] shl eax,4 movups xmm1,XMMWORD[16+rcx] @@ -145,8 +165,10 @@ DB 102,15,56,221,224 DB 0F3h,0C3h ;repret + ALIGN 16 _aesni_decrypt3: + movups xmm0,XMMWORD[rcx] shl eax,4 movups xmm1,XMMWORD[16+rcx] @@ -179,8 +201,10 @@ DB 102,15,56,223,224 DB 0F3h,0C3h ;repret + ALIGN 16 _aesni_encrypt4: + movups xmm0,XMMWORD[rcx] shl eax,4 movups xmm1,XMMWORD[16+rcx] @@ -219,8 +243,10 @@ DB 102,15,56,221,232 DB 0F3h,0C3h ;repret + ALIGN 16 _aesni_decrypt4: + movups xmm0,XMMWORD[rcx] shl eax,4 movups xmm1,XMMWORD[16+rcx] @@ -259,8 +285,10 @@ DB 102,15,56,223,232 DB 0F3h,0C3h ;repret + ALIGN 16 _aesni_encrypt6: + movups xmm0,XMMWORD[rcx] shl eax,4 movups xmm1,XMMWORD[16+rcx] @@ -313,8 +341,10 @@ DB 102,15,56,221,248 DB 0F3h,0C3h ;repret + ALIGN 16 _aesni_decrypt6: + movups xmm0,XMMWORD[rcx] shl eax,4 movups xmm1,XMMWORD[16+rcx] @@ -367,8 +397,10 @@ DB 102,15,56,223,248 DB 0F3h,0C3h ;repret + ALIGN 16 _aesni_encrypt8: + movups xmm0,XMMWORD[rcx] shl eax,4 movups xmm1,XMMWORD[16+rcx] @@ -431,8 +463,10 @@ DB 102,68,15,56,221,200 DB 0F3h,0C3h ;repret + ALIGN 16 _aesni_decrypt8: + movups xmm0,XMMWORD[rcx] shl eax,4 movups xmm1,XMMWORD[16+rcx] @@ -494,14 +528,15 @@ DB 102,68,15,56,223,192 DB 102,68,15,56,223,200 DB 0F3h,0C3h ;repret -global aesni_ecb_encrypt + +global aes_hw_ecb_encrypt ALIGN 16 -aesni_ecb_encrypt: +aes_hw_ecb_encrypt: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp -$L$SEH_begin_aesni_ecb_encrypt: +$L$SEH_begin_aes_hw_ecb_encrypt: mov rdi,rcx mov rsi,rdx mov rdx,r8 @@ -509,6 +544,7 @@ $L$SEH_begin_aesni_ecb_encrypt: mov r8,QWORD[40+rsp] + lea rsp,[((-88))+rsp] movaps XMMWORD[rsp],xmm6 movaps XMMWORD[16+rsp],xmm7 @@ -864,235 +900,16 @@ $L$ecb_enc_ret: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret -$L$SEH_end_aesni_ecb_encrypt: -global aesni_ccm64_encrypt_blocks + +$L$SEH_end_aes_hw_ecb_encrypt: +global aes_hw_ctr32_encrypt_blocks ALIGN 16 -aesni_ccm64_encrypt_blocks: +aes_hw_ctr32_encrypt_blocks: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp -$L$SEH_begin_aesni_ccm64_encrypt_blocks: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - mov r9,QWORD[48+rsp] - - - lea rsp,[((-88))+rsp] - movaps XMMWORD[rsp],xmm6 - movaps XMMWORD[16+rsp],xmm7 - movaps XMMWORD[32+rsp],xmm8 - movaps XMMWORD[48+rsp],xmm9 -$L$ccm64_enc_body: - mov eax,DWORD[240+rcx] - movdqu xmm6,XMMWORD[r8] - movdqa 
xmm9,XMMWORD[$L$increment64] - movdqa xmm7,XMMWORD[$L$bswap_mask] - - shl eax,4 - mov r10d,16 - lea r11,[rcx] - movdqu xmm3,XMMWORD[r9] - movdqa xmm2,xmm6 - lea rcx,[32+rax*1+rcx] -DB 102,15,56,0,247 - sub r10,rax - jmp NEAR $L$ccm64_enc_outer -ALIGN 16 -$L$ccm64_enc_outer: - movups xmm0,XMMWORD[r11] - mov rax,r10 - movups xmm8,XMMWORD[rdi] - - xorps xmm2,xmm0 - movups xmm1,XMMWORD[16+r11] - xorps xmm0,xmm8 - xorps xmm3,xmm0 - movups xmm0,XMMWORD[32+r11] - -$L$ccm64_enc2_loop: -DB 102,15,56,220,209 -DB 102,15,56,220,217 - movups xmm1,XMMWORD[rax*1+rcx] - add rax,32 -DB 102,15,56,220,208 -DB 102,15,56,220,216 - movups xmm0,XMMWORD[((-16))+rax*1+rcx] - jnz NEAR $L$ccm64_enc2_loop -DB 102,15,56,220,209 -DB 102,15,56,220,217 - paddq xmm6,xmm9 - dec rdx -DB 102,15,56,221,208 -DB 102,15,56,221,216 - - lea rdi,[16+rdi] - xorps xmm8,xmm2 - movdqa xmm2,xmm6 - movups XMMWORD[rsi],xmm8 -DB 102,15,56,0,215 - lea rsi,[16+rsi] - jnz NEAR $L$ccm64_enc_outer - - pxor xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - movups XMMWORD[r9],xmm3 - pxor xmm3,xmm3 - pxor xmm8,xmm8 - pxor xmm6,xmm6 - movaps xmm6,XMMWORD[rsp] - movaps XMMWORD[rsp],xmm0 - movaps xmm7,XMMWORD[16+rsp] - movaps XMMWORD[16+rsp],xmm0 - movaps xmm8,XMMWORD[32+rsp] - movaps XMMWORD[32+rsp],xmm0 - movaps xmm9,XMMWORD[48+rsp] - movaps XMMWORD[48+rsp],xmm0 - lea rsp,[88+rsp] -$L$ccm64_enc_ret: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret -$L$SEH_end_aesni_ccm64_encrypt_blocks: -global aesni_ccm64_decrypt_blocks - -ALIGN 16 -aesni_ccm64_decrypt_blocks: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_aesni_ccm64_decrypt_blocks: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - mov r9,QWORD[48+rsp] - - - lea rsp,[((-88))+rsp] - movaps XMMWORD[rsp],xmm6 - movaps XMMWORD[16+rsp],xmm7 - movaps XMMWORD[32+rsp],xmm8 - movaps XMMWORD[48+rsp],xmm9 -$L$ccm64_dec_body: - mov eax,DWORD[240+rcx] - movups xmm6,XMMWORD[r8] - movdqu xmm3,XMMWORD[r9] - movdqa xmm9,XMMWORD[$L$increment64] - movdqa xmm7,XMMWORD[$L$bswap_mask] - - movaps xmm2,xmm6 - mov r10d,eax - mov r11,rcx -DB 102,15,56,0,247 - movups xmm0,XMMWORD[rcx] - movups xmm1,XMMWORD[16+rcx] - lea rcx,[32+rcx] - xorps xmm2,xmm0 -$L$oop_enc1_5: -DB 102,15,56,220,209 - dec eax - movups xmm1,XMMWORD[rcx] - lea rcx,[16+rcx] - jnz NEAR $L$oop_enc1_5 -DB 102,15,56,221,209 - shl r10d,4 - mov eax,16 - movups xmm8,XMMWORD[rdi] - paddq xmm6,xmm9 - lea rdi,[16+rdi] - sub rax,r10 - lea rcx,[32+r10*1+r11] - mov r10,rax - jmp NEAR $L$ccm64_dec_outer -ALIGN 16 -$L$ccm64_dec_outer: - xorps xmm8,xmm2 - movdqa xmm2,xmm6 - movups XMMWORD[rsi],xmm8 - lea rsi,[16+rsi] -DB 102,15,56,0,215 - - sub rdx,1 - jz NEAR $L$ccm64_dec_break - - movups xmm0,XMMWORD[r11] - mov rax,r10 - movups xmm1,XMMWORD[16+r11] - xorps xmm8,xmm0 - xorps xmm2,xmm0 - xorps xmm3,xmm8 - movups xmm0,XMMWORD[32+r11] - jmp NEAR $L$ccm64_dec2_loop -ALIGN 16 -$L$ccm64_dec2_loop: -DB 102,15,56,220,209 -DB 102,15,56,220,217 - movups xmm1,XMMWORD[rax*1+rcx] - add rax,32 -DB 102,15,56,220,208 -DB 102,15,56,220,216 - movups xmm0,XMMWORD[((-16))+rax*1+rcx] - jnz NEAR $L$ccm64_dec2_loop - movups xmm8,XMMWORD[rdi] - paddq xmm6,xmm9 -DB 102,15,56,220,209 -DB 102,15,56,220,217 -DB 102,15,56,221,208 -DB 102,15,56,221,216 - lea rdi,[16+rdi] - jmp NEAR $L$ccm64_dec_outer - -ALIGN 16 -$L$ccm64_dec_break: - - mov eax,DWORD[240+r11] - movups xmm0,XMMWORD[r11] - movups xmm1,XMMWORD[16+r11] - xorps xmm8,xmm0 - lea r11,[32+r11] - xorps xmm3,xmm8 -$L$oop_enc1_6: -DB 
102,15,56,220,217 - dec eax - movups xmm1,XMMWORD[r11] - lea r11,[16+r11] - jnz NEAR $L$oop_enc1_6 -DB 102,15,56,221,217 - pxor xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - movups XMMWORD[r9],xmm3 - pxor xmm3,xmm3 - pxor xmm8,xmm8 - pxor xmm6,xmm6 - movaps xmm6,XMMWORD[rsp] - movaps XMMWORD[rsp],xmm0 - movaps xmm7,XMMWORD[16+rsp] - movaps XMMWORD[16+rsp],xmm0 - movaps xmm8,XMMWORD[32+rsp] - movaps XMMWORD[32+rsp],xmm0 - movaps xmm9,XMMWORD[48+rsp] - movaps XMMWORD[48+rsp],xmm0 - lea rsp,[88+rsp] -$L$ccm64_dec_ret: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret -$L$SEH_end_aesni_ccm64_decrypt_blocks: -global aesni_ctr32_encrypt_blocks - -ALIGN 16 -aesni_ctr32_encrypt_blocks: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_aesni_ctr32_encrypt_blocks: +$L$SEH_begin_aes_hw_ctr32_encrypt_blocks: mov rdi,rcx mov rsi,rdx mov rdx,r8 @@ -1100,6 +917,10 @@ $L$SEH_begin_aesni_ctr32_encrypt_blocks: mov r8,QWORD[40+rsp] + +%ifdef BORINGSSL_DISPATCH_TEST + mov BYTE[BORINGSSL_function_hit],1 +%endif cmp rdx,1 jne NEAR $L$ctr32_bulk @@ -1112,12 +933,12 @@ $L$SEH_begin_aesni_ctr32_encrypt_blocks: movups xmm1,XMMWORD[16+rcx] lea rcx,[32+rcx] xorps xmm2,xmm0 -$L$oop_enc1_7: +$L$oop_enc1_5: DB 102,15,56,220,209 dec edx movups xmm1,XMMWORD[rcx] lea rcx,[16+rcx] - jnz NEAR $L$oop_enc1_7 + jnz NEAR $L$oop_enc1_5 DB 102,15,56,221,209 pxor xmm0,xmm0 pxor xmm1,xmm1 @@ -1130,7 +951,9 @@ DB 102,15,56,221,209 ALIGN 16 $L$ctr32_bulk: lea r11,[rsp] + push rbp + sub rsp,288 and rsp,-16 movaps XMMWORD[(-168)+r11],xmm6 @@ -1686,20 +1509,23 @@ $L$ctr32_done: movaps XMMWORD[96+rsp],xmm0 movaps XMMWORD[112+rsp],xmm0 mov rbp,QWORD[((-8))+r11] + lea rsp,[r11] + $L$ctr32_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret -$L$SEH_end_aesni_ctr32_encrypt_blocks: -global aesni_xts_encrypt + +$L$SEH_end_aes_hw_ctr32_encrypt_blocks: +global aes_hw_cbc_encrypt ALIGN 16 -aesni_xts_encrypt: +aes_hw_cbc_encrypt: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp -$L$SEH_begin_aesni_xts_encrypt: +$L$SEH_begin_aes_hw_cbc_encrypt: mov rdi,rcx mov rsi,rdx mov rdx,r8 @@ -1708,1928 +1534,6 @@ $L$SEH_begin_aesni_xts_encrypt: mov r9,QWORD[48+rsp] - lea r11,[rsp] - push rbp - sub rsp,272 - and rsp,-16 - movaps XMMWORD[(-168)+r11],xmm6 - movaps XMMWORD[(-152)+r11],xmm7 - movaps XMMWORD[(-136)+r11],xmm8 - movaps XMMWORD[(-120)+r11],xmm9 - movaps XMMWORD[(-104)+r11],xmm10 - movaps XMMWORD[(-88)+r11],xmm11 - movaps XMMWORD[(-72)+r11],xmm12 - movaps XMMWORD[(-56)+r11],xmm13 - movaps XMMWORD[(-40)+r11],xmm14 - movaps XMMWORD[(-24)+r11],xmm15 -$L$xts_enc_body: - movups xmm2,XMMWORD[r9] - mov eax,DWORD[240+r8] - mov r10d,DWORD[240+rcx] - movups xmm0,XMMWORD[r8] - movups xmm1,XMMWORD[16+r8] - lea r8,[32+r8] - xorps xmm2,xmm0 -$L$oop_enc1_8: -DB 102,15,56,220,209 - dec eax - movups xmm1,XMMWORD[r8] - lea r8,[16+r8] - jnz NEAR $L$oop_enc1_8 -DB 102,15,56,221,209 - movups xmm0,XMMWORD[rcx] - mov rbp,rcx - mov eax,r10d - shl r10d,4 - mov r9,rdx - and rdx,-16 - - movups xmm1,XMMWORD[16+r10*1+rcx] - - movdqa xmm8,XMMWORD[$L$xts_magic] - movdqa xmm15,xmm2 - pshufd xmm9,xmm2,0x5f - pxor xmm1,xmm0 - movdqa xmm14,xmm9 - paddd xmm9,xmm9 - movdqa xmm10,xmm15 - psrad xmm14,31 - paddq xmm15,xmm15 - pand xmm14,xmm8 - pxor xmm10,xmm0 - pxor xmm15,xmm14 - movdqa xmm14,xmm9 - paddd xmm9,xmm9 - movdqa xmm11,xmm15 - psrad xmm14,31 - paddq xmm15,xmm15 - pand xmm14,xmm8 - pxor xmm11,xmm0 - pxor xmm15,xmm14 - movdqa xmm14,xmm9 - paddd 
xmm9,xmm9 - movdqa xmm12,xmm15 - psrad xmm14,31 - paddq xmm15,xmm15 - pand xmm14,xmm8 - pxor xmm12,xmm0 - pxor xmm15,xmm14 - movdqa xmm14,xmm9 - paddd xmm9,xmm9 - movdqa xmm13,xmm15 - psrad xmm14,31 - paddq xmm15,xmm15 - pand xmm14,xmm8 - pxor xmm13,xmm0 - pxor xmm15,xmm14 - movdqa xmm14,xmm15 - psrad xmm9,31 - paddq xmm15,xmm15 - pand xmm9,xmm8 - pxor xmm14,xmm0 - pxor xmm15,xmm9 - movaps XMMWORD[96+rsp],xmm1 - - sub rdx,16*6 - jc NEAR $L$xts_enc_short - - mov eax,16+96 - lea rcx,[32+r10*1+rbp] - sub rax,r10 - movups xmm1,XMMWORD[16+rbp] - mov r10,rax - lea r8,[$L$xts_magic] - jmp NEAR $L$xts_enc_grandloop - -ALIGN 32 -$L$xts_enc_grandloop: - movdqu xmm2,XMMWORD[rdi] - movdqa xmm8,xmm0 - movdqu xmm3,XMMWORD[16+rdi] - pxor xmm2,xmm10 - movdqu xmm4,XMMWORD[32+rdi] - pxor xmm3,xmm11 -DB 102,15,56,220,209 - movdqu xmm5,XMMWORD[48+rdi] - pxor xmm4,xmm12 -DB 102,15,56,220,217 - movdqu xmm6,XMMWORD[64+rdi] - pxor xmm5,xmm13 -DB 102,15,56,220,225 - movdqu xmm7,XMMWORD[80+rdi] - pxor xmm8,xmm15 - movdqa xmm9,XMMWORD[96+rsp] - pxor xmm6,xmm14 -DB 102,15,56,220,233 - movups xmm0,XMMWORD[32+rbp] - lea rdi,[96+rdi] - pxor xmm7,xmm8 - - pxor xmm10,xmm9 -DB 102,15,56,220,241 - pxor xmm11,xmm9 - movdqa XMMWORD[rsp],xmm10 -DB 102,15,56,220,249 - movups xmm1,XMMWORD[48+rbp] - pxor xmm12,xmm9 - -DB 102,15,56,220,208 - pxor xmm13,xmm9 - movdqa XMMWORD[16+rsp],xmm11 -DB 102,15,56,220,216 - pxor xmm14,xmm9 - movdqa XMMWORD[32+rsp],xmm12 -DB 102,15,56,220,224 -DB 102,15,56,220,232 - pxor xmm8,xmm9 - movdqa XMMWORD[64+rsp],xmm14 -DB 102,15,56,220,240 -DB 102,15,56,220,248 - movups xmm0,XMMWORD[64+rbp] - movdqa XMMWORD[80+rsp],xmm8 - pshufd xmm9,xmm15,0x5f - jmp NEAR $L$xts_enc_loop6 -ALIGN 32 -$L$xts_enc_loop6: -DB 102,15,56,220,209 -DB 102,15,56,220,217 -DB 102,15,56,220,225 -DB 102,15,56,220,233 -DB 102,15,56,220,241 -DB 102,15,56,220,249 - movups xmm1,XMMWORD[((-64))+rax*1+rcx] - add rax,32 - -DB 102,15,56,220,208 -DB 102,15,56,220,216 -DB 102,15,56,220,224 -DB 102,15,56,220,232 -DB 102,15,56,220,240 -DB 102,15,56,220,248 - movups xmm0,XMMWORD[((-80))+rax*1+rcx] - jnz NEAR $L$xts_enc_loop6 - - movdqa xmm8,XMMWORD[r8] - movdqa xmm14,xmm9 - paddd xmm9,xmm9 -DB 102,15,56,220,209 - paddq xmm15,xmm15 - psrad xmm14,31 -DB 102,15,56,220,217 - pand xmm14,xmm8 - movups xmm10,XMMWORD[rbp] -DB 102,15,56,220,225 -DB 102,15,56,220,233 -DB 102,15,56,220,241 - pxor xmm15,xmm14 - movaps xmm11,xmm10 -DB 102,15,56,220,249 - movups xmm1,XMMWORD[((-64))+rcx] - - movdqa xmm14,xmm9 -DB 102,15,56,220,208 - paddd xmm9,xmm9 - pxor xmm10,xmm15 -DB 102,15,56,220,216 - psrad xmm14,31 - paddq xmm15,xmm15 -DB 102,15,56,220,224 -DB 102,15,56,220,232 - pand xmm14,xmm8 - movaps xmm12,xmm11 -DB 102,15,56,220,240 - pxor xmm15,xmm14 - movdqa xmm14,xmm9 -DB 102,15,56,220,248 - movups xmm0,XMMWORD[((-48))+rcx] - - paddd xmm9,xmm9 -DB 102,15,56,220,209 - pxor xmm11,xmm15 - psrad xmm14,31 -DB 102,15,56,220,217 - paddq xmm15,xmm15 - pand xmm14,xmm8 -DB 102,15,56,220,225 -DB 102,15,56,220,233 - movdqa XMMWORD[48+rsp],xmm13 - pxor xmm15,xmm14 -DB 102,15,56,220,241 - movaps xmm13,xmm12 - movdqa xmm14,xmm9 -DB 102,15,56,220,249 - movups xmm1,XMMWORD[((-32))+rcx] - - paddd xmm9,xmm9 -DB 102,15,56,220,208 - pxor xmm12,xmm15 - psrad xmm14,31 -DB 102,15,56,220,216 - paddq xmm15,xmm15 - pand xmm14,xmm8 -DB 102,15,56,220,224 -DB 102,15,56,220,232 -DB 102,15,56,220,240 - pxor xmm15,xmm14 - movaps xmm14,xmm13 -DB 102,15,56,220,248 - - movdqa xmm0,xmm9 - paddd xmm9,xmm9 -DB 102,15,56,220,209 - pxor xmm13,xmm15 - psrad xmm0,31 -DB 102,15,56,220,217 - paddq 
xmm15,xmm15 - pand xmm0,xmm8 -DB 102,15,56,220,225 -DB 102,15,56,220,233 - pxor xmm15,xmm0 - movups xmm0,XMMWORD[rbp] -DB 102,15,56,220,241 -DB 102,15,56,220,249 - movups xmm1,XMMWORD[16+rbp] - - pxor xmm14,xmm15 -DB 102,15,56,221,84,36,0 - psrad xmm9,31 - paddq xmm15,xmm15 -DB 102,15,56,221,92,36,16 -DB 102,15,56,221,100,36,32 - pand xmm9,xmm8 - mov rax,r10 -DB 102,15,56,221,108,36,48 -DB 102,15,56,221,116,36,64 -DB 102,15,56,221,124,36,80 - pxor xmm15,xmm9 - - lea rsi,[96+rsi] - movups XMMWORD[(-96)+rsi],xmm2 - movups XMMWORD[(-80)+rsi],xmm3 - movups XMMWORD[(-64)+rsi],xmm4 - movups XMMWORD[(-48)+rsi],xmm5 - movups XMMWORD[(-32)+rsi],xmm6 - movups XMMWORD[(-16)+rsi],xmm7 - sub rdx,16*6 - jnc NEAR $L$xts_enc_grandloop - - mov eax,16+96 - sub eax,r10d - mov rcx,rbp - shr eax,4 - -$L$xts_enc_short: - - mov r10d,eax - pxor xmm10,xmm0 - add rdx,16*6 - jz NEAR $L$xts_enc_done - - pxor xmm11,xmm0 - cmp rdx,0x20 - jb NEAR $L$xts_enc_one - pxor xmm12,xmm0 - je NEAR $L$xts_enc_two - - pxor xmm13,xmm0 - cmp rdx,0x40 - jb NEAR $L$xts_enc_three - pxor xmm14,xmm0 - je NEAR $L$xts_enc_four - - movdqu xmm2,XMMWORD[rdi] - movdqu xmm3,XMMWORD[16+rdi] - movdqu xmm4,XMMWORD[32+rdi] - pxor xmm2,xmm10 - movdqu xmm5,XMMWORD[48+rdi] - pxor xmm3,xmm11 - movdqu xmm6,XMMWORD[64+rdi] - lea rdi,[80+rdi] - pxor xmm4,xmm12 - pxor xmm5,xmm13 - pxor xmm6,xmm14 - pxor xmm7,xmm7 - - call _aesni_encrypt6 - - xorps xmm2,xmm10 - movdqa xmm10,xmm15 - xorps xmm3,xmm11 - xorps xmm4,xmm12 - movdqu XMMWORD[rsi],xmm2 - xorps xmm5,xmm13 - movdqu XMMWORD[16+rsi],xmm3 - xorps xmm6,xmm14 - movdqu XMMWORD[32+rsi],xmm4 - movdqu XMMWORD[48+rsi],xmm5 - movdqu XMMWORD[64+rsi],xmm6 - lea rsi,[80+rsi] - jmp NEAR $L$xts_enc_done - -ALIGN 16 -$L$xts_enc_one: - movups xmm2,XMMWORD[rdi] - lea rdi,[16+rdi] - xorps xmm2,xmm10 - movups xmm0,XMMWORD[rcx] - movups xmm1,XMMWORD[16+rcx] - lea rcx,[32+rcx] - xorps xmm2,xmm0 -$L$oop_enc1_9: -DB 102,15,56,220,209 - dec eax - movups xmm1,XMMWORD[rcx] - lea rcx,[16+rcx] - jnz NEAR $L$oop_enc1_9 -DB 102,15,56,221,209 - xorps xmm2,xmm10 - movdqa xmm10,xmm11 - movups XMMWORD[rsi],xmm2 - lea rsi,[16+rsi] - jmp NEAR $L$xts_enc_done - -ALIGN 16 -$L$xts_enc_two: - movups xmm2,XMMWORD[rdi] - movups xmm3,XMMWORD[16+rdi] - lea rdi,[32+rdi] - xorps xmm2,xmm10 - xorps xmm3,xmm11 - - call _aesni_encrypt2 - - xorps xmm2,xmm10 - movdqa xmm10,xmm12 - xorps xmm3,xmm11 - movups XMMWORD[rsi],xmm2 - movups XMMWORD[16+rsi],xmm3 - lea rsi,[32+rsi] - jmp NEAR $L$xts_enc_done - -ALIGN 16 -$L$xts_enc_three: - movups xmm2,XMMWORD[rdi] - movups xmm3,XMMWORD[16+rdi] - movups xmm4,XMMWORD[32+rdi] - lea rdi,[48+rdi] - xorps xmm2,xmm10 - xorps xmm3,xmm11 - xorps xmm4,xmm12 - - call _aesni_encrypt3 - - xorps xmm2,xmm10 - movdqa xmm10,xmm13 - xorps xmm3,xmm11 - xorps xmm4,xmm12 - movups XMMWORD[rsi],xmm2 - movups XMMWORD[16+rsi],xmm3 - movups XMMWORD[32+rsi],xmm4 - lea rsi,[48+rsi] - jmp NEAR $L$xts_enc_done - -ALIGN 16 -$L$xts_enc_four: - movups xmm2,XMMWORD[rdi] - movups xmm3,XMMWORD[16+rdi] - movups xmm4,XMMWORD[32+rdi] - xorps xmm2,xmm10 - movups xmm5,XMMWORD[48+rdi] - lea rdi,[64+rdi] - xorps xmm3,xmm11 - xorps xmm4,xmm12 - xorps xmm5,xmm13 - - call _aesni_encrypt4 - - pxor xmm2,xmm10 - movdqa xmm10,xmm14 - pxor xmm3,xmm11 - pxor xmm4,xmm12 - movdqu XMMWORD[rsi],xmm2 - pxor xmm5,xmm13 - movdqu XMMWORD[16+rsi],xmm3 - movdqu XMMWORD[32+rsi],xmm4 - movdqu XMMWORD[48+rsi],xmm5 - lea rsi,[64+rsi] - jmp NEAR $L$xts_enc_done - -ALIGN 16 -$L$xts_enc_done: - and r9,15 - jz NEAR $L$xts_enc_ret - mov rdx,r9 - -$L$xts_enc_steal: - movzx 
eax,BYTE[rdi] - movzx ecx,BYTE[((-16))+rsi] - lea rdi,[1+rdi] - mov BYTE[((-16))+rsi],al - mov BYTE[rsi],cl - lea rsi,[1+rsi] - sub rdx,1 - jnz NEAR $L$xts_enc_steal - - sub rsi,r9 - mov rcx,rbp - mov eax,r10d - - movups xmm2,XMMWORD[((-16))+rsi] - xorps xmm2,xmm10 - movups xmm0,XMMWORD[rcx] - movups xmm1,XMMWORD[16+rcx] - lea rcx,[32+rcx] - xorps xmm2,xmm0 -$L$oop_enc1_10: -DB 102,15,56,220,209 - dec eax - movups xmm1,XMMWORD[rcx] - lea rcx,[16+rcx] - jnz NEAR $L$oop_enc1_10 -DB 102,15,56,221,209 - xorps xmm2,xmm10 - movups XMMWORD[(-16)+rsi],xmm2 - -$L$xts_enc_ret: - xorps xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - pxor xmm3,xmm3 - pxor xmm4,xmm4 - pxor xmm5,xmm5 - movaps xmm6,XMMWORD[((-168))+r11] - movaps XMMWORD[(-168)+r11],xmm0 - movaps xmm7,XMMWORD[((-152))+r11] - movaps XMMWORD[(-152)+r11],xmm0 - movaps xmm8,XMMWORD[((-136))+r11] - movaps XMMWORD[(-136)+r11],xmm0 - movaps xmm9,XMMWORD[((-120))+r11] - movaps XMMWORD[(-120)+r11],xmm0 - movaps xmm10,XMMWORD[((-104))+r11] - movaps XMMWORD[(-104)+r11],xmm0 - movaps xmm11,XMMWORD[((-88))+r11] - movaps XMMWORD[(-88)+r11],xmm0 - movaps xmm12,XMMWORD[((-72))+r11] - movaps XMMWORD[(-72)+r11],xmm0 - movaps xmm13,XMMWORD[((-56))+r11] - movaps XMMWORD[(-56)+r11],xmm0 - movaps xmm14,XMMWORD[((-40))+r11] - movaps XMMWORD[(-40)+r11],xmm0 - movaps xmm15,XMMWORD[((-24))+r11] - movaps XMMWORD[(-24)+r11],xmm0 - movaps XMMWORD[rsp],xmm0 - movaps XMMWORD[16+rsp],xmm0 - movaps XMMWORD[32+rsp],xmm0 - movaps XMMWORD[48+rsp],xmm0 - movaps XMMWORD[64+rsp],xmm0 - movaps XMMWORD[80+rsp],xmm0 - movaps XMMWORD[96+rsp],xmm0 - mov rbp,QWORD[((-8))+r11] - lea rsp,[r11] -$L$xts_enc_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret -$L$SEH_end_aesni_xts_encrypt: -global aesni_xts_decrypt - -ALIGN 16 -aesni_xts_decrypt: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_aesni_xts_decrypt: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - mov r9,QWORD[48+rsp] - - - lea r11,[rsp] - push rbp - sub rsp,272 - and rsp,-16 - movaps XMMWORD[(-168)+r11],xmm6 - movaps XMMWORD[(-152)+r11],xmm7 - movaps XMMWORD[(-136)+r11],xmm8 - movaps XMMWORD[(-120)+r11],xmm9 - movaps XMMWORD[(-104)+r11],xmm10 - movaps XMMWORD[(-88)+r11],xmm11 - movaps XMMWORD[(-72)+r11],xmm12 - movaps XMMWORD[(-56)+r11],xmm13 - movaps XMMWORD[(-40)+r11],xmm14 - movaps XMMWORD[(-24)+r11],xmm15 -$L$xts_dec_body: - movups xmm2,XMMWORD[r9] - mov eax,DWORD[240+r8] - mov r10d,DWORD[240+rcx] - movups xmm0,XMMWORD[r8] - movups xmm1,XMMWORD[16+r8] - lea r8,[32+r8] - xorps xmm2,xmm0 -$L$oop_enc1_11: -DB 102,15,56,220,209 - dec eax - movups xmm1,XMMWORD[r8] - lea r8,[16+r8] - jnz NEAR $L$oop_enc1_11 -DB 102,15,56,221,209 - xor eax,eax - test rdx,15 - setnz al - shl rax,4 - sub rdx,rax - - movups xmm0,XMMWORD[rcx] - mov rbp,rcx - mov eax,r10d - shl r10d,4 - mov r9,rdx - and rdx,-16 - - movups xmm1,XMMWORD[16+r10*1+rcx] - - movdqa xmm8,XMMWORD[$L$xts_magic] - movdqa xmm15,xmm2 - pshufd xmm9,xmm2,0x5f - pxor xmm1,xmm0 - movdqa xmm14,xmm9 - paddd xmm9,xmm9 - movdqa xmm10,xmm15 - psrad xmm14,31 - paddq xmm15,xmm15 - pand xmm14,xmm8 - pxor xmm10,xmm0 - pxor xmm15,xmm14 - movdqa xmm14,xmm9 - paddd xmm9,xmm9 - movdqa xmm11,xmm15 - psrad xmm14,31 - paddq xmm15,xmm15 - pand xmm14,xmm8 - pxor xmm11,xmm0 - pxor xmm15,xmm14 - movdqa xmm14,xmm9 - paddd xmm9,xmm9 - movdqa xmm12,xmm15 - psrad xmm14,31 - paddq xmm15,xmm15 - pand xmm14,xmm8 - pxor xmm12,xmm0 - pxor xmm15,xmm14 - movdqa xmm14,xmm9 - paddd xmm9,xmm9 - movdqa 
xmm13,xmm15 - psrad xmm14,31 - paddq xmm15,xmm15 - pand xmm14,xmm8 - pxor xmm13,xmm0 - pxor xmm15,xmm14 - movdqa xmm14,xmm15 - psrad xmm9,31 - paddq xmm15,xmm15 - pand xmm9,xmm8 - pxor xmm14,xmm0 - pxor xmm15,xmm9 - movaps XMMWORD[96+rsp],xmm1 - - sub rdx,16*6 - jc NEAR $L$xts_dec_short - - mov eax,16+96 - lea rcx,[32+r10*1+rbp] - sub rax,r10 - movups xmm1,XMMWORD[16+rbp] - mov r10,rax - lea r8,[$L$xts_magic] - jmp NEAR $L$xts_dec_grandloop - -ALIGN 32 -$L$xts_dec_grandloop: - movdqu xmm2,XMMWORD[rdi] - movdqa xmm8,xmm0 - movdqu xmm3,XMMWORD[16+rdi] - pxor xmm2,xmm10 - movdqu xmm4,XMMWORD[32+rdi] - pxor xmm3,xmm11 -DB 102,15,56,222,209 - movdqu xmm5,XMMWORD[48+rdi] - pxor xmm4,xmm12 -DB 102,15,56,222,217 - movdqu xmm6,XMMWORD[64+rdi] - pxor xmm5,xmm13 -DB 102,15,56,222,225 - movdqu xmm7,XMMWORD[80+rdi] - pxor xmm8,xmm15 - movdqa xmm9,XMMWORD[96+rsp] - pxor xmm6,xmm14 -DB 102,15,56,222,233 - movups xmm0,XMMWORD[32+rbp] - lea rdi,[96+rdi] - pxor xmm7,xmm8 - - pxor xmm10,xmm9 -DB 102,15,56,222,241 - pxor xmm11,xmm9 - movdqa XMMWORD[rsp],xmm10 -DB 102,15,56,222,249 - movups xmm1,XMMWORD[48+rbp] - pxor xmm12,xmm9 - -DB 102,15,56,222,208 - pxor xmm13,xmm9 - movdqa XMMWORD[16+rsp],xmm11 -DB 102,15,56,222,216 - pxor xmm14,xmm9 - movdqa XMMWORD[32+rsp],xmm12 -DB 102,15,56,222,224 -DB 102,15,56,222,232 - pxor xmm8,xmm9 - movdqa XMMWORD[64+rsp],xmm14 -DB 102,15,56,222,240 -DB 102,15,56,222,248 - movups xmm0,XMMWORD[64+rbp] - movdqa XMMWORD[80+rsp],xmm8 - pshufd xmm9,xmm15,0x5f - jmp NEAR $L$xts_dec_loop6 -ALIGN 32 -$L$xts_dec_loop6: -DB 102,15,56,222,209 -DB 102,15,56,222,217 -DB 102,15,56,222,225 -DB 102,15,56,222,233 -DB 102,15,56,222,241 -DB 102,15,56,222,249 - movups xmm1,XMMWORD[((-64))+rax*1+rcx] - add rax,32 - -DB 102,15,56,222,208 -DB 102,15,56,222,216 -DB 102,15,56,222,224 -DB 102,15,56,222,232 -DB 102,15,56,222,240 -DB 102,15,56,222,248 - movups xmm0,XMMWORD[((-80))+rax*1+rcx] - jnz NEAR $L$xts_dec_loop6 - - movdqa xmm8,XMMWORD[r8] - movdqa xmm14,xmm9 - paddd xmm9,xmm9 -DB 102,15,56,222,209 - paddq xmm15,xmm15 - psrad xmm14,31 -DB 102,15,56,222,217 - pand xmm14,xmm8 - movups xmm10,XMMWORD[rbp] -DB 102,15,56,222,225 -DB 102,15,56,222,233 -DB 102,15,56,222,241 - pxor xmm15,xmm14 - movaps xmm11,xmm10 -DB 102,15,56,222,249 - movups xmm1,XMMWORD[((-64))+rcx] - - movdqa xmm14,xmm9 -DB 102,15,56,222,208 - paddd xmm9,xmm9 - pxor xmm10,xmm15 -DB 102,15,56,222,216 - psrad xmm14,31 - paddq xmm15,xmm15 -DB 102,15,56,222,224 -DB 102,15,56,222,232 - pand xmm14,xmm8 - movaps xmm12,xmm11 -DB 102,15,56,222,240 - pxor xmm15,xmm14 - movdqa xmm14,xmm9 -DB 102,15,56,222,248 - movups xmm0,XMMWORD[((-48))+rcx] - - paddd xmm9,xmm9 -DB 102,15,56,222,209 - pxor xmm11,xmm15 - psrad xmm14,31 -DB 102,15,56,222,217 - paddq xmm15,xmm15 - pand xmm14,xmm8 -DB 102,15,56,222,225 -DB 102,15,56,222,233 - movdqa XMMWORD[48+rsp],xmm13 - pxor xmm15,xmm14 -DB 102,15,56,222,241 - movaps xmm13,xmm12 - movdqa xmm14,xmm9 -DB 102,15,56,222,249 - movups xmm1,XMMWORD[((-32))+rcx] - - paddd xmm9,xmm9 -DB 102,15,56,222,208 - pxor xmm12,xmm15 - psrad xmm14,31 -DB 102,15,56,222,216 - paddq xmm15,xmm15 - pand xmm14,xmm8 -DB 102,15,56,222,224 -DB 102,15,56,222,232 -DB 102,15,56,222,240 - pxor xmm15,xmm14 - movaps xmm14,xmm13 -DB 102,15,56,222,248 - - movdqa xmm0,xmm9 - paddd xmm9,xmm9 -DB 102,15,56,222,209 - pxor xmm13,xmm15 - psrad xmm0,31 -DB 102,15,56,222,217 - paddq xmm15,xmm15 - pand xmm0,xmm8 -DB 102,15,56,222,225 -DB 102,15,56,222,233 - pxor xmm15,xmm0 - movups xmm0,XMMWORD[rbp] -DB 102,15,56,222,241 -DB 102,15,56,222,249 - movups 
xmm1,XMMWORD[16+rbp] - - pxor xmm14,xmm15 -DB 102,15,56,223,84,36,0 - psrad xmm9,31 - paddq xmm15,xmm15 -DB 102,15,56,223,92,36,16 -DB 102,15,56,223,100,36,32 - pand xmm9,xmm8 - mov rax,r10 -DB 102,15,56,223,108,36,48 -DB 102,15,56,223,116,36,64 -DB 102,15,56,223,124,36,80 - pxor xmm15,xmm9 - - lea rsi,[96+rsi] - movups XMMWORD[(-96)+rsi],xmm2 - movups XMMWORD[(-80)+rsi],xmm3 - movups XMMWORD[(-64)+rsi],xmm4 - movups XMMWORD[(-48)+rsi],xmm5 - movups XMMWORD[(-32)+rsi],xmm6 - movups XMMWORD[(-16)+rsi],xmm7 - sub rdx,16*6 - jnc NEAR $L$xts_dec_grandloop - - mov eax,16+96 - sub eax,r10d - mov rcx,rbp - shr eax,4 - -$L$xts_dec_short: - - mov r10d,eax - pxor xmm10,xmm0 - pxor xmm11,xmm0 - add rdx,16*6 - jz NEAR $L$xts_dec_done - - pxor xmm12,xmm0 - cmp rdx,0x20 - jb NEAR $L$xts_dec_one - pxor xmm13,xmm0 - je NEAR $L$xts_dec_two - - pxor xmm14,xmm0 - cmp rdx,0x40 - jb NEAR $L$xts_dec_three - je NEAR $L$xts_dec_four - - movdqu xmm2,XMMWORD[rdi] - movdqu xmm3,XMMWORD[16+rdi] - movdqu xmm4,XMMWORD[32+rdi] - pxor xmm2,xmm10 - movdqu xmm5,XMMWORD[48+rdi] - pxor xmm3,xmm11 - movdqu xmm6,XMMWORD[64+rdi] - lea rdi,[80+rdi] - pxor xmm4,xmm12 - pxor xmm5,xmm13 - pxor xmm6,xmm14 - - call _aesni_decrypt6 - - xorps xmm2,xmm10 - xorps xmm3,xmm11 - xorps xmm4,xmm12 - movdqu XMMWORD[rsi],xmm2 - xorps xmm5,xmm13 - movdqu XMMWORD[16+rsi],xmm3 - xorps xmm6,xmm14 - movdqu XMMWORD[32+rsi],xmm4 - pxor xmm14,xmm14 - movdqu XMMWORD[48+rsi],xmm5 - pcmpgtd xmm14,xmm15 - movdqu XMMWORD[64+rsi],xmm6 - lea rsi,[80+rsi] - pshufd xmm11,xmm14,0x13 - and r9,15 - jz NEAR $L$xts_dec_ret - - movdqa xmm10,xmm15 - paddq xmm15,xmm15 - pand xmm11,xmm8 - pxor xmm11,xmm15 - jmp NEAR $L$xts_dec_done2 - -ALIGN 16 -$L$xts_dec_one: - movups xmm2,XMMWORD[rdi] - lea rdi,[16+rdi] - xorps xmm2,xmm10 - movups xmm0,XMMWORD[rcx] - movups xmm1,XMMWORD[16+rcx] - lea rcx,[32+rcx] - xorps xmm2,xmm0 -$L$oop_dec1_12: -DB 102,15,56,222,209 - dec eax - movups xmm1,XMMWORD[rcx] - lea rcx,[16+rcx] - jnz NEAR $L$oop_dec1_12 -DB 102,15,56,223,209 - xorps xmm2,xmm10 - movdqa xmm10,xmm11 - movups XMMWORD[rsi],xmm2 - movdqa xmm11,xmm12 - lea rsi,[16+rsi] - jmp NEAR $L$xts_dec_done - -ALIGN 16 -$L$xts_dec_two: - movups xmm2,XMMWORD[rdi] - movups xmm3,XMMWORD[16+rdi] - lea rdi,[32+rdi] - xorps xmm2,xmm10 - xorps xmm3,xmm11 - - call _aesni_decrypt2 - - xorps xmm2,xmm10 - movdqa xmm10,xmm12 - xorps xmm3,xmm11 - movdqa xmm11,xmm13 - movups XMMWORD[rsi],xmm2 - movups XMMWORD[16+rsi],xmm3 - lea rsi,[32+rsi] - jmp NEAR $L$xts_dec_done - -ALIGN 16 -$L$xts_dec_three: - movups xmm2,XMMWORD[rdi] - movups xmm3,XMMWORD[16+rdi] - movups xmm4,XMMWORD[32+rdi] - lea rdi,[48+rdi] - xorps xmm2,xmm10 - xorps xmm3,xmm11 - xorps xmm4,xmm12 - - call _aesni_decrypt3 - - xorps xmm2,xmm10 - movdqa xmm10,xmm13 - xorps xmm3,xmm11 - movdqa xmm11,xmm14 - xorps xmm4,xmm12 - movups XMMWORD[rsi],xmm2 - movups XMMWORD[16+rsi],xmm3 - movups XMMWORD[32+rsi],xmm4 - lea rsi,[48+rsi] - jmp NEAR $L$xts_dec_done - -ALIGN 16 -$L$xts_dec_four: - movups xmm2,XMMWORD[rdi] - movups xmm3,XMMWORD[16+rdi] - movups xmm4,XMMWORD[32+rdi] - xorps xmm2,xmm10 - movups xmm5,XMMWORD[48+rdi] - lea rdi,[64+rdi] - xorps xmm3,xmm11 - xorps xmm4,xmm12 - xorps xmm5,xmm13 - - call _aesni_decrypt4 - - pxor xmm2,xmm10 - movdqa xmm10,xmm14 - pxor xmm3,xmm11 - movdqa xmm11,xmm15 - pxor xmm4,xmm12 - movdqu XMMWORD[rsi],xmm2 - pxor xmm5,xmm13 - movdqu XMMWORD[16+rsi],xmm3 - movdqu XMMWORD[32+rsi],xmm4 - movdqu XMMWORD[48+rsi],xmm5 - lea rsi,[64+rsi] - jmp NEAR $L$xts_dec_done - -ALIGN 16 -$L$xts_dec_done: - and r9,15 - jz NEAR 
$L$xts_dec_ret -$L$xts_dec_done2: - mov rdx,r9 - mov rcx,rbp - mov eax,r10d - - movups xmm2,XMMWORD[rdi] - xorps xmm2,xmm11 - movups xmm0,XMMWORD[rcx] - movups xmm1,XMMWORD[16+rcx] - lea rcx,[32+rcx] - xorps xmm2,xmm0 -$L$oop_dec1_13: -DB 102,15,56,222,209 - dec eax - movups xmm1,XMMWORD[rcx] - lea rcx,[16+rcx] - jnz NEAR $L$oop_dec1_13 -DB 102,15,56,223,209 - xorps xmm2,xmm11 - movups XMMWORD[rsi],xmm2 - -$L$xts_dec_steal: - movzx eax,BYTE[16+rdi] - movzx ecx,BYTE[rsi] - lea rdi,[1+rdi] - mov BYTE[rsi],al - mov BYTE[16+rsi],cl - lea rsi,[1+rsi] - sub rdx,1 - jnz NEAR $L$xts_dec_steal - - sub rsi,r9 - mov rcx,rbp - mov eax,r10d - - movups xmm2,XMMWORD[rsi] - xorps xmm2,xmm10 - movups xmm0,XMMWORD[rcx] - movups xmm1,XMMWORD[16+rcx] - lea rcx,[32+rcx] - xorps xmm2,xmm0 -$L$oop_dec1_14: -DB 102,15,56,222,209 - dec eax - movups xmm1,XMMWORD[rcx] - lea rcx,[16+rcx] - jnz NEAR $L$oop_dec1_14 -DB 102,15,56,223,209 - xorps xmm2,xmm10 - movups XMMWORD[rsi],xmm2 - -$L$xts_dec_ret: - xorps xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - pxor xmm3,xmm3 - pxor xmm4,xmm4 - pxor xmm5,xmm5 - movaps xmm6,XMMWORD[((-168))+r11] - movaps XMMWORD[(-168)+r11],xmm0 - movaps xmm7,XMMWORD[((-152))+r11] - movaps XMMWORD[(-152)+r11],xmm0 - movaps xmm8,XMMWORD[((-136))+r11] - movaps XMMWORD[(-136)+r11],xmm0 - movaps xmm9,XMMWORD[((-120))+r11] - movaps XMMWORD[(-120)+r11],xmm0 - movaps xmm10,XMMWORD[((-104))+r11] - movaps XMMWORD[(-104)+r11],xmm0 - movaps xmm11,XMMWORD[((-88))+r11] - movaps XMMWORD[(-88)+r11],xmm0 - movaps xmm12,XMMWORD[((-72))+r11] - movaps XMMWORD[(-72)+r11],xmm0 - movaps xmm13,XMMWORD[((-56))+r11] - movaps XMMWORD[(-56)+r11],xmm0 - movaps xmm14,XMMWORD[((-40))+r11] - movaps XMMWORD[(-40)+r11],xmm0 - movaps xmm15,XMMWORD[((-24))+r11] - movaps XMMWORD[(-24)+r11],xmm0 - movaps XMMWORD[rsp],xmm0 - movaps XMMWORD[16+rsp],xmm0 - movaps XMMWORD[32+rsp],xmm0 - movaps XMMWORD[48+rsp],xmm0 - movaps XMMWORD[64+rsp],xmm0 - movaps XMMWORD[80+rsp],xmm0 - movaps XMMWORD[96+rsp],xmm0 - mov rbp,QWORD[((-8))+r11] - lea rsp,[r11] -$L$xts_dec_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret -$L$SEH_end_aesni_xts_decrypt: -global aesni_ocb_encrypt - -ALIGN 32 -aesni_ocb_encrypt: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_aesni_ocb_encrypt: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - mov r9,QWORD[48+rsp] - - - lea rax,[rsp] - push rbx - push rbp - push r12 - push r13 - push r14 - lea rsp,[((-160))+rsp] - movaps XMMWORD[rsp],xmm6 - movaps XMMWORD[16+rsp],xmm7 - movaps XMMWORD[32+rsp],xmm8 - movaps XMMWORD[48+rsp],xmm9 - movaps XMMWORD[64+rsp],xmm10 - movaps XMMWORD[80+rsp],xmm11 - movaps XMMWORD[96+rsp],xmm12 - movaps XMMWORD[112+rsp],xmm13 - movaps XMMWORD[128+rsp],xmm14 - movaps XMMWORD[144+rsp],xmm15 -$L$ocb_enc_body: - mov rbx,QWORD[56+rax] - mov rbp,QWORD[((56+8))+rax] - - mov r10d,DWORD[240+rcx] - mov r11,rcx - shl r10d,4 - movups xmm9,XMMWORD[rcx] - movups xmm1,XMMWORD[16+r10*1+rcx] - - movdqu xmm15,XMMWORD[r9] - pxor xmm9,xmm1 - pxor xmm15,xmm1 - - mov eax,16+32 - lea rcx,[32+r10*1+r11] - movups xmm1,XMMWORD[16+r11] - sub rax,r10 - mov r10,rax - - movdqu xmm10,XMMWORD[rbx] - movdqu xmm8,XMMWORD[rbp] - - test r8,1 - jnz NEAR $L$ocb_enc_odd - - bsf r12,r8 - add r8,1 - shl r12,4 - movdqu xmm7,XMMWORD[r12*1+rbx] - movdqu xmm2,XMMWORD[rdi] - lea rdi,[16+rdi] - - call __ocb_encrypt1 - - movdqa xmm15,xmm7 - movups XMMWORD[rsi],xmm2 - lea rsi,[16+rsi] - sub rdx,1 - jz NEAR $L$ocb_enc_done - 
-$L$ocb_enc_odd: - lea r12,[1+r8] - lea r13,[3+r8] - lea r14,[5+r8] - lea r8,[6+r8] - bsf r12,r12 - bsf r13,r13 - bsf r14,r14 - shl r12,4 - shl r13,4 - shl r14,4 - - sub rdx,6 - jc NEAR $L$ocb_enc_short - jmp NEAR $L$ocb_enc_grandloop - -ALIGN 32 -$L$ocb_enc_grandloop: - movdqu xmm2,XMMWORD[rdi] - movdqu xmm3,XMMWORD[16+rdi] - movdqu xmm4,XMMWORD[32+rdi] - movdqu xmm5,XMMWORD[48+rdi] - movdqu xmm6,XMMWORD[64+rdi] - movdqu xmm7,XMMWORD[80+rdi] - lea rdi,[96+rdi] - - call __ocb_encrypt6 - - movups XMMWORD[rsi],xmm2 - movups XMMWORD[16+rsi],xmm3 - movups XMMWORD[32+rsi],xmm4 - movups XMMWORD[48+rsi],xmm5 - movups XMMWORD[64+rsi],xmm6 - movups XMMWORD[80+rsi],xmm7 - lea rsi,[96+rsi] - sub rdx,6 - jnc NEAR $L$ocb_enc_grandloop - -$L$ocb_enc_short: - add rdx,6 - jz NEAR $L$ocb_enc_done - - movdqu xmm2,XMMWORD[rdi] - cmp rdx,2 - jb NEAR $L$ocb_enc_one - movdqu xmm3,XMMWORD[16+rdi] - je NEAR $L$ocb_enc_two - - movdqu xmm4,XMMWORD[32+rdi] - cmp rdx,4 - jb NEAR $L$ocb_enc_three - movdqu xmm5,XMMWORD[48+rdi] - je NEAR $L$ocb_enc_four - - movdqu xmm6,XMMWORD[64+rdi] - pxor xmm7,xmm7 - - call __ocb_encrypt6 - - movdqa xmm15,xmm14 - movups XMMWORD[rsi],xmm2 - movups XMMWORD[16+rsi],xmm3 - movups XMMWORD[32+rsi],xmm4 - movups XMMWORD[48+rsi],xmm5 - movups XMMWORD[64+rsi],xmm6 - - jmp NEAR $L$ocb_enc_done - -ALIGN 16 -$L$ocb_enc_one: - movdqa xmm7,xmm10 - - call __ocb_encrypt1 - - movdqa xmm15,xmm7 - movups XMMWORD[rsi],xmm2 - jmp NEAR $L$ocb_enc_done - -ALIGN 16 -$L$ocb_enc_two: - pxor xmm4,xmm4 - pxor xmm5,xmm5 - - call __ocb_encrypt4 - - movdqa xmm15,xmm11 - movups XMMWORD[rsi],xmm2 - movups XMMWORD[16+rsi],xmm3 - - jmp NEAR $L$ocb_enc_done - -ALIGN 16 -$L$ocb_enc_three: - pxor xmm5,xmm5 - - call __ocb_encrypt4 - - movdqa xmm15,xmm12 - movups XMMWORD[rsi],xmm2 - movups XMMWORD[16+rsi],xmm3 - movups XMMWORD[32+rsi],xmm4 - - jmp NEAR $L$ocb_enc_done - -ALIGN 16 -$L$ocb_enc_four: - call __ocb_encrypt4 - - movdqa xmm15,xmm13 - movups XMMWORD[rsi],xmm2 - movups XMMWORD[16+rsi],xmm3 - movups XMMWORD[32+rsi],xmm4 - movups XMMWORD[48+rsi],xmm5 - -$L$ocb_enc_done: - pxor xmm15,xmm0 - movdqu XMMWORD[rbp],xmm8 - movdqu XMMWORD[r9],xmm15 - - xorps xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - pxor xmm3,xmm3 - pxor xmm4,xmm4 - pxor xmm5,xmm5 - movaps xmm6,XMMWORD[rsp] - movaps XMMWORD[rsp],xmm0 - movaps xmm7,XMMWORD[16+rsp] - movaps XMMWORD[16+rsp],xmm0 - movaps xmm8,XMMWORD[32+rsp] - movaps XMMWORD[32+rsp],xmm0 - movaps xmm9,XMMWORD[48+rsp] - movaps XMMWORD[48+rsp],xmm0 - movaps xmm10,XMMWORD[64+rsp] - movaps XMMWORD[64+rsp],xmm0 - movaps xmm11,XMMWORD[80+rsp] - movaps XMMWORD[80+rsp],xmm0 - movaps xmm12,XMMWORD[96+rsp] - movaps XMMWORD[96+rsp],xmm0 - movaps xmm13,XMMWORD[112+rsp] - movaps XMMWORD[112+rsp],xmm0 - movaps xmm14,XMMWORD[128+rsp] - movaps XMMWORD[128+rsp],xmm0 - movaps xmm15,XMMWORD[144+rsp] - movaps XMMWORD[144+rsp],xmm0 - lea rax,[((160+40))+rsp] -$L$ocb_enc_pop: - mov r14,QWORD[((-40))+rax] - mov r13,QWORD[((-32))+rax] - mov r12,QWORD[((-24))+rax] - mov rbp,QWORD[((-16))+rax] - mov rbx,QWORD[((-8))+rax] - lea rsp,[rax] -$L$ocb_enc_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret -$L$SEH_end_aesni_ocb_encrypt: - - -ALIGN 32 -__ocb_encrypt6: - pxor xmm15,xmm9 - movdqu xmm11,XMMWORD[r12*1+rbx] - movdqa xmm12,xmm10 - movdqu xmm13,XMMWORD[r13*1+rbx] - movdqa xmm14,xmm10 - pxor xmm10,xmm15 - movdqu xmm15,XMMWORD[r14*1+rbx] - pxor xmm11,xmm10 - pxor xmm8,xmm2 - pxor xmm2,xmm10 - pxor xmm12,xmm11 - pxor xmm8,xmm3 - pxor xmm3,xmm11 - pxor xmm13,xmm12 - pxor 
xmm8,xmm4 - pxor xmm4,xmm12 - pxor xmm14,xmm13 - pxor xmm8,xmm5 - pxor xmm5,xmm13 - pxor xmm15,xmm14 - pxor xmm8,xmm6 - pxor xmm6,xmm14 - pxor xmm8,xmm7 - pxor xmm7,xmm15 - movups xmm0,XMMWORD[32+r11] - - lea r12,[1+r8] - lea r13,[3+r8] - lea r14,[5+r8] - add r8,6 - pxor xmm10,xmm9 - bsf r12,r12 - bsf r13,r13 - bsf r14,r14 - -DB 102,15,56,220,209 -DB 102,15,56,220,217 -DB 102,15,56,220,225 -DB 102,15,56,220,233 - pxor xmm11,xmm9 - pxor xmm12,xmm9 -DB 102,15,56,220,241 - pxor xmm13,xmm9 - pxor xmm14,xmm9 -DB 102,15,56,220,249 - movups xmm1,XMMWORD[48+r11] - pxor xmm15,xmm9 - -DB 102,15,56,220,208 -DB 102,15,56,220,216 -DB 102,15,56,220,224 -DB 102,15,56,220,232 -DB 102,15,56,220,240 -DB 102,15,56,220,248 - movups xmm0,XMMWORD[64+r11] - shl r12,4 - shl r13,4 - jmp NEAR $L$ocb_enc_loop6 - -ALIGN 32 -$L$ocb_enc_loop6: -DB 102,15,56,220,209 -DB 102,15,56,220,217 -DB 102,15,56,220,225 -DB 102,15,56,220,233 -DB 102,15,56,220,241 -DB 102,15,56,220,249 - movups xmm1,XMMWORD[rax*1+rcx] - add rax,32 - -DB 102,15,56,220,208 -DB 102,15,56,220,216 -DB 102,15,56,220,224 -DB 102,15,56,220,232 -DB 102,15,56,220,240 -DB 102,15,56,220,248 - movups xmm0,XMMWORD[((-16))+rax*1+rcx] - jnz NEAR $L$ocb_enc_loop6 - -DB 102,15,56,220,209 -DB 102,15,56,220,217 -DB 102,15,56,220,225 -DB 102,15,56,220,233 -DB 102,15,56,220,241 -DB 102,15,56,220,249 - movups xmm1,XMMWORD[16+r11] - shl r14,4 - -DB 102,65,15,56,221,210 - movdqu xmm10,XMMWORD[rbx] - mov rax,r10 -DB 102,65,15,56,221,219 -DB 102,65,15,56,221,228 -DB 102,65,15,56,221,237 -DB 102,65,15,56,221,246 -DB 102,65,15,56,221,255 - DB 0F3h,0C3h ;repret - - - -ALIGN 32 -__ocb_encrypt4: - pxor xmm15,xmm9 - movdqu xmm11,XMMWORD[r12*1+rbx] - movdqa xmm12,xmm10 - movdqu xmm13,XMMWORD[r13*1+rbx] - pxor xmm10,xmm15 - pxor xmm11,xmm10 - pxor xmm8,xmm2 - pxor xmm2,xmm10 - pxor xmm12,xmm11 - pxor xmm8,xmm3 - pxor xmm3,xmm11 - pxor xmm13,xmm12 - pxor xmm8,xmm4 - pxor xmm4,xmm12 - pxor xmm8,xmm5 - pxor xmm5,xmm13 - movups xmm0,XMMWORD[32+r11] - - pxor xmm10,xmm9 - pxor xmm11,xmm9 - pxor xmm12,xmm9 - pxor xmm13,xmm9 - -DB 102,15,56,220,209 -DB 102,15,56,220,217 -DB 102,15,56,220,225 -DB 102,15,56,220,233 - movups xmm1,XMMWORD[48+r11] - -DB 102,15,56,220,208 -DB 102,15,56,220,216 -DB 102,15,56,220,224 -DB 102,15,56,220,232 - movups xmm0,XMMWORD[64+r11] - jmp NEAR $L$ocb_enc_loop4 - -ALIGN 32 -$L$ocb_enc_loop4: -DB 102,15,56,220,209 -DB 102,15,56,220,217 -DB 102,15,56,220,225 -DB 102,15,56,220,233 - movups xmm1,XMMWORD[rax*1+rcx] - add rax,32 - -DB 102,15,56,220,208 -DB 102,15,56,220,216 -DB 102,15,56,220,224 -DB 102,15,56,220,232 - movups xmm0,XMMWORD[((-16))+rax*1+rcx] - jnz NEAR $L$ocb_enc_loop4 - -DB 102,15,56,220,209 -DB 102,15,56,220,217 -DB 102,15,56,220,225 -DB 102,15,56,220,233 - movups xmm1,XMMWORD[16+r11] - mov rax,r10 - -DB 102,65,15,56,221,210 -DB 102,65,15,56,221,219 -DB 102,65,15,56,221,228 -DB 102,65,15,56,221,237 - DB 0F3h,0C3h ;repret - - - -ALIGN 32 -__ocb_encrypt1: - pxor xmm7,xmm15 - pxor xmm7,xmm9 - pxor xmm8,xmm2 - pxor xmm2,xmm7 - movups xmm0,XMMWORD[32+r11] - -DB 102,15,56,220,209 - movups xmm1,XMMWORD[48+r11] - pxor xmm7,xmm9 - -DB 102,15,56,220,208 - movups xmm0,XMMWORD[64+r11] - jmp NEAR $L$ocb_enc_loop1 - -ALIGN 32 -$L$ocb_enc_loop1: -DB 102,15,56,220,209 - movups xmm1,XMMWORD[rax*1+rcx] - add rax,32 - -DB 102,15,56,220,208 - movups xmm0,XMMWORD[((-16))+rax*1+rcx] - jnz NEAR $L$ocb_enc_loop1 - -DB 102,15,56,220,209 - movups xmm1,XMMWORD[16+r11] - mov rax,r10 - -DB 102,15,56,221,215 - DB 0F3h,0C3h ;repret - - -global aesni_ocb_decrypt - -ALIGN 32 
-aesni_ocb_decrypt: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_aesni_ocb_decrypt: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - mov r9,QWORD[48+rsp] - - - lea rax,[rsp] - push rbx - push rbp - push r12 - push r13 - push r14 - lea rsp,[((-160))+rsp] - movaps XMMWORD[rsp],xmm6 - movaps XMMWORD[16+rsp],xmm7 - movaps XMMWORD[32+rsp],xmm8 - movaps XMMWORD[48+rsp],xmm9 - movaps XMMWORD[64+rsp],xmm10 - movaps XMMWORD[80+rsp],xmm11 - movaps XMMWORD[96+rsp],xmm12 - movaps XMMWORD[112+rsp],xmm13 - movaps XMMWORD[128+rsp],xmm14 - movaps XMMWORD[144+rsp],xmm15 -$L$ocb_dec_body: - mov rbx,QWORD[56+rax] - mov rbp,QWORD[((56+8))+rax] - - mov r10d,DWORD[240+rcx] - mov r11,rcx - shl r10d,4 - movups xmm9,XMMWORD[rcx] - movups xmm1,XMMWORD[16+r10*1+rcx] - - movdqu xmm15,XMMWORD[r9] - pxor xmm9,xmm1 - pxor xmm15,xmm1 - - mov eax,16+32 - lea rcx,[32+r10*1+r11] - movups xmm1,XMMWORD[16+r11] - sub rax,r10 - mov r10,rax - - movdqu xmm10,XMMWORD[rbx] - movdqu xmm8,XMMWORD[rbp] - - test r8,1 - jnz NEAR $L$ocb_dec_odd - - bsf r12,r8 - add r8,1 - shl r12,4 - movdqu xmm7,XMMWORD[r12*1+rbx] - movdqu xmm2,XMMWORD[rdi] - lea rdi,[16+rdi] - - call __ocb_decrypt1 - - movdqa xmm15,xmm7 - movups XMMWORD[rsi],xmm2 - xorps xmm8,xmm2 - lea rsi,[16+rsi] - sub rdx,1 - jz NEAR $L$ocb_dec_done - -$L$ocb_dec_odd: - lea r12,[1+r8] - lea r13,[3+r8] - lea r14,[5+r8] - lea r8,[6+r8] - bsf r12,r12 - bsf r13,r13 - bsf r14,r14 - shl r12,4 - shl r13,4 - shl r14,4 - - sub rdx,6 - jc NEAR $L$ocb_dec_short - jmp NEAR $L$ocb_dec_grandloop - -ALIGN 32 -$L$ocb_dec_grandloop: - movdqu xmm2,XMMWORD[rdi] - movdqu xmm3,XMMWORD[16+rdi] - movdqu xmm4,XMMWORD[32+rdi] - movdqu xmm5,XMMWORD[48+rdi] - movdqu xmm6,XMMWORD[64+rdi] - movdqu xmm7,XMMWORD[80+rdi] - lea rdi,[96+rdi] - - call __ocb_decrypt6 - - movups XMMWORD[rsi],xmm2 - pxor xmm8,xmm2 - movups XMMWORD[16+rsi],xmm3 - pxor xmm8,xmm3 - movups XMMWORD[32+rsi],xmm4 - pxor xmm8,xmm4 - movups XMMWORD[48+rsi],xmm5 - pxor xmm8,xmm5 - movups XMMWORD[64+rsi],xmm6 - pxor xmm8,xmm6 - movups XMMWORD[80+rsi],xmm7 - pxor xmm8,xmm7 - lea rsi,[96+rsi] - sub rdx,6 - jnc NEAR $L$ocb_dec_grandloop - -$L$ocb_dec_short: - add rdx,6 - jz NEAR $L$ocb_dec_done - - movdqu xmm2,XMMWORD[rdi] - cmp rdx,2 - jb NEAR $L$ocb_dec_one - movdqu xmm3,XMMWORD[16+rdi] - je NEAR $L$ocb_dec_two - - movdqu xmm4,XMMWORD[32+rdi] - cmp rdx,4 - jb NEAR $L$ocb_dec_three - movdqu xmm5,XMMWORD[48+rdi] - je NEAR $L$ocb_dec_four - - movdqu xmm6,XMMWORD[64+rdi] - pxor xmm7,xmm7 - - call __ocb_decrypt6 - - movdqa xmm15,xmm14 - movups XMMWORD[rsi],xmm2 - pxor xmm8,xmm2 - movups XMMWORD[16+rsi],xmm3 - pxor xmm8,xmm3 - movups XMMWORD[32+rsi],xmm4 - pxor xmm8,xmm4 - movups XMMWORD[48+rsi],xmm5 - pxor xmm8,xmm5 - movups XMMWORD[64+rsi],xmm6 - pxor xmm8,xmm6 - - jmp NEAR $L$ocb_dec_done - -ALIGN 16 -$L$ocb_dec_one: - movdqa xmm7,xmm10 - - call __ocb_decrypt1 - - movdqa xmm15,xmm7 - movups XMMWORD[rsi],xmm2 - xorps xmm8,xmm2 - jmp NEAR $L$ocb_dec_done - -ALIGN 16 -$L$ocb_dec_two: - pxor xmm4,xmm4 - pxor xmm5,xmm5 - - call __ocb_decrypt4 - - movdqa xmm15,xmm11 - movups XMMWORD[rsi],xmm2 - xorps xmm8,xmm2 - movups XMMWORD[16+rsi],xmm3 - xorps xmm8,xmm3 - - jmp NEAR $L$ocb_dec_done - -ALIGN 16 -$L$ocb_dec_three: - pxor xmm5,xmm5 - - call __ocb_decrypt4 - - movdqa xmm15,xmm12 - movups XMMWORD[rsi],xmm2 - xorps xmm8,xmm2 - movups XMMWORD[16+rsi],xmm3 - xorps xmm8,xmm3 - movups XMMWORD[32+rsi],xmm4 - xorps xmm8,xmm4 - - jmp NEAR $L$ocb_dec_done - -ALIGN 16 -$L$ocb_dec_four: - call 
__ocb_decrypt4 - - movdqa xmm15,xmm13 - movups XMMWORD[rsi],xmm2 - pxor xmm8,xmm2 - movups XMMWORD[16+rsi],xmm3 - pxor xmm8,xmm3 - movups XMMWORD[32+rsi],xmm4 - pxor xmm8,xmm4 - movups XMMWORD[48+rsi],xmm5 - pxor xmm8,xmm5 - -$L$ocb_dec_done: - pxor xmm15,xmm0 - movdqu XMMWORD[rbp],xmm8 - movdqu XMMWORD[r9],xmm15 - - xorps xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - pxor xmm3,xmm3 - pxor xmm4,xmm4 - pxor xmm5,xmm5 - movaps xmm6,XMMWORD[rsp] - movaps XMMWORD[rsp],xmm0 - movaps xmm7,XMMWORD[16+rsp] - movaps XMMWORD[16+rsp],xmm0 - movaps xmm8,XMMWORD[32+rsp] - movaps XMMWORD[32+rsp],xmm0 - movaps xmm9,XMMWORD[48+rsp] - movaps XMMWORD[48+rsp],xmm0 - movaps xmm10,XMMWORD[64+rsp] - movaps XMMWORD[64+rsp],xmm0 - movaps xmm11,XMMWORD[80+rsp] - movaps XMMWORD[80+rsp],xmm0 - movaps xmm12,XMMWORD[96+rsp] - movaps XMMWORD[96+rsp],xmm0 - movaps xmm13,XMMWORD[112+rsp] - movaps XMMWORD[112+rsp],xmm0 - movaps xmm14,XMMWORD[128+rsp] - movaps XMMWORD[128+rsp],xmm0 - movaps xmm15,XMMWORD[144+rsp] - movaps XMMWORD[144+rsp],xmm0 - lea rax,[((160+40))+rsp] -$L$ocb_dec_pop: - mov r14,QWORD[((-40))+rax] - mov r13,QWORD[((-32))+rax] - mov r12,QWORD[((-24))+rax] - mov rbp,QWORD[((-16))+rax] - mov rbx,QWORD[((-8))+rax] - lea rsp,[rax] -$L$ocb_dec_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret -$L$SEH_end_aesni_ocb_decrypt: - - -ALIGN 32 -__ocb_decrypt6: - pxor xmm15,xmm9 - movdqu xmm11,XMMWORD[r12*1+rbx] - movdqa xmm12,xmm10 - movdqu xmm13,XMMWORD[r13*1+rbx] - movdqa xmm14,xmm10 - pxor xmm10,xmm15 - movdqu xmm15,XMMWORD[r14*1+rbx] - pxor xmm11,xmm10 - pxor xmm2,xmm10 - pxor xmm12,xmm11 - pxor xmm3,xmm11 - pxor xmm13,xmm12 - pxor xmm4,xmm12 - pxor xmm14,xmm13 - pxor xmm5,xmm13 - pxor xmm15,xmm14 - pxor xmm6,xmm14 - pxor xmm7,xmm15 - movups xmm0,XMMWORD[32+r11] - - lea r12,[1+r8] - lea r13,[3+r8] - lea r14,[5+r8] - add r8,6 - pxor xmm10,xmm9 - bsf r12,r12 - bsf r13,r13 - bsf r14,r14 - -DB 102,15,56,222,209 -DB 102,15,56,222,217 -DB 102,15,56,222,225 -DB 102,15,56,222,233 - pxor xmm11,xmm9 - pxor xmm12,xmm9 -DB 102,15,56,222,241 - pxor xmm13,xmm9 - pxor xmm14,xmm9 -DB 102,15,56,222,249 - movups xmm1,XMMWORD[48+r11] - pxor xmm15,xmm9 - -DB 102,15,56,222,208 -DB 102,15,56,222,216 -DB 102,15,56,222,224 -DB 102,15,56,222,232 -DB 102,15,56,222,240 -DB 102,15,56,222,248 - movups xmm0,XMMWORD[64+r11] - shl r12,4 - shl r13,4 - jmp NEAR $L$ocb_dec_loop6 - -ALIGN 32 -$L$ocb_dec_loop6: -DB 102,15,56,222,209 -DB 102,15,56,222,217 -DB 102,15,56,222,225 -DB 102,15,56,222,233 -DB 102,15,56,222,241 -DB 102,15,56,222,249 - movups xmm1,XMMWORD[rax*1+rcx] - add rax,32 - -DB 102,15,56,222,208 -DB 102,15,56,222,216 -DB 102,15,56,222,224 -DB 102,15,56,222,232 -DB 102,15,56,222,240 -DB 102,15,56,222,248 - movups xmm0,XMMWORD[((-16))+rax*1+rcx] - jnz NEAR $L$ocb_dec_loop6 - -DB 102,15,56,222,209 -DB 102,15,56,222,217 -DB 102,15,56,222,225 -DB 102,15,56,222,233 -DB 102,15,56,222,241 -DB 102,15,56,222,249 - movups xmm1,XMMWORD[16+r11] - shl r14,4 - -DB 102,65,15,56,223,210 - movdqu xmm10,XMMWORD[rbx] - mov rax,r10 -DB 102,65,15,56,223,219 -DB 102,65,15,56,223,228 -DB 102,65,15,56,223,237 -DB 102,65,15,56,223,246 -DB 102,65,15,56,223,255 - DB 0F3h,0C3h ;repret - - - -ALIGN 32 -__ocb_decrypt4: - pxor xmm15,xmm9 - movdqu xmm11,XMMWORD[r12*1+rbx] - movdqa xmm12,xmm10 - movdqu xmm13,XMMWORD[r13*1+rbx] - pxor xmm10,xmm15 - pxor xmm11,xmm10 - pxor xmm2,xmm10 - pxor xmm12,xmm11 - pxor xmm3,xmm11 - pxor xmm13,xmm12 - pxor xmm4,xmm12 - pxor xmm5,xmm13 - movups xmm0,XMMWORD[32+r11] - - pxor xmm10,xmm9 - 
pxor xmm11,xmm9 - pxor xmm12,xmm9 - pxor xmm13,xmm9 - -DB 102,15,56,222,209 -DB 102,15,56,222,217 -DB 102,15,56,222,225 -DB 102,15,56,222,233 - movups xmm1,XMMWORD[48+r11] - -DB 102,15,56,222,208 -DB 102,15,56,222,216 -DB 102,15,56,222,224 -DB 102,15,56,222,232 - movups xmm0,XMMWORD[64+r11] - jmp NEAR $L$ocb_dec_loop4 - -ALIGN 32 -$L$ocb_dec_loop4: -DB 102,15,56,222,209 -DB 102,15,56,222,217 -DB 102,15,56,222,225 -DB 102,15,56,222,233 - movups xmm1,XMMWORD[rax*1+rcx] - add rax,32 - -DB 102,15,56,222,208 -DB 102,15,56,222,216 -DB 102,15,56,222,224 -DB 102,15,56,222,232 - movups xmm0,XMMWORD[((-16))+rax*1+rcx] - jnz NEAR $L$ocb_dec_loop4 - -DB 102,15,56,222,209 -DB 102,15,56,222,217 -DB 102,15,56,222,225 -DB 102,15,56,222,233 - movups xmm1,XMMWORD[16+r11] - mov rax,r10 - -DB 102,65,15,56,223,210 -DB 102,65,15,56,223,219 -DB 102,65,15,56,223,228 -DB 102,65,15,56,223,237 - DB 0F3h,0C3h ;repret - - - -ALIGN 32 -__ocb_decrypt1: - pxor xmm7,xmm15 - pxor xmm7,xmm9 - pxor xmm2,xmm7 - movups xmm0,XMMWORD[32+r11] - -DB 102,15,56,222,209 - movups xmm1,XMMWORD[48+r11] - pxor xmm7,xmm9 - -DB 102,15,56,222,208 - movups xmm0,XMMWORD[64+r11] - jmp NEAR $L$ocb_dec_loop1 - -ALIGN 32 -$L$ocb_dec_loop1: -DB 102,15,56,222,209 - movups xmm1,XMMWORD[rax*1+rcx] - add rax,32 - -DB 102,15,56,222,208 - movups xmm0,XMMWORD[((-16))+rax*1+rcx] - jnz NEAR $L$ocb_dec_loop1 - -DB 102,15,56,222,209 - movups xmm1,XMMWORD[16+r11] - mov rax,r10 - -DB 102,15,56,223,215 - DB 0F3h,0C3h ;repret - -global aesni_cbc_encrypt - -ALIGN 16 -aesni_cbc_encrypt: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_aesni_cbc_encrypt: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - mov r9,QWORD[48+rsp] - test rdx,rdx jz NEAR $L$cbc_ret @@ -3655,12 +1559,12 @@ $L$cbc_enc_loop: xorps xmm3,xmm0 lea rcx,[32+rcx] xorps xmm2,xmm3 -$L$oop_enc1_15: +$L$oop_enc1_6: DB 102,15,56,220,209 dec eax movups xmm1,XMMWORD[rcx] lea rcx,[16+rcx] - jnz NEAR $L$oop_enc1_15 + jnz NEAR $L$oop_enc1_6 DB 102,15,56,221,209 mov eax,r10d mov rcx,r11 @@ -3706,12 +1610,12 @@ $L$cbc_decrypt: movups xmm1,XMMWORD[16+rcx] lea rcx,[32+rcx] xorps xmm2,xmm0 -$L$oop_dec1_16: +$L$oop_dec1_7: DB 102,15,56,222,209 dec r10d movups xmm1,XMMWORD[rcx] lea rcx,[16+rcx] - jnz NEAR $L$oop_dec1_16 + jnz NEAR $L$oop_dec1_7 DB 102,15,56,223,209 pxor xmm0,xmm0 pxor xmm1,xmm1 @@ -3724,7 +1628,9 @@ DB 102,15,56,223,209 ALIGN 16 $L$cbc_decrypt_bulk: lea r11,[rsp] + push rbp + sub rsp,176 and rsp,-16 movaps XMMWORD[16+rsp],xmm6 @@ -4133,12 +2039,12 @@ $L$cbc_dec_one: movups xmm1,XMMWORD[16+rcx] lea rcx,[32+rcx] xorps xmm2,xmm0 -$L$oop_dec1_17: +$L$oop_dec1_8: DB 102,15,56,222,209 dec eax movups xmm1,XMMWORD[rcx] lea rcx,[16+rcx] - jnz NEAR $L$oop_dec1_17 + jnz NEAR $L$oop_dec1_8 DB 102,15,56,223,209 xorps xmm2,xmm10 movaps xmm10,xmm11 @@ -4236,17 +2142,22 @@ $L$cbc_dec_ret: movaps xmm15,XMMWORD[160+rsp] movaps XMMWORD[160+rsp],xmm0 mov rbp,QWORD[((-8))+r11] + lea rsp,[r11] + $L$cbc_ret: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret -$L$SEH_end_aesni_cbc_encrypt: -global aesni_set_decrypt_key + +$L$SEH_end_aes_hw_cbc_encrypt: +global aes_hw_set_decrypt_key ALIGN 16 -aesni_set_decrypt_key: +aes_hw_set_decrypt_key: + DB 0x48,0x83,0xEC,0x08 + call __aesni_set_encrypt_key shl edx,4 test eax,eax @@ -4279,15 +2190,22 @@ DB 102,15,56,219,192 pxor xmm0,xmm0 $L$dec_key_ret: add rsp,8 + DB 0F3h,0C3h ;repret + $L$SEH_end_set_decrypt_key: -global aesni_set_encrypt_key +global aes_hw_set_encrypt_key ALIGN 16 
-aesni_set_encrypt_key: +aes_hw_set_encrypt_key: __aesni_set_encrypt_key: + +%ifdef BORINGSSL_DISPATCH_TEST + mov BYTE[((BORINGSSL_function_hit+3))],1 +%endif DB 0x48,0x83,0xEC,0x08 + mov rax,-1 test rcx,rcx jz NEAR $L$enc_key_ret @@ -4581,7 +2499,9 @@ $L$enc_key_ret: pxor xmm4,xmm4 pxor xmm5,xmm5 add rsp,8 + DB 0F3h,0C3h ;repret + $L$SEH_end_set_encrypt_key: ALIGN 16 @@ -4766,64 +2686,6 @@ ctr_xts_se_handler: -ALIGN 16 -ocb_se_handler: - push rsi - push rdi - push rbx - push rbp - push r12 - push r13 - push r14 - push r15 - pushfq - sub rsp,64 - - mov rax,QWORD[120+r8] - mov rbx,QWORD[248+r8] - - mov rsi,QWORD[8+r9] - mov r11,QWORD[56+r9] - - mov r10d,DWORD[r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - jb NEAR $L$common_seh_tail - - mov r10d,DWORD[4+r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - jae NEAR $L$common_seh_tail - - mov r10d,DWORD[8+r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - jae NEAR $L$ocb_no_xmm - - mov rax,QWORD[152+r8] - - lea rsi,[rax] - lea rdi,[512+r8] - mov ecx,20 - DD 0xa548f3fc - lea rax,[((160+40))+rax] - -$L$ocb_no_xmm: - mov rbx,QWORD[((-8))+rax] - mov rbp,QWORD[((-16))+rax] - mov r12,QWORD[((-24))+rax] - mov r13,QWORD[((-32))+rax] - mov r14,QWORD[((-40))+rax] - - mov QWORD[144+r8],rbx - mov QWORD[160+r8],rbp - mov QWORD[216+r8],r12 - mov QWORD[224+r8],r13 - mov QWORD[232+r8],r14 - - jmp NEAR $L$common_seh_tail - - ALIGN 16 cbc_se_handler: push rsi @@ -4908,46 +2770,22 @@ $L$common_seh_tail: section .pdata rdata align=4 ALIGN 4 - DD $L$SEH_begin_aesni_ecb_encrypt wrt ..imagebase - DD $L$SEH_end_aesni_ecb_encrypt wrt ..imagebase + DD $L$SEH_begin_aes_hw_ecb_encrypt wrt ..imagebase + DD $L$SEH_end_aes_hw_ecb_encrypt wrt ..imagebase DD $L$SEH_info_ecb wrt ..imagebase - DD $L$SEH_begin_aesni_ccm64_encrypt_blocks wrt ..imagebase - DD $L$SEH_end_aesni_ccm64_encrypt_blocks wrt ..imagebase - DD $L$SEH_info_ccm64_enc wrt ..imagebase - - DD $L$SEH_begin_aesni_ccm64_decrypt_blocks wrt ..imagebase - DD $L$SEH_end_aesni_ccm64_decrypt_blocks wrt ..imagebase - DD $L$SEH_info_ccm64_dec wrt ..imagebase - - DD $L$SEH_begin_aesni_ctr32_encrypt_blocks wrt ..imagebase - DD $L$SEH_end_aesni_ctr32_encrypt_blocks wrt ..imagebase + DD $L$SEH_begin_aes_hw_ctr32_encrypt_blocks wrt ..imagebase + DD $L$SEH_end_aes_hw_ctr32_encrypt_blocks wrt ..imagebase DD $L$SEH_info_ctr32 wrt ..imagebase - - DD $L$SEH_begin_aesni_xts_encrypt wrt ..imagebase - DD $L$SEH_end_aesni_xts_encrypt wrt ..imagebase - DD $L$SEH_info_xts_enc wrt ..imagebase - - DD $L$SEH_begin_aesni_xts_decrypt wrt ..imagebase - DD $L$SEH_end_aesni_xts_decrypt wrt ..imagebase - DD $L$SEH_info_xts_dec wrt ..imagebase - - DD $L$SEH_begin_aesni_ocb_encrypt wrt ..imagebase - DD $L$SEH_end_aesni_ocb_encrypt wrt ..imagebase - DD $L$SEH_info_ocb_enc wrt ..imagebase - - DD $L$SEH_begin_aesni_ocb_decrypt wrt ..imagebase - DD $L$SEH_end_aesni_ocb_decrypt wrt ..imagebase - DD $L$SEH_info_ocb_dec wrt ..imagebase - DD $L$SEH_begin_aesni_cbc_encrypt wrt ..imagebase - DD $L$SEH_end_aesni_cbc_encrypt wrt ..imagebase + DD $L$SEH_begin_aes_hw_cbc_encrypt wrt ..imagebase + DD $L$SEH_end_aes_hw_cbc_encrypt wrt ..imagebase DD $L$SEH_info_cbc wrt ..imagebase - DD aesni_set_decrypt_key wrt ..imagebase + DD aes_hw_set_decrypt_key wrt ..imagebase DD $L$SEH_end_set_decrypt_key wrt ..imagebase DD $L$SEH_info_key wrt ..imagebase - DD aesni_set_encrypt_key wrt ..imagebase + DD aes_hw_set_encrypt_key wrt ..imagebase DD $L$SEH_end_set_encrypt_key wrt ..imagebase DD $L$SEH_info_key wrt ..imagebase section .xdata rdata align=8 @@ -4956,38 +2794,10 @@ $L$SEH_info_ecb: DB 
9,0,0,0 DD ecb_ccm64_se_handler wrt ..imagebase DD $L$ecb_enc_body wrt ..imagebase,$L$ecb_enc_ret wrt ..imagebase -$L$SEH_info_ccm64_enc: -DB 9,0,0,0 - DD ecb_ccm64_se_handler wrt ..imagebase - DD $L$ccm64_enc_body wrt ..imagebase,$L$ccm64_enc_ret wrt ..imagebase -$L$SEH_info_ccm64_dec: -DB 9,0,0,0 - DD ecb_ccm64_se_handler wrt ..imagebase - DD $L$ccm64_dec_body wrt ..imagebase,$L$ccm64_dec_ret wrt ..imagebase $L$SEH_info_ctr32: DB 9,0,0,0 DD ctr_xts_se_handler wrt ..imagebase DD $L$ctr32_body wrt ..imagebase,$L$ctr32_epilogue wrt ..imagebase -$L$SEH_info_xts_enc: -DB 9,0,0,0 - DD ctr_xts_se_handler wrt ..imagebase - DD $L$xts_enc_body wrt ..imagebase,$L$xts_enc_epilogue wrt ..imagebase -$L$SEH_info_xts_dec: -DB 9,0,0,0 - DD ctr_xts_se_handler wrt ..imagebase - DD $L$xts_dec_body wrt ..imagebase,$L$xts_dec_epilogue wrt ..imagebase -$L$SEH_info_ocb_enc: -DB 9,0,0,0 - DD ocb_se_handler wrt ..imagebase - DD $L$ocb_enc_body wrt ..imagebase,$L$ocb_enc_epilogue wrt ..imagebase - DD $L$ocb_enc_pop wrt ..imagebase - DD 0 -$L$SEH_info_ocb_dec: -DB 9,0,0,0 - DD ocb_se_handler wrt ..imagebase - DD $L$ocb_dec_body wrt ..imagebase,$L$ocb_dec_epilogue wrt ..imagebase - DD $L$ocb_dec_pop wrt ..imagebase - DD 0 $L$SEH_info_cbc: DB 9,0,0,0 DD cbc_se_handler wrt ..imagebase diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm deleted file mode 100644 index 9c6d129369..0000000000 --- a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm +++ /dev/null @@ -1,2744 +0,0 @@ -default rel -%define XMMWORD -%define YMMWORD -%define ZMMWORD -section .text code align=64 - - -EXTERN asm_AES_encrypt -EXTERN asm_AES_decrypt - - -ALIGN 64 -_bsaes_encrypt8: - lea r11,[$L$BS0] - - movdqa xmm8,XMMWORD[rax] - lea rax,[16+rax] - movdqa xmm7,XMMWORD[80+r11] - pxor xmm15,xmm8 - pxor xmm0,xmm8 - pxor xmm1,xmm8 - pxor xmm2,xmm8 -DB 102,68,15,56,0,255 -DB 102,15,56,0,199 - pxor xmm3,xmm8 - pxor xmm4,xmm8 -DB 102,15,56,0,207 -DB 102,15,56,0,215 - pxor xmm5,xmm8 - pxor xmm6,xmm8 -DB 102,15,56,0,223 -DB 102,15,56,0,231 -DB 102,15,56,0,239 -DB 102,15,56,0,247 -_bsaes_encrypt8_bitslice: - movdqa xmm7,XMMWORD[r11] - movdqa xmm8,XMMWORD[16+r11] - movdqa xmm9,xmm5 - psrlq xmm5,1 - movdqa xmm10,xmm3 - psrlq xmm3,1 - pxor xmm5,xmm6 - pxor xmm3,xmm4 - pand xmm5,xmm7 - pand xmm3,xmm7 - pxor xmm6,xmm5 - psllq xmm5,1 - pxor xmm4,xmm3 - psllq xmm3,1 - pxor xmm5,xmm9 - pxor xmm3,xmm10 - movdqa xmm9,xmm1 - psrlq xmm1,1 - movdqa xmm10,xmm15 - psrlq xmm15,1 - pxor xmm1,xmm2 - pxor xmm15,xmm0 - pand xmm1,xmm7 - pand xmm15,xmm7 - pxor xmm2,xmm1 - psllq xmm1,1 - pxor xmm0,xmm15 - psllq xmm15,1 - pxor xmm1,xmm9 - pxor xmm15,xmm10 - movdqa xmm7,XMMWORD[32+r11] - movdqa xmm9,xmm4 - psrlq xmm4,2 - movdqa xmm10,xmm3 - psrlq xmm3,2 - pxor xmm4,xmm6 - pxor xmm3,xmm5 - pand xmm4,xmm8 - pand xmm3,xmm8 - pxor xmm6,xmm4 - psllq xmm4,2 - pxor xmm5,xmm3 - psllq xmm3,2 - pxor xmm4,xmm9 - pxor xmm3,xmm10 - movdqa xmm9,xmm0 - psrlq xmm0,2 - movdqa xmm10,xmm15 - psrlq xmm15,2 - pxor xmm0,xmm2 - pxor xmm15,xmm1 - pand xmm0,xmm8 - pand xmm15,xmm8 - pxor xmm2,xmm0 - psllq xmm0,2 - pxor xmm1,xmm15 - psllq xmm15,2 - pxor xmm0,xmm9 - pxor xmm15,xmm10 - movdqa xmm9,xmm2 - psrlq xmm2,4 - movdqa xmm10,xmm1 - psrlq xmm1,4 - pxor xmm2,xmm6 - pxor xmm1,xmm5 - pand xmm2,xmm7 - pand xmm1,xmm7 - pxor xmm6,xmm2 - psllq xmm2,4 - pxor xmm5,xmm1 - psllq xmm1,4 - pxor xmm2,xmm9 - pxor xmm1,xmm10 - movdqa xmm9,xmm0 - psrlq xmm0,4 - movdqa xmm10,xmm15 - 
psrlq xmm15,4 - pxor xmm0,xmm4 - pxor xmm15,xmm3 - pand xmm0,xmm7 - pand xmm15,xmm7 - pxor xmm4,xmm0 - psllq xmm0,4 - pxor xmm3,xmm15 - psllq xmm15,4 - pxor xmm0,xmm9 - pxor xmm15,xmm10 - dec r10d - jmp NEAR $L$enc_sbox -ALIGN 16 -$L$enc_loop: - pxor xmm15,XMMWORD[rax] - pxor xmm0,XMMWORD[16+rax] - pxor xmm1,XMMWORD[32+rax] - pxor xmm2,XMMWORD[48+rax] -DB 102,68,15,56,0,255 -DB 102,15,56,0,199 - pxor xmm3,XMMWORD[64+rax] - pxor xmm4,XMMWORD[80+rax] -DB 102,15,56,0,207 -DB 102,15,56,0,215 - pxor xmm5,XMMWORD[96+rax] - pxor xmm6,XMMWORD[112+rax] -DB 102,15,56,0,223 -DB 102,15,56,0,231 -DB 102,15,56,0,239 -DB 102,15,56,0,247 - lea rax,[128+rax] -$L$enc_sbox: - pxor xmm4,xmm5 - pxor xmm1,xmm0 - pxor xmm2,xmm15 - pxor xmm5,xmm1 - pxor xmm4,xmm15 - - pxor xmm5,xmm2 - pxor xmm2,xmm6 - pxor xmm6,xmm4 - pxor xmm2,xmm3 - pxor xmm3,xmm4 - pxor xmm2,xmm0 - - pxor xmm1,xmm6 - pxor xmm0,xmm4 - movdqa xmm10,xmm6 - movdqa xmm9,xmm0 - movdqa xmm8,xmm4 - movdqa xmm12,xmm1 - movdqa xmm11,xmm5 - - pxor xmm10,xmm3 - pxor xmm9,xmm1 - pxor xmm8,xmm2 - movdqa xmm13,xmm10 - pxor xmm12,xmm3 - movdqa xmm7,xmm9 - pxor xmm11,xmm15 - movdqa xmm14,xmm10 - - por xmm9,xmm8 - por xmm10,xmm11 - pxor xmm14,xmm7 - pand xmm13,xmm11 - pxor xmm11,xmm8 - pand xmm7,xmm8 - pand xmm14,xmm11 - movdqa xmm11,xmm2 - pxor xmm11,xmm15 - pand xmm12,xmm11 - pxor xmm10,xmm12 - pxor xmm9,xmm12 - movdqa xmm12,xmm6 - movdqa xmm11,xmm4 - pxor xmm12,xmm0 - pxor xmm11,xmm5 - movdqa xmm8,xmm12 - pand xmm12,xmm11 - por xmm8,xmm11 - pxor xmm7,xmm12 - pxor xmm10,xmm14 - pxor xmm9,xmm13 - pxor xmm8,xmm14 - movdqa xmm11,xmm1 - pxor xmm7,xmm13 - movdqa xmm12,xmm3 - pxor xmm8,xmm13 - movdqa xmm13,xmm0 - pand xmm11,xmm2 - movdqa xmm14,xmm6 - pand xmm12,xmm15 - pand xmm13,xmm4 - por xmm14,xmm5 - pxor xmm10,xmm11 - pxor xmm9,xmm12 - pxor xmm8,xmm13 - pxor xmm7,xmm14 - - - - - - movdqa xmm11,xmm10 - pand xmm10,xmm8 - pxor xmm11,xmm9 - - movdqa xmm13,xmm7 - movdqa xmm14,xmm11 - pxor xmm13,xmm10 - pand xmm14,xmm13 - - movdqa xmm12,xmm8 - pxor xmm14,xmm9 - pxor xmm12,xmm7 - - pxor xmm10,xmm9 - - pand xmm12,xmm10 - - movdqa xmm9,xmm13 - pxor xmm12,xmm7 - - pxor xmm9,xmm12 - pxor xmm8,xmm12 - - pand xmm9,xmm7 - - pxor xmm13,xmm9 - pxor xmm8,xmm9 - - pand xmm13,xmm14 - - pxor xmm13,xmm11 - movdqa xmm11,xmm5 - movdqa xmm7,xmm4 - movdqa xmm9,xmm14 - pxor xmm9,xmm13 - pand xmm9,xmm5 - pxor xmm5,xmm4 - pand xmm4,xmm14 - pand xmm5,xmm13 - pxor xmm5,xmm4 - pxor xmm4,xmm9 - pxor xmm11,xmm15 - pxor xmm7,xmm2 - pxor xmm14,xmm12 - pxor xmm13,xmm8 - movdqa xmm10,xmm14 - movdqa xmm9,xmm12 - pxor xmm10,xmm13 - pxor xmm9,xmm8 - pand xmm10,xmm11 - pand xmm9,xmm15 - pxor xmm11,xmm7 - pxor xmm15,xmm2 - pand xmm7,xmm14 - pand xmm2,xmm12 - pand xmm11,xmm13 - pand xmm15,xmm8 - pxor xmm7,xmm11 - pxor xmm15,xmm2 - pxor xmm11,xmm10 - pxor xmm2,xmm9 - pxor xmm5,xmm11 - pxor xmm15,xmm11 - pxor xmm4,xmm7 - pxor xmm2,xmm7 - - movdqa xmm11,xmm6 - movdqa xmm7,xmm0 - pxor xmm11,xmm3 - pxor xmm7,xmm1 - movdqa xmm10,xmm14 - movdqa xmm9,xmm12 - pxor xmm10,xmm13 - pxor xmm9,xmm8 - pand xmm10,xmm11 - pand xmm9,xmm3 - pxor xmm11,xmm7 - pxor xmm3,xmm1 - pand xmm7,xmm14 - pand xmm1,xmm12 - pand xmm11,xmm13 - pand xmm3,xmm8 - pxor xmm7,xmm11 - pxor xmm3,xmm1 - pxor xmm11,xmm10 - pxor xmm1,xmm9 - pxor xmm14,xmm12 - pxor xmm13,xmm8 - movdqa xmm10,xmm14 - pxor xmm10,xmm13 - pand xmm10,xmm6 - pxor xmm6,xmm0 - pand xmm0,xmm14 - pand xmm6,xmm13 - pxor xmm6,xmm0 - pxor xmm0,xmm10 - pxor xmm6,xmm11 - pxor xmm3,xmm11 - pxor xmm0,xmm7 - pxor xmm1,xmm7 - pxor xmm6,xmm15 - pxor xmm0,xmm5 - pxor xmm3,xmm6 - pxor 
xmm5,xmm15 - pxor xmm15,xmm0 - - pxor xmm0,xmm4 - pxor xmm4,xmm1 - pxor xmm1,xmm2 - pxor xmm2,xmm4 - pxor xmm3,xmm4 - - pxor xmm5,xmm2 - dec r10d - jl NEAR $L$enc_done - pshufd xmm7,xmm15,0x93 - pshufd xmm8,xmm0,0x93 - pxor xmm15,xmm7 - pshufd xmm9,xmm3,0x93 - pxor xmm0,xmm8 - pshufd xmm10,xmm5,0x93 - pxor xmm3,xmm9 - pshufd xmm11,xmm2,0x93 - pxor xmm5,xmm10 - pshufd xmm12,xmm6,0x93 - pxor xmm2,xmm11 - pshufd xmm13,xmm1,0x93 - pxor xmm6,xmm12 - pshufd xmm14,xmm4,0x93 - pxor xmm1,xmm13 - pxor xmm4,xmm14 - - pxor xmm8,xmm15 - pxor xmm7,xmm4 - pxor xmm8,xmm4 - pshufd xmm15,xmm15,0x4E - pxor xmm9,xmm0 - pshufd xmm0,xmm0,0x4E - pxor xmm12,xmm2 - pxor xmm15,xmm7 - pxor xmm13,xmm6 - pxor xmm0,xmm8 - pxor xmm11,xmm5 - pshufd xmm7,xmm2,0x4E - pxor xmm14,xmm1 - pshufd xmm8,xmm6,0x4E - pxor xmm10,xmm3 - pshufd xmm2,xmm5,0x4E - pxor xmm10,xmm4 - pshufd xmm6,xmm4,0x4E - pxor xmm11,xmm4 - pshufd xmm5,xmm1,0x4E - pxor xmm7,xmm11 - pshufd xmm1,xmm3,0x4E - pxor xmm8,xmm12 - pxor xmm2,xmm10 - pxor xmm6,xmm14 - pxor xmm5,xmm13 - movdqa xmm3,xmm7 - pxor xmm1,xmm9 - movdqa xmm4,xmm8 - movdqa xmm7,XMMWORD[48+r11] - jnz NEAR $L$enc_loop - movdqa xmm7,XMMWORD[64+r11] - jmp NEAR $L$enc_loop -ALIGN 16 -$L$enc_done: - movdqa xmm7,XMMWORD[r11] - movdqa xmm8,XMMWORD[16+r11] - movdqa xmm9,xmm1 - psrlq xmm1,1 - movdqa xmm10,xmm2 - psrlq xmm2,1 - pxor xmm1,xmm4 - pxor xmm2,xmm6 - pand xmm1,xmm7 - pand xmm2,xmm7 - pxor xmm4,xmm1 - psllq xmm1,1 - pxor xmm6,xmm2 - psllq xmm2,1 - pxor xmm1,xmm9 - pxor xmm2,xmm10 - movdqa xmm9,xmm3 - psrlq xmm3,1 - movdqa xmm10,xmm15 - psrlq xmm15,1 - pxor xmm3,xmm5 - pxor xmm15,xmm0 - pand xmm3,xmm7 - pand xmm15,xmm7 - pxor xmm5,xmm3 - psllq xmm3,1 - pxor xmm0,xmm15 - psllq xmm15,1 - pxor xmm3,xmm9 - pxor xmm15,xmm10 - movdqa xmm7,XMMWORD[32+r11] - movdqa xmm9,xmm6 - psrlq xmm6,2 - movdqa xmm10,xmm2 - psrlq xmm2,2 - pxor xmm6,xmm4 - pxor xmm2,xmm1 - pand xmm6,xmm8 - pand xmm2,xmm8 - pxor xmm4,xmm6 - psllq xmm6,2 - pxor xmm1,xmm2 - psllq xmm2,2 - pxor xmm6,xmm9 - pxor xmm2,xmm10 - movdqa xmm9,xmm0 - psrlq xmm0,2 - movdqa xmm10,xmm15 - psrlq xmm15,2 - pxor xmm0,xmm5 - pxor xmm15,xmm3 - pand xmm0,xmm8 - pand xmm15,xmm8 - pxor xmm5,xmm0 - psllq xmm0,2 - pxor xmm3,xmm15 - psllq xmm15,2 - pxor xmm0,xmm9 - pxor xmm15,xmm10 - movdqa xmm9,xmm5 - psrlq xmm5,4 - movdqa xmm10,xmm3 - psrlq xmm3,4 - pxor xmm5,xmm4 - pxor xmm3,xmm1 - pand xmm5,xmm7 - pand xmm3,xmm7 - pxor xmm4,xmm5 - psllq xmm5,4 - pxor xmm1,xmm3 - psllq xmm3,4 - pxor xmm5,xmm9 - pxor xmm3,xmm10 - movdqa xmm9,xmm0 - psrlq xmm0,4 - movdqa xmm10,xmm15 - psrlq xmm15,4 - pxor xmm0,xmm6 - pxor xmm15,xmm2 - pand xmm0,xmm7 - pand xmm15,xmm7 - pxor xmm6,xmm0 - psllq xmm0,4 - pxor xmm2,xmm15 - psllq xmm15,4 - pxor xmm0,xmm9 - pxor xmm15,xmm10 - movdqa xmm7,XMMWORD[rax] - pxor xmm3,xmm7 - pxor xmm5,xmm7 - pxor xmm2,xmm7 - pxor xmm6,xmm7 - pxor xmm1,xmm7 - pxor xmm4,xmm7 - pxor xmm15,xmm7 - pxor xmm0,xmm7 - DB 0F3h,0C3h ;repret - - - -ALIGN 64 -_bsaes_decrypt8: - lea r11,[$L$BS0] - - movdqa xmm8,XMMWORD[rax] - lea rax,[16+rax] - movdqa xmm7,XMMWORD[((-48))+r11] - pxor xmm15,xmm8 - pxor xmm0,xmm8 - pxor xmm1,xmm8 - pxor xmm2,xmm8 -DB 102,68,15,56,0,255 -DB 102,15,56,0,199 - pxor xmm3,xmm8 - pxor xmm4,xmm8 -DB 102,15,56,0,207 -DB 102,15,56,0,215 - pxor xmm5,xmm8 - pxor xmm6,xmm8 -DB 102,15,56,0,223 -DB 102,15,56,0,231 -DB 102,15,56,0,239 -DB 102,15,56,0,247 - movdqa xmm7,XMMWORD[r11] - movdqa xmm8,XMMWORD[16+r11] - movdqa xmm9,xmm5 - psrlq xmm5,1 - movdqa xmm10,xmm3 - psrlq xmm3,1 - pxor xmm5,xmm6 - pxor xmm3,xmm4 - pand xmm5,xmm7 - pand xmm3,xmm7 - 
pxor xmm6,xmm5 - psllq xmm5,1 - pxor xmm4,xmm3 - psllq xmm3,1 - pxor xmm5,xmm9 - pxor xmm3,xmm10 - movdqa xmm9,xmm1 - psrlq xmm1,1 - movdqa xmm10,xmm15 - psrlq xmm15,1 - pxor xmm1,xmm2 - pxor xmm15,xmm0 - pand xmm1,xmm7 - pand xmm15,xmm7 - pxor xmm2,xmm1 - psllq xmm1,1 - pxor xmm0,xmm15 - psllq xmm15,1 - pxor xmm1,xmm9 - pxor xmm15,xmm10 - movdqa xmm7,XMMWORD[32+r11] - movdqa xmm9,xmm4 - psrlq xmm4,2 - movdqa xmm10,xmm3 - psrlq xmm3,2 - pxor xmm4,xmm6 - pxor xmm3,xmm5 - pand xmm4,xmm8 - pand xmm3,xmm8 - pxor xmm6,xmm4 - psllq xmm4,2 - pxor xmm5,xmm3 - psllq xmm3,2 - pxor xmm4,xmm9 - pxor xmm3,xmm10 - movdqa xmm9,xmm0 - psrlq xmm0,2 - movdqa xmm10,xmm15 - psrlq xmm15,2 - pxor xmm0,xmm2 - pxor xmm15,xmm1 - pand xmm0,xmm8 - pand xmm15,xmm8 - pxor xmm2,xmm0 - psllq xmm0,2 - pxor xmm1,xmm15 - psllq xmm15,2 - pxor xmm0,xmm9 - pxor xmm15,xmm10 - movdqa xmm9,xmm2 - psrlq xmm2,4 - movdqa xmm10,xmm1 - psrlq xmm1,4 - pxor xmm2,xmm6 - pxor xmm1,xmm5 - pand xmm2,xmm7 - pand xmm1,xmm7 - pxor xmm6,xmm2 - psllq xmm2,4 - pxor xmm5,xmm1 - psllq xmm1,4 - pxor xmm2,xmm9 - pxor xmm1,xmm10 - movdqa xmm9,xmm0 - psrlq xmm0,4 - movdqa xmm10,xmm15 - psrlq xmm15,4 - pxor xmm0,xmm4 - pxor xmm15,xmm3 - pand xmm0,xmm7 - pand xmm15,xmm7 - pxor xmm4,xmm0 - psllq xmm0,4 - pxor xmm3,xmm15 - psllq xmm15,4 - pxor xmm0,xmm9 - pxor xmm15,xmm10 - dec r10d - jmp NEAR $L$dec_sbox -ALIGN 16 -$L$dec_loop: - pxor xmm15,XMMWORD[rax] - pxor xmm0,XMMWORD[16+rax] - pxor xmm1,XMMWORD[32+rax] - pxor xmm2,XMMWORD[48+rax] -DB 102,68,15,56,0,255 -DB 102,15,56,0,199 - pxor xmm3,XMMWORD[64+rax] - pxor xmm4,XMMWORD[80+rax] -DB 102,15,56,0,207 -DB 102,15,56,0,215 - pxor xmm5,XMMWORD[96+rax] - pxor xmm6,XMMWORD[112+rax] -DB 102,15,56,0,223 -DB 102,15,56,0,231 -DB 102,15,56,0,239 -DB 102,15,56,0,247 - lea rax,[128+rax] -$L$dec_sbox: - pxor xmm2,xmm3 - - pxor xmm3,xmm6 - pxor xmm1,xmm6 - pxor xmm5,xmm3 - pxor xmm6,xmm5 - pxor xmm0,xmm6 - - pxor xmm15,xmm0 - pxor xmm1,xmm4 - pxor xmm2,xmm15 - pxor xmm4,xmm15 - pxor xmm0,xmm2 - movdqa xmm10,xmm2 - movdqa xmm9,xmm6 - movdqa xmm8,xmm0 - movdqa xmm12,xmm3 - movdqa xmm11,xmm4 - - pxor xmm10,xmm15 - pxor xmm9,xmm3 - pxor xmm8,xmm5 - movdqa xmm13,xmm10 - pxor xmm12,xmm15 - movdqa xmm7,xmm9 - pxor xmm11,xmm1 - movdqa xmm14,xmm10 - - por xmm9,xmm8 - por xmm10,xmm11 - pxor xmm14,xmm7 - pand xmm13,xmm11 - pxor xmm11,xmm8 - pand xmm7,xmm8 - pand xmm14,xmm11 - movdqa xmm11,xmm5 - pxor xmm11,xmm1 - pand xmm12,xmm11 - pxor xmm10,xmm12 - pxor xmm9,xmm12 - movdqa xmm12,xmm2 - movdqa xmm11,xmm0 - pxor xmm12,xmm6 - pxor xmm11,xmm4 - movdqa xmm8,xmm12 - pand xmm12,xmm11 - por xmm8,xmm11 - pxor xmm7,xmm12 - pxor xmm10,xmm14 - pxor xmm9,xmm13 - pxor xmm8,xmm14 - movdqa xmm11,xmm3 - pxor xmm7,xmm13 - movdqa xmm12,xmm15 - pxor xmm8,xmm13 - movdqa xmm13,xmm6 - pand xmm11,xmm5 - movdqa xmm14,xmm2 - pand xmm12,xmm1 - pand xmm13,xmm0 - por xmm14,xmm4 - pxor xmm10,xmm11 - pxor xmm9,xmm12 - pxor xmm8,xmm13 - pxor xmm7,xmm14 - - - - - - movdqa xmm11,xmm10 - pand xmm10,xmm8 - pxor xmm11,xmm9 - - movdqa xmm13,xmm7 - movdqa xmm14,xmm11 - pxor xmm13,xmm10 - pand xmm14,xmm13 - - movdqa xmm12,xmm8 - pxor xmm14,xmm9 - pxor xmm12,xmm7 - - pxor xmm10,xmm9 - - pand xmm12,xmm10 - - movdqa xmm9,xmm13 - pxor xmm12,xmm7 - - pxor xmm9,xmm12 - pxor xmm8,xmm12 - - pand xmm9,xmm7 - - pxor xmm13,xmm9 - pxor xmm8,xmm9 - - pand xmm13,xmm14 - - pxor xmm13,xmm11 - movdqa xmm11,xmm4 - movdqa xmm7,xmm0 - movdqa xmm9,xmm14 - pxor xmm9,xmm13 - pand xmm9,xmm4 - pxor xmm4,xmm0 - pand xmm0,xmm14 - pand xmm4,xmm13 - pxor xmm4,xmm0 - pxor xmm0,xmm9 - pxor 
xmm11,xmm1 - pxor xmm7,xmm5 - pxor xmm14,xmm12 - pxor xmm13,xmm8 - movdqa xmm10,xmm14 - movdqa xmm9,xmm12 - pxor xmm10,xmm13 - pxor xmm9,xmm8 - pand xmm10,xmm11 - pand xmm9,xmm1 - pxor xmm11,xmm7 - pxor xmm1,xmm5 - pand xmm7,xmm14 - pand xmm5,xmm12 - pand xmm11,xmm13 - pand xmm1,xmm8 - pxor xmm7,xmm11 - pxor xmm1,xmm5 - pxor xmm11,xmm10 - pxor xmm5,xmm9 - pxor xmm4,xmm11 - pxor xmm1,xmm11 - pxor xmm0,xmm7 - pxor xmm5,xmm7 - - movdqa xmm11,xmm2 - movdqa xmm7,xmm6 - pxor xmm11,xmm15 - pxor xmm7,xmm3 - movdqa xmm10,xmm14 - movdqa xmm9,xmm12 - pxor xmm10,xmm13 - pxor xmm9,xmm8 - pand xmm10,xmm11 - pand xmm9,xmm15 - pxor xmm11,xmm7 - pxor xmm15,xmm3 - pand xmm7,xmm14 - pand xmm3,xmm12 - pand xmm11,xmm13 - pand xmm15,xmm8 - pxor xmm7,xmm11 - pxor xmm15,xmm3 - pxor xmm11,xmm10 - pxor xmm3,xmm9 - pxor xmm14,xmm12 - pxor xmm13,xmm8 - movdqa xmm10,xmm14 - pxor xmm10,xmm13 - pand xmm10,xmm2 - pxor xmm2,xmm6 - pand xmm6,xmm14 - pand xmm2,xmm13 - pxor xmm2,xmm6 - pxor xmm6,xmm10 - pxor xmm2,xmm11 - pxor xmm15,xmm11 - pxor xmm6,xmm7 - pxor xmm3,xmm7 - pxor xmm0,xmm6 - pxor xmm5,xmm4 - - pxor xmm3,xmm0 - pxor xmm1,xmm6 - pxor xmm4,xmm6 - pxor xmm3,xmm1 - pxor xmm6,xmm15 - pxor xmm3,xmm4 - pxor xmm2,xmm5 - pxor xmm5,xmm0 - pxor xmm2,xmm3 - - pxor xmm3,xmm15 - pxor xmm6,xmm2 - dec r10d - jl NEAR $L$dec_done - - pshufd xmm7,xmm15,0x4E - pshufd xmm13,xmm2,0x4E - pxor xmm7,xmm15 - pshufd xmm14,xmm4,0x4E - pxor xmm13,xmm2 - pshufd xmm8,xmm0,0x4E - pxor xmm14,xmm4 - pshufd xmm9,xmm5,0x4E - pxor xmm8,xmm0 - pshufd xmm10,xmm3,0x4E - pxor xmm9,xmm5 - pxor xmm15,xmm13 - pxor xmm0,xmm13 - pshufd xmm11,xmm1,0x4E - pxor xmm10,xmm3 - pxor xmm5,xmm7 - pxor xmm3,xmm8 - pshufd xmm12,xmm6,0x4E - pxor xmm11,xmm1 - pxor xmm0,xmm14 - pxor xmm1,xmm9 - pxor xmm12,xmm6 - - pxor xmm5,xmm14 - pxor xmm3,xmm13 - pxor xmm1,xmm13 - pxor xmm6,xmm10 - pxor xmm2,xmm11 - pxor xmm1,xmm14 - pxor xmm6,xmm14 - pxor xmm4,xmm12 - pshufd xmm7,xmm15,0x93 - pshufd xmm8,xmm0,0x93 - pxor xmm15,xmm7 - pshufd xmm9,xmm5,0x93 - pxor xmm0,xmm8 - pshufd xmm10,xmm3,0x93 - pxor xmm5,xmm9 - pshufd xmm11,xmm1,0x93 - pxor xmm3,xmm10 - pshufd xmm12,xmm6,0x93 - pxor xmm1,xmm11 - pshufd xmm13,xmm2,0x93 - pxor xmm6,xmm12 - pshufd xmm14,xmm4,0x93 - pxor xmm2,xmm13 - pxor xmm4,xmm14 - - pxor xmm8,xmm15 - pxor xmm7,xmm4 - pxor xmm8,xmm4 - pshufd xmm15,xmm15,0x4E - pxor xmm9,xmm0 - pshufd xmm0,xmm0,0x4E - pxor xmm12,xmm1 - pxor xmm15,xmm7 - pxor xmm13,xmm6 - pxor xmm0,xmm8 - pxor xmm11,xmm3 - pshufd xmm7,xmm1,0x4E - pxor xmm14,xmm2 - pshufd xmm8,xmm6,0x4E - pxor xmm10,xmm5 - pshufd xmm1,xmm3,0x4E - pxor xmm10,xmm4 - pshufd xmm6,xmm4,0x4E - pxor xmm11,xmm4 - pshufd xmm3,xmm2,0x4E - pxor xmm7,xmm11 - pshufd xmm2,xmm5,0x4E - pxor xmm8,xmm12 - pxor xmm10,xmm1 - pxor xmm6,xmm14 - pxor xmm13,xmm3 - movdqa xmm3,xmm7 - pxor xmm2,xmm9 - movdqa xmm5,xmm13 - movdqa xmm4,xmm8 - movdqa xmm1,xmm2 - movdqa xmm2,xmm10 - movdqa xmm7,XMMWORD[((-16))+r11] - jnz NEAR $L$dec_loop - movdqa xmm7,XMMWORD[((-32))+r11] - jmp NEAR $L$dec_loop -ALIGN 16 -$L$dec_done: - movdqa xmm7,XMMWORD[r11] - movdqa xmm8,XMMWORD[16+r11] - movdqa xmm9,xmm2 - psrlq xmm2,1 - movdqa xmm10,xmm1 - psrlq xmm1,1 - pxor xmm2,xmm4 - pxor xmm1,xmm6 - pand xmm2,xmm7 - pand xmm1,xmm7 - pxor xmm4,xmm2 - psllq xmm2,1 - pxor xmm6,xmm1 - psllq xmm1,1 - pxor xmm2,xmm9 - pxor xmm1,xmm10 - movdqa xmm9,xmm5 - psrlq xmm5,1 - movdqa xmm10,xmm15 - psrlq xmm15,1 - pxor xmm5,xmm3 - pxor xmm15,xmm0 - pand xmm5,xmm7 - pand xmm15,xmm7 - pxor xmm3,xmm5 - psllq xmm5,1 - pxor xmm0,xmm15 - psllq xmm15,1 - pxor xmm5,xmm9 - pxor xmm15,xmm10 - 
movdqa xmm7,XMMWORD[32+r11] - movdqa xmm9,xmm6 - psrlq xmm6,2 - movdqa xmm10,xmm1 - psrlq xmm1,2 - pxor xmm6,xmm4 - pxor xmm1,xmm2 - pand xmm6,xmm8 - pand xmm1,xmm8 - pxor xmm4,xmm6 - psllq xmm6,2 - pxor xmm2,xmm1 - psllq xmm1,2 - pxor xmm6,xmm9 - pxor xmm1,xmm10 - movdqa xmm9,xmm0 - psrlq xmm0,2 - movdqa xmm10,xmm15 - psrlq xmm15,2 - pxor xmm0,xmm3 - pxor xmm15,xmm5 - pand xmm0,xmm8 - pand xmm15,xmm8 - pxor xmm3,xmm0 - psllq xmm0,2 - pxor xmm5,xmm15 - psllq xmm15,2 - pxor xmm0,xmm9 - pxor xmm15,xmm10 - movdqa xmm9,xmm3 - psrlq xmm3,4 - movdqa xmm10,xmm5 - psrlq xmm5,4 - pxor xmm3,xmm4 - pxor xmm5,xmm2 - pand xmm3,xmm7 - pand xmm5,xmm7 - pxor xmm4,xmm3 - psllq xmm3,4 - pxor xmm2,xmm5 - psllq xmm5,4 - pxor xmm3,xmm9 - pxor xmm5,xmm10 - movdqa xmm9,xmm0 - psrlq xmm0,4 - movdqa xmm10,xmm15 - psrlq xmm15,4 - pxor xmm0,xmm6 - pxor xmm15,xmm1 - pand xmm0,xmm7 - pand xmm15,xmm7 - pxor xmm6,xmm0 - psllq xmm0,4 - pxor xmm1,xmm15 - psllq xmm15,4 - pxor xmm0,xmm9 - pxor xmm15,xmm10 - movdqa xmm7,XMMWORD[rax] - pxor xmm5,xmm7 - pxor xmm3,xmm7 - pxor xmm1,xmm7 - pxor xmm6,xmm7 - pxor xmm2,xmm7 - pxor xmm4,xmm7 - pxor xmm15,xmm7 - pxor xmm0,xmm7 - DB 0F3h,0C3h ;repret - - -ALIGN 16 -_bsaes_key_convert: - lea r11,[$L$masks] - movdqu xmm7,XMMWORD[rcx] - lea rcx,[16+rcx] - movdqa xmm0,XMMWORD[r11] - movdqa xmm1,XMMWORD[16+r11] - movdqa xmm2,XMMWORD[32+r11] - movdqa xmm3,XMMWORD[48+r11] - movdqa xmm4,XMMWORD[64+r11] - pcmpeqd xmm5,xmm5 - - movdqu xmm6,XMMWORD[rcx] - movdqa XMMWORD[rax],xmm7 - lea rax,[16+rax] - dec r10d - jmp NEAR $L$key_loop -ALIGN 16 -$L$key_loop: -DB 102,15,56,0,244 - - movdqa xmm8,xmm0 - movdqa xmm9,xmm1 - - pand xmm8,xmm6 - pand xmm9,xmm6 - movdqa xmm10,xmm2 - pcmpeqb xmm8,xmm0 - psllq xmm0,4 - movdqa xmm11,xmm3 - pcmpeqb xmm9,xmm1 - psllq xmm1,4 - - pand xmm10,xmm6 - pand xmm11,xmm6 - movdqa xmm12,xmm0 - pcmpeqb xmm10,xmm2 - psllq xmm2,4 - movdqa xmm13,xmm1 - pcmpeqb xmm11,xmm3 - psllq xmm3,4 - - movdqa xmm14,xmm2 - movdqa xmm15,xmm3 - pxor xmm8,xmm5 - pxor xmm9,xmm5 - - pand xmm12,xmm6 - pand xmm13,xmm6 - movdqa XMMWORD[rax],xmm8 - pcmpeqb xmm12,xmm0 - psrlq xmm0,4 - movdqa XMMWORD[16+rax],xmm9 - pcmpeqb xmm13,xmm1 - psrlq xmm1,4 - lea rcx,[16+rcx] - - pand xmm14,xmm6 - pand xmm15,xmm6 - movdqa XMMWORD[32+rax],xmm10 - pcmpeqb xmm14,xmm2 - psrlq xmm2,4 - movdqa XMMWORD[48+rax],xmm11 - pcmpeqb xmm15,xmm3 - psrlq xmm3,4 - movdqu xmm6,XMMWORD[rcx] - - pxor xmm13,xmm5 - pxor xmm14,xmm5 - movdqa XMMWORD[64+rax],xmm12 - movdqa XMMWORD[80+rax],xmm13 - movdqa XMMWORD[96+rax],xmm14 - movdqa XMMWORD[112+rax],xmm15 - lea rax,[128+rax] - dec r10d - jnz NEAR $L$key_loop - - movdqa xmm7,XMMWORD[80+r11] - - DB 0F3h,0C3h ;repret - -EXTERN asm_AES_cbc_encrypt -global bsaes_cbc_encrypt - -ALIGN 16 -bsaes_cbc_encrypt: - mov r11d,DWORD[48+rsp] - cmp r11d,0 - jne NEAR asm_AES_cbc_encrypt - cmp r8,128 - jb NEAR asm_AES_cbc_encrypt - - mov rax,rsp -$L$cbc_dec_prologue: - push rbp - push rbx - push r12 - push r13 - push r14 - push r15 - lea rsp,[((-72))+rsp] - mov r10,QWORD[160+rsp] - lea rsp,[((-160))+rsp] - movaps XMMWORD[64+rsp],xmm6 - movaps XMMWORD[80+rsp],xmm7 - movaps XMMWORD[96+rsp],xmm8 - movaps XMMWORD[112+rsp],xmm9 - movaps XMMWORD[128+rsp],xmm10 - movaps XMMWORD[144+rsp],xmm11 - movaps XMMWORD[160+rsp],xmm12 - movaps XMMWORD[176+rsp],xmm13 - movaps XMMWORD[192+rsp],xmm14 - movaps XMMWORD[208+rsp],xmm15 -$L$cbc_dec_body: - mov rbp,rsp - mov eax,DWORD[240+r9] - mov r12,rcx - mov r13,rdx - mov r14,r8 - mov r15,r9 - mov rbx,r10 - shr r14,4 - - mov edx,eax - shl rax,7 - sub rax,96 - sub rsp,rax - - 
mov rax,rsp - mov rcx,r15 - mov r10d,edx - call _bsaes_key_convert - pxor xmm7,XMMWORD[rsp] - movdqa XMMWORD[rax],xmm6 - movdqa XMMWORD[rsp],xmm7 - - movdqu xmm14,XMMWORD[rbx] - sub r14,8 -$L$cbc_dec_loop: - movdqu xmm15,XMMWORD[r12] - movdqu xmm0,XMMWORD[16+r12] - movdqu xmm1,XMMWORD[32+r12] - movdqu xmm2,XMMWORD[48+r12] - movdqu xmm3,XMMWORD[64+r12] - movdqu xmm4,XMMWORD[80+r12] - mov rax,rsp - movdqu xmm5,XMMWORD[96+r12] - mov r10d,edx - movdqu xmm6,XMMWORD[112+r12] - movdqa XMMWORD[32+rbp],xmm14 - - call _bsaes_decrypt8 - - pxor xmm15,XMMWORD[32+rbp] - movdqu xmm7,XMMWORD[r12] - movdqu xmm8,XMMWORD[16+r12] - pxor xmm0,xmm7 - movdqu xmm9,XMMWORD[32+r12] - pxor xmm5,xmm8 - movdqu xmm10,XMMWORD[48+r12] - pxor xmm3,xmm9 - movdqu xmm11,XMMWORD[64+r12] - pxor xmm1,xmm10 - movdqu xmm12,XMMWORD[80+r12] - pxor xmm6,xmm11 - movdqu xmm13,XMMWORD[96+r12] - pxor xmm2,xmm12 - movdqu xmm14,XMMWORD[112+r12] - pxor xmm4,xmm13 - movdqu XMMWORD[r13],xmm15 - lea r12,[128+r12] - movdqu XMMWORD[16+r13],xmm0 - movdqu XMMWORD[32+r13],xmm5 - movdqu XMMWORD[48+r13],xmm3 - movdqu XMMWORD[64+r13],xmm1 - movdqu XMMWORD[80+r13],xmm6 - movdqu XMMWORD[96+r13],xmm2 - movdqu XMMWORD[112+r13],xmm4 - lea r13,[128+r13] - sub r14,8 - jnc NEAR $L$cbc_dec_loop - - add r14,8 - jz NEAR $L$cbc_dec_done - - movdqu xmm15,XMMWORD[r12] - mov rax,rsp - mov r10d,edx - cmp r14,2 - jb NEAR $L$cbc_dec_one - movdqu xmm0,XMMWORD[16+r12] - je NEAR $L$cbc_dec_two - movdqu xmm1,XMMWORD[32+r12] - cmp r14,4 - jb NEAR $L$cbc_dec_three - movdqu xmm2,XMMWORD[48+r12] - je NEAR $L$cbc_dec_four - movdqu xmm3,XMMWORD[64+r12] - cmp r14,6 - jb NEAR $L$cbc_dec_five - movdqu xmm4,XMMWORD[80+r12] - je NEAR $L$cbc_dec_six - movdqu xmm5,XMMWORD[96+r12] - movdqa XMMWORD[32+rbp],xmm14 - call _bsaes_decrypt8 - pxor xmm15,XMMWORD[32+rbp] - movdqu xmm7,XMMWORD[r12] - movdqu xmm8,XMMWORD[16+r12] - pxor xmm0,xmm7 - movdqu xmm9,XMMWORD[32+r12] - pxor xmm5,xmm8 - movdqu xmm10,XMMWORD[48+r12] - pxor xmm3,xmm9 - movdqu xmm11,XMMWORD[64+r12] - pxor xmm1,xmm10 - movdqu xmm12,XMMWORD[80+r12] - pxor xmm6,xmm11 - movdqu xmm14,XMMWORD[96+r12] - pxor xmm2,xmm12 - movdqu XMMWORD[r13],xmm15 - movdqu XMMWORD[16+r13],xmm0 - movdqu XMMWORD[32+r13],xmm5 - movdqu XMMWORD[48+r13],xmm3 - movdqu XMMWORD[64+r13],xmm1 - movdqu XMMWORD[80+r13],xmm6 - movdqu XMMWORD[96+r13],xmm2 - jmp NEAR $L$cbc_dec_done -ALIGN 16 -$L$cbc_dec_six: - movdqa XMMWORD[32+rbp],xmm14 - call _bsaes_decrypt8 - pxor xmm15,XMMWORD[32+rbp] - movdqu xmm7,XMMWORD[r12] - movdqu xmm8,XMMWORD[16+r12] - pxor xmm0,xmm7 - movdqu xmm9,XMMWORD[32+r12] - pxor xmm5,xmm8 - movdqu xmm10,XMMWORD[48+r12] - pxor xmm3,xmm9 - movdqu xmm11,XMMWORD[64+r12] - pxor xmm1,xmm10 - movdqu xmm14,XMMWORD[80+r12] - pxor xmm6,xmm11 - movdqu XMMWORD[r13],xmm15 - movdqu XMMWORD[16+r13],xmm0 - movdqu XMMWORD[32+r13],xmm5 - movdqu XMMWORD[48+r13],xmm3 - movdqu XMMWORD[64+r13],xmm1 - movdqu XMMWORD[80+r13],xmm6 - jmp NEAR $L$cbc_dec_done -ALIGN 16 -$L$cbc_dec_five: - movdqa XMMWORD[32+rbp],xmm14 - call _bsaes_decrypt8 - pxor xmm15,XMMWORD[32+rbp] - movdqu xmm7,XMMWORD[r12] - movdqu xmm8,XMMWORD[16+r12] - pxor xmm0,xmm7 - movdqu xmm9,XMMWORD[32+r12] - pxor xmm5,xmm8 - movdqu xmm10,XMMWORD[48+r12] - pxor xmm3,xmm9 - movdqu xmm14,XMMWORD[64+r12] - pxor xmm1,xmm10 - movdqu XMMWORD[r13],xmm15 - movdqu XMMWORD[16+r13],xmm0 - movdqu XMMWORD[32+r13],xmm5 - movdqu XMMWORD[48+r13],xmm3 - movdqu XMMWORD[64+r13],xmm1 - jmp NEAR $L$cbc_dec_done -ALIGN 16 -$L$cbc_dec_four: - movdqa XMMWORD[32+rbp],xmm14 - call _bsaes_decrypt8 - pxor xmm15,XMMWORD[32+rbp] - movdqu 
xmm7,XMMWORD[r12] - movdqu xmm8,XMMWORD[16+r12] - pxor xmm0,xmm7 - movdqu xmm9,XMMWORD[32+r12] - pxor xmm5,xmm8 - movdqu xmm14,XMMWORD[48+r12] - pxor xmm3,xmm9 - movdqu XMMWORD[r13],xmm15 - movdqu XMMWORD[16+r13],xmm0 - movdqu XMMWORD[32+r13],xmm5 - movdqu XMMWORD[48+r13],xmm3 - jmp NEAR $L$cbc_dec_done -ALIGN 16 -$L$cbc_dec_three: - movdqa XMMWORD[32+rbp],xmm14 - call _bsaes_decrypt8 - pxor xmm15,XMMWORD[32+rbp] - movdqu xmm7,XMMWORD[r12] - movdqu xmm8,XMMWORD[16+r12] - pxor xmm0,xmm7 - movdqu xmm14,XMMWORD[32+r12] - pxor xmm5,xmm8 - movdqu XMMWORD[r13],xmm15 - movdqu XMMWORD[16+r13],xmm0 - movdqu XMMWORD[32+r13],xmm5 - jmp NEAR $L$cbc_dec_done -ALIGN 16 -$L$cbc_dec_two: - movdqa XMMWORD[32+rbp],xmm14 - call _bsaes_decrypt8 - pxor xmm15,XMMWORD[32+rbp] - movdqu xmm7,XMMWORD[r12] - movdqu xmm14,XMMWORD[16+r12] - pxor xmm0,xmm7 - movdqu XMMWORD[r13],xmm15 - movdqu XMMWORD[16+r13],xmm0 - jmp NEAR $L$cbc_dec_done -ALIGN 16 -$L$cbc_dec_one: - lea rcx,[r12] - lea rdx,[32+rbp] - lea r8,[r15] - call asm_AES_decrypt - pxor xmm14,XMMWORD[32+rbp] - movdqu XMMWORD[r13],xmm14 - movdqa xmm14,xmm15 - -$L$cbc_dec_done: - movdqu XMMWORD[rbx],xmm14 - lea rax,[rsp] - pxor xmm0,xmm0 -$L$cbc_dec_bzero: - movdqa XMMWORD[rax],xmm0 - movdqa XMMWORD[16+rax],xmm0 - lea rax,[32+rax] - cmp rbp,rax - ja NEAR $L$cbc_dec_bzero - - lea rax,[120+rbp] - movaps xmm6,XMMWORD[64+rbp] - movaps xmm7,XMMWORD[80+rbp] - movaps xmm8,XMMWORD[96+rbp] - movaps xmm9,XMMWORD[112+rbp] - movaps xmm10,XMMWORD[128+rbp] - movaps xmm11,XMMWORD[144+rbp] - movaps xmm12,XMMWORD[160+rbp] - movaps xmm13,XMMWORD[176+rbp] - movaps xmm14,XMMWORD[192+rbp] - movaps xmm15,XMMWORD[208+rbp] - lea rax,[160+rax] -$L$cbc_dec_tail: - mov r15,QWORD[((-48))+rax] - mov r14,QWORD[((-40))+rax] - mov r13,QWORD[((-32))+rax] - mov r12,QWORD[((-24))+rax] - mov rbx,QWORD[((-16))+rax] - mov rbp,QWORD[((-8))+rax] - lea rsp,[rax] -$L$cbc_dec_epilogue: - DB 0F3h,0C3h ;repret - - -global bsaes_ctr32_encrypt_blocks - -ALIGN 16 -bsaes_ctr32_encrypt_blocks: - mov rax,rsp -$L$ctr_enc_prologue: - push rbp - push rbx - push r12 - push r13 - push r14 - push r15 - lea rsp,[((-72))+rsp] - mov r10,QWORD[160+rsp] - lea rsp,[((-160))+rsp] - movaps XMMWORD[64+rsp],xmm6 - movaps XMMWORD[80+rsp],xmm7 - movaps XMMWORD[96+rsp],xmm8 - movaps XMMWORD[112+rsp],xmm9 - movaps XMMWORD[128+rsp],xmm10 - movaps XMMWORD[144+rsp],xmm11 - movaps XMMWORD[160+rsp],xmm12 - movaps XMMWORD[176+rsp],xmm13 - movaps XMMWORD[192+rsp],xmm14 - movaps XMMWORD[208+rsp],xmm15 -$L$ctr_enc_body: - mov rbp,rsp - movdqu xmm0,XMMWORD[r10] - mov eax,DWORD[240+r9] - mov r12,rcx - mov r13,rdx - mov r14,r8 - mov r15,r9 - movdqa XMMWORD[32+rbp],xmm0 - cmp r8,8 - jb NEAR $L$ctr_enc_short - - mov ebx,eax - shl rax,7 - sub rax,96 - sub rsp,rax - - mov rax,rsp - mov rcx,r15 - mov r10d,ebx - call _bsaes_key_convert - pxor xmm7,xmm6 - movdqa XMMWORD[rax],xmm7 - - movdqa xmm8,XMMWORD[rsp] - lea r11,[$L$ADD1] - movdqa xmm15,XMMWORD[32+rbp] - movdqa xmm7,XMMWORD[((-32))+r11] -DB 102,68,15,56,0,199 -DB 102,68,15,56,0,255 - movdqa XMMWORD[rsp],xmm8 - jmp NEAR $L$ctr_enc_loop -ALIGN 16 -$L$ctr_enc_loop: - movdqa XMMWORD[32+rbp],xmm15 - movdqa xmm0,xmm15 - movdqa xmm1,xmm15 - paddd xmm0,XMMWORD[r11] - movdqa xmm2,xmm15 - paddd xmm1,XMMWORD[16+r11] - movdqa xmm3,xmm15 - paddd xmm2,XMMWORD[32+r11] - movdqa xmm4,xmm15 - paddd xmm3,XMMWORD[48+r11] - movdqa xmm5,xmm15 - paddd xmm4,XMMWORD[64+r11] - movdqa xmm6,xmm15 - paddd xmm5,XMMWORD[80+r11] - paddd xmm6,XMMWORD[96+r11] - - - - movdqa xmm8,XMMWORD[rsp] - lea rax,[16+rsp] - movdqa 
xmm7,XMMWORD[((-16))+r11] - pxor xmm15,xmm8 - pxor xmm0,xmm8 - pxor xmm1,xmm8 - pxor xmm2,xmm8 -DB 102,68,15,56,0,255 -DB 102,15,56,0,199 - pxor xmm3,xmm8 - pxor xmm4,xmm8 -DB 102,15,56,0,207 -DB 102,15,56,0,215 - pxor xmm5,xmm8 - pxor xmm6,xmm8 -DB 102,15,56,0,223 -DB 102,15,56,0,231 -DB 102,15,56,0,239 -DB 102,15,56,0,247 - lea r11,[$L$BS0] - mov r10d,ebx - - call _bsaes_encrypt8_bitslice - - sub r14,8 - jc NEAR $L$ctr_enc_loop_done - - movdqu xmm7,XMMWORD[r12] - movdqu xmm8,XMMWORD[16+r12] - movdqu xmm9,XMMWORD[32+r12] - movdqu xmm10,XMMWORD[48+r12] - movdqu xmm11,XMMWORD[64+r12] - movdqu xmm12,XMMWORD[80+r12] - movdqu xmm13,XMMWORD[96+r12] - movdqu xmm14,XMMWORD[112+r12] - lea r12,[128+r12] - pxor xmm7,xmm15 - movdqa xmm15,XMMWORD[32+rbp] - pxor xmm0,xmm8 - movdqu XMMWORD[r13],xmm7 - pxor xmm3,xmm9 - movdqu XMMWORD[16+r13],xmm0 - pxor xmm5,xmm10 - movdqu XMMWORD[32+r13],xmm3 - pxor xmm2,xmm11 - movdqu XMMWORD[48+r13],xmm5 - pxor xmm6,xmm12 - movdqu XMMWORD[64+r13],xmm2 - pxor xmm1,xmm13 - movdqu XMMWORD[80+r13],xmm6 - pxor xmm4,xmm14 - movdqu XMMWORD[96+r13],xmm1 - lea r11,[$L$ADD1] - movdqu XMMWORD[112+r13],xmm4 - lea r13,[128+r13] - paddd xmm15,XMMWORD[112+r11] - jnz NEAR $L$ctr_enc_loop - - jmp NEAR $L$ctr_enc_done -ALIGN 16 -$L$ctr_enc_loop_done: - add r14,8 - movdqu xmm7,XMMWORD[r12] - pxor xmm15,xmm7 - movdqu XMMWORD[r13],xmm15 - cmp r14,2 - jb NEAR $L$ctr_enc_done - movdqu xmm8,XMMWORD[16+r12] - pxor xmm0,xmm8 - movdqu XMMWORD[16+r13],xmm0 - je NEAR $L$ctr_enc_done - movdqu xmm9,XMMWORD[32+r12] - pxor xmm3,xmm9 - movdqu XMMWORD[32+r13],xmm3 - cmp r14,4 - jb NEAR $L$ctr_enc_done - movdqu xmm10,XMMWORD[48+r12] - pxor xmm5,xmm10 - movdqu XMMWORD[48+r13],xmm5 - je NEAR $L$ctr_enc_done - movdqu xmm11,XMMWORD[64+r12] - pxor xmm2,xmm11 - movdqu XMMWORD[64+r13],xmm2 - cmp r14,6 - jb NEAR $L$ctr_enc_done - movdqu xmm12,XMMWORD[80+r12] - pxor xmm6,xmm12 - movdqu XMMWORD[80+r13],xmm6 - je NEAR $L$ctr_enc_done - movdqu xmm13,XMMWORD[96+r12] - pxor xmm1,xmm13 - movdqu XMMWORD[96+r13],xmm1 - jmp NEAR $L$ctr_enc_done - -ALIGN 16 -$L$ctr_enc_short: - lea rcx,[32+rbp] - lea rdx,[48+rbp] - lea r8,[r15] - call asm_AES_encrypt - movdqu xmm0,XMMWORD[r12] - lea r12,[16+r12] - mov eax,DWORD[44+rbp] - bswap eax - pxor xmm0,XMMWORD[48+rbp] - inc eax - movdqu XMMWORD[r13],xmm0 - bswap eax - lea r13,[16+r13] - mov DWORD[44+rsp],eax - dec r14 - jnz NEAR $L$ctr_enc_short - -$L$ctr_enc_done: - lea rax,[rsp] - pxor xmm0,xmm0 -$L$ctr_enc_bzero: - movdqa XMMWORD[rax],xmm0 - movdqa XMMWORD[16+rax],xmm0 - lea rax,[32+rax] - cmp rbp,rax - ja NEAR $L$ctr_enc_bzero - - lea rax,[120+rbp] - movaps xmm6,XMMWORD[64+rbp] - movaps xmm7,XMMWORD[80+rbp] - movaps xmm8,XMMWORD[96+rbp] - movaps xmm9,XMMWORD[112+rbp] - movaps xmm10,XMMWORD[128+rbp] - movaps xmm11,XMMWORD[144+rbp] - movaps xmm12,XMMWORD[160+rbp] - movaps xmm13,XMMWORD[176+rbp] - movaps xmm14,XMMWORD[192+rbp] - movaps xmm15,XMMWORD[208+rbp] - lea rax,[160+rax] -$L$ctr_enc_tail: - mov r15,QWORD[((-48))+rax] - mov r14,QWORD[((-40))+rax] - mov r13,QWORD[((-32))+rax] - mov r12,QWORD[((-24))+rax] - mov rbx,QWORD[((-16))+rax] - mov rbp,QWORD[((-8))+rax] - lea rsp,[rax] -$L$ctr_enc_epilogue: - DB 0F3h,0C3h ;repret - -global bsaes_xts_encrypt - -ALIGN 16 -bsaes_xts_encrypt: - mov rax,rsp -$L$xts_enc_prologue: - push rbp - push rbx - push r12 - push r13 - push r14 - push r15 - lea rsp,[((-72))+rsp] - mov r10,QWORD[160+rsp] - mov r11,QWORD[168+rsp] - lea rsp,[((-160))+rsp] - movaps XMMWORD[64+rsp],xmm6 - movaps XMMWORD[80+rsp],xmm7 - movaps XMMWORD[96+rsp],xmm8 - movaps 
XMMWORD[112+rsp],xmm9 - movaps XMMWORD[128+rsp],xmm10 - movaps XMMWORD[144+rsp],xmm11 - movaps XMMWORD[160+rsp],xmm12 - movaps XMMWORD[176+rsp],xmm13 - movaps XMMWORD[192+rsp],xmm14 - movaps XMMWORD[208+rsp],xmm15 -$L$xts_enc_body: - mov rbp,rsp - mov r12,rcx - mov r13,rdx - mov r14,r8 - mov r15,r9 - - lea rcx,[r11] - lea rdx,[32+rbp] - lea r8,[r10] - call asm_AES_encrypt - - mov eax,DWORD[240+r15] - mov rbx,r14 - - mov edx,eax - shl rax,7 - sub rax,96 - sub rsp,rax - - mov rax,rsp - mov rcx,r15 - mov r10d,edx - call _bsaes_key_convert - pxor xmm7,xmm6 - movdqa XMMWORD[rax],xmm7 - - and r14,-16 - sub rsp,0x80 - movdqa xmm6,XMMWORD[32+rbp] - - pxor xmm14,xmm14 - movdqa xmm12,XMMWORD[$L$xts_magic] - pcmpgtd xmm14,xmm6 - - sub r14,0x80 - jc NEAR $L$xts_enc_short - jmp NEAR $L$xts_enc_loop - -ALIGN 16 -$L$xts_enc_loop: - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm15,xmm6 - movdqa XMMWORD[rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm0,xmm6 - movdqa XMMWORD[16+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm7,XMMWORD[r12] - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm1,xmm6 - movdqa XMMWORD[32+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm8,XMMWORD[16+r12] - pxor xmm15,xmm7 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm2,xmm6 - movdqa XMMWORD[48+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm9,XMMWORD[32+r12] - pxor xmm0,xmm8 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm3,xmm6 - movdqa XMMWORD[64+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm10,XMMWORD[48+r12] - pxor xmm1,xmm9 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm4,xmm6 - movdqa XMMWORD[80+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm11,XMMWORD[64+r12] - pxor xmm2,xmm10 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm5,xmm6 - movdqa XMMWORD[96+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm12,XMMWORD[80+r12] - pxor xmm3,xmm11 - movdqu xmm13,XMMWORD[96+r12] - pxor xmm4,xmm12 - movdqu xmm14,XMMWORD[112+r12] - lea r12,[128+r12] - movdqa XMMWORD[112+rsp],xmm6 - pxor xmm5,xmm13 - lea rax,[128+rsp] - pxor xmm6,xmm14 - mov r10d,edx - - call _bsaes_encrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm3,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - pxor xmm5,XMMWORD[48+rsp] - movdqu XMMWORD[32+r13],xmm3 - pxor xmm2,XMMWORD[64+rsp] - movdqu XMMWORD[48+r13],xmm5 - pxor xmm6,XMMWORD[80+rsp] - movdqu XMMWORD[64+r13],xmm2 - pxor xmm1,XMMWORD[96+rsp] - movdqu XMMWORD[80+r13],xmm6 - pxor xmm4,XMMWORD[112+rsp] - movdqu XMMWORD[96+r13],xmm1 - movdqu XMMWORD[112+r13],xmm4 - lea r13,[128+r13] - - movdqa xmm6,XMMWORD[112+rsp] - pxor xmm14,xmm14 - movdqa xmm12,XMMWORD[$L$xts_magic] - pcmpgtd xmm14,xmm6 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - - sub r14,0x80 - jnc NEAR $L$xts_enc_loop - -$L$xts_enc_short: - add r14,0x80 - jz NEAR $L$xts_enc_done - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm15,xmm6 - movdqa XMMWORD[rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa 
xmm0,xmm6 - movdqa XMMWORD[16+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm7,XMMWORD[r12] - cmp r14,16 - je NEAR $L$xts_enc_1 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm1,xmm6 - movdqa XMMWORD[32+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm8,XMMWORD[16+r12] - cmp r14,32 - je NEAR $L$xts_enc_2 - pxor xmm15,xmm7 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm2,xmm6 - movdqa XMMWORD[48+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm9,XMMWORD[32+r12] - cmp r14,48 - je NEAR $L$xts_enc_3 - pxor xmm0,xmm8 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm3,xmm6 - movdqa XMMWORD[64+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm10,XMMWORD[48+r12] - cmp r14,64 - je NEAR $L$xts_enc_4 - pxor xmm1,xmm9 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm4,xmm6 - movdqa XMMWORD[80+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm11,XMMWORD[64+r12] - cmp r14,80 - je NEAR $L$xts_enc_5 - pxor xmm2,xmm10 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm5,xmm6 - movdqa XMMWORD[96+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm12,XMMWORD[80+r12] - cmp r14,96 - je NEAR $L$xts_enc_6 - pxor xmm3,xmm11 - movdqu xmm13,XMMWORD[96+r12] - pxor xmm4,xmm12 - movdqa XMMWORD[112+rsp],xmm6 - lea r12,[112+r12] - pxor xmm5,xmm13 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_encrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm3,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - pxor xmm5,XMMWORD[48+rsp] - movdqu XMMWORD[32+r13],xmm3 - pxor xmm2,XMMWORD[64+rsp] - movdqu XMMWORD[48+r13],xmm5 - pxor xmm6,XMMWORD[80+rsp] - movdqu XMMWORD[64+r13],xmm2 - pxor xmm1,XMMWORD[96+rsp] - movdqu XMMWORD[80+r13],xmm6 - movdqu XMMWORD[96+r13],xmm1 - lea r13,[112+r13] - - movdqa xmm6,XMMWORD[112+rsp] - jmp NEAR $L$xts_enc_done -ALIGN 16 -$L$xts_enc_6: - pxor xmm3,xmm11 - lea r12,[96+r12] - pxor xmm4,xmm12 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_encrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm3,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - pxor xmm5,XMMWORD[48+rsp] - movdqu XMMWORD[32+r13],xmm3 - pxor xmm2,XMMWORD[64+rsp] - movdqu XMMWORD[48+r13],xmm5 - pxor xmm6,XMMWORD[80+rsp] - movdqu XMMWORD[64+r13],xmm2 - movdqu XMMWORD[80+r13],xmm6 - lea r13,[96+r13] - - movdqa xmm6,XMMWORD[96+rsp] - jmp NEAR $L$xts_enc_done -ALIGN 16 -$L$xts_enc_5: - pxor xmm2,xmm10 - lea r12,[80+r12] - pxor xmm3,xmm11 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_encrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm3,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - pxor xmm5,XMMWORD[48+rsp] - movdqu XMMWORD[32+r13],xmm3 - pxor xmm2,XMMWORD[64+rsp] - movdqu XMMWORD[48+r13],xmm5 - movdqu XMMWORD[64+r13],xmm2 - lea r13,[80+r13] - - movdqa xmm6,XMMWORD[80+rsp] - jmp NEAR $L$xts_enc_done -ALIGN 16 -$L$xts_enc_4: - pxor xmm1,xmm9 - lea r12,[64+r12] - pxor xmm2,xmm10 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_encrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm3,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - pxor xmm5,XMMWORD[48+rsp] - movdqu XMMWORD[32+r13],xmm3 - movdqu XMMWORD[48+r13],xmm5 - 
lea r13,[64+r13] - - movdqa xmm6,XMMWORD[64+rsp] - jmp NEAR $L$xts_enc_done -ALIGN 16 -$L$xts_enc_3: - pxor xmm0,xmm8 - lea r12,[48+r12] - pxor xmm1,xmm9 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_encrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm3,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - movdqu XMMWORD[32+r13],xmm3 - lea r13,[48+r13] - - movdqa xmm6,XMMWORD[48+rsp] - jmp NEAR $L$xts_enc_done -ALIGN 16 -$L$xts_enc_2: - pxor xmm15,xmm7 - lea r12,[32+r12] - pxor xmm0,xmm8 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_encrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - movdqu XMMWORD[16+r13],xmm0 - lea r13,[32+r13] - - movdqa xmm6,XMMWORD[32+rsp] - jmp NEAR $L$xts_enc_done -ALIGN 16 -$L$xts_enc_1: - pxor xmm7,xmm15 - lea r12,[16+r12] - movdqa XMMWORD[32+rbp],xmm7 - lea rcx,[32+rbp] - lea rdx,[32+rbp] - lea r8,[r15] - call asm_AES_encrypt - pxor xmm15,XMMWORD[32+rbp] - - - - - - movdqu XMMWORD[r13],xmm15 - lea r13,[16+r13] - - movdqa xmm6,XMMWORD[16+rsp] - -$L$xts_enc_done: - and ebx,15 - jz NEAR $L$xts_enc_ret - mov rdx,r13 - -$L$xts_enc_steal: - movzx eax,BYTE[r12] - movzx ecx,BYTE[((-16))+rdx] - lea r12,[1+r12] - mov BYTE[((-16))+rdx],al - mov BYTE[rdx],cl - lea rdx,[1+rdx] - sub ebx,1 - jnz NEAR $L$xts_enc_steal - - movdqu xmm15,XMMWORD[((-16))+r13] - lea rcx,[32+rbp] - pxor xmm15,xmm6 - lea rdx,[32+rbp] - movdqa XMMWORD[32+rbp],xmm15 - lea r8,[r15] - call asm_AES_encrypt - pxor xmm6,XMMWORD[32+rbp] - movdqu XMMWORD[(-16)+r13],xmm6 - -$L$xts_enc_ret: - lea rax,[rsp] - pxor xmm0,xmm0 -$L$xts_enc_bzero: - movdqa XMMWORD[rax],xmm0 - movdqa XMMWORD[16+rax],xmm0 - lea rax,[32+rax] - cmp rbp,rax - ja NEAR $L$xts_enc_bzero - - lea rax,[120+rbp] - movaps xmm6,XMMWORD[64+rbp] - movaps xmm7,XMMWORD[80+rbp] - movaps xmm8,XMMWORD[96+rbp] - movaps xmm9,XMMWORD[112+rbp] - movaps xmm10,XMMWORD[128+rbp] - movaps xmm11,XMMWORD[144+rbp] - movaps xmm12,XMMWORD[160+rbp] - movaps xmm13,XMMWORD[176+rbp] - movaps xmm14,XMMWORD[192+rbp] - movaps xmm15,XMMWORD[208+rbp] - lea rax,[160+rax] -$L$xts_enc_tail: - mov r15,QWORD[((-48))+rax] - mov r14,QWORD[((-40))+rax] - mov r13,QWORD[((-32))+rax] - mov r12,QWORD[((-24))+rax] - mov rbx,QWORD[((-16))+rax] - mov rbp,QWORD[((-8))+rax] - lea rsp,[rax] -$L$xts_enc_epilogue: - DB 0F3h,0C3h ;repret - - -global bsaes_xts_decrypt - -ALIGN 16 -bsaes_xts_decrypt: - mov rax,rsp -$L$xts_dec_prologue: - push rbp - push rbx - push r12 - push r13 - push r14 - push r15 - lea rsp,[((-72))+rsp] - mov r10,QWORD[160+rsp] - mov r11,QWORD[168+rsp] - lea rsp,[((-160))+rsp] - movaps XMMWORD[64+rsp],xmm6 - movaps XMMWORD[80+rsp],xmm7 - movaps XMMWORD[96+rsp],xmm8 - movaps XMMWORD[112+rsp],xmm9 - movaps XMMWORD[128+rsp],xmm10 - movaps XMMWORD[144+rsp],xmm11 - movaps XMMWORD[160+rsp],xmm12 - movaps XMMWORD[176+rsp],xmm13 - movaps XMMWORD[192+rsp],xmm14 - movaps XMMWORD[208+rsp],xmm15 -$L$xts_dec_body: - mov rbp,rsp - mov r12,rcx - mov r13,rdx - mov r14,r8 - mov r15,r9 - - lea rcx,[r11] - lea rdx,[32+rbp] - lea r8,[r10] - call asm_AES_encrypt - - mov eax,DWORD[240+r15] - mov rbx,r14 - - mov edx,eax - shl rax,7 - sub rax,96 - sub rsp,rax - - mov rax,rsp - mov rcx,r15 - mov r10d,edx - call _bsaes_key_convert - pxor xmm7,XMMWORD[rsp] - movdqa XMMWORD[rax],xmm6 - movdqa XMMWORD[rsp],xmm7 - - xor eax,eax - and r14,-16 - test ebx,15 - setnz al - shl rax,4 - sub r14,rax - - sub rsp,0x80 - movdqa xmm6,XMMWORD[32+rbp] - - pxor xmm14,xmm14 - movdqa xmm12,XMMWORD[$L$xts_magic] - pcmpgtd 
xmm14,xmm6 - - sub r14,0x80 - jc NEAR $L$xts_dec_short - jmp NEAR $L$xts_dec_loop - -ALIGN 16 -$L$xts_dec_loop: - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm15,xmm6 - movdqa XMMWORD[rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm0,xmm6 - movdqa XMMWORD[16+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm7,XMMWORD[r12] - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm1,xmm6 - movdqa XMMWORD[32+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm8,XMMWORD[16+r12] - pxor xmm15,xmm7 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm2,xmm6 - movdqa XMMWORD[48+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm9,XMMWORD[32+r12] - pxor xmm0,xmm8 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm3,xmm6 - movdqa XMMWORD[64+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm10,XMMWORD[48+r12] - pxor xmm1,xmm9 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm4,xmm6 - movdqa XMMWORD[80+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm11,XMMWORD[64+r12] - pxor xmm2,xmm10 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm5,xmm6 - movdqa XMMWORD[96+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm12,XMMWORD[80+r12] - pxor xmm3,xmm11 - movdqu xmm13,XMMWORD[96+r12] - pxor xmm4,xmm12 - movdqu xmm14,XMMWORD[112+r12] - lea r12,[128+r12] - movdqa XMMWORD[112+rsp],xmm6 - pxor xmm5,xmm13 - lea rax,[128+rsp] - pxor xmm6,xmm14 - mov r10d,edx - - call _bsaes_decrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm5,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - pxor xmm3,XMMWORD[48+rsp] - movdqu XMMWORD[32+r13],xmm5 - pxor xmm1,XMMWORD[64+rsp] - movdqu XMMWORD[48+r13],xmm3 - pxor xmm6,XMMWORD[80+rsp] - movdqu XMMWORD[64+r13],xmm1 - pxor xmm2,XMMWORD[96+rsp] - movdqu XMMWORD[80+r13],xmm6 - pxor xmm4,XMMWORD[112+rsp] - movdqu XMMWORD[96+r13],xmm2 - movdqu XMMWORD[112+r13],xmm4 - lea r13,[128+r13] - - movdqa xmm6,XMMWORD[112+rsp] - pxor xmm14,xmm14 - movdqa xmm12,XMMWORD[$L$xts_magic] - pcmpgtd xmm14,xmm6 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - - sub r14,0x80 - jnc NEAR $L$xts_dec_loop - -$L$xts_dec_short: - add r14,0x80 - jz NEAR $L$xts_dec_done - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm15,xmm6 - movdqa XMMWORD[rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm0,xmm6 - movdqa XMMWORD[16+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm7,XMMWORD[r12] - cmp r14,16 - je NEAR $L$xts_dec_1 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm1,xmm6 - movdqa XMMWORD[32+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm8,XMMWORD[16+r12] - cmp r14,32 - je NEAR $L$xts_dec_2 - pxor xmm15,xmm7 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm2,xmm6 - movdqa XMMWORD[48+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm9,XMMWORD[32+r12] - cmp r14,48 - je NEAR $L$xts_dec_3 - pxor xmm0,xmm8 - pshufd xmm13,xmm14,0x13 - 
pxor xmm14,xmm14 - movdqa xmm3,xmm6 - movdqa XMMWORD[64+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm10,XMMWORD[48+r12] - cmp r14,64 - je NEAR $L$xts_dec_4 - pxor xmm1,xmm9 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm4,xmm6 - movdqa XMMWORD[80+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm11,XMMWORD[64+r12] - cmp r14,80 - je NEAR $L$xts_dec_5 - pxor xmm2,xmm10 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm5,xmm6 - movdqa XMMWORD[96+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm12,XMMWORD[80+r12] - cmp r14,96 - je NEAR $L$xts_dec_6 - pxor xmm3,xmm11 - movdqu xmm13,XMMWORD[96+r12] - pxor xmm4,xmm12 - movdqa XMMWORD[112+rsp],xmm6 - lea r12,[112+r12] - pxor xmm5,xmm13 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_decrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm5,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - pxor xmm3,XMMWORD[48+rsp] - movdqu XMMWORD[32+r13],xmm5 - pxor xmm1,XMMWORD[64+rsp] - movdqu XMMWORD[48+r13],xmm3 - pxor xmm6,XMMWORD[80+rsp] - movdqu XMMWORD[64+r13],xmm1 - pxor xmm2,XMMWORD[96+rsp] - movdqu XMMWORD[80+r13],xmm6 - movdqu XMMWORD[96+r13],xmm2 - lea r13,[112+r13] - - movdqa xmm6,XMMWORD[112+rsp] - jmp NEAR $L$xts_dec_done -ALIGN 16 -$L$xts_dec_6: - pxor xmm3,xmm11 - lea r12,[96+r12] - pxor xmm4,xmm12 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_decrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm5,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - pxor xmm3,XMMWORD[48+rsp] - movdqu XMMWORD[32+r13],xmm5 - pxor xmm1,XMMWORD[64+rsp] - movdqu XMMWORD[48+r13],xmm3 - pxor xmm6,XMMWORD[80+rsp] - movdqu XMMWORD[64+r13],xmm1 - movdqu XMMWORD[80+r13],xmm6 - lea r13,[96+r13] - - movdqa xmm6,XMMWORD[96+rsp] - jmp NEAR $L$xts_dec_done -ALIGN 16 -$L$xts_dec_5: - pxor xmm2,xmm10 - lea r12,[80+r12] - pxor xmm3,xmm11 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_decrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm5,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - pxor xmm3,XMMWORD[48+rsp] - movdqu XMMWORD[32+r13],xmm5 - pxor xmm1,XMMWORD[64+rsp] - movdqu XMMWORD[48+r13],xmm3 - movdqu XMMWORD[64+r13],xmm1 - lea r13,[80+r13] - - movdqa xmm6,XMMWORD[80+rsp] - jmp NEAR $L$xts_dec_done -ALIGN 16 -$L$xts_dec_4: - pxor xmm1,xmm9 - lea r12,[64+r12] - pxor xmm2,xmm10 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_decrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm5,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - pxor xmm3,XMMWORD[48+rsp] - movdqu XMMWORD[32+r13],xmm5 - movdqu XMMWORD[48+r13],xmm3 - lea r13,[64+r13] - - movdqa xmm6,XMMWORD[64+rsp] - jmp NEAR $L$xts_dec_done -ALIGN 16 -$L$xts_dec_3: - pxor xmm0,xmm8 - lea r12,[48+r12] - pxor xmm1,xmm9 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_decrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm5,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - movdqu XMMWORD[32+r13],xmm5 - lea r13,[48+r13] - - movdqa xmm6,XMMWORD[48+rsp] - jmp NEAR $L$xts_dec_done -ALIGN 16 -$L$xts_dec_2: - pxor xmm15,xmm7 - lea r12,[32+r12] - pxor xmm0,xmm8 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_decrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - movdqu 
XMMWORD[16+r13],xmm0 - lea r13,[32+r13] - - movdqa xmm6,XMMWORD[32+rsp] - jmp NEAR $L$xts_dec_done -ALIGN 16 -$L$xts_dec_1: - pxor xmm7,xmm15 - lea r12,[16+r12] - movdqa XMMWORD[32+rbp],xmm7 - lea rcx,[32+rbp] - lea rdx,[32+rbp] - lea r8,[r15] - call asm_AES_decrypt - pxor xmm15,XMMWORD[32+rbp] - - - - - - movdqu XMMWORD[r13],xmm15 - lea r13,[16+r13] - - movdqa xmm6,XMMWORD[16+rsp] - -$L$xts_dec_done: - and ebx,15 - jz NEAR $L$xts_dec_ret - - pxor xmm14,xmm14 - movdqa xmm12,XMMWORD[$L$xts_magic] - pcmpgtd xmm14,xmm6 - pshufd xmm13,xmm14,0x13 - movdqa xmm5,xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - movdqu xmm15,XMMWORD[r12] - pxor xmm6,xmm13 - - lea rcx,[32+rbp] - pxor xmm15,xmm6 - lea rdx,[32+rbp] - movdqa XMMWORD[32+rbp],xmm15 - lea r8,[r15] - call asm_AES_decrypt - pxor xmm6,XMMWORD[32+rbp] - mov rdx,r13 - movdqu XMMWORD[r13],xmm6 - -$L$xts_dec_steal: - movzx eax,BYTE[16+r12] - movzx ecx,BYTE[rdx] - lea r12,[1+r12] - mov BYTE[rdx],al - mov BYTE[16+rdx],cl - lea rdx,[1+rdx] - sub ebx,1 - jnz NEAR $L$xts_dec_steal - - movdqu xmm15,XMMWORD[r13] - lea rcx,[32+rbp] - pxor xmm15,xmm5 - lea rdx,[32+rbp] - movdqa XMMWORD[32+rbp],xmm15 - lea r8,[r15] - call asm_AES_decrypt - pxor xmm5,XMMWORD[32+rbp] - movdqu XMMWORD[r13],xmm5 - -$L$xts_dec_ret: - lea rax,[rsp] - pxor xmm0,xmm0 -$L$xts_dec_bzero: - movdqa XMMWORD[rax],xmm0 - movdqa XMMWORD[16+rax],xmm0 - lea rax,[32+rax] - cmp rbp,rax - ja NEAR $L$xts_dec_bzero - - lea rax,[120+rbp] - movaps xmm6,XMMWORD[64+rbp] - movaps xmm7,XMMWORD[80+rbp] - movaps xmm8,XMMWORD[96+rbp] - movaps xmm9,XMMWORD[112+rbp] - movaps xmm10,XMMWORD[128+rbp] - movaps xmm11,XMMWORD[144+rbp] - movaps xmm12,XMMWORD[160+rbp] - movaps xmm13,XMMWORD[176+rbp] - movaps xmm14,XMMWORD[192+rbp] - movaps xmm15,XMMWORD[208+rbp] - lea rax,[160+rax] -$L$xts_dec_tail: - mov r15,QWORD[((-48))+rax] - mov r14,QWORD[((-40))+rax] - mov r13,QWORD[((-32))+rax] - mov r12,QWORD[((-24))+rax] - mov rbx,QWORD[((-16))+rax] - mov rbp,QWORD[((-8))+rax] - lea rsp,[rax] -$L$xts_dec_epilogue: - DB 0F3h,0C3h ;repret - - -ALIGN 64 -_bsaes_const: -$L$M0ISR: - DQ 0x0a0e0206070b0f03,0x0004080c0d010509 -$L$ISRM0: - DQ 0x01040b0e0205080f,0x0306090c00070a0d -$L$ISR: - DQ 0x0504070602010003,0x0f0e0d0c080b0a09 -$L$BS0: - DQ 0x5555555555555555,0x5555555555555555 -$L$BS1: - DQ 0x3333333333333333,0x3333333333333333 -$L$BS2: - DQ 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f -$L$SR: - DQ 0x0504070600030201,0x0f0e0d0c0a09080b -$L$SRM0: - DQ 0x0304090e00050a0f,0x01060b0c0207080d -$L$M0SR: - DQ 0x0a0e02060f03070b,0x0004080c05090d01 -$L$SWPUP: - DQ 0x0706050403020100,0x0c0d0e0f0b0a0908 -$L$SWPUPM0SR: - DQ 0x0a0d02060c03070b,0x0004080f05090e01 -$L$ADD1: - DQ 0x0000000000000000,0x0000000100000000 -$L$ADD2: - DQ 0x0000000000000000,0x0000000200000000 -$L$ADD3: - DQ 0x0000000000000000,0x0000000300000000 -$L$ADD4: - DQ 0x0000000000000000,0x0000000400000000 -$L$ADD5: - DQ 0x0000000000000000,0x0000000500000000 -$L$ADD6: - DQ 0x0000000000000000,0x0000000600000000 -$L$ADD7: - DQ 0x0000000000000000,0x0000000700000000 -$L$ADD8: - DQ 0x0000000000000000,0x0000000800000000 -$L$xts_magic: - DD 0x87,0,1,0 -$L$masks: - DQ 0x0101010101010101,0x0101010101010101 - DQ 0x0202020202020202,0x0202020202020202 - DQ 0x0404040404040404,0x0404040404040404 - DQ 0x0808080808080808,0x0808080808080808 -$L$M0: - DQ 0x02060a0e03070b0f,0x0004080c0105090d -$L$63: - DQ 0x6363636363636363,0x6363636363636363 -DB 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102 -DB 111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44 -DB 
32,69,109,105,108,105,97,32,75,195,164,115,112,101,114,44 -DB 32,80,101,116,101,114,32,83,99,104,119,97,98,101,44,32 -DB 65,110,100,121,32,80,111,108,121,97,107,111,118,0 -ALIGN 64 - -EXTERN __imp_RtlVirtualUnwind - -ALIGN 16 -se_handler: - push rsi - push rdi - push rbx - push rbp - push r12 - push r13 - push r14 - push r15 - pushfq - sub rsp,64 - - mov rax,QWORD[120+r8] - mov rbx,QWORD[248+r8] - - mov rsi,QWORD[8+r9] - mov r11,QWORD[56+r9] - - mov r10d,DWORD[r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - jbe NEAR $L$in_prologue - - mov r10d,DWORD[4+r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - jae NEAR $L$in_prologue - - mov r10d,DWORD[8+r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - jae NEAR $L$in_tail - - mov rax,QWORD[160+r8] - - lea rsi,[64+rax] - lea rdi,[512+r8] - mov ecx,20 - DD 0xa548f3fc - lea rax,[((160+120))+rax] - -$L$in_tail: - mov rbp,QWORD[((-48))+rax] - mov rbx,QWORD[((-40))+rax] - mov r12,QWORD[((-32))+rax] - mov r13,QWORD[((-24))+rax] - mov r14,QWORD[((-16))+rax] - mov r15,QWORD[((-8))+rax] - mov QWORD[144+r8],rbx - mov QWORD[160+r8],rbp - mov QWORD[216+r8],r12 - mov QWORD[224+r8],r13 - mov QWORD[232+r8],r14 - mov QWORD[240+r8],r15 - -$L$in_prologue: - mov QWORD[152+r8],rax - - mov rdi,QWORD[40+r9] - mov rsi,r8 - mov ecx,154 - DD 0xa548f3fc - - mov rsi,r9 - xor rcx,rcx - mov rdx,QWORD[8+rsi] - mov r8,QWORD[rsi] - mov r9,QWORD[16+rsi] - mov r10,QWORD[40+rsi] - lea r11,[56+rsi] - lea r12,[24+rsi] - mov QWORD[32+rsp],r10 - mov QWORD[40+rsp],r11 - mov QWORD[48+rsp],r12 - mov QWORD[56+rsp],rcx - call QWORD[__imp_RtlVirtualUnwind] - - mov eax,1 - add rsp,64 - popfq - pop r15 - pop r14 - pop r13 - pop r12 - pop rbp - pop rbx - pop rdi - pop rsi - DB 0F3h,0C3h ;repret - - -section .pdata rdata align=4 -ALIGN 4 - DD $L$cbc_dec_prologue wrt ..imagebase - DD $L$cbc_dec_epilogue wrt ..imagebase - DD $L$cbc_dec_info wrt ..imagebase - - DD $L$ctr_enc_prologue wrt ..imagebase - DD $L$ctr_enc_epilogue wrt ..imagebase - DD $L$ctr_enc_info wrt ..imagebase - - DD $L$xts_enc_prologue wrt ..imagebase - DD $L$xts_enc_epilogue wrt ..imagebase - DD $L$xts_enc_info wrt ..imagebase - - DD $L$xts_dec_prologue wrt ..imagebase - DD $L$xts_dec_epilogue wrt ..imagebase - DD $L$xts_dec_info wrt ..imagebase - -section .xdata rdata align=8 -ALIGN 8 -$L$cbc_dec_info: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$cbc_dec_body wrt ..imagebase,$L$cbc_dec_epilogue wrt ..imagebase - DD $L$cbc_dec_tail wrt ..imagebase - DD 0 -$L$ctr_enc_info: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$ctr_enc_body wrt ..imagebase,$L$ctr_enc_epilogue wrt ..imagebase - DD $L$ctr_enc_tail wrt ..imagebase - DD 0 -$L$xts_enc_info: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$xts_enc_body wrt ..imagebase,$L$xts_enc_epilogue wrt ..imagebase - DD $L$xts_enc_tail wrt ..imagebase - DD 0 -$L$xts_dec_info: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$xts_dec_body wrt ..imagebase,$L$xts_dec_epilogue wrt ..imagebase - DD $L$xts_dec_tail wrt ..imagebase - DD 0 diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.asm new file mode 100644 index 0000000000..434ba10ed6 --- /dev/null +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.asm @@ -0,0 +1,495 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif +section .text code align=64 + + + + + + + +global gcm_gmult_ssse3 +ALIGN 16 +gcm_gmult_ssse3: + +$L$gmult_seh_begin: + sub rsp,40 +$L$gmult_seh_allocstack: + movdqa XMMWORD[rsp],xmm6 +$L$gmult_seh_save_xmm6: + movdqa XMMWORD[16+rsp],xmm10 +$L$gmult_seh_save_xmm10: +$L$gmult_seh_prolog_end: + movdqu xmm0,XMMWORD[rcx] + movdqa xmm10,XMMWORD[$L$reverse_bytes] + movdqa xmm2,XMMWORD[$L$low4_mask] + + +DB 102,65,15,56,0,194 + + + movdqa xmm1,xmm2 + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm2 + + + + + pxor xmm2,xmm2 + pxor xmm3,xmm3 + mov rax,5 +$L$oop_row_1: + movdqa xmm4,XMMWORD[rdx] + lea rdx,[16+rdx] + + + movdqa xmm6,xmm2 +DB 102,15,58,15,243,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + + + + + movdqa xmm5,xmm4 +DB 102,15,56,0,224 +DB 102,15,56,0,233 + + + pxor xmm2,xmm5 + + + + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + + + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + + sub rax,1 + jnz NEAR $L$oop_row_1 + + + + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + mov rax,5 +$L$oop_row_2: + movdqa xmm4,XMMWORD[rdx] + lea rdx,[16+rdx] + + + movdqa xmm6,xmm2 +DB 102,15,58,15,243,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + + + + + movdqa xmm5,xmm4 +DB 102,15,56,0,224 +DB 102,15,56,0,233 + + + pxor xmm2,xmm5 + + + + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + + + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + + sub rax,1 + jnz NEAR $L$oop_row_2 + + + + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + mov rax,6 +$L$oop_row_3: + movdqa xmm4,XMMWORD[rdx] + lea rdx,[16+rdx] + + + movdqa xmm6,xmm2 +DB 102,15,58,15,243,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + + + + + movdqa xmm5,xmm4 +DB 102,15,56,0,224 +DB 102,15,56,0,233 + + + pxor xmm2,xmm5 + + + + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + + + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + + sub rax,1 + jnz NEAR $L$oop_row_3 + + + + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + +DB 102,65,15,56,0,210 + movdqu XMMWORD[rcx],xmm2 + + + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + movdqa xmm6,XMMWORD[rsp] + movdqa xmm10,XMMWORD[16+rsp] + add rsp,40 + DB 0F3h,0C3h ;repret +$L$gmult_seh_end: + + + + + + + + +global gcm_ghash_ssse3 +ALIGN 16 +gcm_ghash_ssse3: +$L$ghash_seh_begin: + + sub rsp,56 +$L$ghash_seh_allocstack: + movdqa XMMWORD[rsp],xmm6 +$L$ghash_seh_save_xmm6: + movdqa XMMWORD[16+rsp],xmm10 +$L$ghash_seh_save_xmm10: + movdqa XMMWORD[32+rsp],xmm11 +$L$ghash_seh_save_xmm11: +$L$ghash_seh_prolog_end: + movdqu xmm0,XMMWORD[rcx] + movdqa xmm10,XMMWORD[$L$reverse_bytes] + movdqa xmm11,XMMWORD[$L$low4_mask] + + + and r9,-16 + + + +DB 102,65,15,56,0,194 + + + pxor xmm3,xmm3 +$L$oop_ghash: + + movdqu xmm1,XMMWORD[r8] +DB 102,65,15,56,0,202 + pxor xmm0,xmm1 + + + movdqa xmm1,xmm11 + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm11 + + + + + pxor xmm2,xmm2 + + mov rax,5 +$L$oop_row_4: + movdqa xmm4,XMMWORD[rdx] + lea rdx,[16+rdx] + + + movdqa xmm6,xmm2 +DB 102,15,58,15,243,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + + + + + 
movdqa xmm5,xmm4 +DB 102,15,56,0,224 +DB 102,15,56,0,233 + + + pxor xmm2,xmm5 + + + + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + + + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + + sub rax,1 + jnz NEAR $L$oop_row_4 + + + + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + mov rax,5 +$L$oop_row_5: + movdqa xmm4,XMMWORD[rdx] + lea rdx,[16+rdx] + + + movdqa xmm6,xmm2 +DB 102,15,58,15,243,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + + + + + movdqa xmm5,xmm4 +DB 102,15,56,0,224 +DB 102,15,56,0,233 + + + pxor xmm2,xmm5 + + + + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + + + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + + sub rax,1 + jnz NEAR $L$oop_row_5 + + + + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + mov rax,6 +$L$oop_row_6: + movdqa xmm4,XMMWORD[rdx] + lea rdx,[16+rdx] + + + movdqa xmm6,xmm2 +DB 102,15,58,15,243,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + + + + + movdqa xmm5,xmm4 +DB 102,15,56,0,224 +DB 102,15,56,0,233 + + + pxor xmm2,xmm5 + + + + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + + + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + + sub rax,1 + jnz NEAR $L$oop_row_6 + + + + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + movdqa xmm0,xmm2 + + + lea rdx,[((-256))+rdx] + + + lea r8,[16+r8] + sub r9,16 + jnz NEAR $L$oop_ghash + + +DB 102,65,15,56,0,194 + movdqu XMMWORD[rcx],xmm0 + + + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + movdqa xmm6,XMMWORD[rsp] + movdqa xmm10,XMMWORD[16+rsp] + movdqa xmm11,XMMWORD[32+rsp] + add rsp,56 + DB 0F3h,0C3h ;repret +$L$ghash_seh_end: + + + +ALIGN 16 + + +$L$reverse_bytes: +DB 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 + +$L$low4_mask: + DQ 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f +section .pdata rdata align=4 +ALIGN 4 + DD $L$gmult_seh_begin wrt ..imagebase + DD $L$gmult_seh_end wrt ..imagebase + DD $L$gmult_seh_info wrt ..imagebase + + DD $L$ghash_seh_begin wrt ..imagebase + DD $L$ghash_seh_end wrt ..imagebase + DD $L$ghash_seh_info wrt ..imagebase + +section .xdata rdata align=8 +ALIGN 8 +$L$gmult_seh_info: +DB 1 +DB $L$gmult_seh_prolog_end-$L$gmult_seh_begin +DB 5 +DB 0 + +DB $L$gmult_seh_save_xmm10-$L$gmult_seh_begin +DB 168 + DW 1 + +DB $L$gmult_seh_save_xmm6-$L$gmult_seh_begin +DB 104 + DW 0 + +DB $L$gmult_seh_allocstack-$L$gmult_seh_begin +DB 66 + +ALIGN 8 +$L$ghash_seh_info: +DB 1 +DB $L$ghash_seh_prolog_end-$L$ghash_seh_begin +DB 7 +DB 0 + +DB $L$ghash_seh_save_xmm11-$L$ghash_seh_begin +DB 184 + DW 2 + +DB $L$ghash_seh_save_xmm10-$L$ghash_seh_begin +DB 168 + DW 1 + +DB $L$ghash_seh_save_xmm6-$L$ghash_seh_begin +DB 104 + DW 0 + +DB $L$ghash_seh_allocstack-$L$ghash_seh_begin +DB 98 diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/ghash-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/ghash-x86_64.asm index 8ef16f513d..fdf914f284 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/ghash-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/ghash-x86_64.asm @@ -1,7 +1,14 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. 
Do not edit by hand. + default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif section .text code align=64 EXTERN OPENSSL_ia32cap_P @@ -18,13 +25,21 @@ $L$SEH_begin_gcm_gmult_4bit: mov rsi,rdx + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + sub rsp,280 + $L$gmult_prologue: movzx r8,BYTE[15+rdi] @@ -102,12 +117,16 @@ $L$break1: mov QWORD[rdi],r9 lea rsi,[((280+48))+rsp] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$gmult_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_gcm_gmult_4bit: global gcm_ghash_4bit @@ -123,13 +142,21 @@ $L$SEH_begin_gcm_ghash_4bit: mov rcx,r9 + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + sub rsp,280 + $L$ghash_prologue: mov r14,rdx mov r15,rcx @@ -675,22 +702,32 @@ $L$outer_loop: mov QWORD[rdi],r9 lea rsi,[((280+48))+rsp] + mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$ghash_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_gcm_ghash_4bit: global gcm_init_clmul ALIGN 16 gcm_init_clmul: + $L$_init_clmul: $L$SEH_begin_gcm_init_clmul: @@ -850,10 +887,12 @@ DB 102,15,58,15,227,8 $L$SEH_end_gcm_init_clmul: DB 0F3h,0C3h ;repret + global gcm_gmult_clmul ALIGN 16 gcm_gmult_clmul: + $L$_gmult_clmul: movdqu xmm0,XMMWORD[rcx] movdqa xmm5,XMMWORD[$L$bswap_mask] @@ -901,10 +940,12 @@ DB 102,15,56,0,197 movdqu XMMWORD[rcx],xmm0 DB 0F3h,0C3h ;repret + global gcm_ghash_clmul ALIGN 32 gcm_ghash_clmul: + $L$_ghash_clmul: lea rax,[((-136))+rsp] $L$SEH_begin_gcm_ghash_clmul: @@ -1311,10 +1352,12 @@ DB 102,65,15,56,0,194 $L$SEH_end_gcm_ghash_clmul: DB 0F3h,0C3h ;repret + global gcm_init_avx ALIGN 32 gcm_init_avx: + $L$SEH_begin_gcm_init_avx: DB 0x48,0x83,0xec,0x18 @@ -1425,16 +1468,20 @@ $L$init_start_avx: $L$SEH_end_gcm_init_avx: DB 0F3h,0C3h ;repret + global gcm_gmult_avx ALIGN 32 gcm_gmult_avx: + jmp NEAR $L$_gmult_clmul + global gcm_ghash_avx ALIGN 32 gcm_ghash_avx: + lea rax,[((-136))+rsp] $L$SEH_begin_gcm_ghash_avx: @@ -1833,6 +1880,7 @@ $L$tail_no_xor_avx: $L$SEH_end_gcm_ghash_avx: DB 0F3h,0C3h ;repret + ALIGN 64 $L$bswap_mask: DB 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/md5-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/md5-x86_64.asm index 0e9d2c604e..646201bb58 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/md5-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/md5-x86_64.asm @@ -1,7 +1,14 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif section .text code align=64 ALIGN 16 @@ -18,11 +25,17 @@ $L$SEH_begin_md5_block_asm_data_order: mov rdx,r8 + push rbp + push rbx + push r12 + push r14 + push r15 + $L$prologue: @@ -672,15 +685,22 @@ $L$end: mov DWORD[12+rbp],edx mov r15,QWORD[rsp] + mov r14,QWORD[8+rsp] + mov r12,QWORD[16+rsp] + mov rbx,QWORD[24+rsp] + mov rbp,QWORD[32+rsp] + add rsp,40 + $L$epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_md5_block_asm_data_order: EXTERN __imp_RtlVirtualUnwind diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/p256-x86_64-asm.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/p256-x86_64-asm.asm index 64db9d9518..215f5d2a49 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/p256-x86_64-asm.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/p256-x86_64-asm.asm @@ -1,7 +1,14 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif section .text code align=64 EXTERN OPENSSL_ia32cap_P @@ -21,6 +28,12 @@ $L$ONE_mont: DQ 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe +$L$ord: + DQ 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 +$L$ordK: + DQ 0xccd1c8aaee00bc4f + + global ecp_nistz256_neg @@ -34,9 +47,13 @@ $L$SEH_begin_ecp_nistz256_neg: mov rsi,rdx + push r12 + push r13 +$L$neg_body: + xor r8,r8 xor r9,r9 xor r10,r10 @@ -69,11 +86,17 @@ $L$SEH_begin_ecp_nistz256_neg: mov QWORD[16+rdi],r10 mov QWORD[24+rdi],r11 - pop r13 - pop r12 + mov r13,QWORD[rsp] + + mov r12,QWORD[8+rsp] + + lea rsp,[16+rsp] + +$L$neg_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ecp_nistz256_neg: @@ -81,6 +104,1117 @@ $L$SEH_end_ecp_nistz256_neg: +global ecp_nistz256_ord_mul_mont + +ALIGN 32 +ecp_nistz256_ord_mul_mont: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_ord_mul_mont: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + lea rcx,[OPENSSL_ia32cap_P] + mov rcx,QWORD[8+rcx] + and ecx,0x80100 + cmp ecx,0x80100 + je NEAR $L$ecp_nistz256_ord_mul_montx + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$ord_mul_body: + + mov rax,QWORD[rdx] + mov rbx,rdx + lea r14,[$L$ord] + mov r15,QWORD[$L$ordK] + + + mov rcx,rax + mul QWORD[rsi] + mov r8,rax + mov rax,rcx + mov r9,rdx + + mul QWORD[8+rsi] + add r9,rax + mov rax,rcx + adc rdx,0 + mov r10,rdx + + mul QWORD[16+rsi] + add r10,rax + mov rax,rcx + adc rdx,0 + + mov r13,r8 + imul r8,r15 + + mov r11,rdx + mul QWORD[24+rsi] + add r11,rax + mov rax,r8 + adc rdx,0 + mov r12,rdx + + + mul QWORD[r14] + mov rbp,r8 + add r13,rax + mov rax,r8 + adc rdx,0 + mov rcx,rdx + + sub r10,r8 + sbb r8,0 + + mul QWORD[8+r14] + add r9,rcx + adc rdx,0 + add r9,rax + mov rax,rbp + adc r10,rdx + mov rdx,rbp + adc r8,0 + + shl rax,32 + shr rdx,32 + sub r11,rax + mov rax,QWORD[8+rbx] + sbb rbp,rdx + + add r11,r8 + adc r12,rbp + adc r13,0 + + + mov rcx,rax + mul QWORD[rsi] + add r9,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[8+rsi] + add r10,rbp + adc rdx,0 + add r10,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul 
QWORD[16+rsi] + add r11,rbp + adc rdx,0 + add r11,rax + mov rax,rcx + adc rdx,0 + + mov rcx,r9 + imul r9,r15 + + mov rbp,rdx + mul QWORD[24+rsi] + add r12,rbp + adc rdx,0 + xor r8,r8 + add r12,rax + mov rax,r9 + adc r13,rdx + adc r8,0 + + + mul QWORD[r14] + mov rbp,r9 + add rcx,rax + mov rax,r9 + adc rcx,rdx + + sub r11,r9 + sbb r9,0 + + mul QWORD[8+r14] + add r10,rcx + adc rdx,0 + add r10,rax + mov rax,rbp + adc r11,rdx + mov rdx,rbp + adc r9,0 + + shl rax,32 + shr rdx,32 + sub r12,rax + mov rax,QWORD[16+rbx] + sbb rbp,rdx + + add r12,r9 + adc r13,rbp + adc r8,0 + + + mov rcx,rax + mul QWORD[rsi] + add r10,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[8+rsi] + add r11,rbp + adc rdx,0 + add r11,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[16+rsi] + add r12,rbp + adc rdx,0 + add r12,rax + mov rax,rcx + adc rdx,0 + + mov rcx,r10 + imul r10,r15 + + mov rbp,rdx + mul QWORD[24+rsi] + add r13,rbp + adc rdx,0 + xor r9,r9 + add r13,rax + mov rax,r10 + adc r8,rdx + adc r9,0 + + + mul QWORD[r14] + mov rbp,r10 + add rcx,rax + mov rax,r10 + adc rcx,rdx + + sub r12,r10 + sbb r10,0 + + mul QWORD[8+r14] + add r11,rcx + adc rdx,0 + add r11,rax + mov rax,rbp + adc r12,rdx + mov rdx,rbp + adc r10,0 + + shl rax,32 + shr rdx,32 + sub r13,rax + mov rax,QWORD[24+rbx] + sbb rbp,rdx + + add r13,r10 + adc r8,rbp + adc r9,0 + + + mov rcx,rax + mul QWORD[rsi] + add r11,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[8+rsi] + add r12,rbp + adc rdx,0 + add r12,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[16+rsi] + add r13,rbp + adc rdx,0 + add r13,rax + mov rax,rcx + adc rdx,0 + + mov rcx,r11 + imul r11,r15 + + mov rbp,rdx + mul QWORD[24+rsi] + add r8,rbp + adc rdx,0 + xor r10,r10 + add r8,rax + mov rax,r11 + adc r9,rdx + adc r10,0 + + + mul QWORD[r14] + mov rbp,r11 + add rcx,rax + mov rax,r11 + adc rcx,rdx + + sub r13,r11 + sbb r11,0 + + mul QWORD[8+r14] + add r12,rcx + adc rdx,0 + add r12,rax + mov rax,rbp + adc r13,rdx + mov rdx,rbp + adc r11,0 + + shl rax,32 + shr rdx,32 + sub r8,rax + sbb rbp,rdx + + add r8,r11 + adc r9,rbp + adc r10,0 + + + mov rsi,r12 + sub r12,QWORD[r14] + mov r11,r13 + sbb r13,QWORD[8+r14] + mov rcx,r8 + sbb r8,QWORD[16+r14] + mov rbp,r9 + sbb r9,QWORD[24+r14] + sbb r10,0 + + cmovc r12,rsi + cmovc r13,r11 + cmovc r8,rcx + cmovc r9,rbp + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$ord_mul_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ecp_nistz256_ord_mul_mont: + + + + + + + +global ecp_nistz256_ord_sqr_mont + +ALIGN 32 +ecp_nistz256_ord_sqr_mont: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_ord_sqr_mont: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + lea rcx,[OPENSSL_ia32cap_P] + mov rcx,QWORD[8+rcx] + and ecx,0x80100 + cmp ecx,0x80100 + je NEAR $L$ecp_nistz256_ord_sqr_montx + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$ord_sqr_body: + + mov r8,QWORD[rsi] + mov rax,QWORD[8+rsi] + mov r14,QWORD[16+rsi] + mov r15,QWORD[24+rsi] + lea rsi,[$L$ord] + mov rbx,rdx + jmp NEAR $L$oop_ord_sqr + +ALIGN 32 +$L$oop_ord_sqr: + + mov rbp,rax + mul r8 + mov r9,rax +DB 102,72,15,110,205 + mov rax,r14 + mov r10,rdx + + mul r8 + add r10,rax + mov rax,r15 +DB 102,73,15,110,214 + adc 
rdx,0 + mov r11,rdx + + mul r8 + add r11,rax + mov rax,r15 +DB 102,73,15,110,223 + adc rdx,0 + mov r12,rdx + + + mul r14 + mov r13,rax + mov rax,r14 + mov r14,rdx + + + mul rbp + add r11,rax + mov rax,r15 + adc rdx,0 + mov r15,rdx + + mul rbp + add r12,rax + adc rdx,0 + + add r12,r15 + adc r13,rdx + adc r14,0 + + + xor r15,r15 + mov rax,r8 + add r9,r9 + adc r10,r10 + adc r11,r11 + adc r12,r12 + adc r13,r13 + adc r14,r14 + adc r15,0 + + + mul rax + mov r8,rax +DB 102,72,15,126,200 + mov rbp,rdx + + mul rax + add r9,rbp + adc r10,rax +DB 102,72,15,126,208 + adc rdx,0 + mov rbp,rdx + + mul rax + add r11,rbp + adc r12,rax +DB 102,72,15,126,216 + adc rdx,0 + mov rbp,rdx + + mov rcx,r8 + imul r8,QWORD[32+rsi] + + mul rax + add r13,rbp + adc r14,rax + mov rax,QWORD[rsi] + adc r15,rdx + + + mul r8 + mov rbp,r8 + add rcx,rax + mov rax,QWORD[8+rsi] + adc rcx,rdx + + sub r10,r8 + sbb rbp,0 + + mul r8 + add r9,rcx + adc rdx,0 + add r9,rax + mov rax,r8 + adc r10,rdx + mov rdx,r8 + adc rbp,0 + + mov rcx,r9 + imul r9,QWORD[32+rsi] + + shl rax,32 + shr rdx,32 + sub r11,rax + mov rax,QWORD[rsi] + sbb r8,rdx + + add r11,rbp + adc r8,0 + + + mul r9 + mov rbp,r9 + add rcx,rax + mov rax,QWORD[8+rsi] + adc rcx,rdx + + sub r11,r9 + sbb rbp,0 + + mul r9 + add r10,rcx + adc rdx,0 + add r10,rax + mov rax,r9 + adc r11,rdx + mov rdx,r9 + adc rbp,0 + + mov rcx,r10 + imul r10,QWORD[32+rsi] + + shl rax,32 + shr rdx,32 + sub r8,rax + mov rax,QWORD[rsi] + sbb r9,rdx + + add r8,rbp + adc r9,0 + + + mul r10 + mov rbp,r10 + add rcx,rax + mov rax,QWORD[8+rsi] + adc rcx,rdx + + sub r8,r10 + sbb rbp,0 + + mul r10 + add r11,rcx + adc rdx,0 + add r11,rax + mov rax,r10 + adc r8,rdx + mov rdx,r10 + adc rbp,0 + + mov rcx,r11 + imul r11,QWORD[32+rsi] + + shl rax,32 + shr rdx,32 + sub r9,rax + mov rax,QWORD[rsi] + sbb r10,rdx + + add r9,rbp + adc r10,0 + + + mul r11 + mov rbp,r11 + add rcx,rax + mov rax,QWORD[8+rsi] + adc rcx,rdx + + sub r9,r11 + sbb rbp,0 + + mul r11 + add r8,rcx + adc rdx,0 + add r8,rax + mov rax,r11 + adc r9,rdx + mov rdx,r11 + adc rbp,0 + + shl rax,32 + shr rdx,32 + sub r10,rax + sbb r11,rdx + + add r10,rbp + adc r11,0 + + + xor rdx,rdx + add r8,r12 + adc r9,r13 + mov r12,r8 + adc r10,r14 + adc r11,r15 + mov rax,r9 + adc rdx,0 + + + sub r8,QWORD[rsi] + mov r14,r10 + sbb r9,QWORD[8+rsi] + sbb r10,QWORD[16+rsi] + mov r15,r11 + sbb r11,QWORD[24+rsi] + sbb rdx,0 + + cmovc r8,r12 + cmovnc rax,r9 + cmovnc r14,r10 + cmovnc r15,r11 + + dec rbx + jnz NEAR $L$oop_ord_sqr + + mov QWORD[rdi],r8 + mov QWORD[8+rdi],rax + pxor xmm1,xmm1 + mov QWORD[16+rdi],r14 + pxor xmm2,xmm2 + mov QWORD[24+rdi],r15 + pxor xmm3,xmm3 + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$ord_sqr_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ecp_nistz256_ord_sqr_mont: + + +ALIGN 32 +ecp_nistz256_ord_mul_montx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_ord_mul_montx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +$L$ecp_nistz256_ord_mul_montx: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$ord_mulx_body: + + mov rbx,rdx + mov rdx,QWORD[rdx] + mov r9,QWORD[rsi] + mov r10,QWORD[8+rsi] + mov r11,QWORD[16+rsi] + mov r12,QWORD[24+rsi] + lea rsi,[((-128))+rsi] + lea r14,[(($L$ord-128))] + mov r15,QWORD[$L$ordK] + + + mulx r9,r8,r9 + mulx r10,rcx,r10 + mulx r11,rbp,r11 + 
add r9,rcx + mulx r12,rcx,r12 + mov rdx,r8 + mulx rax,rdx,r15 + adc r10,rbp + adc r11,rcx + adc r12,0 + + + xor r13,r13 + mulx rbp,rcx,QWORD[((0+128))+r14] + adcx r8,rcx + adox r9,rbp + + mulx rbp,rcx,QWORD[((8+128))+r14] + adcx r9,rcx + adox r10,rbp + + mulx rbp,rcx,QWORD[((16+128))+r14] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((24+128))+r14] + mov rdx,QWORD[8+rbx] + adcx r11,rcx + adox r12,rbp + adcx r12,r8 + adox r13,r8 + adc r13,0 + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r9,rcx + adox r10,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r9 + mulx rax,rdx,r15 + adcx r12,rcx + adox r13,rbp + + adcx r13,r8 + adox r8,r8 + adc r8,0 + + + mulx rbp,rcx,QWORD[((0+128))+r14] + adcx r9,rcx + adox r10,rbp + + mulx rbp,rcx,QWORD[((8+128))+r14] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((16+128))+r14] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((24+128))+r14] + mov rdx,QWORD[16+rbx] + adcx r12,rcx + adox r13,rbp + adcx r13,r9 + adox r8,r9 + adc r8,0 + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r10 + mulx rax,rdx,r15 + adcx r13,rcx + adox r8,rbp + + adcx r8,r9 + adox r9,r9 + adc r9,0 + + + mulx rbp,rcx,QWORD[((0+128))+r14] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((8+128))+r14] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((16+128))+r14] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((24+128))+r14] + mov rdx,QWORD[24+rbx] + adcx r13,rcx + adox r8,rbp + adcx r8,r10 + adox r9,r10 + adc r9,0 + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r13,rcx + adox r8,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r11 + mulx rax,rdx,r15 + adcx r8,rcx + adox r9,rbp + + adcx r9,r10 + adox r10,r10 + adc r10,0 + + + mulx rbp,rcx,QWORD[((0+128))+r14] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((8+128))+r14] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((16+128))+r14] + adcx r13,rcx + adox r8,rbp + + mulx rbp,rcx,QWORD[((24+128))+r14] + lea r14,[128+r14] + mov rbx,r12 + adcx r8,rcx + adox r9,rbp + mov rdx,r13 + adcx r9,r11 + adox r10,r11 + adc r10,0 + + + + mov rcx,r8 + sub r12,QWORD[r14] + sbb r13,QWORD[8+r14] + sbb r8,QWORD[16+r14] + mov rbp,r9 + sbb r9,QWORD[24+r14] + sbb r10,0 + + cmovc r12,rbx + cmovc r13,rdx + cmovc r8,rcx + cmovc r9,rbp + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$ord_mulx_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ecp_nistz256_ord_mul_montx: + + +ALIGN 32 +ecp_nistz256_ord_sqr_montx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_ord_sqr_montx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +$L$ecp_nistz256_ord_sqr_montx: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$ord_sqrx_body: + + mov rbx,rdx + mov rdx,QWORD[rsi] + mov r14,QWORD[8+rsi] + mov 
r15,QWORD[16+rsi] + mov r8,QWORD[24+rsi] + lea rsi,[$L$ord] + jmp NEAR $L$oop_ord_sqrx + +ALIGN 32 +$L$oop_ord_sqrx: + mulx r10,r9,r14 + mulx r11,rcx,r15 + mov rax,rdx +DB 102,73,15,110,206 + mulx r12,rbp,r8 + mov rdx,r14 + add r10,rcx +DB 102,73,15,110,215 + adc r11,rbp + adc r12,0 + xor r13,r13 + + mulx rbp,rcx,r15 + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,r8 + mov rdx,r15 + adcx r12,rcx + adox r13,rbp + adc r13,0 + + mulx r14,rcx,r8 + mov rdx,rax +DB 102,73,15,110,216 + xor r15,r15 + adcx r9,r9 + adox r13,rcx + adcx r10,r10 + adox r14,r15 + + + mulx rbp,r8,rdx +DB 102,72,15,126,202 + adcx r11,r11 + adox r9,rbp + adcx r12,r12 + mulx rax,rcx,rdx +DB 102,72,15,126,210 + adcx r13,r13 + adox r10,rcx + adcx r14,r14 + mulx rbp,rcx,rdx +DB 0x67 +DB 102,72,15,126,218 + adox r11,rax + adcx r15,r15 + adox r12,rcx + adox r13,rbp + mulx rax,rcx,rdx + adox r14,rcx + adox r15,rax + + + mov rdx,r8 + mulx rcx,rdx,QWORD[32+rsi] + + xor rax,rax + mulx rbp,rcx,QWORD[rsi] + adcx r8,rcx + adox r9,rbp + mulx rbp,rcx,QWORD[8+rsi] + adcx r9,rcx + adox r10,rbp + mulx rbp,rcx,QWORD[16+rsi] + adcx r10,rcx + adox r11,rbp + mulx rbp,rcx,QWORD[24+rsi] + adcx r11,rcx + adox r8,rbp + adcx r8,rax + + + mov rdx,r9 + mulx rcx,rdx,QWORD[32+rsi] + + mulx rbp,rcx,QWORD[rsi] + adox r9,rcx + adcx r10,rbp + mulx rbp,rcx,QWORD[8+rsi] + adox r10,rcx + adcx r11,rbp + mulx rbp,rcx,QWORD[16+rsi] + adox r11,rcx + adcx r8,rbp + mulx rbp,rcx,QWORD[24+rsi] + adox r8,rcx + adcx r9,rbp + adox r9,rax + + + mov rdx,r10 + mulx rcx,rdx,QWORD[32+rsi] + + mulx rbp,rcx,QWORD[rsi] + adcx r10,rcx + adox r11,rbp + mulx rbp,rcx,QWORD[8+rsi] + adcx r11,rcx + adox r8,rbp + mulx rbp,rcx,QWORD[16+rsi] + adcx r8,rcx + adox r9,rbp + mulx rbp,rcx,QWORD[24+rsi] + adcx r9,rcx + adox r10,rbp + adcx r10,rax + + + mov rdx,r11 + mulx rcx,rdx,QWORD[32+rsi] + + mulx rbp,rcx,QWORD[rsi] + adox r11,rcx + adcx r8,rbp + mulx rbp,rcx,QWORD[8+rsi] + adox r8,rcx + adcx r9,rbp + mulx rbp,rcx,QWORD[16+rsi] + adox r9,rcx + adcx r10,rbp + mulx rbp,rcx,QWORD[24+rsi] + adox r10,rcx + adcx r11,rbp + adox r11,rax + + + add r12,r8 + adc r9,r13 + mov rdx,r12 + adc r10,r14 + adc r11,r15 + mov r14,r9 + adc rax,0 + + + sub r12,QWORD[rsi] + mov r15,r10 + sbb r9,QWORD[8+rsi] + sbb r10,QWORD[16+rsi] + mov r8,r11 + sbb r11,QWORD[24+rsi] + sbb rax,0 + + cmovnc rdx,r12 + cmovnc r14,r9 + cmovnc r15,r10 + cmovnc r8,r11 + + dec rbx + jnz NEAR $L$oop_ord_sqrx + + mov QWORD[rdi],rdx + mov QWORD[8+rdi],r14 + pxor xmm1,xmm1 + mov QWORD[16+rdi],r15 + pxor xmm2,xmm2 + mov QWORD[24+rdi],r8 + pxor xmm3,xmm3 + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$ord_sqrx_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ecp_nistz256_ord_sqr_montx: + + + + + + global ecp_nistz256_mul_mont ALIGN 32 @@ -94,13 +1228,26 @@ $L$SEH_begin_ecp_nistz256_mul_mont: mov rdx,r8 + + lea rcx,[OPENSSL_ia32cap_P] + mov rcx,QWORD[8+rcx] + and ecx,0x80100 $L$mul_mont: push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + +$L$mul_body: + cmp ecx,0x80100 + je NEAR $L$mul_montx mov rbx,rdx mov rax,QWORD[rdx] mov r9,QWORD[rsi] @@ -109,16 +1256,39 @@ $L$mul_mont: mov r12,QWORD[24+rsi] call __ecp_nistz256_mul_montq + jmp NEAR $L$mul_mont_done + +ALIGN 32 +$L$mul_montx: + mov rbx,rdx + mov rdx,QWORD[rdx] + mov r9,QWORD[rsi] + mov r10,QWORD[8+rsi] + mov r11,QWORD[16+rsi] + mov r12,QWORD[24+rsi] + lea rsi,[((-128))+rsi] + + call 
__ecp_nistz256_mul_montx $L$mul_mont_done: - pop r15 - pop r14 - pop r13 - pop r12 - pop rbx - pop rbp + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$mul_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ecp_nistz256_mul_mont: @@ -126,6 +1296,7 @@ ALIGN 32 __ecp_nistz256_mul_montq: + mov rbp,rax mul r9 mov r14,QWORD[(($L$poly+8))] @@ -344,6 +1515,7 @@ __ecp_nistz256_mul_montq: + global ecp_nistz256_sqr_mont ALIGN 32 @@ -356,33 +1528,68 @@ $L$SEH_begin_ecp_nistz256_sqr_mont: mov rsi,rdx + + lea rcx,[OPENSSL_ia32cap_P] + mov rcx,QWORD[8+rcx] + and ecx,0x80100 push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + +$L$sqr_body: + cmp ecx,0x80100 + je NEAR $L$sqr_montx mov rax,QWORD[rsi] mov r14,QWORD[8+rsi] mov r15,QWORD[16+rsi] mov r8,QWORD[24+rsi] call __ecp_nistz256_sqr_montq + jmp NEAR $L$sqr_mont_done + +ALIGN 32 +$L$sqr_montx: + mov rdx,QWORD[rsi] + mov r14,QWORD[8+rsi] + mov r15,QWORD[16+rsi] + mov r8,QWORD[24+rsi] + lea rsi,[((-128))+rsi] + + call __ecp_nistz256_sqr_montx $L$sqr_mont_done: - pop r15 - pop r14 - pop r13 - pop r12 - pop rbx - pop rbp + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$sqr_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ecp_nistz256_sqr_mont: ALIGN 32 __ecp_nistz256_sqr_montq: + mov r13,rax mul r14 mov r9,rax @@ -543,10 +1750,310 @@ __ecp_nistz256_sqr_montq: +ALIGN 32 +__ecp_nistz256_mul_montx: + + + + mulx r9,r8,r9 + mulx r10,rcx,r10 + mov r14,32 + xor r13,r13 + mulx r11,rbp,r11 + mov r15,QWORD[(($L$poly+24))] + adc r9,rcx + mulx r12,rcx,r12 + mov rdx,r8 + adc r10,rbp + shlx rbp,r8,r14 + adc r11,rcx + shrx rcx,r8,r14 + adc r12,0 + + + + add r9,rbp + adc r10,rcx + + mulx rbp,rcx,r15 + mov rdx,QWORD[8+rbx] + adc r11,rcx + adc r12,rbp + adc r13,0 + xor r8,r8 + + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r9,rcx + adox r10,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r9 + adcx r12,rcx + shlx rcx,r9,r14 + adox r13,rbp + shrx rbp,r9,r14 + + adcx r13,r8 + adox r8,r8 + adc r8,0 + + + + add r10,rcx + adc r11,rbp + + mulx rbp,rcx,r15 + mov rdx,QWORD[16+rbx] + adc r12,rcx + adc r13,rbp + adc r8,0 + xor r9,r9 + + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r10 + adcx r13,rcx + shlx rcx,r10,r14 + adox r8,rbp + shrx rbp,r10,r14 + + adcx r8,r9 + adox r9,r9 + adc r9,0 + + + + add r11,rcx + adc r12,rbp + + mulx rbp,rcx,r15 + mov rdx,QWORD[24+rbx] + adc r13,rcx + adc r8,rbp + adc r9,0 + xor r10,r10 + + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r13,rcx + adox r8,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r11 + adcx r8,rcx + shlx rcx,r11,r14 + adox r9,rbp + shrx rbp,r11,r14 + + adcx r9,r10 + adox r10,r10 + adc r10,0 + + + + add r12,rcx + adc r13,rbp + + mulx rbp,rcx,r15 + mov rbx,r12 + mov 
r14,QWORD[(($L$poly+8))] + adc r8,rcx + mov rdx,r13 + adc r9,rbp + adc r10,0 + + + + xor eax,eax + mov rcx,r8 + sbb r12,-1 + sbb r13,r14 + sbb r8,0 + mov rbp,r9 + sbb r9,r15 + sbb r10,0 + + cmovc r12,rbx + cmovc r13,rdx + mov QWORD[rdi],r12 + cmovc r8,rcx + mov QWORD[8+rdi],r13 + cmovc r9,rbp + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + DB 0F3h,0C3h ;repret + + + + +ALIGN 32 +__ecp_nistz256_sqr_montx: + + mulx r10,r9,r14 + mulx r11,rcx,r15 + xor eax,eax + adc r10,rcx + mulx r12,rbp,r8 + mov rdx,r14 + adc r11,rbp + adc r12,0 + xor r13,r13 + + + mulx rbp,rcx,r15 + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,r8 + mov rdx,r15 + adcx r12,rcx + adox r13,rbp + adc r13,0 + + + mulx r14,rcx,r8 + mov rdx,QWORD[((0+128))+rsi] + xor r15,r15 + adcx r9,r9 + adox r13,rcx + adcx r10,r10 + adox r14,r15 + + mulx rbp,r8,rdx + mov rdx,QWORD[((8+128))+rsi] + adcx r11,r11 + adox r9,rbp + adcx r12,r12 + mulx rax,rcx,rdx + mov rdx,QWORD[((16+128))+rsi] + adcx r13,r13 + adox r10,rcx + adcx r14,r14 +DB 0x67 + mulx rbp,rcx,rdx + mov rdx,QWORD[((24+128))+rsi] + adox r11,rax + adcx r15,r15 + adox r12,rcx + mov rsi,32 + adox r13,rbp +DB 0x67,0x67 + mulx rax,rcx,rdx + mov rdx,QWORD[(($L$poly+24))] + adox r14,rcx + shlx rcx,r8,rsi + adox r15,rax + shrx rax,r8,rsi + mov rbp,rdx + + + add r9,rcx + adc r10,rax + + mulx r8,rcx,r8 + adc r11,rcx + shlx rcx,r9,rsi + adc r8,0 + shrx rax,r9,rsi + + + add r10,rcx + adc r11,rax + + mulx r9,rcx,r9 + adc r8,rcx + shlx rcx,r10,rsi + adc r9,0 + shrx rax,r10,rsi + + + add r11,rcx + adc r8,rax + + mulx r10,rcx,r10 + adc r9,rcx + shlx rcx,r11,rsi + adc r10,0 + shrx rax,r11,rsi + + + add r8,rcx + adc r9,rax + + mulx r11,rcx,r11 + adc r10,rcx + adc r11,0 + + xor rdx,rdx + add r12,r8 + mov rsi,QWORD[(($L$poly+8))] + adc r13,r9 + mov r8,r12 + adc r14,r10 + adc r15,r11 + mov r9,r13 + adc rdx,0 + + sub r12,-1 + mov r10,r14 + sbb r13,rsi + sbb r14,0 + mov r11,r15 + sbb r15,rbp + sbb rdx,0 + + cmovc r12,r8 + cmovc r13,r9 + mov QWORD[rdi],r12 + cmovc r14,r10 + mov QWORD[8+rdi],r13 + cmovc r15,r11 + mov QWORD[16+rdi],r14 + mov QWORD[24+rdi],r15 + + DB 0F3h,0C3h ;repret + + + + global ecp_nistz256_select_w5 ALIGN 32 ecp_nistz256_select_w5: + lea rax,[OPENSSL_ia32cap_P] mov rax,QWORD[8+rax] test eax,32 @@ -625,9 +2132,10 @@ $L$select_loop_sse_w5: movaps xmm14,XMMWORD[128+rsp] movaps xmm15,XMMWORD[144+rsp] lea rsp,[168+rsp] -$L$SEH_end_ecp_nistz256_select_w5: DB 0F3h,0C3h ;repret +$L$SEH_end_ecp_nistz256_select_w5: + @@ -635,6 +2143,7 @@ global ecp_nistz256_select_w7 ALIGN 32 ecp_nistz256_select_w7: + lea rax,[OPENSSL_ia32cap_P] mov rax,QWORD[8+rax] test eax,32 @@ -702,17 +2211,20 @@ $L$select_loop_sse_w7: movaps xmm14,XMMWORD[128+rsp] movaps xmm15,XMMWORD[144+rsp] lea rsp,[168+rsp] -$L$SEH_end_ecp_nistz256_select_w7: DB 0F3h,0C3h ;repret +$L$SEH_end_ecp_nistz256_select_w7: + ALIGN 32 ecp_nistz256_avx2_select_w5: + $L$avx2_select_w5: vzeroupper lea rax,[((-136))+rsp] + mov r11,rsp $L$SEH_begin_ecp_nistz256_avx2_select_w5: DB 0x48,0x8d,0x60,0xe0 DB 0xc5,0xf8,0x29,0x70,0xe0 @@ -786,10 +2298,11 @@ $L$select_loop_avx2_w5: movaps xmm13,XMMWORD[112+rsp] movaps xmm14,XMMWORD[128+rsp] movaps xmm15,XMMWORD[144+rsp] - lea rsp,[168+rsp] -$L$SEH_end_ecp_nistz256_avx2_select_w5: + lea rsp,[r11] DB 0F3h,0C3h ;repret +$L$SEH_end_ecp_nistz256_avx2_select_w5: + @@ -797,8 +2310,10 @@ global ecp_nistz256_avx2_select_w7 ALIGN 32 ecp_nistz256_avx2_select_w7: + $L$avx2_select_w7: vzeroupper + mov r11,rsp lea rax,[((-136))+rsp] $L$SEH_begin_ecp_nistz256_avx2_select_w7: DB 0x48,0x8d,0x60,0xe0 @@ -888,13 +2403,15 @@ 
$L$select_loop_avx2_w7: movaps xmm13,XMMWORD[112+rsp] movaps xmm14,XMMWORD[128+rsp] movaps xmm15,XMMWORD[144+rsp] - lea rsp,[168+rsp] -$L$SEH_end_ecp_nistz256_avx2_select_w7: + lea rsp,[r11] DB 0F3h,0C3h ;repret +$L$SEH_end_ecp_nistz256_avx2_select_w7: + ALIGN 32 __ecp_nistz256_add_toq: + xor r11,r11 add r12,QWORD[rbx] adc r13,QWORD[8+rbx] @@ -925,8 +2442,10 @@ __ecp_nistz256_add_toq: + ALIGN 32 __ecp_nistz256_sub_fromq: + sub r12,QWORD[rbx] sbb r13,QWORD[8+rbx] mov rax,r12 @@ -956,8 +2475,10 @@ __ecp_nistz256_sub_fromq: + ALIGN 32 __ecp_nistz256_subq: + sub rax,r12 sbb rbp,r13 mov r12,rax @@ -983,8 +2504,10 @@ __ecp_nistz256_subq: + ALIGN 32 __ecp_nistz256_mul_by_2q: + xor r11,r11 add r12,r12 adc r13,r13 @@ -1013,6 +2536,7 @@ __ecp_nistz256_mul_by_2q: DB 0F3h,0C3h ;repret + global ecp_nistz256_point_double ALIGN 32 @@ -1025,14 +2549,28 @@ $L$SEH_begin_ecp_nistz256_point_double: mov rsi,rdx + + lea rcx,[OPENSSL_ia32cap_P] + mov rcx,QWORD[8+rcx] + and ecx,0x80100 + cmp ecx,0x80100 + je NEAR $L$point_doublex push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + sub rsp,32*5+8 +$L$point_doubleq_body: + $L$point_double_shortcutq: movdqu xmm0,XMMWORD[rsi] mov rbx,rsi @@ -1214,16 +2752,27 @@ DB 102,72,15,126,203 DB 102,72,15,126,207 call __ecp_nistz256_sub_fromq - add rsp,32*5+8 - pop r15 - pop r14 - pop r13 - pop r12 - pop rbx - pop rbp + lea rsi,[((160+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$point_doubleq_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ecp_nistz256_point_double: global ecp_nistz256_point_add @@ -1238,14 +2787,28 @@ $L$SEH_begin_ecp_nistz256_point_add: mov rdx,r8 + + lea rcx,[OPENSSL_ia32cap_P] + mov rcx,QWORD[8+rcx] + and ecx,0x80100 + cmp ecx,0x80100 + je NEAR $L$point_addx push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + sub rsp,32*18+8 +$L$point_addq_body: + movdqu xmm0,XMMWORD[rsi] movdqu xmm1,XMMWORD[16+rsi] movdqu xmm2,XMMWORD[32+rsi] @@ -1390,15 +2953,22 @@ DB 102,73,15,110,220 or r12,r8 or r12,r9 -DB 0x3e - jnz NEAR $L$add_proceedq DB 102,73,15,126,208 DB 102,73,15,126,217 - test r8,r8 + or r12,r8 +DB 0x3e jnz NEAR $L$add_proceedq + + + test r9,r9 jz NEAR $L$add_doubleq + + + + + DB 102,72,15,126,199 pxor xmm0,xmm0 movdqu XMMWORD[rdi],xmm0 @@ -1414,8 +2984,10 @@ $L$add_doubleq: DB 102,72,15,126,206 DB 102,72,15,126,199 add rsp,416 + jmp NEAR $L$point_double_shortcutq + ALIGN 32 $L$add_proceedq: mov rax,QWORD[((0+64))+rsp] @@ -1621,16 +3193,27 @@ DB 102,72,15,126,199 movdqu XMMWORD[48+rdi],xmm3 $L$add_doneq: - add rsp,32*18+8 - pop r15 - pop r14 - pop r13 - pop r12 - pop rbx - pop rbp + lea rsi,[((576+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$point_addq_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_ecp_nistz256_point_add: global ecp_nistz256_point_add_affine @@ -1645,14 +3228,28 @@ $L$SEH_begin_ecp_nistz256_point_add_affine: mov rdx,r8 + + lea rcx,[OPENSSL_ia32cap_P] + mov rcx,QWORD[8+rcx] + and ecx,0x80100 + cmp ecx,0x80100 + je NEAR $L$point_add_affinex push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + sub rsp,32*15+8 +$L$add_affineq_body: + movdqu 
xmm0,XMMWORD[rsi] mov rbx,rdx movdqu xmm1,XMMWORD[16+rsi] @@ -1934,14 +3531,1454 @@ DB 102,72,15,126,199 movdqu XMMWORD[32+rdi],xmm2 movdqu XMMWORD[48+rdi],xmm3 - add rsp,32*15+8 + lea rsi,[((480+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$add_affineq_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ecp_nistz256_point_add_affine: + +ALIGN 32 +__ecp_nistz256_add_tox: + + xor r11,r11 + adc r12,QWORD[rbx] + adc r13,QWORD[8+rbx] + mov rax,r12 + adc r8,QWORD[16+rbx] + adc r9,QWORD[24+rbx] + mov rbp,r13 + adc r11,0 + + xor r10,r10 + sbb r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + cmovc r13,rbp + mov QWORD[rdi],r12 + cmovc r8,rcx + mov QWORD[8+rdi],r13 + cmovc r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + DB 0F3h,0C3h ;repret + + + + +ALIGN 32 +__ecp_nistz256_sub_fromx: + + xor r11,r11 + sbb r12,QWORD[rbx] + sbb r13,QWORD[8+rbx] + mov rax,r12 + sbb r8,QWORD[16+rbx] + sbb r9,QWORD[24+rbx] + mov rbp,r13 + sbb r11,0 + + xor r10,r10 + adc r12,-1 + mov rcx,r8 + adc r13,r14 + adc r8,0 + mov r10,r9 + adc r9,r15 + + bt r11,0 + cmovnc r12,rax + cmovnc r13,rbp + mov QWORD[rdi],r12 + cmovnc r8,rcx + mov QWORD[8+rdi],r13 + cmovnc r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + DB 0F3h,0C3h ;repret + + + + +ALIGN 32 +__ecp_nistz256_subx: + + xor r11,r11 + sbb rax,r12 + sbb rbp,r13 + mov r12,rax + sbb rcx,r8 + sbb r10,r9 + mov r13,rbp + sbb r11,0 + + xor r9,r9 + adc rax,-1 + mov r8,rcx + adc rbp,r14 + adc rcx,0 + mov r9,r10 + adc r10,r15 + + bt r11,0 + cmovc r12,rax + cmovc r13,rbp + cmovc r8,rcx + cmovc r9,r10 + + DB 0F3h,0C3h ;repret + + + + +ALIGN 32 +__ecp_nistz256_mul_by_2x: + + xor r11,r11 + adc r12,r12 + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + adc r11,0 + + xor r10,r10 + sbb r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + cmovc r13,rbp + mov QWORD[rdi],r12 + cmovc r8,rcx + mov QWORD[8+rdi],r13 + cmovc r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + DB 0F3h,0C3h ;repret + + + +ALIGN 32 +ecp_nistz256_point_doublex: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_point_doublex: + mov rdi,rcx + mov rsi,rdx + + + +$L$point_doublex: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,32*5+8 + +$L$point_doublex_body: + +$L$point_double_shortcutx: + movdqu xmm0,XMMWORD[rsi] + mov rbx,rsi + movdqu xmm1,XMMWORD[16+rsi] + mov r12,QWORD[((32+0))+rsi] + mov r13,QWORD[((32+8))+rsi] + mov r8,QWORD[((32+16))+rsi] + mov r9,QWORD[((32+24))+rsi] + mov r14,QWORD[(($L$poly+8))] + mov r15,QWORD[(($L$poly+24))] + movdqa XMMWORD[96+rsp],xmm0 + movdqa XMMWORD[(96+16)+rsp],xmm1 + lea r10,[32+rdi] + lea r11,[64+rdi] +DB 102,72,15,110,199 +DB 102,73,15,110,202 +DB 102,73,15,110,211 + + lea rdi,[rsp] + call __ecp_nistz256_mul_by_2x + + mov rdx,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + lea rsi,[((64-128))+rsi] + lea rdi,[64+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((-128+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[32+rbx] 
+ mov r9,QWORD[((64+0))+rbx] + mov r10,QWORD[((64+8))+rbx] + mov r11,QWORD[((64+16))+rbx] + mov r12,QWORD[((64+24))+rbx] + lea rsi,[((64-128))+rbx] + lea rbx,[32+rbx] +DB 102,72,15,126,215 + call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_by_2x + + mov r12,QWORD[((96+0))+rsp] + mov r13,QWORD[((96+8))+rsp] + lea rbx,[64+rsp] + mov r8,QWORD[((96+16))+rsp] + mov r9,QWORD[((96+24))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_add_tox + + mov r12,QWORD[((96+0))+rsp] + mov r13,QWORD[((96+8))+rsp] + lea rbx,[64+rsp] + mov r8,QWORD[((96+16))+rsp] + mov r9,QWORD[((96+24))+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromx + + mov rdx,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((-128+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] +DB 102,72,15,126,207 + call __ecp_nistz256_sqr_montx + xor r9,r9 + mov rax,r12 + add r12,-1 + mov r10,r13 + adc r13,rsi + mov rcx,r14 + adc r14,0 + mov r8,r15 + adc r15,rbp + adc r9,0 + xor rsi,rsi + test rax,1 + + cmovz r12,rax + cmovz r13,r10 + cmovz r14,rcx + cmovz r15,r8 + cmovz r9,rsi + + mov rax,r13 + shr r12,1 + shl rax,63 + mov r10,r14 + shr r13,1 + or r12,rax + shl r10,63 + mov rcx,r15 + shr r14,1 + or r13,r10 + shl rcx,63 + mov QWORD[rdi],r12 + shr r15,1 + mov QWORD[8+rdi],r13 + shl r9,63 + or r14,rcx + or r15,r9 + mov QWORD[16+rdi],r14 + mov QWORD[24+rdi],r15 + mov rdx,QWORD[64+rsp] + lea rbx,[64+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montx + + lea rdi,[128+rsp] + call __ecp_nistz256_mul_by_2x + + lea rbx,[32+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_add_tox + + mov rdx,QWORD[96+rsp] + lea rbx,[96+rsp] + mov r9,QWORD[((0+0))+rsp] + mov r10,QWORD[((8+0))+rsp] + lea rsi,[((-128+0))+rsp] + mov r11,QWORD[((16+0))+rsp] + mov r12,QWORD[((24+0))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_mul_montx + + lea rdi,[128+rsp] + call __ecp_nistz256_mul_by_2x + + mov rdx,QWORD[((0+32))+rsp] + mov r14,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r15,QWORD[((16+32))+rsp] + mov r8,QWORD[((24+32))+rsp] +DB 102,72,15,126,199 + call __ecp_nistz256_sqr_montx + + lea rbx,[128+rsp] + mov r8,r14 + mov r9,r15 + mov r14,rsi + mov r15,rbp + call __ecp_nistz256_sub_fromx + + mov rax,QWORD[((0+0))+rsp] + mov rbp,QWORD[((0+8))+rsp] + mov rcx,QWORD[((0+16))+rsp] + mov r10,QWORD[((0+24))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_subx + + mov rdx,QWORD[32+rsp] + lea rbx,[32+rsp] + mov r14,r12 + xor ecx,ecx + mov QWORD[((0+0))+rsp],r12 + mov r10,r13 + mov QWORD[((0+8))+rsp],r13 + cmovz r11,r8 + mov QWORD[((0+16))+rsp],r8 + lea rsi,[((0-128))+rsp] + cmovz r12,r9 + mov QWORD[((0+24))+rsp],r9 + mov r9,r14 + lea rdi,[rsp] + call __ecp_nistz256_mul_montx + +DB 102,72,15,126,203 +DB 102,72,15,126,207 + call __ecp_nistz256_sub_fromx + + lea rsi,[((160+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$point_doublex_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ecp_nistz256_point_doublex: + +ALIGN 32 +ecp_nistz256_point_addx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_point_addx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +$L$point_addx: + push rbp + + push rbx + + push r12 + + push r13 + + 
push r14 + + push r15 + + sub rsp,32*18+8 + +$L$point_addx_body: + + movdqu xmm0,XMMWORD[rsi] + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm3,XMMWORD[48+rsi] + movdqu xmm4,XMMWORD[64+rsi] + movdqu xmm5,XMMWORD[80+rsi] + mov rbx,rsi + mov rsi,rdx + movdqa XMMWORD[384+rsp],xmm0 + movdqa XMMWORD[(384+16)+rsp],xmm1 + movdqa XMMWORD[416+rsp],xmm2 + movdqa XMMWORD[(416+16)+rsp],xmm3 + movdqa XMMWORD[448+rsp],xmm4 + movdqa XMMWORD[(448+16)+rsp],xmm5 + por xmm5,xmm4 + + movdqu xmm0,XMMWORD[rsi] + pshufd xmm3,xmm5,0xb1 + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + por xmm5,xmm3 + movdqu xmm3,XMMWORD[48+rsi] + mov rdx,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + movdqa XMMWORD[480+rsp],xmm0 + pshufd xmm4,xmm5,0x1e + movdqa XMMWORD[(480+16)+rsp],xmm1 + movdqu xmm0,XMMWORD[64+rsi] + movdqu xmm1,XMMWORD[80+rsi] + movdqa XMMWORD[512+rsp],xmm2 + movdqa XMMWORD[(512+16)+rsp],xmm3 + por xmm5,xmm4 + pxor xmm4,xmm4 + por xmm1,xmm0 +DB 102,72,15,110,199 + + lea rsi,[((64-128))+rsi] + mov QWORD[((544+0))+rsp],rdx + mov QWORD[((544+8))+rsp],r14 + mov QWORD[((544+16))+rsp],r15 + mov QWORD[((544+24))+rsp],r8 + lea rdi,[96+rsp] + call __ecp_nistz256_sqr_montx + + pcmpeqd xmm5,xmm4 + pshufd xmm4,xmm1,0xb1 + por xmm4,xmm1 + pshufd xmm5,xmm5,0 + pshufd xmm3,xmm4,0x1e + por xmm4,xmm3 + pxor xmm3,xmm3 + pcmpeqd xmm4,xmm3 + pshufd xmm4,xmm4,0 + mov rdx,QWORD[((64+0))+rbx] + mov r14,QWORD[((64+8))+rbx] + mov r15,QWORD[((64+16))+rbx] + mov r8,QWORD[((64+24))+rbx] +DB 102,72,15,110,203 + + lea rsi,[((64-128))+rbx] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[544+rsp] + lea rbx,[544+rsp] + mov r9,QWORD[((0+96))+rsp] + mov r10,QWORD[((8+96))+rsp] + lea rsi,[((-128+96))+rsp] + mov r11,QWORD[((16+96))+rsp] + mov r12,QWORD[((24+96))+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[416+rsp] + lea rbx,[416+rsp] + mov r9,QWORD[((0+224))+rsp] + mov r10,QWORD[((8+224))+rsp] + lea rsi,[((-128+224))+rsp] + mov r11,QWORD[((16+224))+rsp] + mov r12,QWORD[((24+224))+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[512+rsp] + lea rbx,[512+rsp] + mov r9,QWORD[((0+256))+rsp] + mov r10,QWORD[((8+256))+rsp] + lea rsi,[((-128+256))+rsp] + mov r11,QWORD[((16+256))+rsp] + mov r12,QWORD[((24+256))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[224+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromx + + or r12,r13 + movdqa xmm2,xmm4 + or r12,r8 + or r12,r9 + por xmm2,xmm5 +DB 102,73,15,110,220 + + mov rdx,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+96))+rsp] + mov r10,QWORD[((8+96))+rsp] + lea rsi,[((-128+96))+rsp] + mov r11,QWORD[((16+96))+rsp] + mov r12,QWORD[((24+96))+rsp] + lea rdi,[160+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[480+rsp] + lea rbx,[480+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[160+rsp] + lea rdi,[rsp] + call __ecp_nistz256_sub_fromx + + or r12,r13 + or r12,r8 + or r12,r9 + +DB 102,73,15,126,208 +DB 102,73,15,126,217 + or r12,r8 +DB 0x3e + jnz NEAR $L$add_proceedx + + + + test 
r9,r9 + jz NEAR $L$add_doublex + + + + + + +DB 102,72,15,126,199 + pxor xmm0,xmm0 + movdqu XMMWORD[rdi],xmm0 + movdqu XMMWORD[16+rdi],xmm0 + movdqu XMMWORD[32+rdi],xmm0 + movdqu XMMWORD[48+rdi],xmm0 + movdqu XMMWORD[64+rdi],xmm0 + movdqu XMMWORD[80+rdi],xmm0 + jmp NEAR $L$add_donex + +ALIGN 32 +$L$add_doublex: +DB 102,72,15,126,206 +DB 102,72,15,126,199 + add rsp,416 + + jmp NEAR $L$point_double_shortcutx + + +ALIGN 32 +$L$add_proceedx: + mov rdx,QWORD[((0+64))+rsp] + mov r14,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r15,QWORD[((16+64))+rsp] + mov r8,QWORD[((24+64))+rsp] + lea rdi,[96+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+0))+rsp] + mov r10,QWORD[((8+0))+rsp] + lea rsi,[((-128+0))+rsp] + mov r11,QWORD[((16+0))+rsp] + mov r12,QWORD[((24+0))+rsp] + lea rdi,[352+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((-128+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[544+rsp] + lea rbx,[544+rsp] + mov r9,QWORD[((0+352))+rsp] + mov r10,QWORD[((8+352))+rsp] + lea rsi,[((-128+352))+rsp] + mov r11,QWORD[((16+352))+rsp] + mov r12,QWORD[((24+352))+rsp] + lea rdi,[352+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[rsp] + lea rbx,[rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[128+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[160+rsp] + lea rbx,[160+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_mul_montx + + + + + xor r11,r11 + add r12,r12 + lea rsi,[96+rsp] + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + adc r11,0 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + mov rax,QWORD[rsi] + cmovc r13,rbp + mov rbp,QWORD[8+rsi] + cmovc r8,rcx + mov rcx,QWORD[16+rsi] + cmovc r9,r10 + mov r10,QWORD[24+rsi] + + call __ecp_nistz256_subx + + lea rbx,[128+rsp] + lea rdi,[288+rsp] + call __ecp_nistz256_sub_fromx + + mov rax,QWORD[((192+0))+rsp] + mov rbp,QWORD[((192+8))+rsp] + mov rcx,QWORD[((192+16))+rsp] + mov r10,QWORD[((192+24))+rsp] + lea rdi,[320+rsp] + + call __ecp_nistz256_subx + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + mov rdx,QWORD[128+rsp] + lea rbx,[128+rsp] + mov r9,QWORD[((0+224))+rsp] + mov r10,QWORD[((8+224))+rsp] + lea rsi,[((-128+224))+rsp] + mov r11,QWORD[((16+224))+rsp] + mov r12,QWORD[((24+224))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[320+rsp] + lea rbx,[320+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[320+rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[256+rsp] + lea rdi,[320+rsp] + call __ecp_nistz256_sub_fromx + +DB 102,72,15,126,199 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[352+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((352+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[544+rsp] + pand xmm3,XMMWORD[((544+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand 
xmm2,XMMWORD[448+rsp] + pand xmm3,XMMWORD[((448+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[64+rdi],xmm2 + movdqu XMMWORD[80+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[288+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((288+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[480+rsp] + pand xmm3,XMMWORD[((480+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[384+rsp] + pand xmm3,XMMWORD[((384+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[rdi],xmm2 + movdqu XMMWORD[16+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[320+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((320+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[512+rsp] + pand xmm3,XMMWORD[((512+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[416+rsp] + pand xmm3,XMMWORD[((416+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm3 + +$L$add_donex: + lea rsi,[((576+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$point_addx_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ecp_nistz256_point_addx: + +ALIGN 32 +ecp_nistz256_point_add_affinex: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_point_add_affinex: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +$L$point_add_affinex: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,32*15+8 + +$L$add_affinex_body: + + movdqu xmm0,XMMWORD[rsi] + mov rbx,rdx + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm3,XMMWORD[48+rsi] + movdqu xmm4,XMMWORD[64+rsi] + movdqu xmm5,XMMWORD[80+rsi] + mov rdx,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + movdqa XMMWORD[320+rsp],xmm0 + movdqa XMMWORD[(320+16)+rsp],xmm1 + movdqa XMMWORD[352+rsp],xmm2 + movdqa XMMWORD[(352+16)+rsp],xmm3 + movdqa XMMWORD[384+rsp],xmm4 + movdqa XMMWORD[(384+16)+rsp],xmm5 + por xmm5,xmm4 + + movdqu xmm0,XMMWORD[rbx] + pshufd xmm3,xmm5,0xb1 + movdqu xmm1,XMMWORD[16+rbx] + movdqu xmm2,XMMWORD[32+rbx] + por xmm5,xmm3 + movdqu xmm3,XMMWORD[48+rbx] + movdqa XMMWORD[416+rsp],xmm0 + pshufd xmm4,xmm5,0x1e + movdqa XMMWORD[(416+16)+rsp],xmm1 + por xmm1,xmm0 +DB 102,72,15,110,199 + movdqa XMMWORD[448+rsp],xmm2 + movdqa XMMWORD[(448+16)+rsp],xmm3 + por xmm3,xmm2 + por xmm5,xmm4 + pxor xmm4,xmm4 + por xmm3,xmm1 + + lea rsi,[((64-128))+rsi] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montx + + pcmpeqd xmm5,xmm4 + pshufd xmm4,xmm3,0xb1 + mov rdx,QWORD[rbx] + + mov r9,r12 + por xmm4,xmm3 + pshufd xmm5,xmm5,0 + pshufd xmm3,xmm4,0x1e + mov r10,r13 + por xmm4,xmm3 + pxor xmm3,xmm3 + mov r11,r14 + pcmpeqd xmm4,xmm3 + pshufd xmm4,xmm4,0 + + lea rsi,[((32-128))+rsp] + mov r12,r15 + lea rdi,[rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[320+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromx + + mov rdx,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov 
r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[288+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[352+rsp] + lea rdi,[96+rsp] + call __ecp_nistz256_sub_fromx + + mov rdx,QWORD[((0+64))+rsp] + mov r14,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r15,QWORD[((16+64))+rsp] + mov r8,QWORD[((24+64))+rsp] + lea rdi,[128+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[((0+96))+rsp] + mov r14,QWORD[((8+96))+rsp] + lea rsi,[((-128+96))+rsp] + mov r15,QWORD[((16+96))+rsp] + mov r8,QWORD[((24+96))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[128+rsp] + lea rbx,[128+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[160+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[320+rsp] + lea rbx,[320+rsp] + mov r9,QWORD[((0+128))+rsp] + mov r10,QWORD[((8+128))+rsp] + lea rsi,[((-128+128))+rsp] + mov r11,QWORD[((16+128))+rsp] + mov r12,QWORD[((24+128))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_mul_montx + + + + + xor r11,r11 + add r12,r12 + lea rsi,[192+rsp] + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + adc r11,0 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + mov rax,QWORD[rsi] + cmovc r13,rbp + mov rbp,QWORD[8+rsi] + cmovc r8,rcx + mov rcx,QWORD[16+rsi] + cmovc r9,r10 + mov r10,QWORD[24+rsi] + + call __ecp_nistz256_subx + + lea rbx,[160+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_sub_fromx + + mov rax,QWORD[((0+0))+rsp] + mov rbp,QWORD[((0+8))+rsp] + mov rcx,QWORD[((0+16))+rsp] + mov r10,QWORD[((0+24))+rsp] + lea rdi,[64+rsp] + + call __ecp_nistz256_subx + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + mov rdx,QWORD[352+rsp] + lea rbx,[352+rsp] + mov r9,QWORD[((0+160))+rsp] + mov r10,QWORD[((8+160))+rsp] + lea rsi,[((-128+160))+rsp] + mov r11,QWORD[((16+160))+rsp] + mov r12,QWORD[((24+160))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[96+rsp] + lea rbx,[96+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[32+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_sub_fromx + +DB 102,72,15,126,199 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[288+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((288+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[$L$ONE_mont] + pand xmm3,XMMWORD[(($L$ONE_mont+16))] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[384+rsp] + pand xmm3,XMMWORD[((384+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[64+rdi],xmm2 + movdqu XMMWORD[80+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[224+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((224+16))+rsp] + movdqa xmm3,xmm5 + 
pand xmm2,XMMWORD[416+rsp] + pand xmm3,XMMWORD[((416+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[320+rsp] + pand xmm3,XMMWORD[((320+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[rdi],xmm2 + movdqu XMMWORD[16+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[256+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((256+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[448+rsp] + pand xmm3,XMMWORD[((448+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[352+rsp] + pand xmm3,XMMWORD[((352+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm3 + + lea rsi,[((480+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$add_affinex_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ecp_nistz256_point_add_affinex: +EXTERN __imp_RtlVirtualUnwind + + +ALIGN 16 +short_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea rax,[16+rax] + + mov r12,QWORD[((-8))+rax] + mov r13,QWORD[((-16))+rax] + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + + jmp NEAR $L$common_seh_tail + + + +ALIGN 16 +full_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + mov r10d,DWORD[8+r11] + lea rax,[r10*1+rax] + + mov rbp,QWORD[((-8))+rax] + mov rbx,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq pop r15 pop r14 pop r13 pop r12 - pop rbx pop rbp - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] + pop rbx + pop rdi + pop rsi DB 0F3h,0C3h ;repret -$L$SEH_end_ecp_nistz256_point_add_affine: + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_ecp_nistz256_neg wrt ..imagebase + 
DD $L$SEH_end_ecp_nistz256_neg wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_neg wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_ord_mul_mont wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_ord_mul_mont wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_ord_mul_mont wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_ord_sqr_mont wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_ord_sqr_mont wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_ord_sqr_mont wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_ord_mul_montx wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_ord_mul_montx wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_ord_mul_montx wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_ord_sqr_montx wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_ord_sqr_montx wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_ord_sqr_montx wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_mul_mont wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_mul_mont wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_mul_mont wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_sqr_mont wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_sqr_mont wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_sqr_mont wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_select_w5 wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_select_w5 wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_select_wX wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_select_w7 wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_select_w7 wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_select_wX wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_avx2_select_w5 wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_avx2_select_w5 wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_avx2_select_wX wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_avx2_select_w7 wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_avx2_select_w7 wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_avx2_select_wX wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_point_double wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_double wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_double wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_point_add wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_add wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_add wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_point_add_affine wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_add_affine wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_add_affine wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_point_doublex wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_doublex wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_doublex wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_point_addx wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_addx wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_addx wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_point_add_affinex wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_add_affinex wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_add_affinex wrt ..imagebase + +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_ecp_nistz256_neg: +DB 9,0,0,0 + DD short_handler wrt ..imagebase + DD $L$neg_body wrt ..imagebase,$L$neg_epilogue wrt ..imagebase +$L$SEH_info_ecp_nistz256_ord_mul_mont: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$ord_mul_body wrt ..imagebase,$L$ord_mul_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_ord_sqr_mont: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$ord_sqr_body wrt ..imagebase,$L$ord_sqr_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_ord_mul_montx: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + 
DD $L$ord_mulx_body wrt ..imagebase,$L$ord_mulx_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_ord_sqr_montx: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$ord_sqrx_body wrt ..imagebase,$L$ord_sqrx_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_mul_mont: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_sqr_mont: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$sqr_body wrt ..imagebase,$L$sqr_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_select_wX: +DB 0x01,0x33,0x16,0x00 +DB 0x33,0xf8,0x09,0x00 +DB 0x2e,0xe8,0x08,0x00 +DB 0x29,0xd8,0x07,0x00 +DB 0x24,0xc8,0x06,0x00 +DB 0x1f,0xb8,0x05,0x00 +DB 0x1a,0xa8,0x04,0x00 +DB 0x15,0x98,0x03,0x00 +DB 0x10,0x88,0x02,0x00 +DB 0x0c,0x78,0x01,0x00 +DB 0x08,0x68,0x00,0x00 +DB 0x04,0x01,0x15,0x00 +ALIGN 8 +$L$SEH_info_ecp_nistz256_avx2_select_wX: +DB 0x01,0x36,0x17,0x0b +DB 0x36,0xf8,0x09,0x00 +DB 0x31,0xe8,0x08,0x00 +DB 0x2c,0xd8,0x07,0x00 +DB 0x27,0xc8,0x06,0x00 +DB 0x22,0xb8,0x05,0x00 +DB 0x1d,0xa8,0x04,0x00 +DB 0x18,0x98,0x03,0x00 +DB 0x13,0x88,0x02,0x00 +DB 0x0e,0x78,0x01,0x00 +DB 0x09,0x68,0x00,0x00 +DB 0x04,0x01,0x15,0x00 +DB 0x00,0xb3,0x00,0x00 +ALIGN 8 +$L$SEH_info_ecp_nistz256_point_double: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$point_doubleq_body wrt ..imagebase,$L$point_doubleq_epilogue wrt ..imagebase + DD 32*5+56,0 +$L$SEH_info_ecp_nistz256_point_add: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$point_addq_body wrt ..imagebase,$L$point_addq_epilogue wrt ..imagebase + DD 32*18+56,0 +$L$SEH_info_ecp_nistz256_point_add_affine: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$add_affineq_body wrt ..imagebase,$L$add_affineq_epilogue wrt ..imagebase + DD 32*15+56,0 +ALIGN 8 +$L$SEH_info_ecp_nistz256_point_doublex: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$point_doublex_body wrt ..imagebase,$L$point_doublex_epilogue wrt ..imagebase + DD 32*5+56,0 +$L$SEH_info_ecp_nistz256_point_addx: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$point_addx_body wrt ..imagebase,$L$point_addx_epilogue wrt ..imagebase + DD 32*18+56,0 +$L$SEH_info_ecp_nistz256_point_add_affinex: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$add_affinex_body wrt ..imagebase,$L$add_affinex_epilogue wrt ..imagebase + DD 32*15+56,0 diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.asm new file mode 100644 index 0000000000..563699d59d --- /dev/null +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.asm @@ -0,0 +1,339 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif +section .text code align=64 + + + + +global beeu_mod_inverse_vartime +ALIGN 32 +beeu_mod_inverse_vartime: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_beeu_mod_inverse_vartime: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + push rbx + + push rsi + + + sub rsp,80 + + mov QWORD[rsp],rdi + + + mov r8,1 + xor r9,r9 + xor r10,r10 + xor r11,r11 + xor rdi,rdi + + xor r12,r12 + xor r13,r13 + xor r14,r14 + xor r15,r15 + xor rbp,rbp + + + vmovdqu xmm0,XMMWORD[rsi] + vmovdqu xmm1,XMMWORD[16+rsi] + vmovdqu XMMWORD[48+rsp],xmm0 + vmovdqu XMMWORD[64+rsp],xmm1 + + vmovdqu xmm0,XMMWORD[rdx] + vmovdqu xmm1,XMMWORD[16+rdx] + vmovdqu XMMWORD[16+rsp],xmm0 + vmovdqu XMMWORD[32+rsp],xmm1 + +$L$beeu_loop: + xor rbx,rbx + or rbx,QWORD[48+rsp] + or rbx,QWORD[56+rsp] + or rbx,QWORD[64+rsp] + or rbx,QWORD[72+rsp] + jz NEAR $L$beeu_loop_end + + + + + + + + + + + mov rcx,1 + + +$L$beeu_shift_loop_XB: + mov rbx,rcx + and rbx,QWORD[48+rsp] + jnz NEAR $L$beeu_shift_loop_end_XB + + + mov rbx,1 + and rbx,r8 + jz NEAR $L$shift1_0 + add r8,QWORD[rdx] + adc r9,QWORD[8+rdx] + adc r10,QWORD[16+rdx] + adc r11,QWORD[24+rdx] + adc rdi,0 + +$L$shift1_0: + shrd r8,r9,1 + shrd r9,r10,1 + shrd r10,r11,1 + shrd r11,rdi,1 + shr rdi,1 + + shl rcx,1 + + + + + + cmp rcx,0x8000000 + jne NEAR $L$beeu_shift_loop_XB + +$L$beeu_shift_loop_end_XB: + bsf rcx,rcx + test rcx,rcx + jz NEAR $L$beeu_no_shift_XB + + + + mov rax,QWORD[((8+48))+rsp] + mov rbx,QWORD[((16+48))+rsp] + mov rsi,QWORD[((24+48))+rsp] + + shrd QWORD[((0+48))+rsp],rax,cl + shrd QWORD[((8+48))+rsp],rbx,cl + shrd QWORD[((16+48))+rsp],rsi,cl + + shr rsi,cl + mov QWORD[((24+48))+rsp],rsi + + +$L$beeu_no_shift_XB: + + mov rcx,1 + + +$L$beeu_shift_loop_YA: + mov rbx,rcx + and rbx,QWORD[16+rsp] + jnz NEAR $L$beeu_shift_loop_end_YA + + + mov rbx,1 + and rbx,r12 + jz NEAR $L$shift1_1 + add r12,QWORD[rdx] + adc r13,QWORD[8+rdx] + adc r14,QWORD[16+rdx] + adc r15,QWORD[24+rdx] + adc rbp,0 + +$L$shift1_1: + shrd r12,r13,1 + shrd r13,r14,1 + shrd r14,r15,1 + shrd r15,rbp,1 + shr rbp,1 + + shl rcx,1 + + + + + + cmp rcx,0x8000000 + jne NEAR $L$beeu_shift_loop_YA + +$L$beeu_shift_loop_end_YA: + bsf rcx,rcx + test rcx,rcx + jz NEAR $L$beeu_no_shift_YA + + + + mov rax,QWORD[((8+16))+rsp] + mov rbx,QWORD[((16+16))+rsp] + mov rsi,QWORD[((24+16))+rsp] + + shrd QWORD[((0+16))+rsp],rax,cl + shrd QWORD[((8+16))+rsp],rbx,cl + shrd QWORD[((16+16))+rsp],rsi,cl + + shr rsi,cl + mov QWORD[((24+16))+rsp],rsi + + +$L$beeu_no_shift_YA: + + mov rax,QWORD[48+rsp] + mov rbx,QWORD[56+rsp] + mov rsi,QWORD[64+rsp] + mov rcx,QWORD[72+rsp] + sub rax,QWORD[16+rsp] + sbb rbx,QWORD[24+rsp] + sbb rsi,QWORD[32+rsp] + sbb rcx,QWORD[40+rsp] + jnc NEAR $L$beeu_B_bigger_than_A + + + mov rax,QWORD[16+rsp] + mov rbx,QWORD[24+rsp] + mov rsi,QWORD[32+rsp] + mov rcx,QWORD[40+rsp] + sub rax,QWORD[48+rsp] + sbb rbx,QWORD[56+rsp] + sbb rsi,QWORD[64+rsp] + sbb rcx,QWORD[72+rsp] + mov QWORD[16+rsp],rax + mov QWORD[24+rsp],rbx + mov QWORD[32+rsp],rsi + mov QWORD[40+rsp],rcx + + + add r12,r8 + adc r13,r9 + adc r14,r10 + adc r15,r11 + adc rbp,rdi + jmp NEAR $L$beeu_loop + +$L$beeu_B_bigger_than_A: + + mov QWORD[48+rsp],rax + mov QWORD[56+rsp],rbx + mov QWORD[64+rsp],rsi + mov QWORD[72+rsp],rcx + + + add r8,r12 + adc r9,r13 + 
adc r10,r14 + adc r11,r15 + adc rdi,rbp + + jmp NEAR $L$beeu_loop + +$L$beeu_loop_end: + + + + + mov rbx,QWORD[16+rsp] + sub rbx,1 + or rbx,QWORD[24+rsp] + or rbx,QWORD[32+rsp] + or rbx,QWORD[40+rsp] + + jnz NEAR $L$beeu_err + + + + + mov r8,QWORD[rdx] + mov r9,QWORD[8+rdx] + mov r10,QWORD[16+rdx] + mov r11,QWORD[24+rdx] + xor rdi,rdi + +$L$beeu_reduction_loop: + mov QWORD[16+rsp],r12 + mov QWORD[24+rsp],r13 + mov QWORD[32+rsp],r14 + mov QWORD[40+rsp],r15 + mov QWORD[48+rsp],rbp + + + sub r12,r8 + sbb r13,r9 + sbb r14,r10 + sbb r15,r11 + sbb rbp,0 + + + cmovc r12,QWORD[16+rsp] + cmovc r13,QWORD[24+rsp] + cmovc r14,QWORD[32+rsp] + cmovc r15,QWORD[40+rsp] + jnc NEAR $L$beeu_reduction_loop + + + sub r8,r12 + sbb r9,r13 + sbb r10,r14 + sbb r11,r15 + +$L$beeu_save: + + mov rdi,QWORD[rsp] + + mov QWORD[rdi],r8 + mov QWORD[8+rdi],r9 + mov QWORD[16+rdi],r10 + mov QWORD[24+rdi],r11 + + + mov rax,1 + jmp NEAR $L$beeu_finish + +$L$beeu_err: + + xor rax,rax + +$L$beeu_finish: + add rsp,80 + + pop rsi + + pop rbx + + pop r15 + + pop r14 + + pop r13 + + pop r12 + + pop rbp + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + + +$L$SEH_end_beeu_mod_inverse_vartime: diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/rdrand-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/rdrand-x86_64.asm index 4c03791b48..89b91de10d 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/rdrand-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/rdrand-x86_64.asm @@ -1,7 +1,14 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif section .text code align=64 @@ -12,59 +19,40 @@ global CRYPTO_rdrand ALIGN 16 CRYPTO_rdrand: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_CRYPTO_rdrand: - mov rdi,rcx - xor rax,rax - - -DB 0x48,0x0f,0xc7,0xf1 +DB 73,15,199,240 adc rax,rax - mov QWORD[rdi],rcx - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] + mov QWORD[rcx],r8 DB 0F3h,0C3h ;repret + + global CRYPTO_rdrand_multiple8_buf ALIGN 16 CRYPTO_rdrand_multiple8_buf: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_CRYPTO_rdrand_multiple8_buf: - mov rdi,rcx - mov rsi,rdx - - test rsi,rsi + test rdx,rdx jz NEAR $L$out - mov rdx,8 + mov r8,8 $L$loop: - - -DB 0x48,0x0f,0xc7,0xf1 +DB 73,15,199,241 jnc NEAR $L$err - mov QWORD[rdi],rcx - add rdi,rdx - sub rsi,rdx + mov QWORD[rcx],r9 + add rcx,r8 + sub rdx,r8 jnz NEAR $L$loop $L$out: mov rax,1 - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret $L$err: xor rax,rax - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + + diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/rsaz-avx2.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/rsaz-avx2.asm index a06e6f6cd6..74e2705cb9 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/rsaz-avx2.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/rsaz-avx2.asm @@ -1,7 +1,14 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif section .text code align=64 @@ -103,7 +110,7 @@ $L$sqr_1024_no_n_copy: vmovdqu ymm8,YMMWORD[((256-128))+rsi] lea rbx,[192+rsp] - vpbroadcastq ymm15,QWORD[$L$and_mask] + vmovdqu ymm15,YMMWORD[$L$and_mask] jmp NEAR $L$OOP_GRANDE_SQR_1024 ALIGN 32 @@ -891,10 +898,10 @@ $L$oop_mul_1024: vpmuludq ymm12,ymm11,YMMWORD[((192-128))+rcx] vpaddq ymm6,ymm6,ymm12 vpmuludq ymm13,ymm11,YMMWORD[((224-128))+rcx] - vpblendd ymm9,ymm9,ymm14,3 + vpblendd ymm12,ymm9,ymm14,3 vpaddq ymm7,ymm7,ymm13 vpmuludq ymm0,ymm11,YMMWORD[((256-128))+rcx] - vpaddq ymm3,ymm3,ymm9 + vpaddq ymm3,ymm3,ymm12 vpaddq ymm8,ymm8,ymm0 mov rax,rbx @@ -907,7 +914,9 @@ $L$oop_mul_1024: vmovdqu ymm13,YMMWORD[((-8+64-128))+rsi] mov rax,r10 + vpblendd ymm9,ymm9,ymm14,0xfc imul eax,r8d + vpaddq ymm4,ymm4,ymm9 and eax,0x1fffffff imul rbx,QWORD[((16-128))+rsi] @@ -1136,7 +1145,6 @@ $L$oop_mul_1024: dec r14d jnz NEAR $L$oop_mul_1024 - vpermq ymm15,ymm15,0 vpaddq ymm0,ymm12,YMMWORD[rsp] vpsrlq ymm12,ymm0,29 @@ -1289,6 +1297,7 @@ global rsaz_1024_red2norm_avx2 ALIGN 32 rsaz_1024_red2norm_avx2: + sub rdx,-128 xor rax,rax mov r8,QWORD[((-128))+rdx] @@ -1482,10 +1491,12 @@ rsaz_1024_red2norm_avx2: DB 0F3h,0C3h ;repret + global rsaz_1024_norm2red_avx2 ALIGN 32 rsaz_1024_norm2red_avx2: + sub rcx,-128 mov r8,QWORD[rdx] mov eax,0x1fffffff @@ -1639,10 +1650,12 @@ rsaz_1024_norm2red_avx2: mov QWORD[184+rcx],r8 DB 0F3h,0C3h ;repret + global rsaz_1024_scatter5_avx2 ALIGN 32 rsaz_1024_scatter5_avx2: + vzeroupper vmovdqu ymm5,YMMWORD[$L$scatter_permd] shl r8d,4 @@ -1664,6 +1677,7 @@ $L$oop_scatter_1024: DB 0F3h,0C3h ;repret + global rsaz_1024_gather5_avx2 ALIGN 32 @@ -1809,21 +1823,9 @@ $L$oop_gather_1024: $L$SEH_end_rsaz_1024_gather5: -EXTERN OPENSSL_ia32cap_P -global rsaz_avx2_eligible - -ALIGN 32 -rsaz_avx2_eligible: - lea rax,[OPENSSL_ia32cap_P] - mov eax,DWORD[8+rax] - and eax,32 - shr eax,5 - DB 0F3h,0C3h ;repret - - ALIGN 64 $L$and_mask: - DQ 0x1fffffff,0x1fffffff,0x1fffffff,-1 + DQ 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff $L$scatter_permd: DD 0,2,4,6,7,7,7,7 $L$gather_permd: diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/sha1-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/sha1-x86_64.asm index 65b040fb43..62dcc62c25 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/sha1-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/sha1-x86_64.asm @@ -1,7 +1,14 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif section .text code align=64 EXTERN OPENSSL_ia32cap_P @@ -19,6 +26,7 @@ $L$SEH_begin_sha1_block_data_order: mov rdx,r8 + lea r10,[OPENSSL_ia32cap_P] mov r9d,DWORD[r10] mov r8d,DWORD[4+r10] @@ -35,17 +43,24 @@ $L$SEH_begin_sha1_block_data_order: ALIGN 16 $L$ialu: mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + mov r8,rdi sub rsp,72 mov r9,rsi and rsp,-64 mov r10,rdx mov QWORD[64+rsp],rax + $L$prologue: mov esi,DWORD[r8] @@ -1240,16 +1255,24 @@ $L$loop: jnz NEAR $L$loop mov rsi,QWORD[64+rsp] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha1_block_data_order: ALIGN 16 @@ -1264,12 +1287,19 @@ $L$SEH_begin_sha1_block_data_order_ssse3: _ssse3_shortcut: + mov r11,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + lea rsp,[((-160))+rsp] movaps XMMWORD[(-40-96)+r11],xmm6 movaps XMMWORD[(-40-80)+r11],xmm7 @@ -2439,15 +2469,22 @@ $L$done_ssse3: movaps xmm10,XMMWORD[((-40-32))+r11] movaps xmm11,XMMWORD[((-40-16))+r11] mov r14,QWORD[((-40))+r11] + mov r13,QWORD[((-32))+r11] + mov r12,QWORD[((-24))+r11] + mov rbp,QWORD[((-16))+r11] + mov rbx,QWORD[((-8))+r11] + lea rsp,[r11] + $L$epilogue_ssse3: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha1_block_data_order_ssse3: ALIGN 16 @@ -2462,12 +2499,19 @@ $L$SEH_begin_sha1_block_data_order_avx: _avx_shortcut: + mov r11,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + lea rsp,[((-160))+rsp] vzeroupper vmovaps XMMWORD[(-40-96)+r11],xmm6 @@ -3577,15 +3621,22 @@ $L$done_avx: movaps xmm10,XMMWORD[((-40-32))+r11] movaps xmm11,XMMWORD[((-40-16))+r11] mov r14,QWORD[((-40))+r11] + mov r13,QWORD[((-32))+r11] + mov r12,QWORD[((-24))+r11] + mov rbp,QWORD[((-16))+r11] + mov rbx,QWORD[((-8))+r11] + lea rsp,[r11] + $L$epilogue_avx: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha1_block_data_order_avx: ALIGN 64 K_XX_XX: diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/sha256-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/sha256-x86_64.asm index 6e3d1541ac..68c74cc1b9 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/sha256-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/sha256-x86_64.asm @@ -1,7 +1,14 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif section .text code align=64 @@ -19,6 +26,7 @@ $L$SEH_begin_sha256_block_data_order: mov rdx,r8 + lea r11,[OPENSSL_ia32cap_P] mov r9d,DWORD[r11] mov r10d,DWORD[4+r11] @@ -31,12 +39,19 @@ $L$SEH_begin_sha256_block_data_order: test r10d,512 jnz NEAR $L$ssse3_shortcut mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + shl rdx,4 sub rsp,16*4+4*8 lea rdx,[rdx*4+rsi] @@ -44,7 +59,8 @@ $L$SEH_begin_sha256_block_data_order: mov QWORD[((64+0))+rsp],rdi mov QWORD[((64+8))+rsp],rsi mov QWORD[((64+16))+rsp],rdx - mov QWORD[((64+24))+rsp],rax + mov QWORD[88+rsp],rax + $L$prologue: mov eax,DWORD[rdi] @@ -1708,18 +1724,27 @@ $L$rounds_16_xx: mov DWORD[28+rdi],r11d jb NEAR $L$loop - mov rsi,QWORD[((64+24))+rsp] + mov rsi,QWORD[88+rsp] + mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha256_block_data_order: ALIGN 64 @@ -1780,14 +1805,22 @@ $L$SEH_begin_sha256_block_data_order_ssse3: mov rdx,r8 + $L$ssse3_shortcut: mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + shl rdx,4 sub rsp,160 lea rdx,[rdx*4+rsi] @@ -1795,7 +1828,8 @@ $L$ssse3_shortcut: mov QWORD[((64+0))+rsp],rdi mov QWORD[((64+8))+rsp],rsi mov QWORD[((64+16))+rsp],rdx - mov QWORD[((64+24))+rsp],rax + mov QWORD[88+rsp],rax + movaps XMMWORD[(64+32)+rsp],xmm6 movaps XMMWORD[(64+48)+rsp],xmm7 movaps XMMWORD[(64+64)+rsp],xmm8 @@ -2865,22 +2899,31 @@ DB 102,15,58,15,249,4 mov DWORD[28+rdi],r11d jb NEAR $L$loop_ssse3 - mov rsi,QWORD[((64+24))+rsp] + mov rsi,QWORD[88+rsp] + movaps xmm6,XMMWORD[((64+32))+rsp] movaps xmm7,XMMWORD[((64+48))+rsp] movaps xmm8,XMMWORD[((64+64))+rsp] movaps xmm9,XMMWORD[((64+80))+rsp] mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$epilogue_ssse3: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha256_block_data_order_ssse3: ALIGN 64 @@ -2894,14 +2937,22 @@ $L$SEH_begin_sha256_block_data_order_avx: mov rdx,r8 + $L$avx_shortcut: mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + shl rdx,4 sub rsp,160 lea rdx,[rdx*4+rsi] @@ -2909,7 +2960,8 @@ $L$avx_shortcut: mov QWORD[((64+0))+rsp],rdi mov QWORD[((64+8))+rsp],rsi mov QWORD[((64+16))+rsp],rdx - mov QWORD[((64+24))+rsp],rax + mov QWORD[88+rsp],rax + movaps XMMWORD[(64+32)+rsp],xmm6 movaps XMMWORD[(64+48)+rsp],xmm7 movaps XMMWORD[(64+64)+rsp],xmm8 @@ -3940,23 +3992,32 @@ $L$avx_00_47: mov DWORD[28+rdi],r11d jb NEAR $L$loop_avx - mov rsi,QWORD[((64+24))+rsp] + mov rsi,QWORD[88+rsp] + vzeroupper movaps xmm6,XMMWORD[((64+32))+rsp] movaps xmm7,XMMWORD[((64+48))+rsp] movaps xmm8,XMMWORD[((64+64))+rsp] movaps xmm9,XMMWORD[((64+80))+rsp] mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$epilogue_avx: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha256_block_data_order_avx: EXTERN __imp_RtlVirtualUnwind diff --git 
a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/sha512-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/sha512-x86_64.asm index d0d7a43fbe..33dc2c2ede 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/sha512-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/sha512-x86_64.asm @@ -1,7 +1,14 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif section .text code align=64 @@ -19,24 +26,30 @@ $L$SEH_begin_sha512_block_data_order: mov rdx,r8 + lea r11,[OPENSSL_ia32cap_P] mov r9d,DWORD[r11] mov r10d,DWORD[4+r11] mov r11d,DWORD[8+r11] - test r10d,2048 - jnz NEAR $L$xop_shortcut and r9d,1073741824 and r10d,268435968 or r10d,r9d cmp r10d,1342177792 je NEAR $L$avx_shortcut mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + shl rdx,4 sub rsp,16*8+4*8 lea rdx,[rdx*8+rsi] @@ -44,7 +57,8 @@ $L$SEH_begin_sha512_block_data_order: mov QWORD[((128+0))+rsp],rdi mov QWORD[((128+8))+rsp],rsi mov QWORD[((128+16))+rsp],rdx - mov QWORD[((128+24))+rsp],rax + mov QWORD[152+rsp],rax + $L$prologue: mov rax,QWORD[rdi] @@ -1708,18 +1722,27 @@ $L$rounds_16_xx: mov QWORD[56+rdi],r11 jb NEAR $L$loop - mov rsi,QWORD[((128+24))+rsp] + mov rsi,QWORD[152+rsp] + mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha512_block_data_order: ALIGN 64 @@ -1813,1112 +1836,6 @@ DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 DB 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 DB 111,114,103,62,0 -ALIGN 64 -sha512_block_data_order_xop: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_sha512_block_data_order_xop: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - - -$L$xop_shortcut: - mov rax,rsp - push rbx - push rbp - push r12 - push r13 - push r14 - push r15 - shl rdx,4 - sub rsp,256 - lea rdx,[rdx*8+rsi] - and rsp,-64 - mov QWORD[((128+0))+rsp],rdi - mov QWORD[((128+8))+rsp],rsi - mov QWORD[((128+16))+rsp],rdx - mov QWORD[((128+24))+rsp],rax - movaps XMMWORD[(128+32)+rsp],xmm6 - movaps XMMWORD[(128+48)+rsp],xmm7 - movaps XMMWORD[(128+64)+rsp],xmm8 - movaps XMMWORD[(128+80)+rsp],xmm9 - movaps XMMWORD[(128+96)+rsp],xmm10 - movaps XMMWORD[(128+112)+rsp],xmm11 -$L$prologue_xop: - - vzeroupper - mov rax,QWORD[rdi] - mov rbx,QWORD[8+rdi] - mov rcx,QWORD[16+rdi] - mov rdx,QWORD[24+rdi] - mov r8,QWORD[32+rdi] - mov r9,QWORD[40+rdi] - mov r10,QWORD[48+rdi] - mov r11,QWORD[56+rdi] - jmp NEAR $L$loop_xop -ALIGN 16 -$L$loop_xop: - vmovdqa xmm11,XMMWORD[((K512+1280))] - vmovdqu xmm0,XMMWORD[rsi] - lea rbp,[((K512+128))] - vmovdqu xmm1,XMMWORD[16+rsi] - vmovdqu xmm2,XMMWORD[32+rsi] - vpshufb xmm0,xmm0,xmm11 - vmovdqu xmm3,XMMWORD[48+rsi] - vpshufb xmm1,xmm1,xmm11 - vmovdqu xmm4,XMMWORD[64+rsi] - vpshufb xmm2,xmm2,xmm11 - vmovdqu xmm5,XMMWORD[80+rsi] - vpshufb xmm3,xmm3,xmm11 - vmovdqu xmm6,XMMWORD[96+rsi] - vpshufb xmm4,xmm4,xmm11 - vmovdqu xmm7,XMMWORD[112+rsi] - vpshufb xmm5,xmm5,xmm11 - vpaddq xmm8,xmm0,XMMWORD[((-128))+rbp] - vpshufb xmm6,xmm6,xmm11 - vpaddq xmm9,xmm1,XMMWORD[((-96))+rbp] - vpshufb xmm7,xmm7,xmm11 - vpaddq 
xmm10,xmm2,XMMWORD[((-64))+rbp] - vpaddq xmm11,xmm3,XMMWORD[((-32))+rbp] - vmovdqa XMMWORD[rsp],xmm8 - vpaddq xmm8,xmm4,XMMWORD[rbp] - vmovdqa XMMWORD[16+rsp],xmm9 - vpaddq xmm9,xmm5,XMMWORD[32+rbp] - vmovdqa XMMWORD[32+rsp],xmm10 - vpaddq xmm10,xmm6,XMMWORD[64+rbp] - vmovdqa XMMWORD[48+rsp],xmm11 - vpaddq xmm11,xmm7,XMMWORD[96+rbp] - vmovdqa XMMWORD[64+rsp],xmm8 - mov r14,rax - vmovdqa XMMWORD[80+rsp],xmm9 - mov rdi,rbx - vmovdqa XMMWORD[96+rsp],xmm10 - xor rdi,rcx - vmovdqa XMMWORD[112+rsp],xmm11 - mov r13,r8 - jmp NEAR $L$xop_00_47 - -ALIGN 16 -$L$xop_00_47: - add rbp,256 - vpalignr xmm8,xmm1,xmm0,8 - ror r13,23 - mov rax,r14 - vpalignr xmm11,xmm5,xmm4,8 - mov r12,r9 - ror r14,5 -DB 143,72,120,195,200,56 - xor r13,r8 - xor r12,r10 - vpsrlq xmm8,xmm8,7 - ror r13,4 - xor r14,rax - vpaddq xmm0,xmm0,xmm11 - and r12,r8 - xor r13,r8 - add r11,QWORD[rsp] - mov r15,rax -DB 143,72,120,195,209,7 - xor r12,r10 - ror r14,6 - vpxor xmm8,xmm8,xmm9 - xor r15,rbx - add r11,r12 - ror r13,14 - and rdi,r15 -DB 143,104,120,195,223,3 - xor r14,rax - add r11,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,rbx - ror r14,28 - vpsrlq xmm10,xmm7,6 - add rdx,r11 - add r11,rdi - vpaddq xmm0,xmm0,xmm8 - mov r13,rdx - add r14,r11 -DB 143,72,120,195,203,42 - ror r13,23 - mov r11,r14 - vpxor xmm11,xmm11,xmm10 - mov r12,r8 - ror r14,5 - xor r13,rdx - xor r12,r9 - vpxor xmm11,xmm11,xmm9 - ror r13,4 - xor r14,r11 - and r12,rdx - xor r13,rdx - vpaddq xmm0,xmm0,xmm11 - add r10,QWORD[8+rsp] - mov rdi,r11 - xor r12,r9 - ror r14,6 - vpaddq xmm10,xmm0,XMMWORD[((-128))+rbp] - xor rdi,rax - add r10,r12 - ror r13,14 - and r15,rdi - xor r14,r11 - add r10,r13 - xor r15,rax - ror r14,28 - add rcx,r10 - add r10,r15 - mov r13,rcx - add r14,r10 - vmovdqa XMMWORD[rsp],xmm10 - vpalignr xmm8,xmm2,xmm1,8 - ror r13,23 - mov r10,r14 - vpalignr xmm11,xmm6,xmm5,8 - mov r12,rdx - ror r14,5 -DB 143,72,120,195,200,56 - xor r13,rcx - xor r12,r8 - vpsrlq xmm8,xmm8,7 - ror r13,4 - xor r14,r10 - vpaddq xmm1,xmm1,xmm11 - and r12,rcx - xor r13,rcx - add r9,QWORD[16+rsp] - mov r15,r10 -DB 143,72,120,195,209,7 - xor r12,r8 - ror r14,6 - vpxor xmm8,xmm8,xmm9 - xor r15,r11 - add r9,r12 - ror r13,14 - and rdi,r15 -DB 143,104,120,195,216,3 - xor r14,r10 - add r9,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,r11 - ror r14,28 - vpsrlq xmm10,xmm0,6 - add rbx,r9 - add r9,rdi - vpaddq xmm1,xmm1,xmm8 - mov r13,rbx - add r14,r9 -DB 143,72,120,195,203,42 - ror r13,23 - mov r9,r14 - vpxor xmm11,xmm11,xmm10 - mov r12,rcx - ror r14,5 - xor r13,rbx - xor r12,rdx - vpxor xmm11,xmm11,xmm9 - ror r13,4 - xor r14,r9 - and r12,rbx - xor r13,rbx - vpaddq xmm1,xmm1,xmm11 - add r8,QWORD[24+rsp] - mov rdi,r9 - xor r12,rdx - ror r14,6 - vpaddq xmm10,xmm1,XMMWORD[((-96))+rbp] - xor rdi,r10 - add r8,r12 - ror r13,14 - and r15,rdi - xor r14,r9 - add r8,r13 - xor r15,r10 - ror r14,28 - add rax,r8 - add r8,r15 - mov r13,rax - add r14,r8 - vmovdqa XMMWORD[16+rsp],xmm10 - vpalignr xmm8,xmm3,xmm2,8 - ror r13,23 - mov r8,r14 - vpalignr xmm11,xmm7,xmm6,8 - mov r12,rbx - ror r14,5 -DB 143,72,120,195,200,56 - xor r13,rax - xor r12,rcx - vpsrlq xmm8,xmm8,7 - ror r13,4 - xor r14,r8 - vpaddq xmm2,xmm2,xmm11 - and r12,rax - xor r13,rax - add rdx,QWORD[32+rsp] - mov r15,r8 -DB 143,72,120,195,209,7 - xor r12,rcx - ror r14,6 - vpxor xmm8,xmm8,xmm9 - xor r15,r9 - add rdx,r12 - ror r13,14 - and rdi,r15 -DB 143,104,120,195,217,3 - xor r14,r8 - add rdx,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,r9 - ror r14,28 - vpsrlq xmm10,xmm1,6 - add r11,rdx - add rdx,rdi - vpaddq xmm2,xmm2,xmm8 - mov r13,r11 - add r14,rdx -DB 
143,72,120,195,203,42 - ror r13,23 - mov rdx,r14 - vpxor xmm11,xmm11,xmm10 - mov r12,rax - ror r14,5 - xor r13,r11 - xor r12,rbx - vpxor xmm11,xmm11,xmm9 - ror r13,4 - xor r14,rdx - and r12,r11 - xor r13,r11 - vpaddq xmm2,xmm2,xmm11 - add rcx,QWORD[40+rsp] - mov rdi,rdx - xor r12,rbx - ror r14,6 - vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp] - xor rdi,r8 - add rcx,r12 - ror r13,14 - and r15,rdi - xor r14,rdx - add rcx,r13 - xor r15,r8 - ror r14,28 - add r10,rcx - add rcx,r15 - mov r13,r10 - add r14,rcx - vmovdqa XMMWORD[32+rsp],xmm10 - vpalignr xmm8,xmm4,xmm3,8 - ror r13,23 - mov rcx,r14 - vpalignr xmm11,xmm0,xmm7,8 - mov r12,r11 - ror r14,5 -DB 143,72,120,195,200,56 - xor r13,r10 - xor r12,rax - vpsrlq xmm8,xmm8,7 - ror r13,4 - xor r14,rcx - vpaddq xmm3,xmm3,xmm11 - and r12,r10 - xor r13,r10 - add rbx,QWORD[48+rsp] - mov r15,rcx -DB 143,72,120,195,209,7 - xor r12,rax - ror r14,6 - vpxor xmm8,xmm8,xmm9 - xor r15,rdx - add rbx,r12 - ror r13,14 - and rdi,r15 -DB 143,104,120,195,218,3 - xor r14,rcx - add rbx,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,rdx - ror r14,28 - vpsrlq xmm10,xmm2,6 - add r9,rbx - add rbx,rdi - vpaddq xmm3,xmm3,xmm8 - mov r13,r9 - add r14,rbx -DB 143,72,120,195,203,42 - ror r13,23 - mov rbx,r14 - vpxor xmm11,xmm11,xmm10 - mov r12,r10 - ror r14,5 - xor r13,r9 - xor r12,r11 - vpxor xmm11,xmm11,xmm9 - ror r13,4 - xor r14,rbx - and r12,r9 - xor r13,r9 - vpaddq xmm3,xmm3,xmm11 - add rax,QWORD[56+rsp] - mov rdi,rbx - xor r12,r11 - ror r14,6 - vpaddq xmm10,xmm3,XMMWORD[((-32))+rbp] - xor rdi,rcx - add rax,r12 - ror r13,14 - and r15,rdi - xor r14,rbx - add rax,r13 - xor r15,rcx - ror r14,28 - add r8,rax - add rax,r15 - mov r13,r8 - add r14,rax - vmovdqa XMMWORD[48+rsp],xmm10 - vpalignr xmm8,xmm5,xmm4,8 - ror r13,23 - mov rax,r14 - vpalignr xmm11,xmm1,xmm0,8 - mov r12,r9 - ror r14,5 -DB 143,72,120,195,200,56 - xor r13,r8 - xor r12,r10 - vpsrlq xmm8,xmm8,7 - ror r13,4 - xor r14,rax - vpaddq xmm4,xmm4,xmm11 - and r12,r8 - xor r13,r8 - add r11,QWORD[64+rsp] - mov r15,rax -DB 143,72,120,195,209,7 - xor r12,r10 - ror r14,6 - vpxor xmm8,xmm8,xmm9 - xor r15,rbx - add r11,r12 - ror r13,14 - and rdi,r15 -DB 143,104,120,195,219,3 - xor r14,rax - add r11,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,rbx - ror r14,28 - vpsrlq xmm10,xmm3,6 - add rdx,r11 - add r11,rdi - vpaddq xmm4,xmm4,xmm8 - mov r13,rdx - add r14,r11 -DB 143,72,120,195,203,42 - ror r13,23 - mov r11,r14 - vpxor xmm11,xmm11,xmm10 - mov r12,r8 - ror r14,5 - xor r13,rdx - xor r12,r9 - vpxor xmm11,xmm11,xmm9 - ror r13,4 - xor r14,r11 - and r12,rdx - xor r13,rdx - vpaddq xmm4,xmm4,xmm11 - add r10,QWORD[72+rsp] - mov rdi,r11 - xor r12,r9 - ror r14,6 - vpaddq xmm10,xmm4,XMMWORD[rbp] - xor rdi,rax - add r10,r12 - ror r13,14 - and r15,rdi - xor r14,r11 - add r10,r13 - xor r15,rax - ror r14,28 - add rcx,r10 - add r10,r15 - mov r13,rcx - add r14,r10 - vmovdqa XMMWORD[64+rsp],xmm10 - vpalignr xmm8,xmm6,xmm5,8 - ror r13,23 - mov r10,r14 - vpalignr xmm11,xmm2,xmm1,8 - mov r12,rdx - ror r14,5 -DB 143,72,120,195,200,56 - xor r13,rcx - xor r12,r8 - vpsrlq xmm8,xmm8,7 - ror r13,4 - xor r14,r10 - vpaddq xmm5,xmm5,xmm11 - and r12,rcx - xor r13,rcx - add r9,QWORD[80+rsp] - mov r15,r10 -DB 143,72,120,195,209,7 - xor r12,r8 - ror r14,6 - vpxor xmm8,xmm8,xmm9 - xor r15,r11 - add r9,r12 - ror r13,14 - and rdi,r15 -DB 143,104,120,195,220,3 - xor r14,r10 - add r9,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,r11 - ror r14,28 - vpsrlq xmm10,xmm4,6 - add rbx,r9 - add r9,rdi - vpaddq xmm5,xmm5,xmm8 - mov r13,rbx - add r14,r9 -DB 143,72,120,195,203,42 - ror r13,23 - mov r9,r14 - 
vpxor xmm11,xmm11,xmm10 - mov r12,rcx - ror r14,5 - xor r13,rbx - xor r12,rdx - vpxor xmm11,xmm11,xmm9 - ror r13,4 - xor r14,r9 - and r12,rbx - xor r13,rbx - vpaddq xmm5,xmm5,xmm11 - add r8,QWORD[88+rsp] - mov rdi,r9 - xor r12,rdx - ror r14,6 - vpaddq xmm10,xmm5,XMMWORD[32+rbp] - xor rdi,r10 - add r8,r12 - ror r13,14 - and r15,rdi - xor r14,r9 - add r8,r13 - xor r15,r10 - ror r14,28 - add rax,r8 - add r8,r15 - mov r13,rax - add r14,r8 - vmovdqa XMMWORD[80+rsp],xmm10 - vpalignr xmm8,xmm7,xmm6,8 - ror r13,23 - mov r8,r14 - vpalignr xmm11,xmm3,xmm2,8 - mov r12,rbx - ror r14,5 -DB 143,72,120,195,200,56 - xor r13,rax - xor r12,rcx - vpsrlq xmm8,xmm8,7 - ror r13,4 - xor r14,r8 - vpaddq xmm6,xmm6,xmm11 - and r12,rax - xor r13,rax - add rdx,QWORD[96+rsp] - mov r15,r8 -DB 143,72,120,195,209,7 - xor r12,rcx - ror r14,6 - vpxor xmm8,xmm8,xmm9 - xor r15,r9 - add rdx,r12 - ror r13,14 - and rdi,r15 -DB 143,104,120,195,221,3 - xor r14,r8 - add rdx,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,r9 - ror r14,28 - vpsrlq xmm10,xmm5,6 - add r11,rdx - add rdx,rdi - vpaddq xmm6,xmm6,xmm8 - mov r13,r11 - add r14,rdx -DB 143,72,120,195,203,42 - ror r13,23 - mov rdx,r14 - vpxor xmm11,xmm11,xmm10 - mov r12,rax - ror r14,5 - xor r13,r11 - xor r12,rbx - vpxor xmm11,xmm11,xmm9 - ror r13,4 - xor r14,rdx - and r12,r11 - xor r13,r11 - vpaddq xmm6,xmm6,xmm11 - add rcx,QWORD[104+rsp] - mov rdi,rdx - xor r12,rbx - ror r14,6 - vpaddq xmm10,xmm6,XMMWORD[64+rbp] - xor rdi,r8 - add rcx,r12 - ror r13,14 - and r15,rdi - xor r14,rdx - add rcx,r13 - xor r15,r8 - ror r14,28 - add r10,rcx - add rcx,r15 - mov r13,r10 - add r14,rcx - vmovdqa XMMWORD[96+rsp],xmm10 - vpalignr xmm8,xmm0,xmm7,8 - ror r13,23 - mov rcx,r14 - vpalignr xmm11,xmm4,xmm3,8 - mov r12,r11 - ror r14,5 -DB 143,72,120,195,200,56 - xor r13,r10 - xor r12,rax - vpsrlq xmm8,xmm8,7 - ror r13,4 - xor r14,rcx - vpaddq xmm7,xmm7,xmm11 - and r12,r10 - xor r13,r10 - add rbx,QWORD[112+rsp] - mov r15,rcx -DB 143,72,120,195,209,7 - xor r12,rax - ror r14,6 - vpxor xmm8,xmm8,xmm9 - xor r15,rdx - add rbx,r12 - ror r13,14 - and rdi,r15 -DB 143,104,120,195,222,3 - xor r14,rcx - add rbx,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,rdx - ror r14,28 - vpsrlq xmm10,xmm6,6 - add r9,rbx - add rbx,rdi - vpaddq xmm7,xmm7,xmm8 - mov r13,r9 - add r14,rbx -DB 143,72,120,195,203,42 - ror r13,23 - mov rbx,r14 - vpxor xmm11,xmm11,xmm10 - mov r12,r10 - ror r14,5 - xor r13,r9 - xor r12,r11 - vpxor xmm11,xmm11,xmm9 - ror r13,4 - xor r14,rbx - and r12,r9 - xor r13,r9 - vpaddq xmm7,xmm7,xmm11 - add rax,QWORD[120+rsp] - mov rdi,rbx - xor r12,r11 - ror r14,6 - vpaddq xmm10,xmm7,XMMWORD[96+rbp] - xor rdi,rcx - add rax,r12 - ror r13,14 - and r15,rdi - xor r14,rbx - add rax,r13 - xor r15,rcx - ror r14,28 - add r8,rax - add rax,r15 - mov r13,r8 - add r14,rax - vmovdqa XMMWORD[112+rsp],xmm10 - cmp BYTE[135+rbp],0 - jne NEAR $L$xop_00_47 - ror r13,23 - mov rax,r14 - mov r12,r9 - ror r14,5 - xor r13,r8 - xor r12,r10 - ror r13,4 - xor r14,rax - and r12,r8 - xor r13,r8 - add r11,QWORD[rsp] - mov r15,rax - xor r12,r10 - ror r14,6 - xor r15,rbx - add r11,r12 - ror r13,14 - and rdi,r15 - xor r14,rax - add r11,r13 - xor rdi,rbx - ror r14,28 - add rdx,r11 - add r11,rdi - mov r13,rdx - add r14,r11 - ror r13,23 - mov r11,r14 - mov r12,r8 - ror r14,5 - xor r13,rdx - xor r12,r9 - ror r13,4 - xor r14,r11 - and r12,rdx - xor r13,rdx - add r10,QWORD[8+rsp] - mov rdi,r11 - xor r12,r9 - ror r14,6 - xor rdi,rax - add r10,r12 - ror r13,14 - and r15,rdi - xor r14,r11 - add r10,r13 - xor r15,rax - ror r14,28 - add rcx,r10 - add r10,r15 - mov 
r13,rcx - add r14,r10 - ror r13,23 - mov r10,r14 - mov r12,rdx - ror r14,5 - xor r13,rcx - xor r12,r8 - ror r13,4 - xor r14,r10 - and r12,rcx - xor r13,rcx - add r9,QWORD[16+rsp] - mov r15,r10 - xor r12,r8 - ror r14,6 - xor r15,r11 - add r9,r12 - ror r13,14 - and rdi,r15 - xor r14,r10 - add r9,r13 - xor rdi,r11 - ror r14,28 - add rbx,r9 - add r9,rdi - mov r13,rbx - add r14,r9 - ror r13,23 - mov r9,r14 - mov r12,rcx - ror r14,5 - xor r13,rbx - xor r12,rdx - ror r13,4 - xor r14,r9 - and r12,rbx - xor r13,rbx - add r8,QWORD[24+rsp] - mov rdi,r9 - xor r12,rdx - ror r14,6 - xor rdi,r10 - add r8,r12 - ror r13,14 - and r15,rdi - xor r14,r9 - add r8,r13 - xor r15,r10 - ror r14,28 - add rax,r8 - add r8,r15 - mov r13,rax - add r14,r8 - ror r13,23 - mov r8,r14 - mov r12,rbx - ror r14,5 - xor r13,rax - xor r12,rcx - ror r13,4 - xor r14,r8 - and r12,rax - xor r13,rax - add rdx,QWORD[32+rsp] - mov r15,r8 - xor r12,rcx - ror r14,6 - xor r15,r9 - add rdx,r12 - ror r13,14 - and rdi,r15 - xor r14,r8 - add rdx,r13 - xor rdi,r9 - ror r14,28 - add r11,rdx - add rdx,rdi - mov r13,r11 - add r14,rdx - ror r13,23 - mov rdx,r14 - mov r12,rax - ror r14,5 - xor r13,r11 - xor r12,rbx - ror r13,4 - xor r14,rdx - and r12,r11 - xor r13,r11 - add rcx,QWORD[40+rsp] - mov rdi,rdx - xor r12,rbx - ror r14,6 - xor rdi,r8 - add rcx,r12 - ror r13,14 - and r15,rdi - xor r14,rdx - add rcx,r13 - xor r15,r8 - ror r14,28 - add r10,rcx - add rcx,r15 - mov r13,r10 - add r14,rcx - ror r13,23 - mov rcx,r14 - mov r12,r11 - ror r14,5 - xor r13,r10 - xor r12,rax - ror r13,4 - xor r14,rcx - and r12,r10 - xor r13,r10 - add rbx,QWORD[48+rsp] - mov r15,rcx - xor r12,rax - ror r14,6 - xor r15,rdx - add rbx,r12 - ror r13,14 - and rdi,r15 - xor r14,rcx - add rbx,r13 - xor rdi,rdx - ror r14,28 - add r9,rbx - add rbx,rdi - mov r13,r9 - add r14,rbx - ror r13,23 - mov rbx,r14 - mov r12,r10 - ror r14,5 - xor r13,r9 - xor r12,r11 - ror r13,4 - xor r14,rbx - and r12,r9 - xor r13,r9 - add rax,QWORD[56+rsp] - mov rdi,rbx - xor r12,r11 - ror r14,6 - xor rdi,rcx - add rax,r12 - ror r13,14 - and r15,rdi - xor r14,rbx - add rax,r13 - xor r15,rcx - ror r14,28 - add r8,rax - add rax,r15 - mov r13,r8 - add r14,rax - ror r13,23 - mov rax,r14 - mov r12,r9 - ror r14,5 - xor r13,r8 - xor r12,r10 - ror r13,4 - xor r14,rax - and r12,r8 - xor r13,r8 - add r11,QWORD[64+rsp] - mov r15,rax - xor r12,r10 - ror r14,6 - xor r15,rbx - add r11,r12 - ror r13,14 - and rdi,r15 - xor r14,rax - add r11,r13 - xor rdi,rbx - ror r14,28 - add rdx,r11 - add r11,rdi - mov r13,rdx - add r14,r11 - ror r13,23 - mov r11,r14 - mov r12,r8 - ror r14,5 - xor r13,rdx - xor r12,r9 - ror r13,4 - xor r14,r11 - and r12,rdx - xor r13,rdx - add r10,QWORD[72+rsp] - mov rdi,r11 - xor r12,r9 - ror r14,6 - xor rdi,rax - add r10,r12 - ror r13,14 - and r15,rdi - xor r14,r11 - add r10,r13 - xor r15,rax - ror r14,28 - add rcx,r10 - add r10,r15 - mov r13,rcx - add r14,r10 - ror r13,23 - mov r10,r14 - mov r12,rdx - ror r14,5 - xor r13,rcx - xor r12,r8 - ror r13,4 - xor r14,r10 - and r12,rcx - xor r13,rcx - add r9,QWORD[80+rsp] - mov r15,r10 - xor r12,r8 - ror r14,6 - xor r15,r11 - add r9,r12 - ror r13,14 - and rdi,r15 - xor r14,r10 - add r9,r13 - xor rdi,r11 - ror r14,28 - add rbx,r9 - add r9,rdi - mov r13,rbx - add r14,r9 - ror r13,23 - mov r9,r14 - mov r12,rcx - ror r14,5 - xor r13,rbx - xor r12,rdx - ror r13,4 - xor r14,r9 - and r12,rbx - xor r13,rbx - add r8,QWORD[88+rsp] - mov rdi,r9 - xor r12,rdx - ror r14,6 - xor rdi,r10 - add r8,r12 - ror r13,14 - and r15,rdi - xor r14,r9 - add r8,r13 - xor r15,r10 - ror 
r14,28 - add rax,r8 - add r8,r15 - mov r13,rax - add r14,r8 - ror r13,23 - mov r8,r14 - mov r12,rbx - ror r14,5 - xor r13,rax - xor r12,rcx - ror r13,4 - xor r14,r8 - and r12,rax - xor r13,rax - add rdx,QWORD[96+rsp] - mov r15,r8 - xor r12,rcx - ror r14,6 - xor r15,r9 - add rdx,r12 - ror r13,14 - and rdi,r15 - xor r14,r8 - add rdx,r13 - xor rdi,r9 - ror r14,28 - add r11,rdx - add rdx,rdi - mov r13,r11 - add r14,rdx - ror r13,23 - mov rdx,r14 - mov r12,rax - ror r14,5 - xor r13,r11 - xor r12,rbx - ror r13,4 - xor r14,rdx - and r12,r11 - xor r13,r11 - add rcx,QWORD[104+rsp] - mov rdi,rdx - xor r12,rbx - ror r14,6 - xor rdi,r8 - add rcx,r12 - ror r13,14 - and r15,rdi - xor r14,rdx - add rcx,r13 - xor r15,r8 - ror r14,28 - add r10,rcx - add rcx,r15 - mov r13,r10 - add r14,rcx - ror r13,23 - mov rcx,r14 - mov r12,r11 - ror r14,5 - xor r13,r10 - xor r12,rax - ror r13,4 - xor r14,rcx - and r12,r10 - xor r13,r10 - add rbx,QWORD[112+rsp] - mov r15,rcx - xor r12,rax - ror r14,6 - xor r15,rdx - add rbx,r12 - ror r13,14 - and rdi,r15 - xor r14,rcx - add rbx,r13 - xor rdi,rdx - ror r14,28 - add r9,rbx - add rbx,rdi - mov r13,r9 - add r14,rbx - ror r13,23 - mov rbx,r14 - mov r12,r10 - ror r14,5 - xor r13,r9 - xor r12,r11 - ror r13,4 - xor r14,rbx - and r12,r9 - xor r13,r9 - add rax,QWORD[120+rsp] - mov rdi,rbx - xor r12,r11 - ror r14,6 - xor rdi,rcx - add rax,r12 - ror r13,14 - and r15,rdi - xor r14,rbx - add rax,r13 - xor r15,rcx - ror r14,28 - add r8,rax - add rax,r15 - mov r13,r8 - add r14,rax - mov rdi,QWORD[((128+0))+rsp] - mov rax,r14 - - add rax,QWORD[rdi] - lea rsi,[128+rsi] - add rbx,QWORD[8+rdi] - add rcx,QWORD[16+rdi] - add rdx,QWORD[24+rdi] - add r8,QWORD[32+rdi] - add r9,QWORD[40+rdi] - add r10,QWORD[48+rdi] - add r11,QWORD[56+rdi] - - cmp rsi,QWORD[((128+16))+rsp] - - mov QWORD[rdi],rax - mov QWORD[8+rdi],rbx - mov QWORD[16+rdi],rcx - mov QWORD[24+rdi],rdx - mov QWORD[32+rdi],r8 - mov QWORD[40+rdi],r9 - mov QWORD[48+rdi],r10 - mov QWORD[56+rdi],r11 - jb NEAR $L$loop_xop - - mov rsi,QWORD[((128+24))+rsp] - vzeroupper - movaps xmm6,XMMWORD[((128+32))+rsp] - movaps xmm7,XMMWORD[((128+48))+rsp] - movaps xmm8,XMMWORD[((128+64))+rsp] - movaps xmm9,XMMWORD[((128+80))+rsp] - movaps xmm10,XMMWORD[((128+96))+rsp] - movaps xmm11,XMMWORD[((128+112))+rsp] - mov r15,QWORD[((-48))+rsi] - mov r14,QWORD[((-40))+rsi] - mov r13,QWORD[((-32))+rsi] - mov r12,QWORD[((-24))+rsi] - mov rbp,QWORD[((-16))+rsi] - mov rbx,QWORD[((-8))+rsi] - lea rsp,[rsi] -$L$epilogue_xop: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret -$L$SEH_end_sha512_block_data_order_xop: - ALIGN 64 sha512_block_data_order_avx: mov QWORD[8+rsp],rdi ;WIN64 prologue @@ -2930,14 +1847,22 @@ $L$SEH_begin_sha512_block_data_order_avx: mov rdx,r8 + $L$avx_shortcut: mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + shl rdx,4 sub rsp,256 lea rdx,[rdx*8+rsi] @@ -2945,7 +1870,8 @@ $L$avx_shortcut: mov QWORD[((128+0))+rsp],rdi mov QWORD[((128+8))+rsp],rsi mov QWORD[((128+16))+rsp],rdx - mov QWORD[((128+24))+rsp],rax + mov QWORD[152+rsp],rax + movaps XMMWORD[(128+32)+rsp],xmm6 movaps XMMWORD[(128+48)+rsp],xmm7 movaps XMMWORD[(128+64)+rsp],xmm8 @@ -4068,7 +2994,8 @@ $L$avx_00_47: mov QWORD[56+rdi],r11 jb NEAR $L$loop_avx - mov rsi,QWORD[((128+24))+rsp] + mov rsi,QWORD[152+rsp] + vzeroupper movaps xmm6,XMMWORD[((128+32))+rsp] movaps xmm7,XMMWORD[((128+48))+rsp] @@ -4077,16 +3004,24 @@ $L$avx_00_47: movaps xmm10,XMMWORD[((128+96))+rsp] movaps xmm11,XMMWORD[((128+112))+rsp] mov 
r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$epilogue_avx: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_sha512_block_data_order_avx: EXTERN __imp_RtlVirtualUnwind @@ -4189,9 +3124,6 @@ ALIGN 4 DD $L$SEH_begin_sha512_block_data_order wrt ..imagebase DD $L$SEH_end_sha512_block_data_order wrt ..imagebase DD $L$SEH_info_sha512_block_data_order wrt ..imagebase - DD $L$SEH_begin_sha512_block_data_order_xop wrt ..imagebase - DD $L$SEH_end_sha512_block_data_order_xop wrt ..imagebase - DD $L$SEH_info_sha512_block_data_order_xop wrt ..imagebase DD $L$SEH_begin_sha512_block_data_order_avx wrt ..imagebase DD $L$SEH_end_sha512_block_data_order_avx wrt ..imagebase DD $L$SEH_info_sha512_block_data_order_avx wrt ..imagebase @@ -4201,10 +3133,6 @@ $L$SEH_info_sha512_block_data_order: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase -$L$SEH_info_sha512_block_data_order_xop: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$prologue_xop wrt ..imagebase,$L$epilogue_xop wrt ..imagebase $L$SEH_info_sha512_block_data_order_avx: DB 9,0,0,0 DD se_handler wrt ..imagebase diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/vpaes-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/vpaes-x86_64.asm index 3edde9fdbc..ccfc870a66 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/vpaes-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/vpaes-x86_64.asm @@ -1,7 +1,14 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif section .text code align=64 @@ -23,6 +30,7 @@ section .text code align=64 ALIGN 16 _vpaes_encrypt_core: + mov r9,rdx mov r11,16 mov eax,DWORD[240+rdx] @@ -111,8 +119,185 @@ DB 102,15,56,0,193 + + + + + + + + + + + + + + + + + + + + + + + + + +ALIGN 16 +_vpaes_encrypt_core_2x: + + mov r9,rdx + mov r11,16 + mov eax,DWORD[240+rdx] + movdqa xmm1,xmm9 + movdqa xmm7,xmm9 + movdqa xmm2,XMMWORD[$L$k_ipt] + movdqa xmm8,xmm2 + pandn xmm1,xmm0 + pandn xmm7,xmm6 + movdqu xmm5,XMMWORD[r9] + + psrld xmm1,4 + psrld xmm7,4 + pand xmm0,xmm9 + pand xmm6,xmm9 +DB 102,15,56,0,208 +DB 102,68,15,56,0,198 + movdqa xmm0,XMMWORD[(($L$k_ipt+16))] + movdqa xmm6,xmm0 +DB 102,15,56,0,193 +DB 102,15,56,0,247 + pxor xmm2,xmm5 + pxor xmm8,xmm5 + add r9,16 + pxor xmm0,xmm2 + pxor xmm6,xmm8 + lea r10,[$L$k_mc_backward] + jmp NEAR $L$enc2x_entry + +ALIGN 16 +$L$enc2x_loop: + + movdqa xmm4,XMMWORD[$L$k_sb1] + movdqa xmm0,XMMWORD[(($L$k_sb1+16))] + movdqa xmm12,xmm4 + movdqa xmm6,xmm0 +DB 102,15,56,0,226 +DB 102,69,15,56,0,224 +DB 102,15,56,0,195 +DB 102,65,15,56,0,243 + pxor xmm4,xmm5 + pxor xmm12,xmm5 + movdqa xmm5,XMMWORD[$L$k_sb2] + movdqa xmm13,xmm5 + pxor xmm0,xmm4 + pxor xmm6,xmm12 + movdqa xmm1,XMMWORD[((-64))+r10*1+r11] + +DB 102,15,56,0,234 +DB 102,69,15,56,0,232 + movdqa xmm4,XMMWORD[r10*1+r11] + + movdqa xmm2,XMMWORD[(($L$k_sb2+16))] + movdqa xmm8,xmm2 +DB 102,15,56,0,211 +DB 102,69,15,56,0,195 + movdqa xmm3,xmm0 + movdqa xmm11,xmm6 + pxor xmm2,xmm5 + pxor xmm8,xmm13 +DB 102,15,56,0,193 +DB 102,15,56,0,241 + add r9,16 + pxor xmm0,xmm2 + pxor xmm6,xmm8 +DB 102,15,56,0,220 +DB 102,68,15,56,0,220 + add r11,16 + pxor xmm3,xmm0 + pxor xmm11,xmm6 +DB 102,15,56,0,193 +DB 102,15,56,0,241 + and r11,0x30 + sub rax,1 + pxor xmm0,xmm3 + pxor xmm6,xmm11 + +$L$enc2x_entry: + + movdqa xmm1,xmm9 + movdqa xmm7,xmm9 + movdqa xmm5,XMMWORD[(($L$k_inv+16))] + movdqa xmm13,xmm5 + pandn xmm1,xmm0 + pandn xmm7,xmm6 + psrld xmm1,4 + psrld xmm7,4 + pand xmm0,xmm9 + pand xmm6,xmm9 +DB 102,15,56,0,232 +DB 102,68,15,56,0,238 + movdqa xmm3,xmm10 + movdqa xmm11,xmm10 + pxor xmm0,xmm1 + pxor xmm6,xmm7 +DB 102,15,56,0,217 +DB 102,68,15,56,0,223 + movdqa xmm4,xmm10 + movdqa xmm12,xmm10 + pxor xmm3,xmm5 + pxor xmm11,xmm13 +DB 102,15,56,0,224 +DB 102,68,15,56,0,230 + movdqa xmm2,xmm10 + movdqa xmm8,xmm10 + pxor xmm4,xmm5 + pxor xmm12,xmm13 +DB 102,15,56,0,211 +DB 102,69,15,56,0,195 + movdqa xmm3,xmm10 + movdqa xmm11,xmm10 + pxor xmm2,xmm0 + pxor xmm8,xmm6 +DB 102,15,56,0,220 +DB 102,69,15,56,0,220 + movdqu xmm5,XMMWORD[r9] + + pxor xmm3,xmm1 + pxor xmm11,xmm7 + jnz NEAR $L$enc2x_loop + + + movdqa xmm4,XMMWORD[((-96))+r10] + movdqa xmm0,XMMWORD[((-80))+r10] + movdqa xmm12,xmm4 + movdqa xmm6,xmm0 +DB 102,15,56,0,226 +DB 102,69,15,56,0,224 + pxor xmm4,xmm5 + pxor xmm12,xmm5 +DB 102,15,56,0,195 +DB 102,65,15,56,0,243 + movdqa xmm1,XMMWORD[64+r10*1+r11] + + pxor xmm0,xmm4 + pxor xmm6,xmm12 +DB 102,15,56,0,193 +DB 102,15,56,0,241 + DB 0F3h,0C3h ;repret + + + + + + + + + ALIGN 16 _vpaes_decrypt_core: + mov r9,rdx mov eax,DWORD[240+rdx] movdqa xmm1,xmm9 @@ -217,6 +402,7 @@ DB 102,15,56,0,194 + ALIGN 16 _vpaes_schedule_core: @@ -224,6 +410,7 @@ _vpaes_schedule_core: + call _vpaes_preheat movdqa xmm8,XMMWORD[$L$k_rcon] movdqu xmm0,XMMWORD[rdi] @@ -402,8 +589,10 @@ $L$schedule_mangle_last_dec: + ALIGN 16 _vpaes_schedule_192_smear: + pshufd xmm1,xmm6,0x80 pshufd xmm0,xmm7,0xFE pxor xmm6,xmm1 @@ -431,11 +620,13 @@ 
_vpaes_schedule_192_smear: + ALIGN 16 _vpaes_schedule_round: + pxor xmm1,xmm1 DB 102,65,15,58,15,200,15 DB 102,69,15,58,15,192,15 @@ -500,8 +691,10 @@ DB 102,15,56,0,195 + ALIGN 16 _vpaes_schedule_transform: + movdqa xmm1,xmm9 pandn xmm1,xmm0 psrld xmm1,4 @@ -536,10 +729,12 @@ DB 102,15,56,0,193 + ALIGN 16 _vpaes_schedule_mangle: + movdqa xmm4,xmm0 movdqa xmm5,XMMWORD[$L$k_mc_forward] test rcx,rcx @@ -609,6 +804,7 @@ DB 102,15,56,0,217 + global vpaes_set_encrypt_key ALIGN 16 @@ -622,6 +818,12 @@ $L$SEH_begin_vpaes_set_encrypt_key: mov rdx,r8 + +%ifdef BORINGSSL_DISPATCH_TEST +EXTERN BORINGSSL_function_hit + mov BYTE[((BORINGSSL_function_hit+5))],1 +%endif + lea rsp,[((-184))+rsp] movaps XMMWORD[16+rsp],xmm6 movaps XMMWORD[32+rsp],xmm7 @@ -658,6 +860,7 @@ $L$enc_key_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_vpaes_set_encrypt_key: global vpaes_set_decrypt_key @@ -673,6 +876,7 @@ $L$SEH_begin_vpaes_set_decrypt_key: mov rdx,r8 + lea rsp,[((-184))+rsp] movaps XMMWORD[16+rsp],xmm6 movaps XMMWORD[32+rsp],xmm7 @@ -714,6 +918,7 @@ $L$dec_key_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_vpaes_set_decrypt_key: global vpaes_encrypt @@ -729,6 +934,11 @@ $L$SEH_begin_vpaes_encrypt: mov rdx,r8 + +%ifdef BORINGSSL_DISPATCH_TEST +EXTERN BORINGSSL_function_hit + mov BYTE[((BORINGSSL_function_hit+4))],1 +%endif lea rsp,[((-184))+rsp] movaps XMMWORD[16+rsp],xmm6 movaps XMMWORD[32+rsp],xmm7 @@ -760,6 +970,7 @@ $L$enc_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_vpaes_encrypt: global vpaes_decrypt @@ -775,6 +986,7 @@ $L$SEH_begin_vpaes_decrypt: mov rdx,r8 + lea rsp,[((-184))+rsp] movaps XMMWORD[16+rsp],xmm6 movaps XMMWORD[32+rsp],xmm7 @@ -806,6 +1018,7 @@ $L$dec_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_vpaes_decrypt: global vpaes_cbc_encrypt @@ -823,6 +1036,7 @@ $L$SEH_begin_vpaes_cbc_encrypt: mov r9,QWORD[48+rsp] + xchg rdx,rcx sub rcx,16 jc NEAR $L$cbc_abort @@ -884,7 +1098,107 @@ $L$cbc_abort: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_vpaes_cbc_encrypt: +global vpaes_ctr32_encrypt_blocks + +ALIGN 16 +vpaes_ctr32_encrypt_blocks: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_vpaes_ctr32_encrypt_blocks: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + + + xchg rdx,rcx + test rcx,rcx + jz NEAR $L$ctr32_abort + lea rsp,[((-184))+rsp] + movaps XMMWORD[16+rsp],xmm6 + movaps XMMWORD[32+rsp],xmm7 + movaps XMMWORD[48+rsp],xmm8 + movaps XMMWORD[64+rsp],xmm9 + movaps XMMWORD[80+rsp],xmm10 + movaps XMMWORD[96+rsp],xmm11 + movaps XMMWORD[112+rsp],xmm12 + movaps XMMWORD[128+rsp],xmm13 + movaps XMMWORD[144+rsp],xmm14 + movaps XMMWORD[160+rsp],xmm15 +$L$ctr32_body: + movdqu xmm0,XMMWORD[r8] + movdqa xmm8,XMMWORD[$L$ctr_add_one] + sub rsi,rdi + call _vpaes_preheat + movdqa xmm6,xmm0 + pshufb xmm6,XMMWORD[$L$rev_ctr] + + test rcx,1 + jz NEAR $L$ctr32_prep_loop + + + + movdqu xmm7,XMMWORD[rdi] + call _vpaes_encrypt_core + pxor xmm0,xmm7 + paddd xmm6,xmm8 + movdqu XMMWORD[rdi*1+rsi],xmm0 + sub rcx,1 + lea rdi,[16+rdi] + jz NEAR $L$ctr32_done + +$L$ctr32_prep_loop: + + + movdqa xmm14,xmm6 + movdqa xmm15,xmm6 + paddd xmm15,xmm8 + +$L$ctr32_loop: + movdqa xmm1,XMMWORD[$L$rev_ctr] + movdqa xmm0,xmm14 + movdqa xmm6,xmm15 +DB 102,15,56,0,193 +DB 102,15,56,0,241 + call 
_vpaes_encrypt_core_2x + movdqu xmm1,XMMWORD[rdi] + movdqu xmm2,XMMWORD[16+rdi] + movdqa xmm3,XMMWORD[$L$ctr_add_two] + pxor xmm0,xmm1 + pxor xmm6,xmm2 + paddd xmm14,xmm3 + paddd xmm15,xmm3 + movdqu XMMWORD[rdi*1+rsi],xmm0 + movdqu XMMWORD[16+rdi*1+rsi],xmm6 + sub rcx,2 + lea rdi,[32+rdi] + jnz NEAR $L$ctr32_loop + +$L$ctr32_done: + movaps xmm6,XMMWORD[16+rsp] + movaps xmm7,XMMWORD[32+rsp] + movaps xmm8,XMMWORD[48+rsp] + movaps xmm9,XMMWORD[64+rsp] + movaps xmm10,XMMWORD[80+rsp] + movaps xmm11,XMMWORD[96+rsp] + movaps xmm12,XMMWORD[112+rsp] + movaps xmm13,XMMWORD[128+rsp] + movaps xmm14,XMMWORD[144+rsp] + movaps xmm15,XMMWORD[160+rsp] + lea rsp,[184+rsp] +$L$ctr32_epilogue: +$L$ctr32_abort: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_vpaes_ctr32_encrypt_blocks: @@ -894,6 +1208,7 @@ $L$SEH_end_vpaes_cbc_encrypt: ALIGN 16 _vpaes_preheat: + lea r10,[$L$k_s0F] movdqa xmm10,XMMWORD[((-32))+r10] movdqa xmm11,XMMWORD[((-16))+r10] @@ -910,6 +1225,7 @@ _vpaes_preheat: + ALIGN 64 _vpaes_consts: $L$k_inv: @@ -1005,6 +1321,17 @@ $L$k_dsbe: $L$k_dsbo: DQ 0x1387EA537EF94000,0xC7AA6DB9D4943E2D DQ 0x12D7560F93441D00,0xCA4B8159D8C58E9C + + +$L$rev_ctr: + DQ 0x0706050403020100,0x0c0d0e0f0b0a0908 + + +$L$ctr_add_one: + DQ 0x0000000000000000,0x0000000100000000 +$L$ctr_add_two: + DQ 0x0000000000000000,0x0000000200000000 + DB 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105 DB 111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54 DB 52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97 @@ -1113,6 +1440,10 @@ ALIGN 4 DD $L$SEH_end_vpaes_cbc_encrypt wrt ..imagebase DD $L$SEH_info_vpaes_cbc_encrypt wrt ..imagebase + DD $L$SEH_begin_vpaes_ctr32_encrypt_blocks wrt ..imagebase + DD $L$SEH_end_vpaes_ctr32_encrypt_blocks wrt ..imagebase + DD $L$SEH_info_vpaes_ctr32_encrypt_blocks wrt ..imagebase + section .xdata rdata align=8 ALIGN 8 $L$SEH_info_vpaes_set_encrypt_key: @@ -1135,3 +1466,7 @@ $L$SEH_info_vpaes_cbc_encrypt: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$cbc_body wrt ..imagebase,$L$cbc_epilogue wrt ..imagebase +$L$SEH_info_vpaes_ctr32_encrypt_blocks: +DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$ctr32_body wrt ..imagebase,$L$ctr32_epilogue wrt ..imagebase diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/x86_64-mont.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/x86_64-mont.asm index dd93341d8f..d6d8bdd6d4 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/x86_64-mont.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/x86_64-mont.asm @@ -1,7 +1,14 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif section .text code align=64 @@ -31,6 +38,8 @@ $L$SEH_begin_bn_mul_mont: jnz NEAR $L$mul_enter cmp r9d,8 jb NEAR $L$mul_enter + lea r11,[OPENSSL_ia32cap_P] + mov r11d,DWORD[8+r11] cmp rdx,rsi jne NEAR $L$mul4x_enter test r9d,7 @@ -222,31 +231,30 @@ $L$inner_enter: xor r14,r14 mov rax,QWORD[rsp] - lea rsi,[rsp] mov r15,r9 - jmp NEAR $L$sub + ALIGN 16 -$L$sub: - sbb rax,QWORD[r14*8+rcx] +$L$sub: sbb rax,QWORD[r14*8+rcx] mov QWORD[r14*8+rdi],rax - mov rax,QWORD[8+r14*8+rsi] + mov rax,QWORD[8+r14*8+rsp] lea r14,[1+r14] dec r15 jnz NEAR $L$sub sbb rax,0 + mov rbx,-1 + xor rbx,rax xor r14,r14 - and rsi,rax - not rax - mov rcx,rdi - and rcx,rax mov r15,r9 - or rsi,rcx -ALIGN 16 + $L$copy: - mov rax,QWORD[r14*8+rsi] - mov QWORD[r14*8+rsp],r14 - mov QWORD[r14*8+rdi],rax + mov rcx,QWORD[r14*8+rdi] + mov rdx,QWORD[r14*8+rsp] + and rcx,rbx + and rdx,rax + mov QWORD[r14*8+rsp],r9 + or rdx,rcx + mov QWORD[r14*8+rdi],rdx lea r14,[1+r14] sub r15,1 jnz NEAR $L$copy @@ -294,6 +302,9 @@ $L$SEH_begin_bn_mul4x_mont: mov rax,rsp $L$mul4x_enter: + and r11d,0x80100 + cmp r11d,0x80100 + je NEAR $L$mulx4x_enter push rbx push rbp @@ -631,7 +642,6 @@ $L$inner4x: mov rdi,QWORD[16+r9*8+rsp] lea r15,[((-4))+r9] mov rax,QWORD[rsp] - pxor xmm0,xmm0 mov rdx,QWORD[8+rsp] shr r15,2 lea rsi,[rsp] @@ -641,8 +651,7 @@ $L$inner4x: mov rbx,QWORD[16+rsi] mov rbp,QWORD[24+rsi] sbb rdx,QWORD[8+rcx] - jmp NEAR $L$sub4x -ALIGN 16 + $L$sub4x: mov QWORD[r14*8+rdi],rax mov QWORD[8+r14*8+rdi],rdx @@ -669,34 +678,35 @@ $L$sub4x: sbb rax,0 mov QWORD[24+r14*8+rdi],rbp - xor r14,r14 - and rsi,rax - not rax - mov rcx,rdi - and rcx,rax - lea r15,[((-4))+r9] - or rsi,rcx + pxor xmm0,xmm0 +DB 102,72,15,110,224 + pcmpeqd xmm5,xmm5 + pshufd xmm4,xmm4,0 + mov r15,r9 + pxor xmm5,xmm4 shr r15,2 + xor eax,eax - movdqu xmm1,XMMWORD[rsi] - movdqa XMMWORD[rsp],xmm0 - movdqu XMMWORD[rdi],xmm1 jmp NEAR $L$copy4x ALIGN 16 $L$copy4x: - movdqu xmm2,XMMWORD[16+r14*1+rsi] - movdqu xmm1,XMMWORD[32+r14*1+rsi] - movdqa XMMWORD[16+r14*1+rsp],xmm0 - movdqu XMMWORD[16+r14*1+rdi],xmm2 - movdqa XMMWORD[32+r14*1+rsp],xmm0 - movdqu XMMWORD[32+r14*1+rdi],xmm1 - lea r14,[32+r14] + movdqa xmm1,XMMWORD[rax*1+rsp] + movdqu xmm2,XMMWORD[rax*1+rdi] + pand xmm1,xmm4 + pand xmm2,xmm5 + movdqa xmm3,XMMWORD[16+rax*1+rsp] + movdqa XMMWORD[rax*1+rsp],xmm0 + por xmm1,xmm2 + movdqu xmm2,XMMWORD[16+rax*1+rdi] + movdqu XMMWORD[rax*1+rdi],xmm1 + pand xmm3,xmm4 + pand xmm2,xmm5 + movdqa XMMWORD[16+rax*1+rsp],xmm0 + por xmm3,xmm2 + movdqu XMMWORD[16+rax*1+rdi],xmm3 + lea rax,[32+rax] dec r15 jnz NEAR $L$copy4x - - movdqu xmm2,XMMWORD[16+r14*1+rsi] - movdqa XMMWORD[16+r14*1+rsp],xmm0 - movdqu XMMWORD[16+r14*1+rdi],xmm2 mov rsi,QWORD[8+r9*8+rsp] mov rax,1 @@ -720,6 +730,7 @@ $L$mul4x_epilogue: DB 0F3h,0C3h ;repret $L$SEH_end_bn_mul4x_mont: +EXTERN bn_sqrx8x_internal EXTERN bn_sqr8x_internal @@ -815,6 +826,26 @@ DB 102,72,15,110,209 pxor xmm0,xmm0 DB 102,72,15,110,207 DB 102,73,15,110,218 + lea rax,[OPENSSL_ia32cap_P] + mov eax,DWORD[8+rax] + and eax,0x80100 + cmp eax,0x80100 + jne NEAR $L$sqr8x_nox + + call bn_sqrx8x_internal + + + + + lea rbx,[rcx*1+r8] + mov r9,rcx + mov rdx,rcx +DB 102,72,15,126,207 + sar rcx,3+2 + jmp NEAR $L$sqr8x_sub + +ALIGN 32 +$L$sqr8x_nox: call bn_sqr8x_internal @@ -904,6 +935,376 @@ $L$sqr8x_epilogue: DB 0F3h,0C3h ;repret $L$SEH_end_bn_sqr8x_mont: + +ALIGN 32 +bn_mulx4x_mont: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov 
QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_mulx4x_mont: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + + mov rax,rsp + +$L$mulx4x_enter: + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + +$L$mulx4x_prologue: + + shl r9d,3 + xor r10,r10 + sub r10,r9 + mov r8,QWORD[r8] + lea rbp,[((-72))+r10*1+rsp] + and rbp,-128 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mulx4x_page_walk + jmp NEAR $L$mulx4x_page_walk_done + +ALIGN 16 +$L$mulx4x_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mulx4x_page_walk +$L$mulx4x_page_walk_done: + + lea r10,[r9*1+rdx] + + + + + + + + + + + + + mov QWORD[rsp],r9 + shr r9,5 + mov QWORD[16+rsp],r10 + sub r9,1 + mov QWORD[24+rsp],r8 + mov QWORD[32+rsp],rdi + mov QWORD[40+rsp],rax + + mov QWORD[48+rsp],r9 + jmp NEAR $L$mulx4x_body + +ALIGN 32 +$L$mulx4x_body: + lea rdi,[8+rdx] + mov rdx,QWORD[rdx] + lea rbx,[((64+32))+rsp] + mov r9,rdx + + mulx rax,r8,QWORD[rsi] + mulx r14,r11,QWORD[8+rsi] + add r11,rax + mov QWORD[8+rsp],rdi + mulx r13,r12,QWORD[16+rsi] + adc r12,r14 + adc r13,0 + + mov rdi,r8 + imul r8,QWORD[24+rsp] + xor rbp,rbp + + mulx r14,rax,QWORD[24+rsi] + mov rdx,r8 + lea rsi,[32+rsi] + adcx r13,rax + adcx r14,rbp + + mulx r10,rax,QWORD[rcx] + adcx rdi,rax + adox r10,r11 + mulx r11,rax,QWORD[8+rcx] + adcx r10,rax + adox r11,r12 +DB 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 + mov rdi,QWORD[48+rsp] + mov QWORD[((-32))+rbx],r10 + adcx r11,rax + adox r12,r13 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-24))+rbx],r11 + adcx r12,rax + adox r15,rbp + lea rcx,[32+rcx] + mov QWORD[((-16))+rbx],r12 + + jmp NEAR $L$mulx4x_1st + +ALIGN 32 +$L$mulx4x_1st: + adcx r15,rbp + mulx rax,r10,QWORD[rsi] + adcx r10,r14 + mulx r14,r11,QWORD[8+rsi] + adcx r11,rax + mulx rax,r12,QWORD[16+rsi] + adcx r12,r14 + mulx r14,r13,QWORD[24+rsi] +DB 0x67,0x67 + mov rdx,r8 + adcx r13,rax + adcx r14,rbp + lea rsi,[32+rsi] + lea rbx,[32+rbx] + + adox r10,r15 + mulx r15,rax,QWORD[rcx] + adcx r10,rax + adox r11,r15 + mulx r15,rax,QWORD[8+rcx] + adcx r11,rax + adox r12,r15 + mulx r15,rax,QWORD[16+rcx] + mov QWORD[((-40))+rbx],r10 + adcx r12,rax + mov QWORD[((-32))+rbx],r11 + adox r13,r15 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-24))+rbx],r12 + adcx r13,rax + adox r15,rbp + lea rcx,[32+rcx] + mov QWORD[((-16))+rbx],r13 + + dec rdi + jnz NEAR $L$mulx4x_1st + + mov rax,QWORD[rsp] + mov rdi,QWORD[8+rsp] + adc r15,rbp + add r14,r15 + sbb r15,r15 + mov QWORD[((-8))+rbx],r14 + jmp NEAR $L$mulx4x_outer + +ALIGN 32 +$L$mulx4x_outer: + mov rdx,QWORD[rdi] + lea rdi,[8+rdi] + sub rsi,rax + mov QWORD[rbx],r15 + lea rbx,[((64+32))+rsp] + sub rcx,rax + + mulx r11,r8,QWORD[rsi] + xor ebp,ebp + mov r9,rdx + mulx r12,r14,QWORD[8+rsi] + adox r8,QWORD[((-32))+rbx] + adcx r11,r14 + mulx r13,r15,QWORD[16+rsi] + adox r11,QWORD[((-24))+rbx] + adcx r12,r15 + adox r12,QWORD[((-16))+rbx] + adcx r13,rbp + adox r13,rbp + + mov QWORD[8+rsp],rdi + mov r15,r8 + imul r8,QWORD[24+rsp] + xor ebp,ebp + + mulx r14,rax,QWORD[24+rsi] + mov rdx,r8 + adcx r13,rax + adox r13,QWORD[((-8))+rbx] + adcx r14,rbp + lea rsi,[32+rsi] + adox r14,rbp + + mulx r10,rax,QWORD[rcx] + adcx r15,rax + adox r10,r11 + mulx r11,rax,QWORD[8+rcx] + adcx r10,rax + adox r11,r12 + mulx r12,rax,QWORD[16+rcx] + mov QWORD[((-32))+rbx],r10 + adcx r11,rax + adox r12,r13 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-24))+rbx],r11 + lea 
rcx,[32+rcx] + adcx r12,rax + adox r15,rbp + mov rdi,QWORD[48+rsp] + mov QWORD[((-16))+rbx],r12 + + jmp NEAR $L$mulx4x_inner + +ALIGN 32 +$L$mulx4x_inner: + mulx rax,r10,QWORD[rsi] + adcx r15,rbp + adox r10,r14 + mulx r14,r11,QWORD[8+rsi] + adcx r10,QWORD[rbx] + adox r11,rax + mulx rax,r12,QWORD[16+rsi] + adcx r11,QWORD[8+rbx] + adox r12,r14 + mulx r14,r13,QWORD[24+rsi] + mov rdx,r8 + adcx r12,QWORD[16+rbx] + adox r13,rax + adcx r13,QWORD[24+rbx] + adox r14,rbp + lea rsi,[32+rsi] + lea rbx,[32+rbx] + adcx r14,rbp + + adox r10,r15 + mulx r15,rax,QWORD[rcx] + adcx r10,rax + adox r11,r15 + mulx r15,rax,QWORD[8+rcx] + adcx r11,rax + adox r12,r15 + mulx r15,rax,QWORD[16+rcx] + mov QWORD[((-40))+rbx],r10 + adcx r12,rax + adox r13,r15 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-32))+rbx],r11 + mov QWORD[((-24))+rbx],r12 + adcx r13,rax + adox r15,rbp + lea rcx,[32+rcx] + mov QWORD[((-16))+rbx],r13 + + dec rdi + jnz NEAR $L$mulx4x_inner + + mov rax,QWORD[rsp] + mov rdi,QWORD[8+rsp] + adc r15,rbp + sub rbp,QWORD[rbx] + adc r14,r15 + sbb r15,r15 + mov QWORD[((-8))+rbx],r14 + + cmp rdi,QWORD[16+rsp] + jne NEAR $L$mulx4x_outer + + lea rbx,[64+rsp] + sub rcx,rax + neg r15 + mov rdx,rax + shr rax,3+2 + mov rdi,QWORD[32+rsp] + jmp NEAR $L$mulx4x_sub + +ALIGN 32 +$L$mulx4x_sub: + mov r11,QWORD[rbx] + mov r12,QWORD[8+rbx] + mov r13,QWORD[16+rbx] + mov r14,QWORD[24+rbx] + lea rbx,[32+rbx] + sbb r11,QWORD[rcx] + sbb r12,QWORD[8+rcx] + sbb r13,QWORD[16+rcx] + sbb r14,QWORD[24+rcx] + lea rcx,[32+rcx] + mov QWORD[rdi],r11 + mov QWORD[8+rdi],r12 + mov QWORD[16+rdi],r13 + mov QWORD[24+rdi],r14 + lea rdi,[32+rdi] + dec rax + jnz NEAR $L$mulx4x_sub + + sbb r15,0 + lea rbx,[64+rsp] + sub rdi,rdx + +DB 102,73,15,110,207 + pxor xmm0,xmm0 + pshufd xmm1,xmm1,0 + mov rsi,QWORD[40+rsp] + + jmp NEAR $L$mulx4x_cond_copy + +ALIGN 32 +$L$mulx4x_cond_copy: + movdqa xmm2,XMMWORD[rbx] + movdqa xmm3,XMMWORD[16+rbx] + lea rbx,[32+rbx] + movdqu xmm4,XMMWORD[rdi] + movdqu xmm5,XMMWORD[16+rdi] + lea rdi,[32+rdi] + movdqa XMMWORD[(-32)+rbx],xmm0 + movdqa XMMWORD[(-16)+rbx],xmm0 + pcmpeqd xmm0,xmm1 + pand xmm2,xmm1 + pand xmm3,xmm1 + pand xmm4,xmm0 + pand xmm5,xmm0 + pxor xmm0,xmm0 + por xmm4,xmm2 + por xmm5,xmm3 + movdqu XMMWORD[(-32)+rdi],xmm4 + movdqu XMMWORD[(-16)+rdi],xmm5 + sub rdx,32 + jnz NEAR $L$mulx4x_cond_copy + + mov QWORD[rbx],rdx + + mov rax,1 + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$mulx4x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_bn_mulx4x_mont: DB 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 DB 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 DB 54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83 @@ -1055,6 +1456,9 @@ ALIGN 4 DD $L$SEH_begin_bn_sqr8x_mont wrt ..imagebase DD $L$SEH_end_bn_sqr8x_mont wrt ..imagebase DD $L$SEH_info_bn_sqr8x_mont wrt ..imagebase + DD $L$SEH_begin_bn_mulx4x_mont wrt ..imagebase + DD $L$SEH_end_bn_mulx4x_mont wrt ..imagebase + DD $L$SEH_info_bn_mulx4x_mont wrt ..imagebase section .xdata rdata align=8 ALIGN 8 $L$SEH_info_bn_mul_mont: @@ -1070,3 +1474,8 @@ DB 9,0,0,0 DD sqr_handler wrt ..imagebase DD $L$sqr8x_prologue wrt ..imagebase,$L$sqr8x_body wrt ..imagebase,$L$sqr8x_epilogue wrt ..imagebase ALIGN 8 +$L$SEH_info_bn_mulx4x_mont: +DB 9,0,0,0 + DD sqr_handler wrt ..imagebase + DD $L$mulx4x_prologue wrt ..imagebase,$L$mulx4x_body 
wrt ..imagebase,$L$mulx4x_epilogue wrt ..imagebase +ALIGN 8 diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/x86_64-mont5.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/x86_64-mont5.asm index 1bcbc5d097..7a1d5dbd9c 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/x86_64-mont5.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/x86_64-mont5.asm @@ -1,7 +1,14 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + default rel %define XMMWORD %define YMMWORD %define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif section .text code align=64 @@ -29,6 +36,8 @@ $L$SEH_begin_bn_mul_mont_gather5: test r9d,7 jnz NEAR $L$mul_enter + lea r11,[OPENSSL_ia32cap_P] + mov r11d,DWORD[8+r11] jmp NEAR $L$mul4x_enter ALIGN 16 @@ -410,8 +419,7 @@ $L$inner_enter: mov r15,r9 jmp NEAR $L$sub ALIGN 16 -$L$sub: - sbb rax,QWORD[r14*8+rcx] +$L$sub: sbb rax,QWORD[r14*8+rcx] mov QWORD[r14*8+rdi],rax mov rax,QWORD[8+r14*8+rsi] lea r14,[1+r14] @@ -419,18 +427,19 @@ $L$sub: jnz NEAR $L$sub sbb rax,0 + mov rbx,-1 + xor rbx,rax xor r14,r14 - and rsi,rax - not rax - mov rcx,rdi - and rcx,rax mov r15,r9 - or rsi,rcx -ALIGN 16 + $L$copy: - mov rax,QWORD[r14*8+rsi] + mov rcx,QWORD[r14*8+rdi] + mov rdx,QWORD[r14*8+rsp] + and rcx,rbx + and rdx,rax mov QWORD[r14*8+rsp],r14 - mov QWORD[r14*8+rdi],rax + or rdx,rcx + mov QWORD[r14*8+rdi],rdx lea r14,[1+r14] sub r15,1 jnz NEAR $L$copy @@ -479,6 +488,9 @@ DB 0x67 mov rax,rsp $L$mul4x_enter: + and r11d,0x80108 + cmp r11d,0x80108 + je NEAR $L$mulx4x_enter push rbx push rbp @@ -579,6 +591,7 @@ $L$SEH_end_bn_mul4x_mont_gather5: ALIGN 32 mul4x_internal: + shl r9,5 movd xmm5,DWORD[56+rax] lea rax,[$L$inc] @@ -1101,6 +1114,7 @@ $L$inner4x: mov r15,QWORD[24+rbp] jmp NEAR $L$sqr4x_sub_entry + global bn_power5 ALIGN 32 @@ -1120,6 +1134,11 @@ $L$SEH_begin_bn_power5: mov rax,rsp + lea r11,[OPENSSL_ia32cap_P] + mov r11d,DWORD[8+r11] + and r11d,0x80108 + cmp r11d,0x80108 + je NEAR $L$powerx5_enter push rbx push rbp @@ -1323,6 +1342,7 @@ __bn_sqr8x_internal: + lea rbp,[32+r10] @@ -2028,8 +2048,10 @@ DB 102,73,15,126,217 DB 0F3h,0C3h ;repret + ALIGN 32 __bn_post4x_internal: + mov r12,QWORD[rbp] lea rbx,[r9*1+rdi] mov rcx,r9 @@ -2081,10 +2103,12 @@ $L$sqr4x_sub_entry: neg r9 DB 0F3h,0C3h ;repret + global bn_from_montgomery ALIGN 32 bn_from_montgomery: + test DWORD[48+rsp],7 jz NEAR bn_from_mont8x xor eax,eax @@ -2092,6 +2116,7 @@ bn_from_montgomery: + ALIGN 32 bn_from_mont8x: mov QWORD[8+rsp],rdi ;WIN64 prologue @@ -2217,6 +2242,22 @@ DB 102,72,15,110,209 DB 0x67 mov rbp,rcx DB 102,73,15,110,218 + lea r11,[OPENSSL_ia32cap_P] + mov r11d,DWORD[8+r11] + and r11d,0x80108 + cmp r11d,0x80108 + jne NEAR $L$from_mont_nox + + lea rdi,[r9*1+rax] + call __bn_sqrx8x_reduction + call __bn_postx4x_internal + + pxor xmm0,xmm0 + lea rax,[48+rsp] + jmp NEAR $L$from_mont_zero + +ALIGN 32 +$L$from_mont_nox: call __bn_sqr8x_reduction call __bn_post4x_internal @@ -2257,10 +2298,1382 @@ $L$from_epilogue: DB 0F3h,0C3h ;repret $L$SEH_end_bn_from_mont8x: + +ALIGN 32 +bn_mulx4x_mont_gather5: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_mulx4x_mont_gather5: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + + mov rax,rsp + +$L$mulx4x_enter: + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + +$L$mulx4x_prologue: + 
+ shl r9d,3 + lea r10,[r9*2+r9] + neg r9 + mov r8,QWORD[r8] + + + + + + + + + + + lea r11,[((-320))+r9*2+rsp] + mov rbp,rsp + sub r11,rdi + and r11,4095 + cmp r10,r11 + jb NEAR $L$mulx4xsp_alt + sub rbp,r11 + lea rbp,[((-320))+r9*2+rbp] + jmp NEAR $L$mulx4xsp_done + +$L$mulx4xsp_alt: + lea r10,[((4096-320))+r9*2] + lea rbp,[((-320))+r9*2+rbp] + sub r11,r10 + mov r10,0 + cmovc r11,r10 + sub rbp,r11 +$L$mulx4xsp_done: + and rbp,-64 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mulx4x_page_walk + jmp NEAR $L$mulx4x_page_walk_done + +$L$mulx4x_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mulx4x_page_walk +$L$mulx4x_page_walk_done: + + + + + + + + + + + + + + mov QWORD[32+rsp],r8 + mov QWORD[40+rsp],rax + +$L$mulx4x_body: + call mulx4x_internal + + mov rsi,QWORD[40+rsp] + + mov rax,1 + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$mulx4x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_bn_mulx4x_mont_gather5: + + +ALIGN 32 +mulx4x_internal: + + mov QWORD[8+rsp],r9 + mov r10,r9 + neg r9 + shl r9,5 + neg r10 + lea r13,[128+r9*1+rdx] + shr r9,5+5 + movd xmm5,DWORD[56+rax] + sub r9,1 + lea rax,[$L$inc] + mov QWORD[((16+8))+rsp],r13 + mov QWORD[((24+8))+rsp],r9 + mov QWORD[((56+8))+rsp],rdi + movdqa xmm0,XMMWORD[rax] + movdqa xmm1,XMMWORD[16+rax] + lea r10,[((88-112))+r10*1+rsp] + lea rdi,[128+rdx] + + pshufd xmm5,xmm5,0 + movdqa xmm4,xmm1 +DB 0x67 + movdqa xmm2,xmm1 +DB 0x67 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[112+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[128+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[144+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[160+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[176+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[192+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[208+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[224+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[240+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[256+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[272+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[288+r10],xmm3 + movdqa xmm3,xmm4 +DB 0x67 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[304+r10],xmm0 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[320+r10],xmm1 + + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[336+r10],xmm2 + + pand xmm0,XMMWORD[64+rdi] + pand xmm1,XMMWORD[80+rdi] + pand xmm2,XMMWORD[96+rdi] + movdqa XMMWORD[352+r10],xmm3 + pand xmm3,XMMWORD[112+rdi] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[((-128))+rdi] + movdqa xmm5,XMMWORD[((-112))+rdi] + movdqa xmm2,XMMWORD[((-96))+rdi] + pand xmm4,XMMWORD[112+r10] + movdqa xmm3,XMMWORD[((-80))+rdi] + pand xmm5,XMMWORD[128+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[144+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[160+r10] + por xmm0,xmm2 + por xmm1,xmm3 + 
movdqa xmm4,XMMWORD[((-64))+rdi] + movdqa xmm5,XMMWORD[((-48))+rdi] + movdqa xmm2,XMMWORD[((-32))+rdi] + pand xmm4,XMMWORD[176+r10] + movdqa xmm3,XMMWORD[((-16))+rdi] + pand xmm5,XMMWORD[192+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[208+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[224+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[rdi] + movdqa xmm5,XMMWORD[16+rdi] + movdqa xmm2,XMMWORD[32+rdi] + pand xmm4,XMMWORD[240+r10] + movdqa xmm3,XMMWORD[48+rdi] + pand xmm5,XMMWORD[256+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[272+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[288+r10] + por xmm0,xmm2 + por xmm1,xmm3 + pxor xmm0,xmm1 + pshufd xmm1,xmm0,0x4e + por xmm0,xmm1 + lea rdi,[256+rdi] +DB 102,72,15,126,194 + lea rbx,[((64+32+8))+rsp] + + mov r9,rdx + mulx rax,r8,QWORD[rsi] + mulx r12,r11,QWORD[8+rsi] + add r11,rax + mulx r13,rax,QWORD[16+rsi] + adc r12,rax + adc r13,0 + mulx r14,rax,QWORD[24+rsi] + + mov r15,r8 + imul r8,QWORD[((32+8))+rsp] + xor rbp,rbp + mov rdx,r8 + + mov QWORD[((8+8))+rsp],rdi + + lea rsi,[32+rsi] + adcx r13,rax + adcx r14,rbp + + mulx r10,rax,QWORD[rcx] + adcx r15,rax + adox r10,r11 + mulx r11,rax,QWORD[8+rcx] + adcx r10,rax + adox r11,r12 + mulx r12,rax,QWORD[16+rcx] + mov rdi,QWORD[((24+8))+rsp] + mov QWORD[((-32))+rbx],r10 + adcx r11,rax + adox r12,r13 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-24))+rbx],r11 + adcx r12,rax + adox r15,rbp + lea rcx,[32+rcx] + mov QWORD[((-16))+rbx],r12 + jmp NEAR $L$mulx4x_1st + +ALIGN 32 +$L$mulx4x_1st: + adcx r15,rbp + mulx rax,r10,QWORD[rsi] + adcx r10,r14 + mulx r14,r11,QWORD[8+rsi] + adcx r11,rax + mulx rax,r12,QWORD[16+rsi] + adcx r12,r14 + mulx r14,r13,QWORD[24+rsi] +DB 0x67,0x67 + mov rdx,r8 + adcx r13,rax + adcx r14,rbp + lea rsi,[32+rsi] + lea rbx,[32+rbx] + + adox r10,r15 + mulx r15,rax,QWORD[rcx] + adcx r10,rax + adox r11,r15 + mulx r15,rax,QWORD[8+rcx] + adcx r11,rax + adox r12,r15 + mulx r15,rax,QWORD[16+rcx] + mov QWORD[((-40))+rbx],r10 + adcx r12,rax + mov QWORD[((-32))+rbx],r11 + adox r13,r15 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-24))+rbx],r12 + adcx r13,rax + adox r15,rbp + lea rcx,[32+rcx] + mov QWORD[((-16))+rbx],r13 + + dec rdi + jnz NEAR $L$mulx4x_1st + + mov rax,QWORD[8+rsp] + adc r15,rbp + lea rsi,[rax*1+rsi] + add r14,r15 + mov rdi,QWORD[((8+8))+rsp] + adc rbp,rbp + mov QWORD[((-8))+rbx],r14 + jmp NEAR $L$mulx4x_outer + +ALIGN 32 +$L$mulx4x_outer: + lea r10,[((16-256))+rbx] + pxor xmm4,xmm4 +DB 0x67,0x67 + pxor xmm5,xmm5 + movdqa xmm0,XMMWORD[((-128))+rdi] + movdqa xmm1,XMMWORD[((-112))+rdi] + movdqa xmm2,XMMWORD[((-96))+rdi] + pand xmm0,XMMWORD[256+r10] + movdqa xmm3,XMMWORD[((-80))+rdi] + pand xmm1,XMMWORD[272+r10] + por xmm4,xmm0 + pand xmm2,XMMWORD[288+r10] + por xmm5,xmm1 + pand xmm3,XMMWORD[304+r10] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[((-64))+rdi] + movdqa xmm1,XMMWORD[((-48))+rdi] + movdqa xmm2,XMMWORD[((-32))+rdi] + pand xmm0,XMMWORD[320+r10] + movdqa xmm3,XMMWORD[((-16))+rdi] + pand xmm1,XMMWORD[336+r10] + por xmm4,xmm0 + pand xmm2,XMMWORD[352+r10] + por xmm5,xmm1 + pand xmm3,XMMWORD[368+r10] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[rdi] + movdqa xmm1,XMMWORD[16+rdi] + movdqa xmm2,XMMWORD[32+rdi] + pand xmm0,XMMWORD[384+r10] + movdqa xmm3,XMMWORD[48+rdi] + pand xmm1,XMMWORD[400+r10] + por xmm4,xmm0 + pand xmm2,XMMWORD[416+r10] + por xmm5,xmm1 + pand xmm3,XMMWORD[432+r10] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[64+rdi] + movdqa xmm1,XMMWORD[80+rdi] + movdqa xmm2,XMMWORD[96+rdi] + pand xmm0,XMMWORD[448+r10] + movdqa 
xmm3,XMMWORD[112+rdi] + pand xmm1,XMMWORD[464+r10] + por xmm4,xmm0 + pand xmm2,XMMWORD[480+r10] + por xmm5,xmm1 + pand xmm3,XMMWORD[496+r10] + por xmm4,xmm2 + por xmm5,xmm3 + por xmm4,xmm5 + pshufd xmm0,xmm4,0x4e + por xmm0,xmm4 + lea rdi,[256+rdi] +DB 102,72,15,126,194 + + mov QWORD[rbx],rbp + lea rbx,[32+rax*1+rbx] + mulx r11,r8,QWORD[rsi] + xor rbp,rbp + mov r9,rdx + mulx r12,r14,QWORD[8+rsi] + adox r8,QWORD[((-32))+rbx] + adcx r11,r14 + mulx r13,r15,QWORD[16+rsi] + adox r11,QWORD[((-24))+rbx] + adcx r12,r15 + mulx r14,rdx,QWORD[24+rsi] + adox r12,QWORD[((-16))+rbx] + adcx r13,rdx + lea rcx,[rax*1+rcx] + lea rsi,[32+rsi] + adox r13,QWORD[((-8))+rbx] + adcx r14,rbp + adox r14,rbp + + mov r15,r8 + imul r8,QWORD[((32+8))+rsp] + + mov rdx,r8 + xor rbp,rbp + mov QWORD[((8+8))+rsp],rdi + + mulx r10,rax,QWORD[rcx] + adcx r15,rax + adox r10,r11 + mulx r11,rax,QWORD[8+rcx] + adcx r10,rax + adox r11,r12 + mulx r12,rax,QWORD[16+rcx] + adcx r11,rax + adox r12,r13 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov rdi,QWORD[((24+8))+rsp] + mov QWORD[((-32))+rbx],r10 + adcx r12,rax + mov QWORD[((-24))+rbx],r11 + adox r15,rbp + mov QWORD[((-16))+rbx],r12 + lea rcx,[32+rcx] + jmp NEAR $L$mulx4x_inner + +ALIGN 32 +$L$mulx4x_inner: + mulx rax,r10,QWORD[rsi] + adcx r15,rbp + adox r10,r14 + mulx r14,r11,QWORD[8+rsi] + adcx r10,QWORD[rbx] + adox r11,rax + mulx rax,r12,QWORD[16+rsi] + adcx r11,QWORD[8+rbx] + adox r12,r14 + mulx r14,r13,QWORD[24+rsi] + mov rdx,r8 + adcx r12,QWORD[16+rbx] + adox r13,rax + adcx r13,QWORD[24+rbx] + adox r14,rbp + lea rsi,[32+rsi] + lea rbx,[32+rbx] + adcx r14,rbp + + adox r10,r15 + mulx r15,rax,QWORD[rcx] + adcx r10,rax + adox r11,r15 + mulx r15,rax,QWORD[8+rcx] + adcx r11,rax + adox r12,r15 + mulx r15,rax,QWORD[16+rcx] + mov QWORD[((-40))+rbx],r10 + adcx r12,rax + adox r13,r15 + mov QWORD[((-32))+rbx],r11 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + lea rcx,[32+rcx] + mov QWORD[((-24))+rbx],r12 + adcx r13,rax + adox r15,rbp + mov QWORD[((-16))+rbx],r13 + + dec rdi + jnz NEAR $L$mulx4x_inner + + mov rax,QWORD[((0+8))+rsp] + adc r15,rbp + sub rdi,QWORD[rbx] + mov rdi,QWORD[((8+8))+rsp] + mov r10,QWORD[((16+8))+rsp] + adc r14,r15 + lea rsi,[rax*1+rsi] + adc rbp,rbp + mov QWORD[((-8))+rbx],r14 + + cmp rdi,r10 + jb NEAR $L$mulx4x_outer + + mov r10,QWORD[((-8))+rcx] + mov r8,rbp + mov r12,QWORD[rax*1+rcx] + lea rbp,[rax*1+rcx] + mov rcx,rax + lea rdi,[rax*1+rbx] + xor eax,eax + xor r15,r15 + sub r10,r14 + adc r15,r15 + or r8,r15 + sar rcx,3+2 + sub rax,r8 + mov rdx,QWORD[((56+8))+rsp] + dec r12 + mov r13,QWORD[8+rbp] + xor r8,r8 + mov r14,QWORD[16+rbp] + mov r15,QWORD[24+rbp] + jmp NEAR $L$sqrx4x_sub_entry + + + +ALIGN 32 +bn_powerx5: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_powerx5: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + + mov rax,rsp + +$L$powerx5_enter: + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + +$L$powerx5_prologue: + + shl r9d,3 + lea r10,[r9*2+r9] + neg r9 + mov r8,QWORD[r8] + + + + + + + + + lea r11,[((-320))+r9*2+rsp] + mov rbp,rsp + sub r11,rdi + and r11,4095 + cmp r10,r11 + jb NEAR $L$pwrx_sp_alt + sub rbp,r11 + lea rbp,[((-320))+r9*2+rbp] + jmp NEAR $L$pwrx_sp_done + +ALIGN 32 +$L$pwrx_sp_alt: + lea r10,[((4096-320))+r9*2] + lea rbp,[((-320))+r9*2+rbp] + sub r11,r10 + mov r10,0 + cmovc r11,r10 + sub rbp,r11 +$L$pwrx_sp_done: + and rbp,-64 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] 
+ cmp rsp,rbp + ja NEAR $L$pwrx_page_walk + jmp NEAR $L$pwrx_page_walk_done + +$L$pwrx_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$pwrx_page_walk +$L$pwrx_page_walk_done: + + mov r10,r9 + neg r9 + + + + + + + + + + + + + pxor xmm0,xmm0 +DB 102,72,15,110,207 +DB 102,72,15,110,209 +DB 102,73,15,110,218 +DB 102,72,15,110,226 + mov QWORD[32+rsp],r8 + mov QWORD[40+rsp],rax + +$L$powerx5_body: + + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + + mov r9,r10 + mov rdi,rsi +DB 102,72,15,126,209 +DB 102,72,15,126,226 + mov rax,QWORD[40+rsp] + + call mulx4x_internal + + mov rsi,QWORD[40+rsp] + + mov rax,1 + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$powerx5_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_bn_powerx5: + +global bn_sqrx8x_internal + + +ALIGN 32 +bn_sqrx8x_internal: +__bn_sqrx8x_internal: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + lea rdi,[((48+8))+rsp] + lea rbp,[r9*1+rsi] + mov QWORD[((0+8))+rsp],r9 + mov QWORD[((8+8))+rsp],rbp + jmp NEAR $L$sqr8x_zero_start + +ALIGN 32 +DB 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 +$L$sqrx8x_zero: +DB 0x3e + movdqa XMMWORD[rdi],xmm0 + movdqa XMMWORD[16+rdi],xmm0 + movdqa XMMWORD[32+rdi],xmm0 + movdqa XMMWORD[48+rdi],xmm0 +$L$sqr8x_zero_start: + movdqa XMMWORD[64+rdi],xmm0 + movdqa XMMWORD[80+rdi],xmm0 + movdqa XMMWORD[96+rdi],xmm0 + movdqa XMMWORD[112+rdi],xmm0 + lea rdi,[128+rdi] + sub r9,64 + jnz NEAR $L$sqrx8x_zero + + mov rdx,QWORD[rsi] + + xor r10,r10 + xor r11,r11 + xor r12,r12 + xor r13,r13 + xor r14,r14 + xor r15,r15 + lea rdi,[((48+8))+rsp] + xor rbp,rbp + jmp NEAR $L$sqrx8x_outer_loop + +ALIGN 32 +$L$sqrx8x_outer_loop: + mulx rax,r8,QWORD[8+rsi] + adcx r8,r9 + adox r10,rax + mulx rax,r9,QWORD[16+rsi] + adcx r9,r10 + adox r11,rax +DB 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 + adcx r10,r11 + adox r12,rax +DB 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 + adcx r11,r12 + adox r13,rax + mulx rax,r12,QWORD[40+rsi] + adcx r12,r13 + adox r14,rax + mulx rax,r13,QWORD[48+rsi] + adcx r13,r14 + adox rax,r15 + mulx r15,r14,QWORD[56+rsi] + mov rdx,QWORD[8+rsi] + adcx r14,rax + adox r15,rbp + adc r15,QWORD[64+rdi] + mov QWORD[8+rdi],r8 + mov QWORD[16+rdi],r9 + sbb rcx,rcx + xor rbp,rbp + + + mulx rbx,r8,QWORD[16+rsi] + mulx rax,r9,QWORD[24+rsi] + adcx r8,r10 + adox r9,rbx + mulx rbx,r10,QWORD[32+rsi] + adcx r9,r11 + adox r10,rax +DB 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 + adcx r10,r12 + adox r11,rbx +DB 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 + adcx r11,r13 + adox r12,r14 +DB 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 + mov rdx,QWORD[16+rsi] + adcx r12,rax + adox r13,rbx + adcx r13,r15 + adox r14,rbp + adcx r14,rbp + + mov QWORD[24+rdi],r8 + mov QWORD[32+rdi],r9 + + mulx rbx,r8,QWORD[24+rsi] + mulx rax,r9,QWORD[32+rsi] + adcx r8,r10 + adox r9,rbx + mulx rbx,r10,QWORD[40+rsi] + adcx r9,r11 + adox r10,rax +DB 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 + adcx r10,r12 + adox r11,r13 +DB 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 +DB 0x3e + mov rdx,QWORD[24+rsi] + adcx r11,rbx + adox 
r12,rax + adcx r12,r14 + mov QWORD[40+rdi],r8 + mov QWORD[48+rdi],r9 + mulx rax,r8,QWORD[32+rsi] + adox r13,rbp + adcx r13,rbp + + mulx rbx,r9,QWORD[40+rsi] + adcx r8,r10 + adox r9,rax + mulx rax,r10,QWORD[48+rsi] + adcx r9,r11 + adox r10,r12 + mulx r12,r11,QWORD[56+rsi] + mov rdx,QWORD[32+rsi] + mov r14,QWORD[40+rsi] + adcx r10,rbx + adox r11,rax + mov r15,QWORD[48+rsi] + adcx r11,r13 + adox r12,rbp + adcx r12,rbp + + mov QWORD[56+rdi],r8 + mov QWORD[64+rdi],r9 + + mulx rax,r9,r14 + mov r8,QWORD[56+rsi] + adcx r9,r10 + mulx rbx,r10,r15 + adox r10,rax + adcx r10,r11 + mulx rax,r11,r8 + mov rdx,r14 + adox r11,rbx + adcx r11,r12 + + adcx rax,rbp + + mulx rbx,r14,r15 + mulx r13,r12,r8 + mov rdx,r15 + lea rsi,[64+rsi] + adcx r11,r14 + adox r12,rbx + adcx r12,rax + adox r13,rbp + +DB 0x67,0x67 + mulx r14,r8,r8 + adcx r13,r8 + adcx r14,rbp + + cmp rsi,QWORD[((8+8))+rsp] + je NEAR $L$sqrx8x_outer_break + + neg rcx + mov rcx,-8 + mov r15,rbp + mov r8,QWORD[64+rdi] + adcx r9,QWORD[72+rdi] + adcx r10,QWORD[80+rdi] + adcx r11,QWORD[88+rdi] + adc r12,QWORD[96+rdi] + adc r13,QWORD[104+rdi] + adc r14,QWORD[112+rdi] + adc r15,QWORD[120+rdi] + lea rbp,[rsi] + lea rdi,[128+rdi] + sbb rax,rax + + mov rdx,QWORD[((-64))+rsi] + mov QWORD[((16+8))+rsp],rax + mov QWORD[((24+8))+rsp],rdi + + + xor eax,eax + jmp NEAR $L$sqrx8x_loop + +ALIGN 32 +$L$sqrx8x_loop: + mov rbx,r8 + mulx r8,rax,QWORD[rbp] + adcx rbx,rax + adox r8,r9 + + mulx r9,rax,QWORD[8+rbp] + adcx r8,rax + adox r9,r10 + + mulx r10,rax,QWORD[16+rbp] + adcx r9,rax + adox r10,r11 + + mulx r11,rax,QWORD[24+rbp] + adcx r10,rax + adox r11,r12 + +DB 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcx r11,rax + adox r12,r13 + + mulx r13,rax,QWORD[40+rbp] + adcx r12,rax + adox r13,r14 + + mulx r14,rax,QWORD[48+rbp] + mov QWORD[rcx*8+rdi],rbx + mov ebx,0 + adcx r13,rax + adox r14,r15 + +DB 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 + mov rdx,QWORD[8+rcx*8+rsi] + adcx r14,rax + adox r15,rbx + adcx r15,rbx + +DB 0x67 + inc rcx + jnz NEAR $L$sqrx8x_loop + + lea rbp,[64+rbp] + mov rcx,-8 + cmp rbp,QWORD[((8+8))+rsp] + je NEAR $L$sqrx8x_break + + sub rbx,QWORD[((16+8))+rsp] +DB 0x66 + mov rdx,QWORD[((-64))+rsi] + adcx r8,QWORD[rdi] + adcx r9,QWORD[8+rdi] + adc r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + lea rdi,[64+rdi] +DB 0x67 + sbb rax,rax + xor ebx,ebx + mov QWORD[((16+8))+rsp],rax + jmp NEAR $L$sqrx8x_loop + +ALIGN 32 +$L$sqrx8x_break: + xor rbp,rbp + sub rbx,QWORD[((16+8))+rsp] + adcx r8,rbp + mov rcx,QWORD[((24+8))+rsp] + adcx r9,rbp + mov rdx,QWORD[rsi] + adc r10,0 + mov QWORD[rdi],r8 + adc r11,0 + adc r12,0 + adc r13,0 + adc r14,0 + adc r15,0 + cmp rdi,rcx + je NEAR $L$sqrx8x_outer_loop + + mov QWORD[8+rdi],r9 + mov r9,QWORD[8+rcx] + mov QWORD[16+rdi],r10 + mov r10,QWORD[16+rcx] + mov QWORD[24+rdi],r11 + mov r11,QWORD[24+rcx] + mov QWORD[32+rdi],r12 + mov r12,QWORD[32+rcx] + mov QWORD[40+rdi],r13 + mov r13,QWORD[40+rcx] + mov QWORD[48+rdi],r14 + mov r14,QWORD[48+rcx] + mov QWORD[56+rdi],r15 + mov r15,QWORD[56+rcx] + mov rdi,rcx + jmp NEAR $L$sqrx8x_outer_loop + +ALIGN 32 +$L$sqrx8x_outer_break: + mov QWORD[72+rdi],r9 +DB 102,72,15,126,217 + mov QWORD[80+rdi],r10 + mov QWORD[88+rdi],r11 + mov QWORD[96+rdi],r12 + mov QWORD[104+rdi],r13 + mov QWORD[112+rdi],r14 + lea rdi,[((48+8))+rsp] + mov rdx,QWORD[rcx*1+rsi] + + mov r11,QWORD[8+rdi] + xor r10,r10 + mov r9,QWORD[((0+8))+rsp] + adox r11,r11 + mov r12,QWORD[16+rdi] + mov r13,QWORD[24+rdi] + + +ALIGN 32 
+$L$sqrx4x_shift_n_add: + mulx rbx,rax,rdx + adox r12,r12 + adcx rax,r10 +DB 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 +DB 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 + adox r13,r13 + adcx rbx,r11 + mov r11,QWORD[40+rdi] + mov QWORD[rdi],rax + mov QWORD[8+rdi],rbx + + mulx rbx,rax,rdx + adox r10,r10 + adcx rax,r12 + mov rdx,QWORD[16+rcx*1+rsi] + mov r12,QWORD[48+rdi] + adox r11,r11 + adcx rbx,r13 + mov r13,QWORD[56+rdi] + mov QWORD[16+rdi],rax + mov QWORD[24+rdi],rbx + + mulx rbx,rax,rdx + adox r12,r12 + adcx rax,r10 + mov rdx,QWORD[24+rcx*1+rsi] + lea rcx,[32+rcx] + mov r10,QWORD[64+rdi] + adox r13,r13 + adcx rbx,r11 + mov r11,QWORD[72+rdi] + mov QWORD[32+rdi],rax + mov QWORD[40+rdi],rbx + + mulx rbx,rax,rdx + adox r10,r10 + adcx rax,r12 + jrcxz $L$sqrx4x_shift_n_add_break +DB 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 + adox r11,r11 + adcx rbx,r13 + mov r12,QWORD[80+rdi] + mov r13,QWORD[88+rdi] + mov QWORD[48+rdi],rax + mov QWORD[56+rdi],rbx + lea rdi,[64+rdi] + nop + jmp NEAR $L$sqrx4x_shift_n_add + +ALIGN 32 +$L$sqrx4x_shift_n_add_break: + adcx rbx,r13 + mov QWORD[48+rdi],rax + mov QWORD[56+rdi],rbx + lea rdi,[64+rdi] +DB 102,72,15,126,213 +__bn_sqrx8x_reduction: + xor eax,eax + mov rbx,QWORD[((32+8))+rsp] + mov rdx,QWORD[((48+8))+rsp] + lea rcx,[((-64))+r9*1+rbp] + + mov QWORD[((0+8))+rsp],rcx + mov QWORD[((8+8))+rsp],rdi + + lea rdi,[((48+8))+rsp] + jmp NEAR $L$sqrx8x_reduction_loop + +ALIGN 32 +$L$sqrx8x_reduction_loop: + mov r9,QWORD[8+rdi] + mov r10,QWORD[16+rdi] + mov r11,QWORD[24+rdi] + mov r12,QWORD[32+rdi] + mov r8,rdx + imul rdx,rbx + mov r13,QWORD[40+rdi] + mov r14,QWORD[48+rdi] + mov r15,QWORD[56+rdi] + mov QWORD[((24+8))+rsp],rax + + lea rdi,[64+rdi] + xor rsi,rsi + mov rcx,-8 + jmp NEAR $L$sqrx8x_reduce + +ALIGN 32 +$L$sqrx8x_reduce: + mov rbx,r8 + mulx r8,rax,QWORD[rbp] + adcx rax,rbx + adox r8,r9 + + mulx r9,rbx,QWORD[8+rbp] + adcx r8,rbx + adox r9,r10 + + mulx r10,rbx,QWORD[16+rbp] + adcx r9,rbx + adox r10,r11 + + mulx r11,rbx,QWORD[24+rbp] + adcx r10,rbx + adox r11,r12 + +DB 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 + mov rax,rdx + mov rdx,r8 + adcx r11,rbx + adox r12,r13 + + mulx rdx,rbx,QWORD[((32+8))+rsp] + mov rdx,rax + mov QWORD[((64+48+8))+rcx*8+rsp],rax + + mulx r13,rax,QWORD[40+rbp] + adcx r12,rax + adox r13,r14 + + mulx r14,rax,QWORD[48+rbp] + adcx r13,rax + adox r14,r15 + + mulx r15,rax,QWORD[56+rbp] + mov rdx,rbx + adcx r14,rax + adox r15,rsi + adcx r15,rsi + +DB 0x67,0x67,0x67 + inc rcx + jnz NEAR $L$sqrx8x_reduce + + mov rax,rsi + cmp rbp,QWORD[((0+8))+rsp] + jae NEAR $L$sqrx8x_no_tail + + mov rdx,QWORD[((48+8))+rsp] + add r8,QWORD[rdi] + lea rbp,[64+rbp] + mov rcx,-8 + adcx r9,QWORD[8+rdi] + adcx r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + lea rdi,[64+rdi] + sbb rax,rax + + xor rsi,rsi + mov QWORD[((16+8))+rsp],rax + jmp NEAR $L$sqrx8x_tail + +ALIGN 32 +$L$sqrx8x_tail: + mov rbx,r8 + mulx r8,rax,QWORD[rbp] + adcx rbx,rax + adox r8,r9 + + mulx r9,rax,QWORD[8+rbp] + adcx r8,rax + adox r9,r10 + + mulx r10,rax,QWORD[16+rbp] + adcx r9,rax + adox r10,r11 + + mulx r11,rax,QWORD[24+rbp] + adcx r10,rax + adox r11,r12 + +DB 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcx r11,rax + adox r12,r13 + + mulx r13,rax,QWORD[40+rbp] + adcx r12,rax + adox r13,r14 + + mulx r14,rax,QWORD[48+rbp] + adcx r13,rax + adox r14,r15 + + mulx r15,rax,QWORD[56+rbp] + mov rdx,QWORD[((72+48+8))+rcx*8+rsp] + adcx r14,rax + adox r15,rsi + mov QWORD[rcx*8+rdi],rbx + mov rbx,r8 + adcx r15,rsi + + inc rcx 
+ jnz NEAR $L$sqrx8x_tail + + cmp rbp,QWORD[((0+8))+rsp] + jae NEAR $L$sqrx8x_tail_done + + sub rsi,QWORD[((16+8))+rsp] + mov rdx,QWORD[((48+8))+rsp] + lea rbp,[64+rbp] + adc r8,QWORD[rdi] + adc r9,QWORD[8+rdi] + adc r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + lea rdi,[64+rdi] + sbb rax,rax + sub rcx,8 + + xor rsi,rsi + mov QWORD[((16+8))+rsp],rax + jmp NEAR $L$sqrx8x_tail + +ALIGN 32 +$L$sqrx8x_tail_done: + xor rax,rax + add r8,QWORD[((24+8))+rsp] + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + adc r14,0 + adc r15,0 + adc rax,0 + + sub rsi,QWORD[((16+8))+rsp] +$L$sqrx8x_no_tail: + adc r8,QWORD[rdi] +DB 102,72,15,126,217 + adc r9,QWORD[8+rdi] + mov rsi,QWORD[56+rbp] +DB 102,72,15,126,213 + adc r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + adc rax,0 + + mov rbx,QWORD[((32+8))+rsp] + mov rdx,QWORD[64+rcx*1+rdi] + + mov QWORD[rdi],r8 + lea r8,[64+rdi] + mov QWORD[8+rdi],r9 + mov QWORD[16+rdi],r10 + mov QWORD[24+rdi],r11 + mov QWORD[32+rdi],r12 + mov QWORD[40+rdi],r13 + mov QWORD[48+rdi],r14 + mov QWORD[56+rdi],r15 + + lea rdi,[64+rcx*1+rdi] + cmp r8,QWORD[((8+8))+rsp] + jb NEAR $L$sqrx8x_reduction_loop + DB 0F3h,0C3h ;repret + + +ALIGN 32 + +__bn_postx4x_internal: + + mov r12,QWORD[rbp] + mov r10,rcx + mov r9,rcx + neg rax + sar rcx,3+2 + +DB 102,72,15,126,202 +DB 102,72,15,126,206 + dec r12 + mov r13,QWORD[8+rbp] + xor r8,r8 + mov r14,QWORD[16+rbp] + mov r15,QWORD[24+rbp] + jmp NEAR $L$sqrx4x_sub_entry + +ALIGN 16 +$L$sqrx4x_sub: + mov r12,QWORD[rbp] + mov r13,QWORD[8+rbp] + mov r14,QWORD[16+rbp] + mov r15,QWORD[24+rbp] +$L$sqrx4x_sub_entry: + andn r12,r12,rax + lea rbp,[32+rbp] + andn r13,r13,rax + andn r14,r14,rax + andn r15,r15,rax + + neg r8 + adc r12,QWORD[rdi] + adc r13,QWORD[8+rdi] + adc r14,QWORD[16+rdi] + adc r15,QWORD[24+rdi] + mov QWORD[rdx],r12 + lea rdi,[32+rdi] + mov QWORD[8+rdx],r13 + sbb r8,r8 + mov QWORD[16+rdx],r14 + mov QWORD[24+rdx],r15 + lea rdx,[32+rdx] + + inc rcx + jnz NEAR $L$sqrx4x_sub + + neg r9 + + DB 0F3h,0C3h ;repret + + global bn_scatter5 ALIGN 16 bn_scatter5: + cmp edx,0 jz NEAR $L$scatter_epilogue lea r8,[r9*8+r8] @@ -2275,13 +3688,16 @@ $L$scatter_epilogue: DB 0F3h,0C3h ;repret + global bn_gather5 ALIGN 32 bn_gather5: + $L$SEH_begin_bn_gather5: DB 0x4c,0x8d,0x14,0x24 + DB 0x48,0x81,0xec,0x08,0x01,0x00,0x00 lea rax,[$L$inc] and rsp,-16 @@ -2435,9 +3851,11 @@ $L$gather: jnz NEAR $L$gather lea rsp,[r10] + DB 0F3h,0C3h ;repret $L$SEH_end_bn_gather5: + ALIGN 64 $L$inc: DD 0,0,1,1 @@ -2568,6 +3986,13 @@ ALIGN 4 DD $L$SEH_begin_bn_from_mont8x wrt ..imagebase DD $L$SEH_end_bn_from_mont8x wrt ..imagebase DD $L$SEH_info_bn_from_mont8x wrt ..imagebase + DD $L$SEH_begin_bn_mulx4x_mont_gather5 wrt ..imagebase + DD $L$SEH_end_bn_mulx4x_mont_gather5 wrt ..imagebase + DD $L$SEH_info_bn_mulx4x_mont_gather5 wrt ..imagebase + + DD $L$SEH_begin_bn_powerx5 wrt ..imagebase + DD $L$SEH_end_bn_powerx5 wrt ..imagebase + DD $L$SEH_info_bn_powerx5 wrt ..imagebase DD $L$SEH_begin_bn_gather5 wrt ..imagebase DD $L$SEH_end_bn_gather5 wrt ..imagebase DD $L$SEH_info_bn_gather5 wrt ..imagebase @@ -2594,6 +4019,16 @@ DB 9,0,0,0 DD mul_handler wrt ..imagebase DD $L$from_prologue wrt ..imagebase,$L$from_body wrt ..imagebase,$L$from_epilogue wrt ..imagebase ALIGN 8 +$L$SEH_info_bn_mulx4x_mont_gather5: +DB 9,0,0,0 + DD mul_handler wrt ..imagebase + DD $L$mulx4x_prologue wrt 
..imagebase,$L$mulx4x_body wrt ..imagebase,$L$mulx4x_epilogue wrt ..imagebase +ALIGN 8 +$L$SEH_info_bn_powerx5: +DB 9,0,0,0 + DD mul_handler wrt ..imagebase + DD $L$powerx5_prologue wrt ..imagebase,$L$powerx5_body wrt ..imagebase,$L$powerx5_epilogue wrt ..imagebase +ALIGN 8 $L$SEH_info_bn_gather5: DB 0x01,0x0b,0x03,0x0a DB 0x0b,0x01,0x21,0x00 diff --git a/packager/third_party/boringssl/win-x86_64/crypto/test/trampoline-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/test/trampoline-x86_64.asm new file mode 100644 index 0000000000..99006695ad --- /dev/null +++ b/packager/third_party/boringssl/win-x86_64/crypto/test/trampoline-x86_64.asm @@ -0,0 +1,682 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif +section .text code align=64 + + + + + + + + + + +global abi_test_trampoline +ALIGN 16 +abi_test_trampoline: +$L$abi_test_trampoline_seh_begin: + + + + + + + + + + + sub rsp,344 + +$L$abi_test_trampoline_seh_prolog_alloc: + mov QWORD[112+rsp],rbx + +$L$abi_test_trampoline_seh_prolog_rbx: + mov QWORD[120+rsp],rbp + +$L$abi_test_trampoline_seh_prolog_rbp: + mov QWORD[128+rsp],rdi + +$L$abi_test_trampoline_seh_prolog_rdi: + mov QWORD[136+rsp],rsi + +$L$abi_test_trampoline_seh_prolog_rsi: + mov QWORD[144+rsp],r12 + +$L$abi_test_trampoline_seh_prolog_r12: + mov QWORD[152+rsp],r13 + +$L$abi_test_trampoline_seh_prolog_r13: + mov QWORD[160+rsp],r14 + +$L$abi_test_trampoline_seh_prolog_r14: + mov QWORD[168+rsp],r15 + +$L$abi_test_trampoline_seh_prolog_r15: + movdqa XMMWORD[176+rsp],xmm6 + +$L$abi_test_trampoline_seh_prolog_xmm6: + movdqa XMMWORD[192+rsp],xmm7 + +$L$abi_test_trampoline_seh_prolog_xmm7: + movdqa XMMWORD[208+rsp],xmm8 + +$L$abi_test_trampoline_seh_prolog_xmm8: + movdqa XMMWORD[224+rsp],xmm9 + +$L$abi_test_trampoline_seh_prolog_xmm9: + movdqa XMMWORD[240+rsp],xmm10 + +$L$abi_test_trampoline_seh_prolog_xmm10: + movdqa XMMWORD[256+rsp],xmm11 + +$L$abi_test_trampoline_seh_prolog_xmm11: + movdqa XMMWORD[272+rsp],xmm12 + +$L$abi_test_trampoline_seh_prolog_xmm12: + movdqa XMMWORD[288+rsp],xmm13 + +$L$abi_test_trampoline_seh_prolog_xmm13: + movdqa XMMWORD[304+rsp],xmm14 + +$L$abi_test_trampoline_seh_prolog_xmm14: + movdqa XMMWORD[320+rsp],xmm15 + +$L$abi_test_trampoline_seh_prolog_xmm15: +$L$abi_test_trampoline_seh_prolog_end: + mov rbx,QWORD[rdx] + mov rbp,QWORD[8+rdx] + mov rdi,QWORD[16+rdx] + mov rsi,QWORD[24+rdx] + mov r12,QWORD[32+rdx] + mov r13,QWORD[40+rdx] + mov r14,QWORD[48+rdx] + mov r15,QWORD[56+rdx] + movdqa xmm6,XMMWORD[64+rdx] + movdqa xmm7,XMMWORD[80+rdx] + movdqa xmm8,XMMWORD[96+rdx] + movdqa xmm9,XMMWORD[112+rdx] + movdqa xmm10,XMMWORD[128+rdx] + movdqa xmm11,XMMWORD[144+rdx] + movdqa xmm12,XMMWORD[160+rdx] + movdqa xmm13,XMMWORD[176+rdx] + movdqa xmm14,XMMWORD[192+rdx] + movdqa xmm15,XMMWORD[208+rdx] + + mov QWORD[88+rsp],rcx + mov QWORD[96+rsp],rdx + + + + + mov r10,r8 + mov r11,r9 + dec r11 + js NEAR $L$args_done + mov rcx,QWORD[r10] + add r10,8 + dec r11 + js NEAR $L$args_done + mov rdx,QWORD[r10] + add r10,8 + dec r11 + js NEAR $L$args_done + mov r8,QWORD[r10] + add r10,8 + dec r11 + js NEAR $L$args_done + mov r9,QWORD[r10] + add r10,8 + lea rax,[32+rsp] +$L$args_loop: + dec r11 + js NEAR $L$args_done + + + + + + + mov QWORD[104+rsp],r11 + mov r11,QWORD[r10] + mov QWORD[rax],r11 + mov r11,QWORD[104+rsp] + + add r10,8 + add rax,8 + jmp NEAR 
$L$args_loop + +$L$args_done: + mov rax,QWORD[88+rsp] + mov r10,QWORD[384+rsp] + test r10,r10 + jz NEAR $L$no_unwind + + + pushfq + or QWORD[rsp],0x100 + popfq + + + + nop +global abi_test_unwind_start +abi_test_unwind_start: + + call rax +global abi_test_unwind_return +abi_test_unwind_return: + + + + + pushfq + and QWORD[rsp],-0x101 + popfq +global abi_test_unwind_stop +abi_test_unwind_stop: + + jmp NEAR $L$call_done + +$L$no_unwind: + call rax + +$L$call_done: + + mov rdx,QWORD[96+rsp] + mov QWORD[rdx],rbx + mov QWORD[8+rdx],rbp + mov QWORD[16+rdx],rdi + mov QWORD[24+rdx],rsi + mov QWORD[32+rdx],r12 + mov QWORD[40+rdx],r13 + mov QWORD[48+rdx],r14 + mov QWORD[56+rdx],r15 + movdqa XMMWORD[64+rdx],xmm6 + movdqa XMMWORD[80+rdx],xmm7 + movdqa XMMWORD[96+rdx],xmm8 + movdqa XMMWORD[112+rdx],xmm9 + movdqa XMMWORD[128+rdx],xmm10 + movdqa XMMWORD[144+rdx],xmm11 + movdqa XMMWORD[160+rdx],xmm12 + movdqa XMMWORD[176+rdx],xmm13 + movdqa XMMWORD[192+rdx],xmm14 + movdqa XMMWORD[208+rdx],xmm15 + mov rbx,QWORD[112+rsp] + + mov rbp,QWORD[120+rsp] + + mov rdi,QWORD[128+rsp] + + mov rsi,QWORD[136+rsp] + + mov r12,QWORD[144+rsp] + + mov r13,QWORD[152+rsp] + + mov r14,QWORD[160+rsp] + + mov r15,QWORD[168+rsp] + + movdqa xmm6,XMMWORD[176+rsp] + + movdqa xmm7,XMMWORD[192+rsp] + + movdqa xmm8,XMMWORD[208+rsp] + + movdqa xmm9,XMMWORD[224+rsp] + + movdqa xmm10,XMMWORD[240+rsp] + + movdqa xmm11,XMMWORD[256+rsp] + + movdqa xmm12,XMMWORD[272+rsp] + + movdqa xmm13,XMMWORD[288+rsp] + + movdqa xmm14,XMMWORD[304+rsp] + + movdqa xmm15,XMMWORD[320+rsp] + + add rsp,344 + + + + DB 0F3h,0C3h ;repret + +$L$abi_test_trampoline_seh_end: + + +global abi_test_clobber_rax +ALIGN 16 +abi_test_clobber_rax: + xor rax,rax + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_rbx +ALIGN 16 +abi_test_clobber_rbx: + xor rbx,rbx + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_rcx +ALIGN 16 +abi_test_clobber_rcx: + xor rcx,rcx + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_rdx +ALIGN 16 +abi_test_clobber_rdx: + xor rdx,rdx + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_rdi +ALIGN 16 +abi_test_clobber_rdi: + xor rdi,rdi + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_rsi +ALIGN 16 +abi_test_clobber_rsi: + xor rsi,rsi + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_rbp +ALIGN 16 +abi_test_clobber_rbp: + xor rbp,rbp + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_r8 +ALIGN 16 +abi_test_clobber_r8: + xor r8,r8 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_r9 +ALIGN 16 +abi_test_clobber_r9: + xor r9,r9 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_r10 +ALIGN 16 +abi_test_clobber_r10: + xor r10,r10 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_r11 +ALIGN 16 +abi_test_clobber_r11: + xor r11,r11 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_r12 +ALIGN 16 +abi_test_clobber_r12: + xor r12,r12 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_r13 +ALIGN 16 +abi_test_clobber_r13: + xor r13,r13 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_r14 +ALIGN 16 +abi_test_clobber_r14: + xor r14,r14 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_r15 +ALIGN 16 +abi_test_clobber_r15: + xor r15,r15 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_xmm0 +ALIGN 16 +abi_test_clobber_xmm0: + pxor xmm0,xmm0 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_xmm1 +ALIGN 16 +abi_test_clobber_xmm1: + pxor xmm1,xmm1 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_xmm2 +ALIGN 16 +abi_test_clobber_xmm2: + pxor xmm2,xmm2 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_xmm3 +ALIGN 16 +abi_test_clobber_xmm3: + 
pxor xmm3,xmm3 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_xmm4 +ALIGN 16 +abi_test_clobber_xmm4: + pxor xmm4,xmm4 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_xmm5 +ALIGN 16 +abi_test_clobber_xmm5: + pxor xmm5,xmm5 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_xmm6 +ALIGN 16 +abi_test_clobber_xmm6: + pxor xmm6,xmm6 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_xmm7 +ALIGN 16 +abi_test_clobber_xmm7: + pxor xmm7,xmm7 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_xmm8 +ALIGN 16 +abi_test_clobber_xmm8: + pxor xmm8,xmm8 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_xmm9 +ALIGN 16 +abi_test_clobber_xmm9: + pxor xmm9,xmm9 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_xmm10 +ALIGN 16 +abi_test_clobber_xmm10: + pxor xmm10,xmm10 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_xmm11 +ALIGN 16 +abi_test_clobber_xmm11: + pxor xmm11,xmm11 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_xmm12 +ALIGN 16 +abi_test_clobber_xmm12: + pxor xmm12,xmm12 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_xmm13 +ALIGN 16 +abi_test_clobber_xmm13: + pxor xmm13,xmm13 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_xmm14 +ALIGN 16 +abi_test_clobber_xmm14: + pxor xmm14,xmm14 + DB 0F3h,0C3h ;repret + + +global abi_test_clobber_xmm15 +ALIGN 16 +abi_test_clobber_xmm15: + pxor xmm15,xmm15 + DB 0F3h,0C3h ;repret + + + + + +global abi_test_bad_unwind_wrong_register +ALIGN 16 +abi_test_bad_unwind_wrong_register: + +$L$abi_test_bad_unwind_wrong_register_seh_begin: + push r12 + +$L$abi_test_bad_unwind_wrong_register_seh_push_r13: + + + + nop + pop r12 + + DB 0F3h,0C3h ;repret +$L$abi_test_bad_unwind_wrong_register_seh_end: + + + + + + + +global abi_test_bad_unwind_temporary +ALIGN 16 +abi_test_bad_unwind_temporary: + +$L$abi_test_bad_unwind_temporary_seh_begin: + push r12 + +$L$abi_test_bad_unwind_temporary_seh_push_r12: + + mov rax,r12 + inc rax + mov QWORD[rsp],rax + + + + mov QWORD[rsp],r12 + + + pop r12 + + DB 0F3h,0C3h ;repret +$L$abi_test_bad_unwind_temporary_seh_end: + + + + + + + +global abi_test_get_and_clear_direction_flag +abi_test_get_and_clear_direction_flag: + pushfq + pop rax + and rax,0x400 + shr rax,10 + cld + DB 0F3h,0C3h ;repret + + + + + +global abi_test_set_direction_flag +abi_test_set_direction_flag: + std + DB 0F3h,0C3h ;repret + + + + + + +global abi_test_bad_unwind_epilog +ALIGN 16 +abi_test_bad_unwind_epilog: +$L$abi_test_bad_unwind_epilog_seh_begin: + push r12 +$L$abi_test_bad_unwind_epilog_seh_push_r12: + + nop + + + pop r12 + nop + DB 0F3h,0C3h ;repret +$L$abi_test_bad_unwind_epilog_seh_end: + +section .pdata rdata align=4 +ALIGN 4 + + DD $L$abi_test_trampoline_seh_begin wrt ..imagebase + DD $L$abi_test_trampoline_seh_end wrt ..imagebase + DD $L$abi_test_trampoline_seh_info wrt ..imagebase + + DD $L$abi_test_bad_unwind_wrong_register_seh_begin wrt ..imagebase + DD $L$abi_test_bad_unwind_wrong_register_seh_end wrt ..imagebase + DD $L$abi_test_bad_unwind_wrong_register_seh_info wrt ..imagebase + + DD $L$abi_test_bad_unwind_temporary_seh_begin wrt ..imagebase + DD $L$abi_test_bad_unwind_temporary_seh_end wrt ..imagebase + DD $L$abi_test_bad_unwind_temporary_seh_info wrt ..imagebase + + DD $L$abi_test_bad_unwind_epilog_seh_begin wrt ..imagebase + DD $L$abi_test_bad_unwind_epilog_seh_end wrt ..imagebase + DD $L$abi_test_bad_unwind_epilog_seh_info wrt ..imagebase + +section .xdata rdata align=8 +ALIGN 8 +$L$abi_test_trampoline_seh_info: + +DB 1 +DB $L$abi_test_trampoline_seh_prolog_end-$L$abi_test_trampoline_seh_begin +DB 38 +DB 0 +DB 
$L$abi_test_trampoline_seh_prolog_xmm15-$L$abi_test_trampoline_seh_begin +DB 248 + DW 20 +DB $L$abi_test_trampoline_seh_prolog_xmm14-$L$abi_test_trampoline_seh_begin +DB 232 + DW 19 +DB $L$abi_test_trampoline_seh_prolog_xmm13-$L$abi_test_trampoline_seh_begin +DB 216 + DW 18 +DB $L$abi_test_trampoline_seh_prolog_xmm12-$L$abi_test_trampoline_seh_begin +DB 200 + DW 17 +DB $L$abi_test_trampoline_seh_prolog_xmm11-$L$abi_test_trampoline_seh_begin +DB 184 + DW 16 +DB $L$abi_test_trampoline_seh_prolog_xmm10-$L$abi_test_trampoline_seh_begin +DB 168 + DW 15 +DB $L$abi_test_trampoline_seh_prolog_xmm9-$L$abi_test_trampoline_seh_begin +DB 152 + DW 14 +DB $L$abi_test_trampoline_seh_prolog_xmm8-$L$abi_test_trampoline_seh_begin +DB 136 + DW 13 +DB $L$abi_test_trampoline_seh_prolog_xmm7-$L$abi_test_trampoline_seh_begin +DB 120 + DW 12 +DB $L$abi_test_trampoline_seh_prolog_xmm6-$L$abi_test_trampoline_seh_begin +DB 104 + DW 11 +DB $L$abi_test_trampoline_seh_prolog_r15-$L$abi_test_trampoline_seh_begin +DB 244 + DW 21 +DB $L$abi_test_trampoline_seh_prolog_r14-$L$abi_test_trampoline_seh_begin +DB 228 + DW 20 +DB $L$abi_test_trampoline_seh_prolog_r13-$L$abi_test_trampoline_seh_begin +DB 212 + DW 19 +DB $L$abi_test_trampoline_seh_prolog_r12-$L$abi_test_trampoline_seh_begin +DB 196 + DW 18 +DB $L$abi_test_trampoline_seh_prolog_rsi-$L$abi_test_trampoline_seh_begin +DB 100 + DW 17 +DB $L$abi_test_trampoline_seh_prolog_rdi-$L$abi_test_trampoline_seh_begin +DB 116 + DW 16 +DB $L$abi_test_trampoline_seh_prolog_rbp-$L$abi_test_trampoline_seh_begin +DB 84 + DW 15 +DB $L$abi_test_trampoline_seh_prolog_rbx-$L$abi_test_trampoline_seh_begin +DB 52 + DW 14 +DB $L$abi_test_trampoline_seh_prolog_alloc-$L$abi_test_trampoline_seh_begin +DB 1 + DW 43 + + +ALIGN 8 +$L$abi_test_bad_unwind_wrong_register_seh_info: +DB 1 +DB $L$abi_test_bad_unwind_wrong_register_seh_push_r13-$L$abi_test_bad_unwind_wrong_register_seh_begin +DB 1 +DB 0 + +DB $L$abi_test_bad_unwind_wrong_register_seh_push_r13-$L$abi_test_bad_unwind_wrong_register_seh_begin +DB 208 + +ALIGN 8 +$L$abi_test_bad_unwind_temporary_seh_info: +DB 1 +DB $L$abi_test_bad_unwind_temporary_seh_push_r12-$L$abi_test_bad_unwind_temporary_seh_begin +DB 1 +DB 0 + +DB $L$abi_test_bad_unwind_temporary_seh_push_r12-$L$abi_test_bad_unwind_temporary_seh_begin +DB 192 + +ALIGN 8 +$L$abi_test_bad_unwind_epilog_seh_info: +DB 1 +DB $L$abi_test_bad_unwind_epilog_seh_push_r12-$L$abi_test_bad_unwind_epilog_seh_begin +DB 1 +DB 0 + +DB $L$abi_test_bad_unwind_epilog_seh_push_r12-$L$abi_test_bad_unwind_epilog_seh_begin +DB 192 diff --git a/packager/third_party/boringssl/win-x86_64/crypto/third_party/sike/asm/fp-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/third_party/sike/asm/fp-x86_64.asm new file mode 100644 index 0000000000..fbfef1be13 --- /dev/null +++ b/packager/third_party/boringssl/win-x86_64/crypto/third_party/sike/asm/fp-x86_64.asm @@ -0,0 +1,1951 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif +section .text code align=64 + + + +$L$p434x2: + DQ 0xFFFFFFFFFFFFFFFE + DQ 0xFFFFFFFFFFFFFFFF + DQ 0xFB82ECF5C5FFFFFF + DQ 0xF78CB8F062B15D47 + DQ 0xD9F8BFAD038A40AC + DQ 0x0004683E4E2EE688 + + +$L$p434p1: + DQ 0xFDC1767AE3000000 + DQ 0x7BC65C783158AEA3 + DQ 0x6CFC5FD681C52056 + DQ 0x0002341F27177344 + +EXTERN OPENSSL_ia32cap_P + +global sike_fpadd + +sike_fpadd: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sike_fpadd: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push r12 + + + push r13 + + + push r14 + + + + xor rax,rax + + mov r8,QWORD[rdi] + add r8,QWORD[rsi] + mov r9,QWORD[8+rdi] + adc r9,QWORD[8+rsi] + mov r10,QWORD[16+rdi] + adc r10,QWORD[16+rsi] + mov r11,QWORD[24+rdi] + adc r11,QWORD[24+rsi] + mov r12,QWORD[32+rdi] + adc r12,QWORD[32+rsi] + mov r13,QWORD[40+rdi] + adc r13,QWORD[40+rsi] + mov r14,QWORD[48+rdi] + adc r14,QWORD[48+rsi] + + mov rcx,QWORD[$L$p434x2] + sub r8,rcx + mov rcx,QWORD[((8+$L$p434x2))] + sbb r9,rcx + sbb r10,rcx + mov rcx,QWORD[((16+$L$p434x2))] + sbb r11,rcx + mov rcx,QWORD[((24+$L$p434x2))] + sbb r12,rcx + mov rcx,QWORD[((32+$L$p434x2))] + sbb r13,rcx + mov rcx,QWORD[((40+$L$p434x2))] + sbb r14,rcx + + sbb rax,0 + + mov rdi,QWORD[$L$p434x2] + and rdi,rax + mov rsi,QWORD[((8+$L$p434x2))] + and rsi,rax + mov rcx,QWORD[((16+$L$p434x2))] + and rcx,rax + + add r8,rdi + mov QWORD[rdx],r8 + adc r9,rsi + mov QWORD[8+rdx],r9 + adc r10,rsi + mov QWORD[16+rdx],r10 + adc r11,rcx + mov QWORD[24+rdx],r11 + + setc cl + mov r8,QWORD[((24+$L$p434x2))] + and r8,rax + mov r9,QWORD[((32+$L$p434x2))] + and r9,rax + mov r10,QWORD[((40+$L$p434x2))] + and r10,rax + bt rcx,0 + + adc r12,r8 + mov QWORD[32+rdx],r12 + adc r13,r9 + mov QWORD[40+rdx],r13 + adc r14,r10 + mov QWORD[48+rdx],r14 + + pop r14 + + pop r13 + + pop r12 + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +global sike_cswap_asm + +sike_cswap_asm: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sike_cswap_asm: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + + movq xmm3,rdx + + + + + + pshufd xmm3,xmm3,68 + + movdqu xmm0,XMMWORD[rdi] + movdqu xmm1,XMMWORD[rsi] + movdqa xmm2,xmm1 + pxor xmm2,xmm0 + pand xmm2,xmm3 + pxor xmm0,xmm2 + pxor xmm1,xmm2 + movdqu XMMWORD[rdi],xmm0 + movdqu XMMWORD[rsi],xmm1 + + movdqu xmm0,XMMWORD[16+rdi] + movdqu xmm1,XMMWORD[16+rsi] + movdqa xmm2,xmm1 + pxor xmm2,xmm0 + pand xmm2,xmm3 + pxor xmm0,xmm2 + pxor xmm1,xmm2 + movdqu XMMWORD[16+rdi],xmm0 + movdqu XMMWORD[16+rsi],xmm1 + + movdqu xmm0,XMMWORD[32+rdi] + movdqu xmm1,XMMWORD[32+rsi] + movdqa xmm2,xmm1 + pxor xmm2,xmm0 + pand xmm2,xmm3 + pxor xmm0,xmm2 + pxor xmm1,xmm2 + movdqu XMMWORD[32+rdi],xmm0 + movdqu XMMWORD[32+rsi],xmm1 + + movdqu xmm0,XMMWORD[48+rdi] + movdqu xmm1,XMMWORD[48+rsi] + movdqa xmm2,xmm1 + pxor xmm2,xmm0 + pand xmm2,xmm3 + pxor xmm0,xmm2 + pxor xmm1,xmm2 + movdqu XMMWORD[48+rdi],xmm0 + movdqu XMMWORD[48+rsi],xmm1 + + movdqu xmm0,XMMWORD[64+rdi] + movdqu xmm1,XMMWORD[64+rsi] + movdqa xmm2,xmm1 + pxor xmm2,xmm0 + pand xmm2,xmm3 + pxor xmm0,xmm2 + pxor xmm1,xmm2 + movdqu XMMWORD[64+rdi],xmm0 + movdqu XMMWORD[64+rsi],xmm1 + + movdqu xmm0,XMMWORD[80+rdi] + movdqu xmm1,XMMWORD[80+rsi] + movdqa xmm2,xmm1 + pxor xmm2,xmm0 + pand xmm2,xmm3 + pxor xmm0,xmm2 + pxor xmm1,xmm2 + movdqu XMMWORD[80+rdi],xmm0 + movdqu XMMWORD[80+rsi],xmm1 + + movdqu 
xmm0,XMMWORD[96+rdi] + movdqu xmm1,XMMWORD[96+rsi] + movdqa xmm2,xmm1 + pxor xmm2,xmm0 + pand xmm2,xmm3 + pxor xmm0,xmm2 + pxor xmm1,xmm2 + movdqu XMMWORD[96+rdi],xmm0 + movdqu XMMWORD[96+rsi],xmm1 + + movdqu xmm0,XMMWORD[112+rdi] + movdqu xmm1,XMMWORD[112+rsi] + movdqa xmm2,xmm1 + pxor xmm2,xmm0 + pand xmm2,xmm3 + pxor xmm0,xmm2 + pxor xmm1,xmm2 + movdqu XMMWORD[112+rdi],xmm0 + movdqu XMMWORD[112+rsi],xmm1 + + movdqu xmm0,XMMWORD[128+rdi] + movdqu xmm1,XMMWORD[128+rsi] + movdqa xmm2,xmm1 + pxor xmm2,xmm0 + pand xmm2,xmm3 + pxor xmm0,xmm2 + pxor xmm1,xmm2 + movdqu XMMWORD[128+rdi],xmm0 + movdqu XMMWORD[128+rsi],xmm1 + + movdqu xmm0,XMMWORD[144+rdi] + movdqu xmm1,XMMWORD[144+rsi] + movdqa xmm2,xmm1 + pxor xmm2,xmm0 + pand xmm2,xmm3 + pxor xmm0,xmm2 + pxor xmm1,xmm2 + movdqu XMMWORD[144+rdi],xmm0 + movdqu XMMWORD[144+rsi],xmm1 + + movdqu xmm0,XMMWORD[160+rdi] + movdqu xmm1,XMMWORD[160+rsi] + movdqa xmm2,xmm1 + pxor xmm2,xmm0 + pand xmm2,xmm3 + pxor xmm0,xmm2 + pxor xmm1,xmm2 + movdqu XMMWORD[160+rdi],xmm0 + movdqu XMMWORD[160+rsi],xmm1 + + movdqu xmm0,XMMWORD[176+rdi] + movdqu xmm1,XMMWORD[176+rsi] + movdqa xmm2,xmm1 + pxor xmm2,xmm0 + pand xmm2,xmm3 + pxor xmm0,xmm2 + pxor xmm1,xmm2 + movdqu XMMWORD[176+rdi],xmm0 + movdqu XMMWORD[176+rsi],xmm1 + + movdqu xmm0,XMMWORD[192+rdi] + movdqu xmm1,XMMWORD[192+rsi] + movdqa xmm2,xmm1 + pxor xmm2,xmm0 + pand xmm2,xmm3 + pxor xmm0,xmm2 + pxor xmm1,xmm2 + movdqu XMMWORD[192+rdi],xmm0 + movdqu XMMWORD[192+rsi],xmm1 + + movdqu xmm0,XMMWORD[208+rdi] + movdqu xmm1,XMMWORD[208+rsi] + movdqa xmm2,xmm1 + pxor xmm2,xmm0 + pand xmm2,xmm3 + pxor xmm0,xmm2 + pxor xmm1,xmm2 + movdqu XMMWORD[208+rdi],xmm0 + movdqu XMMWORD[208+rsi],xmm1 + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +global sike_fpsub + +sike_fpsub: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sike_fpsub: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push r12 + + + push r13 + + + push r14 + + + + xor rax,rax + + mov r8,QWORD[rdi] + sub r8,QWORD[rsi] + mov r9,QWORD[8+rdi] + sbb r9,QWORD[8+rsi] + mov r10,QWORD[16+rdi] + sbb r10,QWORD[16+rsi] + mov r11,QWORD[24+rdi] + sbb r11,QWORD[24+rsi] + mov r12,QWORD[32+rdi] + sbb r12,QWORD[32+rsi] + mov r13,QWORD[40+rdi] + sbb r13,QWORD[40+rsi] + mov r14,QWORD[48+rdi] + sbb r14,QWORD[48+rsi] + + sbb rax,0x0 + + mov rdi,QWORD[$L$p434x2] + and rdi,rax + mov rsi,QWORD[((8+$L$p434x2))] + and rsi,rax + mov rcx,QWORD[((16+$L$p434x2))] + and rcx,rax + + add r8,rdi + mov QWORD[rdx],r8 + adc r9,rsi + mov QWORD[8+rdx],r9 + adc r10,rsi + mov QWORD[16+rdx],r10 + adc r11,rcx + mov QWORD[24+rdx],r11 + + setc cl + mov r8,QWORD[((24+$L$p434x2))] + and r8,rax + mov r9,QWORD[((32+$L$p434x2))] + and r9,rax + mov r10,QWORD[((40+$L$p434x2))] + and r10,rax + bt rcx,0x0 + + adc r12,r8 + adc r13,r9 + adc r14,r10 + mov QWORD[32+rdx],r12 + mov QWORD[40+rdx],r13 + mov QWORD[48+rdx],r14 + + pop r14 + + pop r13 + + pop r12 + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +global sike_mpadd_asm + +sike_mpadd_asm: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sike_mpadd_asm: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + mov r8,QWORD[rdi]; + mov r9,QWORD[8+rdi] + mov r10,QWORD[16+rdi] + mov r11,QWORD[24+rdi] + mov rcx,QWORD[32+rdi] + add r8,QWORD[rsi] + adc r9,QWORD[8+rsi] + adc r10,QWORD[16+rsi] + adc r11,QWORD[24+rsi] + adc rcx,QWORD[32+rsi] + mov QWORD[rdx],r8 + mov QWORD[8+rdx],r9 + mov QWORD[16+rdx],r10 + mov 
QWORD[24+rdx],r11 + mov QWORD[32+rdx],rcx + + mov r8,QWORD[40+rdi] + mov r9,QWORD[48+rdi] + adc r8,QWORD[40+rsi] + adc r9,QWORD[48+rsi] + mov QWORD[40+rdx],r8 + mov QWORD[48+rdx],r9 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +global sike_mpsubx2_asm + +sike_mpsubx2_asm: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sike_mpsubx2_asm: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + xor rax,rax + + mov r8,QWORD[rdi] + mov r9,QWORD[8+rdi] + mov r10,QWORD[16+rdi] + mov r11,QWORD[24+rdi] + mov rcx,QWORD[32+rdi] + sub r8,QWORD[rsi] + sbb r9,QWORD[8+rsi] + sbb r10,QWORD[16+rsi] + sbb r11,QWORD[24+rsi] + sbb rcx,QWORD[32+rsi] + mov QWORD[rdx],r8 + mov QWORD[8+rdx],r9 + mov QWORD[16+rdx],r10 + mov QWORD[24+rdx],r11 + mov QWORD[32+rdx],rcx + + mov r8,QWORD[40+rdi] + mov r9,QWORD[48+rdi] + mov r10,QWORD[56+rdi] + mov r11,QWORD[64+rdi] + mov rcx,QWORD[72+rdi] + sbb r8,QWORD[40+rsi] + sbb r9,QWORD[48+rsi] + sbb r10,QWORD[56+rsi] + sbb r11,QWORD[64+rsi] + sbb rcx,QWORD[72+rsi] + mov QWORD[40+rdx],r8 + mov QWORD[48+rdx],r9 + mov QWORD[56+rdx],r10 + mov QWORD[64+rdx],r11 + mov QWORD[72+rdx],rcx + + mov r8,QWORD[80+rdi] + mov r9,QWORD[88+rdi] + mov r10,QWORD[96+rdi] + mov r11,QWORD[104+rdi] + sbb r8,QWORD[80+rsi] + sbb r9,QWORD[88+rsi] + sbb r10,QWORD[96+rsi] + sbb r11,QWORD[104+rsi] + sbb rax,0x0 + mov QWORD[80+rdx],r8 + mov QWORD[88+rdx],r9 + mov QWORD[96+rdx],r10 + mov QWORD[104+rdx],r11 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +global sike_mpdblsubx2_asm + +sike_mpdblsubx2_asm: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sike_mpdblsubx2_asm: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push r12 + + + push r13 + + + + xor rax,rax + + + mov r8,QWORD[rdx] + mov r9,QWORD[8+rdx] + mov r10,QWORD[16+rdx] + mov r11,QWORD[24+rdx] + mov r12,QWORD[32+rdx] + mov r13,QWORD[40+rdx] + mov rcx,QWORD[48+rdx] + sub r8,QWORD[rdi] + sbb r9,QWORD[8+rdi] + sbb r10,QWORD[16+rdi] + sbb r11,QWORD[24+rdi] + sbb r12,QWORD[32+rdi] + sbb r13,QWORD[40+rdi] + sbb rcx,QWORD[48+rdi] + adc rax,0x0 + + + sub r8,QWORD[rsi] + sbb r9,QWORD[8+rsi] + sbb r10,QWORD[16+rsi] + sbb r11,QWORD[24+rsi] + sbb r12,QWORD[32+rsi] + sbb r13,QWORD[40+rsi] + sbb rcx,QWORD[48+rsi] + adc rax,0x0 + + + mov QWORD[rdx],r8 + mov QWORD[8+rdx],r9 + mov QWORD[16+rdx],r10 + mov QWORD[24+rdx],r11 + mov QWORD[32+rdx],r12 + mov QWORD[40+rdx],r13 + mov QWORD[48+rdx],rcx + + + mov r8,QWORD[56+rdx] + mov r9,QWORD[64+rdx] + mov r10,QWORD[72+rdx] + mov r11,QWORD[80+rdx] + mov r12,QWORD[88+rdx] + mov r13,QWORD[96+rdx] + mov rcx,QWORD[104+rdx] + + sub r8,rax + sbb r8,QWORD[56+rdi] + sbb r9,QWORD[64+rdi] + sbb r10,QWORD[72+rdi] + sbb r11,QWORD[80+rdi] + sbb r12,QWORD[88+rdi] + sbb r13,QWORD[96+rdi] + sbb rcx,QWORD[104+rdi] + + + sub r8,QWORD[56+rsi] + sbb r9,QWORD[64+rsi] + sbb r10,QWORD[72+rsi] + sbb r11,QWORD[80+rsi] + sbb r12,QWORD[88+rsi] + sbb r13,QWORD[96+rsi] + sbb rcx,QWORD[104+rsi] + + + mov QWORD[56+rdx],r8 + mov QWORD[64+rdx],r9 + mov QWORD[72+rdx],r10 + mov QWORD[80+rdx],r11 + mov QWORD[88+rdx],r12 + mov QWORD[96+rdx],r13 + mov QWORD[104+rdx],rcx + + pop r13 + + pop r12 + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + + +$L$rdc_bdw: + + + + + + + + + xor rax,rax + mov rdx,QWORD[((0+0))+rdi] + mulx r9,r8,QWORD[((0+$L$p434p1))] + mulx r10,r12,QWORD[((8+$L$p434p1))] + mulx r11,r13,QWORD[((16+$L$p434p1))] + + adox r9,r12 + adox r10,r13 + + 
mulx r12,r13,QWORD[((24+$L$p434p1))] + adox r11,r13 + adox r12,rax + + xor rax,rax + mov rdx,QWORD[((0+8))+rdi] + mulx rcx,r13,QWORD[((0+$L$p434p1))] + adcx r9,r13 + adcx r10,rcx + + mulx r13,rcx,QWORD[((8+$L$p434p1))] + adcx r11,r13 + adox r10,rcx + + mulx r13,rcx,QWORD[((16+$L$p434p1))] + adcx r12,r13 + adox r11,rcx + + mulx r13,rcx,QWORD[((24+$L$p434p1))] + adcx r13,rax + adox r12,rcx + adox r13,rax + + xor rcx,rcx + add r8,QWORD[24+rdi] + adc r9,QWORD[32+rdi] + adc r10,QWORD[40+rdi] + adc r11,QWORD[48+rdi] + adc r12,QWORD[56+rdi] + adc r13,QWORD[64+rdi] + adc rcx,QWORD[72+rdi] + mov QWORD[24+rdi],r8 + mov QWORD[32+rdi],r9 + mov QWORD[40+rdi],r10 + mov QWORD[48+rdi],r11 + mov QWORD[56+rdi],r12 + mov QWORD[64+rdi],r13 + mov QWORD[72+rdi],rcx + mov r8,QWORD[80+rdi] + mov r9,QWORD[88+rdi] + mov r10,QWORD[96+rdi] + mov r11,QWORD[104+rdi] + adc r8,0x0 + adc r9,0x0 + adc r10,0x0 + adc r11,0x0 + mov QWORD[80+rdi],r8 + mov QWORD[88+rdi],r9 + mov QWORD[96+rdi],r10 + mov QWORD[104+rdi],r11 + + xor rax,rax + mov rdx,QWORD[((16+0))+rdi] + mulx r9,r8,QWORD[((0+$L$p434p1))] + mulx r10,r12,QWORD[((8+$L$p434p1))] + mulx r11,r13,QWORD[((16+$L$p434p1))] + + adox r9,r12 + adox r10,r13 + + mulx r12,r13,QWORD[((24+$L$p434p1))] + adox r11,r13 + adox r12,rax + + xor rax,rax + mov rdx,QWORD[((16+8))+rdi] + mulx rcx,r13,QWORD[((0+$L$p434p1))] + adcx r9,r13 + adcx r10,rcx + + mulx r13,rcx,QWORD[((8+$L$p434p1))] + adcx r11,r13 + adox r10,rcx + + mulx r13,rcx,QWORD[((16+$L$p434p1))] + adcx r12,r13 + adox r11,rcx + + mulx r13,rcx,QWORD[((24+$L$p434p1))] + adcx r13,rax + adox r12,rcx + adox r13,rax + + xor rcx,rcx + add r8,QWORD[40+rdi] + adc r9,QWORD[48+rdi] + adc r10,QWORD[56+rdi] + adc r11,QWORD[64+rdi] + adc r12,QWORD[72+rdi] + adc r13,QWORD[80+rdi] + adc rcx,QWORD[88+rdi] + mov QWORD[40+rdi],r8 + mov QWORD[48+rdi],r9 + mov QWORD[56+rdi],r10 + mov QWORD[64+rdi],r11 + mov QWORD[72+rdi],r12 + mov QWORD[80+rdi],r13 + mov QWORD[88+rdi],rcx + mov r8,QWORD[96+rdi] + mov r9,QWORD[104+rdi] + adc r8,0x0 + adc r9,0x0 + mov QWORD[96+rdi],r8 + mov QWORD[104+rdi],r9 + + xor rax,rax + mov rdx,QWORD[((32+0))+rdi] + mulx r9,r8,QWORD[((0+$L$p434p1))] + mulx r10,r12,QWORD[((8+$L$p434p1))] + mulx r11,r13,QWORD[((16+$L$p434p1))] + + adox r9,r12 + adox r10,r13 + + mulx r12,r13,QWORD[((24+$L$p434p1))] + adox r11,r13 + adox r12,rax + + xor rax,rax + mov rdx,QWORD[((32+8))+rdi] + mulx rcx,r13,QWORD[((0+$L$p434p1))] + adcx r9,r13 + adcx r10,rcx + + mulx r13,rcx,QWORD[((8+$L$p434p1))] + adcx r11,r13 + adox r10,rcx + + mulx r13,rcx,QWORD[((16+$L$p434p1))] + adcx r12,r13 + adox r11,rcx + + mulx r13,rcx,QWORD[((24+$L$p434p1))] + adcx r13,rax + adox r12,rcx + adox r13,rax + + xor rcx,rcx + add r8,QWORD[56+rdi] + adc r9,QWORD[64+rdi] + adc r10,QWORD[72+rdi] + adc r11,QWORD[80+rdi] + adc r12,QWORD[88+rdi] + adc r13,QWORD[96+rdi] + adc rcx,QWORD[104+rdi] + mov QWORD[rsi],r8 + mov QWORD[8+rsi],r9 + mov QWORD[72+rdi],r10 + mov QWORD[80+rdi],r11 + mov QWORD[88+rdi],r12 + mov QWORD[96+rdi],r13 + mov QWORD[104+rdi],rcx + + xor rax,rax + mov rdx,QWORD[48+rdi] + mulx r9,r8,QWORD[((0+$L$p434p1))] + mulx r10,r12,QWORD[((8+$L$p434p1))] + mulx r11,r13,QWORD[((16+$L$p434p1))] + + adox r9,r12 + adox r10,r13 + + mulx r12,r13,QWORD[((24+$L$p434p1))] + adox r11,r13 + adox r12,rax + + add r8,QWORD[72+rdi] + adc r9,QWORD[80+rdi] + adc r10,QWORD[88+rdi] + adc r11,QWORD[96+rdi] + adc r12,QWORD[104+rdi] + mov QWORD[16+rsi],r8 + mov QWORD[24+rsi],r9 + mov QWORD[32+rsi],r10 + mov QWORD[40+rsi],r11 + mov QWORD[48+rsi],r12 + + + pop r15 + + + pop r14 + + + pop r13 + + 
+ pop r12 + + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +global sike_fprdc + +sike_fprdc: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sike_fprdc: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push r12 + + + push r13 + + + push r14 + + + push r15 + + + + + + lea rcx,[OPENSSL_ia32cap_P] + mov rcx,QWORD[8+rcx] + and ecx,0x80100 + cmp ecx,0x80100 + je NEAR $L$rdc_bdw + + + + + mov r14,QWORD[((0+0))+rdi] + mov rax,QWORD[((0+$L$p434p1))] + mul r14 + xor r10,r10 + mov r8,rax + mov r9,rdx + + + mov rax,QWORD[((8+$L$p434p1))] + mul r14 + xor r11,r11 + add r9,rax + adc r10,rdx + + + mov rcx,QWORD[((0+8))+rdi] + mov rax,QWORD[((0+$L$p434p1))] + mul rcx + add r9,rax + adc r10,rdx + adc r11,0x0 + + + xor r12,r12 + mov rax,QWORD[((16+$L$p434p1))] + mul r14 + add r10,rax + adc r11,rdx + adc r12,0x0 + + + mov rax,QWORD[((8+$L$p434p1))] + mul rcx + add r10,rax + adc r11,rdx + adc r12,0x0 + + + mov rax,QWORD[((24+$L$p434p1))] + mul r14 + xor r13,r13 + add r11,rax + adc r12,rdx + adc r13,0x0 + + + mov rax,QWORD[((16+$L$p434p1))] + mul rcx + add r11,rax + adc r12,rdx + adc r13,0x0 + + + mov rax,QWORD[((24+$L$p434p1))] + mul rcx + add r12,rax + adc r13,rdx + + + xor rcx,rcx + add r8,QWORD[24+rdi] + adc r9,QWORD[32+rdi] + adc r10,QWORD[40+rdi] + adc r11,QWORD[48+rdi] + adc r12,QWORD[56+rdi] + adc r13,QWORD[64+rdi] + adc rcx,QWORD[72+rdi] + mov QWORD[24+rdi],r8 + mov QWORD[32+rdi],r9 + mov QWORD[40+rdi],r10 + mov QWORD[48+rdi],r11 + mov QWORD[56+rdi],r12 + mov QWORD[64+rdi],r13 + mov QWORD[72+rdi],rcx + mov r8,QWORD[80+rdi] + mov r9,QWORD[88+rdi] + mov r10,QWORD[96+rdi] + mov r11,QWORD[104+rdi] + adc r8,0x0 + adc r9,0x0 + adc r10,0x0 + adc r11,0x0 + mov QWORD[80+rdi],r8 + mov QWORD[88+rdi],r9 + mov QWORD[96+rdi],r10 + mov QWORD[104+rdi],r11 + + + mov r14,QWORD[((16+0))+rdi] + mov rax,QWORD[((0+$L$p434p1))] + mul r14 + xor r10,r10 + mov r8,rax + mov r9,rdx + + + mov rax,QWORD[((8+$L$p434p1))] + mul r14 + xor r11,r11 + add r9,rax + adc r10,rdx + + + mov rcx,QWORD[((16+8))+rdi] + mov rax,QWORD[((0+$L$p434p1))] + mul rcx + add r9,rax + adc r10,rdx + adc r11,0x0 + + + xor r12,r12 + mov rax,QWORD[((16+$L$p434p1))] + mul r14 + add r10,rax + adc r11,rdx + adc r12,0x0 + + + mov rax,QWORD[((8+$L$p434p1))] + mul rcx + add r10,rax + adc r11,rdx + adc r12,0x0 + + + mov rax,QWORD[((24+$L$p434p1))] + mul r14 + xor r13,r13 + add r11,rax + adc r12,rdx + adc r13,0x0 + + + mov rax,QWORD[((16+$L$p434p1))] + mul rcx + add r11,rax + adc r12,rdx + adc r13,0x0 + + + mov rax,QWORD[((24+$L$p434p1))] + mul rcx + add r12,rax + adc r13,rdx + + + xor rcx,rcx + add r8,QWORD[40+rdi] + adc r9,QWORD[48+rdi] + adc r10,QWORD[56+rdi] + adc r11,QWORD[64+rdi] + adc r12,QWORD[72+rdi] + adc r13,QWORD[80+rdi] + adc rcx,QWORD[88+rdi] + mov QWORD[40+rdi],r8 + mov QWORD[48+rdi],r9 + mov QWORD[56+rdi],r10 + mov QWORD[64+rdi],r11 + mov QWORD[72+rdi],r12 + mov QWORD[80+rdi],r13 + mov QWORD[88+rdi],rcx + mov r8,QWORD[96+rdi] + mov r9,QWORD[104+rdi] + adc r8,0x0 + adc r9,0x0 + mov QWORD[96+rdi],r8 + mov QWORD[104+rdi],r9 + + + mov r14,QWORD[((32+0))+rdi] + mov rax,QWORD[((0+$L$p434p1))] + mul r14 + xor r10,r10 + mov r8,rax + mov r9,rdx + + + mov rax,QWORD[((8+$L$p434p1))] + mul r14 + xor r11,r11 + add r9,rax + adc r10,rdx + + + mov rcx,QWORD[((32+8))+rdi] + mov rax,QWORD[((0+$L$p434p1))] + mul rcx + add r9,rax + adc r10,rdx + adc r11,0x0 + + + xor r12,r12 + mov rax,QWORD[((16+$L$p434p1))] + mul r14 + add r10,rax + adc r11,rdx + adc r12,0x0 + + + mov 
rax,QWORD[((8+$L$p434p1))] + mul rcx + add r10,rax + adc r11,rdx + adc r12,0x0 + + + mov rax,QWORD[((24+$L$p434p1))] + mul r14 + xor r13,r13 + add r11,rax + adc r12,rdx + adc r13,0x0 + + + mov rax,QWORD[((16+$L$p434p1))] + mul rcx + add r11,rax + adc r12,rdx + adc r13,0x0 + + + mov rax,QWORD[((24+$L$p434p1))] + mul rcx + add r12,rax + adc r13,rdx + + + xor rcx,rcx + add r8,QWORD[56+rdi] + adc r9,QWORD[64+rdi] + adc r10,QWORD[72+rdi] + adc r11,QWORD[80+rdi] + adc r12,QWORD[88+rdi] + adc r13,QWORD[96+rdi] + adc rcx,QWORD[104+rdi] + mov QWORD[rsi],r8 + mov QWORD[8+rsi],r9 + mov QWORD[72+rdi],r10 + mov QWORD[80+rdi],r11 + mov QWORD[88+rdi],r12 + mov QWORD[96+rdi],r13 + mov QWORD[104+rdi],rcx + + mov r13,QWORD[48+rdi] + + xor r10,r10 + mov rax,QWORD[((0+$L$p434p1))] + mul r13 + mov r8,rax + mov r9,rdx + + xor r11,r11 + mov rax,QWORD[((8+$L$p434p1))] + mul r13 + add r9,rax + adc r10,rdx + + xor r12,r12 + mov rax,QWORD[((16+$L$p434p1))] + mul r13 + add r10,rax + adc r11,rdx + + mov rax,QWORD[((24+$L$p434p1))] + mul r13 + add r11,rax + adc r12,rdx + + add r8,QWORD[72+rdi] + adc r9,QWORD[80+rdi] + adc r10,QWORD[88+rdi] + adc r11,QWORD[96+rdi] + adc r12,QWORD[104+rdi] + mov QWORD[16+rsi],r8 + mov QWORD[24+rsi],r9 + mov QWORD[32+rsi],r10 + mov QWORD[40+rsi],r11 + mov QWORD[48+rsi],r12 + + + pop r15 + + pop r14 + + pop r13 + + pop r12 + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$mul_bdw: + + + + + + + + + + mov rcx,rdx + xor rax,rax + + + mov r8,QWORD[rdi] + mov r9,QWORD[8+rdi] + mov r10,QWORD[16+rdi] + mov r11,QWORD[24+rdi] + + push rbx + + + push rbp + + + sub rsp,96 + + + add r8,QWORD[32+rdi] + adc r9,QWORD[40+rdi] + adc r10,QWORD[48+rdi] + adc r11,0x0 + sbb rax,0x0 + mov QWORD[rsp],r8 + mov QWORD[8+rsp],r9 + mov QWORD[16+rsp],r10 + mov QWORD[24+rsp],r11 + + + xor rbx,rbx + mov r12,QWORD[rsi] + mov r13,QWORD[8+rsi] + mov r14,QWORD[16+rsi] + mov r15,QWORD[24+rsi] + add r12,QWORD[32+rsi] + adc r13,QWORD[40+rsi] + adc r14,QWORD[48+rsi] + adc r15,0x0 + sbb rbx,0x0 + mov QWORD[32+rsp],r12 + mov QWORD[40+rsp],r13 + mov QWORD[48+rsp],r14 + mov QWORD[56+rsp],r15 + + + and r12,rax + and r13,rax + and r14,rax + and r15,rax + + + and r8,rbx + and r9,rbx + and r10,rbx + and r11,rbx + + + add r8,r12 + adc r9,r13 + adc r10,r14 + adc r11,r15 + mov QWORD[64+rsp],r8 + mov QWORD[72+rsp],r9 + mov QWORD[80+rsp],r10 + mov QWORD[88+rsp],r11 + + + mov rdx,QWORD[((0+0))+rsp] + mulx r8,r9,QWORD[((32+0))+rsp] + mov QWORD[((0+0))+rsp],r9 + mulx r9,r10,QWORD[((32+8))+rsp] + xor rax,rax + adox r8,r10 + mulx r10,r11,QWORD[((32+16))+rsp] + adox r9,r11 + mulx r11,r12,QWORD[((32+24))+rsp] + adox r10,r12 + + mov rdx,QWORD[((0+8))+rsp] + mulx r13,r12,QWORD[((32+0))+rsp] + adox r11,rax + xor rax,rax + mulx r14,r15,QWORD[((32+8))+rsp] + adox r12,r8 + mov QWORD[((0+8))+rsp],r12 + adcx r13,r15 + mulx r15,rbx,QWORD[((32+16))+rsp] + adcx r14,rbx + adox r13,r9 + mulx rbx,rbp,QWORD[((32+24))+rsp] + adcx r15,rbp + adcx rbx,rax + adox r14,r10 + + mov rdx,QWORD[((0+16))+rsp] + mulx r9,r8,QWORD[((32+0))+rsp] + adox r15,r11 + adox rbx,rax + xor rax,rax + mulx r10,r11,QWORD[((32+8))+rsp] + adox r8,r13 + mov QWORD[((0+16))+rsp],r8 + adcx r9,r11 + mulx r11,r12,QWORD[((32+16))+rsp] + adcx r10,r12 + adox r9,r14 + mulx r12,rbp,QWORD[((32+24))+rsp] + adcx r11,rbp + adcx r12,rax + + adox r10,r15 + adox r11,rbx + adox r12,rax + + mov rdx,QWORD[((0+24))+rsp] + mulx r13,r8,QWORD[((32+0))+rsp] + xor rax,rax + mulx r14,r15,QWORD[((32+8))+rsp] + adcx r13,r15 + adox r9,r8 + mulx r15,rbx,QWORD[((32+16))+rsp] + 
adcx r14,rbx + adox r10,r13 + mulx rbx,rbp,QWORD[((32+24))+rsp] + adcx r15,rbp + adcx rbx,rax + adox r11,r14 + adox r12,r15 + adox rbx,rax + mov QWORD[((0+24))+rsp],r9 + mov QWORD[((0+32))+rsp],r10 + mov QWORD[((0+40))+rsp],r11 + mov QWORD[((0+48))+rsp],r12 + mov QWORD[((0+56))+rsp],rbx + + + + mov rdx,QWORD[((0+0))+rdi] + mulx r8,r9,QWORD[((0+0))+rsi] + mov QWORD[((0+0))+rcx],r9 + mulx r9,r10,QWORD[((0+8))+rsi] + xor rax,rax + adox r8,r10 + mulx r10,r11,QWORD[((0+16))+rsi] + adox r9,r11 + mulx r11,r12,QWORD[((0+24))+rsi] + adox r10,r12 + + mov rdx,QWORD[((0+8))+rdi] + mulx r13,r12,QWORD[((0+0))+rsi] + adox r11,rax + xor rax,rax + mulx r14,r15,QWORD[((0+8))+rsi] + adox r12,r8 + mov QWORD[((0+8))+rcx],r12 + adcx r13,r15 + mulx r15,rbx,QWORD[((0+16))+rsi] + adcx r14,rbx + adox r13,r9 + mulx rbx,rbp,QWORD[((0+24))+rsi] + adcx r15,rbp + adcx rbx,rax + adox r14,r10 + + mov rdx,QWORD[((0+16))+rdi] + mulx r9,r8,QWORD[((0+0))+rsi] + adox r15,r11 + adox rbx,rax + xor rax,rax + mulx r10,r11,QWORD[((0+8))+rsi] + adox r8,r13 + mov QWORD[((0+16))+rcx],r8 + adcx r9,r11 + mulx r11,r12,QWORD[((0+16))+rsi] + adcx r10,r12 + adox r9,r14 + mulx r12,rbp,QWORD[((0+24))+rsi] + adcx r11,rbp + adcx r12,rax + + adox r10,r15 + adox r11,rbx + adox r12,rax + + mov rdx,QWORD[((0+24))+rdi] + mulx r13,r8,QWORD[((0+0))+rsi] + xor rax,rax + mulx r14,r15,QWORD[((0+8))+rsi] + adcx r13,r15 + adox r9,r8 + mulx r15,rbx,QWORD[((0+16))+rsi] + adcx r14,rbx + adox r10,r13 + mulx rbx,rbp,QWORD[((0+24))+rsi] + adcx r15,rbp + adcx rbx,rax + adox r11,r14 + adox r12,r15 + adox rbx,rax + mov QWORD[((0+24))+rcx],r9 + mov QWORD[((0+32))+rcx],r10 + mov QWORD[((0+40))+rcx],r11 + mov QWORD[((0+48))+rcx],r12 + mov QWORD[((0+56))+rcx],rbx + + + + mov rdx,QWORD[((32+0))+rdi] + mulx r8,r9,QWORD[((32+0))+rsi] + mov QWORD[((64+0))+rcx],r9 + mulx r9,r10,QWORD[((32+8))+rsi] + xor rax,rax + adox r8,r10 + mulx r10,r11,QWORD[((32+16))+rsi] + adox r9,r11 + + mov rdx,QWORD[((32+8))+rdi] + mulx r11,r12,QWORD[((32+0))+rsi] + adox r10,rax + xor rax,rax + + mulx r13,r14,QWORD[((32+8))+rsi] + adox r12,r8 + mov QWORD[((64+8))+rcx],r12 + adcx r11,r14 + + mulx r14,r8,QWORD[((32+16))+rsi] + adox r11,r9 + adcx r13,r8 + adcx r14,rax + adox r13,r10 + + mov rdx,QWORD[((32+16))+rdi] + mulx r9,r8,QWORD[((32+0))+rsi] + adox r14,rax + xor rax,rax + + mulx r12,r10,QWORD[((32+8))+rsi] + adox r8,r11 + mov QWORD[((64+16))+rcx],r8 + adcx r9,r13 + + mulx r8,r11,QWORD[((32+16))+rsi] + adcx r12,r14 + adcx r8,rax + adox r9,r10 + adox r11,r12 + adox r8,rax + mov QWORD[((64+24))+rcx],r9 + mov QWORD[((64+32))+rcx],r11 + mov QWORD[((64+40))+rcx],r8 + + + + + mov r8,QWORD[64+rsp] + mov r9,QWORD[72+rsp] + mov r10,QWORD[80+rsp] + mov r11,QWORD[88+rsp] + + mov rax,QWORD[32+rsp] + add r8,rax + mov rax,QWORD[40+rsp] + adc r9,rax + mov rax,QWORD[48+rsp] + adc r10,rax + mov rax,QWORD[56+rsp] + adc r11,rax + + + mov r12,QWORD[rsp] + mov r13,QWORD[8+rsp] + mov r14,QWORD[16+rsp] + mov r15,QWORD[24+rsp] + sub r12,QWORD[rcx] + sbb r13,QWORD[8+rcx] + sbb r14,QWORD[16+rcx] + sbb r15,QWORD[24+rcx] + sbb r8,QWORD[32+rcx] + sbb r9,QWORD[40+rcx] + sbb r10,QWORD[48+rcx] + sbb r11,QWORD[56+rcx] + + + sub r12,QWORD[64+rcx] + sbb r13,QWORD[72+rcx] + sbb r14,QWORD[80+rcx] + sbb r15,QWORD[88+rcx] + sbb r8,QWORD[96+rcx] + sbb r9,QWORD[104+rcx] + sbb r10,0x0 + sbb r11,0x0 + + add r12,QWORD[32+rcx] + mov QWORD[32+rcx],r12 + adc r13,QWORD[40+rcx] + mov QWORD[40+rcx],r13 + adc r14,QWORD[48+rcx] + mov QWORD[48+rcx],r14 + adc r15,QWORD[56+rcx] + mov QWORD[56+rcx],r15 + adc r8,QWORD[64+rcx] + mov QWORD[64+rcx],r8 + 
adc r9,QWORD[72+rcx] + mov QWORD[72+rcx],r9 + adc r10,QWORD[80+rcx] + mov QWORD[80+rcx],r10 + adc r11,QWORD[88+rcx] + mov QWORD[88+rcx],r11 + mov r12,QWORD[96+rcx] + adc r12,0x0 + mov QWORD[96+rcx],r12 + mov r13,QWORD[104+rcx] + adc r13,0x0 + mov QWORD[104+rcx],r13 + + add rsp,96 + + pop rbp + + + pop rbx + + + + + pop r15 + + + pop r14 + + + pop r13 + + + pop r12 + + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + + +global sike_mpmul + +sike_mpmul: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sike_mpmul: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push r12 + + + push r13 + + + push r14 + + + push r15 + + + + + + lea rcx,[OPENSSL_ia32cap_P] + mov rcx,QWORD[8+rcx] + and ecx,0x80100 + cmp ecx,0x80100 + je NEAR $L$mul_bdw + + + + mov rcx,rdx + + sub rsp,112 + + + + xor rax,rax + mov r8,QWORD[32+rdi] + mov r9,QWORD[40+rdi] + mov r10,QWORD[48+rdi] + xor r11,r11 + add r8,QWORD[rdi] + adc r9,QWORD[8+rdi] + adc r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + + sbb rax,0 + mov QWORD[64+rsp],rax + + mov QWORD[rcx],r8 + mov QWORD[8+rcx],r9 + mov QWORD[16+rcx],r10 + mov QWORD[24+rcx],r11 + + + xor rdx,rdx + mov r12,QWORD[32+rsi] + mov r13,QWORD[40+rsi] + mov r14,QWORD[48+rsi] + xor r15,r15 + add r12,QWORD[rsi] + adc r13,QWORD[8+rsi] + adc r14,QWORD[16+rsi] + adc r15,QWORD[24+rsi] + sbb rdx,0x0 + + mov QWORD[72+rsp],rdx + + + mov rax,QWORD[rcx] + mul r12 + mov QWORD[rsp],rax + mov r8,rdx + + xor r9,r9 + mov rax,QWORD[rcx] + mul r13 + add r8,rax + adc r9,rdx + + xor r10,r10 + mov rax,QWORD[8+rcx] + mul r12 + add r8,rax + mov QWORD[8+rsp],r8 + adc r9,rdx + adc r10,0x0 + + xor r8,r8 + mov rax,QWORD[rcx] + mul r14 + add r9,rax + adc r10,rdx + adc r8,0x0 + + mov rax,QWORD[16+rcx] + mul r12 + add r9,rax + adc r10,rdx + adc r8,0x0 + + mov rax,QWORD[8+rcx] + mul r13 + add r9,rax + mov QWORD[16+rsp],r9 + adc r10,rdx + adc r8,0x0 + + xor r9,r9 + mov rax,QWORD[rcx] + mul r15 + add r10,rax + adc r8,rdx + adc r9,0x0 + + mov rax,QWORD[24+rcx] + mul r12 + add r10,rax + adc r8,rdx + adc r9,0x0 + + mov rax,QWORD[8+rcx] + mul r14 + add r10,rax + adc r8,rdx + adc r9,0x0 + + mov rax,QWORD[16+rcx] + mul r13 + add r10,rax + mov QWORD[24+rsp],r10 + adc r8,rdx + adc r9,0x0 + + xor r10,r10 + mov rax,QWORD[8+rcx] + mul r15 + add r8,rax + adc r9,rdx + adc r10,0x0 + + mov rax,QWORD[24+rcx] + mul r13 + add r8,rax + adc r9,rdx + adc r10,0x0 + + mov rax,QWORD[16+rcx] + mul r14 + add r8,rax + mov QWORD[32+rsp],r8 + adc r9,rdx + adc r10,0x0 + + xor r11,r11 + mov rax,QWORD[16+rcx] + mul r15 + add r9,rax + adc r10,rdx + adc r11,0x0 + + mov rax,QWORD[24+rcx] + mul r14 + add r9,rax + mov QWORD[40+rsp],r9 + adc r10,rdx + adc r11,0x0 + + mov rax,QWORD[24+rcx] + mul r15 + add r10,rax + mov QWORD[48+rsp],r10 + adc r11,rdx + mov QWORD[56+rsp],r11 + + + mov rax,QWORD[64+rsp] + and r12,rax + and r13,rax + and r14,rax + and r15,rax + + + mov rax,QWORD[72+rsp] + mov r8,QWORD[rcx] + and r8,rax + mov r9,QWORD[8+rcx] + and r9,rax + mov r10,QWORD[16+rcx] + and r10,rax + mov r11,QWORD[24+rcx] + and r11,rax + + + add r12,r8 + adc r13,r9 + adc r14,r10 + adc r15,r11 + + + mov rax,QWORD[32+rsp] + add r12,rax + mov rax,QWORD[40+rsp] + adc r13,rax + mov rax,QWORD[48+rsp] + adc r14,rax + mov rax,QWORD[56+rsp] + adc r15,rax + mov QWORD[80+rsp],r12 + mov QWORD[88+rsp],r13 + mov QWORD[96+rsp],r14 + mov QWORD[104+rsp],r15 + + + mov r11,QWORD[rdi] + mov rax,QWORD[rsi] + mul r11 + xor r9,r9 + mov QWORD[rcx],rax + mov r8,rdx + + mov r14,QWORD[16+rdi] + mov rax,QWORD[8+rsi] + mul 
r11 + xor r10,r10 + add r8,rax + adc r9,rdx + + mov r12,QWORD[8+rdi] + mov rax,QWORD[rsi] + mul r12 + add r8,rax + mov QWORD[8+rcx],r8 + adc r9,rdx + adc r10,0x0 + + xor r8,r8 + mov rax,QWORD[16+rsi] + mul r11 + add r9,rax + adc r10,rdx + adc r8,0x0 + + mov r13,QWORD[rsi] + mov rax,r14 + mul r13 + add r9,rax + adc r10,rdx + adc r8,0x0 + + mov rax,QWORD[8+rsi] + mul r12 + add r9,rax + mov QWORD[16+rcx],r9 + adc r10,rdx + adc r8,0x0 + + xor r9,r9 + mov rax,QWORD[24+rsi] + mul r11 + mov r15,QWORD[24+rdi] + add r10,rax + adc r8,rdx + adc r9,0x0 + + mov rax,r15 + mul r13 + add r10,rax + adc r8,rdx + adc r9,0x0 + + mov rax,QWORD[16+rsi] + mul r12 + add r10,rax + adc r8,rdx + adc r9,0x0 + + mov rax,QWORD[8+rsi] + mul r14 + add r10,rax + mov QWORD[24+rcx],r10 + adc r8,rdx + adc r9,0x0 + + xor r10,r10 + mov rax,QWORD[24+rsi] + mul r12 + add r8,rax + adc r9,rdx + adc r10,0x0 + + mov rax,QWORD[8+rsi] + mul r15 + add r8,rax + adc r9,rdx + adc r10,0x0 + + mov rax,QWORD[16+rsi] + mul r14 + add r8,rax + mov QWORD[32+rcx],r8 + adc r9,rdx + adc r10,0x0 + + xor r8,r8 + mov rax,QWORD[24+rsi] + mul r14 + add r9,rax + adc r10,rdx + adc r8,0x0 + + mov rax,QWORD[16+rsi] + mul r15 + add r9,rax + mov QWORD[40+rcx],r9 + adc r10,rdx + adc r8,0x0 + + mov rax,QWORD[24+rsi] + mul r15 + add r10,rax + mov QWORD[48+rcx],r10 + adc r8,rdx + mov QWORD[56+rcx],r8 + + + + mov r11,QWORD[32+rdi] + mov rax,QWORD[32+rsi] + mul r11 + xor r9,r9 + mov QWORD[64+rcx],rax + mov r8,rdx + + mov r14,QWORD[48+rdi] + mov rax,QWORD[40+rsi] + mul r11 + xor r10,r10 + add r8,rax + adc r9,rdx + + mov r12,QWORD[40+rdi] + mov rax,QWORD[32+rsi] + mul r12 + add r8,rax + mov QWORD[72+rcx],r8 + adc r9,rdx + adc r10,0x0 + + xor r8,r8 + mov rax,QWORD[48+rsi] + mul r11 + add r9,rax + adc r10,rdx + adc r8,0x0 + + mov r13,QWORD[32+rsi] + mov rax,r14 + mul r13 + add r9,rax + adc r10,rdx + adc r8,0x0 + + mov rax,QWORD[40+rsi] + mul r12 + add r9,rax + mov QWORD[80+rcx],r9 + adc r10,rdx + adc r8,0x0 + + mov rax,QWORD[48+rsi] + mul r12 + xor r12,r12 + add r10,rax + adc r8,rdx + adc r12,0x0 + + mov rax,QWORD[40+rsi] + mul r14 + add r10,rax + adc r8,rdx + adc r12,0x0 + mov QWORD[88+rcx],r10 + + mov rax,QWORD[48+rsi] + mul r14 + add r8,rax + adc r12,0x0 + mov QWORD[96+rcx],r8 + + add rdx,r12 + + + mov r8,QWORD[rsp] + sub r8,QWORD[rcx] + mov r9,QWORD[8+rsp] + sbb r9,QWORD[8+rcx] + mov r10,QWORD[16+rsp] + sbb r10,QWORD[16+rcx] + mov r11,QWORD[24+rsp] + sbb r11,QWORD[24+rcx] + mov r12,QWORD[80+rsp] + sbb r12,QWORD[32+rcx] + mov r13,QWORD[88+rsp] + sbb r13,QWORD[40+rcx] + mov r14,QWORD[96+rsp] + sbb r14,QWORD[48+rcx] + mov r15,QWORD[104+rsp] + sbb r15,QWORD[56+rcx] + + + mov rax,QWORD[64+rcx] + sub r8,rax + mov rax,QWORD[72+rcx] + sbb r9,rax + mov rax,QWORD[80+rcx] + sbb r10,rax + mov rax,QWORD[88+rcx] + sbb r11,rax + mov rax,QWORD[96+rcx] + sbb r12,rax + sbb r13,rdx + sbb r14,0x0 + sbb r15,0x0 + + + add r8,QWORD[32+rcx] + mov QWORD[32+rcx],r8 + adc r9,QWORD[40+rcx] + mov QWORD[40+rcx],r9 + adc r10,QWORD[48+rcx] + mov QWORD[48+rcx],r10 + adc r11,QWORD[56+rcx] + mov QWORD[56+rcx],r11 + adc r12,QWORD[64+rcx] + mov QWORD[64+rcx],r12 + adc r13,QWORD[72+rcx] + mov QWORD[72+rcx],r13 + adc r14,QWORD[80+rcx] + mov QWORD[80+rcx],r14 + adc r15,QWORD[88+rcx] + mov QWORD[88+rcx],r15 + mov r12,QWORD[96+rcx] + adc r12,0x0 + mov QWORD[96+rcx],r12 + adc rdx,0x0 + mov QWORD[104+rcx],rdx + + add rsp,112 + + + + pop r15 + + pop r14 + + pop r13 + + pop r12 + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +