Version in base suite: 2.41-12 Base version: glibc_2.41-12 Target version: glibc_2.41-12+deb13u1 Base file: /srv/ftp-master.debian.org/ftp/pool/main/g/glibc/glibc_2.41-12.dsc Target file: /srv/ftp-master.debian.org/policy/pool/main/g/glibc/glibc_2.41-12+deb13u1.dsc changelog | 16 patches/git-updates.diff | 3670 +++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 3602 insertions(+), 84 deletions(-) diff -Nru glibc-2.41/debian/changelog glibc-2.41/debian/changelog --- glibc-2.41/debian/changelog 2025-08-05 17:34:49.000000000 +0000 +++ glibc-2.41/debian/changelog 2025-12-28 16:32:33.000000000 +0000 @@ -1,3 +1,19 @@ +glibc (2.41-12+deb13u1) trixie; urgency=medium + + * debian/patches/git-updates.diff: update from upstream stable branch: + - Fix a double lock init issue after fork() + - Fix _dl_find_object when ld.so has LOAD segment gaps, causing wrong + backtrace unwinding. This affects at least arm64. + - Fix SYSCALL_CANCEL for return values larger than INT_MAX (closes: + #1115729). + - Fix crash in ifunc functions on arm64 when hardening with + -ftrivial-auto-var-init=zero. + - Optimize inverse trig functions on arm64. + - Optimize arm64 SVE exp, hyperbolic, and log1p functions. + - Optimize arm64 SVE expf and log1p helpers. + + -- Aurelien Jarno Sun, 28 Dec 2025 17:32:33 +0100 + glibc (2.41-12) unstable; urgency=medium [ Helmut Grohne ] diff -Nru glibc-2.41/debian/patches/git-updates.diff glibc-2.41/debian/patches/git-updates.diff --- glibc-2.41/debian/patches/git-updates.diff 2025-08-05 16:54:19.000000000 +0000 +++ glibc-2.41/debian/patches/git-updates.diff 2025-12-28 16:32:28.000000000 +0000 @@ -22,10 +22,10 @@ $(common-objdir):$(subst $(empty) ,:,$(patsubst ../$(subdir),.,$(rpath-dirs:%=$(common-objpfx)%))) else # build-static diff --git a/NEWS b/NEWS -index b11422b060..89d0935beb 100644 +index b11422b060..f77d1471c3 100644 --- a/NEWS +++ b/NEWS -@@ -5,6 +5,36 @@ See the end for copying conditions. +@@ -5,6 +5,39 @@ See the end for copying conditions. Please send GNU C library bug reports via using `glibc' in the "product" field. 
@@ -39,6 +39,7 @@ + +The following bugs were resolved with this release: + ++ [31943] _dl_find_object can fail if ld.so contains gaps between load segments + [32269] RISC-V IFUNC resolver cannot access gp pointer + [32626] math: math: log10p1f is not correctly rounded + [32627] math: math: sinhf is not correctly rounded @@ -56,8 +57,10 @@ + [32981] ports: elf/tst-execstack-prog-static-tunable fails on + sparc64-linux-gnu + [32987] elf: Fix subprocess status handling for tst-dlopen-sgid ++ [32994] stdlib: resolve a double lock init issue after fork + [33164] iconv -o should not create executable files + [33185] Fix double-free after allocation failure in regcomp ++ [33245] nptl: nptl: error in internal cancellation syscall handling + Version 2.41 @@ -909,14 +912,23 @@ AC_COMPILE_IFELSE([AC_LANG_SOURCE([[#ifdef PIE_UNSUPPORTED # error PIE is not supported diff --git a/elf/Makefile b/elf/Makefile -index 4b1d0d8741..f42cb154fb 100644 +index 4b1d0d8741..b8064ef14c 100644 --- a/elf/Makefile +++ b/elf/Makefile -@@ -61,6 +61,7 @@ dl-routines = \ +@@ -34,7 +34,6 @@ routines = \ + dl-addr \ + dl-addr-obj \ + dl-early_allocate \ +- dl-find_object \ + dl-iteratephdr \ + dl-libc \ + dl-origin \ +@@ -61,6 +60,8 @@ dl-routines = \ dl-deps \ dl-exception \ dl-execstack \ + dl-execstack-tunable \ ++ dl-find_object \ dl-fini \ dl-init \ dl-load \ @@ -936,7 +948,27 @@ tst-audit1 \ tst-audit2 \ tst-audit8 \ -@@ -567,9 +570,11 @@ tests-execstack-yes = \ +@@ -532,6 +535,8 @@ tests-internal += \ + tst-dl_find_object-threads \ + tst-dlmopen2 \ + tst-hash-collision3 \ ++ tst-link-map-contiguous-ldso \ ++ tst-link-map-contiguous-libc \ + tst-ptrguard1 \ + tst-stackguard1 \ + tst-tls-surplus \ +@@ -543,6 +548,10 @@ tests-internal += \ + unload2 \ + # tests-internal + ++ifeq ($(build-hardcoded-path-in-tests),yes) ++tests-internal += tst-link-map-contiguous-main ++endif ++ + tests-container += \ + tst-dlopen-self-container \ + tst-dlopen-tlsmodid-container \ +@@ -567,9 +576,11 @@ tests-execstack-yes = \ tst-execstack \ tst-execstack-needed \ tst-execstack-prog \ @@ -949,7 +981,7 @@ # tests-execstack-static-yes ifeq (yes,$(run-built-tests)) tests-execstack-special-yes = \ -@@ -863,6 +868,7 @@ modules-names += \ +@@ -863,6 +874,7 @@ modules-names += \ tst-auditmanymod8 \ tst-auditmanymod9 \ tst-auditmod-tlsdesc \ @@ -957,7 +989,7 @@ tst-auditmod1 \ tst-auditmod11 \ tst-auditmod12 \ -@@ -905,6 +911,7 @@ modules-names += \ +@@ -905,6 +917,7 @@ modules-names += \ tst-dlmopen1mod \ tst-dlopen-auditdup-auditmod \ tst-dlopen-auditdupmod \ @@ -965,7 +997,7 @@ tst-dlopen-tlsreinitmod1 \ tst-dlopen-tlsreinitmod2 \ tst-dlopen-tlsreinitmod3 \ -@@ -1144,6 +1151,10 @@ tests-pie += \ +@@ -1144,6 +1157,10 @@ tests-pie += \ tst-pie1 \ tst-pie2 \ # tests-pie @@ -976,7 +1008,7 @@ ifneq (,$(load-address-ldflag)) tests += \ tst-pie-address \ -@@ -1159,6 +1170,10 @@ tests += \ +@@ -1159,6 +1176,10 @@ tests += \ tests-static += \ tst-pie-address-static \ # tests-static @@ -987,7 +1019,7 @@ LDFLAGS-tst-pie-address-static += \ $(load-address-ldflag)=$(pde-load-address) endif -@@ -1988,6 +2003,9 @@ $(objpfx)tst-execstack.out: $(objpfx)tst-execstack-mod.so +@@ -1988,6 +2009,9 @@ $(objpfx)tst-execstack.out: $(objpfx)tst-execstack-mod.so CPPFLAGS-tst-execstack.c += -DUSE_PTHREADS=0 LDFLAGS-tst-execstack = -Wl,-z,noexecstack LDFLAGS-tst-execstack-mod.so = -Wl,-z,execstack @@ -997,7 +1029,7 @@ $(objpfx)tst-execstack-needed: $(objpfx)tst-execstack-mod.so LDFLAGS-tst-execstack-needed = -Wl,-z,noexecstack -@@ -1996,7 +2014,18 @@ 
LDFLAGS-tst-execstack-prog = -Wl,-z,execstack +@@ -1996,7 +2020,18 @@ LDFLAGS-tst-execstack-prog = -Wl,-z,execstack CFLAGS-tst-execstack-prog.c += -Wno-trampolines CFLAGS-tst-execstack-mod.c += -Wno-trampolines @@ -1016,7 +1048,7 @@ CFLAGS-tst-execstack-prog-static.c += -Wno-trampolines ifeq (yes,$(build-hardcoded-path-in-tests)) -@@ -2074,6 +2103,7 @@ $(objpfx)tst-array5-static-cmp.out: tst-array5-static.exp \ +@@ -2074,6 +2109,7 @@ $(objpfx)tst-array5-static-cmp.out: tst-array5-static.exp \ CFLAGS-tst-pie1.c += $(pie-ccflag) CFLAGS-tst-pie2.c += $(pie-ccflag) @@ -1024,7 +1056,7 @@ CFLAGS-tst-pie-address.c += $(pie-ccflag) $(objpfx)tst-piemod1.so: $(libsupport) -@@ -3189,6 +3219,9 @@ $(objpfx)tst-audit-tlsdesc.out: $(objpfx)tst-auditmod-tlsdesc.so +@@ -3189,6 +3225,9 @@ $(objpfx)tst-audit-tlsdesc.out: $(objpfx)tst-auditmod-tlsdesc.so tst-audit-tlsdesc-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so $(objpfx)tst-audit-tlsdesc-dlopen.out: $(objpfx)tst-auditmod-tlsdesc.so tst-audit-tlsdesc-dlopen-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so @@ -1034,7 +1066,7 @@ $(objpfx)tst-dlmopen-twice.out: \ $(objpfx)tst-dlmopen-twice-mod1.so \ -@@ -3392,3 +3425,5 @@ $(objpfx)tst-nolink-libc-2: $(objpfx)tst-nolink-libc.o +@@ -3392,3 +3431,5 @@ $(objpfx)tst-nolink-libc-2: $(objpfx)tst-nolink-libc.o -Wl,--dynamic-linker=$(objpfx)ld.so $(objpfx)tst-nolink-libc-2.out: $(objpfx)tst-nolink-libc-2 $(objpfx)ld.so $< > $@ 2>&1; $(evaluate-test) @@ -1098,6 +1130,146 @@ { return ENOSYS; } +diff --git a/elf/dl-find_object.c b/elf/dl-find_object.c +index 513e464011..c9f4c1c8d1 100644 +--- a/elf/dl-find_object.c ++++ b/elf/dl-find_object.c +@@ -356,7 +356,7 @@ _dlfo_lookup (uintptr_t pc, struct dl_find_object_internal *first1, size_t size) + } + + int +-__dl_find_object (void *pc1, struct dl_find_object *result) ++_dl_find_object (void *pc1, struct dl_find_object *result) + { + uintptr_t pc = (uintptr_t) pc1; + +@@ -463,8 +463,38 @@ __dl_find_object (void *pc1, struct dl_find_object *result) + return -1; + } /* Transaction retry loop. */ + } +-hidden_def (__dl_find_object) +-weak_alias (__dl_find_object, _dl_find_object) ++rtld_hidden_def (_dl_find_object) ++ ++/* Subroutine of _dlfo_process_initial to split out noncontigous link ++ maps. NODELETE is the number of used _dlfo_nodelete_mappings ++ elements. It is incremented as needed, and the new NODELETE value ++ is returned. */ ++static size_t ++_dlfo_process_initial_noncontiguous_map (struct link_map *map, ++ size_t nodelete) ++{ ++ struct dl_find_object_internal dlfo; ++ _dl_find_object_from_map (map, &dlfo); ++ ++ /* PT_LOAD segments for a non-contiguous link map are added to the ++ non-closeable mappings. */ ++ const ElfW(Phdr) *ph = map->l_phdr; ++ const ElfW(Phdr) *ph_end = map->l_phdr + map->l_phnum; ++ for (; ph < ph_end; ++ph) ++ if (ph->p_type == PT_LOAD) ++ { ++ if (_dlfo_nodelete_mappings != NULL) ++ { ++ /* Second pass only. */ ++ _dlfo_nodelete_mappings[nodelete] = dlfo; ++ ElfW(Addr) start = ph->p_vaddr + map->l_addr; ++ _dlfo_nodelete_mappings[nodelete].map_start = start; ++ _dlfo_nodelete_mappings[nodelete].map_end = start + ph->p_memsz; ++ } ++ ++nodelete; ++ } ++ return nodelete; ++} + + /* _dlfo_process_initial is called twice. First to compute the array + sizes from the initial loaded mappings. 
Second to fill in the +@@ -477,29 +507,8 @@ _dlfo_process_initial (void) + + size_t nodelete = 0; + if (!main_map->l_contiguous) +- { +- struct dl_find_object_internal dlfo; +- _dl_find_object_from_map (main_map, &dlfo); +- +- /* PT_LOAD segments for a non-contiguous are added to the +- non-closeable mappings. */ +- for (const ElfW(Phdr) *ph = main_map->l_phdr, +- *ph_end = main_map->l_phdr + main_map->l_phnum; +- ph < ph_end; ++ph) +- if (ph->p_type == PT_LOAD) +- { +- if (_dlfo_nodelete_mappings != NULL) +- { +- /* Second pass only. */ +- _dlfo_nodelete_mappings[nodelete] = dlfo; +- _dlfo_nodelete_mappings[nodelete].map_start +- = ph->p_vaddr + main_map->l_addr; +- _dlfo_nodelete_mappings[nodelete].map_end +- = _dlfo_nodelete_mappings[nodelete].map_start + ph->p_memsz; +- } +- ++nodelete; +- } +- } ++ /* Contiguous case already handled in _dl_find_object_init. */ ++ nodelete = _dlfo_process_initial_noncontiguous_map (main_map, nodelete); + + size_t loaded = 0; + for (Lmid_t ns = 0; ns < GL(dl_nns); ++ns) +@@ -511,11 +520,18 @@ _dlfo_process_initial (void) + /* lt_library link maps are implicitly NODELETE. */ + if (l->l_type == lt_library || l->l_nodelete_active) + { +- if (_dlfo_nodelete_mappings != NULL) +- /* Second pass only. */ +- _dl_find_object_from_map +- (l, _dlfo_nodelete_mappings + nodelete); +- ++nodelete; ++ /* The kernel may have loaded ld.so with gaps. */ ++ if (!l->l_contiguous && is_rtld_link_map (l)) ++ nodelete ++ = _dlfo_process_initial_noncontiguous_map (l, nodelete); ++ else ++ { ++ if (_dlfo_nodelete_mappings != NULL) ++ /* Second pass only. */ ++ _dl_find_object_from_map ++ (l, _dlfo_nodelete_mappings + nodelete); ++ ++nodelete; ++ } + } + else if (l->l_type == lt_loaded) + { +@@ -765,7 +781,6 @@ _dl_find_object_update_1 (struct link_map **loaded, size_t count) + /* Prefer newly loaded link map. */ + assert (loaded_index1 > 0); + _dl_find_object_from_map (loaded[loaded_index1 - 1], dlfo); +- loaded[loaded_index1 - 1]->l_find_object_processed = 1; + --loaded_index1; + } + +diff --git a/elf/dl-find_object.h b/elf/dl-find_object.h +index e433ff8740..563af3de1d 100644 +--- a/elf/dl-find_object.h ++++ b/elf/dl-find_object.h +@@ -87,7 +87,7 @@ _dl_find_object_to_external (struct dl_find_object_internal *internal, + } + + /* Extract the object location data from a link map and writes it to +- *RESULT using relaxed MO stores. */ ++ *RESULT using relaxed MO stores. Set L->l_find_object_processed. */ + static void __attribute__ ((unused)) + _dl_find_object_from_map (struct link_map *l, + struct dl_find_object_internal *result) +@@ -100,6 +100,8 @@ _dl_find_object_from_map (struct link_map *l, + atomic_store_relaxed (&result->eh_dbase, (void *) l->l_info[DT_PLTGOT]); + #endif + ++ l->l_find_object_processed = 1; ++ + for (const ElfW(Phdr) *ph = l->l_phdr, *ph_end = l->l_phdr + l->l_phnum; + ph < ph_end; ++ph) + if (ph->p_type == DLFO_EH_SEGMENT_TYPE) diff --git a/elf/dl-load.c b/elf/dl-load.c index f905578a65..945dd8a231 100644 --- a/elf/dl-load.c @@ -1181,10 +1353,71 @@ } } diff --git a/elf/rtld.c b/elf/rtld.c -index 00bec15316..7a8aa56377 100644 +index 00bec15316..c1e9721def 100644 --- a/elf/rtld.c +++ b/elf/rtld.c -@@ -1626,9 +1626,9 @@ dl_main (const ElfW(Phdr) *phdr, +@@ -1242,6 +1242,60 @@ rtld_setup_main_map (struct link_map *main_map) + return has_interp; + } + ++/* Set up the program header information for the dynamic linker ++ itself. It can be accessed via _r_debug and dl_iterate_phdr ++ callbacks, and it is used by _dl_find_object. 
*/ ++static void ++rtld_setup_phdr (void) ++{ ++ /* Starting from binutils-2.23, the linker will define the magic ++ symbol __ehdr_start to point to our own ELF header if it is ++ visible in a segment that also includes the phdrs. */ ++ ++ const ElfW(Ehdr) *rtld_ehdr = &__ehdr_start; ++ assert (rtld_ehdr->e_ehsize == sizeof *rtld_ehdr); ++ assert (rtld_ehdr->e_phentsize == sizeof (ElfW(Phdr))); ++ ++ const ElfW(Phdr) *rtld_phdr = (const void *) rtld_ehdr + rtld_ehdr->e_phoff; ++ ++ _dl_rtld_map.l_phdr = rtld_phdr; ++ _dl_rtld_map.l_phnum = rtld_ehdr->e_phnum; ++ ++ ++ _dl_rtld_map.l_contiguous = 1; ++ /* The linker may not have produced a contiguous object. The kernel ++ will load the object with actual gaps (unlike the glibc loader ++ for shared objects, which always produces a contiguous mapping). ++ See similar logic in rtld_setup_main_map above. */ ++ { ++ ElfW(Addr) expected_load_address = 0; ++ for (const ElfW(Phdr) *ph = rtld_phdr; ph < &rtld_phdr[rtld_ehdr->e_phnum]; ++ ++ph) ++ if (ph->p_type == PT_LOAD) ++ { ++ ElfW(Addr) mapstart = ph->p_vaddr & ~(GLRO(dl_pagesize) - 1); ++ if (_dl_rtld_map.l_contiguous && expected_load_address != 0 ++ && expected_load_address != mapstart) ++ _dl_rtld_map.l_contiguous = 0; ++ ElfW(Addr) allocend = ph->p_vaddr + ph->p_memsz; ++ /* The next expected address is the page following this load ++ segment. */ ++ expected_load_address = ((allocend + GLRO(dl_pagesize) - 1) ++ & ~(GLRO(dl_pagesize) - 1)); ++ } ++ } ++ ++ /* PT_GNU_RELRO is usually the last phdr. */ ++ size_t cnt = rtld_ehdr->e_phnum; ++ while (cnt-- > 0) ++ if (rtld_phdr[cnt].p_type == PT_GNU_RELRO) ++ { ++ _dl_rtld_map.l_relro_addr = rtld_phdr[cnt].p_vaddr; ++ _dl_rtld_map.l_relro_size = rtld_phdr[cnt].p_memsz; ++ break; ++ } ++} ++ + /* Adjusts the contents of the stack and related globals for the user + entry point. The ld.so processed skip_args arguments and bumped + _dl_argv and _dl_argc accordingly. Those arguments are removed from +@@ -1626,9 +1680,9 @@ dl_main (const ElfW(Phdr) *phdr, bool has_interp = rtld_setup_main_map (main_map); @@ -1197,6 +1430,41 @@ /* If the current libname is different from the SONAME, add the latter as well. */ +@@ -1710,33 +1764,7 @@ dl_main (const ElfW(Phdr) *phdr, + ++GL(dl_ns)[LM_ID_BASE]._ns_nloaded; + ++GL(dl_load_adds); + +- /* Starting from binutils-2.23, the linker will define the magic symbol +- __ehdr_start to point to our own ELF header if it is visible in a +- segment that also includes the phdrs. If that's not available, we use +- the old method that assumes the beginning of the file is part of the +- lowest-addressed PT_LOAD segment. */ +- +- /* Set up the program header information for the dynamic linker +- itself. It is needed in the dl_iterate_phdr callbacks. */ +- const ElfW(Ehdr) *rtld_ehdr = &__ehdr_start; +- assert (rtld_ehdr->e_ehsize == sizeof *rtld_ehdr); +- assert (rtld_ehdr->e_phentsize == sizeof (ElfW(Phdr))); +- +- const ElfW(Phdr) *rtld_phdr = (const void *) rtld_ehdr + rtld_ehdr->e_phoff; +- +- _dl_rtld_map.l_phdr = rtld_phdr; +- _dl_rtld_map.l_phnum = rtld_ehdr->e_phnum; +- +- +- /* PT_GNU_RELRO is usually the last phdr. */ +- size_t cnt = rtld_ehdr->e_phnum; +- while (cnt-- > 0) +- if (rtld_phdr[cnt].p_type == PT_GNU_RELRO) +- { +- _dl_rtld_map.l_relro_addr = rtld_phdr[cnt].p_vaddr; +- _dl_rtld_map.l_relro_size = rtld_phdr[cnt].p_memsz; +- break; +- } ++ rtld_setup_phdr (); + + /* Add the dynamic linker to the TLS list if it also uses TLS. 
*/ + if (_dl_rtld_map.l_tls_blocksize != 0) diff --git a/elf/tst-audit-tlsdesc-dlopen2.c b/elf/tst-audit-tlsdesc-dlopen2.c new file mode 100644 index 0000000000..7ba2c4129a @@ -1518,6 +1786,224 @@ +++ b/elf/tst-execstack-tunable.c @@ -0,0 +1 @@ +#include +diff --git a/elf/tst-link-map-contiguous-ldso.c b/elf/tst-link-map-contiguous-ldso.c +new file mode 100644 +index 0000000000..04de808bb2 +--- /dev/null ++++ b/elf/tst-link-map-contiguous-ldso.c +@@ -0,0 +1,98 @@ ++/* Check that _dl_find_object behavior matches up with gaps. ++ Copyright (C) 2025 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static int ++do_test (void) ++{ ++ struct link_map *l = xdlopen (LD_SO, RTLD_NOW); ++ if (!l->l_contiguous) ++ { ++ puts ("info: ld.so link map is not contiguous"); ++ ++ /* Try to find holes by probing with mmap. */ ++ int pagesize = getpagesize (); ++ bool gap_found = false; ++ ElfW(Addr) addr = l->l_map_start; ++ TEST_COMPARE (addr % pagesize, 0); ++ while (addr < l->l_map_end) ++ { ++ void *expected = (void *) addr; ++ void *ptr = xmmap (expected, 1, PROT_READ | PROT_WRITE, ++ MAP_PRIVATE | MAP_ANONYMOUS, -1); ++ struct dl_find_object dlfo; ++ int dlfo_ret = _dl_find_object (expected, &dlfo); ++ if (ptr == expected) ++ { ++ if (dlfo_ret < 0) ++ { ++ TEST_COMPARE (dlfo_ret, -1); ++ printf ("info: hole without mapping data found at %p\n", ptr); ++ } ++ else ++ FAIL ("object \"%s\" found in gap at %p", ++ dlfo.dlfo_link_map->l_name, ptr); ++ gap_found = true; ++ } ++ else if (dlfo_ret == 0) ++ { ++ if ((void *) dlfo.dlfo_link_map != (void *) l) ++ { ++ printf ("info: object \"%s\" found at %p\n", ++ dlfo.dlfo_link_map->l_name, ptr); ++ gap_found = true; ++ } ++ } ++ else ++ TEST_COMPARE (dlfo_ret, -1); ++ xmunmap (ptr, 1); ++ addr += pagesize; ++ } ++ if (!gap_found) ++ FAIL ("no ld.so gap found"); ++ } ++ else ++ { ++ puts ("info: ld.so link map is contiguous"); ++ ++ /* Assert that ld.so is truly contiguous in memory. */ ++ volatile long int *p = (volatile long int *) l->l_map_start; ++ volatile long int *end = (volatile long int *) l->l_map_end; ++ while (p < end) ++ { ++ *p; ++ ++p; ++ } ++ } ++ ++ xdlclose (l); ++ ++ return 0; ++} ++ ++#include +diff --git a/elf/tst-link-map-contiguous-libc.c b/elf/tst-link-map-contiguous-libc.c +new file mode 100644 +index 0000000000..eb5728c765 +--- /dev/null ++++ b/elf/tst-link-map-contiguous-libc.c +@@ -0,0 +1,57 @@ ++/* Check that the entire libc.so program image is readable if contiguous. ++ Copyright (C) 2025 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static int ++do_test (void) ++{ ++ struct link_map *l = xdlopen (LIBC_SO, RTLD_NOW); ++ ++ /* The dynamic loader fills holes with PROT_NONE mappings. */ ++ if (!l->l_contiguous) ++ FAIL_EXIT1 ("libc.so link map is not contiguous"); ++ ++ /* Direct probing does not work because not everything is readable ++ due to PROT_NONE mappings. */ ++ int pagesize = getpagesize (); ++ ElfW(Addr) addr = l->l_map_start; ++ TEST_COMPARE (addr % pagesize, 0); ++ while (addr < l->l_map_end) ++ { ++ void *expected = (void *) addr; ++ void *ptr = xmmap (expected, 1, PROT_READ | PROT_WRITE, ++ MAP_PRIVATE | MAP_ANONYMOUS, -1); ++ if (ptr == expected) ++ FAIL ("hole in libc.so memory image after %lu bytes", ++ (unsigned long int) (addr - l->l_map_start)); ++ xmunmap (ptr, 1); ++ addr += pagesize; ++ } ++ ++ xdlclose (l); ++ ++ return 0; ++} ++#include +diff --git a/elf/tst-link-map-contiguous-main.c b/elf/tst-link-map-contiguous-main.c +new file mode 100644 +index 0000000000..2d1a054f0f +--- /dev/null ++++ b/elf/tst-link-map-contiguous-main.c +@@ -0,0 +1,45 @@ ++/* Check that the entire main program image is readable if contiguous. ++ Copyright (C) 2025 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++ ++static int ++do_test (void) ++{ ++ struct link_map *l = xdlopen ("", RTLD_NOW); ++ if (!l->l_contiguous) ++ FAIL_UNSUPPORTED ("main link map is not contiguous"); ++ ++ /* This check only works if the kernel loaded the main program. The ++ dynamic loader replaces gaps with PROT_NONE mappings, resulting ++ in faults. */ ++ volatile long int *p = (volatile long int *) l->l_map_start; ++ volatile long int *end = (volatile long int *) l->l_map_end; ++ while (p < end) ++ { ++ *p; ++ ++p; ++ } ++ ++ xdlclose (l); ++ ++ return 0; ++} ++#include diff --git a/elf/tst-pie-bss-static.c b/elf/tst-pie-bss-static.c new file mode 100644 index 0000000000..5df542f9ee @@ -1628,6 +2114,20 @@ if ! 
cmp -s "$tmp/out" "$tmp/expected" ; then echo "error: iconv output difference" >&$logfd echo "*** expected ***" >&$logfd +diff --git a/include/dlfcn.h b/include/dlfcn.h +index f49ee1b0c9..a44420fa37 100644 +--- a/include/dlfcn.h ++++ b/include/dlfcn.h +@@ -4,8 +4,7 @@ + #include /* For ElfW. */ + #include + +-extern __typeof (_dl_find_object) __dl_find_object; +-hidden_proto (__dl_find_object) ++rtld_hidden_proto (_dl_find_object) + + /* Internally used flag. */ + #define __RTLD_DLOPEN 0x80000000 diff --git a/math/auto-libm-test-in b/math/auto-libm-test-in index 01ba689aa8..4f194da19d 100644 --- a/math/auto-libm-test-in @@ -1792,6 +2292,21 @@ tst-stackguard1-ARGS = --command "$(host-test-program-cmd) --child" tst-stackguard1-static-ARGS = --command "$(objpfx)tst-stackguard1-static --child" +diff --git a/nptl/cancellation.c b/nptl/cancellation.c +index 156e63dcf0..bed0383a23 100644 +--- a/nptl/cancellation.c ++++ b/nptl/cancellation.c +@@ -72,8 +72,8 @@ __syscall_cancel (__syscall_arg_t a1, __syscall_arg_t a2, + __syscall_arg_t a5, __syscall_arg_t a6, + __SYSCALL_CANCEL7_ARG_DEF __syscall_arg_t nr) + { +- int r = __internal_syscall_cancel (a1, a2, a3, a4, a5, a6, +- __SYSCALL_CANCEL7_ARG nr); ++ long int r = __internal_syscall_cancel (a1, a2, a3, a4, a5, a6, ++ __SYSCALL_CANCEL7_ARG nr); + return __glibc_unlikely (INTERNAL_SYSCALL_ERROR_P (r)) + ? SYSCALL_ERROR_LABEL (INTERNAL_SYSCALL_ERRNO (r)) + : r; diff --git a/nptl/pthread_cancel.c b/nptl/pthread_cancel.c index f7ce3ec51b..b838273881 100644 --- a/nptl/pthread_cancel.c @@ -2091,6 +2606,30 @@ tst-secure-getenv \ # tests-static +diff --git a/stdlib/abort.c b/stdlib/abort.c +index caa9e6dc04..904244a2fb 100644 +--- a/stdlib/abort.c ++++ b/stdlib/abort.c +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + #include + + /* Try to get a machine dependent instruction which will make the +@@ -42,7 +43,10 @@ __libc_rwlock_define_initialized (static, lock); + void + __abort_fork_reset_child (void) + { +- __libc_rwlock_init (lock); ++ /* Reinitialize lock without calling pthread_rwlock_init, to ++ avoid a valgrind DRD false positive. */ ++ __libc_rwlock_define_initialized (, reset_lock); ++ memcpy (&lock, &reset_lock, sizeof (lock)); + } + + void diff --git a/stdlib/getenv.c b/stdlib/getenv.c index 5e7212cca6..1a7b0bfc06 100644 --- a/stdlib/getenv.c @@ -2424,6 +2963,521 @@ } void +diff --git a/sysdeps/aarch64/fpu/acos_advsimd.c b/sysdeps/aarch64/fpu/acos_advsimd.c +index 7709b5454f..453f780314 100644 +--- a/sysdeps/aarch64/fpu/acos_advsimd.c ++++ b/sysdeps/aarch64/fpu/acos_advsimd.c +@@ -18,24 +18,23 @@ + . */ + + #include "v_math.h" +-#include "poly_advsimd_f64.h" + + static const struct data + { +- float64x2_t poly[12]; +- float64x2_t pi, pi_over_2; ++ double c1, c3, c5, c7, c9, c11; ++ float64x2_t c0, c2, c4, c6, c8, c10; + uint64x2_t abs_mask; ++ float64x2_t pi, pi_over_2; + } data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) + on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. 
*/ +- .poly = { V2 (0x1.555555555554ep-3), V2 (0x1.3333333337233p-4), +- V2 (0x1.6db6db67f6d9fp-5), V2 (0x1.f1c71fbd29fbbp-6), +- V2 (0x1.6e8b264d467d6p-6), V2 (0x1.1c5997c357e9dp-6), +- V2 (0x1.c86a22cd9389dp-7), V2 (0x1.856073c22ebbep-7), +- V2 (0x1.fd1151acb6bedp-8), V2 (0x1.087182f799c1dp-6), +- V2 (-0x1.6602748120927p-7), V2 (0x1.cfa0dd1f9478p-6), }, +- .pi = V2 (0x1.921fb54442d18p+1), +- .pi_over_2 = V2 (0x1.921fb54442d18p+0), ++ .c0 = V2 (0x1.555555555554ep-3), .c1 = 0x1.3333333337233p-4, ++ .c2 = V2 (0x1.6db6db67f6d9fp-5), .c3 = 0x1.f1c71fbd29fbbp-6, ++ .c4 = V2 (0x1.6e8b264d467d6p-6), .c5 = 0x1.1c5997c357e9dp-6, ++ .c6 = V2 (0x1.c86a22cd9389dp-7), .c7 = 0x1.856073c22ebbep-7, ++ .c8 = V2 (0x1.fd1151acb6bedp-8), .c9 = 0x1.087182f799c1dp-6, ++ .c10 = V2 (-0x1.6602748120927p-7), .c11 = 0x1.cfa0dd1f9478p-6, ++ .pi = V2 (0x1.921fb54442d18p+1), .pi_over_2 = V2 (0x1.921fb54442d18p+0), + .abs_mask = V2 (0x7fffffffffffffff), + }; + +@@ -63,7 +62,7 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t special) + + acos(x) ~ pi/2 - (x + x^3 P(x^2)). + +- The largest observed error in this region is 1.18 ulps, ++ The largest observed error in this region is 1.18 ulp: + _ZGVnN2v_acos (0x1.fbab0a7c460f6p-2) got 0x1.0d54d1985c068p+0 + want 0x1.0d54d1985c069p+0. + +@@ -71,9 +70,9 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t special) + + acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z). + +- The largest observed error in this region is 1.52 ulps, +- _ZGVnN2v_acos (0x1.23d362722f591p-1) got 0x1.edbbedf8a7d6ep-1 +- want 0x1.edbbedf8a7d6cp-1. */ ++ The largest observed error in this region is 1.50 ulp: ++ _ZGVnN2v_acos (0x1.252a2cf3fb9acp-1) got 0x1.ec1a46aa82901p-1 ++ want 0x1.ec1a46aa829p-1. */ + float64x2_t VPCS_ATTR V_NAME_D1 (acos) (float64x2_t x) + { + const struct data *d = ptr_barrier (&data); +@@ -99,13 +98,32 @@ float64x2_t VPCS_ATTR V_NAME_D1 (acos) (float64x2_t x) + float64x2_t z = vbslq_f64 (a_le_half, ax, vsqrtq_f64 (z2)); + + /* Use a single polynomial approximation P for both intervals. */ ++ float64x2_t z3 = vmulq_f64 (z2, z); + float64x2_t z4 = vmulq_f64 (z2, z2); + float64x2_t z8 = vmulq_f64 (z4, z4); +- float64x2_t z16 = vmulq_f64 (z8, z8); +- float64x2_t p = v_estrin_11_f64 (z2, z4, z8, z16, d->poly); + +- /* Finalize polynomial: z + z * z2 * P(z2). */ +- p = vfmaq_f64 (z, vmulq_f64 (z, z2), p); ++ /* Order-11 Estrin. */ ++ float64x2_t c13 = vld1q_f64 (&d->c1); ++ float64x2_t c57 = vld1q_f64 (&d->c5); ++ float64x2_t c911 = vld1q_f64 (&d->c9); ++ ++ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); ++ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); ++ float64x2_t p03 = vfmaq_f64 (p01, z4, p23); ++ ++ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); ++ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); ++ float64x2_t p47 = vfmaq_f64 (p45, z4, p67); ++ ++ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); ++ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); ++ float64x2_t p811 = vfmaq_f64 (p89, z4, p1011); ++ ++ float64x2_t p411 = vfmaq_f64 (p47, z8, p811); ++ float64x2_t p = vfmaq_f64 (p03, z8, p411); ++ ++ /* Finalize polynomial: z + z3 * P(z2). */ ++ p = vfmaq_f64 (z, z3, p); + + /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5 + = 2 Q(|x|) , for 0.5 < x < 1.0 +diff --git a/sysdeps/aarch64/fpu/acos_sve.c b/sysdeps/aarch64/fpu/acos_sve.c +index 74e2f7df0f..104f0d7805 100644 +--- a/sysdeps/aarch64/fpu/acos_sve.c ++++ b/sysdeps/aarch64/fpu/acos_sve.c +@@ -18,20 +18,21 @@ + . 
*/ + + #include "sv_math.h" +-#include "poly_sve_f64.h" + + static const struct data + { +- float64_t poly[12]; +- float64_t pi, pi_over_2; ++ float64_t c1, c3, c5, c7, c9, c11; ++ float64_t c0, c2, c4, c6, c8, c10; ++ float64_t pi_over_2; + } data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) + on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */ +- .poly = { 0x1.555555555554ep-3, 0x1.3333333337233p-4, 0x1.6db6db67f6d9fp-5, +- 0x1.f1c71fbd29fbbp-6, 0x1.6e8b264d467d6p-6, 0x1.1c5997c357e9dp-6, +- 0x1.c86a22cd9389dp-7, 0x1.856073c22ebbep-7, 0x1.fd1151acb6bedp-8, +- 0x1.087182f799c1dp-6, -0x1.6602748120927p-7, 0x1.cfa0dd1f9478p-6, }, +- .pi = 0x1.921fb54442d18p+1, ++ .c0 = 0x1.555555555554ep-3, .c1 = 0x1.3333333337233p-4, ++ .c2 = 0x1.6db6db67f6d9fp-5, .c3 = 0x1.f1c71fbd29fbbp-6, ++ .c4 = 0x1.6e8b264d467d6p-6, .c5 = 0x1.1c5997c357e9dp-6, ++ .c6 = 0x1.c86a22cd9389dp-7, .c7 = 0x1.856073c22ebbep-7, ++ .c8 = 0x1.fd1151acb6bedp-8, .c9 = 0x1.087182f799c1dp-6, ++ .c10 = -0x1.6602748120927p-7, .c11 = 0x1.cfa0dd1f9478p-6, + .pi_over_2 = 0x1.921fb54442d18p+0, + }; + +@@ -42,20 +43,21 @@ static const struct data + + acos(x) ~ pi/2 - (x + x^3 P(x^2)). + +- The largest observed error in this region is 1.18 ulps, +- _ZGVsMxv_acos (0x1.fbc5fe28ee9e3p-2) got 0x1.0d4d0f55667f6p+0 +- want 0x1.0d4d0f55667f7p+0. ++ The largest observed error in this region is 1.18 ulp: ++ _ZGVsMxv_acos (0x1.fbb7c9079b429p-2) got 0x1.0d51266607582p+0 ++ want 0x1.0d51266607583p+0. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z). + +- The largest observed error in this region is 1.52 ulps, +- _ZGVsMxv_acos (0x1.24024271a500ap-1) got 0x1.ed82df4243f0dp-1 +- want 0x1.ed82df4243f0bp-1. */ ++ The largest observed error in this region is 1.50 ulp: ++ _ZGVsMxv_acos (0x1.252a2cf3fb9acp-1) got 0x1.ec1a46aa82901p-1 ++ want 0x1.ec1a46aa829p-1. */ + svfloat64_t SV_NAME_D1 (acos) (svfloat64_t x, const svbool_t pg) + { + const struct data *d = ptr_barrier (&data); ++ svbool_t ptrue = svptrue_b64 (); + + svuint64_t sign = svand_x (pg, svreinterpret_u64 (x), 0x8000000000000000); + svfloat64_t ax = svabs_x (pg, x); +@@ -70,24 +72,41 @@ svfloat64_t SV_NAME_D1 (acos) (svfloat64_t x, const svbool_t pg) + svfloat64_t z = svsqrt_m (ax, a_gt_half, z2); + + /* Use a single polynomial approximation P for both intervals. 
*/ +- svfloat64_t z4 = svmul_x (pg, z2, z2); +- svfloat64_t z8 = svmul_x (pg, z4, z4); +- svfloat64_t z16 = svmul_x (pg, z8, z8); +- svfloat64_t p = sv_estrin_11_f64_x (pg, z2, z4, z8, z16, d->poly); ++ svfloat64_t z3 = svmul_x (ptrue, z2, z); ++ svfloat64_t z4 = svmul_x (ptrue, z2, z2); ++ svfloat64_t z8 = svmul_x (ptrue, z4, z4); ++ ++ svfloat64_t c13 = svld1rq (ptrue, &d->c1); ++ svfloat64_t c57 = svld1rq (ptrue, &d->c5); ++ svfloat64_t c911 = svld1rq (ptrue, &d->c9); ++ ++ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), z2, c13, 0); ++ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), z2, c13, 1); ++ svfloat64_t p03 = svmla_x (pg, p01, z4, p23); ++ ++ svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), z2, c57, 0); ++ svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), z2, c57, 1); ++ svfloat64_t p47 = svmla_x (pg, p45, z4, p67); ++ ++ svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), z2, c911, 0); ++ svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), z2, c911, 1); ++ svfloat64_t p811 = svmla_x (pg, p89, z4, p1011); ++ ++ svfloat64_t p411 = svmla_x (pg, p47, z8, p811); ++ svfloat64_t p = svmad_x (pg, p411, z8, p03); + + /* Finalize polynomial: z + z * z2 * P(z2). */ +- p = svmla_x (pg, z, svmul_x (pg, z, z2), p); ++ p = svmad_x (pg, p, z3, z); + + /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5 + = 2 Q(|x|) , for 0.5 < x < 1.0 + = pi - 2 Q(|x|) , for -1.0 < x < -0.5. */ +- svfloat64_t y +- = svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (p), sign)); +- +- svbool_t is_neg = svcmplt (pg, x, 0.0); +- svfloat64_t off = svdup_f64_z (is_neg, d->pi); +- svfloat64_t mul = svsel (a_gt_half, sv_f64 (2.0), sv_f64 (-1.0)); +- svfloat64_t add = svsel (a_gt_half, off, sv_f64 (d->pi_over_2)); +- +- return svmla_x (pg, add, mul, y); ++ svfloat64_t mul = svreinterpret_f64 ( ++ svlsl_m (a_gt_half, svreinterpret_u64 (sv_f64 (1.0)), 10)); ++ mul = svreinterpret_f64 (sveor_x (ptrue, svreinterpret_u64 (mul), sign)); ++ svfloat64_t add = svreinterpret_f64 ( ++ svorr_x (ptrue, sign, svreinterpret_u64 (sv_f64 (d->pi_over_2)))); ++ add = svsub_m (a_gt_half, sv_f64 (d->pi_over_2), add); ++ ++ return svmsb_x (pg, p, mul, add); + } +diff --git a/sysdeps/aarch64/fpu/acosh_sve.c b/sysdeps/aarch64/fpu/acosh_sve.c +index 326b2cca2e..3a84959f0a 100644 +--- a/sysdeps/aarch64/fpu/acosh_sve.c ++++ b/sysdeps/aarch64/fpu/acosh_sve.c +@@ -30,10 +30,10 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t special) + } + + /* SVE approximation for double-precision acosh, based on log1p. +- The largest observed error is 3.19 ULP in the region where the ++ The largest observed error is 3.14 ULP in the region where the + argument to log1p falls in the k=0 interval, i.e. x close to 1: +- SV_NAME_D1 (acosh)(0x1.1e4388d4ca821p+0) got 0x1.ed23399f5137p-2 +- want 0x1.ed23399f51373p-2. */ ++ SV_NAME_D1 (acosh)(0x1.1e80ed12f0ad1p+0) got 0x1.ef0cee7c33ce1p-2 ++ want 0x1.ef0cee7c33ce4p-2. */ + svfloat64_t SV_NAME_D1 (acosh) (svfloat64_t x, const svbool_t pg) + { + /* (ix - One) >= (BigBound - One). */ +diff --git a/sysdeps/aarch64/fpu/asin_advsimd.c b/sysdeps/aarch64/fpu/asin_advsimd.c +index 414211627e..f74141c845 100644 +--- a/sysdeps/aarch64/fpu/asin_advsimd.c ++++ b/sysdeps/aarch64/fpu/asin_advsimd.c +@@ -18,24 +18,23 @@ + . 
*/ + + #include "v_math.h" +-#include "poly_advsimd_f64.h" + + static const struct data + { +- float64x2_t poly[12]; ++ float64x2_t c0, c2, c4, c6, c8, c10; + float64x2_t pi_over_2; + uint64x2_t abs_mask; ++ double c1, c3, c5, c7, c9, c11; + } data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) + on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */ +- .poly = { V2 (0x1.555555555554ep-3), V2 (0x1.3333333337233p-4), +- V2 (0x1.6db6db67f6d9fp-5), V2 (0x1.f1c71fbd29fbbp-6), +- V2 (0x1.6e8b264d467d6p-6), V2 (0x1.1c5997c357e9dp-6), +- V2 (0x1.c86a22cd9389dp-7), V2 (0x1.856073c22ebbep-7), +- V2 (0x1.fd1151acb6bedp-8), V2 (0x1.087182f799c1dp-6), +- V2 (-0x1.6602748120927p-7), V2 (0x1.cfa0dd1f9478p-6), }, +- .pi_over_2 = V2 (0x1.921fb54442d18p+0), +- .abs_mask = V2 (0x7fffffffffffffff), ++ .c0 = V2 (0x1.555555555554ep-3), .c1 = 0x1.3333333337233p-4, ++ .c2 = V2 (0x1.6db6db67f6d9fp-5), .c3 = 0x1.f1c71fbd29fbbp-6, ++ .c4 = V2 (0x1.6e8b264d467d6p-6), .c5 = 0x1.1c5997c357e9dp-6, ++ .c6 = V2 (0x1.c86a22cd9389dp-7), .c7 = 0x1.856073c22ebbep-7, ++ .c8 = V2 (0x1.fd1151acb6bedp-8), .c9 = 0x1.087182f799c1dp-6, ++ .c10 = V2 (-0x1.6602748120927p-7), .c11 = 0x1.cfa0dd1f9478p-6, ++ .pi_over_2 = V2 (0x1.921fb54442d18p+0), .abs_mask = V2 (0x7fffffffffffffff), + }; + + #define AllMask v_u64 (0xffffffffffffffff) +@@ -68,8 +67,8 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t special) + asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 2.69 ulps, +- _ZGVnN2v_asin (0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1 +- want 0x1.110d7e85fdd53p-1. */ ++ _ZGVnN2v_asin (0x1.044e8cefee301p-1) got 0x1.1111dd54ddf96p-1 ++ want 0x1.1111dd54ddf99p-1. */ + float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x) + { + const struct data *d = ptr_barrier (&data); +@@ -86,7 +85,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x) + return special_case (x, x, AllMask); + #endif + +- uint64x2_t a_lt_half = vcltq_f64 (ax, v_f64 (0.5)); ++ uint64x2_t a_lt_half = vcaltq_f64 (x, v_f64 (0.5)); + + /* Evaluate polynomial Q(x) = y + y * z * P(z) with + z = x ^ 2 and y = |x| , if |x| < 0.5 +@@ -99,7 +98,26 @@ float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x) + float64x2_t z4 = vmulq_f64 (z2, z2); + float64x2_t z8 = vmulq_f64 (z4, z4); + float64x2_t z16 = vmulq_f64 (z8, z8); +- float64x2_t p = v_estrin_11_f64 (z2, z4, z8, z16, d->poly); ++ ++ /* order-11 estrin. */ ++ float64x2_t c13 = vld1q_f64 (&d->c1); ++ float64x2_t c57 = vld1q_f64 (&d->c5); ++ float64x2_t c911 = vld1q_f64 (&d->c9); ++ ++ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); ++ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); ++ float64x2_t p03 = vfmaq_f64 (p01, z4, p23); ++ ++ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); ++ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); ++ float64x2_t p47 = vfmaq_f64 (p45, z4, p67); ++ ++ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); ++ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); ++ float64x2_t p811 = vfmaq_f64 (p89, z4, p1011); ++ ++ float64x2_t p07 = vfmaq_f64 (p03, z8, p47); ++ float64x2_t p = vfmaq_f64 (p07, z16, p811); + + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = vfmaq_f64 (z, vmulq_f64 (z, z2), p); +diff --git a/sysdeps/aarch64/fpu/asin_sve.c b/sysdeps/aarch64/fpu/asin_sve.c +index 9314466f58..975f408bee 100644 +--- a/sysdeps/aarch64/fpu/asin_sve.c ++++ b/sysdeps/aarch64/fpu/asin_sve.c +@@ -18,45 +18,43 @@ + . 
*/ + + #include "sv_math.h" +-#include "poly_sve_f64.h" + + static const struct data + { +- float64_t poly[12]; +- float64_t pi_over_2f; ++ float64_t c1, c3, c5, c7, c9, c11; ++ float64_t c0, c2, c4, c6, c8, c10; ++ float64_t pi_over_2; + } data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) + on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */ +- .poly = { 0x1.555555555554ep-3, 0x1.3333333337233p-4, +- 0x1.6db6db67f6d9fp-5, 0x1.f1c71fbd29fbbp-6, +- 0x1.6e8b264d467d6p-6, 0x1.1c5997c357e9dp-6, +- 0x1.c86a22cd9389dp-7, 0x1.856073c22ebbep-7, +- 0x1.fd1151acb6bedp-8, 0x1.087182f799c1dp-6, +- -0x1.6602748120927p-7, 0x1.cfa0dd1f9478p-6, }, +- .pi_over_2f = 0x1.921fb54442d18p+0, ++ .c0 = 0x1.555555555554ep-3, .c1 = 0x1.3333333337233p-4, ++ .c2 = 0x1.6db6db67f6d9fp-5, .c3 = 0x1.f1c71fbd29fbbp-6, ++ .c4 = 0x1.6e8b264d467d6p-6, .c5 = 0x1.1c5997c357e9dp-6, ++ .c6 = 0x1.c86a22cd9389dp-7, .c7 = 0x1.856073c22ebbep-7, ++ .c8 = 0x1.fd1151acb6bedp-8, .c9 = 0x1.087182f799c1dp-6, ++ .c10 = -0x1.6602748120927p-7, .c11 = 0x1.cfa0dd1f9478p-6, ++ .pi_over_2 = 0x1.921fb54442d18p+0, + }; + +-#define P(i) sv_f64 (d->poly[i]) +- + /* Double-precision SVE implementation of vector asin(x). + + For |x| in [0, 0.5], use an order 11 polynomial P such that the final + approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2). + +- The largest observed error in this region is 0.52 ulps, +- _ZGVsMxv_asin(0x1.d95ae04998b6cp-2) got 0x1.ec13757305f27p-2 +- want 0x1.ec13757305f26p-2. +- +- For |x| in [0.5, 1.0], use same approximation with a change of variable ++ The largest observed error in this region is 0.98 ulp: ++ _ZGVsMxv_asin (0x1.d98f6a748ed8ap-2) got 0x1.ec4eb661a73d3p-2 ++ want 0x1.ec4eb661a73d2p-2. + +- asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). ++ For |x| in [0.5, 1.0], use same approximation with a change of variable: ++ asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). + +- The largest observed error in this region is 2.69 ulps, +- _ZGVsMxv_asin(0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1 +- want 0x1.110d7e85fdd53p-1. */ ++ The largest observed error in this region is 2.66 ulp: ++ _ZGVsMxv_asin (0x1.04024f6e2a2fbp-1) got 0x1.10b9586f087a8p-1 ++ want 0x1.10b9586f087abp-1. */ + svfloat64_t SV_NAME_D1 (asin) (svfloat64_t x, const svbool_t pg) + { + const struct data *d = ptr_barrier (&data); ++ svbool_t ptrue = svptrue_b64 (); + + svuint64_t sign = svand_x (pg, svreinterpret_u64 (x), 0x8000000000000000); + svfloat64_t ax = svabs_x (pg, x); +@@ -70,17 +68,37 @@ svfloat64_t SV_NAME_D1 (asin) (svfloat64_t x, const svbool_t pg) + svfloat64_t z = svsqrt_m (ax, a_ge_half, z2); + + /* Use a single polynomial approximation P for both intervals. */ ++ svfloat64_t z3 = svmul_x (pg, z2, z); + svfloat64_t z4 = svmul_x (pg, z2, z2); + svfloat64_t z8 = svmul_x (pg, z4, z4); +- svfloat64_t z16 = svmul_x (pg, z8, z8); +- svfloat64_t p = sv_estrin_11_f64_x (pg, z2, z4, z8, z16, d->poly); ++ ++ svfloat64_t c13 = svld1rq (ptrue, &d->c1); ++ svfloat64_t c57 = svld1rq (ptrue, &d->c5); ++ svfloat64_t c911 = svld1rq (ptrue, &d->c9); ++ ++ /* Order-11 Estrin scheme. 
*/ ++ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), z2, c13, 0); ++ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), z2, c13, 1); ++ svfloat64_t p03 = svmla_x (pg, p01, z4, p23); ++ ++ svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), z2, c57, 0); ++ svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), z2, c57, 1); ++ svfloat64_t p47 = svmla_x (pg, p45, z4, p67); ++ ++ svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), z2, c911, 0); ++ svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), z2, c911, 1); ++ svfloat64_t p811 = svmla_x (pg, p89, z4, p1011); ++ ++ svfloat64_t p411 = svmla_x (pg, p47, z8, p811); ++ svfloat64_t p = svmla_x (pg, p03, z8, p411); ++ + /* Finalize polynomial: z + z * z2 * P(z2). */ +- p = svmla_x (pg, z, svmul_x (pg, z, z2), p); ++ p = svmla_x (pg, z, z3, p); + +- /* asin(|x|) = Q(|x|) , for |x| < 0.5 +- = pi/2 - 2 Q(|x|), for |x| >= 0.5. */ +- svfloat64_t y = svmad_m (a_ge_half, p, sv_f64 (-2.0), d->pi_over_2f); ++ /* asin(|x|) = Q(|x|), for |x| < 0.5 ++ = pi/2 - 2 Q(|x|), for |x| >= 0.5. */ ++ svfloat64_t y = svmad_m (a_ge_half, p, sv_f64 (-2.0), d->pi_over_2); + +- /* Copy sign. */ ++ /* Reinsert the sign from the argument. */ + return svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign)); + } +diff --git a/sysdeps/aarch64/fpu/asinf_advsimd.c b/sysdeps/aarch64/fpu/asinf_advsimd.c +index 52c7c0ec6e..013936c2c0 100644 +--- a/sysdeps/aarch64/fpu/asinf_advsimd.c ++++ b/sysdeps/aarch64/fpu/asinf_advsimd.c +@@ -18,22 +18,21 @@ + . */ + + #include "v_math.h" +-#include "poly_advsimd_f32.h" + + static const struct data + { +- float32x4_t poly[5]; ++ float32x4_t c0, c2, c4; ++ float c1, c3; + float32x4_t pi_over_2f; + } data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on + [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */ +- .poly = { V4 (0x1.55555ep-3), V4 (0x1.33261ap-4), V4 (0x1.70d7dcp-5), +- V4 (0x1.b059dp-6), V4 (0x1.3af7d8p-5) }, +- .pi_over_2f = V4 (0x1.921fb6p+0f), ++ .c0 = V4 (0x1.55555ep-3f), .c1 = 0x1.33261ap-4f, ++ .c2 = V4 (0x1.70d7dcp-5f), .c3 = 0x1.b059dp-6f, ++ .c4 = V4 (0x1.3af7d8p-5f), .pi_over_2f = V4 (0x1.921fb6p+0f), + }; + + #define AbsMask 0x7fffffff +-#define Half 0x3f000000 + #define One 0x3f800000 + #define Small 0x39800000 /* 2^-12. */ + +@@ -47,11 +46,8 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special) + + /* Single-precision implementation of vector asin(x). + +- For |x| < Small, approximate asin(x) by x. Small = 2^-12 for correct +- rounding. If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the +- following approximation. + +- For |x| in [Small, 0.5], use order 4 polynomial P such that the final ++ For |x| <0.5, use order 4 polynomial P such that the final + approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2). + + The largest observed error in this region is 0.83 ulps, +@@ -80,24 +76,31 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (asin) (float32x4_t x) + #endif + + float32x4_t ax = vreinterpretq_f32_u32 (ia); +- uint32x4_t a_lt_half = vcltq_u32 (ia, v_u32 (Half)); ++ uint32x4_t a_lt_half = vcaltq_f32 (x, v_f32 (0.5f)); + + /* Evaluate polynomial Q(x) = y + y * z * P(z) with + z = x ^ 2 and y = |x| , if |x| < 0.5 + z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */ + float32x4_t z2 = vbslq_f32 (a_lt_half, vmulq_f32 (x, x), +- vfmsq_n_f32 (v_f32 (0.5), ax, 0.5)); ++ vfmsq_n_f32 (v_f32 (0.5f), ax, 0.5f)); + float32x4_t z = vbslq_f32 (a_lt_half, ax, vsqrtq_f32 (z2)); + + /* Use a single polynomial approximation P for both intervals. 
*/ +- float32x4_t p = v_horner_4_f32 (z2, d->poly); ++ ++ /* PW Horner 3 evaluation scheme. */ ++ float32x4_t z4 = vmulq_f32 (z2, z2); ++ float32x4_t c13 = vld1q_f32 (&d->c1); ++ float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, c13, 0); ++ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, c13, 1); ++ float32x4_t p = vfmaq_f32 (p23, d->c4, z4); ++ p = vfmaq_f32 (p01, p, z4); + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = vfmaq_f32 (z, vmulq_f32 (z, z2), p); + + /* asin(|x|) = Q(|x|) , for |x| < 0.5 + = pi/2 - 2 Q(|x|), for |x| >= 0.5. */ + float32x4_t y +- = vbslq_f32 (a_lt_half, p, vfmsq_n_f32 (d->pi_over_2f, p, 2.0)); ++ = vbslq_f32 (a_lt_half, p, vfmsq_n_f32 (d->pi_over_2f, p, 2.0f)); + + /* Copy sign. */ + return vbslq_f32 (v_u32 (AbsMask), y, x); diff --git a/sysdeps/aarch64/fpu/asinh_sve.c b/sysdeps/aarch64/fpu/asinh_sve.c index 0889f79dbb..ff6b71390c 100644 --- a/sysdeps/aarch64/fpu/asinh_sve.c @@ -2607,69 +3661,1326 @@ + svfloat64_t y = svsel (ge1, option_1, option_2); return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)); } +diff --git a/sysdeps/aarch64/fpu/atan2_advsimd.c b/sysdeps/aarch64/fpu/atan2_advsimd.c +index 00b4a4f083..a31d52f3ac 100644 +--- a/sysdeps/aarch64/fpu/atan2_advsimd.c ++++ b/sysdeps/aarch64/fpu/atan2_advsimd.c +@@ -19,40 +19,38 @@ + + #include "math_config.h" + #include "v_math.h" +-#include "poly_advsimd_f64.h" + + static const struct data + { ++ double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; + float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18; + float64x2_t pi_over_2; +- double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; +- uint64x2_t zeroinfnan, minustwo; ++ uint64x2_t zeroinfnan; + } data = { +- /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on +- [2**-1022, 1.0]. */ +- .c0 = V2 (-0x1.5555555555555p-2), +- .c1 = 0x1.99999999996c1p-3, +- .c2 = V2 (-0x1.2492492478f88p-3), +- .c3 = 0x1.c71c71bc3951cp-4, +- .c4 = V2 (-0x1.745d160a7e368p-4), +- .c5 = 0x1.3b139b6a88ba1p-4, +- .c6 = V2 (-0x1.11100ee084227p-4), +- .c7 = 0x1.e1d0f9696f63bp-5, +- .c8 = V2 (-0x1.aebfe7b418581p-5), +- .c9 = 0x1.842dbe9b0d916p-5, +- .c10 = V2 (-0x1.5d30140ae5e99p-5), +- .c11 = 0x1.338e31eb2fbbcp-5, +- .c12 = V2 (-0x1.00e6eece7de8p-5), +- .c13 = 0x1.860897b29e5efp-6, +- .c14 = V2 (-0x1.0051381722a59p-6), +- .c15 = 0x1.14e9dc19a4a4ep-7, +- .c16 = V2 (-0x1.d0062b42fe3bfp-9), +- .c17 = 0x1.17739e210171ap-10, +- .c18 = V2 (-0x1.ab24da7be7402p-13), +- .c19 = 0x1.358851160a528p-16, ++ /* Coefficients of polynomial P such that ++ atan(x)~x+x*P(x^2) on [2^-1022, 1.0]. */ ++ .c0 = V2 (-0x1.555555555552ap-2), ++ .c1 = 0x1.9999999995aebp-3, ++ .c2 = V2 (-0x1.24924923923f6p-3), ++ .c3 = 0x1.c71c7184288a2p-4, ++ .c4 = V2 (-0x1.745d11fb3d32bp-4), ++ .c5 = 0x1.3b136a18051b9p-4, ++ .c6 = V2 (-0x1.110e6d985f496p-4), ++ .c7 = 0x1.e1bcf7f08801dp-5, ++ .c8 = V2 (-0x1.ae644e28058c3p-5), ++ .c9 = 0x1.82eeb1fed85c6p-5, ++ .c10 = V2 (-0x1.59d7f901566cbp-5), ++ .c11 = 0x1.2c982855ab069p-5, ++ .c12 = V2 (-0x1.eb49592998177p-6), ++ .c13 = 0x1.69d8b396e3d38p-6, ++ .c14 = V2 (-0x1.ca980345c4204p-7), ++ .c15 = 0x1.dc050eafde0b3p-8, ++ .c16 = V2 (-0x1.7ea70755b8eccp-9), ++ .c17 = 0x1.ba3da3de903e8p-11, ++ .c18 = V2 (-0x1.44a4b059b6f67p-13), ++ .c19 = 0x1.c4a45029e5a91p-17, + .pi_over_2 = V2 (0x1.921fb54442d18p+0), + .zeroinfnan = V2 (2 * 0x7ff0000000000000ul - 1), +- .minustwo = V2 (0xc000000000000000), + }; + + #define SignMask v_u64 (0x8000000000000000) +@@ -77,10 +75,9 @@ zeroinfnan (uint64x2_t i, const struct data *d) + } + + /* Fast implementation of vector atan2. 
+- Maximum observed error is 2.8 ulps: +- _ZGVnN2vv_atan2 (0x1.9651a429a859ap+5, 0x1.953075f4ee26p+5) +- got 0x1.92d628ab678ccp-1 +- want 0x1.92d628ab678cfp-1. */ ++ Maximum observed error is 1.97 ulps: ++ _ZGVnN2vv_atan2 (0x1.42337dba73768p+5, 0x1.422d748cd3e29p+5) ++ got 0x1.9224810264efcp-1 want 0x1.9224810264efep-1. */ + float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) + { + const struct data *d = ptr_barrier (&data); +@@ -101,26 +98,29 @@ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) + uint64x2_t pred_xlt0 = vcltzq_f64 (x); + uint64x2_t pred_aygtax = vcagtq_f64 (y, x); + +- /* Set up z for call to atan. */ +- float64x2_t n = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay); +- float64x2_t q = vbslq_f64 (pred_aygtax, ay, ax); +- float64x2_t z = vdivq_f64 (n, q); +- +- /* Work out the correct shift. */ +- float64x2_t shift +- = vreinterpretq_f64_u64 (vandq_u64 (pred_xlt0, d->minustwo)); +- shift = vbslq_f64 (pred_aygtax, vaddq_f64 (shift, v_f64 (1.0)), shift); +- shift = vmulq_f64 (shift, d->pi_over_2); +- +- /* Calculate the polynomial approximation. +- Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of +- full scheme to avoid underflow in x^16. +- The order 19 polynomial P approximates +- (atan(sqrt(x))-sqrt(x))/x^(3/2). */ ++ /* Set up z for evaluation of atan. */ ++ float64x2_t num = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay); ++ float64x2_t den = vbslq_f64 (pred_aygtax, ay, ax); ++ float64x2_t z = vdivq_f64 (num, den); ++ ++ /* Work out the correct shift for atan2: ++ Multiplication by pi is done later. ++ -pi when x < 0 and ax < ay ++ -pi/2 when x < 0 and ax > ay ++ 0 when x >= 0 and ax < ay ++ pi/2 when x >= 0 and ax > ay. */ ++ float64x2_t shift = vreinterpretq_f64_u64 ( ++ vandq_u64 (pred_xlt0, vreinterpretq_u64_f64 (v_f64 (-2.0)))); ++ float64x2_t shift2 = vreinterpretq_f64_u64 ( ++ vandq_u64 (pred_aygtax, vreinterpretq_u64_f64 (v_f64 (1.0)))); ++ shift = vaddq_f64 (shift, shift2); ++ ++ /* Calculate the polynomial approximation. */ + float64x2_t z2 = vmulq_f64 (z, z); +- float64x2_t x2 = vmulq_f64 (z2, z2); +- float64x2_t x4 = vmulq_f64 (x2, x2); +- float64x2_t x8 = vmulq_f64 (x4, x4); ++ float64x2_t z3 = vmulq_f64 (z2, z); ++ float64x2_t z4 = vmulq_f64 (z2, z2); ++ float64x2_t z8 = vmulq_f64 (z4, z4); ++ float64x2_t z16 = vmulq_f64 (z8, z8); + + float64x2_t c13 = vld1q_f64 (&d->c1); + float64x2_t c57 = vld1q_f64 (&d->c5); +@@ -128,45 +128,43 @@ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) + float64x2_t c1315 = vld1q_f64 (&d->c13); + float64x2_t c1719 = vld1q_f64 (&d->c17); + +- /* estrin_7. */ ++ /* Order-7 Estrin. */ + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); +- float64x2_t p03 = vfmaq_f64 (p01, x2, p23); ++ float64x2_t p03 = vfmaq_f64 (p01, z4, p23); + + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); +- float64x2_t p47 = vfmaq_f64 (p45, x2, p67); ++ float64x2_t p47 = vfmaq_f64 (p45, z4, p67); + +- float64x2_t p07 = vfmaq_f64 (p03, x4, p47); ++ float64x2_t p07 = vfmaq_f64 (p03, z8, p47); + +- /* estrin_11. */ ++ /* Order-11 Estrin. 
*/ + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); +- float64x2_t p811 = vfmaq_f64 (p89, x2, p1011); ++ float64x2_t p811 = vfmaq_f64 (p89, z4, p1011); + + float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0); + float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1); +- float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415); ++ float64x2_t p1215 = vfmaq_f64 (p1213, z4, p1415); + + float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0); + float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1); +- float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819); ++ float64x2_t p1619 = vfmaq_f64 (p1617, z4, p1819); + +- float64x2_t p815 = vfmaq_f64 (p811, x4, p1215); +- float64x2_t p819 = vfmaq_f64 (p815, x8, p1619); ++ float64x2_t p815 = vfmaq_f64 (p811, z8, p1215); ++ float64x2_t p819 = vfmaq_f64 (p815, z16, p1619); + +- float64x2_t ret = vfmaq_f64 (p07, p819, x8); ++ float64x2_t poly = vfmaq_f64 (p07, p819, z16); + + /* Finalize. y = shift + z + z^3 * P(z^2). */ +- ret = vfmaq_f64 (z, ret, vmulq_f64 (z2, z)); +- ret = vaddq_f64 (ret, shift); ++ float64x2_t ret = vfmaq_f64 (z, shift, d->pi_over_2); ++ ret = vfmaq_f64 (ret, z3, poly); + + if (__glibc_unlikely (v_any_u64 (special_cases))) + return special_case (y, x, ret, sign_xy, special_cases); + + /* Account for the sign of x and y. */ +- ret = vreinterpretq_f64_u64 ( ++ return vreinterpretq_f64_u64 ( + veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy)); +- +- return ret; + } +diff --git a/sysdeps/aarch64/fpu/atan2_sve.c b/sysdeps/aarch64/fpu/atan2_sve.c +index 163f61308b..9e2dd249d4 100644 +--- a/sysdeps/aarch64/fpu/atan2_sve.c ++++ b/sysdeps/aarch64/fpu/atan2_sve.c +@@ -19,25 +19,25 @@ + + #include "math_config.h" + #include "sv_math.h" +-#include "poly_sve_f64.h" + + static const struct data + { +- float64_t poly[20]; +- float64_t pi_over_2; ++ float64_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18; ++ float64_t c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; + } data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-1022, 1.0]. */ +- .poly = { -0x1.5555555555555p-2, 0x1.99999999996c1p-3, -0x1.2492492478f88p-3, +- 0x1.c71c71bc3951cp-4, -0x1.745d160a7e368p-4, 0x1.3b139b6a88ba1p-4, +- -0x1.11100ee084227p-4, 0x1.e1d0f9696f63bp-5, -0x1.aebfe7b418581p-5, +- 0x1.842dbe9b0d916p-5, -0x1.5d30140ae5e99p-5, 0x1.338e31eb2fbbcp-5, +- -0x1.00e6eece7de8p-5, 0x1.860897b29e5efp-6, -0x1.0051381722a59p-6, +- 0x1.14e9dc19a4a4ep-7, -0x1.d0062b42fe3bfp-9, 0x1.17739e210171ap-10, +- -0x1.ab24da7be7402p-13, 0x1.358851160a528p-16, }, +- .pi_over_2 = 0x1.921fb54442d18p+0, ++ .c0 = -0x1.555555555552ap-2, .c1 = 0x1.9999999995aebp-3, ++ .c2 = -0x1.24924923923f6p-3, .c3 = 0x1.c71c7184288a2p-4, ++ .c4 = -0x1.745d11fb3d32bp-4, .c5 = 0x1.3b136a18051b9p-4, ++ .c6 = -0x1.110e6d985f496p-4, .c7 = 0x1.e1bcf7f08801dp-5, ++ .c8 = -0x1.ae644e28058c3p-5, .c9 = 0x1.82eeb1fed85c6p-5, ++ .c10 = -0x1.59d7f901566cbp-5, .c11 = 0x1.2c982855ab069p-5, ++ .c12 = -0x1.eb49592998177p-6, .c13 = 0x1.69d8b396e3d38p-6, ++ .c14 = -0x1.ca980345c4204p-7, .c15 = 0x1.dc050eafde0b3p-8, ++ .c16 = -0x1.7ea70755b8eccp-9, .c17 = 0x1.ba3da3de903e8p-11, ++ .c18 = -0x1.44a4b059b6f67p-13, .c19 = 0x1.c4a45029e5a91p-17, + }; +- + /* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */ + static svfloat64_t NOINLINE + special_case (svfloat64_t y, svfloat64_t x, svfloat64_t ret, +@@ -56,15 +56,17 @@ zeroinfnan (svuint64_t i, const svbool_t pg) + } + + /* Fast implementation of SVE atan2. 
Errors are greatest when y and +- x are reasonably close together. The greatest observed error is 2.28 ULP: +- _ZGVsMxvv_atan2 (-0x1.5915b1498e82fp+732, 0x1.54d11ef838826p+732) +- got -0x1.954f42f1fa841p-1 want -0x1.954f42f1fa843p-1. */ +-svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg) ++ x are reasonably close together. The greatest observed error is 1.94 ULP: ++ _ZGVsMxvv_atan2 (0x1.8a4bf7167228ap+5, 0x1.84971226bb57bp+5) ++ got 0x1.95db19dfef9ccp-1 want 0x1.95db19dfef9cep-1. */ ++svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, ++ const svbool_t pg) + { +- const struct data *data_ptr = ptr_barrier (&data); ++ const struct data *d = ptr_barrier (&data); + + svuint64_t ix = svreinterpret_u64 (x); + svuint64_t iy = svreinterpret_u64 (y); ++ svbool_t ptrue = svptrue_b64 (); + + svbool_t cmp_x = zeroinfnan (ix, pg); + svbool_t cmp_y = zeroinfnan (iy, pg); +@@ -81,32 +83,67 @@ svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg) + + svbool_t pred_aygtax = svcmpgt (pg, ay, ax); + +- /* Set up z for call to atan. */ +- svfloat64_t n = svsel (pred_aygtax, svneg_x (pg, ax), ay); +- svfloat64_t d = svsel (pred_aygtax, ay, ax); +- svfloat64_t z = svdiv_x (pg, n, d); +- +- /* Work out the correct shift. */ ++ /* Set up z for evaluation of atan. */ ++ svfloat64_t num = svsel (pred_aygtax, svneg_x (pg, ax), ay); ++ svfloat64_t den = svsel (pred_aygtax, ay, ax); ++ svfloat64_t z = svdiv_x (pg, num, den); ++ ++ /* Work out the correct shift for atan2: ++ Multiplication by pi is done later. ++ -pi when x < 0 and ax < ay ++ -pi/2 when x < 0 and ax > ay ++ 0 when x >= 0 and ax < ay ++ pi/2 when x >= 0 and ax > ay. */ + svfloat64_t shift = svreinterpret_f64 (svlsr_x (pg, sign_x, 1)); ++ svfloat64_t shift_mul = svreinterpret_f64 ( ++ svorr_x (pg, sign_x, svreinterpret_u64 (sv_f64 (0x1.921fb54442d18p+0)))); + shift = svsel (pred_aygtax, sv_f64 (1.0), shift); +- shift = svreinterpret_f64 (svorr_x (pg, sign_x, svreinterpret_u64 (shift))); +- shift = svmul_x (pg, shift, data_ptr->pi_over_2); ++ shift = svmla_x (pg, z, shift, shift_mul); + + /* Use split Estrin scheme for P(z^2) with deg(P)=19. */ + svfloat64_t z2 = svmul_x (pg, z, z); +- svfloat64_t x2 = svmul_x (pg, z2, z2); +- svfloat64_t x4 = svmul_x (pg, x2, x2); +- svfloat64_t x8 = svmul_x (pg, x4, x4); ++ svfloat64_t z3 = svmul_x (pg, z2, z); ++ svfloat64_t z4 = svmul_x (pg, z2, z2); ++ svfloat64_t z8 = svmul_x (pg, z4, z4); ++ svfloat64_t z16 = svmul_x (pg, z8, z8); + +- svfloat64_t ret = svmla_x ( +- pg, sv_estrin_7_f64_x (pg, z2, x2, x4, data_ptr->poly), +- sv_estrin_11_f64_x (pg, z2, x2, x4, x8, data_ptr->poly + 8), x8); ++ /* Order-7 Estrin. */ ++ svfloat64_t c13 = svld1rq (ptrue, &d->c1); ++ svfloat64_t c57 = svld1rq (ptrue, &d->c5); + +- /* y = shift + z + z^3 * P(z^2). */ +- svfloat64_t z3 = svmul_x (pg, z2, z); +- ret = svmla_x (pg, z, z3, ret); ++ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), z2, c13, 0); ++ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), z2, c13, 1); ++ svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), z2, c57, 0); ++ svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), z2, c57, 1); ++ ++ svfloat64_t p03 = svmla_x (pg, p01, z4, p23); ++ svfloat64_t p47 = svmla_x (pg, p45, z4, p67); ++ svfloat64_t p07 = svmla_x (pg, p03, z8, p47); ++ ++ /* Order-11 Estrin. 
*/ ++ svfloat64_t c911 = svld1rq (ptrue, &d->c9); ++ svfloat64_t c1315 = svld1rq (ptrue, &d->c13); ++ svfloat64_t c1719 = svld1rq (ptrue, &d->c17); + +- ret = svadd_m (pg, ret, shift); ++ svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), z2, c911, 0); ++ svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), z2, c911, 1); ++ svfloat64_t p811 = svmla_x (pg, p89, z4, p1011); ++ ++ svfloat64_t p1213 = svmla_lane (sv_f64 (d->c12), z2, c1315, 0); ++ svfloat64_t p1415 = svmla_lane (sv_f64 (d->c14), z2, c1315, 1); ++ svfloat64_t p1215 = svmla_x (pg, p1213, z4, p1415); ++ ++ svfloat64_t p1617 = svmla_lane (sv_f64 (d->c16), z2, c1719, 0); ++ svfloat64_t p1819 = svmla_lane (sv_f64 (d->c18), z2, c1719, 1); ++ svfloat64_t p1619 = svmla_x (pg, p1617, z4, p1819); ++ ++ svfloat64_t p815 = svmla_x (pg, p811, z8, p1215); ++ svfloat64_t p819 = svmla_x (pg, p815, z16, p1619); ++ ++ svfloat64_t poly = svmla_x (pg, p07, z16, p819); ++ ++ /* y = shift + z + z^3 * P(z^2). */ ++ svfloat64_t ret = svmla_x (pg, shift, z3, poly); + + /* Account for the sign of x and y. */ + if (__glibc_unlikely (svptest_any (pg, cmp_xy))) +diff --git a/sysdeps/aarch64/fpu/atan2f_advsimd.c b/sysdeps/aarch64/fpu/atan2f_advsimd.c +index e65406f492..75d873897a 100644 +--- a/sysdeps/aarch64/fpu/atan2f_advsimd.c ++++ b/sysdeps/aarch64/fpu/atan2f_advsimd.c +@@ -18,22 +18,22 @@ + . */ + + #include "v_math.h" +-#include "poly_advsimd_f32.h" + + static const struct data + { +- float32x4_t c0, pi_over_2, c4, c6, c2; ++ float32x4_t c0, c4, c6, c2; + float c1, c3, c5, c7; + uint32x4_t comp_const; ++ float32x4_t pi; + } data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-128, 1.0]. + Generated using fpminimax between FLT_MIN and 1. */ +- .c0 = V4 (-0x1.55555p-2f), .c1 = 0x1.99935ep-3f, +- .c2 = V4 (-0x1.24051ep-3f), .c3 = 0x1.bd7368p-4f, +- .c4 = V4 (-0x1.491f0ep-4f), .c5 = 0x1.93a2c0p-5f, +- .c6 = V4 (-0x1.4c3c60p-6f), .c7 = 0x1.01fd88p-8f, +- .pi_over_2 = V4 (0x1.921fb6p+0f), .comp_const = V4 (2 * 0x7f800000lu - 1), ++ .c0 = V4 (-0x1.5554dcp-2), .c1 = 0x1.9978ecp-3, ++ .c2 = V4 (-0x1.230a94p-3), .c3 = 0x1.b4debp-4, ++ .c4 = V4 (-0x1.3550dap-4), .c5 = 0x1.61eebp-5, ++ .c6 = V4 (-0x1.0c17d4p-6), .c7 = 0x1.7ea694p-9, ++ .pi = V4 (0x1.921fb6p+1f), .comp_const = V4 (2 * 0x7f800000lu - 1), + }; + + #define SignMask v_u32 (0x80000000) +@@ -54,13 +54,13 @@ static inline uint32x4_t + zeroinfnan (uint32x4_t i, const struct data *d) + { + /* 2 * i - 1 >= 2 * 0x7f800000lu - 1. */ +- return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)), d->comp_const); ++ return vcgeq_u32 (vsubq_u32 (vshlq_n_u32 (i, 1), v_u32 (1)), d->comp_const); + } + + /* Fast implementation of vector atan2f. Maximum observed error is +- 2.95 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]: +- _ZGVnN4vv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1 +- want 0x1.967f00p-1. */ ++ 2.13 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]: ++ _ZGVnN4vv_atan2f (0x1.14a9d4p-87, 0x1.0eb886p-87) got 0x1.97aea2p-1 ++ want 0x1.97ae9ep-1. */ + float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) + { + const struct data *d = ptr_barrier (&data); +@@ -81,28 +81,31 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) + uint32x4_t pred_xlt0 = vcltzq_f32 (x); + uint32x4_t pred_aygtax = vcgtq_f32 (ay, ax); + +- /* Set up z for call to atanf. 
*/ +- float32x4_t n = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay); +- float32x4_t q = vbslq_f32 (pred_aygtax, ay, ax); +- float32x4_t z = vdivq_f32 (n, q); +- +- /* Work out the correct shift. */ ++ /* Set up z for evaluation of atanf. */ ++ float32x4_t num = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay); ++ float32x4_t den = vbslq_f32 (pred_aygtax, ay, ax); ++ float32x4_t z = vdivq_f32 (num, den); ++ ++ /* Work out the correct shift for atan2: ++ Multiplication by pi is done later. ++ -pi when x < 0 and ax < ay ++ -pi/2 when x < 0 and ax > ay ++ 0 when x >= 0 and ax < ay ++ pi/2 when x >= 0 and ax > ay. */ + float32x4_t shift = vreinterpretq_f32_u32 ( +- vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-2.0f)))); +- shift = vbslq_f32 (pred_aygtax, vaddq_f32 (shift, v_f32 (1.0f)), shift); +- shift = vmulq_f32 (shift, d->pi_over_2); +- +- /* Calculate the polynomial approximation. +- Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However, +- a standard implementation using z8 creates spurious underflow +- in the very last fma (when z^8 is small enough). +- Therefore, we split the last fma into a mul and an fma. +- Horner and single-level Estrin have higher errors that exceed +- threshold. */ ++ vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-1.0f)))); ++ float32x4_t shift2 = vreinterpretq_f32_u32 ( ++ vandq_u32 (pred_aygtax, vreinterpretq_u32_f32 (v_f32 (0.5f)))); ++ shift = vaddq_f32 (shift, shift2); ++ ++ /* Calculate the polynomial approximation. */ + float32x4_t z2 = vmulq_f32 (z, z); ++ float32x4_t z3 = vmulq_f32 (z2, z); + float32x4_t z4 = vmulq_f32 (z2, z2); ++ float32x4_t z8 = vmulq_f32 (z4, z4); + + float32x4_t c1357 = vld1q_f32 (&d->c1); ++ + float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, c1357, 0); + float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, c1357, 1); + float32x4_t p45 = vfmaq_laneq_f32 (d->c4, z2, c1357, 2); +@@ -110,10 +113,11 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) + float32x4_t p03 = vfmaq_f32 (p01, z4, p23); + float32x4_t p47 = vfmaq_f32 (p45, z4, p67); + +- float32x4_t ret = vfmaq_f32 (p03, z4, vmulq_f32 (z4, p47)); ++ float32x4_t poly = vfmaq_f32 (p03, z8, p47); + + /* y = shift + z * P(z^2). */ +- ret = vaddq_f32 (vfmaq_f32 (z, ret, vmulq_f32 (z2, z)), shift); ++ float32x4_t ret = vfmaq_f32 (z, shift, d->pi); ++ ret = vfmaq_f32 (ret, z3, poly); + + if (__glibc_unlikely (v_any_u32 (special_cases))) + { +diff --git a/sysdeps/aarch64/fpu/atan2f_sve.c b/sysdeps/aarch64/fpu/atan2f_sve.c +index 5f26e2a365..4d9341952d 100644 +--- a/sysdeps/aarch64/fpu/atan2f_sve.c ++++ b/sysdeps/aarch64/fpu/atan2f_sve.c +@@ -18,18 +18,18 @@ + . */ + + #include "sv_math.h" +-#include "poly_sve_f32.h" + + static const struct data + { +- float32_t poly[8]; ++ float32_t c0, c2, c4, c6; ++ float32_t c1, c3, c5, c7; + float32_t pi_over_2; + } data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-128, 1.0]. */ +- .poly = { -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f, +- -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f, 0x1.01fd88p-8f }, +- .pi_over_2 = 0x1.921fb6p+0f, ++ .c0 = -0x1.5554dcp-2, .c1 = 0x1.9978ecp-3, .c2 = -0x1.230a94p-3, ++ .c3 = 0x1.b4debp-4, .c4 = -0x1.3550dap-4, .c5 = 0x1.61eebp-5, ++ .c6 = -0x1.0c17d4p-6, .c7 = 0x1.7ea694p-9, .pi_over_2 = 0x1.921fb6p+0f, + }; + + /* Special cases i.e. 0, infinity, nan (fall back to scalar calls). 
*/ +@@ -51,12 +51,14 @@ zeroinfnan (svuint32_t i, const svbool_t pg) + + /* Fast implementation of SVE atan2f based on atan(x) ~ shift + z + z^3 * + P(z^2) with reduction to [0,1] using z=1/x and shift = pi/2. Maximum +- observed error is 2.95 ULP: +- _ZGVsMxvv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1 +- want 0x1.967f00p-1. */ +-svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg) ++ observed error is 2.21 ULP: ++ _ZGVnN4vv_atan2f (0x1.a04aa8p+6, 0x1.9a274p+6) got 0x1.95ed3ap-1 ++ want 0x1.95ed36p-1. */ ++svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, ++ const svbool_t pg) + { +- const struct data *data_ptr = ptr_barrier (&data); ++ const struct data *d = ptr_barrier (&data); ++ svbool_t ptrue = svptrue_b32 (); + + svuint32_t ix = svreinterpret_u32 (x); + svuint32_t iy = svreinterpret_u32 (y); +@@ -76,29 +78,42 @@ svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg) + + svbool_t pred_aygtax = svcmpgt (pg, ay, ax); + +- /* Set up z for call to atan. */ +- svfloat32_t n = svsel (pred_aygtax, svneg_x (pg, ax), ay); +- svfloat32_t d = svsel (pred_aygtax, ay, ax); +- svfloat32_t z = svdiv_x (pg, n, d); +- +- /* Work out the correct shift. */ ++ /* Set up z for evaluation of atanf. */ ++ svfloat32_t num = svsel (pred_aygtax, svneg_x (pg, ax), ay); ++ svfloat32_t den = svsel (pred_aygtax, ay, ax); ++ svfloat32_t z = svdiv_x (ptrue, num, den); ++ ++ /* Work out the correct shift for atan2: ++ Multiplication by pi is done later. ++ -pi when x < 0 and ax < ay ++ -pi/2 when x < 0 and ax > ay ++ 0 when x >= 0 and ax < ay ++ pi/2 when x >= 0 and ax > ay. */ + svfloat32_t shift = svreinterpret_f32 (svlsr_x (pg, sign_x, 1)); + shift = svsel (pred_aygtax, sv_f32 (1.0), shift); + shift = svreinterpret_f32 (svorr_x (pg, sign_x, svreinterpret_u32 (shift))); +- shift = svmul_x (pg, shift, sv_f32 (data_ptr->pi_over_2)); + + /* Use pure Estrin scheme for P(z^2) with deg(P)=7. */ +- svfloat32_t z2 = svmul_x (pg, z, z); ++ svfloat32_t z2 = svmul_x (ptrue, z, z); ++ svfloat32_t z3 = svmul_x (pg, z2, z); + svfloat32_t z4 = svmul_x (pg, z2, z2); + svfloat32_t z8 = svmul_x (pg, z4, z4); + +- svfloat32_t ret = sv_estrin_7_f32_x (pg, z2, z4, z8, data_ptr->poly); ++ svfloat32_t odd_coeffs = svld1rq (ptrue, &d->c1); + +- /* ret = shift + z + z^3 * P(z^2). */ +- svfloat32_t z3 = svmul_x (pg, z2, z); +- ret = svmla_x (pg, z, z3, ret); ++ svfloat32_t p01 = svmla_lane (sv_f32 (d->c0), z2, odd_coeffs, 0); ++ svfloat32_t p23 = svmla_lane (sv_f32 (d->c2), z2, odd_coeffs, 1); ++ svfloat32_t p45 = svmla_lane (sv_f32 (d->c4), z2, odd_coeffs, 2); ++ svfloat32_t p67 = svmla_lane (sv_f32 (d->c6), z2, odd_coeffs, 3); + +- ret = svadd_m (pg, ret, shift); ++ svfloat32_t p03 = svmla_x (pg, p01, z4, p23); ++ svfloat32_t p47 = svmla_x (pg, p45, z4, p67); ++ ++ svfloat32_t poly = svmla_x (pg, p03, z8, p47); ++ ++ /* ret = shift + z + z^3 * P(z^2). */ ++ svfloat32_t ret = svmla_x (pg, z, shift, sv_f32 (d->pi_over_2)); ++ ret = svmla_x (pg, ret, z3, poly); + + /* Account for the sign of x and y. */ + +diff --git a/sysdeps/aarch64/fpu/atan_advsimd.c b/sysdeps/aarch64/fpu/atan_advsimd.c +index f024fd1d74..da0d3715df 100644 +--- a/sysdeps/aarch64/fpu/atan_advsimd.c ++++ b/sysdeps/aarch64/fpu/atan_advsimd.c +@@ -18,7 +18,6 @@ + . */ + + #include "v_math.h" +-#include "poly_advsimd_f64.h" + + static const struct data + { +@@ -28,16 +27,16 @@ static const struct data + } data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-1022, 1.0]. 
*/ +- .c0 = V2 (-0x1.5555555555555p-2), .c1 = 0x1.99999999996c1p-3, +- .c2 = V2 (-0x1.2492492478f88p-3), .c3 = 0x1.c71c71bc3951cp-4, +- .c4 = V2 (-0x1.745d160a7e368p-4), .c5 = 0x1.3b139b6a88ba1p-4, +- .c6 = V2 (-0x1.11100ee084227p-4), .c7 = 0x1.e1d0f9696f63bp-5, +- .c8 = V2 (-0x1.aebfe7b418581p-5), .c9 = 0x1.842dbe9b0d916p-5, +- .c10 = V2 (-0x1.5d30140ae5e99p-5), .c11 = 0x1.338e31eb2fbbcp-5, +- .c12 = V2 (-0x1.00e6eece7de8p-5), .c13 = 0x1.860897b29e5efp-6, +- .c14 = V2 (-0x1.0051381722a59p-6), .c15 = 0x1.14e9dc19a4a4ep-7, +- .c16 = V2 (-0x1.d0062b42fe3bfp-9), .c17 = 0x1.17739e210171ap-10, +- .c18 = V2 (-0x1.ab24da7be7402p-13), .c19 = 0x1.358851160a528p-16, ++ .c0 = V2 (-0x1.555555555552ap-2), .c1 = 0x1.9999999995aebp-3, ++ .c2 = V2 (-0x1.24924923923f6p-3), .c3 = 0x1.c71c7184288a2p-4, ++ .c4 = V2 (-0x1.745d11fb3d32bp-4), .c5 = 0x1.3b136a18051b9p-4, ++ .c6 = V2 (-0x1.110e6d985f496p-4), .c7 = 0x1.e1bcf7f08801dp-5, ++ .c8 = V2 (-0x1.ae644e28058c3p-5), .c9 = 0x1.82eeb1fed85c6p-5, ++ .c10 = V2 (-0x1.59d7f901566cbp-5), .c11 = 0x1.2c982855ab069p-5, ++ .c12 = V2 (-0x1.eb49592998177p-6), .c13 = 0x1.69d8b396e3d38p-6, ++ .c14 = V2 (-0x1.ca980345c4204p-7), .c15 = 0x1.dc050eafde0b3p-8, ++ .c16 = V2 (-0x1.7ea70755b8eccp-9), .c17 = 0x1.ba3da3de903e8p-11, ++ .c18 = V2 (-0x1.44a4b059b6f67p-13), .c19 = 0x1.c4a45029e5a91p-17, + .pi_over_2 = V2 (0x1.921fb54442d18p+0), + }; + +@@ -47,9 +46,9 @@ static const struct data + + /* Fast implementation of vector atan. + Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using +- z=1/x and shift = pi/2. Maximum observed error is 2.27 ulps: +- _ZGVnN2v_atan (0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1 +- want 0x1.9225645bdd7c3p-1. */ ++ z=1/x and shift = pi/2. Maximum observed error is 2.45 ulps: ++ _ZGVnN2v_atan (0x1.0008d737eb3e6p+0) got 0x1.92288c551a4c1p-1 ++ want 0x1.92288c551a4c3p-1. */ + float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x) + { + const struct data *d = ptr_barrier (&data); +@@ -78,59 +77,53 @@ float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x) + y := arctan(x) for x < 1 + y := pi/2 + arctan(-1/x) for x > 1 + Hence, use z=-1/a if x>=1, otherwise z=a. */ +- uint64x2_t red = vcagtq_f64 (x, v_f64 (1.0)); ++ uint64x2_t red = vcagtq_f64 (x, v_f64 (-1.0)); + /* Avoid dependency in abs(x) in division (and comparison). */ +- float64x2_t z = vbslq_f64 (red, vdivq_f64 (v_f64 (1.0), x), x); ++ float64x2_t z = vbslq_f64 (red, vdivq_f64 (v_f64 (-1.0), x), x); ++ + float64x2_t shift = vreinterpretq_f64_u64 ( + vandq_u64 (red, vreinterpretq_u64_f64 (d->pi_over_2))); +- /* Use absolute value only when needed (odd powers of z). */ +- float64x2_t az = vbslq_f64 ( +- SignMask, vreinterpretq_f64_u64 (vandq_u64 (SignMask, red)), z); +- +- /* Calculate the polynomial approximation. +- Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of +- full scheme to avoid underflow in x^16. +- The order 19 polynomial P approximates +- (atan(sqrt(x))-sqrt(x))/x^(3/2). */ ++ ++ /* Reinsert sign bit from argument into the shift value. */ ++ shift = vreinterpretq_f64_u64 ( ++ veorq_u64 (vreinterpretq_u64_f64 (shift), sign)); ++ ++ /* Calculate polynomial approximation P(z^2) with deg(P)=19. */ + float64x2_t z2 = vmulq_f64 (z, z); +- float64x2_t x2 = vmulq_f64 (z2, z2); +- float64x2_t x4 = vmulq_f64 (x2, x2); +- float64x2_t x8 = vmulq_f64 (x4, x4); ++ float64x2_t z4 = vmulq_f64 (z2, z2); ++ float64x2_t z8 = vmulq_f64 (z4, z4); ++ float64x2_t z16 = vmulq_f64 (z8, z8); + +- /* estrin_7. */ ++ /* Order-7 Estrin. 
*/ + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); +- float64x2_t p03 = vfmaq_f64 (p01, x2, p23); ++ float64x2_t p03 = vfmaq_f64 (p01, z4, p23); + + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); +- float64x2_t p47 = vfmaq_f64 (p45, x2, p67); ++ float64x2_t p47 = vfmaq_f64 (p45, z4, p67); + +- float64x2_t p07 = vfmaq_f64 (p03, x4, p47); ++ float64x2_t p07 = vfmaq_f64 (p03, z8, p47); + +- /* estrin_11. */ ++ /* Order-11 Estrin. */ + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); +- float64x2_t p811 = vfmaq_f64 (p89, x2, p1011); ++ float64x2_t p811 = vfmaq_f64 (p89, z4, p1011); + + float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0); + float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1); +- float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415); ++ float64x2_t p1215 = vfmaq_f64 (p1213, z4, p1415); + + float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0); + float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1); +- float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819); ++ float64x2_t p1619 = vfmaq_f64 (p1617, z4, p1819); + +- float64x2_t p815 = vfmaq_f64 (p811, x4, p1215); +- float64x2_t p819 = vfmaq_f64 (p815, x8, p1619); ++ float64x2_t p815 = vfmaq_f64 (p811, z8, p1215); ++ float64x2_t p819 = vfmaq_f64 (p815, z16, p1619); + +- float64x2_t y = vfmaq_f64 (p07, p819, x8); ++ float64x2_t y = vfmaq_f64 (p07, p819, z16); + + /* Finalize. y = shift + z + z^3 * P(z^2). */ +- y = vfmaq_f64 (az, y, vmulq_f64 (z2, az)); +- y = vaddq_f64 (y, shift); +- +- /* y = atan(x) if x>0, -atan(-x) otherwise. */ +- y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), sign)); +- return y; ++ y = vfmsq_f64 (v_f64 (-1.0), z2, y); ++ return vfmsq_f64 (shift, z, y); + } +diff --git a/sysdeps/aarch64/fpu/atan_sve.c b/sysdeps/aarch64/fpu/atan_sve.c +index 3880cedff4..a6b0489cf6 100644 +--- a/sysdeps/aarch64/fpu/atan_sve.c ++++ b/sysdeps/aarch64/fpu/atan_sve.c +@@ -18,23 +18,26 @@ + . */ + + #include "sv_math.h" +-#include "poly_sve_f64.h" + + static const struct data + { +- float64_t poly[20]; +- float64_t pi_over_2; ++ float64_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18; ++ float64_t c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; ++ float64_t shift_val, neg_one; + } data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-1022, 1.0]. 
*/ +- .poly = { -0x1.5555555555555p-2, 0x1.99999999996c1p-3, -0x1.2492492478f88p-3, +- 0x1.c71c71bc3951cp-4, -0x1.745d160a7e368p-4, 0x1.3b139b6a88ba1p-4, +- -0x1.11100ee084227p-4, 0x1.e1d0f9696f63bp-5, -0x1.aebfe7b418581p-5, +- 0x1.842dbe9b0d916p-5, -0x1.5d30140ae5e99p-5, 0x1.338e31eb2fbbcp-5, +- -0x1.00e6eece7de8p-5, 0x1.860897b29e5efp-6, -0x1.0051381722a59p-6, +- 0x1.14e9dc19a4a4ep-7, -0x1.d0062b42fe3bfp-9, 0x1.17739e210171ap-10, +- -0x1.ab24da7be7402p-13, 0x1.358851160a528p-16, }, +- .pi_over_2 = 0x1.921fb54442d18p+0, ++ .c0 = -0x1.555555555552ap-2, .c1 = 0x1.9999999995aebp-3, ++ .c2 = -0x1.24924923923f6p-3, .c3 = 0x1.c71c7184288a2p-4, ++ .c4 = -0x1.745d11fb3d32bp-4, .c5 = 0x1.3b136a18051b9p-4, ++ .c6 = -0x1.110e6d985f496p-4, .c7 = 0x1.e1bcf7f08801dp-5, ++ .c8 = -0x1.ae644e28058c3p-5, .c9 = 0x1.82eeb1fed85c6p-5, ++ .c10 = -0x1.59d7f901566cbp-5, .c11 = 0x1.2c982855ab069p-5, ++ .c12 = -0x1.eb49592998177p-6, .c13 = 0x1.69d8b396e3d38p-6, ++ .c14 = -0x1.ca980345c4204p-7, .c15 = 0x1.dc050eafde0b3p-8, ++ .c16 = -0x1.7ea70755b8eccp-9, .c17 = 0x1.ba3da3de903e8p-11, ++ .c18 = -0x1.44a4b059b6f67p-13, .c19 = 0x1.c4a45029e5a91p-17, ++ .shift_val = 0x1.490fdaa22168cp+1, .neg_one = -1, + }; + + /* Useful constants. */ +@@ -43,15 +46,14 @@ static const struct data + /* Fast implementation of SVE atan. + Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using + z=1/x and shift = pi/2. Largest errors are close to 1. The maximum observed +- error is 2.27 ulps: +- _ZGVsMxv_atan (0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1 +- want 0x1.9225645bdd7c3p-1. */ ++ error is 2.08 ulps: ++ _ZGVsMxv_atan (0x1.000a7c56975e8p+0) got 0x1.922a3163e15c2p-1 ++ want 0x1.922a3163e15c4p-1. */ + svfloat64_t SV_NAME_D1 (atan) (svfloat64_t x, const svbool_t pg) + { + const struct data *d = ptr_barrier (&data); + +- /* No need to trigger special case. Small cases, infs and nans +- are supported by our approximation technique. */ ++ svbool_t ptrue = svptrue_b64 (); + svuint64_t ix = svreinterpret_u64 (x); + svuint64_t sign = svand_x (pg, ix, SignMask); + +@@ -59,32 +61,60 @@ svfloat64_t SV_NAME_D1 (atan) (svfloat64_t x, const svbool_t pg) + y := arctan(x) for x < 1 + y := pi/2 + arctan(-1/x) for x > 1 + Hence, use z=-1/a if x>=1, otherwise z=a. */ +- svbool_t red = svacgt (pg, x, 1.0); +- /* Avoid dependency in abs(x) in division (and comparison). */ +- svfloat64_t z = svsel (red, svdivr_x (pg, x, 1.0), x); +- /* Use absolute value only when needed (odd powers of z). */ +- svfloat64_t az = svabs_x (pg, z); +- az = svneg_m (az, red, az); ++ svbool_t red = svacgt (pg, x, d->neg_one); ++ svfloat64_t z = svsel (red, svdiv_x (pg, sv_f64 (d->neg_one), x), x); ++ ++ /* Reuse of -1.0f to reduce constant loads, ++ We need a shift value of 1/2, which is created via -1 + (1 + 1/2). */ ++ svfloat64_t shift ++ = svadd_z (red, sv_f64 (d->neg_one), sv_f64 (d->shift_val)); ++ ++ /* Reinserts the sign bit of the argument to handle the case of x < -1. */ ++ shift = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (shift), sign)); + + /* Use split Estrin scheme for P(z^2) with deg(P)=19. 
*/ +- svfloat64_t z2 = svmul_x (pg, z, z); +- svfloat64_t x2 = svmul_x (pg, z2, z2); +- svfloat64_t x4 = svmul_x (pg, x2, x2); +- svfloat64_t x8 = svmul_x (pg, x4, x4); ++ svfloat64_t z2 = svmul_x (ptrue, z, z); ++ svfloat64_t z4 = svmul_x (ptrue, z2, z2); ++ svfloat64_t z8 = svmul_x (ptrue, z4, z4); ++ svfloat64_t z16 = svmul_x (ptrue, z8, z8); + +- svfloat64_t y +- = svmla_x (pg, sv_estrin_7_f64_x (pg, z2, x2, x4, d->poly), +- sv_estrin_11_f64_x (pg, z2, x2, x4, x8, d->poly + 8), x8); ++ /* Order-7 Estrin. */ ++ svfloat64_t c13 = svld1rq (ptrue, &d->c1); ++ svfloat64_t c57 = svld1rq (ptrue, &d->c5); + +- /* y = shift + z + z^3 * P(z^2). */ +- svfloat64_t z3 = svmul_x (pg, z2, az); +- y = svmla_x (pg, az, z3, y); ++ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), z2, c13, 0); ++ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), z2, c13, 1); ++ svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), z2, c57, 0); ++ svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), z2, c57, 1); ++ ++ svfloat64_t p03 = svmla_x (pg, p01, z4, p23); ++ svfloat64_t p47 = svmla_x (pg, p45, z4, p67); ++ svfloat64_t p07 = svmla_x (pg, p03, z8, p47); ++ ++ /* Order-11 Estrin. */ ++ svfloat64_t c911 = svld1rq (ptrue, &d->c9); ++ svfloat64_t c1315 = svld1rq (ptrue, &d->c13); ++ svfloat64_t c1719 = svld1rq (ptrue, &d->c17); + +- /* Apply shift as indicated by `red` predicate. */ +- y = svadd_m (red, y, d->pi_over_2); ++ svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), z2, c911, 0); ++ svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), z2, c911, 1); ++ svfloat64_t p811 = svmla_x (pg, p89, z4, p1011); + +- /* y = atan(x) if x>0, -atan(-x) otherwise. */ +- y = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)); ++ svfloat64_t p1213 = svmla_lane (sv_f64 (d->c12), z2, c1315, 0); ++ svfloat64_t p1415 = svmla_lane (sv_f64 (d->c14), z2, c1315, 1); ++ svfloat64_t p1215 = svmla_x (pg, p1213, z4, p1415); + +- return y; ++ svfloat64_t p1617 = svmla_lane (sv_f64 (d->c16), z2, c1719, 0); ++ svfloat64_t p1819 = svmla_lane (sv_f64 (d->c18), z2, c1719, 1); ++ svfloat64_t p1619 = svmla_x (pg, p1617, z4, p1819); ++ ++ svfloat64_t p815 = svmla_x (pg, p811, z8, p1215); ++ svfloat64_t p819 = svmla_x (pg, p815, z16, p1619); ++ ++ svfloat64_t y = svmla_x (pg, p07, z16, p819); ++ ++ /* y = shift + z + z^3 * P(z^2). */ ++ shift = svadd_m (red, z, shift); ++ y = svmul_x (pg, z2, y); ++ return svmla_x (pg, shift, z, y); + } +diff --git a/sysdeps/aarch64/fpu/atanf_advsimd.c b/sysdeps/aarch64/fpu/atanf_advsimd.c +index 472865ed74..817a47ef3e 100644 +--- a/sysdeps/aarch64/fpu/atanf_advsimd.c ++++ b/sysdeps/aarch64/fpu/atanf_advsimd.c +@@ -22,26 +22,35 @@ + + static const struct data + { ++ uint32x4_t sign_mask, pi_over_2; ++ float32x4_t neg_one; ++#if WANT_SIMD_EXCEPT + float32x4_t poly[8]; +- float32x4_t pi_over_2; ++} data = { ++ .poly = { V4 (-0x1.5554dcp-2), V4 (0x1.9978ecp-3), V4 (-0x1.230a94p-3), ++ V4 (0x1.b4debp-4), V4 (-0x1.3550dap-4), V4 (0x1.61eebp-5), ++ V4 (-0x1.0c17d4p-6), V4 (0x1.7ea694p-9) }, ++#else ++ float32x4_t c0, c2, c4, c6; ++ float c1, c3, c5, c7; + } data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-128, 1.0]. + Generated using fpminimax between FLT_MIN and 1. 
*/ +- .poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f), +- V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f), +- V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) }, +- .pi_over_2 = V4 (0x1.921fb6p+0f), ++ .c0 = V4 (-0x1.5554dcp-2), .c1 = 0x1.9978ecp-3, ++ .c2 = V4 (-0x1.230a94p-3), .c3 = 0x1.b4debp-4, ++ .c4 = V4 (-0x1.3550dap-4), .c5 = 0x1.61eebp-5, ++ .c6 = V4 (-0x1.0c17d4p-6), .c7 = 0x1.7ea694p-9, ++#endif ++ .pi_over_2 = V4 (0x3fc90fdb), ++ .neg_one = V4 (-1.0f), ++ .sign_mask = V4 (0x80000000), + }; + +-#define SignMask v_u32 (0x80000000) +- +-#define P(i) d->poly[i] +- ++#if WANT_SIMD_EXCEPT + #define TinyBound 0x30800000 /* asuint(0x1p-30). */ + #define BigBound 0x4e800000 /* asuint(0x1p30). */ + +-#if WANT_SIMD_EXCEPT + static float32x4_t VPCS_ATTR NOINLINE + special_case (float32x4_t x, float32x4_t y, uint32x4_t special) + { +@@ -51,19 +60,20 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special) + + /* Fast implementation of vector atanf based on + atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] +- using z=-1/x and shift = pi/2. Maximum observed error is 2.9ulps: +- _ZGVnN4v_atanf (0x1.0468f6p+0) got 0x1.967f06p-1 want 0x1.967fp-1. */ ++ using z=-1/x and shift = pi/2. Maximum observed error is 2.02 ulps: ++ _ZGVnN4v_atanf (0x1.03d4cep+0) got 0x1.95ed3ap-1 ++ want 0x1.95ed36p-1. */ + float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atan) (float32x4_t x) + { + const struct data *d = ptr_barrier (&data); + +- /* Small cases, infs and nans are supported by our approximation technique, +- but do not set fenv flags correctly. Only trigger special case if we need +- fenv. */ + uint32x4_t ix = vreinterpretq_u32_f32 (x); +- uint32x4_t sign = vandq_u32 (ix, SignMask); ++ uint32x4_t sign = vandq_u32 (ix, d->sign_mask); + + #if WANT_SIMD_EXCEPT ++ /* Small cases, infs and nans are supported by our approximation technique, ++ but do not set fenv flags correctly. Only trigger special case if we need ++ fenv. */ + uint32x4_t ia = vandq_u32 (ix, v_u32 (0x7ff00000)); + uint32x4_t special = vcgtq_u32 (vsubq_u32 (ia, v_u32 (TinyBound)), + v_u32 (BigBound - TinyBound)); +@@ -71,41 +81,52 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atan) (float32x4_t x) + if (__glibc_unlikely (v_any_u32 (special))) + return special_case (x, x, v_u32 (-1)); + #endif +- + /* Argument reduction: +- y := arctan(x) for x < 1 +- y := pi/2 + arctan(-1/x) for x > 1 +- Hence, use z=-1/a if x>=1, otherwise z=a. */ +- uint32x4_t red = vcagtq_f32 (x, v_f32 (1.0)); +- /* Avoid dependency in abs(x) in division (and comparison). */ +- float32x4_t z = vbslq_f32 (red, vdivq_f32 (v_f32 (1.0f), x), x); ++ y := arctan(x) for |x| < 1 ++ y := arctan(-1/x) + pi/2 for x > +1 ++ y := arctan(-1/x) - pi/2 for x < -1 ++ Hence, use z=-1/a if x>=|-1|, otherwise z=a. */ ++ uint32x4_t red = vcagtq_f32 (x, d->neg_one); ++ ++ float32x4_t z = vbslq_f32 (red, vdivq_f32 (d->neg_one, x), x); ++ ++ /* Shift is calculated as +-pi/2 or 0, depending on the argument case. */ + float32x4_t shift = vreinterpretq_f32_u32 ( +- vandq_u32 (red, vreinterpretq_u32_f32 (d->pi_over_2))); +- /* Use absolute value only when needed (odd powers of z). */ +- float32x4_t az = vbslq_f32 ( +- SignMask, vreinterpretq_f32_u32 (vandq_u32 (SignMask, red)), z); ++ vandq_u32 (red, veorq_u32 (d->pi_over_2, sign))); ++ ++ float32x4_t z2 = vmulq_f32 (z, z); ++ float32x4_t z3 = vmulq_f32 (z, z2); ++ float32x4_t z4 = vmulq_f32 (z2, z2); ++#if WANT_SIMD_EXCEPT + + /* Calculate the polynomial approximation. 
+ Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However, + a standard implementation using z8 creates spurious underflow + in the very last fma (when z^8 is small enough). +- Therefore, we split the last fma into a mul and an fma. +- Horner and single-level Estrin have higher errors that exceed +- threshold. */ +- float32x4_t z2 = vmulq_f32 (z, z); +- float32x4_t z4 = vmulq_f32 (z2, z2); +- ++ Therefore, we split the last fma into a mul and an fma. */ + float32x4_t y = vfmaq_f32 ( + v_pairwise_poly_3_f32 (z2, z4, d->poly), z4, + vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, d->poly + 4))); + +- /* y = shift + z * P(z^2). */ +- y = vaddq_f32 (vfmaq_f32 (az, y, vmulq_f32 (z2, az)), shift); ++#else ++ float32x4_t z8 = vmulq_f32 (z4, z4); ++ ++ /* Uses an Estrin scheme for polynomial approximation. */ ++ float32x4_t odd_coeffs = vld1q_f32 (&d->c1); ++ ++ float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, odd_coeffs, 0); ++ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, odd_coeffs, 1); ++ float32x4_t p45 = vfmaq_laneq_f32 (d->c4, z2, odd_coeffs, 2); ++ float32x4_t p67 = vfmaq_laneq_f32 (d->c6, z2, odd_coeffs, 3); + +- /* y = atan(x) if x>0, -atan(-x) otherwise. */ +- y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), sign)); ++ float32x4_t p03 = vfmaq_f32 (p01, z4, p23); ++ float32x4_t p47 = vfmaq_f32 (p45, z4, p67); + +- return y; ++ float32x4_t y = vfmaq_f32 (p03, z8, p47); ++#endif ++ ++ /* y = shift + z * P(z^2). */ ++ return vfmaq_f32 (vaddq_f32 (shift, z), z3, y); + } + libmvec_hidden_def (V_NAME_F1 (atan)) + HALF_WIDTH_ALIAS_F1 (atan) +diff --git a/sysdeps/aarch64/fpu/atanf_sve.c b/sysdeps/aarch64/fpu/atanf_sve.c +index 3a98d70c50..6558223e41 100644 +--- a/sysdeps/aarch64/fpu/atanf_sve.c ++++ b/sysdeps/aarch64/fpu/atanf_sve.c +@@ -18,18 +18,26 @@ + . */ + + #include "sv_math.h" +-#include "poly_sve_f32.h" + + static const struct data + { +- float32_t poly[8]; +- float32_t pi_over_2; ++ float32_t c1, c3, c5, c7; ++ float32_t c0, c2, c4, c6; ++ float32_t shift_val, neg_one; + } data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-128, 1.0]. */ +- .poly = { -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f, +- -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f, 0x1.01fd88p-8f }, +- .pi_over_2 = 0x1.921fb6p+0f, ++ .c0 = -0x1.5554dcp-2, ++ .c1 = 0x1.9978ecp-3, ++ .c2 = -0x1.230a94p-3, ++ .c3 = 0x1.b4debp-4, ++ .c4 = -0x1.3550dap-4, ++ .c5 = 0x1.61eebp-5, ++ .c6 = -0x1.0c17d4p-6, ++ .c7 = 0x1.7ea694p-9, ++ /* pi/2, used as a shift value after reduction. */ ++ .shift_val = 0x1.921fb54442d18p+0, ++ .neg_one = -1.0f, + }; + + #define SignMask (0x80000000) +@@ -37,43 +45,49 @@ static const struct data + /* Fast implementation of SVE atanf based on + atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using + z=-1/x and shift = pi/2. +- Largest observed error is 2.9 ULP, close to +/-1.0: +- _ZGVsMxv_atanf (0x1.0468f6p+0) got -0x1.967f06p-1 +- want -0x1.967fp-1. */ ++ Largest observed error is 2.12 ULP: ++ _ZGVsMxv_atanf (0x1.03d4cep+0) got 0x1.95ed3ap-1 ++ want 0x1.95ed36p-1. */ + svfloat32_t SV_NAME_F1 (atan) (svfloat32_t x, const svbool_t pg) + { + const struct data *d = ptr_barrier (&data); ++ svbool_t ptrue = svptrue_b32 (); + + /* No need to trigger special case. Small cases, infs and nans + are supported by our approximation technique. 
*/ + svuint32_t ix = svreinterpret_u32 (x); +- svuint32_t sign = svand_x (pg, ix, SignMask); ++ svuint32_t sign = svand_x (ptrue, ix, SignMask); + + /* Argument reduction: + y := arctan(x) for x < 1 +- y := pi/2 + arctan(-1/x) for x > 1 +- Hence, use z=-1/a if x>=1, otherwise z=a. */ +- svbool_t red = svacgt (pg, x, 1.0f); +- /* Avoid dependency in abs(x) in division (and comparison). */ +- svfloat32_t z = svsel (red, svdiv_x (pg, sv_f32 (1.0f), x), x); +- /* Use absolute value only when needed (odd powers of z). */ +- svfloat32_t az = svabs_x (pg, z); +- az = svneg_m (az, red, az); +- +- /* Use split Estrin scheme for P(z^2) with deg(P)=7. */ +- svfloat32_t z2 = svmul_x (pg, z, z); +- svfloat32_t z4 = svmul_x (pg, z2, z2); +- svfloat32_t z8 = svmul_x (pg, z4, z4); +- +- svfloat32_t y = sv_estrin_7_f32_x (pg, z2, z4, z8, d->poly); +- +- /* y = shift + z + z^3 * P(z^2). */ +- svfloat32_t z3 = svmul_x (pg, z2, az); +- y = svmla_x (pg, az, z3, y); +- +- /* Apply shift as indicated by 'red' predicate. */ +- y = svadd_m (red, y, sv_f32 (d->pi_over_2)); +- +- /* y = atan(x) if x>0, -atan(-x) otherwise. */ +- return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign)); ++ y := arctan(-1/x) + pi/2 for x > +1 ++ y := arctan(-1/x) - pi/2 for x < -1 ++ Hence, use z=-1/a if |x|>=|-1|, otherwise z=a. */ ++ svbool_t red = svacgt (pg, x, d->neg_one); ++ svfloat32_t z = svsel (red, svdiv_x (pg, sv_f32 (d->neg_one), x), x); ++ ++ /* Reinserts the sign bit of the argument to handle the case of x < -1. */ ++ svfloat32_t shift = svreinterpret_f32 ( ++ sveor_x (red, svreinterpret_u32 (sv_f32 (d->shift_val)), sign)); ++ ++ svfloat32_t z2 = svmul_x (ptrue, z, z); ++ svfloat32_t z3 = svmul_x (ptrue, z2, z); ++ svfloat32_t z4 = svmul_x (ptrue, z2, z2); ++ svfloat32_t z8 = svmul_x (ptrue, z4, z4); ++ ++ svfloat32_t odd_coeffs = svld1rq (ptrue, &d->c1); ++ ++ svfloat32_t p01 = svmla_lane (sv_f32 (d->c0), z2, odd_coeffs, 0); ++ svfloat32_t p23 = svmla_lane (sv_f32 (d->c2), z2, odd_coeffs, 1); ++ svfloat32_t p45 = svmla_lane (sv_f32 (d->c4), z2, odd_coeffs, 2); ++ svfloat32_t p67 = svmla_lane (sv_f32 (d->c6), z2, odd_coeffs, 3); ++ ++ svfloat32_t p03 = svmla_x (pg, p01, z4, p23); ++ svfloat32_t p47 = svmla_x (pg, p45, z4, p67); ++ ++ svfloat32_t y = svmla_x (pg, p03, z8, p47); ++ ++ /* shift + z + z^3 * P(z^2). */ ++ shift = svadd_m (red, z, shift); ++ return svmla_x (pg, shift, z3, y); + } +diff --git a/sysdeps/aarch64/fpu/atanh_sve.c b/sysdeps/aarch64/fpu/atanh_sve.c +index 16a7cf6aa7..958d69a5f5 100644 +--- a/sysdeps/aarch64/fpu/atanh_sve.c ++++ b/sysdeps/aarch64/fpu/atanh_sve.c +@@ -30,7 +30,7 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t special) + } + + /* SVE approximation for double-precision atanh, based on log1p. +- The greatest observed error is 2.81 ULP: ++ The greatest observed error is 3.3 ULP: + _ZGVsMxv_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6 + want 0x1.ffd8ff31b501cp-6. */ + svfloat64_t SV_NAME_D1 (atanh) (svfloat64_t x, const svbool_t pg) +@@ -42,7 +42,6 @@ svfloat64_t SV_NAME_D1 (atanh) (svfloat64_t x, const svbool_t pg) + svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, Half)); + + /* It is special if iax >= 1. 
*/ +-// svbool_t special = svcmpge (pg, iax, One); + svbool_t special = svacge (pg, x, 1.0); + + /* Computation is performed based on the following sequence of equality: diff --git a/sysdeps/aarch64/fpu/cosh_sve.c b/sysdeps/aarch64/fpu/cosh_sve.c -index ca44053535..77e58e123e 100644 +index ca44053535..f5a163b054 100644 --- a/sysdeps/aarch64/fpu/cosh_sve.c +++ b/sysdeps/aarch64/fpu/cosh_sve.c -@@ -23,7 +23,7 @@ static const struct data +@@ -21,69 +21,99 @@ + + static const struct data { - float64_t poly[3]; - float64_t inv_ln2, ln2_hi, ln2_lo, shift, thres; +- float64_t poly[3]; +- float64_t inv_ln2, ln2_hi, ln2_lo, shift, thres; - uint64_t index_mask, special_bound; ++ double c0, c2; ++ double c1, c3; ++ float64_t inv_ln2, ln2_hi, ln2_lo, shift; + uint64_t special_bound; } data = { - .poly = { 0x1.fffffffffffd4p-2, 0x1.5555571d6b68cp-3, - 0x1.5555576a59599p-5, }, -@@ -35,14 +35,16 @@ static const struct data - .shift = 0x1.8p+52, - .thres = 704.0, - +- .poly = { 0x1.fffffffffffd4p-2, 0x1.5555571d6b68cp-3, +- 0x1.5555576a59599p-5, }, +- +- .inv_ln2 = 0x1.71547652b82fep8, /* N/ln2. */ +- /* -ln2/N. */ +- .ln2_hi = -0x1.62e42fefa39efp-9, +- .ln2_lo = -0x1.abc9e3b39803f3p-64, +- .shift = 0x1.8p+52, +- .thres = 704.0, +- - .index_mask = 0xff, - /* 0x1.6p9, above which exp overflows. */ - .special_bound = 0x4086000000000000, +- /* 0x1.6p9, above which exp overflows. */ +- .special_bound = 0x4086000000000000, ++ /* Generated using Remez, in [-log(2)/128, log(2)/128]. */ ++ .c0 = 0x1.fffffffffdbcdp-2, ++ .c1 = 0x1.555555555444cp-3, ++ .c2 = 0x1.555573c6a9f7dp-5, ++ .c3 = 0x1.1111266d28935p-7, ++ .ln2_hi = 0x1.62e42fefa3800p-1, ++ .ln2_lo = 0x1.ef35793c76730p-45, ++ /* 1/ln2. */ ++ .inv_ln2 = 0x1.71547652b82fep+0, ++ .shift = 0x1.800000000ff80p+46, /* 1.5*2^46+1022. */ ++ ++ /* asuint(ln(2^(1024 - 1/128))), the value above which exp overflows. */ ++ .special_bound = 0x40862e37e7d8ba72, }; - static svfloat64_t NOINLINE +-static svfloat64_t NOINLINE -special_case (svfloat64_t x, svfloat64_t y, svbool_t special) -+special_case (svfloat64_t x, svbool_t pg, svfloat64_t t, svbool_t special) +-{ +- return sv_call_f64 (cosh, x, y, special); +-} +- +-/* Helper for approximating exp(x). Copied from sv_exp_tail, with no +- special-case handling or tail. */ ++/* Helper for approximating exp(x)/2. ++ Functionally identical to FEXPA exp(x), but an adjustment in ++ the shift value which leads to a reduction in the exponent of scale by 1, ++ thus halving the result at no cost. */ + static inline svfloat64_t +-exp_inline (svfloat64_t x, const svbool_t pg, const struct data *d) ++exp_over_two_inline (const svbool_t pg, svfloat64_t x, const struct data *d) { -+ svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5); -+ svfloat64_t half_over_t = svdivr_x (pg, t, 0.5); -+ svfloat64_t y = svadd_x (pg, half_t, half_over_t); - return sv_call_f64 (cosh, x, y, special); - } + /* Calculate exp(x). 
*/ + svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2); ++ svuint64_t u = svreinterpret_u64 (z); + svfloat64_t n = svsub_x (pg, z, d->shift); -@@ -60,12 +62,12 @@ exp_inline (svfloat64_t x, const svbool_t pg, const struct data *d) +- svfloat64_t r = svmla_x (pg, x, n, d->ln2_hi); +- r = svmla_x (pg, r, n, d->ln2_lo); ++ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1); ++ svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi); - svuint64_t u = svreinterpret_u64 (z); - svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS); +- svuint64_t u = svreinterpret_u64 (z); +- svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS); - svuint64_t i = svand_x (pg, u, d->index_mask); -+ svuint64_t i = svand_x (svptrue_b64 (), u, 0xff); ++ svfloat64_t r = x; ++ r = svmls_lane (r, n, ln2, 0); ++ r = svmls_lane (r, n, ln2, 1); - svfloat64_t y = svmla_x (pg, sv_f64 (d->poly[1]), r, d->poly[2]); - y = svmla_x (pg, sv_f64 (d->poly[0]), r, y); - y = svmla_x (pg, sv_f64 (1.0), r, y); +- svfloat64_t y = svmla_x (pg, sv_f64 (d->poly[1]), r, d->poly[2]); +- y = svmla_x (pg, sv_f64 (d->poly[0]), r, y); +- y = svmla_x (pg, sv_f64 (1.0), r, y); - y = svmul_x (pg, r, y); -+ y = svmul_x (svptrue_b64 (), r, y); ++ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); ++ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0); ++ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1); ++ svfloat64_t p04 = svmla_x (pg, p01, p23, r2); ++ svfloat64_t p = svmla_x (pg, r, p04, r2); + +- /* s = 2^(n/N). */ +- u = svld1_gather_index (pg, __v_exp_tail_data, i); +- svfloat64_t s = svreinterpret_f64 (svadd_x (pg, u, e)); ++ svfloat64_t scale = svexpa (u); + +- return svmla_x (pg, s, s, y); ++ return svmla_x (pg, scale, scale, p); ++} ++ ++/* Vectorised special case to handle values past where exp_inline overflows. ++ Halves the input value and uses the identity exp(x) = exp(x/2)^2 to double ++ the valid range of inputs, and returns inf for anything past that. */ ++static svfloat64_t NOINLINE ++special_case (svbool_t pg, svbool_t special, svfloat64_t ax, svfloat64_t t, ++ const struct data *d) ++{ ++ /* Finish fast path to compute values for non-special cases. */ ++ svfloat64_t inv_twoexp = svdivr_x (pg, t, 0.25); ++ svfloat64_t y = svadd_x (pg, t, inv_twoexp); ++ ++ /* Halves input value, and then check if any cases ++ are still going to overflow. */ ++ ax = svmul_x (special, ax, 0.5); ++ svbool_t is_safe ++ = svcmplt (special, svreinterpret_u64 (ax), d->special_bound); ++ ++ /* Computes exp(x/2), and sets any overflowing lanes to inf. */ ++ svfloat64_t half_exp = exp_over_two_inline (special, ax, d); ++ half_exp = svsel (is_safe, half_exp, sv_f64 (INFINITY)); ++ ++ /* Construct special case cosh(x) = (exp(x/2)^2)/2. */ ++ svfloat64_t exp = svmul_x (svptrue_b64 (), half_exp, 2); ++ svfloat64_t special_y = svmul_x (special, exp, half_exp); ++ ++ /* Select correct return values for special and non-special cases. */ ++ special_y = svsel (special, special_y, y); ++ ++ /* Ensure an input of nan is correctly propagated. */ ++ svbool_t is_nan ++ = svcmpgt (special, svreinterpret_u64 (ax), sv_u64 (0x7ff0000000000000)); ++ return svsel (is_nan, ax, svsel (special, special_y, y)); + } + + /* Approximation for SVE double-precision cosh(x) using exp_inline. + cosh(x) = (exp(x) + exp(-x)) / 2. +- The greatest observed error is in the scalar fall-back region, so is the +- same as the scalar routine, 1.93 ULP: +- _ZGVsMxv_cosh (0x1.628ad45039d2fp+9) got 0x1.fd774e958236dp+1021 +- want 0x1.fd774e958236fp+1021. 
+- +- The greatest observed error in the non-special region is 1.54 ULP: +- _ZGVsMxv_cosh (0x1.ba5651dd4486bp+2) got 0x1.f5e2bb8d5c98fp+8 +- want 0x1.f5e2bb8d5c991p+8. */ ++ The greatest observed error in special case region is 2.66 + 0.5 ULP: ++ _ZGVsMxv_cosh (0x1.633b532ffbc1ap+9) got 0x1.f9b2d3d22399ep+1023 ++ want 0x1.f9b2d3d22399bp+1023 ++ ++ The greatest observed error in the non-special region is 1.01 + 0.5 ULP: ++ _ZGVsMxv_cosh (0x1.998ecbb3c1f81p+1) got 0x1.890b225657f84p+3 ++ want 0x1.890b225657f82p+3. */ + svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg) + { + const struct data *d = ptr_barrier (&data); +@@ -92,14 +122,13 @@ svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg) + svbool_t special = svcmpgt (pg, svreinterpret_u64 (ax), d->special_bound); - /* s = 2^(n/N). */ - u = svld1_gather_index (pg, __v_exp_tail_data, i); -@@ -94,12 +96,12 @@ svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg) /* Up to the point that exp overflows, we can use it to calculate cosh by - exp(|x|) / 2 + 1 / (2 * exp(|x|)). */ - svfloat64_t t = exp_inline (ax, pg, d); +- exp(|x|) / 2 + 1 / (2 * exp(|x|)). */ +- svfloat64_t t = exp_inline (ax, pg, d); - svfloat64_t half_t = svmul_x (pg, t, 0.5); - svfloat64_t half_over_t = svdivr_x (pg, t, 0.5); ++ (exp(|x|)/2 + 1) / (2 * exp(|x|)). */ ++ svfloat64_t half_exp = exp_over_two_inline (pg, ax, d); - /* Fall back to scalar for any special cases. */ +- /* Fall back to scalar for any special cases. */ ++ /* Falls back to entirely standalone vectorized special case. */ if (__glibc_unlikely (svptest_any (pg, special))) - return special_case (x, svadd_x (pg, half_t, half_over_t), special); -+ return special_case (x, pg, t, special); ++ return special_case (pg, special, ax, half_exp, d); -+ svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5); -+ svfloat64_t half_over_t = svdivr_x (pg, t, 0.5); - return svadd_x (pg, half_t, half_over_t); +- return svadd_x (pg, half_t, half_over_t); ++ svfloat64_t inv_twoexp = svdivr_x (pg, half_exp, 0.25); ++ return svadd_x (pg, half_exp, inv_twoexp); } +diff --git a/sysdeps/aarch64/fpu/coshf_sve.c b/sysdeps/aarch64/fpu/coshf_sve.c +index fb8e06cf73..8056055418 100644 +--- a/sysdeps/aarch64/fpu/coshf_sve.c ++++ b/sysdeps/aarch64/fpu/coshf_sve.c +@@ -39,9 +39,9 @@ special_case (svfloat32_t x, svfloat32_t half_e, svfloat32_t half_over_e, + } + + /* Single-precision vector cosh, using vector expf. +- Maximum error is 2.77 ULP: +- _ZGVsMxv_coshf(-0x1.5b38f4p+1) got 0x1.e45946p+2 +- want 0x1.e4594cp+2. */ ++ Maximum error is 2.56 +0.5 ULP: ++ _ZGVsMxv_coshf(-0x1.5b40f4p+1) got 0x1.e47748p+2 ++ want 0x1.e4774ep+2. */ + svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg) + { + const struct data *d = ptr_barrier (&data); diff --git a/sysdeps/aarch64/fpu/erfcf_sve.c b/sysdeps/aarch64/fpu/erfcf_sve.c index 2743f9dbb5..b57ab514b7 100644 --- a/sysdeps/aarch64/fpu/erfcf_sve.c @@ -2767,25 +5078,118 @@ /* Assemble result as exp10(x) = 2^n * exp10(r). If |x| > SpecialBound multiplication may overflow, so use special case routine. */ +diff --git a/sysdeps/aarch64/fpu/exp10f_sve.c b/sysdeps/aarch64/fpu/exp10f_sve.c +index 1a74db265c..f3e7f8b4f6 100644 +--- a/sysdeps/aarch64/fpu/exp10f_sve.c ++++ b/sysdeps/aarch64/fpu/exp10f_sve.c +@@ -19,26 +19,19 @@ + + #include "sv_math.h" + +-/* For x < -Thres, the result is subnormal and not handled correctly by +- FEXPA. */ +-#define Thres 37.9 ++/* For x < -Thres (-log10(2^126)), the result is subnormal and not handled ++ correctly by FEXPA. 
*/ ++#define Thres 0x1.2f702p+5 + + static const struct data + { +- float log2_10_lo, c0, c2, c4; +- float c1, c3, log10_2; +- float shift, log2_10_hi, thres; ++ float log10_2, log2_10_hi, log2_10_lo, c1; ++ float c0, shift, thres; + } data = { + /* Coefficients generated using Remez algorithm with minimisation of relative +- error. +- rel error: 0x1.89dafa3p-24 +- abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2] +- maxerr: 0.52 +0.5 ulp. */ +- .c0 = 0x1.26bb16p+1f, +- .c1 = 0x1.5350d2p+1f, +- .c2 = 0x1.04744ap+1f, +- .c3 = 0x1.2d8176p+0f, +- .c4 = 0x1.12b41ap-1f, ++ error. */ ++ .c0 = 0x1.26bb62p1, ++ .c1 = 0x1.53524cp1, + /* 1.5*2^17 + 127, a shift value suitable for FEXPA. */ + .shift = 0x1.803f8p17f, + .log10_2 = 0x1.a934fp+1, +@@ -53,28 +46,23 @@ sv_exp10f_inline (svfloat32_t x, const svbool_t pg, const struct data *d) + /* exp10(x) = 2^(n/N) * 10^r = 2^n * (1 + poly (r)), + with poly(r) in [1/sqrt(2), sqrt(2)] and + x = r + n * log10(2) / N, with r in [-log10(2)/2N, log10(2)/2N]. */ +- +- svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->log2_10_lo); ++ svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->log10_2); + + /* n = round(x/(log10(2)/N)). */ + svfloat32_t shift = sv_f32 (d->shift); +- svfloat32_t z = svmad_x (pg, sv_f32 (d->log10_2), x, shift); +- svfloat32_t n = svsub_x (svptrue_b32 (), z, shift); ++ svfloat32_t z = svmla_lane (shift, x, lane_consts, 0); ++ svfloat32_t n = svsub_x (pg, z, shift); + + /* r = x - n*log10(2)/N. */ +- svfloat32_t r = svmsb_x (pg, sv_f32 (d->log2_10_hi), n, x); +- r = svmls_lane (r, n, lane_consts, 0); ++ svfloat32_t r = x; ++ r = svmls_lane (r, n, lane_consts, 1); ++ r = svmls_lane (r, n, lane_consts, 2); + + svfloat32_t scale = svexpa (svreinterpret_u32 (z)); + + /* Polynomial evaluation: poly(r) ~ exp10(r)-1. */ +- svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2); +- svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3); +- svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); +- svfloat32_t p14 = svmla_x (pg, p12, p34, r2); +- svfloat32_t p0 = svmul_lane (r, lane_consts, 1); +- svfloat32_t poly = svmla_x (pg, p0, r2, p14); +- ++ svfloat32_t poly = svmla_lane (sv_f32 (d->c0), r, lane_consts, 3); ++ poly = svmul_x (pg, poly, r); + return svmla_x (pg, scale, scale, poly); + } + +@@ -85,11 +73,10 @@ special_case (svfloat32_t x, svbool_t special, const struct data *d) + special); + } + +-/* Single-precision SVE exp10f routine. Implements the same algorithm +- as AdvSIMD exp10f. +- Worst case error is 1.02 ULPs. +- _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1 +- want 0x1.ba5f9cp-1. */ ++/* Single-precision SVE exp10f routine. Based on the FEXPA instruction. ++ Worst case error is 1.10 ULP. ++ _ZGVsMxv_exp10f (0x1.cc76dep+3) got 0x1.be0172p+47 ++ want 0x1.be017p+47. */ + svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg) + { + const struct data *d = ptr_barrier (&data); diff --git a/sysdeps/aarch64/fpu/exp2_sve.c b/sysdeps/aarch64/fpu/exp2_sve.c -index a37c33092a..6db85266ca 100644 +index a37c33092a..c135852532 100644 --- a/sysdeps/aarch64/fpu/exp2_sve.c +++ b/sysdeps/aarch64/fpu/exp2_sve.c -@@ -18,7 +18,6 @@ +@@ -18,25 +18,22 @@ . 
*/ #include "sv_math.h" -#include "poly_sve_f64.h" +- +-#define N (1 << V_EXP_TABLE_BITS) - #define N (1 << V_EXP_TABLE_BITS) - -@@ -27,15 +26,15 @@ + #define BigBound 1022 + #define UOFlowBound 1280 static const struct data { - double poly[4]; -+ double c0, c2; -+ double c1, c3; ++ double c2, c4; ++ double c0, c1, c3; double shift, big_bound, uoflow_bound; } data = { /* Coefficients are computed using Remez algorithm with @@ -2794,15 +5198,20 @@ - 0x1.3b2abf5571ad8p-7 }, - .shift = 0x1.8p52 / N, - .uoflow_bound = UOFlowBound, -+ .c0 = 0x1.62e42fefa3686p-1, .c1 = 0x1.ebfbdff82c241p-3, -+ .c2 = 0x1.c6b09b16de99ap-5, .c3 = 0x1.3b2abf5571ad8p-7, -+ .shift = 0x1.8p52 / N, .uoflow_bound = UOFlowBound, - .big_bound = BigBound, +- .big_bound = BigBound, ++ .c0 = 0x1.62e42fefa39efp-1, .c1 = 0x1.ebfbdff82a31bp-3, ++ .c2 = 0x1.c6b08d706c8a5p-5, .c3 = 0x1.3b2ad2ff7d2f3p-7, ++ .c4 = 0x1.5d8761184beb3p-10, .shift = 0x1.800000000ffc0p+46, ++ .uoflow_bound = UOFlowBound, .big_bound = BigBound, }; -@@ -67,9 +66,9 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n, + #define SpecialOffset 0x6000000000000000 /* 0x1p513. */ +@@ -65,47 +62,52 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n, + svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b)); + /* |n| > 1280 => 2^(n) overflows. */ - svbool_t p_cmp = svacgt (pg, n, d->uoflow_bound); +- svbool_t p_cmp = svacgt (pg, n, d->uoflow_bound); ++ svbool_t p_cmp = svacle (pg, n, d->uoflow_bound); - svfloat64_t r1 = svmul_x (pg, s1, s1); + svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1); @@ -2810,27 +5219,140 @@ - svfloat64_t r0 = svmul_x (pg, r2, s1); + svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1); - return svsel (p_cmp, r1, r0); +- return svsel (p_cmp, r1, r0); ++ return svsel (p_cmp, r0, r1); } -@@ -99,11 +98,14 @@ svfloat64_t SV_NAME_D1 (exp2) (svfloat64_t x, svbool_t pg) - svuint64_t top = svlsl_x (pg, ki, 52 - V_EXP_TABLE_BITS); - svfloat64_t scale = svreinterpret_f64 (svadd_x (pg, sbits, top)); -+ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1); + /* Fast vector implementation of exp2. +- Maximum measured error is 1.65 ulp. +- _ZGVsMxv_exp2(-0x1.4c264ab5b559bp-6) got 0x1.f8db0d4df721fp-1 +- want 0x1.f8db0d4df721dp-1. */ ++ Maximum measured error is 0.52 + 0.5 ulp. ++ _ZGVsMxv_exp2 (0x1.3b72ad5b701bfp-1) got 0x1.8861641b49e08p+0 ++ want 0x1.8861641b49e07p+0. */ + svfloat64_t SV_NAME_D1 (exp2) (svfloat64_t x, svbool_t pg) + { + const struct data *d = ptr_barrier (&data); +- svbool_t no_big_scale = svacle (pg, x, d->big_bound); +- svbool_t special = svnot_z (pg, no_big_scale); +- +- /* Reduce x to k/N + r, where k is integer and r in [-1/2N, 1/2N]. */ +- svfloat64_t shift = sv_f64 (d->shift); +- svfloat64_t kd = svadd_x (pg, x, shift); +- svuint64_t ki = svreinterpret_u64 (kd); +- /* kd = k/N. */ +- kd = svsub_x (pg, kd, shift); +- svfloat64_t r = svsub_x (pg, x, kd); +- +- /* scale ~= 2^(k/N). */ +- svuint64_t idx = svand_x (pg, ki, N - 1); +- svuint64_t sbits = svld1_gather_index (pg, __v_exp_data, idx); +- /* This is only a valid scale when -1023*N < k < 1024*N. 
*/ +- svuint64_t top = svlsl_x (pg, ki, 52 - V_EXP_TABLE_BITS); +- svfloat64_t scale = svreinterpret_f64 (svadd_x (pg, sbits, top)); ++ svbool_t special = svacge (pg, x, d->big_bound); ++ ++ svfloat64_t z = svadd_x (svptrue_b64 (), x, d->shift); ++ svfloat64_t n = svsub_x (svptrue_b64 (), z, d->shift); ++ svfloat64_t r = svsub_x (svptrue_b64 (), x, n); ++ ++ svfloat64_t scale = svexpa (svreinterpret_u64 (z)); ++ ++ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); ++ svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2); + /* Approximate exp2(r) using polynomial. */ - svfloat64_t r2 = svmul_x (pg, r, r); - svfloat64_t p = sv_pairwise_poly_3_f64_x (pg, r, r2, d->poly); - svfloat64_t y = svmul_x (pg, r, p); -- -+ /* y = exp2(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4. */ -+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); -+ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0); -+ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1); -+ svfloat64_t p = svmla_x (pg, p01, p23, r2); ++ /* y = exp2(r) - 1 ~= r * (C0 + C1 r + C2 r^2 + C3 r^3 + C4 r^4). */ ++ svfloat64_t p12 = svmla_lane (sv_f64 (d->c1), r, c24, 0); ++ svfloat64_t p34 = svmla_lane (sv_f64 (d->c3), r, c24, 1); ++ svfloat64_t p = svmla_x (pg, p12, p34, r2); ++ p = svmad_x (pg, p, r, d->c0); + svfloat64_t y = svmul_x (svptrue_b64 (), r, p); + /* Assemble exp2(x) = exp2(r) * scale. */ if (__glibc_unlikely (svptest_any (pg, special))) - return special_case (pg, scale, y, kd, d); +- return special_case (pg, scale, y, kd, d); ++ { ++ /* FEXPA zeroes the sign bit, however the sign is meaningful to the ++ special case function so needs to be copied. ++ e = sign bit of u << 46. */ ++ svuint64_t e = svand_x (pg, svlsl_x (pg, svreinterpret_u64 (z), 46), ++ 0x8000000000000000); ++ scale = svreinterpret_f64 (svadd_x (pg, e, svreinterpret_u64 (scale))); ++ return special_case (pg, scale, y, n, d); ++ } ++ + return svmla_x (pg, scale, scale, y); + } +diff --git a/sysdeps/aarch64/fpu/exp2f_sve.c b/sysdeps/aarch64/fpu/exp2f_sve.c +index fcd7830164..989cefb605 100644 +--- a/sysdeps/aarch64/fpu/exp2f_sve.c ++++ b/sysdeps/aarch64/fpu/exp2f_sve.c +@@ -18,21 +18,17 @@ + . */ + + #include "sv_math.h" +-#include "poly_sve_f32.h" + + #define Thres 0x1.5d5e2ap+6f + + static const struct data + { +- float c0, c2, c4, c1, c3; +- float shift, thres; ++ float c0, c1, shift, thres; + } data = { +- /* Coefficients copied from the polynomial in AdvSIMD variant. */ +- .c0 = 0x1.62e422p-1f, +- .c1 = 0x1.ebf9bcp-3f, +- .c2 = 0x1.c6bd32p-5f, +- .c3 = 0x1.3ce9e4p-7f, +- .c4 = 0x1.59977ap-10f, ++ /* Coefficients generated using Remez algorithm with minimisation of relative ++ error. */ ++ .c0 = 0x1.62e485p-1, ++ .c1 = 0x1.ebfbe0p-3, + /* 1.5*2^17 + 127. */ + .shift = 0x1.803f8p17f, + /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled +@@ -51,16 +47,8 @@ sv_exp2f_inline (svfloat32_t x, const svbool_t pg, const struct data *d) + + svfloat32_t scale = svexpa (svreinterpret_u32 (z)); + +- /* Polynomial evaluation: poly(r) ~ exp2(r)-1. +- Evaluate polynomial use hybrid scheme - offset ESTRIN by 1 for +- coefficients 1 to 4, and apply most significant coefficient directly. 
*/ +- svfloat32_t even_coeffs = svld1rq (svptrue_b32 (), &d->c0); +- svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); +- svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, even_coeffs, 1); +- svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, even_coeffs, 2); +- svfloat32_t p14 = svmla_x (pg, p12, r2, p34); +- svfloat32_t p0 = svmul_lane (r, even_coeffs, 0); +- svfloat32_t poly = svmla_x (pg, p0, r2, p14); ++ svfloat32_t poly = svmla_x (pg, sv_f32 (d->c0), r, sv_f32 (d->c1)); ++ poly = svmul_x (svptrue_b32 (), poly, r); + + return svmla_x (pg, scale, scale, poly); + } +@@ -72,11 +60,10 @@ special_case (svfloat32_t x, svbool_t special, const struct data *d) + special); + } + +-/* Single-precision SVE exp2f routine. Implements the same algorithm +- as AdvSIMD exp2f. +- Worst case error is 1.04 ULPs. +- _ZGVsMxv_exp2f(-0x1.af994ap-3) got 0x1.ba6a66p-1 +- want 0x1.ba6a64p-1. */ ++/* Single-precision SVE exp2f routine, based on the FEXPA instruction. ++ Worst case error is 1.09 ULPs. ++ _ZGVsMxv_exp2f (0x1.9a2a94p-1) got 0x1.be1054p+0 ++ want 0x1.be1052p+0. */ + svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg) + { + const struct data *d = ptr_barrier (&data); diff --git a/sysdeps/aarch64/fpu/exp_sve.c b/sysdeps/aarch64/fpu/exp_sve.c index 37de751f90..dc049482ed 100644 --- a/sysdeps/aarch64/fpu/exp_sve.c @@ -2913,6 +5435,377 @@ svfloat64_t p04 = svmla_x (pg, p01, p23, r2); svfloat64_t y = svmla_x (pg, r, p04, r2); +diff --git a/sysdeps/aarch64/fpu/expf_sve.c b/sysdeps/aarch64/fpu/expf_sve.c +index f9249db8b6..c3619975b3 100644 +--- a/sysdeps/aarch64/fpu/expf_sve.c ++++ b/sysdeps/aarch64/fpu/expf_sve.c +@@ -40,9 +40,9 @@ special_case (svfloat32_t x, svbool_t special, const struct sv_expf_data *d) + } + + /* Optimised single-precision SVE exp function. +- Worst-case error is 1.04 ulp: +- SV_NAME_F1 (exp)(0x1.a8eda4p+1) got 0x1.ba74bcp+4 +- want 0x1.ba74bap+4. */ ++ Worst-case error is 0.88 +0.50 ULP: ++ _ZGVsMxv_expf(-0x1.bba276p-6) got 0x1.f25288p-1 ++ want 0x1.f2528ap-1. */ + svfloat32_t SV_NAME_F1 (exp) (svfloat32_t x, const svbool_t pg) + { + const struct data *d = ptr_barrier (&data); +diff --git a/sysdeps/aarch64/fpu/expm1_sve.c b/sysdeps/aarch64/fpu/expm1_sve.c +index d4ba8ccf3b..b1d940bd20 100644 +--- a/sysdeps/aarch64/fpu/expm1_sve.c ++++ b/sysdeps/aarch64/fpu/expm1_sve.c +@@ -18,82 +18,164 @@ + . */ + + #include "sv_math.h" +-#include "poly_sve_f64.h" + +-#define SpecialBound 0x1.62b7d369a5aa9p+9 +-#define ExponentBias 0x3ff0000000000000 ++#define FexpaBound 0x1.4cb5ecef28adap-3 /* 15*ln2/64. */ ++#define SpecialBound 0x1.628c2855bfaddp+9 /* ln(2^(1023 + 1/128)). */ + + static const struct data + { +- double poly[11]; +- double shift, inv_ln2, special_bound; +- /* To be loaded in one quad-word. */ ++ double c2, c4; ++ double inv_ln2; + double ln2_hi, ln2_lo; ++ double c0, c1, c3; ++ double shift, thres; ++ uint64_t expm1_data[32]; + } data = { +- /* Generated using fpminimax. */ +- .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5, +- 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10, 0x1.a01a01affa35dp-13, +- 0x1.a01a018b4ecbbp-16, 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22, +- 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, }, +- +- .special_bound = SpecialBound, +- .inv_ln2 = 0x1.71547652b82fep0, +- .ln2_hi = 0x1.62e42fefa39efp-1, +- .ln2_lo = 0x1.abc9e3b39803fp-56, +- .shift = 0x1.8p52, ++ /* Table emulating FEXPA - 1, for values of FEXPA close to 1. ++ The table holds values of 2^(i/64) - 1, computed in arbitrary precision. 
++ The first half of the table stores values associated to i from 0 to 15. ++ The second half of the table stores values associated to i from 0 to -15. */ ++ .expm1_data = { ++ 0x0000000000000000, 0x3f864d1f3bc03077, 0x3f966c34c5615d0f, 0x3fa0e8a30eb37901, ++ 0x3fa6ab0d9f3121ec, 0x3fac7d865a7a3440, 0x3fb1301d0125b50a, 0x3fb429aaea92ddfb, ++ 0x3fb72b83c7d517ae, 0x3fba35beb6fcb754, 0x3fbd4873168b9aa8, 0x3fc031dc431466b2, ++ 0x3fc1c3d373ab11c3, 0x3fc35a2b2f13e6e9, 0x3fc4f4efa8fef709, 0x3fc6942d3720185a, ++ 0x0000000000000000, 0xbfc331751ec3a814, 0xbfc20224341286e4, 0xbfc0cf85bed0f8b7, ++ 0xbfbf332113d56b1f, 0xbfbcc0768d4175a6, 0xbfba46f918837cb7, 0xbfb7c695afc3b424, ++ 0xbfb53f391822dbc7, 0xbfb2b0cfe1266bd4, 0xbfb01b466423250a, 0xbfaafd11874c009e, ++ 0xbfa5b505d5b6f268, 0xbfa05e4119ea5d89, 0xbf95f134923757f3, 0xbf860f9f985bc9f4, ++ }, ++ ++ /* Generated using Remez, in [-log(2)/128, log(2)/128]. */ ++ .c0 = 0x1p-1, ++ .c1 = 0x1.55555555548f9p-3, ++ .c2 = 0x1.5555555554c22p-5, ++ .c3 = 0x1.111123aaa2fb2p-7, ++ .c4 = 0x1.6c16d77d98e5bp-10, ++ .ln2_hi = 0x1.62e42fefa3800p-1, ++ .ln2_lo = 0x1.ef35793c76730p-45, ++ .inv_ln2 = 0x1.71547652b82fep+0, ++ .shift = 0x1.800000000ffc0p+46, /* 1.5*2^46+1023. */ ++ .thres = SpecialBound, + }; + +-static svfloat64_t NOINLINE +-special_case (svfloat64_t x, svfloat64_t y, svbool_t pg) ++#define SpecialOffset 0x6000000000000000 /* 0x1p513. */ ++/* SpecialBias1 + SpecialBias1 = asuint(1.0). */ ++#define SpecialBias1 0x7000000000000000 /* 0x1p769. */ ++#define SpecialBias2 0x3010000000000000 /* 0x1p-254. */ ++ ++static NOINLINE svfloat64_t ++special_case (svbool_t pg, svfloat64_t y, svfloat64_t s, svfloat64_t p, ++ svfloat64_t n) + { +- return sv_call_f64 (expm1, x, y, pg); ++ /* s=2^n may overflow, break it up into s=s1*s2, ++ such that exp = s + s*y can be computed as s1*(s2+s2*y) ++ and s1*s1 overflows only if n>0. */ ++ ++ /* If n<=0 then set b to 0x6, 0 otherwise. */ ++ svbool_t p_sign = svcmple (pg, n, 0.0); /* n <= 0. */ ++ svuint64_t b ++ = svdup_u64_z (p_sign, SpecialOffset); /* Inactive lanes set to 0. */ ++ ++ /* Set s1 to generate overflow depending on sign of exponent n, ++ ie. s1 = 0x70...0 - b. */ ++ svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1)); ++ /* Offset s to avoid overflow in final result if n is below threshold. ++ ie. s2 = as_u64 (s) - 0x3010...0 + b. */ ++ svfloat64_t s2 = svreinterpret_f64 ( ++ svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b)); ++ ++ /* |n| > 1280 => 2^(n) overflows. */ ++ svbool_t p_cmp = svacgt (pg, n, 1280.0); ++ ++ svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1); ++ svfloat64_t r2 = svmla_x (pg, s2, s2, p); ++ svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1); ++ ++ svbool_t is_safe = svacle (pg, n, 1023); /* Only correct special lanes. */ ++ return svsel (is_safe, y, svsub_x (pg, svsel (p_cmp, r1, r0), 1.0)); + } + +-/* Double-precision vector exp(x) - 1 function. +- The maximum error observed error is 2.18 ULP: +- _ZGVsMxv_expm1(0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2 +- want 0x1.a8b9ea8d66e2p-2. */ ++/* FEXPA based SVE expm1 algorithm. ++ Maximum measured error is 2.81 + 0.5 ULP: ++ _ZGVsMxv_expm1 (0x1.974060e619bfp-3) got 0x1.c290e5858bb53p-3 ++ want 0x1.c290e5858bb5p-3. */ + svfloat64_t SV_NAME_D1 (expm1) (svfloat64_t x, svbool_t pg) + { + const struct data *d = ptr_barrier (&data); + +- /* Large, Nan/Inf. 
*/ +- svbool_t special = svnot_z (pg, svaclt (pg, x, d->special_bound)); +- +- /* Reduce argument to smaller range: +- Let i = round(x / ln2) +- and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. +- exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 +- where 2^i is exact because i is an integer. */ +- svfloat64_t shift = sv_f64 (d->shift); +- svfloat64_t n = svsub_x (pg, svmla_x (pg, shift, x, d->inv_ln2), shift); +- svint64_t i = svcvt_s64_x (pg, n); +- svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi); +- svfloat64_t f = svmls_lane (x, n, ln2, 0); +- f = svmls_lane (f, n, ln2, 1); +- +- /* Approximate expm1(f) using polynomial. +- Taylor expansion for expm1(x) has the form: +- x + ax^2 + bx^3 + cx^4 .... +- So we calculate the polynomial P(f) = a + bf + cf^2 + ... +- and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ +- svfloat64_t f2 = svmul_x (pg, f, f); +- svfloat64_t f4 = svmul_x (pg, f2, f2); +- svfloat64_t f8 = svmul_x (pg, f4, f4); +- svfloat64_t p +- = svmla_x (pg, f, f2, sv_estrin_10_f64_x (pg, f, f2, f4, f8, d->poly)); +- +- /* Assemble the result. +- expm1(x) ~= 2^i * (p + 1) - 1 +- Let t = 2^i. */ +- svint64_t u = svadd_x (pg, svlsl_x (pg, i, 52), ExponentBias); +- svfloat64_t t = svreinterpret_f64 (u); +- +- /* expm1(x) ~= p * t + (t - 1). */ +- svfloat64_t y = svmla_x (pg, svsub_x (pg, t, 1), p, t); ++ svbool_t special = svacgt (pg, x, d->thres); + +- if (__glibc_unlikely (svptest_any (pg, special))) +- return special_case (x, y, special); ++ svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2); ++ svuint64_t u = svreinterpret_u64 (z); ++ svfloat64_t n = svsub_x (pg, z, d->shift); + ++ /* r = x - n * ln2, r is in [-ln2/128, ln2/128]. */ ++ svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi); ++ svfloat64_t r = x; ++ r = svmls_lane (r, n, ln2, 0); ++ r = svmls_lane (r, n, ln2, 1); ++ ++ /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */ ++ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); ++ svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2); ++ ++ svfloat64_t p; ++ svfloat64_t c12 = svmla_lane (sv_f64 (d->c1), r, c24, 0); ++ svfloat64_t c34 = svmla_lane (sv_f64 (d->c3), r, c24, 1); ++ p = svmad_x (pg, c34, r2, c12); ++ p = svmad_x (pg, p, r, sv_f64 (d->c0)); ++ p = svmad_x (pg, p, r2, r); ++ ++ svfloat64_t scale = svexpa (u); ++ svfloat64_t scalem1 = svsub_x (pg, scale, sv_f64 (1.0)); ++ ++ /* We want to construct expm1(x) = (scale - 1) + scale * poly. ++ However, for values of scale close to 1, scale-1 causes large ULP errors ++ due to cancellation. ++ ++ This can be circumvented by using a small lookup for scale-1 ++ when our input is below a certain bound, otherwise we can use FEXPA. ++ ++ This bound is based upon the table size: ++ Bound = (TableSize-1/64) * ln2. ++ The current bound is based upon a table size of 16. */ ++ svbool_t is_small = svaclt (pg, x, FexpaBound); ++ ++ if (svptest_any (pg, is_small)) ++ { ++ /* Index via the input of FEXPA, but we only care about the lower 4 bits. ++ */ ++ svuint64_t base_idx = svand_x (pg, u, 0xf); ++ ++ /* We can use the sign of x as a fifth bit to account for the asymmetry ++ of e^x around 0. */ ++ svuint64_t signBit ++ = svlsl_x (pg, svlsr_x (pg, svreinterpret_u64 (x), 63), 4); ++ svuint64_t idx = svorr_x (pg, base_idx, signBit); ++ ++ /* Lookup values for scale - 1 for small x. */ ++ svfloat64_t lookup = svreinterpret_f64 ( ++ svld1_gather_index (is_small, d->expm1_data, idx)); ++ ++ /* Select the appropriate scale - 1 value based on x. 
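The expm1_data table above stores 2^(i/64) - 1 directly because forming scale - 1 from FEXPA's already-rounded scale leaves too few significant bits once scale is close to 1. Roughly how the 32 entries and the FexpaBound cut-over relate (the real table is generated in arbitrary precision, so the low bits of this double-precision recomputation may differ):

    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      const double ln2 = 0x1.62e42fefa39efp-1;
      for (int i = 0; i < 16; i++)          /* first half: i = 0 .. 15 */
        printf ("%a\n", exp2 (i / 64.0) - 1.0);
      for (int i = 0; i > -16; i--)         /* second half: i = 0 .. -15 */
        printf ("%a\n", exp2 (i / 64.0) - 1.0);
      printf ("FexpaBound ~ %a\n", 15 * ln2 / 64);  /* where the 16-entry half runs out */
      return 0;
    }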
*/ ++ scalem1 = svsel (is_small, lookup, scalem1); ++ } ++ ++ svfloat64_t y = svmla_x (pg, scalem1, scale, p); ++ ++ /* FEXPA returns nan for large inputs so we special case those. */ ++ if (__glibc_unlikely (svptest_any (pg, special))) ++ { ++ /* FEXPA zeroes the sign bit, however the sign is meaningful to the ++ special case function so needs to be copied. ++ e = sign bit of u << 46. */ ++ svuint64_t e = svand_x (pg, svlsl_x (pg, u, 46), 0x8000000000000000); ++ /* Copy sign to s. */ ++ scale = svreinterpret_f64 (svadd_x (pg, e, svreinterpret_u64 (scale))); ++ return special_case (pg, y, scale, p, n); ++ } ++ ++ /* return expm1 = (scale - 1) + (scale * poly). */ + return y; + } +diff --git a/sysdeps/aarch64/fpu/log1p_sve.c b/sysdeps/aarch64/fpu/log1p_sve.c +index 862c13f811..821c0780a9 100644 +--- a/sysdeps/aarch64/fpu/log1p_sve.c ++++ b/sysdeps/aarch64/fpu/log1p_sve.c +@@ -22,19 +22,33 @@ + + static const struct data + { +- double poly[19]; ++ float64_t c0, c2, c4, c6, c8, c10, c12, c14, c16; ++ float64_t c1, c3, c5, c7, c9, c11, c13, c15, c17, c18; + double ln2_hi, ln2_lo; + uint64_t hfrt2_top, onemhfrt2_top, inf, mone; + } data = { + /* Generated using Remez in [ sqrt(2)/2 - 1, sqrt(2) - 1]. Order 20 +- polynomial, however first 2 coefficients are 0 and 1 so are not stored. */ +- .poly = { -0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2, +- 0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3, +- -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4, +- 0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4, +- -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5, +- 0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4, +- -0x1.cfa7385bdb37ep-6, }, ++ polynomial, however first 2 coefficients are 0 and 1 so are not ++ stored. */ ++ .c0 = -0x1.ffffffffffffbp-2, ++ .c1 = 0x1.55555555551a9p-2, ++ .c2 = -0x1.00000000008e3p-2, ++ .c3 = 0x1.9999999a32797p-3, ++ .c4 = -0x1.555555552fecfp-3, ++ .c5 = 0x1.249248e071e5ap-3, ++ .c6 = -0x1.ffffff8bf8482p-4, ++ .c7 = 0x1.c71c8f07da57ap-4, ++ .c8 = -0x1.9999ca4ccb617p-4, ++ .c9 = 0x1.7459ad2e1dfa3p-4, ++ .c10 = -0x1.554d2680a3ff2p-4, ++ .c11 = 0x1.3b4c54d487455p-4, ++ .c12 = -0x1.2548a9ffe80e6p-4, ++ .c13 = 0x1.0f389a24b2e07p-4, ++ .c14 = -0x1.eee4db15db335p-5, ++ .c15 = 0x1.e95b494d4a5ddp-5, ++ .c16 = -0x1.15fdf07cb7c73p-4, ++ .c17 = 0x1.0310b70800fcfp-4, ++ .c18 = -0x1.cfa7385bdb37ep-6, + .ln2_hi = 0x1.62e42fefa3800p-1, + .ln2_lo = 0x1.ef35793c76730p-45, + /* top32(asuint64(sqrt(2)/2)) << 32. */ +@@ -49,7 +63,7 @@ static const struct data + #define BottomMask 0xffffffff + + static svfloat64_t NOINLINE +-special_case (svbool_t special, svfloat64_t x, svfloat64_t y) ++special_case (svfloat64_t x, svfloat64_t y, svbool_t special) + { + return sv_call_f64 (log1p, x, y, special); + } +@@ -91,8 +105,9 @@ svfloat64_t SV_NAME_D1 (log1p) (svfloat64_t x, svbool_t pg) + /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */ + svuint64_t utop + = svadd_x (pg, svand_x (pg, u, 0x000fffff00000000), d->hfrt2_top); +- svuint64_t u_red = svorr_x (pg, utop, svand_x (pg, mi, BottomMask)); +- svfloat64_t f = svsub_x (pg, svreinterpret_f64 (u_red), 1); ++ svuint64_t u_red ++ = svorr_x (pg, utop, svand_x (svptrue_b64 (), mi, BottomMask)); ++ svfloat64_t f = svsub_x (svptrue_b64 (), svreinterpret_f64 (u_red), 1); + + /* Correction term c/m. 
*/ + svfloat64_t cm = svdiv_x (pg, svsub_x (pg, x, svsub_x (pg, m, 1)), m); +@@ -103,18 +118,49 @@ svfloat64_t SV_NAME_D1 (log1p) (svfloat64_t x, svbool_t pg) + Hence approximation has the form f + f^2 * P(f) + where P(x) = C0 + C1*x + C2x^2 + ... + Assembling this all correctly is dealt with at the final step. */ +- svfloat64_t f2 = svmul_x (pg, f, f), f4 = svmul_x (pg, f2, f2), +- f8 = svmul_x (pg, f4, f4), f16 = svmul_x (pg, f8, f8); +- svfloat64_t p = sv_estrin_18_f64_x (pg, f, f2, f4, f8, f16, d->poly); ++ svfloat64_t f2 = svmul_x (svptrue_b64 (), f, f), ++ f4 = svmul_x (svptrue_b64 (), f2, f2), ++ f8 = svmul_x (svptrue_b64 (), f4, f4), ++ f16 = svmul_x (svptrue_b64 (), f8, f8); ++ ++ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1); ++ svfloat64_t c57 = svld1rq (svptrue_b64 (), &d->c5); ++ svfloat64_t c911 = svld1rq (svptrue_b64 (), &d->c9); ++ svfloat64_t c1315 = svld1rq (svptrue_b64 (), &d->c13); ++ svfloat64_t c1718 = svld1rq (svptrue_b64 (), &d->c17); ++ ++ /* Order-18 Estrin scheme. */ ++ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), f, c13, 0); ++ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), f, c13, 1); ++ svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), f, c57, 0); ++ svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), f, c57, 1); ++ ++ svfloat64_t p03 = svmla_x (pg, p01, f2, p23); ++ svfloat64_t p47 = svmla_x (pg, p45, f2, p67); ++ svfloat64_t p07 = svmla_x (pg, p03, f4, p47); ++ ++ svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), f, c911, 0); ++ svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), f, c911, 1); ++ svfloat64_t p1213 = svmla_lane (sv_f64 (d->c12), f, c1315, 0); ++ svfloat64_t p1415 = svmla_lane (sv_f64 (d->c14), f, c1315, 1); ++ ++ svfloat64_t p811 = svmla_x (pg, p89, f2, p1011); ++ svfloat64_t p1215 = svmla_x (pg, p1213, f2, p1415); ++ svfloat64_t p815 = svmla_x (pg, p811, f4, p1215); ++ ++ svfloat64_t p015 = svmla_x (pg, p07, f8, p815); ++ svfloat64_t p1617 = svmla_lane (sv_f64 (d->c16), f, c1718, 0); ++ svfloat64_t p1618 = svmla_lane (p1617, f2, c1718, 1); ++ svfloat64_t p = svmla_x (pg, p015, f16, p1618); + + svfloat64_t ylo = svmla_x (pg, cm, k, d->ln2_lo); + svfloat64_t yhi = svmla_x (pg, f, k, d->ln2_hi); +- svfloat64_t y = svmla_x (pg, svadd_x (pg, ylo, yhi), f2, p); + + if (__glibc_unlikely (svptest_any (pg, special))) +- return special_case (special, x, y); +- +- return y; ++ return special_case ( ++ x, svmla_x (svptrue_b64 (), svadd_x (svptrue_b64 (), ylo, yhi), f2, p), ++ special); ++ return svmla_x (svptrue_b64 (), svadd_x (svptrue_b64 (), ylo, yhi), f2, p); + } + + strong_alias (SV_NAME_D1 (log1p), SV_NAME_D1 (logp1)) diff --git a/sysdeps/aarch64/fpu/pow_sve.c b/sysdeps/aarch64/fpu/pow_sve.c index 42d551ca92..b8c1b39dca 100644 --- a/sysdeps/aarch64/fpu/pow_sve.c @@ -3533,25 +6426,601 @@ return ret; } +diff --git a/sysdeps/aarch64/fpu/sinh_sve.c b/sysdeps/aarch64/fpu/sinh_sve.c +index 963453f812..072ba8fca9 100644 +--- a/sysdeps/aarch64/fpu/sinh_sve.c ++++ b/sysdeps/aarch64/fpu/sinh_sve.c +@@ -18,90 +18,153 @@ + . */ + + #include "sv_math.h" +-#include "poly_sve_f64.h" + + static const struct data + { +- float64_t poly[11]; +- float64_t inv_ln2, m_ln2_hi, m_ln2_lo, shift; + uint64_t halff; +- int64_t onef; +- uint64_t large_bound; ++ double c2, c4; ++ double inv_ln2; ++ double ln2_hi, ln2_lo; ++ double c0, c1, c3; ++ double shift, special_bound, bound; ++ uint64_t expm1_data[20]; + } data = { +- /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. 
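The order-18 evaluation above replaces the generic sv_estrin_18_f64_x helper with an explicit Estrin tree so coefficient pairs can be fed to svmla_lane from quad-word loads. In scalar form, with the coefficients in a plain array c[0..18], the same tree is:

    static double
    estrin18 (double f, const double c[19])
    {
      double f2 = f * f, f4 = f2 * f2, f8 = f4 * f4, f16 = f8 * f8;
      double p01 = c[0] + c[1] * f, p23 = c[2] + c[3] * f;
      double p45 = c[4] + c[5] * f, p67 = c[6] + c[7] * f;
      double p03 = p01 + f2 * p23, p47 = p45 + f2 * p67;
      double p07 = p03 + f4 * p47;
      double p89 = c[8] + c[9] * f, p1011 = c[10] + c[11] * f;
      double p1213 = c[12] + c[13] * f, p1415 = c[14] + c[15] * f;
      double p811 = p89 + f2 * p1011, p1215 = p1213 + f2 * p1415;
      double p815 = p811 + f4 * p1215;
      double p015 = p07 + f8 * p815;
      double p1618 = (c[16] + c[17] * f) + f2 * c[18];
      return p015 + f16 * p1618;   /* log(1 + f) ~= f + f^2 * (this value) */
    }

Pairing by f^2, f^4, f^8 and f^16 keeps the dependency chains short, which suits the vector FMA pipelines; up to rounding it evaluates the same polynomial as a Horner scheme.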
*/ +- .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5, +- 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10, +- 0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16, +- 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22, +- 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, }, +- +- .inv_ln2 = 0x1.71547652b82fep0, +- .m_ln2_hi = -0x1.62e42fefa39efp-1, +- .m_ln2_lo = -0x1.abc9e3b39803fp-56, +- .shift = 0x1.8p52, +- ++ /* Table lookup of 2^(i/64) - 1, for values of i from 0..19. */ ++ .expm1_data = { ++ 0x0000000000000000, 0x3f864d1f3bc03077, 0x3f966c34c5615d0f, 0x3fa0e8a30eb37901, ++ 0x3fa6ab0d9f3121ec, 0x3fac7d865a7a3440, 0x3fb1301d0125b50a, 0x3fb429aaea92ddfb, ++ 0x3fb72b83c7d517ae, 0x3fba35beb6fcb754, 0x3fbd4873168b9aa8, 0x3fc031dc431466b2, ++ 0x3fc1c3d373ab11c3, 0x3fc35a2b2f13e6e9, 0x3fc4f4efa8fef709, 0x3fc6942d3720185a, ++ 0x3fc837f0518db8a9, 0x3fc9e0459320b7fa, 0x3fcb8d39b9d54e55, 0x3fcd3ed9a72cffb7, ++ }, ++ ++ /* Generated using Remez, in [-log(2)/128, log(2)/128]. */ ++ .c0 = 0x1p-1, ++ .c1 = 0x1.55555555548f9p-3, ++ .c2 = 0x1.5555555554c22p-5, ++ .c3 = 0x1.111123aaa2fb2p-7, ++ .c4 = 0x1.6c16d77d98e5bp-10, ++ .ln2_hi = 0x1.62e42fefa3800p-1, ++ .ln2_lo = 0x1.ef35793c76730p-45, ++ .inv_ln2 = 0x1.71547652b82fep+0, ++ .shift = 0x1.800000000ffc0p+46, /* 1.5*2^46+1023. */ + .halff = 0x3fe0000000000000, +- .onef = 0x3ff0000000000000, +- /* 2^9. expm1 helper overflows for large input. */ +- .large_bound = 0x4080000000000000, ++ .special_bound = 0x1.62e37e7d8ba72p+9, /* ln(2^(1024 - 1/128)). */ ++ .bound = 0x1.a56ef8ec924ccp-3 /* 19*ln2/64. */ + }; + ++/* A specialised FEXPA expm1 that is only valid for positive inputs and ++ has no special cases. Based off the full FEXPA expm1 implementated for ++ _ZGVsMxv_expm1, with a slightly modified file to keep sinh under 3.5ULP. */ + static inline svfloat64_t +-expm1_inline (svfloat64_t x, svbool_t pg) ++expm1_inline (svbool_t pg, svfloat64_t x) + { + const struct data *d = ptr_barrier (&data); + +- /* Reduce argument: +- exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 +- where i = round(x / ln2) +- and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */ +- svfloat64_t j +- = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift); +- svint64_t i = svcvt_s64_x (pg, j); +- svfloat64_t f = svmla_x (pg, x, j, d->m_ln2_hi); +- f = svmla_x (pg, f, j, d->m_ln2_lo); +- /* Approximate expm1(f) using polynomial. */ +- svfloat64_t f2 = svmul_x (pg, f, f); +- svfloat64_t f4 = svmul_x (pg, f2, f2); +- svfloat64_t f8 = svmul_x (pg, f4, f4); +- svfloat64_t p +- = svmla_x (pg, f, f2, sv_estrin_10_f64_x (pg, f, f2, f4, f8, d->poly)); +- /* t = 2^i. */ +- svfloat64_t t = svscale_x (pg, sv_f64 (1), i); +- /* expm1(x) ~= p * t + (t - 1). */ +- return svmla_x (pg, svsub_x (pg, t, 1.0), p, t); ++ svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2); ++ svuint64_t u = svreinterpret_u64 (z); ++ svfloat64_t n = svsub_x (pg, z, d->shift); ++ ++ svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi); ++ svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2); ++ ++ svfloat64_t r = x; ++ r = svmls_lane (r, n, ln2, 0); ++ r = svmls_lane (r, n, ln2, 1); ++ ++ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); ++ ++ svfloat64_t p; ++ svfloat64_t c12 = svmla_lane (sv_f64 (d->c1), r, c24, 0); ++ svfloat64_t c34 = svmla_lane (sv_f64 (d->c3), r, c24, 1); ++ p = svmad_x (pg, c34, r2, c12); ++ p = svmad_x (pg, p, r, sv_f64 (d->c0)); ++ p = svmad_x (pg, p, r2, r); ++ ++ svfloat64_t scale = svexpa (u); ++ ++ /* We want to construct expm1(x) = (scale - 1) + scale * poly. 
++ However, for values of scale close to 1, scale-1 causes large ULP errors ++ due to cancellation. ++ ++ This can be circumvented by using a small lookup for scale-1 ++ when our input is below a certain bound, otherwise we can use FEXPA. */ ++ svbool_t is_small = svaclt (pg, x, d->bound); ++ ++ /* Index via the input of FEXPA, but we only care about the lower 5 bits. */ ++ svuint64_t base_idx = svand_x (pg, u, 0x1f); ++ ++ /* Compute scale - 1 from FEXPA, and lookup values where this fails. */ ++ svfloat64_t scalem1_estimate = svsub_x (pg, scale, sv_f64 (1.0)); ++ svuint64_t scalem1_lookup ++ = svld1_gather_index (is_small, d->expm1_data, base_idx); ++ ++ /* Select the appropriate scale - 1 value based on x. */ ++ svfloat64_t scalem1 ++ = svsel (is_small, svreinterpret_f64 (scalem1_lookup), scalem1_estimate); ++ ++ /* return expm1 = scale - 1 + (scale * poly). */ ++ return svmla_x (pg, scalem1, scale, p); + } + ++/* Vectorised special case to handle values past where exp_inline overflows. ++ Halves the input value and uses the identity exp(x) = exp(x/2)^2 to double ++ the valid range of inputs, and returns inf for anything past that. */ + static svfloat64_t NOINLINE +-special_case (svfloat64_t x, svbool_t pg) ++special_case (svbool_t pg, svbool_t special, svfloat64_t ax, ++ svfloat64_t halfsign, const struct data *d) + { +- return sv_call_f64 (sinh, x, x, pg); ++ /* Halves input value, and then check if any cases ++ are still going to overflow. */ ++ ax = svmul_x (special, ax, 0.5); ++ svbool_t is_safe = svaclt (special, ax, d->special_bound); ++ ++ svfloat64_t t = expm1_inline (pg, ax); ++ ++ /* Finish fastpass to compute values for non-special cases. */ ++ svfloat64_t y = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0))); ++ y = svmul_x (pg, y, halfsign); ++ ++ /* Computes special lane, and set remaining overflow lanes to inf. */ ++ svfloat64_t half_special_y = svmul_x (svptrue_b64 (), t, halfsign); ++ svfloat64_t special_y = svmul_x (svptrue_b64 (), half_special_y, t); ++ ++ svuint64_t signed_inf ++ = svorr_x (svptrue_b64 (), svreinterpret_u64 (halfsign), ++ sv_u64 (0x7ff0000000000000)); ++ special_y = svsel (is_safe, special_y, svreinterpret_f64 (signed_inf)); ++ ++ /* Join resulting vectors together and return. */ ++ return svsel (special, special_y, y); + } + +-/* Approximation for SVE double-precision sinh(x) using expm1. +- sinh(x) = (exp(x) - exp(-x)) / 2. +- The greatest observed error is 2.57 ULP: +- _ZGVsMxv_sinh (0x1.a008538399931p-2) got 0x1.ab929fc64bd66p-2 +- want 0x1.ab929fc64bd63p-2. */ ++/* Approximation for SVE double-precision sinh(x) using FEXPA expm1. ++ Uses sinh(x) = e^2x - 1 / 2e^x, rewritten for accuracy. ++ The greatest observed error in the non-special region is 2.63 + 0.5 ULP: ++ _ZGVsMxv_sinh (0x1.b5e0e13ba88aep-2) got 0x1.c3587faf97b0cp-2 ++ want 0x1.c3587faf97b09p-2 ++ ++ The greatest observed error in the special region is 2.65 + 0.5 ULP: ++ _ZGVsMxv_sinh (0x1.633ce847dab1ap+9) got 0x1.fffd30eea0066p+1023 ++ want 0x1.fffd30eea0063p+1023. */ + svfloat64_t SV_NAME_D1 (sinh) (svfloat64_t x, svbool_t pg) + { + const struct data *d = ptr_barrier (&data); + ++ svbool_t special = svacge (pg, x, d->special_bound); + svfloat64_t ax = svabs_x (pg, x); + svuint64_t sign + = sveor_x (pg, svreinterpret_u64 (x), svreinterpret_u64 (ax)); + svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, d->halff)); + +- svbool_t special = svcmpge (pg, svreinterpret_u64 (ax), d->large_bound); +- + /* Fall back to scalar variant for all lanes if any are special. 
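Two identities carry the new sinh: on the fast path, sinh(x) = sign(x)/2 * (t + t/(t+1)) with t = e^|x| - 1, and on the special path e^|x| = (e^(|x|/2))^2 doubles the range over which the expm1 helper stays finite. A scalar sketch of both, with libm's expm1 standing in for the SVE helper (the function name and the use of double-precision libm are illustrative):

    #include <math.h>

    static double
    sinh_sketch (double x)
    {
      double halfsign = copysign (0.5, x);
      double ax = fabs (x);

      if (ax >= 0x1.62e37e7d8ba72p+9)    /* special_bound: expm1 (ax) would overflow */
        {
          /* e^ax = (e^(ax/2))^2: halve the argument and square.  Folding in
             halfsign before the second multiply keeps the intermediate
             representable right up to where sinh itself overflows.  */
          double t = expm1 (ax * 0.5);
          return (t * halfsign) * t;
        }

      /* With t = e^ax - 1:  (e^ax - e^-ax)/2 = ((t+1) - 1/(t+1))/2,
         which rearranges to (t + t/(t+1)) * 0.5 and stays accurate for
         tiny inputs because t -> ax instead of e^ax -> 1.  */
      double t = expm1 (ax);
      return (t + t / (t + 1.0)) * halfsign;
    }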
*/ + if (__glibc_unlikely (svptest_any (pg, special))) +- return special_case (x, pg); ++ return special_case (pg, special, ax, halfsign, d); + + /* Up to the point that expm1 overflows, we can use it to calculate sinh + using a slight rearrangement of the definition of sinh. This allows us to + retain acceptable accuracy for very small inputs. */ +- svfloat64_t t = expm1_inline (ax, pg); ++ svfloat64_t t = expm1_inline (pg, ax); + t = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0))); + return svmul_x (pg, t, halfsign); + } diff --git a/sysdeps/aarch64/fpu/sv_expf_inline.h b/sysdeps/aarch64/fpu/sv_expf_inline.h -index f208d33896..16b81fc738 100644 +index f208d33896..e2d2e906bd 100644 --- a/sysdeps/aarch64/fpu/sv_expf_inline.h +++ b/sysdeps/aarch64/fpu/sv_expf_inline.h -@@ -61,7 +61,7 @@ expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d) +@@ -24,52 +24,41 @@ + + struct sv_expf_data + { +- float c1, c3, inv_ln2; +- float ln2_lo, c0, c2, c4; +- float ln2_hi, shift; ++ float ln2_hi, ln2_lo, c1, null; ++ float inv_ln2, shift; + }; + +-/* Coefficients copied from the polynomial in AdvSIMD variant, reversed for +- compatibility with polynomial helpers. Shift is 1.5*2^17 + 127. */ ++/* Shift is 1.5*2^17 + 127. */ + #define SV_EXPF_DATA \ + { \ +- /* Coefficients copied from the polynomial in AdvSIMD variant. */ \ +- .c0 = 0x1.ffffecp-1f, .c1 = 0x1.fffdb6p-2f, .c2 = 0x1.555e66p-3f, \ +- .c3 = 0x1.573e2ep-5f, .c4 = 0x1.0e4020p-7f, .inv_ln2 = 0x1.715476p+0f, \ +- .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \ +- .shift = 0x1.803f8p17f, \ ++ .c1 = 0.5f, .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \ ++ .ln2_lo = 0x1.7f7d1cp-20f, .shift = 0x1.803f8p17f, \ + } + +-#define C(i) sv_f32 (d->poly[i]) +- + static inline svfloat32_t + expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d) + { + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ + +- svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->ln2_lo); ++ svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->ln2_hi); + + /* n = round(x/(ln2/N)). */ + svfloat32_t z = svmad_x (pg, sv_f32 (d->inv_ln2), x, d->shift); + svfloat32_t n = svsub_x (pg, z, d->shift); + + /* r = x - n*ln2/N. */ +- svfloat32_t r = svmsb_x (pg, sv_f32 (d->ln2_hi), n, x); ++ svfloat32_t r = x; + r = svmls_lane (r, n, lane_consts, 0); ++ r = svmls_lane (r, n, lane_consts, 1); + /* scale = 2^(n/N). */ svfloat32_t scale = svexpa (svreinterpret_u32 (z)); - /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */ -+ /* poly(r) = exp(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4 + C4 r^5. */ - svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2); - svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3); +- svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2); +- svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3); ++ /* poly(r) = exp(r) - 1 ~= r + 0.5 r^2. 
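With FEXPA absorbing the 2^(i/64) factor, the reduced argument in expf_inline satisfies |r| <= ln2/128, which is why the polynomial can stop at r + r^2/2: the first neglected term is already only a fraction of a float ulp at 1.0. A quick check of that bound (the patch quotes the measured worst-case ULP error; this is only the motivating estimate):

    #include <float.h>
    #include <stdio.h>

    int
    main (void)
    {
      double ln2 = 0x1.62e42fefa39efp-1;
      double rmax = ln2 / 128.0;                /* half-width of the reduction interval */
      double cubic = rmax * rmax * rmax / 6.0;  /* first neglected Taylor term, r^3/6 */
      printf ("max |r| = %a\n", rmax);
      printf ("r^3/6   = %a  (float ulp at 1.0 is %a)\n", cubic, (double) FLT_EPSILON);
      return 0;
    }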
*/ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); -@@ -71,5 +71,4 @@ expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d) +- svfloat32_t p14 = svmla_x (pg, p12, p34, r2); +- svfloat32_t p0 = svmul_lane (r, lane_consts, 1); +- svfloat32_t poly = svmla_x (pg, p0, r2, p14); ++ svfloat32_t poly = svmla_lane (r, r2, lane_consts, 2); return svmla_x (pg, scale, scale, poly); } - #endif +diff --git a/sysdeps/aarch64/fpu/sv_log1p_inline.h b/sysdeps/aarch64/fpu/sv_log1p_inline.h +index 71f88e02de..c2b196f35b 100644 +--- a/sysdeps/aarch64/fpu/sv_log1p_inline.h ++++ b/sysdeps/aarch64/fpu/sv_log1p_inline.h +@@ -21,11 +21,12 @@ + #define AARCH64_FPU_SV_LOG1P_INLINE_H + + #include "sv_math.h" +-#include "poly_sve_f64.h" + + static const struct sv_log1p_data + { +- double poly[19], ln2[2]; ++ double c0, c2, c4, c6, c8, c10, c12, c14, c16; ++ double c1, c3, c5, c7, c9, c11, c13, c15, c17, c18; ++ double ln2_lo, ln2_hi; + uint64_t hf_rt2_top; + uint64_t one_m_hf_rt2_top; + uint32_t bottom_mask; +@@ -33,15 +34,30 @@ static const struct sv_log1p_data + } sv_log1p_data = { + /* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. + */ +- .poly = { -0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2, +- 0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3, +- -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4, +- 0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4, +- -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5, +- 0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4, +- -0x1.cfa7385bdb37ep-6 }, +- .ln2 = { 0x1.62e42fefa3800p-1, 0x1.ef35793c76730p-45 }, ++ .c0 = -0x1.ffffffffffffbp-2, ++ .c1 = 0x1.55555555551a9p-2, ++ .c2 = -0x1.00000000008e3p-2, ++ .c3 = 0x1.9999999a32797p-3, ++ .c4 = -0x1.555555552fecfp-3, ++ .c5 = 0x1.249248e071e5ap-3, ++ .c6 = -0x1.ffffff8bf8482p-4, ++ .c7 = 0x1.c71c8f07da57ap-4, ++ .c8 = -0x1.9999ca4ccb617p-4, ++ .c9 = 0x1.7459ad2e1dfa3p-4, ++ .c10 = -0x1.554d2680a3ff2p-4, ++ .c11 = 0x1.3b4c54d487455p-4, ++ .c12 = -0x1.2548a9ffe80e6p-4, ++ .c13 = 0x1.0f389a24b2e07p-4, ++ .c14 = -0x1.eee4db15db335p-5, ++ .c15 = 0x1.e95b494d4a5ddp-5, ++ .c16 = -0x1.15fdf07cb7c73p-4, ++ .c17 = 0x1.0310b70800fcfp-4, ++ .c18 = -0x1.cfa7385bdb37ep-6, ++ .ln2_lo = 0x1.62e42fefa3800p-1, ++ .ln2_hi = 0x1.ef35793c76730p-45, ++ /* top32(asuint64(sqrt(2)/2)) << 32. */ + .hf_rt2_top = 0x3fe6a09e00000000, ++ /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) << 32. */ + .one_m_hf_rt2_top = 0x00095f6200000000, + .bottom_mask = 0xffffffff, + .one_top = 0x3ff +@@ -51,14 +67,14 @@ static inline svfloat64_t + sv_log1p_inline (svfloat64_t x, const svbool_t pg) + { + /* Helper for calculating log(x + 1). Adapted from v_log1p_inline.h, which +- differs from v_log1p_2u5.c by: ++ differs from advsimd/log1p.c by: + - No special-case handling - this should be dealt with by the caller. + - Pairwise Horner polynomial evaluation for improved accuracy. + - Optionally simulate the shortcut for k=0, used in the scalar routine, + using svsel, for improved accuracy when the argument to log1p is close + to 0. This feature is enabled by defining WANT_SV_LOG1P_K0_SHORTCUT as 1 + in the source of the caller before including this file. +- See sv_log1p_2u1.c for details of the algorithm. */ ++ See sve/log1p.c for details of the algorithm. 
*/ + const struct sv_log1p_data *d = ptr_barrier (&sv_log1p_data); + svfloat64_t m = svadd_x (pg, x, 1); + svuint64_t mi = svreinterpret_u64 (m); +@@ -79,7 +95,7 @@ sv_log1p_inline (svfloat64_t x, const svbool_t pg) + svfloat64_t cm; + + #ifndef WANT_SV_LOG1P_K0_SHORTCUT +-#error \ ++#error \ + "Cannot use sv_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0" + #elif WANT_SV_LOG1P_K0_SHORTCUT + /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is +@@ -96,14 +112,46 @@ sv_log1p_inline (svfloat64_t x, const svbool_t pg) + #endif + + /* Approximate log1p(f) on the reduced input using a polynomial. */ +- svfloat64_t f2 = svmul_x (pg, f, f); +- svfloat64_t p = sv_pw_horner_18_f64_x (pg, f, f2, d->poly); ++ svfloat64_t f2 = svmul_x (svptrue_b64 (), f, f), ++ f4 = svmul_x (svptrue_b64 (), f2, f2), ++ f8 = svmul_x (svptrue_b64 (), f4, f4), ++ f16 = svmul_x (svptrue_b64 (), f8, f8); ++ ++ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1); ++ svfloat64_t c57 = svld1rq (svptrue_b64 (), &d->c5); ++ svfloat64_t c911 = svld1rq (svptrue_b64 (), &d->c9); ++ svfloat64_t c1315 = svld1rq (svptrue_b64 (), &d->c13); ++ svfloat64_t c1718 = svld1rq (svptrue_b64 (), &d->c17); ++ ++ /* Order-18 Estrin scheme. */ ++ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), f, c13, 0); ++ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), f, c13, 1); ++ svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), f, c57, 0); ++ svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), f, c57, 1); ++ ++ svfloat64_t p03 = svmla_x (pg, p01, f2, p23); ++ svfloat64_t p47 = svmla_x (pg, p45, f2, p67); ++ svfloat64_t p07 = svmla_x (pg, p03, f4, p47); ++ ++ svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), f, c911, 0); ++ svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), f, c911, 1); ++ svfloat64_t p1213 = svmla_lane (sv_f64 (d->c12), f, c1315, 0); ++ svfloat64_t p1415 = svmla_lane (sv_f64 (d->c14), f, c1315, 1); ++ ++ svfloat64_t p811 = svmla_x (pg, p89, f2, p1011); ++ svfloat64_t p1215 = svmla_x (pg, p1213, f2, p1415); ++ svfloat64_t p815 = svmla_x (pg, p811, f4, p1215); ++ ++ svfloat64_t p015 = svmla_x (pg, p07, f8, p815); ++ svfloat64_t p1617 = svmla_lane (sv_f64 (d->c16), f, c1718, 0); ++ svfloat64_t p1618 = svmla_lane (p1617, f2, c1718, 1); ++ svfloat64_t p = svmla_x (pg, p015, f16, p1618); + + /* Assemble log1p(x) = k * log2 + log1p(f) + c/m. */ +- svfloat64_t ylo = svmla_x (pg, cm, k, d->ln2[0]); +- svfloat64_t yhi = svmla_x (pg, f, k, d->ln2[1]); ++ svfloat64_t ln2_lo_hi = svld1rq (svptrue_b64 (), &d->ln2_lo); ++ svfloat64_t ylo = svmla_lane (cm, k, ln2_lo_hi, 0); ++ svfloat64_t yhi = svmla_lane (f, k, ln2_lo_hi, 1); + +- return svmla_x (pg, svadd_x (pg, ylo, yhi), f2, p); ++ return svmad_x (pg, p, f2, svadd_x (pg, ylo, yhi)); + } +- + #endif +diff --git a/sysdeps/aarch64/fpu/tanh_sve.c b/sysdeps/aarch64/fpu/tanh_sve.c +index 789cc6854f..5869419010 100644 +--- a/sysdeps/aarch64/fpu/tanh_sve.c ++++ b/sysdeps/aarch64/fpu/tanh_sve.c +@@ -18,83 +18,117 @@ + . */ + + #include "sv_math.h" +-#include "poly_sve_f64.h" + + static const struct data + { +- float64_t poly[11]; +- float64_t inv_ln2, ln2_hi, ln2_lo, shift; +- uint64_t thresh, tiny_bound; ++ double ln2_hi, ln2_lo; ++ double c2, c4; ++ double c0, c1, c3; ++ double two_over_ln2, shift; ++ uint64_t tiny_bound; ++ double large_bound, fexpa_bound; ++ uint64_t e2xm1_data[20]; + } data = { +- /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. 
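The correction term and the k = 0 shortcut above both deal with the rounding of m = 1 + x: c = x - (m - 1) is exactly the part of x lost in that addition (for |x| < 1 this is the standard Fast2Sum error term), and log1p(x) = log(m) + log1p(c/m) ~= log(m) + c/m. A small scalar demonstration of how much that correction is worth (the input is arbitrary, picked so that 1 + x visibly drops bits):

    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      double x = 0x1.0000001p-30 + 0x1p-60;  /* low bits disappear in 1 + x */
      double m = 1.0 + x;
      double c = x - (m - 1.0);              /* exactly the rounding error of 1 + x */
      printf ("log (1 + x), naive = %a\n", log (m));
      printf ("with c/m term      = %a\n", log (m) + c / m);
      printf ("log1p (x)          = %a\n", log1p (x));
      return 0;
    }

The k = 0 shortcut goes the other way: when no scaling by 2^k happened, keeping f = x and dropping the correction avoids introducing the 1 + x rounding in the first place.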
*/ +- .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5, +- 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10, +- 0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16, +- 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22, +- 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, }, +- +- .inv_ln2 = 0x1.71547652b82fep0, +- .ln2_hi = -0x1.62e42fefa39efp-1, +- .ln2_lo = -0x1.abc9e3b39803fp-56, +- .shift = 0x1.8p52, +- ++ /* Generated using Remez, in [-log(2)/128, log(2)/128]. */ ++ .c0 = 0x1p-1, ++ .c1 = 0x1.55555555548f9p-3, ++ .c2 = 0x1.5555555554c22p-5, ++ .c3 = 0x1.111123aaa2fb2p-7, ++ .c4 = 0x1.6c16d77d98e5bp-10, ++ .ln2_hi = 0x1.62e42fefa3800p-1, ++ .ln2_lo = 0x1.ef35793c76730p-45, ++ .two_over_ln2 = 0x1.71547652b82fep+1, ++ .shift = 0x1.800000000ffc0p+46, /* 1.5*2^46+1023. */ + .tiny_bound = 0x3e40000000000000, /* asuint64 (0x1p-27). */ +- /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */ +- .thresh = 0x01f241bf835f9d5f, ++ .large_bound = 0x1.30fc1931f09cap+4, /* arctanh(1 - 2^-54). */ ++ .fexpa_bound = 0x1.a56ef8ec924ccp-4, /* 19/64 * ln2/2. */ ++ /* Table lookup of 2^(i/64) - 1, for values of i from 0..19. */ ++ .e2xm1_data = { ++ 0x0000000000000000, 0x3f864d1f3bc03077, 0x3f966c34c5615d0f, 0x3fa0e8a30eb37901, ++ 0x3fa6ab0d9f3121ec, 0x3fac7d865a7a3440, 0x3fb1301d0125b50a, 0x3fb429aaea92ddfb, ++ 0x3fb72b83c7d517ae, 0x3fba35beb6fcb754, 0x3fbd4873168b9aa8, 0x3fc031dc431466b2, ++ 0x3fc1c3d373ab11c3, 0x3fc35a2b2f13e6e9, 0x3fc4f4efa8fef709, 0x3fc6942d3720185a, ++ 0x3fc837f0518db8a9, 0x3fc9e0459320b7fa, 0x3fcb8d39b9d54e55, 0x3fcd3ed9a72cffb7, ++ }, + }; + ++/* An expm1 inspired, FEXPA based helper function that returns an ++ accurate estimate for e^2x - 1. With no special case or support for ++ negative inputs of x. */ + static inline svfloat64_t +-expm1_inline (svfloat64_t x, const svbool_t pg, const struct data *d) +-{ +- /* Helper routine for calculating exp(x) - 1. Vector port of the helper from +- the scalar variant of tanh. */ +- +- /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ +- svfloat64_t j +- = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift); +- svint64_t i = svcvt_s64_x (pg, j); +- svfloat64_t f = svmla_x (pg, x, j, d->ln2_hi); +- f = svmla_x (pg, f, j, d->ln2_lo); +- +- /* Approximate expm1(f) using polynomial. */ +- svfloat64_t f2 = svmul_x (pg, f, f); +- svfloat64_t f4 = svmul_x (pg, f2, f2); +- svfloat64_t p = svmla_x ( +- pg, f, f2, +- sv_estrin_10_f64_x (pg, f, f2, f4, svmul_x (pg, f4, f4), d->poly)); +- +- /* t = 2 ^ i. */ +- svfloat64_t t = svscale_x (pg, sv_f64 (1), i); +- /* expm1(x) = p * t + (t - 1). */ +- return svmla_x (pg, svsub_x (pg, t, 1), p, t); +-} +- +-static svfloat64_t NOINLINE +-special_case (svfloat64_t x, svfloat64_t y, svbool_t special) ++e2xm1_inline (const svbool_t pg, svfloat64_t x, const struct data *d) + { +- return sv_call_f64 (tanh, x, y, special); ++ svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->two_over_ln2); ++ svuint64_t u = svreinterpret_u64 (z); ++ svfloat64_t n = svsub_x (pg, z, d->shift); ++ ++ /* r = x - n * ln2/2, r is in [-ln2/(2N), ln2/(2N)]. */ ++ svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi); ++ svfloat64_t r = svadd_x (pg, x, x); ++ r = svmls_lane (r, n, ln2, 0); ++ r = svmls_lane (r, n, ln2, 1); ++ ++ /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. 
*/ ++ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); ++ svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2); ++ ++ svfloat64_t p; ++ svfloat64_t c12 = svmla_lane (sv_f64 (d->c1), r, c24, 0); ++ svfloat64_t c34 = svmla_lane (sv_f64 (d->c3), r, c24, 1); ++ p = svmad_x (pg, c34, r2, c12); ++ p = svmad_x (pg, p, r, sv_f64 (d->c0)); ++ p = svmad_x (pg, p, r2, r); ++ ++ svfloat64_t scale = svexpa (u); ++ ++ /* We want to construct e2xm1(x) = (scale - 1) + scale * poly. ++ However, for values of scale close to 1, scale-1 causes large ULP errors ++ due to cancellation. ++ ++ This can be circumvented by using a small lookup for scale-1 ++ when our input is below a certain bound, otherwise we can use FEXPA. */ ++ svbool_t is_small = svaclt (pg, x, d->fexpa_bound); ++ ++ /* Index via the input of FEXPA, but we only care about the lower 5 bits. */ ++ svuint64_t base_idx = svand_x (pg, u, 0x1f); ++ ++ /* Compute scale - 1 from FEXPA, and lookup values where this fails. */ ++ svfloat64_t scalem1_estimate = svsub_x (pg, scale, sv_f64 (1.0)); ++ svuint64_t scalem1_lookup ++ = svld1_gather_index (is_small, d->e2xm1_data, base_idx); ++ ++ /* Select the appropriate scale - 1 value based on x. */ ++ svfloat64_t scalem1 ++ = svsel (is_small, svreinterpret_f64 (scalem1_lookup), scalem1_estimate); ++ return svmla_x (pg, scalem1, scale, p); + } + +-/* SVE approximation for double-precision tanh(x), using a simplified +- version of expm1. The greatest observed error is 2.77 ULP: +- _ZGVsMxv_tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3 +- want -0x1.bd6a21a163624p-3. */ ++/* SVE approximation for double-precision tanh(x), using a modified version of ++ FEXPA expm1 to calculate e^2x - 1. ++ The greatest observed error is 2.79 + 0.5 ULP: ++ _ZGVsMxv_tanh (0x1.fff868eb3c223p-9) got 0x1.fff7be486cae6p-9 ++ want 0x1.fff7be486cae9p-9. */ + svfloat64_t SV_NAME_D1 (tanh) (svfloat64_t x, svbool_t pg) + { + const struct data *d = ptr_barrier (&data); + +- svuint64_t ia = svreinterpret_u64 (svabs_x (pg, x)); ++ svbool_t large = svacge (pg, x, d->large_bound); + +- /* Trigger special-cases for tiny, boring and infinity/NaN. */ +- svbool_t special = svcmpgt (pg, svsub_x (pg, ia, d->tiny_bound), d->thresh); ++ /* We can use tanh(x) = (e^2x - 1) / (e^2x + 1) to approximate tanh. ++ As an additional optimisation, we can ensure more accurate values of e^x ++ by only using positive inputs. So we calculate tanh(|x|), and restore the ++ sign of the input before returning. */ ++ svfloat64_t ax = svabs_x (pg, x); ++ svuint64_t sign_bit ++ = sveor_x (pg, svreinterpret_u64 (x), svreinterpret_u64 (ax)); + +- svfloat64_t u = svadd_x (pg, x, x); ++ svfloat64_t p = e2xm1_inline (pg, ax, d); ++ svfloat64_t q = svadd_x (pg, p, 2); + +- /* tanh(x) = (e^2x - 1) / (e^2x + 1). */ +- svfloat64_t q = expm1_inline (u, pg, d); +- svfloat64_t qp2 = svadd_x (pg, q, 2); ++ /* For sufficiently high inputs, the result of tanh(|x|) is 1 when correctly ++ rounded, at this point we can return 1 directly, with sign correction. ++ This will also act as a guard against our approximation overflowing. 
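The rewritten tanh reduces to a single expm1-style evaluation through tanh(|x|) = (e^(2|x|) - 1) / (e^(2|x|) + 1) = p / (p + 2) with p = e^(2|x|) - 1, working on |x| for accuracy and pasting the sign back at the end; beyond large_bound = arctanh(1 - 2^-54) the correctly rounded result is exactly +/-1. The scalar equivalent, with libm expm1 in place of the FEXPA helper (illustrative only):

    #include <math.h>

    static double
    tanh_sketch (double x)
    {
      double ax = fabs (x);
      if (ax >= 0x1.30fc1931f09cap+4)   /* large_bound from the data block */
        return copysign (1.0, x);
      double p = expm1 (2.0 * ax);      /* e^(2|x|) - 1, finite below the bound */
      return copysign (p / (p + 2.0), x);
    }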
*/ ++ svfloat64_t y = svsel (large, sv_f64 (1.0), svdiv_x (pg, p, q)); + +- if (__glibc_unlikely (svptest_any (pg, special))) +- return special_case (x, svdiv_x (pg, q, qp2), special); +- return svdiv_x (pg, q, qp2); ++ return svreinterpret_f64 (svorr_x (pg, sign_bit, svreinterpret_u64 (y))); + } diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile index 772b16a358..1c3c392513 100644 --- a/sysdeps/aarch64/multiarch/Makefile @@ -3727,6 +7196,27 @@ + +END (__memset_sve_zva64) +#endif +diff --git a/sysdeps/arm/find_exidx.c b/sysdeps/arm/find_exidx.c +index 60021a072c..468e016214 100644 +--- a/sysdeps/arm/find_exidx.c ++++ b/sysdeps/arm/find_exidx.c +@@ -15,6 +15,7 @@ + License along with the GNU C Library. If not, see + . */ + ++#include + #include + + /* Find the exception index table containing PC. */ +@@ -23,7 +24,7 @@ _Unwind_Ptr + __gnu_Unwind_Find_exidx (_Unwind_Ptr pc, int * pcount) + { + struct dl_find_object data; +- if (__dl_find_object ((void *) pc, &data) < 0) ++ if (GLRO(dl_find_object) ((void *) pc, &data) < 0) + return 0; + *pcount = data.dlfo_eh_count; + return (_Unwind_Ptr) data.dlfo_eh_frame; diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h index e871f27ff2..ddb34a1588 100644 --- a/sysdeps/generic/ldsodefs.h @@ -5474,6 +8964,18 @@ ifeq ($(subdir),stdlib) gen-as-const-headers += ucontext_i.sym +diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c +index 6d63c8a9ec..1acc82d077 100644 +--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c ++++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c +@@ -23,6 +23,7 @@ + #include + #include + #include ++#include + + #define DCZID_DZP_MASK (1 << 4) + #define DCZID_BS_MASK (0xf) diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-aarch64-pkey.c b/sysdeps/unix/sysv/linux/aarch64/tst-aarch64-pkey.c index 3ff33ef72a..c884efc3b4 100644 --- a/sysdeps/unix/sysv/linux/aarch64/tst-aarch64-pkey.c
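The find_exidx.c hunk above now reaches the lookup through the GLRO(dl_find_object) hook instead of calling __dl_find_object directly; the data it consumes is the same struct dl_find_object that glibc has exposed publicly since 2.35. A minimal standalone use of that public interface, for orientation only (dlfo_eh_count is specific to ARM-style exidx targets, so it is not touched here):

    #define _GNU_SOURCE
    #include <dlfcn.h>
    #include <stdio.h>

    int
    main (void)
    {
      struct dl_find_object dlfo;
      /* Look up the object that contains this function's code address.  */
      if (_dl_find_object ((void *) &main, &dlfo) != 0)
        return 1;
      printf ("mapped %p..%p, EH data at %p\n",
              dlfo.dlfo_map_start, dlfo.dlfo_map_end, dlfo.dlfo_eh_frame);
      return 0;
    }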