Version in base suite: 2.41-12 Base version: glibc_2.41-12 Target version: glibc_2.41-12+deb13u1 Base file: /srv/ftp-master.debian.org/ftp/pool/main/g/glibc/glibc_2.41-12.dsc Target file: /srv/ftp-master.debian.org/policy/pool/main/g/glibc/glibc_2.41-12+deb13u1.dsc changelog | 16 patches/git-updates.diff | 3670 +++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 3602 insertions(+), 84 deletions(-) diff -Nru glibc-2.41/debian/changelog glibc-2.41/debian/changelog --- glibc-2.41/debian/changelog 2025-08-05 17:34:49.000000000 +0000 +++ glibc-2.41/debian/changelog 2025-12-28 16:32:33.000000000 +0000 @@ -1,3 +1,19 @@ +glibc (2.41-12+deb13u1) trixie; urgency=medium + + * debian/patches/git-updates.diff: update from upstream stable branch: + - Fix a double lock init issue after fork() + - Fix _dl_find_object when ld.so has LOAD segment gaps, causing wrong + backtrace unwinding. This affects at least arm64. + - Fix SYSCALL_CANCEL for return values larger than INT_MAX (closes: + #1115729). + - Fix crash in ifunc functions on arm64 when hardening with + -ftrivial-auto-var-init=zero. + - Optimize inverse trig functions on arm64. + - Optimize arm64 SVE exp, hyperbolic, and log1p functions. + - Optimize arm64 SVE expf and log1p helpers. + + -- Aurelien Jarno Sun, 28 Dec 2025 17:32:33 +0100 + glibc (2.41-12) unstable; urgency=medium [ Helmut Grohne ] diff -Nru glibc-2.41/debian/patches/git-updates.diff glibc-2.41/debian/patches/git-updates.diff --- glibc-2.41/debian/patches/git-updates.diff 2025-08-05 16:54:19.000000000 +0000 +++ glibc-2.41/debian/patches/git-updates.diff 2025-12-28 16:32:28.000000000 +0000 @@ -22,10 +22,10 @@ $(common-objdir):$(subst $(empty) ,:,$(patsubst ../$(subdir),.,$(rpath-dirs:%=$(common-objpfx)%))) else # build-static diff --git a/NEWS b/NEWS -index b11422b060..89d0935beb 100644 +index b11422b060..f77d1471c3 100644 --- a/NEWS +++ b/NEWS -@@ -5,6 +5,36 @@ See the end for copying conditions. +@@ -5,6 +5,39 @@ See the end for copying conditions. Please send GNU C library bug reports via using `glibc' in the "product" field. 
@@ -39,6 +39,7 @@ + +The following bugs were resolved with this release: + ++ [31943] _dl_find_object can fail if ld.so contains gaps between load segments + [32269] RISC-V IFUNC resolver cannot access gp pointer + [32626] math: math: log10p1f is not correctly rounded + [32627] math: math: sinhf is not correctly rounded @@ -56,8 +57,10 @@ + [32981] ports: elf/tst-execstack-prog-static-tunable fails on + sparc64-linux-gnu + [32987] elf: Fix subprocess status handling for tst-dlopen-sgid ++ [32994] stdlib: resolve a double lock init issue after fork + [33164] iconv -o should not create executable files + [33185] Fix double-free after allocation failure in regcomp ++ [33245] nptl: nptl: error in internal cancellation syscall handling + Version 2.41 @@ -909,14 +912,23 @@ AC_COMPILE_IFELSE([AC_LANG_SOURCE([[#ifdef PIE_UNSUPPORTED # error PIE is not supported diff --git a/elf/Makefile b/elf/Makefile -index 4b1d0d8741..f42cb154fb 100644 +index 4b1d0d8741..b8064ef14c 100644 --- a/elf/Makefile +++ b/elf/Makefile -@@ -61,6 +61,7 @@ dl-routines = \ +@@ -34,7 +34,6 @@ routines = \ + dl-addr \ + dl-addr-obj \ + dl-early_allocate \ +- dl-find_object \ + dl-iteratephdr \ + dl-libc \ + dl-origin \ +@@ -61,6 +60,8 @@ dl-routines = \ dl-deps \ dl-exception \ dl-execstack \ + dl-execstack-tunable \ ++ dl-find_object \ dl-fini \ dl-init \ dl-load \ @@ -936,7 +948,27 @@ tst-audit1 \ tst-audit2 \ tst-audit8 \ -@@ -567,9 +570,11 @@ tests-execstack-yes = \ +@@ -532,6 +535,8 @@ tests-internal += \ + tst-dl_find_object-threads \ + tst-dlmopen2 \ + tst-hash-collision3 \ ++ tst-link-map-contiguous-ldso \ ++ tst-link-map-contiguous-libc \ + tst-ptrguard1 \ + tst-stackguard1 \ + tst-tls-surplus \ +@@ -543,6 +548,10 @@ tests-internal += \ + unload2 \ + # tests-internal + ++ifeq ($(build-hardcoded-path-in-tests),yes) ++tests-internal += tst-link-map-contiguous-main ++endif ++ + tests-container += \ + tst-dlopen-self-container \ + tst-dlopen-tlsmodid-container \ +@@ -567,9 +576,11 @@ tests-execstack-yes = \ tst-execstack \ tst-execstack-needed \ tst-execstack-prog \ @@ -949,7 +981,7 @@ # tests-execstack-static-yes ifeq (yes,$(run-built-tests)) tests-execstack-special-yes = \ -@@ -863,6 +868,7 @@ modules-names += \ +@@ -863,6 +874,7 @@ modules-names += \ tst-auditmanymod8 \ tst-auditmanymod9 \ tst-auditmod-tlsdesc \ @@ -957,7 +989,7 @@ tst-auditmod1 \ tst-auditmod11 \ tst-auditmod12 \ -@@ -905,6 +911,7 @@ modules-names += \ +@@ -905,6 +917,7 @@ modules-names += \ tst-dlmopen1mod \ tst-dlopen-auditdup-auditmod \ tst-dlopen-auditdupmod \ @@ -965,7 +997,7 @@ tst-dlopen-tlsreinitmod1 \ tst-dlopen-tlsreinitmod2 \ tst-dlopen-tlsreinitmod3 \ -@@ -1144,6 +1151,10 @@ tests-pie += \ +@@ -1144,6 +1157,10 @@ tests-pie += \ tst-pie1 \ tst-pie2 \ # tests-pie @@ -976,7 +1008,7 @@ ifneq (,$(load-address-ldflag)) tests += \ tst-pie-address \ -@@ -1159,6 +1170,10 @@ tests += \ +@@ -1159,6 +1176,10 @@ tests += \ tests-static += \ tst-pie-address-static \ # tests-static @@ -987,7 +1019,7 @@ LDFLAGS-tst-pie-address-static += \ $(load-address-ldflag)=$(pde-load-address) endif -@@ -1988,6 +2003,9 @@ $(objpfx)tst-execstack.out: $(objpfx)tst-execstack-mod.so +@@ -1988,6 +2009,9 @@ $(objpfx)tst-execstack.out: $(objpfx)tst-execstack-mod.so CPPFLAGS-tst-execstack.c += -DUSE_PTHREADS=0 LDFLAGS-tst-execstack = -Wl,-z,noexecstack LDFLAGS-tst-execstack-mod.so = -Wl,-z,execstack @@ -997,7 +1029,7 @@ $(objpfx)tst-execstack-needed: $(objpfx)tst-execstack-mod.so LDFLAGS-tst-execstack-needed = -Wl,-z,noexecstack -@@ -1996,7 +2014,18 @@ 
LDFLAGS-tst-execstack-prog = -Wl,-z,execstack +@@ -1996,7 +2020,18 @@ LDFLAGS-tst-execstack-prog = -Wl,-z,execstack CFLAGS-tst-execstack-prog.c += -Wno-trampolines CFLAGS-tst-execstack-mod.c += -Wno-trampolines @@ -1016,7 +1048,7 @@ CFLAGS-tst-execstack-prog-static.c += -Wno-trampolines ifeq (yes,$(build-hardcoded-path-in-tests)) -@@ -2074,6 +2103,7 @@ $(objpfx)tst-array5-static-cmp.out: tst-array5-static.exp \ +@@ -2074,6 +2109,7 @@ $(objpfx)tst-array5-static-cmp.out: tst-array5-static.exp \ CFLAGS-tst-pie1.c += $(pie-ccflag) CFLAGS-tst-pie2.c += $(pie-ccflag) @@ -1024,7 +1056,7 @@ CFLAGS-tst-pie-address.c += $(pie-ccflag) $(objpfx)tst-piemod1.so: $(libsupport) -@@ -3189,6 +3219,9 @@ $(objpfx)tst-audit-tlsdesc.out: $(objpfx)tst-auditmod-tlsdesc.so +@@ -3189,6 +3225,9 @@ $(objpfx)tst-audit-tlsdesc.out: $(objpfx)tst-auditmod-tlsdesc.so tst-audit-tlsdesc-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so $(objpfx)tst-audit-tlsdesc-dlopen.out: $(objpfx)tst-auditmod-tlsdesc.so tst-audit-tlsdesc-dlopen-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so @@ -1034,7 +1066,7 @@ $(objpfx)tst-dlmopen-twice.out: \ $(objpfx)tst-dlmopen-twice-mod1.so \ -@@ -3392,3 +3425,5 @@ $(objpfx)tst-nolink-libc-2: $(objpfx)tst-nolink-libc.o +@@ -3392,3 +3431,5 @@ $(objpfx)tst-nolink-libc-2: $(objpfx)tst-nolink-libc.o -Wl,--dynamic-linker=$(objpfx)ld.so $(objpfx)tst-nolink-libc-2.out: $(objpfx)tst-nolink-libc-2 $(objpfx)ld.so $< > $@ 2>&1; $(evaluate-test) @@ -1098,6 +1130,146 @@ { return ENOSYS; } +diff --git a/elf/dl-find_object.c b/elf/dl-find_object.c +index 513e464011..c9f4c1c8d1 100644 +--- a/elf/dl-find_object.c ++++ b/elf/dl-find_object.c +@@ -356,7 +356,7 @@ _dlfo_lookup (uintptr_t pc, struct dl_find_object_internal *first1, size_t size) + } + + int +-__dl_find_object (void *pc1, struct dl_find_object *result) ++_dl_find_object (void *pc1, struct dl_find_object *result) + { + uintptr_t pc = (uintptr_t) pc1; + +@@ -463,8 +463,38 @@ __dl_find_object (void *pc1, struct dl_find_object *result) + return -1; + } /* Transaction retry loop. */ + } +-hidden_def (__dl_find_object) +-weak_alias (__dl_find_object, _dl_find_object) ++rtld_hidden_def (_dl_find_object) ++ ++/* Subroutine of _dlfo_process_initial to split out noncontigous link ++ maps. NODELETE is the number of used _dlfo_nodelete_mappings ++ elements. It is incremented as needed, and the new NODELETE value ++ is returned. */ ++static size_t ++_dlfo_process_initial_noncontiguous_map (struct link_map *map, ++ size_t nodelete) ++{ ++ struct dl_find_object_internal dlfo; ++ _dl_find_object_from_map (map, &dlfo); ++ ++ /* PT_LOAD segments for a non-contiguous link map are added to the ++ non-closeable mappings. */ ++ const ElfW(Phdr) *ph = map->l_phdr; ++ const ElfW(Phdr) *ph_end = map->l_phdr + map->l_phnum; ++ for (; ph < ph_end; ++ph) ++ if (ph->p_type == PT_LOAD) ++ { ++ if (_dlfo_nodelete_mappings != NULL) ++ { ++ /* Second pass only. */ ++ _dlfo_nodelete_mappings[nodelete] = dlfo; ++ ElfW(Addr) start = ph->p_vaddr + map->l_addr; ++ _dlfo_nodelete_mappings[nodelete].map_start = start; ++ _dlfo_nodelete_mappings[nodelete].map_end = start + ph->p_memsz; ++ } ++ ++nodelete; ++ } ++ return nodelete; ++} + + /* _dlfo_process_initial is called twice. First to compute the array + sizes from the initial loaded mappings. 
Second to fill in the +@@ -477,29 +507,8 @@ _dlfo_process_initial (void) + + size_t nodelete = 0; + if (!main_map->l_contiguous) +- { +- struct dl_find_object_internal dlfo; +- _dl_find_object_from_map (main_map, &dlfo); +- +- /* PT_LOAD segments for a non-contiguous are added to the +- non-closeable mappings. */ +- for (const ElfW(Phdr) *ph = main_map->l_phdr, +- *ph_end = main_map->l_phdr + main_map->l_phnum; +- ph < ph_end; ++ph) +- if (ph->p_type == PT_LOAD) +- { +- if (_dlfo_nodelete_mappings != NULL) +- { +- /* Second pass only. */ +- _dlfo_nodelete_mappings[nodelete] = dlfo; +- _dlfo_nodelete_mappings[nodelete].map_start +- = ph->p_vaddr + main_map->l_addr; +- _dlfo_nodelete_mappings[nodelete].map_end +- = _dlfo_nodelete_mappings[nodelete].map_start + ph->p_memsz; +- } +- ++nodelete; +- } +- } ++ /* Contiguous case already handled in _dl_find_object_init. */ ++ nodelete = _dlfo_process_initial_noncontiguous_map (main_map, nodelete); + + size_t loaded = 0; + for (Lmid_t ns = 0; ns < GL(dl_nns); ++ns) +@@ -511,11 +520,18 @@ _dlfo_process_initial (void) + /* lt_library link maps are implicitly NODELETE. */ + if (l->l_type == lt_library || l->l_nodelete_active) + { +- if (_dlfo_nodelete_mappings != NULL) +- /* Second pass only. */ +- _dl_find_object_from_map +- (l, _dlfo_nodelete_mappings + nodelete); +- ++nodelete; ++ /* The kernel may have loaded ld.so with gaps. */ ++ if (!l->l_contiguous && is_rtld_link_map (l)) ++ nodelete ++ = _dlfo_process_initial_noncontiguous_map (l, nodelete); ++ else ++ { ++ if (_dlfo_nodelete_mappings != NULL) ++ /* Second pass only. */ ++ _dl_find_object_from_map ++ (l, _dlfo_nodelete_mappings + nodelete); ++ ++nodelete; ++ } + } + else if (l->l_type == lt_loaded) + { +@@ -765,7 +781,6 @@ _dl_find_object_update_1 (struct link_map **loaded, size_t count) + /* Prefer newly loaded link map. */ + assert (loaded_index1 > 0); + _dl_find_object_from_map (loaded[loaded_index1 - 1], dlfo); +- loaded[loaded_index1 - 1]->l_find_object_processed = 1; + --loaded_index1; + } + +diff --git a/elf/dl-find_object.h b/elf/dl-find_object.h +index e433ff8740..563af3de1d 100644 +--- a/elf/dl-find_object.h ++++ b/elf/dl-find_object.h +@@ -87,7 +87,7 @@ _dl_find_object_to_external (struct dl_find_object_internal *internal, + } + + /* Extract the object location data from a link map and writes it to +- *RESULT using relaxed MO stores. */ ++ *RESULT using relaxed MO stores. Set L->l_find_object_processed. */ + static void __attribute__ ((unused)) + _dl_find_object_from_map (struct link_map *l, + struct dl_find_object_internal *result) +@@ -100,6 +100,8 @@ _dl_find_object_from_map (struct link_map *l, + atomic_store_relaxed (&result->eh_dbase, (void *) l->l_info[DT_PLTGOT]); + #endif + ++ l->l_find_object_processed = 1; ++ + for (const ElfW(Phdr) *ph = l->l_phdr, *ph_end = l->l_phdr + l->l_phnum; + ph < ph_end; ++ph) + if (ph->p_type == DLFO_EH_SEGMENT_TYPE) diff --git a/elf/dl-load.c b/elf/dl-load.c index f905578a65..945dd8a231 100644 --- a/elf/dl-load.c @@ -1181,10 +1353,71 @@ } } diff --git a/elf/rtld.c b/elf/rtld.c -index 00bec15316..7a8aa56377 100644 +index 00bec15316..c1e9721def 100644 --- a/elf/rtld.c +++ b/elf/rtld.c -@@ -1626,9 +1626,9 @@ dl_main (const ElfW(Phdr) *phdr, +@@ -1242,6 +1242,60 @@ rtld_setup_main_map (struct link_map *main_map) + return has_interp; + } + ++/* Set up the program header information for the dynamic linker ++ itself. It can be accessed via _r_debug and dl_iterate_phdr ++ callbacks, and it is used by _dl_find_object. 
*/ ++static void ++rtld_setup_phdr (void) ++{ ++ /* Starting from binutils-2.23, the linker will define the magic ++ symbol __ehdr_start to point to our own ELF header if it is ++ visible in a segment that also includes the phdrs. */ ++ ++ const ElfW(Ehdr) *rtld_ehdr = &__ehdr_start; ++ assert (rtld_ehdr->e_ehsize == sizeof *rtld_ehdr); ++ assert (rtld_ehdr->e_phentsize == sizeof (ElfW(Phdr))); ++ ++ const ElfW(Phdr) *rtld_phdr = (const void *) rtld_ehdr + rtld_ehdr->e_phoff; ++ ++ _dl_rtld_map.l_phdr = rtld_phdr; ++ _dl_rtld_map.l_phnum = rtld_ehdr->e_phnum; ++ ++ ++ _dl_rtld_map.l_contiguous = 1; ++ /* The linker may not have produced a contiguous object. The kernel ++ will load the object with actual gaps (unlike the glibc loader ++ for shared objects, which always produces a contiguous mapping). ++ See similar logic in rtld_setup_main_map above. */ ++ { ++ ElfW(Addr) expected_load_address = 0; ++ for (const ElfW(Phdr) *ph = rtld_phdr; ph < &rtld_phdr[rtld_ehdr->e_phnum]; ++ ++ph) ++ if (ph->p_type == PT_LOAD) ++ { ++ ElfW(Addr) mapstart = ph->p_vaddr & ~(GLRO(dl_pagesize) - 1); ++ if (_dl_rtld_map.l_contiguous && expected_load_address != 0 ++ && expected_load_address != mapstart) ++ _dl_rtld_map.l_contiguous = 0; ++ ElfW(Addr) allocend = ph->p_vaddr + ph->p_memsz; ++ /* The next expected address is the page following this load ++ segment. */ ++ expected_load_address = ((allocend + GLRO(dl_pagesize) - 1) ++ & ~(GLRO(dl_pagesize) - 1)); ++ } ++ } ++ ++ /* PT_GNU_RELRO is usually the last phdr. */ ++ size_t cnt = rtld_ehdr->e_phnum; ++ while (cnt-- > 0) ++ if (rtld_phdr[cnt].p_type == PT_GNU_RELRO) ++ { ++ _dl_rtld_map.l_relro_addr = rtld_phdr[cnt].p_vaddr; ++ _dl_rtld_map.l_relro_size = rtld_phdr[cnt].p_memsz; ++ break; ++ } ++} ++ + /* Adjusts the contents of the stack and related globals for the user + entry point. The ld.so processed skip_args arguments and bumped + _dl_argv and _dl_argc accordingly. Those arguments are removed from +@@ -1626,9 +1680,9 @@ dl_main (const ElfW(Phdr) *phdr, bool has_interp = rtld_setup_main_map (main_map); @@ -1197,6 +1430,41 @@ /* If the current libname is different from the SONAME, add the latter as well. */ +@@ -1710,33 +1764,7 @@ dl_main (const ElfW(Phdr) *phdr, + ++GL(dl_ns)[LM_ID_BASE]._ns_nloaded; + ++GL(dl_load_adds); + +- /* Starting from binutils-2.23, the linker will define the magic symbol +- __ehdr_start to point to our own ELF header if it is visible in a +- segment that also includes the phdrs. If that's not available, we use +- the old method that assumes the beginning of the file is part of the +- lowest-addressed PT_LOAD segment. */ +- +- /* Set up the program header information for the dynamic linker +- itself. It is needed in the dl_iterate_phdr callbacks. */ +- const ElfW(Ehdr) *rtld_ehdr = &__ehdr_start; +- assert (rtld_ehdr->e_ehsize == sizeof *rtld_ehdr); +- assert (rtld_ehdr->e_phentsize == sizeof (ElfW(Phdr))); +- +- const ElfW(Phdr) *rtld_phdr = (const void *) rtld_ehdr + rtld_ehdr->e_phoff; +- +- _dl_rtld_map.l_phdr = rtld_phdr; +- _dl_rtld_map.l_phnum = rtld_ehdr->e_phnum; +- +- +- /* PT_GNU_RELRO is usually the last phdr. */ +- size_t cnt = rtld_ehdr->e_phnum; +- while (cnt-- > 0) +- if (rtld_phdr[cnt].p_type == PT_GNU_RELRO) +- { +- _dl_rtld_map.l_relro_addr = rtld_phdr[cnt].p_vaddr; +- _dl_rtld_map.l_relro_size = rtld_phdr[cnt].p_memsz; +- break; +- } ++ rtld_setup_phdr (); + + /* Add the dynamic linker to the TLS list if it also uses TLS. 
*/ + if (_dl_rtld_map.l_tls_blocksize != 0) diff --git a/elf/tst-audit-tlsdesc-dlopen2.c b/elf/tst-audit-tlsdesc-dlopen2.c new file mode 100644 index 0000000000..7ba2c4129a @@ -1518,6 +1786,224 @@ +++ b/elf/tst-execstack-tunable.c @@ -0,0 +1 @@ +#include +diff --git a/elf/tst-link-map-contiguous-ldso.c b/elf/tst-link-map-contiguous-ldso.c +new file mode 100644 +index 0000000000..04de808bb2 +--- /dev/null ++++ b/elf/tst-link-map-contiguous-ldso.c +@@ -0,0 +1,98 @@ ++/* Check that _dl_find_object behavior matches up with gaps. ++ Copyright (C) 2025 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static int ++do_test (void) ++{ ++ struct link_map *l = xdlopen (LD_SO, RTLD_NOW); ++ if (!l->l_contiguous) ++ { ++ puts ("info: ld.so link map is not contiguous"); ++ ++ /* Try to find holes by probing with mmap. */ ++ int pagesize = getpagesize (); ++ bool gap_found = false; ++ ElfW(Addr) addr = l->l_map_start; ++ TEST_COMPARE (addr % pagesize, 0); ++ while (addr < l->l_map_end) ++ { ++ void *expected = (void *) addr; ++ void *ptr = xmmap (expected, 1, PROT_READ | PROT_WRITE, ++ MAP_PRIVATE | MAP_ANONYMOUS, -1); ++ struct dl_find_object dlfo; ++ int dlfo_ret = _dl_find_object (expected, &dlfo); ++ if (ptr == expected) ++ { ++ if (dlfo_ret < 0) ++ { ++ TEST_COMPARE (dlfo_ret, -1); ++ printf ("info: hole without mapping data found at %p\n", ptr); ++ } ++ else ++ FAIL ("object \"%s\" found in gap at %p", ++ dlfo.dlfo_link_map->l_name, ptr); ++ gap_found = true; ++ } ++ else if (dlfo_ret == 0) ++ { ++ if ((void *) dlfo.dlfo_link_map != (void *) l) ++ { ++ printf ("info: object \"%s\" found at %p\n", ++ dlfo.dlfo_link_map->l_name, ptr); ++ gap_found = true; ++ } ++ } ++ else ++ TEST_COMPARE (dlfo_ret, -1); ++ xmunmap (ptr, 1); ++ addr += pagesize; ++ } ++ if (!gap_found) ++ FAIL ("no ld.so gap found"); ++ } ++ else ++ { ++ puts ("info: ld.so link map is contiguous"); ++ ++ /* Assert that ld.so is truly contiguous in memory. */ ++ volatile long int *p = (volatile long int *) l->l_map_start; ++ volatile long int *end = (volatile long int *) l->l_map_end; ++ while (p < end) ++ { ++ *p; ++ ++p; ++ } ++ } ++ ++ xdlclose (l); ++ ++ return 0; ++} ++ ++#include +diff --git a/elf/tst-link-map-contiguous-libc.c b/elf/tst-link-map-contiguous-libc.c +new file mode 100644 +index 0000000000..eb5728c765 +--- /dev/null ++++ b/elf/tst-link-map-contiguous-libc.c +@@ -0,0 +1,57 @@ ++/* Check that the entire libc.so program image is readable if contiguous. ++ Copyright (C) 2025 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static int ++do_test (void) ++{ ++ struct link_map *l = xdlopen (LIBC_SO, RTLD_NOW); ++ ++ /* The dynamic loader fills holes with PROT_NONE mappings. */ ++ if (!l->l_contiguous) ++ FAIL_EXIT1 ("libc.so link map is not contiguous"); ++ ++ /* Direct probing does not work because not everything is readable ++ due to PROT_NONE mappings. */ ++ int pagesize = getpagesize (); ++ ElfW(Addr) addr = l->l_map_start; ++ TEST_COMPARE (addr % pagesize, 0); ++ while (addr < l->l_map_end) ++ { ++ void *expected = (void *) addr; ++ void *ptr = xmmap (expected, 1, PROT_READ | PROT_WRITE, ++ MAP_PRIVATE | MAP_ANONYMOUS, -1); ++ if (ptr == expected) ++ FAIL ("hole in libc.so memory image after %lu bytes", ++ (unsigned long int) (addr - l->l_map_start)); ++ xmunmap (ptr, 1); ++ addr += pagesize; ++ } ++ ++ xdlclose (l); ++ ++ return 0; ++} ++#include +diff --git a/elf/tst-link-map-contiguous-main.c b/elf/tst-link-map-contiguous-main.c +new file mode 100644 +index 0000000000..2d1a054f0f +--- /dev/null ++++ b/elf/tst-link-map-contiguous-main.c +@@ -0,0 +1,45 @@ ++/* Check that the entire main program image is readable if contiguous. ++ Copyright (C) 2025 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++ ++static int ++do_test (void) ++{ ++ struct link_map *l = xdlopen ("", RTLD_NOW); ++ if (!l->l_contiguous) ++ FAIL_UNSUPPORTED ("main link map is not contiguous"); ++ ++ /* This check only works if the kernel loaded the main program. The ++ dynamic loader replaces gaps with PROT_NONE mappings, resulting ++ in faults. */ ++ volatile long int *p = (volatile long int *) l->l_map_start; ++ volatile long int *end = (volatile long int *) l->l_map_end; ++ while (p < end) ++ { ++ *p; ++ ++p; ++ } ++ ++ xdlclose (l); ++ ++ return 0; ++} ++#include diff --git a/elf/tst-pie-bss-static.c b/elf/tst-pie-bss-static.c new file mode 100644 index 0000000000..5df542f9ee @@ -1628,6 +2114,20 @@ if ! 
cmp -s "$tmp/out" "$tmp/expected" ; then echo "error: iconv output difference" >&$logfd echo "*** expected ***" >&$logfd +diff --git a/include/dlfcn.h b/include/dlfcn.h +index f49ee1b0c9..a44420fa37 100644 +--- a/include/dlfcn.h ++++ b/include/dlfcn.h +@@ -4,8 +4,7 @@ + #include /* For ElfW. */ + #include + +-extern __typeof (_dl_find_object) __dl_find_object; +-hidden_proto (__dl_find_object) ++rtld_hidden_proto (_dl_find_object) + + /* Internally used flag. */ + #define __RTLD_DLOPEN 0x80000000 diff --git a/math/auto-libm-test-in b/math/auto-libm-test-in index 01ba689aa8..4f194da19d 100644 --- a/math/auto-libm-test-in @@ -1792,6 +2292,21 @@ tst-stackguard1-ARGS = --command "$(host-test-program-cmd) --child" tst-stackguard1-static-ARGS = --command "$(objpfx)tst-stackguard1-static --child" +diff --git a/nptl/cancellation.c b/nptl/cancellation.c +index 156e63dcf0..bed0383a23 100644 +--- a/nptl/cancellation.c ++++ b/nptl/cancellation.c +@@ -72,8 +72,8 @@ __syscall_cancel (__syscall_arg_t a1, __syscall_arg_t a2, + __syscall_arg_t a5, __syscall_arg_t a6, + __SYSCALL_CANCEL7_ARG_DEF __syscall_arg_t nr) + { +- int r = __internal_syscall_cancel (a1, a2, a3, a4, a5, a6, +- __SYSCALL_CANCEL7_ARG nr); ++ long int r = __internal_syscall_cancel (a1, a2, a3, a4, a5, a6, ++ __SYSCALL_CANCEL7_ARG nr); + return __glibc_unlikely (INTERNAL_SYSCALL_ERROR_P (r)) + ? SYSCALL_ERROR_LABEL (INTERNAL_SYSCALL_ERRNO (r)) + : r; diff --git a/nptl/pthread_cancel.c b/nptl/pthread_cancel.c index f7ce3ec51b..b838273881 100644 --- a/nptl/pthread_cancel.c @@ -2091,6 +2606,30 @@ tst-secure-getenv \ # tests-static +diff --git a/stdlib/abort.c b/stdlib/abort.c +index caa9e6dc04..904244a2fb 100644 +--- a/stdlib/abort.c ++++ b/stdlib/abort.c +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + #include + + /* Try to get a machine dependent instruction which will make the +@@ -42,7 +43,10 @@ __libc_rwlock_define_initialized (static, lock); + void + __abort_fork_reset_child (void) + { +- __libc_rwlock_init (lock); ++ /* Reinitialize lock without calling pthread_rwlock_init, to ++ avoid a valgrind DRD false positive. */ ++ __libc_rwlock_define_initialized (, reset_lock); ++ memcpy (&lock, &reset_lock, sizeof (lock)); + } + + void diff --git a/stdlib/getenv.c b/stdlib/getenv.c index 5e7212cca6..1a7b0bfc06 100644 --- a/stdlib/getenv.c @@ -2424,6 +2963,521 @@ } void +diff --git a/sysdeps/aarch64/fpu/acos_advsimd.c b/sysdeps/aarch64/fpu/acos_advsimd.c +index 7709b5454f..453f780314 100644 +--- a/sysdeps/aarch64/fpu/acos_advsimd.c ++++ b/sysdeps/aarch64/fpu/acos_advsimd.c +@@ -18,24 +18,23 @@ + . */ + + #include "v_math.h" +-#include "poly_advsimd_f64.h" + + static const struct data + { +- float64x2_t poly[12]; +- float64x2_t pi, pi_over_2; ++ double c1, c3, c5, c7, c9, c11; ++ float64x2_t c0, c2, c4, c6, c8, c10; + uint64x2_t abs_mask; ++ float64x2_t pi, pi_over_2; + } data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) + on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. 
*/ +- .poly = { V2 (0x1.555555555554ep-3), V2 (0x1.3333333337233p-4), +- V2 (0x1.6db6db67f6d9fp-5), V2 (0x1.f1c71fbd29fbbp-6), +- V2 (0x1.6e8b264d467d6p-6), V2 (0x1.1c5997c357e9dp-6), +- V2 (0x1.c86a22cd9389dp-7), V2 (0x1.856073c22ebbep-7), +- V2 (0x1.fd1151acb6bedp-8), V2 (0x1.087182f799c1dp-6), +- V2 (-0x1.6602748120927p-7), V2 (0x1.cfa0dd1f9478p-6), }, +- .pi = V2 (0x1.921fb54442d18p+1), +- .pi_over_2 = V2 (0x1.921fb54442d18p+0), ++ .c0 = V2 (0x1.555555555554ep-3), .c1 = 0x1.3333333337233p-4, ++ .c2 = V2 (0x1.6db6db67f6d9fp-5), .c3 = 0x1.f1c71fbd29fbbp-6, ++ .c4 = V2 (0x1.6e8b264d467d6p-6), .c5 = 0x1.1c5997c357e9dp-6, ++ .c6 = V2 (0x1.c86a22cd9389dp-7), .c7 = 0x1.856073c22ebbep-7, ++ .c8 = V2 (0x1.fd1151acb6bedp-8), .c9 = 0x1.087182f799c1dp-6, ++ .c10 = V2 (-0x1.6602748120927p-7), .c11 = 0x1.cfa0dd1f9478p-6, ++ .pi = V2 (0x1.921fb54442d18p+1), .pi_over_2 = V2 (0x1.921fb54442d18p+0), + .abs_mask = V2 (0x7fffffffffffffff), + }; + +@@ -63,7 +62,7 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t special) + + acos(x) ~ pi/2 - (x + x^3 P(x^2)). + +- The largest observed error in this region is 1.18 ulps, ++ The largest observed error in this region is 1.18 ulp: + _ZGVnN2v_acos (0x1.fbab0a7c460f6p-2) got 0x1.0d54d1985c068p+0 + want 0x1.0d54d1985c069p+0. + +@@ -71,9 +70,9 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t special) + + acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z). + +- The largest observed error in this region is 1.52 ulps, +- _ZGVnN2v_acos (0x1.23d362722f591p-1) got 0x1.edbbedf8a7d6ep-1 +- want 0x1.edbbedf8a7d6cp-1. */ ++ The largest observed error in this region is 1.50 ulp: ++ _ZGVnN2v_acos (0x1.252a2cf3fb9acp-1) got 0x1.ec1a46aa82901p-1 ++ want 0x1.ec1a46aa829p-1. */ + float64x2_t VPCS_ATTR V_NAME_D1 (acos) (float64x2_t x) + { + const struct data *d = ptr_barrier (&data); +@@ -99,13 +98,32 @@ float64x2_t VPCS_ATTR V_NAME_D1 (acos) (float64x2_t x) + float64x2_t z = vbslq_f64 (a_le_half, ax, vsqrtq_f64 (z2)); + + /* Use a single polynomial approximation P for both intervals. */ ++ float64x2_t z3 = vmulq_f64 (z2, z); + float64x2_t z4 = vmulq_f64 (z2, z2); + float64x2_t z8 = vmulq_f64 (z4, z4); +- float64x2_t z16 = vmulq_f64 (z8, z8); +- float64x2_t p = v_estrin_11_f64 (z2, z4, z8, z16, d->poly); + +- /* Finalize polynomial: z + z * z2 * P(z2). */ +- p = vfmaq_f64 (z, vmulq_f64 (z, z2), p); ++ /* Order-11 Estrin. */ ++ float64x2_t c13 = vld1q_f64 (&d->c1); ++ float64x2_t c57 = vld1q_f64 (&d->c5); ++ float64x2_t c911 = vld1q_f64 (&d->c9); ++ ++ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); ++ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); ++ float64x2_t p03 = vfmaq_f64 (p01, z4, p23); ++ ++ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); ++ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); ++ float64x2_t p47 = vfmaq_f64 (p45, z4, p67); ++ ++ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); ++ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); ++ float64x2_t p811 = vfmaq_f64 (p89, z4, p1011); ++ ++ float64x2_t p411 = vfmaq_f64 (p47, z8, p811); ++ float64x2_t p = vfmaq_f64 (p03, z8, p411); ++ ++ /* Finalize polynomial: z + z3 * P(z2). */ ++ p = vfmaq_f64 (z, z3, p); + + /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5 + = 2 Q(|x|) , for 0.5 < x < 1.0 +diff --git a/sysdeps/aarch64/fpu/acos_sve.c b/sysdeps/aarch64/fpu/acos_sve.c +index 74e2f7df0f..104f0d7805 100644 +--- a/sysdeps/aarch64/fpu/acos_sve.c ++++ b/sysdeps/aarch64/fpu/acos_sve.c +@@ -18,20 +18,21 @@ + . 
*/ + + #include "sv_math.h" +-#include "poly_sve_f64.h" + + static const struct data + { +- float64_t poly[12]; +- float64_t pi, pi_over_2; ++ float64_t c1, c3, c5, c7, c9, c11; ++ float64_t c0, c2, c4, c6, c8, c10; ++ float64_t pi_over_2; + } data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) + on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */ +- .poly = { 0x1.555555555554ep-3, 0x1.3333333337233p-4, 0x1.6db6db67f6d9fp-5, +- 0x1.f1c71fbd29fbbp-6, 0x1.6e8b264d467d6p-6, 0x1.1c5997c357e9dp-6, +- 0x1.c86a22cd9389dp-7, 0x1.856073c22ebbep-7, 0x1.fd1151acb6bedp-8, +- 0x1.087182f799c1dp-6, -0x1.6602748120927p-7, 0x1.cfa0dd1f9478p-6, }, +- .pi = 0x1.921fb54442d18p+1, ++ .c0 = 0x1.555555555554ep-3, .c1 = 0x1.3333333337233p-4, ++ .c2 = 0x1.6db6db67f6d9fp-5, .c3 = 0x1.f1c71fbd29fbbp-6, ++ .c4 = 0x1.6e8b264d467d6p-6, .c5 = 0x1.1c5997c357e9dp-6, ++ .c6 = 0x1.c86a22cd9389dp-7, .c7 = 0x1.856073c22ebbep-7, ++ .c8 = 0x1.fd1151acb6bedp-8, .c9 = 0x1.087182f799c1dp-6, ++ .c10 = -0x1.6602748120927p-7, .c11 = 0x1.cfa0dd1f9478p-6, + .pi_over_2 = 0x1.921fb54442d18p+0, + }; + +@@ -42,20 +43,21 @@ static const struct data + + acos(x) ~ pi/2 - (x + x^3 P(x^2)). + +- The largest observed error in this region is 1.18 ulps, +- _ZGVsMxv_acos (0x1.fbc5fe28ee9e3p-2) got 0x1.0d4d0f55667f6p+0 +- want 0x1.0d4d0f55667f7p+0. ++ The largest observed error in this region is 1.18 ulp: ++ _ZGVsMxv_acos (0x1.fbb7c9079b429p-2) got 0x1.0d51266607582p+0 ++ want 0x1.0d51266607583p+0. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z). + +- The largest observed error in this region is 1.52 ulps, +- _ZGVsMxv_acos (0x1.24024271a500ap-1) got 0x1.ed82df4243f0dp-1 +- want 0x1.ed82df4243f0bp-1. */ ++ The largest observed error in this region is 1.50 ulp: ++ _ZGVsMxv_acos (0x1.252a2cf3fb9acp-1) got 0x1.ec1a46aa82901p-1 ++ want 0x1.ec1a46aa829p-1. */ + svfloat64_t SV_NAME_D1 (acos) (svfloat64_t x, const svbool_t pg) + { + const struct data *d = ptr_barrier (&data); ++ svbool_t ptrue = svptrue_b64 (); + + svuint64_t sign = svand_x (pg, svreinterpret_u64 (x), 0x8000000000000000); + svfloat64_t ax = svabs_x (pg, x); +@@ -70,24 +72,41 @@ svfloat64_t SV_NAME_D1 (acos) (svfloat64_t x, const svbool_t pg) + svfloat64_t z = svsqrt_m (ax, a_gt_half, z2); + + /* Use a single polynomial approximation P for both intervals. 
*/ +- svfloat64_t z4 = svmul_x (pg, z2, z2); +- svfloat64_t z8 = svmul_x (pg, z4, z4); +- svfloat64_t z16 = svmul_x (pg, z8, z8); +- svfloat64_t p = sv_estrin_11_f64_x (pg, z2, z4, z8, z16, d->poly); ++ svfloat64_t z3 = svmul_x (ptrue, z2, z); ++ svfloat64_t z4 = svmul_x (ptrue, z2, z2); ++ svfloat64_t z8 = svmul_x (ptrue, z4, z4); ++ ++ svfloat64_t c13 = svld1rq (ptrue, &d->c1); ++ svfloat64_t c57 = svld1rq (ptrue, &d->c5); ++ svfloat64_t c911 = svld1rq (ptrue, &d->c9); ++ ++ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), z2, c13, 0); ++ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), z2, c13, 1); ++ svfloat64_t p03 = svmla_x (pg, p01, z4, p23); ++ ++ svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), z2, c57, 0); ++ svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), z2, c57, 1); ++ svfloat64_t p47 = svmla_x (pg, p45, z4, p67); ++ ++ svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), z2, c911, 0); ++ svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), z2, c911, 1); ++ svfloat64_t p811 = svmla_x (pg, p89, z4, p1011); ++ ++ svfloat64_t p411 = svmla_x (pg, p47, z8, p811); ++ svfloat64_t p = svmad_x (pg, p411, z8, p03); + + /* Finalize polynomial: z + z * z2 * P(z2). */ +- p = svmla_x (pg, z, svmul_x (pg, z, z2), p); ++ p = svmad_x (pg, p, z3, z); + + /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5 + = 2 Q(|x|) , for 0.5 < x < 1.0 + = pi - 2 Q(|x|) , for -1.0 < x < -0.5. */ +- svfloat64_t y +- = svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (p), sign)); +- +- svbool_t is_neg = svcmplt (pg, x, 0.0); +- svfloat64_t off = svdup_f64_z (is_neg, d->pi); +- svfloat64_t mul = svsel (a_gt_half, sv_f64 (2.0), sv_f64 (-1.0)); +- svfloat64_t add = svsel (a_gt_half, off, sv_f64 (d->pi_over_2)); +- +- return svmla_x (pg, add, mul, y); ++ svfloat64_t mul = svreinterpret_f64 ( ++ svlsl_m (a_gt_half, svreinterpret_u64 (sv_f64 (1.0)), 10)); ++ mul = svreinterpret_f64 (sveor_x (ptrue, svreinterpret_u64 (mul), sign)); ++ svfloat64_t add = svreinterpret_f64 ( ++ svorr_x (ptrue, sign, svreinterpret_u64 (sv_f64 (d->pi_over_2)))); ++ add = svsub_m (a_gt_half, sv_f64 (d->pi_over_2), add); ++ ++ return svmsb_x (pg, p, mul, add); + } +diff --git a/sysdeps/aarch64/fpu/acosh_sve.c b/sysdeps/aarch64/fpu/acosh_sve.c +index 326b2cca2e..3a84959f0a 100644 +--- a/sysdeps/aarch64/fpu/acosh_sve.c ++++ b/sysdeps/aarch64/fpu/acosh_sve.c +@@ -30,10 +30,10 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t special) + } + + /* SVE approximation for double-precision acosh, based on log1p. +- The largest observed error is 3.19 ULP in the region where the ++ The largest observed error is 3.14 ULP in the region where the + argument to log1p falls in the k=0 interval, i.e. x close to 1: +- SV_NAME_D1 (acosh)(0x1.1e4388d4ca821p+0) got 0x1.ed23399f5137p-2 +- want 0x1.ed23399f51373p-2. */ ++ SV_NAME_D1 (acosh)(0x1.1e80ed12f0ad1p+0) got 0x1.ef0cee7c33ce1p-2 ++ want 0x1.ef0cee7c33ce4p-2. */ + svfloat64_t SV_NAME_D1 (acosh) (svfloat64_t x, const svbool_t pg) + { + /* (ix - One) >= (BigBound - One). */ +diff --git a/sysdeps/aarch64/fpu/asin_advsimd.c b/sysdeps/aarch64/fpu/asin_advsimd.c +index 414211627e..f74141c845 100644 +--- a/sysdeps/aarch64/fpu/asin_advsimd.c ++++ b/sysdeps/aarch64/fpu/asin_advsimd.c +@@ -18,24 +18,23 @@ + . 
*/ + + #include "v_math.h" +-#include "poly_advsimd_f64.h" + + static const struct data + { +- float64x2_t poly[12]; ++ float64x2_t c0, c2, c4, c6, c8, c10; + float64x2_t pi_over_2; + uint64x2_t abs_mask; ++ double c1, c3, c5, c7, c9, c11; + } data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) + on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */ +- .poly = { V2 (0x1.555555555554ep-3), V2 (0x1.3333333337233p-4), +- V2 (0x1.6db6db67f6d9fp-5), V2 (0x1.f1c71fbd29fbbp-6), +- V2 (0x1.6e8b264d467d6p-6), V2 (0x1.1c5997c357e9dp-6), +- V2 (0x1.c86a22cd9389dp-7), V2 (0x1.856073c22ebbep-7), +- V2 (0x1.fd1151acb6bedp-8), V2 (0x1.087182f799c1dp-6), +- V2 (-0x1.6602748120927p-7), V2 (0x1.cfa0dd1f9478p-6), }, +- .pi_over_2 = V2 (0x1.921fb54442d18p+0), +- .abs_mask = V2 (0x7fffffffffffffff), ++ .c0 = V2 (0x1.555555555554ep-3), .c1 = 0x1.3333333337233p-4, ++ .c2 = V2 (0x1.6db6db67f6d9fp-5), .c3 = 0x1.f1c71fbd29fbbp-6, ++ .c4 = V2 (0x1.6e8b264d467d6p-6), .c5 = 0x1.1c5997c357e9dp-6, ++ .c6 = V2 (0x1.c86a22cd9389dp-7), .c7 = 0x1.856073c22ebbep-7, ++ .c8 = V2 (0x1.fd1151acb6bedp-8), .c9 = 0x1.087182f799c1dp-6, ++ .c10 = V2 (-0x1.6602748120927p-7), .c11 = 0x1.cfa0dd1f9478p-6, ++ .pi_over_2 = V2 (0x1.921fb54442d18p+0), .abs_mask = V2 (0x7fffffffffffffff), + }; + + #define AllMask v_u64 (0xffffffffffffffff) +@@ -68,8 +67,8 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t special) + asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 2.69 ulps, +- _ZGVnN2v_asin (0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1 +- want 0x1.110d7e85fdd53p-1. */ ++ _ZGVnN2v_asin (0x1.044e8cefee301p-1) got 0x1.1111dd54ddf96p-1 ++ want 0x1.1111dd54ddf99p-1. */ + float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x) + { + const struct data *d = ptr_barrier (&data); +@@ -86,7 +85,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x) + return special_case (x, x, AllMask); + #endif + +- uint64x2_t a_lt_half = vcltq_f64 (ax, v_f64 (0.5)); ++ uint64x2_t a_lt_half = vcaltq_f64 (x, v_f64 (0.5)); + + /* Evaluate polynomial Q(x) = y + y * z * P(z) with + z = x ^ 2 and y = |x| , if |x| < 0.5 +@@ -99,7 +98,26 @@ float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x) + float64x2_t z4 = vmulq_f64 (z2, z2); + float64x2_t z8 = vmulq_f64 (z4, z4); + float64x2_t z16 = vmulq_f64 (z8, z8); +- float64x2_t p = v_estrin_11_f64 (z2, z4, z8, z16, d->poly); ++ ++ /* order-11 estrin. */ ++ float64x2_t c13 = vld1q_f64 (&d->c1); ++ float64x2_t c57 = vld1q_f64 (&d->c5); ++ float64x2_t c911 = vld1q_f64 (&d->c9); ++ ++ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); ++ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); ++ float64x2_t p03 = vfmaq_f64 (p01, z4, p23); ++ ++ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); ++ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); ++ float64x2_t p47 = vfmaq_f64 (p45, z4, p67); ++ ++ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); ++ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); ++ float64x2_t p811 = vfmaq_f64 (p89, z4, p1011); ++ ++ float64x2_t p07 = vfmaq_f64 (p03, z8, p47); ++ float64x2_t p = vfmaq_f64 (p07, z16, p811); + + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = vfmaq_f64 (z, vmulq_f64 (z, z2), p); +diff --git a/sysdeps/aarch64/fpu/asin_sve.c b/sysdeps/aarch64/fpu/asin_sve.c +index 9314466f58..975f408bee 100644 +--- a/sysdeps/aarch64/fpu/asin_sve.c ++++ b/sysdeps/aarch64/fpu/asin_sve.c +@@ -18,45 +18,43 @@ + . 
*/ + + #include "sv_math.h" +-#include "poly_sve_f64.h" + + static const struct data + { +- float64_t poly[12]; +- float64_t pi_over_2f; ++ float64_t c1, c3, c5, c7, c9, c11; ++ float64_t c0, c2, c4, c6, c8, c10; ++ float64_t pi_over_2; + } data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) + on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */ +- .poly = { 0x1.555555555554ep-3, 0x1.3333333337233p-4, +- 0x1.6db6db67f6d9fp-5, 0x1.f1c71fbd29fbbp-6, +- 0x1.6e8b264d467d6p-6, 0x1.1c5997c357e9dp-6, +- 0x1.c86a22cd9389dp-7, 0x1.856073c22ebbep-7, +- 0x1.fd1151acb6bedp-8, 0x1.087182f799c1dp-6, +- -0x1.6602748120927p-7, 0x1.cfa0dd1f9478p-6, }, +- .pi_over_2f = 0x1.921fb54442d18p+0, ++ .c0 = 0x1.555555555554ep-3, .c1 = 0x1.3333333337233p-4, ++ .c2 = 0x1.6db6db67f6d9fp-5, .c3 = 0x1.f1c71fbd29fbbp-6, ++ .c4 = 0x1.6e8b264d467d6p-6, .c5 = 0x1.1c5997c357e9dp-6, ++ .c6 = 0x1.c86a22cd9389dp-7, .c7 = 0x1.856073c22ebbep-7, ++ .c8 = 0x1.fd1151acb6bedp-8, .c9 = 0x1.087182f799c1dp-6, ++ .c10 = -0x1.6602748120927p-7, .c11 = 0x1.cfa0dd1f9478p-6, ++ .pi_over_2 = 0x1.921fb54442d18p+0, + }; + +-#define P(i) sv_f64 (d->poly[i]) +- + /* Double-precision SVE implementation of vector asin(x). + + For |x| in [0, 0.5], use an order 11 polynomial P such that the final + approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2). + +- The largest observed error in this region is 0.52 ulps, +- _ZGVsMxv_asin(0x1.d95ae04998b6cp-2) got 0x1.ec13757305f27p-2 +- want 0x1.ec13757305f26p-2. +- +- For |x| in [0.5, 1.0], use same approximation with a change of variable ++ The largest observed error in this region is 0.98 ulp: ++ _ZGVsMxv_asin (0x1.d98f6a748ed8ap-2) got 0x1.ec4eb661a73d3p-2 ++ want 0x1.ec4eb661a73d2p-2. + +- asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). ++ For |x| in [0.5, 1.0], use same approximation with a change of variable: ++ asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). + +- The largest observed error in this region is 2.69 ulps, +- _ZGVsMxv_asin(0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1 +- want 0x1.110d7e85fdd53p-1. */ ++ The largest observed error in this region is 2.66 ulp: ++ _ZGVsMxv_asin (0x1.04024f6e2a2fbp-1) got 0x1.10b9586f087a8p-1 ++ want 0x1.10b9586f087abp-1. */ + svfloat64_t SV_NAME_D1 (asin) (svfloat64_t x, const svbool_t pg) + { + const struct data *d = ptr_barrier (&data); ++ svbool_t ptrue = svptrue_b64 (); + + svuint64_t sign = svand_x (pg, svreinterpret_u64 (x), 0x8000000000000000); + svfloat64_t ax = svabs_x (pg, x); +@@ -70,17 +68,37 @@ svfloat64_t SV_NAME_D1 (asin) (svfloat64_t x, const svbool_t pg) + svfloat64_t z = svsqrt_m (ax, a_ge_half, z2); + + /* Use a single polynomial approximation P for both intervals. */ ++ svfloat64_t z3 = svmul_x (pg, z2, z); + svfloat64_t z4 = svmul_x (pg, z2, z2); + svfloat64_t z8 = svmul_x (pg, z4, z4); +- svfloat64_t z16 = svmul_x (pg, z8, z8); +- svfloat64_t p = sv_estrin_11_f64_x (pg, z2, z4, z8, z16, d->poly); ++ ++ svfloat64_t c13 = svld1rq (ptrue, &d->c1); ++ svfloat64_t c57 = svld1rq (ptrue, &d->c5); ++ svfloat64_t c911 = svld1rq (ptrue, &d->c9); ++ ++ /* Order-11 Estrin scheme. 
*/ ++ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), z2, c13, 0); ++ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), z2, c13, 1); ++ svfloat64_t p03 = svmla_x (pg, p01, z4, p23); ++ ++ svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), z2, c57, 0); ++ svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), z2, c57, 1); ++ svfloat64_t p47 = svmla_x (pg, p45, z4, p67); ++ ++ svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), z2, c911, 0); ++ svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), z2, c911, 1); ++ svfloat64_t p811 = svmla_x (pg, p89, z4, p1011); ++ ++ svfloat64_t p411 = svmla_x (pg, p47, z8, p811); ++ svfloat64_t p = svmla_x (pg, p03, z8, p411); ++ + /* Finalize polynomial: z + z * z2 * P(z2). */ +- p = svmla_x (pg, z, svmul_x (pg, z, z2), p); ++ p = svmla_x (pg, z, z3, p); + +- /* asin(|x|) = Q(|x|) , for |x| < 0.5 +- = pi/2 - 2 Q(|x|), for |x| >= 0.5. */ +- svfloat64_t y = svmad_m (a_ge_half, p, sv_f64 (-2.0), d->pi_over_2f); ++ /* asin(|x|) = Q(|x|), for |x| < 0.5 ++ = pi/2 - 2 Q(|x|), for |x| >= 0.5. */ ++ svfloat64_t y = svmad_m (a_ge_half, p, sv_f64 (-2.0), d->pi_over_2); + +- /* Copy sign. */ ++ /* Reinsert the sign from the argument. */ + return svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign)); + } +diff --git a/sysdeps/aarch64/fpu/asinf_advsimd.c b/sysdeps/aarch64/fpu/asinf_advsimd.c +index 52c7c0ec6e..013936c2c0 100644 +--- a/sysdeps/aarch64/fpu/asinf_advsimd.c ++++ b/sysdeps/aarch64/fpu/asinf_advsimd.c +@@ -18,22 +18,21 @@ + . */ + + #include "v_math.h" +-#include "poly_advsimd_f32.h" + + static const struct data + { +- float32x4_t poly[5]; ++ float32x4_t c0, c2, c4; ++ float c1, c3; + float32x4_t pi_over_2f; + } data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on + [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */ +- .poly = { V4 (0x1.55555ep-3), V4 (0x1.33261ap-4), V4 (0x1.70d7dcp-5), +- V4 (0x1.b059dp-6), V4 (0x1.3af7d8p-5) }, +- .pi_over_2f = V4 (0x1.921fb6p+0f), ++ .c0 = V4 (0x1.55555ep-3f), .c1 = 0x1.33261ap-4f, ++ .c2 = V4 (0x1.70d7dcp-5f), .c3 = 0x1.b059dp-6f, ++ .c4 = V4 (0x1.3af7d8p-5f), .pi_over_2f = V4 (0x1.921fb6p+0f), + }; + + #define AbsMask 0x7fffffff +-#define Half 0x3f000000 + #define One 0x3f800000 + #define Small 0x39800000 /* 2^-12. */ + +@@ -47,11 +46,8 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special) + + /* Single-precision implementation of vector asin(x). + +- For |x| < Small, approximate asin(x) by x. Small = 2^-12 for correct +- rounding. If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the +- following approximation. + +- For |x| in [Small, 0.5], use order 4 polynomial P such that the final ++ For |x| <0.5, use order 4 polynomial P such that the final + approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2). + + The largest observed error in this region is 0.83 ulps, +@@ -80,24 +76,31 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (asin) (float32x4_t x) + #endif + + float32x4_t ax = vreinterpretq_f32_u32 (ia); +- uint32x4_t a_lt_half = vcltq_u32 (ia, v_u32 (Half)); ++ uint32x4_t a_lt_half = vcaltq_f32 (x, v_f32 (0.5f)); + + /* Evaluate polynomial Q(x) = y + y * z * P(z) with + z = x ^ 2 and y = |x| , if |x| < 0.5 + z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */ + float32x4_t z2 = vbslq_f32 (a_lt_half, vmulq_f32 (x, x), +- vfmsq_n_f32 (v_f32 (0.5), ax, 0.5)); ++ vfmsq_n_f32 (v_f32 (0.5f), ax, 0.5f)); + float32x4_t z = vbslq_f32 (a_lt_half, ax, vsqrtq_f32 (z2)); + + /* Use a single polynomial approximation P for both intervals. 
*/ +- float32x4_t p = v_horner_4_f32 (z2, d->poly); ++ ++ /* PW Horner 3 evaluation scheme. */ ++ float32x4_t z4 = vmulq_f32 (z2, z2); ++ float32x4_t c13 = vld1q_f32 (&d->c1); ++ float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, c13, 0); ++ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, c13, 1); ++ float32x4_t p = vfmaq_f32 (p23, d->c4, z4); ++ p = vfmaq_f32 (p01, p, z4); + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = vfmaq_f32 (z, vmulq_f32 (z, z2), p); + + /* asin(|x|) = Q(|x|) , for |x| < 0.5 + = pi/2 - 2 Q(|x|), for |x| >= 0.5. */ + float32x4_t y +- = vbslq_f32 (a_lt_half, p, vfmsq_n_f32 (d->pi_over_2f, p, 2.0)); ++ = vbslq_f32 (a_lt_half, p, vfmsq_n_f32 (d->pi_over_2f, p, 2.0f)); + + /* Copy sign. */ + return vbslq_f32 (v_u32 (AbsMask), y, x); diff --git a/sysdeps/aarch64/fpu/asinh_sve.c b/sysdeps/aarch64/fpu/asinh_sve.c index 0889f79dbb..ff6b71390c 100644 --- a/sysdeps/aarch64/fpu/asinh_sve.c @@ -2607,69 +3661,1326 @@ + svfloat64_t y = svsel (ge1, option_1, option_2); return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)); } +diff --git a/sysdeps/aarch64/fpu/atan2_advsimd.c b/sysdeps/aarch64/fpu/atan2_advsimd.c +index 00b4a4f083..a31d52f3ac 100644 +--- a/sysdeps/aarch64/fpu/atan2_advsimd.c ++++ b/sysdeps/aarch64/fpu/atan2_advsimd.c +@@ -19,40 +19,38 @@ + + #include "math_config.h" + #include "v_math.h" +-#include "poly_advsimd_f64.h" + + static const struct data + { ++ double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; + float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18; + float64x2_t pi_over_2; +- double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; +- uint64x2_t zeroinfnan, minustwo; ++ uint64x2_t zeroinfnan; + } data = { +- /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on +- [2**-1022, 1.0]. */ +- .c0 = V2 (-0x1.5555555555555p-2), +- .c1 = 0x1.99999999996c1p-3, +- .c2 = V2 (-0x1.2492492478f88p-3), +- .c3 = 0x1.c71c71bc3951cp-4, +- .c4 = V2 (-0x1.745d160a7e368p-4), +- .c5 = 0x1.3b139b6a88ba1p-4, +- .c6 = V2 (-0x1.11100ee084227p-4), +- .c7 = 0x1.e1d0f9696f63bp-5, +- .c8 = V2 (-0x1.aebfe7b418581p-5), +- .c9 = 0x1.842dbe9b0d916p-5, +- .c10 = V2 (-0x1.5d30140ae5e99p-5), +- .c11 = 0x1.338e31eb2fbbcp-5, +- .c12 = V2 (-0x1.00e6eece7de8p-5), +- .c13 = 0x1.860897b29e5efp-6, +- .c14 = V2 (-0x1.0051381722a59p-6), +- .c15 = 0x1.14e9dc19a4a4ep-7, +- .c16 = V2 (-0x1.d0062b42fe3bfp-9), +- .c17 = 0x1.17739e210171ap-10, +- .c18 = V2 (-0x1.ab24da7be7402p-13), +- .c19 = 0x1.358851160a528p-16, ++ /* Coefficients of polynomial P such that ++ atan(x)~x+x*P(x^2) on [2^-1022, 1.0]. */ ++ .c0 = V2 (-0x1.555555555552ap-2), ++ .c1 = 0x1.9999999995aebp-3, ++ .c2 = V2 (-0x1.24924923923f6p-3), ++ .c3 = 0x1.c71c7184288a2p-4, ++ .c4 = V2 (-0x1.745d11fb3d32bp-4), ++ .c5 = 0x1.3b136a18051b9p-4, ++ .c6 = V2 (-0x1.110e6d985f496p-4), ++ .c7 = 0x1.e1bcf7f08801dp-5, ++ .c8 = V2 (-0x1.ae644e28058c3p-5), ++ .c9 = 0x1.82eeb1fed85c6p-5, ++ .c10 = V2 (-0x1.59d7f901566cbp-5), ++ .c11 = 0x1.2c982855ab069p-5, ++ .c12 = V2 (-0x1.eb49592998177p-6), ++ .c13 = 0x1.69d8b396e3d38p-6, ++ .c14 = V2 (-0x1.ca980345c4204p-7), ++ .c15 = 0x1.dc050eafde0b3p-8, ++ .c16 = V2 (-0x1.7ea70755b8eccp-9), ++ .c17 = 0x1.ba3da3de903e8p-11, ++ .c18 = V2 (-0x1.44a4b059b6f67p-13), ++ .c19 = 0x1.c4a45029e5a91p-17, + .pi_over_2 = V2 (0x1.921fb54442d18p+0), + .zeroinfnan = V2 (2 * 0x7ff0000000000000ul - 1), +- .minustwo = V2 (0xc000000000000000), + }; + + #define SignMask v_u64 (0x8000000000000000) +@@ -77,10 +75,9 @@ zeroinfnan (uint64x2_t i, const struct data *d) + } + + /* Fast implementation of vector atan2. 
+- Maximum observed error is 2.8 ulps: +- _ZGVnN2vv_atan2 (0x1.9651a429a859ap+5, 0x1.953075f4ee26p+5) +- got 0x1.92d628ab678ccp-1 +- want 0x1.92d628ab678cfp-1. */ ++ Maximum observed error is 1.97 ulps: ++ _ZGVnN2vv_atan2 (0x1.42337dba73768p+5, 0x1.422d748cd3e29p+5) ++ got 0x1.9224810264efcp-1 want 0x1.9224810264efep-1. */ + float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) + { + const struct data *d = ptr_barrier (&data); +@@ -101,26 +98,29 @@ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) + uint64x2_t pred_xlt0 = vcltzq_f64 (x); + uint64x2_t pred_aygtax = vcagtq_f64 (y, x); + +- /* Set up z for call to atan. */ +- float64x2_t n = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay); +- float64x2_t q = vbslq_f64 (pred_aygtax, ay, ax); +- float64x2_t z = vdivq_f64 (n, q); +- +- /* Work out the correct shift. */ +- float64x2_t shift +- = vreinterpretq_f64_u64 (vandq_u64 (pred_xlt0, d->minustwo)); +- shift = vbslq_f64 (pred_aygtax, vaddq_f64 (shift, v_f64 (1.0)), shift); +- shift = vmulq_f64 (shift, d->pi_over_2); +- +- /* Calculate the polynomial approximation. +- Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of +- full scheme to avoid underflow in x^16. +- The order 19 polynomial P approximates +- (atan(sqrt(x))-sqrt(x))/x^(3/2). */ ++ /* Set up z for evaluation of atan. */ ++ float64x2_t num = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay); ++ float64x2_t den = vbslq_f64 (pred_aygtax, ay, ax); ++ float64x2_t z = vdivq_f64 (num, den); ++ ++ /* Work out the correct shift for atan2: ++ Multiplication by pi is done later. ++ -pi when x < 0 and ax < ay ++ -pi/2 when x < 0 and ax > ay ++ 0 when x >= 0 and ax < ay ++ pi/2 when x >= 0 and ax > ay. */ ++ float64x2_t shift = vreinterpretq_f64_u64 ( ++ vandq_u64 (pred_xlt0, vreinterpretq_u64_f64 (v_f64 (-2.0)))); ++ float64x2_t shift2 = vreinterpretq_f64_u64 ( ++ vandq_u64 (pred_aygtax, vreinterpretq_u64_f64 (v_f64 (1.0)))); ++ shift = vaddq_f64 (shift, shift2); ++ ++ /* Calculate the polynomial approximation. */ + float64x2_t z2 = vmulq_f64 (z, z); +- float64x2_t x2 = vmulq_f64 (z2, z2); +- float64x2_t x4 = vmulq_f64 (x2, x2); +- float64x2_t x8 = vmulq_f64 (x4, x4); ++ float64x2_t z3 = vmulq_f64 (z2, z); ++ float64x2_t z4 = vmulq_f64 (z2, z2); ++ float64x2_t z8 = vmulq_f64 (z4, z4); ++ float64x2_t z16 = vmulq_f64 (z8, z8); + + float64x2_t c13 = vld1q_f64 (&d->c1); + float64x2_t c57 = vld1q_f64 (&d->c5); +@@ -128,45 +128,43 @@ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) + float64x2_t c1315 = vld1q_f64 (&d->c13); + float64x2_t c1719 = vld1q_f64 (&d->c17); + +- /* estrin_7. */ ++ /* Order-7 Estrin. */ + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); +- float64x2_t p03 = vfmaq_f64 (p01, x2, p23); ++ float64x2_t p03 = vfmaq_f64 (p01, z4, p23); + + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); +- float64x2_t p47 = vfmaq_f64 (p45, x2, p67); ++ float64x2_t p47 = vfmaq_f64 (p45, z4, p67); + +- float64x2_t p07 = vfmaq_f64 (p03, x4, p47); ++ float64x2_t p07 = vfmaq_f64 (p03, z8, p47); + +- /* estrin_11. */ ++ /* Order-11 Estrin. 
*/ + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); +- float64x2_t p811 = vfmaq_f64 (p89, x2, p1011); ++ float64x2_t p811 = vfmaq_f64 (p89, z4, p1011); + + float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0); + float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1); +- float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415); ++ float64x2_t p1215 = vfmaq_f64 (p1213, z4, p1415); + + float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0); + float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1); +- float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819); ++ float64x2_t p1619 = vfmaq_f64 (p1617, z4, p1819); + +- float64x2_t p815 = vfmaq_f64 (p811, x4, p1215); +- float64x2_t p819 = vfmaq_f64 (p815, x8, p1619); ++ float64x2_t p815 = vfmaq_f64 (p811, z8, p1215); ++ float64x2_t p819 = vfmaq_f64 (p815, z16, p1619); + +- float64x2_t ret = vfmaq_f64 (p07, p819, x8); ++ float64x2_t poly = vfmaq_f64 (p07, p819, z16); + + /* Finalize. y = shift + z + z^3 * P(z^2). */ +- ret = vfmaq_f64 (z, ret, vmulq_f64 (z2, z)); +- ret = vaddq_f64 (ret, shift); ++ float64x2_t ret = vfmaq_f64 (z, shift, d->pi_over_2); ++ ret = vfmaq_f64 (ret, z3, poly); + + if (__glibc_unlikely (v_any_u64 (special_cases))) + return special_case (y, x, ret, sign_xy, special_cases); + + /* Account for the sign of x and y. */ +- ret = vreinterpretq_f64_u64 ( ++ return vreinterpretq_f64_u64 ( + veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy)); +- +- return ret; + } +diff --git a/sysdeps/aarch64/fpu/atan2_sve.c b/sysdeps/aarch64/fpu/atan2_sve.c +index 163f61308b..9e2dd249d4 100644 +--- a/sysdeps/aarch64/fpu/atan2_sve.c ++++ b/sysdeps/aarch64/fpu/atan2_sve.c +@@ -19,25 +19,25 @@ + + #include "math_config.h" + #include "sv_math.h" +-#include "poly_sve_f64.h" + + static const struct data + { +- float64_t poly[20]; +- float64_t pi_over_2; ++ float64_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18; ++ float64_t c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; + } data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-1022, 1.0]. */ +- .poly = { -0x1.5555555555555p-2, 0x1.99999999996c1p-3, -0x1.2492492478f88p-3, +- 0x1.c71c71bc3951cp-4, -0x1.745d160a7e368p-4, 0x1.3b139b6a88ba1p-4, +- -0x1.11100ee084227p-4, 0x1.e1d0f9696f63bp-5, -0x1.aebfe7b418581p-5, +- 0x1.842dbe9b0d916p-5, -0x1.5d30140ae5e99p-5, 0x1.338e31eb2fbbcp-5, +- -0x1.00e6eece7de8p-5, 0x1.860897b29e5efp-6, -0x1.0051381722a59p-6, +- 0x1.14e9dc19a4a4ep-7, -0x1.d0062b42fe3bfp-9, 0x1.17739e210171ap-10, +- -0x1.ab24da7be7402p-13, 0x1.358851160a528p-16, }, +- .pi_over_2 = 0x1.921fb54442d18p+0, ++ .c0 = -0x1.555555555552ap-2, .c1 = 0x1.9999999995aebp-3, ++ .c2 = -0x1.24924923923f6p-3, .c3 = 0x1.c71c7184288a2p-4, ++ .c4 = -0x1.745d11fb3d32bp-4, .c5 = 0x1.3b136a18051b9p-4, ++ .c6 = -0x1.110e6d985f496p-4, .c7 = 0x1.e1bcf7f08801dp-5, ++ .c8 = -0x1.ae644e28058c3p-5, .c9 = 0x1.82eeb1fed85c6p-5, ++ .c10 = -0x1.59d7f901566cbp-5, .c11 = 0x1.2c982855ab069p-5, ++ .c12 = -0x1.eb49592998177p-6, .c13 = 0x1.69d8b396e3d38p-6, ++ .c14 = -0x1.ca980345c4204p-7, .c15 = 0x1.dc050eafde0b3p-8, ++ .c16 = -0x1.7ea70755b8eccp-9, .c17 = 0x1.ba3da3de903e8p-11, ++ .c18 = -0x1.44a4b059b6f67p-13, .c19 = 0x1.c4a45029e5a91p-17, + }; +- + /* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */ + static svfloat64_t NOINLINE + special_case (svfloat64_t y, svfloat64_t x, svfloat64_t ret, +@@ -56,15 +56,17 @@ zeroinfnan (svuint64_t i, const svbool_t pg) + } + + /* Fast implementation of SVE atan2. 
Errors are greatest when y and +- x are reasonably close together. The greatest observed error is 2.28 ULP: +- _ZGVsMxvv_atan2 (-0x1.5915b1498e82fp+732, 0x1.54d11ef838826p+732) +- got -0x1.954f42f1fa841p-1 want -0x1.954f42f1fa843p-1. */ +-svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg) ++ x are reasonably close together. The greatest observed error is 1.94 ULP: ++ _ZGVsMxvv_atan2 (0x1.8a4bf7167228ap+5, 0x1.84971226bb57bp+5) ++ got 0x1.95db19dfef9ccp-1 want 0x1.95db19dfef9cep-1. */ ++svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, ++ const svbool_t pg) + { +- const struct data *data_ptr = ptr_barrier (&data); ++ const struct data *d = ptr_barrier (&data); + + svuint64_t ix = svreinterpret_u64 (x); + svuint64_t iy = svreinterpret_u64 (y); ++ svbool_t ptrue = svptrue_b64 (); + + svbool_t cmp_x = zeroinfnan (ix, pg); + svbool_t cmp_y = zeroinfnan (iy, pg); +@@ -81,32 +83,67 @@ svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg) + + svbool_t pred_aygtax = svcmpgt (pg, ay, ax); + +- /* Set up z for call to atan. */ +- svfloat64_t n = svsel (pred_aygtax, svneg_x (pg, ax), ay); +- svfloat64_t d = svsel (pred_aygtax, ay, ax); +- svfloat64_t z = svdiv_x (pg, n, d); +- +- /* Work out the correct shift. */ ++ /* Set up z for evaluation of atan. */ ++ svfloat64_t num = svsel (pred_aygtax, svneg_x (pg, ax), ay); ++ svfloat64_t den = svsel (pred_aygtax, ay, ax); ++ svfloat64_t z = svdiv_x (pg, num, den); ++ ++ /* Work out the correct shift for atan2: ++ Multiplication by pi is done later. ++ -pi when x < 0 and ax < ay ++ -pi/2 when x < 0 and ax > ay ++ 0 when x >= 0 and ax < ay ++ pi/2 when x >= 0 and ax > ay. */ + svfloat64_t shift = svreinterpret_f64 (svlsr_x (pg, sign_x, 1)); ++ svfloat64_t shift_mul = svreinterpret_f64 ( ++ svorr_x (pg, sign_x, svreinterpret_u64 (sv_f64 (0x1.921fb54442d18p+0)))); + shift = svsel (pred_aygtax, sv_f64 (1.0), shift); +- shift = svreinterpret_f64 (svorr_x (pg, sign_x, svreinterpret_u64 (shift))); +- shift = svmul_x (pg, shift, data_ptr->pi_over_2); ++ shift = svmla_x (pg, z, shift, shift_mul); + + /* Use split Estrin scheme for P(z^2) with deg(P)=19. */ + svfloat64_t z2 = svmul_x (pg, z, z); +- svfloat64_t x2 = svmul_x (pg, z2, z2); +- svfloat64_t x4 = svmul_x (pg, x2, x2); +- svfloat64_t x8 = svmul_x (pg, x4, x4); ++ svfloat64_t z3 = svmul_x (pg, z2, z); ++ svfloat64_t z4 = svmul_x (pg, z2, z2); ++ svfloat64_t z8 = svmul_x (pg, z4, z4); ++ svfloat64_t z16 = svmul_x (pg, z8, z8); + +- svfloat64_t ret = svmla_x ( +- pg, sv_estrin_7_f64_x (pg, z2, x2, x4, data_ptr->poly), +- sv_estrin_11_f64_x (pg, z2, x2, x4, x8, data_ptr->poly + 8), x8); ++ /* Order-7 Estrin. */ ++ svfloat64_t c13 = svld1rq (ptrue, &d->c1); ++ svfloat64_t c57 = svld1rq (ptrue, &d->c5); + +- /* y = shift + z + z^3 * P(z^2). */ +- svfloat64_t z3 = svmul_x (pg, z2, z); +- ret = svmla_x (pg, z, z3, ret); ++ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), z2, c13, 0); ++ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), z2, c13, 1); ++ svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), z2, c57, 0); ++ svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), z2, c57, 1); ++ ++ svfloat64_t p03 = svmla_x (pg, p01, z4, p23); ++ svfloat64_t p47 = svmla_x (pg, p45, z4, p67); ++ svfloat64_t p07 = svmla_x (pg, p03, z8, p47); ++ ++ /* Order-11 Estrin. 
*/ ++ svfloat64_t c911 = svld1rq (ptrue, &d->c9); ++ svfloat64_t c1315 = svld1rq (ptrue, &d->c13); ++ svfloat64_t c1719 = svld1rq (ptrue, &d->c17); + +- ret = svadd_m (pg, ret, shift); ++ svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), z2, c911, 0); ++ svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), z2, c911, 1); ++ svfloat64_t p811 = svmla_x (pg, p89, z4, p1011); ++ ++ svfloat64_t p1213 = svmla_lane (sv_f64 (d->c12), z2, c1315, 0); ++ svfloat64_t p1415 = svmla_lane (sv_f64 (d->c14), z2, c1315, 1); ++ svfloat64_t p1215 = svmla_x (pg, p1213, z4, p1415); ++ ++ svfloat64_t p1617 = svmla_lane (sv_f64 (d->c16), z2, c1719, 0); ++ svfloat64_t p1819 = svmla_lane (sv_f64 (d->c18), z2, c1719, 1); ++ svfloat64_t p1619 = svmla_x (pg, p1617, z4, p1819); ++ ++ svfloat64_t p815 = svmla_x (pg, p811, z8, p1215); ++ svfloat64_t p819 = svmla_x (pg, p815, z16, p1619); ++ ++ svfloat64_t poly = svmla_x (pg, p07, z16, p819); ++ ++ /* y = shift + z + z^3 * P(z^2). */ ++ svfloat64_t ret = svmla_x (pg, shift, z3, poly); + + /* Account for the sign of x and y. */ + if (__glibc_unlikely (svptest_any (pg, cmp_xy))) +diff --git a/sysdeps/aarch64/fpu/atan2f_advsimd.c b/sysdeps/aarch64/fpu/atan2f_advsimd.c +index e65406f492..75d873897a 100644 +--- a/sysdeps/aarch64/fpu/atan2f_advsimd.c ++++ b/sysdeps/aarch64/fpu/atan2f_advsimd.c +@@ -18,22 +18,22 @@ + . */ + + #include "v_math.h" +-#include "poly_advsimd_f32.h" + + static const struct data + { +- float32x4_t c0, pi_over_2, c4, c6, c2; ++ float32x4_t c0, c4, c6, c2; + float c1, c3, c5, c7; + uint32x4_t comp_const; ++ float32x4_t pi; + } data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-128, 1.0]. + Generated using fpminimax between FLT_MIN and 1. */ +- .c0 = V4 (-0x1.55555p-2f), .c1 = 0x1.99935ep-3f, +- .c2 = V4 (-0x1.24051ep-3f), .c3 = 0x1.bd7368p-4f, +- .c4 = V4 (-0x1.491f0ep-4f), .c5 = 0x1.93a2c0p-5f, +- .c6 = V4 (-0x1.4c3c60p-6f), .c7 = 0x1.01fd88p-8f, +- .pi_over_2 = V4 (0x1.921fb6p+0f), .comp_const = V4 (2 * 0x7f800000lu - 1), ++ .c0 = V4 (-0x1.5554dcp-2), .c1 = 0x1.9978ecp-3, ++ .c2 = V4 (-0x1.230a94p-3), .c3 = 0x1.b4debp-4, ++ .c4 = V4 (-0x1.3550dap-4), .c5 = 0x1.61eebp-5, ++ .c6 = V4 (-0x1.0c17d4p-6), .c7 = 0x1.7ea694p-9, ++ .pi = V4 (0x1.921fb6p+1f), .comp_const = V4 (2 * 0x7f800000lu - 1), + }; + + #define SignMask v_u32 (0x80000000) +@@ -54,13 +54,13 @@ static inline uint32x4_t + zeroinfnan (uint32x4_t i, const struct data *d) + { + /* 2 * i - 1 >= 2 * 0x7f800000lu - 1. */ +- return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)), d->comp_const); ++ return vcgeq_u32 (vsubq_u32 (vshlq_n_u32 (i, 1), v_u32 (1)), d->comp_const); + } + + /* Fast implementation of vector atan2f. Maximum observed error is +- 2.95 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]: +- _ZGVnN4vv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1 +- want 0x1.967f00p-1. */ ++ 2.13 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]: ++ _ZGVnN4vv_atan2f (0x1.14a9d4p-87, 0x1.0eb886p-87) got 0x1.97aea2p-1 ++ want 0x1.97ae9ep-1. */ + float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) + { + const struct data *d = ptr_barrier (&data); +@@ -81,28 +81,31 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) + uint32x4_t pred_xlt0 = vcltzq_f32 (x); + uint32x4_t pred_aygtax = vcgtq_f32 (ay, ax); + +- /* Set up z for call to atanf. 
*/ +- float32x4_t n = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay); +- float32x4_t q = vbslq_f32 (pred_aygtax, ay, ax); +- float32x4_t z = vdivq_f32 (n, q); +- +- /* Work out the correct shift. */ ++ /* Set up z for evaluation of atanf. */ ++ float32x4_t num = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay); ++ float32x4_t den = vbslq_f32 (pred_aygtax, ay, ax); ++ float32x4_t z = vdivq_f32 (num, den); ++ ++ /* Work out the correct shift for atan2: ++ Multiplication by pi is done later. ++ -pi when x < 0 and ax < ay ++ -pi/2 when x < 0 and ax > ay ++ 0 when x >= 0 and ax < ay ++ pi/2 when x >= 0 and ax > ay. */ + float32x4_t shift = vreinterpretq_f32_u32 ( +- vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-2.0f)))); +- shift = vbslq_f32 (pred_aygtax, vaddq_f32 (shift, v_f32 (1.0f)), shift); +- shift = vmulq_f32 (shift, d->pi_over_2); +- +- /* Calculate the polynomial approximation. +- Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However, +- a standard implementation using z8 creates spurious underflow +- in the very last fma (when z^8 is small enough). +- Therefore, we split the last fma into a mul and an fma. +- Horner and single-level Estrin have higher errors that exceed +- threshold. */ ++ vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-1.0f)))); ++ float32x4_t shift2 = vreinterpretq_f32_u32 ( ++ vandq_u32 (pred_aygtax, vreinterpretq_u32_f32 (v_f32 (0.5f)))); ++ shift = vaddq_f32 (shift, shift2); ++ ++ /* Calculate the polynomial approximation. */ + float32x4_t z2 = vmulq_f32 (z, z); ++ float32x4_t z3 = vmulq_f32 (z2, z); + float32x4_t z4 = vmulq_f32 (z2, z2); ++ float32x4_t z8 = vmulq_f32 (z4, z4); + + float32x4_t c1357 = vld1q_f32 (&d->c1); ++ + float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, c1357, 0); + float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, c1357, 1); + float32x4_t p45 = vfmaq_laneq_f32 (d->c4, z2, c1357, 2); +@@ -110,10 +113,11 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) + float32x4_t p03 = vfmaq_f32 (p01, z4, p23); + float32x4_t p47 = vfmaq_f32 (p45, z4, p67); + +- float32x4_t ret = vfmaq_f32 (p03, z4, vmulq_f32 (z4, p47)); ++ float32x4_t poly = vfmaq_f32 (p03, z8, p47); + + /* y = shift + z * P(z^2). */ +- ret = vaddq_f32 (vfmaq_f32 (z, ret, vmulq_f32 (z2, z)), shift); ++ float32x4_t ret = vfmaq_f32 (z, shift, d->pi); ++ ret = vfmaq_f32 (ret, z3, poly); + + if (__glibc_unlikely (v_any_u32 (special_cases))) + { +diff --git a/sysdeps/aarch64/fpu/atan2f_sve.c b/sysdeps/aarch64/fpu/atan2f_sve.c +index 5f26e2a365..4d9341952d 100644 +--- a/sysdeps/aarch64/fpu/atan2f_sve.c ++++ b/sysdeps/aarch64/fpu/atan2f_sve.c +@@ -18,18 +18,18 @@ + . */ + + #include "sv_math.h" +-#include "poly_sve_f32.h" + + static const struct data + { +- float32_t poly[8]; ++ float32_t c0, c2, c4, c6; ++ float32_t c1, c3, c5, c7; + float32_t pi_over_2; + } data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-128, 1.0]. */ +- .poly = { -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f, +- -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f, 0x1.01fd88p-8f }, +- .pi_over_2 = 0x1.921fb6p+0f, ++ .c0 = -0x1.5554dcp-2, .c1 = 0x1.9978ecp-3, .c2 = -0x1.230a94p-3, ++ .c3 = 0x1.b4debp-4, .c4 = -0x1.3550dap-4, .c5 = 0x1.61eebp-5, ++ .c6 = -0x1.0c17d4p-6, .c7 = 0x1.7ea694p-9, .pi_over_2 = 0x1.921fb6p+0f, + }; + + /* Special cases i.e. 0, infinity, nan (fall back to scalar calls). 
*/ +@@ -51,12 +51,14 @@ zeroinfnan (svuint32_t i, const svbool_t pg) + + /* Fast implementation of SVE atan2f based on atan(x) ~ shift + z + z^3 * + P(z^2) with reduction to [0,1] using z=1/x and shift = pi/2. Maximum +- observed error is 2.95 ULP: +- _ZGVsMxvv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1 +- want 0x1.967f00p-1. */ +-svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg) ++ observed error is 2.21 ULP: ++ _ZGVnN4vv_atan2f (0x1.a04aa8p+6, 0x1.9a274p+6) got 0x1.95ed3ap-1 ++ want 0x1.95ed36p-1. */ ++svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, ++ const svbool_t pg) + { +- const struct data *data_ptr = ptr_barrier (&data); ++ const struct data *d = ptr_barrier (&data); ++ svbool_t ptrue = svptrue_b32 (); + + svuint32_t ix = svreinterpret_u32 (x); + svuint32_t iy = svreinterpret_u32 (y); +@@ -76,29 +78,42 @@ svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg) + + svbool_t pred_aygtax = svcmpgt (pg, ay, ax); + +- /* Set up z for call to atan. */ +- svfloat32_t n = svsel (pred_aygtax, svneg_x (pg, ax), ay); +- svfloat32_t d = svsel (pred_aygtax, ay, ax); +- svfloat32_t z = svdiv_x (pg, n, d); +- +- /* Work out the correct shift. */ ++ /* Set up z for evaluation of atanf. */ ++ svfloat32_t num = svsel (pred_aygtax, svneg_x (pg, ax), ay); ++ svfloat32_t den = svsel (pred_aygtax, ay, ax); ++ svfloat32_t z = svdiv_x (ptrue, num, den); ++ ++ /* Work out the correct shift for atan2: ++ Multiplication by pi is done later. ++ -pi when x < 0 and ax < ay ++ -pi/2 when x < 0 and ax > ay ++ 0 when x >= 0 and ax < ay ++ pi/2 when x >= 0 and ax > ay. */ + svfloat32_t shift = svreinterpret_f32 (svlsr_x (pg, sign_x, 1)); + shift = svsel (pred_aygtax, sv_f32 (1.0), shift); + shift = svreinterpret_f32 (svorr_x (pg, sign_x, svreinterpret_u32 (shift))); +- shift = svmul_x (pg, shift, sv_f32 (data_ptr->pi_over_2)); + + /* Use pure Estrin scheme for P(z^2) with deg(P)=7. */ +- svfloat32_t z2 = svmul_x (pg, z, z); ++ svfloat32_t z2 = svmul_x (ptrue, z, z); ++ svfloat32_t z3 = svmul_x (pg, z2, z); + svfloat32_t z4 = svmul_x (pg, z2, z2); + svfloat32_t z8 = svmul_x (pg, z4, z4); + +- svfloat32_t ret = sv_estrin_7_f32_x (pg, z2, z4, z8, data_ptr->poly); ++ svfloat32_t odd_coeffs = svld1rq (ptrue, &d->c1); + +- /* ret = shift + z + z^3 * P(z^2). */ +- svfloat32_t z3 = svmul_x (pg, z2, z); +- ret = svmla_x (pg, z, z3, ret); ++ svfloat32_t p01 = svmla_lane (sv_f32 (d->c0), z2, odd_coeffs, 0); ++ svfloat32_t p23 = svmla_lane (sv_f32 (d->c2), z2, odd_coeffs, 1); ++ svfloat32_t p45 = svmla_lane (sv_f32 (d->c4), z2, odd_coeffs, 2); ++ svfloat32_t p67 = svmla_lane (sv_f32 (d->c6), z2, odd_coeffs, 3); + +- ret = svadd_m (pg, ret, shift); ++ svfloat32_t p03 = svmla_x (pg, p01, z4, p23); ++ svfloat32_t p47 = svmla_x (pg, p45, z4, p67); ++ ++ svfloat32_t poly = svmla_x (pg, p03, z8, p47); ++ ++ /* ret = shift + z + z^3 * P(z^2). */ ++ svfloat32_t ret = svmla_x (pg, z, shift, sv_f32 (d->pi_over_2)); ++ ret = svmla_x (pg, ret, z3, poly); + + /* Account for the sign of x and y. */ + +diff --git a/sysdeps/aarch64/fpu/atan_advsimd.c b/sysdeps/aarch64/fpu/atan_advsimd.c +index f024fd1d74..da0d3715df 100644 +--- a/sysdeps/aarch64/fpu/atan_advsimd.c ++++ b/sysdeps/aarch64/fpu/atan_advsimd.c +@@ -18,7 +18,6 @@ + . */ + + #include "v_math.h" +-#include "poly_advsimd_f64.h" + + static const struct data + { +@@ -28,16 +27,16 @@ static const struct data + } data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-1022, 1.0]. 
*/ +- .c0 = V2 (-0x1.5555555555555p-2), .c1 = 0x1.99999999996c1p-3, +- .c2 = V2 (-0x1.2492492478f88p-3), .c3 = 0x1.c71c71bc3951cp-4, +- .c4 = V2 (-0x1.745d160a7e368p-4), .c5 = 0x1.3b139b6a88ba1p-4, +- .c6 = V2 (-0x1.11100ee084227p-4), .c7 = 0x1.e1d0f9696f63bp-5, +- .c8 = V2 (-0x1.aebfe7b418581p-5), .c9 = 0x1.842dbe9b0d916p-5, +- .c10 = V2 (-0x1.5d30140ae5e99p-5), .c11 = 0x1.338e31eb2fbbcp-5, +- .c12 = V2 (-0x1.00e6eece7de8p-5), .c13 = 0x1.860897b29e5efp-6, +- .c14 = V2 (-0x1.0051381722a59p-6), .c15 = 0x1.14e9dc19a4a4ep-7, +- .c16 = V2 (-0x1.d0062b42fe3bfp-9), .c17 = 0x1.17739e210171ap-10, +- .c18 = V2 (-0x1.ab24da7be7402p-13), .c19 = 0x1.358851160a528p-16, ++ .c0 = V2 (-0x1.555555555552ap-2), .c1 = 0x1.9999999995aebp-3, ++ .c2 = V2 (-0x1.24924923923f6p-3), .c3 = 0x1.c71c7184288a2p-4, ++ .c4 = V2 (-0x1.745d11fb3d32bp-4), .c5 = 0x1.3b136a18051b9p-4, ++ .c6 = V2 (-0x1.110e6d985f496p-4), .c7 = 0x1.e1bcf7f08801dp-5, ++ .c8 = V2 (-0x1.ae644e28058c3p-5), .c9 = 0x1.82eeb1fed85c6p-5, ++ .c10 = V2 (-0x1.59d7f901566cbp-5), .c11 = 0x1.2c982855ab069p-5, ++ .c12 = V2 (-0x1.eb49592998177p-6), .c13 = 0x1.69d8b396e3d38p-6, ++ .c14 = V2 (-0x1.ca980345c4204p-7), .c15 = 0x1.dc050eafde0b3p-8, ++ .c16 = V2 (-0x1.7ea70755b8eccp-9), .c17 = 0x1.ba3da3de903e8p-11, ++ .c18 = V2 (-0x1.44a4b059b6f67p-13), .c19 = 0x1.c4a45029e5a91p-17, + .pi_over_2 = V2 (0x1.921fb54442d18p+0), + }; + +@@ -47,9 +46,9 @@ static const struct data + + /* Fast implementation of vector atan. + Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using +- z=1/x and shift = pi/2. Maximum observed error is 2.27 ulps: +- _ZGVnN2v_atan (0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1 +- want 0x1.9225645bdd7c3p-1. */ ++ z=1/x and shift = pi/2. Maximum observed error is 2.45 ulps: ++ _ZGVnN2v_atan (0x1.0008d737eb3e6p+0) got 0x1.92288c551a4c1p-1 ++ want 0x1.92288c551a4c3p-1. */ + float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x) + { + const struct data *d = ptr_barrier (&data); +@@ -78,59 +77,53 @@ float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x) + y := arctan(x) for x < 1 + y := pi/2 + arctan(-1/x) for x > 1 + Hence, use z=-1/a if x>=1, otherwise z=a. */ +- uint64x2_t red = vcagtq_f64 (x, v_f64 (1.0)); ++ uint64x2_t red = vcagtq_f64 (x, v_f64 (-1.0)); + /* Avoid dependency in abs(x) in division (and comparison). */ +- float64x2_t z = vbslq_f64 (red, vdivq_f64 (v_f64 (1.0), x), x); ++ float64x2_t z = vbslq_f64 (red, vdivq_f64 (v_f64 (-1.0), x), x); ++ + float64x2_t shift = vreinterpretq_f64_u64 ( + vandq_u64 (red, vreinterpretq_u64_f64 (d->pi_over_2))); +- /* Use absolute value only when needed (odd powers of z). */ +- float64x2_t az = vbslq_f64 ( +- SignMask, vreinterpretq_f64_u64 (vandq_u64 (SignMask, red)), z); +- +- /* Calculate the polynomial approximation. +- Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of +- full scheme to avoid underflow in x^16. +- The order 19 polynomial P approximates +- (atan(sqrt(x))-sqrt(x))/x^(3/2). */ ++ ++ /* Reinsert sign bit from argument into the shift value. */ ++ shift = vreinterpretq_f64_u64 ( ++ veorq_u64 (vreinterpretq_u64_f64 (shift), sign)); ++ ++ /* Calculate polynomial approximation P(z^2) with deg(P)=19. */ + float64x2_t z2 = vmulq_f64 (z, z); +- float64x2_t x2 = vmulq_f64 (z2, z2); +- float64x2_t x4 = vmulq_f64 (x2, x2); +- float64x2_t x8 = vmulq_f64 (x4, x4); ++ float64x2_t z4 = vmulq_f64 (z2, z2); ++ float64x2_t z8 = vmulq_f64 (z4, z4); ++ float64x2_t z16 = vmulq_f64 (z8, z8); + +- /* estrin_7. */ ++ /* Order-7 Estrin. 
*/ + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); +- float64x2_t p03 = vfmaq_f64 (p01, x2, p23); ++ float64x2_t p03 = vfmaq_f64 (p01, z4, p23); + + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); +- float64x2_t p47 = vfmaq_f64 (p45, x2, p67); ++ float64x2_t p47 = vfmaq_f64 (p45, z4, p67); + +- float64x2_t p07 = vfmaq_f64 (p03, x4, p47); ++ float64x2_t p07 = vfmaq_f64 (p03, z8, p47); + +- /* estrin_11. */ ++ /* Order-11 Estrin. */ + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); +- float64x2_t p811 = vfmaq_f64 (p89, x2, p1011); ++ float64x2_t p811 = vfmaq_f64 (p89, z4, p1011); + + float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0); + float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1); +- float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415); ++ float64x2_t p1215 = vfmaq_f64 (p1213, z4, p1415); + + float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0); + float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1); +- float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819); ++ float64x2_t p1619 = vfmaq_f64 (p1617, z4, p1819); + +- float64x2_t p815 = vfmaq_f64 (p811, x4, p1215); +- float64x2_t p819 = vfmaq_f64 (p815, x8, p1619); ++ float64x2_t p815 = vfmaq_f64 (p811, z8, p1215); ++ float64x2_t p819 = vfmaq_f64 (p815, z16, p1619); + +- float64x2_t y = vfmaq_f64 (p07, p819, x8); ++ float64x2_t y = vfmaq_f64 (p07, p819, z16); + + /* Finalize. y = shift + z + z^3 * P(z^2). */ +- y = vfmaq_f64 (az, y, vmulq_f64 (z2, az)); +- y = vaddq_f64 (y, shift); +- +- /* y = atan(x) if x>0, -atan(-x) otherwise. */ +- y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), sign)); +- return y; ++ y = vfmsq_f64 (v_f64 (-1.0), z2, y); ++ return vfmsq_f64 (shift, z, y); + } +diff --git a/sysdeps/aarch64/fpu/atan_sve.c b/sysdeps/aarch64/fpu/atan_sve.c +index 3880cedff4..a6b0489cf6 100644 +--- a/sysdeps/aarch64/fpu/atan_sve.c ++++ b/sysdeps/aarch64/fpu/atan_sve.c +@@ -18,23 +18,26 @@ + . */ + + #include "sv_math.h" +-#include "poly_sve_f64.h" + + static const struct data + { +- float64_t poly[20]; +- float64_t pi_over_2; ++ float64_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18; ++ float64_t c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; ++ float64_t shift_val, neg_one; + } data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-1022, 1.0]. 
*/ +- .poly = { -0x1.5555555555555p-2, 0x1.99999999996c1p-3, -0x1.2492492478f88p-3, +- 0x1.c71c71bc3951cp-4, -0x1.745d160a7e368p-4, 0x1.3b139b6a88ba1p-4, +- -0x1.11100ee084227p-4, 0x1.e1d0f9696f63bp-5, -0x1.aebfe7b418581p-5, +- 0x1.842dbe9b0d916p-5, -0x1.5d30140ae5e99p-5, 0x1.338e31eb2fbbcp-5, +- -0x1.00e6eece7de8p-5, 0x1.860897b29e5efp-6, -0x1.0051381722a59p-6, +- 0x1.14e9dc19a4a4ep-7, -0x1.d0062b42fe3bfp-9, 0x1.17739e210171ap-10, +- -0x1.ab24da7be7402p-13, 0x1.358851160a528p-16, }, +- .pi_over_2 = 0x1.921fb54442d18p+0, ++ .c0 = -0x1.555555555552ap-2, .c1 = 0x1.9999999995aebp-3, ++ .c2 = -0x1.24924923923f6p-3, .c3 = 0x1.c71c7184288a2p-4, ++ .c4 = -0x1.745d11fb3d32bp-4, .c5 = 0x1.3b136a18051b9p-4, ++ .c6 = -0x1.110e6d985f496p-4, .c7 = 0x1.e1bcf7f08801dp-5, ++ .c8 = -0x1.ae644e28058c3p-5, .c9 = 0x1.82eeb1fed85c6p-5, ++ .c10 = -0x1.59d7f901566cbp-5, .c11 = 0x1.2c982855ab069p-5, ++ .c12 = -0x1.eb49592998177p-6, .c13 = 0x1.69d8b396e3d38p-6, ++ .c14 = -0x1.ca980345c4204p-7, .c15 = 0x1.dc050eafde0b3p-8, ++ .c16 = -0x1.7ea70755b8eccp-9, .c17 = 0x1.ba3da3de903e8p-11, ++ .c18 = -0x1.44a4b059b6f67p-13, .c19 = 0x1.c4a45029e5a91p-17, ++ .shift_val = 0x1.490fdaa22168cp+1, .neg_one = -1, + }; + + /* Useful constants. */ +@@ -43,15 +46,14 @@ static const struct data + /* Fast implementation of SVE atan. + Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using + z=1/x and shift = pi/2. Largest errors are close to 1. The maximum observed +- error is 2.27 ulps: +- _ZGVsMxv_atan (0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1 +- want 0x1.9225645bdd7c3p-1. */ ++ error is 2.08 ulps: ++ _ZGVsMxv_atan (0x1.000a7c56975e8p+0) got 0x1.922a3163e15c2p-1 ++ want 0x1.922a3163e15c4p-1. */ + svfloat64_t SV_NAME_D1 (atan) (svfloat64_t x, const svbool_t pg) + { + const struct data *d = ptr_barrier (&data); + +- /* No need to trigger special case. Small cases, infs and nans +- are supported by our approximation technique. */ ++ svbool_t ptrue = svptrue_b64 (); + svuint64_t ix = svreinterpret_u64 (x); + svuint64_t sign = svand_x (pg, ix, SignMask); + +@@ -59,32 +61,60 @@ svfloat64_t SV_NAME_D1 (atan) (svfloat64_t x, const svbool_t pg) + y := arctan(x) for x < 1 + y := pi/2 + arctan(-1/x) for x > 1 + Hence, use z=-1/a if x>=1, otherwise z=a. */ +- svbool_t red = svacgt (pg, x, 1.0); +- /* Avoid dependency in abs(x) in division (and comparison). */ +- svfloat64_t z = svsel (red, svdivr_x (pg, x, 1.0), x); +- /* Use absolute value only when needed (odd powers of z). */ +- svfloat64_t az = svabs_x (pg, z); +- az = svneg_m (az, red, az); ++ svbool_t red = svacgt (pg, x, d->neg_one); ++ svfloat64_t z = svsel (red, svdiv_x (pg, sv_f64 (d->neg_one), x), x); ++ ++ /* Reuse of -1.0f to reduce constant loads, ++ We need a shift value of 1/2, which is created via -1 + (1 + 1/2). */ ++ svfloat64_t shift ++ = svadd_z (red, sv_f64 (d->neg_one), sv_f64 (d->shift_val)); ++ ++ /* Reinserts the sign bit of the argument to handle the case of x < -1. */ ++ shift = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (shift), sign)); + + /* Use split Estrin scheme for P(z^2) with deg(P)=19. 
*/ +- svfloat64_t z2 = svmul_x (pg, z, z); +- svfloat64_t x2 = svmul_x (pg, z2, z2); +- svfloat64_t x4 = svmul_x (pg, x2, x2); +- svfloat64_t x8 = svmul_x (pg, x4, x4); ++ svfloat64_t z2 = svmul_x (ptrue, z, z); ++ svfloat64_t z4 = svmul_x (ptrue, z2, z2); ++ svfloat64_t z8 = svmul_x (ptrue, z4, z4); ++ svfloat64_t z16 = svmul_x (ptrue, z8, z8); + +- svfloat64_t y +- = svmla_x (pg, sv_estrin_7_f64_x (pg, z2, x2, x4, d->poly), +- sv_estrin_11_f64_x (pg, z2, x2, x4, x8, d->poly + 8), x8); ++ /* Order-7 Estrin. */ ++ svfloat64_t c13 = svld1rq (ptrue, &d->c1); ++ svfloat64_t c57 = svld1rq (ptrue, &d->c5); + +- /* y = shift + z + z^3 * P(z^2). */ +- svfloat64_t z3 = svmul_x (pg, z2, az); +- y = svmla_x (pg, az, z3, y); ++ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), z2, c13, 0); ++ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), z2, c13, 1); ++ svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), z2, c57, 0); ++ svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), z2, c57, 1); ++ ++ svfloat64_t p03 = svmla_x (pg, p01, z4, p23); ++ svfloat64_t p47 = svmla_x (pg, p45, z4, p67); ++ svfloat64_t p07 = svmla_x (pg, p03, z8, p47); ++ ++ /* Order-11 Estrin. */ ++ svfloat64_t c911 = svld1rq (ptrue, &d->c9); ++ svfloat64_t c1315 = svld1rq (ptrue, &d->c13); ++ svfloat64_t c1719 = svld1rq (ptrue, &d->c17); + +- /* Apply shift as indicated by `red` predicate. */ +- y = svadd_m (red, y, d->pi_over_2); ++ svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), z2, c911, 0); ++ svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), z2, c911, 1); ++ svfloat64_t p811 = svmla_x (pg, p89, z4, p1011); + +- /* y = atan(x) if x>0, -atan(-x) otherwise. */ +- y = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)); ++ svfloat64_t p1213 = svmla_lane (sv_f64 (d->c12), z2, c1315, 0); ++ svfloat64_t p1415 = svmla_lane (sv_f64 (d->c14), z2, c1315, 1); ++ svfloat64_t p1215 = svmla_x (pg, p1213, z4, p1415); + +- return y; ++ svfloat64_t p1617 = svmla_lane (sv_f64 (d->c16), z2, c1719, 0); ++ svfloat64_t p1819 = svmla_lane (sv_f64 (d->c18), z2, c1719, 1); ++ svfloat64_t p1619 = svmla_x (pg, p1617, z4, p1819); ++ ++ svfloat64_t p815 = svmla_x (pg, p811, z8, p1215); ++ svfloat64_t p819 = svmla_x (pg, p815, z16, p1619); ++ ++ svfloat64_t y = svmla_x (pg, p07, z16, p819); ++ ++ /* y = shift + z + z^3 * P(z^2). */ ++ shift = svadd_m (red, z, shift); ++ y = svmul_x (pg, z2, y); ++ return svmla_x (pg, shift, z, y); + } +diff --git a/sysdeps/aarch64/fpu/atanf_advsimd.c b/sysdeps/aarch64/fpu/atanf_advsimd.c +index 472865ed74..817a47ef3e 100644 +--- a/sysdeps/aarch64/fpu/atanf_advsimd.c ++++ b/sysdeps/aarch64/fpu/atanf_advsimd.c +@@ -22,26 +22,35 @@ + + static const struct data + { ++ uint32x4_t sign_mask, pi_over_2; ++ float32x4_t neg_one; ++#if WANT_SIMD_EXCEPT + float32x4_t poly[8]; +- float32x4_t pi_over_2; ++} data = { ++ .poly = { V4 (-0x1.5554dcp-2), V4 (0x1.9978ecp-3), V4 (-0x1.230a94p-3), ++ V4 (0x1.b4debp-4), V4 (-0x1.3550dap-4), V4 (0x1.61eebp-5), ++ V4 (-0x1.0c17d4p-6), V4 (0x1.7ea694p-9) }, ++#else ++ float32x4_t c0, c2, c4, c6; ++ float c1, c3, c5, c7; + } data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-128, 1.0]. + Generated using fpminimax between FLT_MIN and 1. 
*/ +- .poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f), +- V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f), +- V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) }, +- .pi_over_2 = V4 (0x1.921fb6p+0f), ++ .c0 = V4 (-0x1.5554dcp-2), .c1 = 0x1.9978ecp-3, ++ .c2 = V4 (-0x1.230a94p-3), .c3 = 0x1.b4debp-4, ++ .c4 = V4 (-0x1.3550dap-4), .c5 = 0x1.61eebp-5, ++ .c6 = V4 (-0x1.0c17d4p-6), .c7 = 0x1.7ea694p-9, ++#endif ++ .pi_over_2 = V4 (0x3fc90fdb), ++ .neg_one = V4 (-1.0f), ++ .sign_mask = V4 (0x80000000), + }; + +-#define SignMask v_u32 (0x80000000) +- +-#define P(i) d->poly[i] +- ++#if WANT_SIMD_EXCEPT + #define TinyBound 0x30800000 /* asuint(0x1p-30). */ + #define BigBound 0x4e800000 /* asuint(0x1p30). */ + +-#if WANT_SIMD_EXCEPT + static float32x4_t VPCS_ATTR NOINLINE + special_case (float32x4_t x, float32x4_t y, uint32x4_t special) + { +@@ -51,19 +60,20 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special) + + /* Fast implementation of vector atanf based on + atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] +- using z=-1/x and shift = pi/2. Maximum observed error is 2.9ulps: +- _ZGVnN4v_atanf (0x1.0468f6p+0) got 0x1.967f06p-1 want 0x1.967fp-1. */ ++ using z=-1/x and shift = pi/2. Maximum observed error is 2.02 ulps: ++ _ZGVnN4v_atanf (0x1.03d4cep+0) got 0x1.95ed3ap-1 ++ want 0x1.95ed36p-1. */ + float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atan) (float32x4_t x) + { + const struct data *d = ptr_barrier (&data); + +- /* Small cases, infs and nans are supported by our approximation technique, +- but do not set fenv flags correctly. Only trigger special case if we need +- fenv. */ + uint32x4_t ix = vreinterpretq_u32_f32 (x); +- uint32x4_t sign = vandq_u32 (ix, SignMask); ++ uint32x4_t sign = vandq_u32 (ix, d->sign_mask); + + #if WANT_SIMD_EXCEPT ++ /* Small cases, infs and nans are supported by our approximation technique, ++ but do not set fenv flags correctly. Only trigger special case if we need ++ fenv. */ + uint32x4_t ia = vandq_u32 (ix, v_u32 (0x7ff00000)); + uint32x4_t special = vcgtq_u32 (vsubq_u32 (ia, v_u32 (TinyBound)), + v_u32 (BigBound - TinyBound)); +@@ -71,41 +81,52 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atan) (float32x4_t x) + if (__glibc_unlikely (v_any_u32 (special))) + return special_case (x, x, v_u32 (-1)); + #endif +- + /* Argument reduction: +- y := arctan(x) for x < 1 +- y := pi/2 + arctan(-1/x) for x > 1 +- Hence, use z=-1/a if x>=1, otherwise z=a. */ +- uint32x4_t red = vcagtq_f32 (x, v_f32 (1.0)); +- /* Avoid dependency in abs(x) in division (and comparison). */ +- float32x4_t z = vbslq_f32 (red, vdivq_f32 (v_f32 (1.0f), x), x); ++ y := arctan(x) for |x| < 1 ++ y := arctan(-1/x) + pi/2 for x > +1 ++ y := arctan(-1/x) - pi/2 for x < -1 ++ Hence, use z=-1/a if x>=|-1|, otherwise z=a. */ ++ uint32x4_t red = vcagtq_f32 (x, d->neg_one); ++ ++ float32x4_t z = vbslq_f32 (red, vdivq_f32 (d->neg_one, x), x); ++ ++ /* Shift is calculated as +-pi/2 or 0, depending on the argument case. */ + float32x4_t shift = vreinterpretq_f32_u32 ( +- vandq_u32 (red, vreinterpretq_u32_f32 (d->pi_over_2))); +- /* Use absolute value only when needed (odd powers of z). */ +- float32x4_t az = vbslq_f32 ( +- SignMask, vreinterpretq_f32_u32 (vandq_u32 (SignMask, red)), z); ++ vandq_u32 (red, veorq_u32 (d->pi_over_2, sign))); ++ ++ float32x4_t z2 = vmulq_f32 (z, z); ++ float32x4_t z3 = vmulq_f32 (z, z2); ++ float32x4_t z4 = vmulq_f32 (z2, z2); ++#if WANT_SIMD_EXCEPT + + /* Calculate the polynomial approximation. 
+ Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However, + a standard implementation using z8 creates spurious underflow + in the very last fma (when z^8 is small enough). +- Therefore, we split the last fma into a mul and an fma. +- Horner and single-level Estrin have higher errors that exceed +- threshold. */ +- float32x4_t z2 = vmulq_f32 (z, z); +- float32x4_t z4 = vmulq_f32 (z2, z2); +- ++ Therefore, we split the last fma into a mul and an fma. */ + float32x4_t y = vfmaq_f32 ( + v_pairwise_poly_3_f32 (z2, z4, d->poly), z4, + vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, d->poly + 4))); + +- /* y = shift + z * P(z^2). */ +- y = vaddq_f32 (vfmaq_f32 (az, y, vmulq_f32 (z2, az)), shift); ++#else ++ float32x4_t z8 = vmulq_f32 (z4, z4); ++ ++ /* Uses an Estrin scheme for polynomial approximation. */ ++ float32x4_t odd_coeffs = vld1q_f32 (&d->c1); ++ ++ float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, odd_coeffs, 0); ++ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, odd_coeffs, 1); ++ float32x4_t p45 = vfmaq_laneq_f32 (d->c4, z2, odd_coeffs, 2); ++ float32x4_t p67 = vfmaq_laneq_f32 (d->c6, z2, odd_coeffs, 3); + +- /* y = atan(x) if x>0, -atan(-x) otherwise. */ +- y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), sign)); ++ float32x4_t p03 = vfmaq_f32 (p01, z4, p23); ++ float32x4_t p47 = vfmaq_f32 (p45, z4, p67); + +- return y; ++ float32x4_t y = vfmaq_f32 (p03, z8, p47); ++#endif ++ ++ /* y = shift + z * P(z^2). */ ++ return vfmaq_f32 (vaddq_f32 (shift, z), z3, y); + } + libmvec_hidden_def (V_NAME_F1 (atan)) + HALF_WIDTH_ALIAS_F1 (atan) +diff --git a/sysdeps/aarch64/fpu/atanf_sve.c b/sysdeps/aarch64/fpu/atanf_sve.c +index 3a98d70c50..6558223e41 100644 +--- a/sysdeps/aarch64/fpu/atanf_sve.c ++++ b/sysdeps/aarch64/fpu/atanf_sve.c +@@ -18,18 +18,26 @@ + . */ + + #include "sv_math.h" +-#include "poly_sve_f32.h" + + static const struct data + { +- float32_t poly[8]; +- float32_t pi_over_2; ++ float32_t c1, c3, c5, c7; ++ float32_t c0, c2, c4, c6; ++ float32_t shift_val, neg_one; + } data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-128, 1.0]. */ +- .poly = { -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f, +- -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f, 0x1.01fd88p-8f }, +- .pi_over_2 = 0x1.921fb6p+0f, ++ .c0 = -0x1.5554dcp-2, ++ .c1 = 0x1.9978ecp-3, ++ .c2 = -0x1.230a94p-3, ++ .c3 = 0x1.b4debp-4, ++ .c4 = -0x1.3550dap-4, ++ .c5 = 0x1.61eebp-5, ++ .c6 = -0x1.0c17d4p-6, ++ .c7 = 0x1.7ea694p-9, ++ /* pi/2, used as a shift value after reduction. */ ++ .shift_val = 0x1.921fb54442d18p+0, ++ .neg_one = -1.0f, + }; + + #define SignMask (0x80000000) +@@ -37,43 +45,49 @@ static const struct data + /* Fast implementation of SVE atanf based on + atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using + z=-1/x and shift = pi/2. +- Largest observed error is 2.9 ULP, close to +/-1.0: +- _ZGVsMxv_atanf (0x1.0468f6p+0) got -0x1.967f06p-1 +- want -0x1.967fp-1. */ ++ Largest observed error is 2.12 ULP: ++ _ZGVsMxv_atanf (0x1.03d4cep+0) got 0x1.95ed3ap-1 ++ want 0x1.95ed36p-1. */ + svfloat32_t SV_NAME_F1 (atan) (svfloat32_t x, const svbool_t pg) + { + const struct data *d = ptr_barrier (&data); ++ svbool_t ptrue = svptrue_b32 (); + + /* No need to trigger special case. Small cases, infs and nans + are supported by our approximation technique. 
*/ + svuint32_t ix = svreinterpret_u32 (x); +- svuint32_t sign = svand_x (pg, ix, SignMask); ++ svuint32_t sign = svand_x (ptrue, ix, SignMask); + + /* Argument reduction: + y := arctan(x) for x < 1 +- y := pi/2 + arctan(-1/x) for x > 1 +- Hence, use z=-1/a if x>=1, otherwise z=a. */ +- svbool_t red = svacgt (pg, x, 1.0f); +- /* Avoid dependency in abs(x) in division (and comparison). */ +- svfloat32_t z = svsel (red, svdiv_x (pg, sv_f32 (1.0f), x), x); +- /* Use absolute value only when needed (odd powers of z). */ +- svfloat32_t az = svabs_x (pg, z); +- az = svneg_m (az, red, az); +- +- /* Use split Estrin scheme for P(z^2) with deg(P)=7. */ +- svfloat32_t z2 = svmul_x (pg, z, z); +- svfloat32_t z4 = svmul_x (pg, z2, z2); +- svfloat32_t z8 = svmul_x (pg, z4, z4); +- +- svfloat32_t y = sv_estrin_7_f32_x (pg, z2, z4, z8, d->poly); +- +- /* y = shift + z + z^3 * P(z^2). */ +- svfloat32_t z3 = svmul_x (pg, z2, az); +- y = svmla_x (pg, az, z3, y); +- +- /* Apply shift as indicated by 'red' predicate. */ +- y = svadd_m (red, y, sv_f32 (d->pi_over_2)); +- +- /* y = atan(x) if x>0, -atan(-x) otherwise. */ +- return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign)); ++ y := arctan(-1/x) + pi/2 for x > +1 ++ y := arctan(-1/x) - pi/2 for x < -1 ++ Hence, use z=-1/a if |x|>=|-1|, otherwise z=a. */ ++ svbool_t red = svacgt (pg, x, d->neg_one); ++ svfloat32_t z = svsel (red, svdiv_x (pg, sv_f32 (d->neg_one), x), x); ++ ++ /* Reinserts the sign bit of the argument to handle the case of x < -1. */ ++ svfloat32_t shift = svreinterpret_f32 ( ++ sveor_x (red, svreinterpret_u32 (sv_f32 (d->shift_val)), sign)); ++ ++ svfloat32_t z2 = svmul_x (ptrue, z, z); ++ svfloat32_t z3 = svmul_x (ptrue, z2, z); ++ svfloat32_t z4 = svmul_x (ptrue, z2, z2); ++ svfloat32_t z8 = svmul_x (ptrue, z4, z4); ++ ++ svfloat32_t odd_coeffs = svld1rq (ptrue, &d->c1); ++ ++ svfloat32_t p01 = svmla_lane (sv_f32 (d->c0), z2, odd_coeffs, 0); ++ svfloat32_t p23 = svmla_lane (sv_f32 (d->c2), z2, odd_coeffs, 1); ++ svfloat32_t p45 = svmla_lane (sv_f32 (d->c4), z2, odd_coeffs, 2); ++ svfloat32_t p67 = svmla_lane (sv_f32 (d->c6), z2, odd_coeffs, 3); ++ ++ svfloat32_t p03 = svmla_x (pg, p01, z4, p23); ++ svfloat32_t p47 = svmla_x (pg, p45, z4, p67); ++ ++ svfloat32_t y = svmla_x (pg, p03, z8, p47); ++ ++ /* shift + z + z^3 * P(z^2). */ ++ shift = svadd_m (red, z, shift); ++ return svmla_x (pg, shift, z3, y); + } +diff --git a/sysdeps/aarch64/fpu/atanh_sve.c b/sysdeps/aarch64/fpu/atanh_sve.c +index 16a7cf6aa7..958d69a5f5 100644 +--- a/sysdeps/aarch64/fpu/atanh_sve.c ++++ b/sysdeps/aarch64/fpu/atanh_sve.c +@@ -30,7 +30,7 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t special) + } + + /* SVE approximation for double-precision atanh, based on log1p. +- The greatest observed error is 2.81 ULP: ++ The greatest observed error is 3.3 ULP: + _ZGVsMxv_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6 + want 0x1.ffd8ff31b501cp-6. */ + svfloat64_t SV_NAME_D1 (atanh) (svfloat64_t x, const svbool_t pg) +@@ -42,7 +42,6 @@ svfloat64_t SV_NAME_D1 (atanh) (svfloat64_t x, const svbool_t pg) + svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, Half)); + + /* It is special if iax >= 1. 
*/ +-// svbool_t special = svcmpge (pg, iax, One); + svbool_t special = svacge (pg, x, 1.0); + + /* Computation is performed based on the following sequence of equality: diff --git a/sysdeps/aarch64/fpu/cosh_sve.c b/sysdeps/aarch64/fpu/cosh_sve.c -index ca44053535..77e58e123e 100644 +index ca44053535..f5a163b054 100644 --- a/sysdeps/aarch64/fpu/cosh_sve.c +++ b/sysdeps/aarch64/fpu/cosh_sve.c -@@ -23,7 +23,7 @@ static const struct data +@@ -21,69 +21,99 @@ + + static const struct data { - float64_t poly[3]; - float64_t inv_ln2, ln2_hi, ln2_lo, shift, thres; +- float64_t poly[3]; +- float64_t inv_ln2, ln2_hi, ln2_lo, shift, thres; - uint64_t index_mask, special_bound; ++ double c0, c2; ++ double c1, c3; ++ float64_t inv_ln2, ln2_hi, ln2_lo, shift; + uint64_t special_bound; } data = { - .poly = { 0x1.fffffffffffd4p-2, 0x1.5555571d6b68cp-3, - 0x1.5555576a59599p-5, }, -@@ -35,14 +35,16 @@ static const struct data - .shift = 0x1.8p+52, - .thres = 704.0, - +- .poly = { 0x1.fffffffffffd4p-2, 0x1.5555571d6b68cp-3, +- 0x1.5555576a59599p-5, }, +- +- .inv_ln2 = 0x1.71547652b82fep8, /* N/ln2. */ +- /* -ln2/N. */ +- .ln2_hi = -0x1.62e42fefa39efp-9, +- .ln2_lo = -0x1.abc9e3b39803f3p-64, +- .shift = 0x1.8p+52, +- .thres = 704.0, +- - .index_mask = 0xff, - /* 0x1.6p9, above which exp overflows. */ - .special_bound = 0x4086000000000000, +- /* 0x1.6p9, above which exp overflows. */ +- .special_bound = 0x4086000000000000, ++ /* Generated using Remez, in [-log(2)/128, log(2)/128]. */ ++ .c0 = 0x1.fffffffffdbcdp-2, ++ .c1 = 0x1.555555555444cp-3, ++ .c2 = 0x1.555573c6a9f7dp-5, ++ .c3 = 0x1.1111266d28935p-7, ++ .ln2_hi = 0x1.62e42fefa3800p-1, ++ .ln2_lo = 0x1.ef35793c76730p-45, ++ /* 1/ln2. */ ++ .inv_ln2 = 0x1.71547652b82fep+0, ++ .shift = 0x1.800000000ff80p+46, /* 1.5*2^46+1022. */ ++ ++ /* asuint(ln(2^(1024 - 1/128))), the value above which exp overflows. */ ++ .special_bound = 0x40862e37e7d8ba72, }; - static svfloat64_t NOINLINE +-static svfloat64_t NOINLINE -special_case (svfloat64_t x, svfloat64_t y, svbool_t special) -+special_case (svfloat64_t x, svbool_t pg, svfloat64_t t, svbool_t special) +-{ +- return sv_call_f64 (cosh, x, y, special); +-} +- +-/* Helper for approximating exp(x). Copied from sv_exp_tail, with no +- special-case handling or tail. */ ++/* Helper for approximating exp(x)/2. ++ Functionally identical to FEXPA exp(x), but an adjustment in ++ the shift value which leads to a reduction in the exponent of scale by 1, ++ thus halving the result at no cost. */ + static inline svfloat64_t +-exp_inline (svfloat64_t x, const svbool_t pg, const struct data *d) ++exp_over_two_inline (const svbool_t pg, svfloat64_t x, const struct data *d) { -+ svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5); -+ svfloat64_t half_over_t = svdivr_x (pg, t, 0.5); -+ svfloat64_t y = svadd_x (pg, half_t, half_over_t); - return sv_call_f64 (cosh, x, y, special); - } + /* Calculate exp(x). 
*/ + svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2); ++ svuint64_t u = svreinterpret_u64 (z); + svfloat64_t n = svsub_x (pg, z, d->shift); -@@ -60,12 +62,12 @@ exp_inline (svfloat64_t x, const svbool_t pg, const struct data *d) +- svfloat64_t r = svmla_x (pg, x, n, d->ln2_hi); +- r = svmla_x (pg, r, n, d->ln2_lo); ++ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1); ++ svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi); - svuint64_t u = svreinterpret_u64 (z); - svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS); +- svuint64_t u = svreinterpret_u64 (z); +- svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS); - svuint64_t i = svand_x (pg, u, d->index_mask); -+ svuint64_t i = svand_x (svptrue_b64 (), u, 0xff); ++ svfloat64_t r = x; ++ r = svmls_lane (r, n, ln2, 0); ++ r = svmls_lane (r, n, ln2, 1); - svfloat64_t y = svmla_x (pg, sv_f64 (d->poly[1]), r, d->poly[2]); - y = svmla_x (pg, sv_f64 (d->poly[0]), r, y); - y = svmla_x (pg, sv_f64 (1.0), r, y); +- svfloat64_t y = svmla_x (pg, sv_f64 (d->poly[1]), r, d->poly[2]); +- y = svmla_x (pg, sv_f64 (d->poly[0]), r, y); +- y = svmla_x (pg, sv_f64 (1.0), r, y); - y = svmul_x (pg, r, y); -+ y = svmul_x (svptrue_b64 (), r, y); ++ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); ++ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0); ++ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1); ++ svfloat64_t p04 = svmla_x (pg, p01, p23, r2); ++ svfloat64_t p = svmla_x (pg, r, p04, r2); + +- /* s = 2^(n/N). */ +- u = svld1_gather_index (pg, __v_exp_tail_data, i); +- svfloat64_t s = svreinterpret_f64 (svadd_x (pg, u, e)); ++ svfloat64_t scale = svexpa (u); + +- return svmla_x (pg, s, s, y); ++ return svmla_x (pg, scale, scale, p); ++} ++ ++/* Vectorised special case to handle values past where exp_inline overflows. ++ Halves the input value and uses the identity exp(x) = exp(x/2)^2 to double ++ the valid range of inputs, and returns inf for anything past that. */ ++static svfloat64_t NOINLINE ++special_case (svbool_t pg, svbool_t special, svfloat64_t ax, svfloat64_t t, ++ const struct data *d) ++{ ++ /* Finish fast path to compute values for non-special cases. */ ++ svfloat64_t inv_twoexp = svdivr_x (pg, t, 0.25); ++ svfloat64_t y = svadd_x (pg, t, inv_twoexp); ++ ++ /* Halves input value, and then check if any cases ++ are still going to overflow. */ ++ ax = svmul_x (special, ax, 0.5); ++ svbool_t is_safe ++ = svcmplt (special, svreinterpret_u64 (ax), d->special_bound); ++ ++ /* Computes exp(x/2), and sets any overflowing lanes to inf. */ ++ svfloat64_t half_exp = exp_over_two_inline (special, ax, d); ++ half_exp = svsel (is_safe, half_exp, sv_f64 (INFINITY)); ++ ++ /* Construct special case cosh(x) = (exp(x/2)^2)/2. */ ++ svfloat64_t exp = svmul_x (svptrue_b64 (), half_exp, 2); ++ svfloat64_t special_y = svmul_x (special, exp, half_exp); ++ ++ /* Select correct return values for special and non-special cases. */ ++ special_y = svsel (special, special_y, y); ++ ++ /* Ensure an input of nan is correctly propagated. */ ++ svbool_t is_nan ++ = svcmpgt (special, svreinterpret_u64 (ax), sv_u64 (0x7ff0000000000000)); ++ return svsel (is_nan, ax, svsel (special, special_y, y)); + } + + /* Approximation for SVE double-precision cosh(x) using exp_inline. + cosh(x) = (exp(x) + exp(-x)) / 2. +- The greatest observed error is in the scalar fall-back region, so is the +- same as the scalar routine, 1.93 ULP: +- _ZGVsMxv_cosh (0x1.628ad45039d2fp+9) got 0x1.fd774e958236dp+1021 +- want 0x1.fd774e958236fp+1021. 
+- +- The greatest observed error in the non-special region is 1.54 ULP: +- _ZGVsMxv_cosh (0x1.ba5651dd4486bp+2) got 0x1.f5e2bb8d5c98fp+8 +- want 0x1.f5e2bb8d5c991p+8. */ ++ The greatest observed error in special case region is 2.66 + 0.5 ULP: ++ _ZGVsMxv_cosh (0x1.633b532ffbc1ap+9) got 0x1.f9b2d3d22399ep+1023 ++ want 0x1.f9b2d3d22399bp+1023 ++ ++ The greatest observed error in the non-special region is 1.01 + 0.5 ULP: ++ _ZGVsMxv_cosh (0x1.998ecbb3c1f81p+1) got 0x1.890b225657f84p+3 ++ want 0x1.890b225657f82p+3. */ + svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg) + { + const struct data *d = ptr_barrier (&data); +@@ -92,14 +122,13 @@ svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg) + svbool_t special = svcmpgt (pg, svreinterpret_u64 (ax), d->special_bound); - /* s = 2^(n/N). */ - u = svld1_gather_index (pg, __v_exp_tail_data, i); -@@ -94,12 +96,12 @@ svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg) /* Up to the point that exp overflows, we can use it to calculate cosh by - exp(|x|) / 2 + 1 / (2 * exp(|x|)). */ - svfloat64_t t = exp_inline (ax, pg, d); +- exp(|x|) / 2 + 1 / (2 * exp(|x|)). */ +- svfloat64_t t = exp_inline (ax, pg, d); - svfloat64_t half_t = svmul_x (pg, t, 0.5); - svfloat64_t half_over_t = svdivr_x (pg, t, 0.5); ++ (exp(|x|)/2 + 1) / (2 * exp(|x|)). */ ++ svfloat64_t half_exp = exp_over_two_inline (pg, ax, d); - /* Fall back to scalar for any special cases. */ +- /* Fall back to scalar for any special cases. */ ++ /* Falls back to entirely standalone vectorized special case. */ if (__glibc_unlikely (svptest_any (pg, special))) - return special_case (x, svadd_x (pg, half_t, half_over_t), special); -+ return special_case (x, pg, t, special); ++ return special_case (pg, special, ax, half_exp, d); -+ svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5); -+ svfloat64_t half_over_t = svdivr_x (pg, t, 0.5); - return svadd_x (pg, half_t, half_over_t); +- return svadd_x (pg, half_t, half_over_t); ++ svfloat64_t inv_twoexp = svdivr_x (pg, half_exp, 0.25); ++ return svadd_x (pg, half_exp, inv_twoexp); } +diff --git a/sysdeps/aarch64/fpu/coshf_sve.c b/sysdeps/aarch64/fpu/coshf_sve.c +index fb8e06cf73..8056055418 100644 +--- a/sysdeps/aarch64/fpu/coshf_sve.c ++++ b/sysdeps/aarch64/fpu/coshf_sve.c +@@ -39,9 +39,9 @@ special_case (svfloat32_t x, svfloat32_t half_e, svfloat32_t half_over_e, + } + + /* Single-precision vector cosh, using vector expf. +- Maximum error is 2.77 ULP: +- _ZGVsMxv_coshf(-0x1.5b38f4p+1) got 0x1.e45946p+2 +- want 0x1.e4594cp+2. */ ++ Maximum error is 2.56 +0.5 ULP: ++ _ZGVsMxv_coshf(-0x1.5b40f4p+1) got 0x1.e47748p+2 ++ want 0x1.e4774ep+2. */ + svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg) + { + const struct data *d = ptr_barrier (&data); diff --git a/sysdeps/aarch64/fpu/erfcf_sve.c b/sysdeps/aarch64/fpu/erfcf_sve.c index 2743f9dbb5..b57ab514b7 100644 --- a/sysdeps/aarch64/fpu/erfcf_sve.c @@ -2767,25 +5078,118 @@ /* Assemble result as exp10(x) = 2^n * exp10(r). If |x| > SpecialBound multiplication may overflow, so use special case routine. */ +diff --git a/sysdeps/aarch64/fpu/exp10f_sve.c b/sysdeps/aarch64/fpu/exp10f_sve.c +index 1a74db265c..f3e7f8b4f6 100644 +--- a/sysdeps/aarch64/fpu/exp10f_sve.c ++++ b/sysdeps/aarch64/fpu/exp10f_sve.c +@@ -19,26 +19,19 @@ + + #include "sv_math.h" + +-/* For x < -Thres, the result is subnormal and not handled correctly by +- FEXPA. */ +-#define Thres 37.9 ++/* For x < -Thres (-log10(2^126)), the result is subnormal and not handled ++ correctly by FEXPA. 
*/ ++#define Thres 0x1.2f702p+5 + + static const struct data + { +- float log2_10_lo, c0, c2, c4; +- float c1, c3, log10_2; +- float shift, log2_10_hi, thres; ++ float log10_2, log2_10_hi, log2_10_lo, c1; ++ float c0, shift, thres; + } data = { + /* Coefficients generated using Remez algorithm with minimisation of relative +- error. +- rel error: 0x1.89dafa3p-24 +- abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2] +- maxerr: 0.52 +0.5 ulp. */ +- .c0 = 0x1.26bb16p+1f, +- .c1 = 0x1.5350d2p+1f, +- .c2 = 0x1.04744ap+1f, +- .c3 = 0x1.2d8176p+0f, +- .c4 = 0x1.12b41ap-1f, ++ error. */ ++ .c0 = 0x1.26bb62p1, ++ .c1 = 0x1.53524cp1, + /* 1.5*2^17 + 127, a shift value suitable for FEXPA. */ + .shift = 0x1.803f8p17f, + .log10_2 = 0x1.a934fp+1, +@@ -53,28 +46,23 @@ sv_exp10f_inline (svfloat32_t x, const svbool_t pg, const struct data *d) + /* exp10(x) = 2^(n/N) * 10^r = 2^n * (1 + poly (r)), + with poly(r) in [1/sqrt(2), sqrt(2)] and + x = r + n * log10(2) / N, with r in [-log10(2)/2N, log10(2)/2N]. */ +- +- svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->log2_10_lo); ++ svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->log10_2); + + /* n = round(x/(log10(2)/N)). */ + svfloat32_t shift = sv_f32 (d->shift); +- svfloat32_t z = svmad_x (pg, sv_f32 (d->log10_2), x, shift); +- svfloat32_t n = svsub_x (svptrue_b32 (), z, shift); ++ svfloat32_t z = svmla_lane (shift, x, lane_consts, 0); ++ svfloat32_t n = svsub_x (pg, z, shift); + + /* r = x - n*log10(2)/N. */ +- svfloat32_t r = svmsb_x (pg, sv_f32 (d->log2_10_hi), n, x); +- r = svmls_lane (r, n, lane_consts, 0); ++ svfloat32_t r = x; ++ r = svmls_lane (r, n, lane_consts, 1); ++ r = svmls_lane (r, n, lane_consts, 2); + + svfloat32_t scale = svexpa (svreinterpret_u32 (z)); + + /* Polynomial evaluation: poly(r) ~ exp10(r)-1. */ +- svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2); +- svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3); +- svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); +- svfloat32_t p14 = svmla_x (pg, p12, p34, r2); +- svfloat32_t p0 = svmul_lane (r, lane_consts, 1); +- svfloat32_t poly = svmla_x (pg, p0, r2, p14); +- ++ svfloat32_t poly = svmla_lane (sv_f32 (d->c0), r, lane_consts, 3); ++ poly = svmul_x (pg, poly, r); + return svmla_x (pg, scale, scale, poly); + } + +@@ -85,11 +73,10 @@ special_case (svfloat32_t x, svbool_t special, const struct data *d) + special); + } + +-/* Single-precision SVE exp10f routine. Implements the same algorithm +- as AdvSIMD exp10f. +- Worst case error is 1.02 ULPs. +- _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1 +- want 0x1.ba5f9cp-1. */ ++/* Single-precision SVE exp10f routine. Based on the FEXPA instruction. ++ Worst case error is 1.10 ULP. ++ _ZGVsMxv_exp10f (0x1.cc76dep+3) got 0x1.be0172p+47 ++ want 0x1.be017p+47. */ + svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg) + { + const struct data *d = ptr_barrier (&data); diff --git a/sysdeps/aarch64/fpu/exp2_sve.c b/sysdeps/aarch64/fpu/exp2_sve.c -index a37c33092a..6db85266ca 100644 +index a37c33092a..c135852532 100644 --- a/sysdeps/aarch64/fpu/exp2_sve.c +++ b/sysdeps/aarch64/fpu/exp2_sve.c -@@ -18,7 +18,6 @@ +@@ -18,25 +18,22 @@ . 
*/ #include "sv_math.h" -#include "poly_sve_f64.h" +- +-#define N (1 << V_EXP_TABLE_BITS) - #define N (1 << V_EXP_TABLE_BITS) - -@@ -27,15 +26,15 @@ + #define BigBound 1022 + #define UOFlowBound 1280 static const struct data { - double poly[4]; -+ double c0, c2; -+ double c1, c3; ++ double c2, c4; ++ double c0, c1, c3; double shift, big_bound, uoflow_bound; } data = { /* Coefficients are computed using Remez algorithm with @@ -2794,15 +5198,20 @@ - 0x1.3b2abf5571ad8p-7 }, - .shift = 0x1.8p52 / N, - .uoflow_bound = UOFlowBound, -+ .c0 = 0x1.62e42fefa3686p-1, .c1 = 0x1.ebfbdff82c241p-3, -+ .c2 = 0x1.c6b09b16de99ap-5, .c3 = 0x1.3b2abf5571ad8p-7, -+ .shift = 0x1.8p52 / N, .uoflow_bound = UOFlowBound, - .big_bound = BigBound, +- .big_bound = BigBound, ++ .c0 = 0x1.62e42fefa39efp-1, .c1 = 0x1.ebfbdff82a31bp-3, ++ .c2 = 0x1.c6b08d706c8a5p-5, .c3 = 0x1.3b2ad2ff7d2f3p-7, ++ .c4 = 0x1.5d8761184beb3p-10, .shift = 0x1.800000000ffc0p+46, ++ .uoflow_bound = UOFlowBound, .big_bound = BigBound, }; -@@ -67,9 +66,9 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n, + #define SpecialOffset 0x6000000000000000 /* 0x1p513. */ +@@ -65,47 +62,52 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n, + svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b)); + /* |n| > 1280 => 2^(n) overflows. */ - svbool_t p_cmp = svacgt (pg, n, d->uoflow_bound); +- svbool_t p_cmp = svacgt (pg, n, d->uoflow_bound); ++ svbool_t p_cmp = svacle (pg, n, d->uoflow_bound); - svfloat64_t r1 = svmul_x (pg, s1, s1); + svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1); @@ -2810,27 +5219,140 @@ - svfloat64_t r0 = svmul_x (pg, r2, s1); + svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1); - return svsel (p_cmp, r1, r0); +- return svsel (p_cmp, r1, r0); ++ return svsel (p_cmp, r0, r1); } -@@ -99,11 +98,14 @@ svfloat64_t SV_NAME_D1 (exp2) (svfloat64_t x, svbool_t pg) - svuint64_t top = svlsl_x (pg, ki, 52 - V_EXP_TABLE_BITS); - svfloat64_t scale = svreinterpret_f64 (svadd_x (pg, sbits, top)); -+ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1); + /* Fast vector implementation of exp2. +- Maximum measured error is 1.65 ulp. +- _ZGVsMxv_exp2(-0x1.4c264ab5b559bp-6) got 0x1.f8db0d4df721fp-1 +- want 0x1.f8db0d4df721dp-1. */ ++ Maximum measured error is 0.52 + 0.5 ulp. ++ _ZGVsMxv_exp2 (0x1.3b72ad5b701bfp-1) got 0x1.8861641b49e08p+0 ++ want 0x1.8861641b49e07p+0. */ + svfloat64_t SV_NAME_D1 (exp2) (svfloat64_t x, svbool_t pg) + { + const struct data *d = ptr_barrier (&data); +- svbool_t no_big_scale = svacle (pg, x, d->big_bound); +- svbool_t special = svnot_z (pg, no_big_scale); +- +- /* Reduce x to k/N + r, where k is integer and r in [-1/2N, 1/2N]. */ +- svfloat64_t shift = sv_f64 (d->shift); +- svfloat64_t kd = svadd_x (pg, x, shift); +- svuint64_t ki = svreinterpret_u64 (kd); +- /* kd = k/N. */ +- kd = svsub_x (pg, kd, shift); +- svfloat64_t r = svsub_x (pg, x, kd); +- +- /* scale ~= 2^(k/N). */ +- svuint64_t idx = svand_x (pg, ki, N - 1); +- svuint64_t sbits = svld1_gather_index (pg, __v_exp_data, idx); +- /* This is only a valid scale when -1023*N < k < 1024*N. 
*/ +- svuint64_t top = svlsl_x (pg, ki, 52 - V_EXP_TABLE_BITS); +- svfloat64_t scale = svreinterpret_f64 (svadd_x (pg, sbits, top)); ++ svbool_t special = svacge (pg, x, d->big_bound); ++ ++ svfloat64_t z = svadd_x (svptrue_b64 (), x, d->shift); ++ svfloat64_t n = svsub_x (svptrue_b64 (), z, d->shift); ++ svfloat64_t r = svsub_x (svptrue_b64 (), x, n); ++ ++ svfloat64_t scale = svexpa (svreinterpret_u64 (z)); ++ ++ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); ++ svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2); + /* Approximate exp2(r) using polynomial. */ - svfloat64_t r2 = svmul_x (pg, r, r); - svfloat64_t p = sv_pairwise_poly_3_f64_x (pg, r, r2, d->poly); - svfloat64_t y = svmul_x (pg, r, p); -- -+ /* y = exp2(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4. */ -+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); -+ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0); -+ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1); -+ svfloat64_t p = svmla_x (pg, p01, p23, r2); ++ /* y = exp2(r) - 1 ~= r * (C0 + C1 r + C2 r^2 + C3 r^3 + C4 r^4). */ ++ svfloat64_t p12 = svmla_lane (sv_f64 (d->c1), r, c24, 0); ++ svfloat64_t p34 = svmla_lane (sv_f64 (d->c3), r, c24, 1); ++ svfloat64_t p = svmla_x (pg, p12, p34, r2); ++ p = svmad_x (pg, p, r, d->c0); + svfloat64_t y = svmul_x (svptrue_b64 (), r, p); + /* Assemble exp2(x) = exp2(r) * scale. */ if (__glibc_unlikely (svptest_any (pg, special))) - return special_case (pg, scale, y, kd, d); +- return special_case (pg, scale, y, kd, d); ++ { ++ /* FEXPA zeroes the sign bit, however the sign is meaningful to the ++ special case function so needs to be copied. ++ e = sign bit of u << 46. */ ++ svuint64_t e = svand_x (pg, svlsl_x (pg, svreinterpret_u64 (z), 46), ++ 0x8000000000000000); ++ scale = svreinterpret_f64 (svadd_x (pg, e, svreinterpret_u64 (scale))); ++ return special_case (pg, scale, y, n, d); ++ } ++ + return svmla_x (pg, scale, scale, y); + } +diff --git a/sysdeps/aarch64/fpu/exp2f_sve.c b/sysdeps/aarch64/fpu/exp2f_sve.c +index fcd7830164..989cefb605 100644 +--- a/sysdeps/aarch64/fpu/exp2f_sve.c ++++ b/sysdeps/aarch64/fpu/exp2f_sve.c +@@ -18,21 +18,17 @@ + . */ + + #include "sv_math.h" +-#include "poly_sve_f32.h" + + #define Thres 0x1.5d5e2ap+6f + + static const struct data + { +- float c0, c2, c4, c1, c3; +- float shift, thres; ++ float c0, c1, shift, thres; + } data = { +- /* Coefficients copied from the polynomial in AdvSIMD variant. */ +- .c0 = 0x1.62e422p-1f, +- .c1 = 0x1.ebf9bcp-3f, +- .c2 = 0x1.c6bd32p-5f, +- .c3 = 0x1.3ce9e4p-7f, +- .c4 = 0x1.59977ap-10f, ++ /* Coefficients generated using Remez algorithm with minimisation of relative ++ error. */ ++ .c0 = 0x1.62e485p-1, ++ .c1 = 0x1.ebfbe0p-3, + /* 1.5*2^17 + 127. */ + .shift = 0x1.803f8p17f, + /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled +@@ -51,16 +47,8 @@ sv_exp2f_inline (svfloat32_t x, const svbool_t pg, const struct data *d) + + svfloat32_t scale = svexpa (svreinterpret_u32 (z)); + +- /* Polynomial evaluation: poly(r) ~ exp2(r)-1. +- Evaluate polynomial use hybrid scheme - offset ESTRIN by 1 for +- coefficients 1 to 4, and apply most significant coefficient directly. 
*/ +- svfloat32_t even_coeffs = svld1rq (svptrue_b32 (), &d->c0); +- svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); +- svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, even_coeffs, 1); +- svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, even_coeffs, 2); +- svfloat32_t p14 = svmla_x (pg, p12, r2, p34); +- svfloat32_t p0 = svmul_lane (r, even_coeffs, 0); +- svfloat32_t poly = svmla_x (pg, p0, r2, p14); ++ svfloat32_t poly = svmla_x (pg, sv_f32 (d->c0), r, sv_f32 (d->c1)); ++ poly = svmul_x (svptrue_b32 (), poly, r); + + return svmla_x (pg, scale, scale, poly); + } +@@ -72,11 +60,10 @@ special_case (svfloat32_t x, svbool_t special, const struct data *d) + special); + } + +-/* Single-precision SVE exp2f routine. Implements the same algorithm +- as AdvSIMD exp2f. +- Worst case error is 1.04 ULPs. +- _ZGVsMxv_exp2f(-0x1.af994ap-3) got 0x1.ba6a66p-1 +- want 0x1.ba6a64p-1. */ ++/* Single-precision SVE exp2f routine, based on the FEXPA instruction. ++ Worst case error is 1.09 ULPs. ++ _ZGVsMxv_exp2f (0x1.9a2a94p-1) got 0x1.be1054p+0 ++ want 0x1.be1052p+0. */ + svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg) + { + const struct data *d = ptr_barrier (&data); diff --git a/sysdeps/aarch64/fpu/exp_sve.c b/sysdeps/aarch64/fpu/exp_sve.c index 37de751f90..dc049482ed 100644 --- a/sysdeps/aarch64/fpu/exp_sve.c @@ -2913,6 +5435,377 @@ svfloat64_t p04 = svmla_x (pg, p01, p23, r2); svfloat64_t y = svmla_x (pg, r, p04, r2); +diff --git a/sysdeps/aarch64/fpu/expf_sve.c b/sysdeps/aarch64/fpu/expf_sve.c +index f9249db8b6..c3619975b3 100644 +--- a/sysdeps/aarch64/fpu/expf_sve.c ++++ b/sysdeps/aarch64/fpu/expf_sve.c +@@ -40,9 +40,9 @@ special_case (svfloat32_t x, svbool_t special, const struct sv_expf_data *d) + } + + /* Optimised single-precision SVE exp function. +- Worst-case error is 1.04 ulp: +- SV_NAME_F1 (exp)(0x1.a8eda4p+1) got 0x1.ba74bcp+4 +- want 0x1.ba74bap+4. */ ++ Worst-case error is 0.88 +0.50 ULP: ++ _ZGVsMxv_expf(-0x1.bba276p-6) got 0x1.f25288p-1 ++ want 0x1.f2528ap-1. */ + svfloat32_t SV_NAME_F1 (exp) (svfloat32_t x, const svbool_t pg) + { + const struct data *d = ptr_barrier (&data); +diff --git a/sysdeps/aarch64/fpu/expm1_sve.c b/sysdeps/aarch64/fpu/expm1_sve.c +index d4ba8ccf3b..b1d940bd20 100644 +--- a/sysdeps/aarch64/fpu/expm1_sve.c ++++ b/sysdeps/aarch64/fpu/expm1_sve.c +@@ -18,82 +18,164 @@ + . */ + + #include "sv_math.h" +-#include "poly_sve_f64.h" + +-#define SpecialBound 0x1.62b7d369a5aa9p+9 +-#define ExponentBias 0x3ff0000000000000 ++#define FexpaBound 0x1.4cb5ecef28adap-3 /* 15*ln2/64. */ ++#define SpecialBound 0x1.628c2855bfaddp+9 /* ln(2^(1023 + 1/128)). */ + + static const struct data + { +- double poly[11]; +- double shift, inv_ln2, special_bound; +- /* To be loaded in one quad-word. */ ++ double c2, c4; ++ double inv_ln2; + double ln2_hi, ln2_lo; ++ double c0, c1, c3; ++ double shift, thres; ++ uint64_t expm1_data[32]; + } data = { +- /* Generated using fpminimax. */ +- .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5, +- 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10, 0x1.a01a01affa35dp-13, +- 0x1.a01a018b4ecbbp-16, 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22, +- 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, }, +- +- .special_bound = SpecialBound, +- .inv_ln2 = 0x1.71547652b82fep0, +- .ln2_hi = 0x1.62e42fefa39efp-1, +- .ln2_lo = 0x1.abc9e3b39803fp-56, +- .shift = 0x1.8p52, ++ /* Table emulating FEXPA - 1, for values of FEXPA close to 1. ++ The table holds values of 2^(i/64) - 1, computed in arbitrary precision. 
++ The first half of the table stores values associated to i from 0 to 15. ++ The second half of the table stores values associated to i from 0 to -15. */ ++ .expm1_data = { ++ 0x0000000000000000, 0x3f864d1f3bc03077, 0x3f966c34c5615d0f, 0x3fa0e8a30eb37901, ++ 0x3fa6ab0d9f3121ec, 0x3fac7d865a7a3440, 0x3fb1301d0125b50a, 0x3fb429aaea92ddfb, ++ 0x3fb72b83c7d517ae, 0x3fba35beb6fcb754, 0x3fbd4873168b9aa8, 0x3fc031dc431466b2, ++ 0x3fc1c3d373ab11c3, 0x3fc35a2b2f13e6e9, 0x3fc4f4efa8fef709, 0x3fc6942d3720185a, ++ 0x0000000000000000, 0xbfc331751ec3a814, 0xbfc20224341286e4, 0xbfc0cf85bed0f8b7, ++ 0xbfbf332113d56b1f, 0xbfbcc0768d4175a6, 0xbfba46f918837cb7, 0xbfb7c695afc3b424, ++ 0xbfb53f391822dbc7, 0xbfb2b0cfe1266bd4, 0xbfb01b466423250a, 0xbfaafd11874c009e, ++ 0xbfa5b505d5b6f268, 0xbfa05e4119ea5d89, 0xbf95f134923757f3, 0xbf860f9f985bc9f4, ++ }, ++ ++ /* Generated using Remez, in [-log(2)/128, log(2)/128]. */ ++ .c0 = 0x1p-1, ++ .c1 = 0x1.55555555548f9p-3, ++ .c2 = 0x1.5555555554c22p-5, ++ .c3 = 0x1.111123aaa2fb2p-7, ++ .c4 = 0x1.6c16d77d98e5bp-10, ++ .ln2_hi = 0x1.62e42fefa3800p-1, ++ .ln2_lo = 0x1.ef35793c76730p-45, ++ .inv_ln2 = 0x1.71547652b82fep+0, ++ .shift = 0x1.800000000ffc0p+46, /* 1.5*2^46+1023. */ ++ .thres = SpecialBound, + }; + +-static svfloat64_t NOINLINE +-special_case (svfloat64_t x, svfloat64_t y, svbool_t pg) ++#define SpecialOffset 0x6000000000000000 /* 0x1p513. */ ++/* SpecialBias1 + SpecialBias1 = asuint(1.0). */ ++#define SpecialBias1 0x7000000000000000 /* 0x1p769. */ ++#define SpecialBias2 0x3010000000000000 /* 0x1p-254. */ ++ ++static NOINLINE svfloat64_t ++special_case (svbool_t pg, svfloat64_t y, svfloat64_t s, svfloat64_t p, ++ svfloat64_t n) + { +- return sv_call_f64 (expm1, x, y, pg); ++ /* s=2^n may overflow, break it up into s=s1*s2, ++ such that exp = s + s*y can be computed as s1*(s2+s2*y) ++ and s1*s1 overflows only if n>0. */ ++ ++ /* If n<=0 then set b to 0x6, 0 otherwise. */ ++ svbool_t p_sign = svcmple (pg, n, 0.0); /* n <= 0. */ ++ svuint64_t b ++ = svdup_u64_z (p_sign, SpecialOffset); /* Inactive lanes set to 0. */ ++ ++ /* Set s1 to generate overflow depending on sign of exponent n, ++ ie. s1 = 0x70...0 - b. */ ++ svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1)); ++ /* Offset s to avoid overflow in final result if n is below threshold. ++ ie. s2 = as_u64 (s) - 0x3010...0 + b. */ ++ svfloat64_t s2 = svreinterpret_f64 ( ++ svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b)); ++ ++ /* |n| > 1280 => 2^(n) overflows. */ ++ svbool_t p_cmp = svacgt (pg, n, 1280.0); ++ ++ svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1); ++ svfloat64_t r2 = svmla_x (pg, s2, s2, p); ++ svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1); ++ ++ svbool_t is_safe = svacle (pg, n, 1023); /* Only correct special lanes. */ ++ return svsel (is_safe, y, svsub_x (pg, svsel (p_cmp, r1, r0), 1.0)); + } + +-/* Double-precision vector exp(x) - 1 function. +- The maximum error observed error is 2.18 ULP: +- _ZGVsMxv_expm1(0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2 +- want 0x1.a8b9ea8d66e2p-2. */ ++/* FEXPA based SVE expm1 algorithm. ++ Maximum measured error is 2.81 + 0.5 ULP: ++ _ZGVsMxv_expm1 (0x1.974060e619bfp-3) got 0x1.c290e5858bb53p-3 ++ want 0x1.c290e5858bb5p-3. */ + svfloat64_t SV_NAME_D1 (expm1) (svfloat64_t x, svbool_t pg) + { + const struct data *d = ptr_barrier (&data); + +- /* Large, Nan/Inf. 
*/ +- svbool_t special = svnot_z (pg, svaclt (pg, x, d->special_bound)); +- +- /* Reduce argument to smaller range: +- Let i = round(x / ln2) +- and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. +- exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 +- where 2^i is exact because i is an integer. */ +- svfloat64_t shift = sv_f64 (d->shift); +- svfloat64_t n = svsub_x (pg, svmla_x (pg, shift, x, d->inv_ln2), shift); +- svint64_t i = svcvt_s64_x (pg, n); +- svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi); +- svfloat64_t f = svmls_lane (x, n, ln2, 0); +- f = svmls_lane (f, n, ln2, 1); +- +- /* Approximate expm1(f) using polynomial. +- Taylor expansion for expm1(x) has the form: +- x + ax^2 + bx^3 + cx^4 .... +- So we calculate the polynomial P(f) = a + bf + cf^2 + ... +- and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ +- svfloat64_t f2 = svmul_x (pg, f, f); +- svfloat64_t f4 = svmul_x (pg, f2, f2); +- svfloat64_t f8 = svmul_x (pg, f4, f4); +- svfloat64_t p +- = svmla_x (pg, f, f2, sv_estrin_10_f64_x (pg, f, f2, f4, f8, d->poly)); +- +- /* Assemble the result. +- expm1(x) ~= 2^i * (p + 1) - 1 +- Let t = 2^i. */ +- svint64_t u = svadd_x (pg, svlsl_x (pg, i, 52), ExponentBias); +- svfloat64_t t = svreinterpret_f64 (u); +- +- /* expm1(x) ~= p * t + (t - 1). */ +- svfloat64_t y = svmla_x (pg, svsub_x (pg, t, 1), p, t); ++ svbool_t special = svacgt (pg, x, d->thres); + +- if (__glibc_unlikely (svptest_any (pg, special))) +- return special_case (x, y, special); ++ svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2); ++ svuint64_t u = svreinterpret_u64 (z); ++ svfloat64_t n = svsub_x (pg, z, d->shift); + ++ /* r = x - n * ln2, r is in [-ln2/128, ln2/128]. */ ++ svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi); ++ svfloat64_t r = x; ++ r = svmls_lane (r, n, ln2, 0); ++ r = svmls_lane (r, n, ln2, 1); ++ ++ /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */ ++ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); ++ svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2); ++ ++ svfloat64_t p; ++ svfloat64_t c12 = svmla_lane (sv_f64 (d->c1), r, c24, 0); ++ svfloat64_t c34 = svmla_lane (sv_f64 (d->c3), r, c24, 1); ++ p = svmad_x (pg, c34, r2, c12); ++ p = svmad_x (pg, p, r, sv_f64 (d->c0)); ++ p = svmad_x (pg, p, r2, r); ++ ++ svfloat64_t scale = svexpa (u); ++ svfloat64_t scalem1 = svsub_x (pg, scale, sv_f64 (1.0)); ++ ++ /* We want to construct expm1(x) = (scale - 1) + scale * poly. ++ However, for values of scale close to 1, scale-1 causes large ULP errors ++ due to cancellation. ++ ++ This can be circumvented by using a small lookup for scale-1 ++ when our input is below a certain bound, otherwise we can use FEXPA. ++ ++ This bound is based upon the table size: ++ Bound = (TableSize-1/64) * ln2. ++ The current bound is based upon a table size of 16. */ ++ svbool_t is_small = svaclt (pg, x, FexpaBound); ++ ++ if (svptest_any (pg, is_small)) ++ { ++ /* Index via the input of FEXPA, but we only care about the lower 4 bits. ++ */ ++ svuint64_t base_idx = svand_x (pg, u, 0xf); ++ ++ /* We can use the sign of x as a fifth bit to account for the asymmetry ++ of e^x around 0. */ ++ svuint64_t signBit ++ = svlsl_x (pg, svlsr_x (pg, svreinterpret_u64 (x), 63), 4); ++ svuint64_t idx = svorr_x (pg, base_idx, signBit); ++ ++ /* Lookup values for scale - 1 for small x. */ ++ svfloat64_t lookup = svreinterpret_f64 ( ++ svld1_gather_index (is_small, d->expm1_data, idx)); ++ ++ /* Select the appropriate scale - 1 value based on x. 
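The expm1_data table above stores 2^(i/64) - 1 directly because forming scale - 1 from FEXPA's already-rounded scale leaves too few significant bits once scale is close to 1. Roughly how the 32 entries and the FexpaBound cut-over relate (the real table is generated in arbitrary precision, so the low bits of this double-precision recomputation may differ):

    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      const double ln2 = 0x1.62e42fefa39efp-1;
      for (int i = 0; i < 16; i++)          /* first half: i = 0 .. 15 */
        printf ("%a\n", exp2 (i / 64.0) - 1.0);
      for (int i = 0; i > -16; i--)         /* second half: i = 0 .. -15 */
        printf ("%a\n", exp2 (i / 64.0) - 1.0);
      printf ("FexpaBound ~ %a\n", 15 * ln2 / 64);  /* where the 16-entry half runs out */
      return 0;
    }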
*/ ++ scalem1 = svsel (is_small, lookup, scalem1); ++ } ++ ++ svfloat64_t y = svmla_x (pg, scalem1, scale, p); ++ ++ /* FEXPA returns nan for large inputs so we special case those. */ ++ if (__glibc_unlikely (svptest_any (pg, special))) ++ { ++ /* FEXPA zeroes the sign bit, however the sign is meaningful to the ++ special case function so needs to be copied. ++ e = sign bit of u << 46. */ ++ svuint64_t e = svand_x (pg, svlsl_x (pg, u, 46), 0x8000000000000000); ++ /* Copy sign to s. */ ++ scale = svreinterpret_f64 (svadd_x (pg, e, svreinterpret_u64 (scale))); ++ return special_case (pg, y, scale, p, n); ++ } ++ ++ /* return expm1 = (scale - 1) + (scale * poly). */ + return y; + } +diff --git a/sysdeps/aarch64/fpu/log1p_sve.c b/sysdeps/aarch64/fpu/log1p_sve.c +index 862c13f811..821c0780a9 100644 +--- a/sysdeps/aarch64/fpu/log1p_sve.c ++++ b/sysdeps/aarch64/fpu/log1p_sve.c +@@ -22,19 +22,33 @@ + + static const struct data + { +- double poly[19]; ++ float64_t c0, c2, c4, c6, c8, c10, c12, c14, c16; ++ float64_t c1, c3, c5, c7, c9, c11, c13, c15, c17, c18; + double ln2_hi, ln2_lo; + uint64_t hfrt2_top, onemhfrt2_top, inf, mone; + } data = { + /* Generated using Remez in [ sqrt(2)/2 - 1, sqrt(2) - 1]. Order 20 +- polynomial, however first 2 coefficients are 0 and 1 so are not stored. */ +- .poly = { -0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2, +- 0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3, +- -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4, +- 0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4, +- -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5, +- 0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4, +- -0x1.cfa7385bdb37ep-6, }, ++ polynomial, however first 2 coefficients are 0 and 1 so are not ++ stored. */ ++ .c0 = -0x1.ffffffffffffbp-2, ++ .c1 = 0x1.55555555551a9p-2, ++ .c2 = -0x1.00000000008e3p-2, ++ .c3 = 0x1.9999999a32797p-3, ++ .c4 = -0x1.555555552fecfp-3, ++ .c5 = 0x1.249248e071e5ap-3, ++ .c6 = -0x1.ffffff8bf8482p-4, ++ .c7 = 0x1.c71c8f07da57ap-4, ++ .c8 = -0x1.9999ca4ccb617p-4, ++ .c9 = 0x1.7459ad2e1dfa3p-4, ++ .c10 = -0x1.554d2680a3ff2p-4, ++ .c11 = 0x1.3b4c54d487455p-4, ++ .c12 = -0x1.2548a9ffe80e6p-4, ++ .c13 = 0x1.0f389a24b2e07p-4, ++ .c14 = -0x1.eee4db15db335p-5, ++ .c15 = 0x1.e95b494d4a5ddp-5, ++ .c16 = -0x1.15fdf07cb7c73p-4, ++ .c17 = 0x1.0310b70800fcfp-4, ++ .c18 = -0x1.cfa7385bdb37ep-6, + .ln2_hi = 0x1.62e42fefa3800p-1, + .ln2_lo = 0x1.ef35793c76730p-45, + /* top32(asuint64(sqrt(2)/2)) << 32. */ +@@ -49,7 +63,7 @@ static const struct data + #define BottomMask 0xffffffff + + static svfloat64_t NOINLINE +-special_case (svbool_t special, svfloat64_t x, svfloat64_t y) ++special_case (svfloat64_t x, svfloat64_t y, svbool_t special) + { + return sv_call_f64 (log1p, x, y, special); + } +@@ -91,8 +105,9 @@ svfloat64_t SV_NAME_D1 (log1p) (svfloat64_t x, svbool_t pg) + /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */ + svuint64_t utop + = svadd_x (pg, svand_x (pg, u, 0x000fffff00000000), d->hfrt2_top); +- svuint64_t u_red = svorr_x (pg, utop, svand_x (pg, mi, BottomMask)); +- svfloat64_t f = svsub_x (pg, svreinterpret_f64 (u_red), 1); ++ svuint64_t u_red ++ = svorr_x (pg, utop, svand_x (svptrue_b64 (), mi, BottomMask)); ++ svfloat64_t f = svsub_x (svptrue_b64 (), svreinterpret_f64 (u_red), 1); + + /* Correction term c/m. 
*/ + svfloat64_t cm = svdiv_x (pg, svsub_x (pg, x, svsub_x (pg, m, 1)), m); +@@ -103,18 +118,49 @@ svfloat64_t SV_NAME_D1 (log1p) (svfloat64_t x, svbool_t pg) + Hence approximation has the form f + f^2 * P(f) + where P(x) = C0 + C1*x + C2x^2 + ... + Assembling this all correctly is dealt with at the final step. */ +- svfloat64_t f2 = svmul_x (pg, f, f), f4 = svmul_x (pg, f2, f2), +- f8 = svmul_x (pg, f4, f4), f16 = svmul_x (pg, f8, f8); +- svfloat64_t p = sv_estrin_18_f64_x (pg, f, f2, f4, f8, f16, d->poly); ++ svfloat64_t f2 = svmul_x (svptrue_b64 (), f, f), ++ f4 = svmul_x (svptrue_b64 (), f2, f2), ++ f8 = svmul_x (svptrue_b64 (), f4, f4), ++ f16 = svmul_x (svptrue_b64 (), f8, f8); ++ ++ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1); ++ svfloat64_t c57 = svld1rq (svptrue_b64 (), &d->c5); ++ svfloat64_t c911 = svld1rq (svptrue_b64 (), &d->c9); ++ svfloat64_t c1315 = svld1rq (svptrue_b64 (), &d->c13); ++ svfloat64_t c1718 = svld1rq (svptrue_b64 (), &d->c17); ++ ++ /* Order-18 Estrin scheme. */ ++ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), f, c13, 0); ++ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), f, c13, 1); ++ svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), f, c57, 0); ++ svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), f, c57, 1); ++ ++ svfloat64_t p03 = svmla_x (pg, p01, f2, p23); ++ svfloat64_t p47 = svmla_x (pg, p45, f2, p67); ++ svfloat64_t p07 = svmla_x (pg, p03, f4, p47); ++ ++ svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), f, c911, 0); ++ svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), f, c911, 1); ++ svfloat64_t p1213 = svmla_lane (sv_f64 (d->c12), f, c1315, 0); ++ svfloat64_t p1415 = svmla_lane (sv_f64 (d->c14), f, c1315, 1); ++ ++ svfloat64_t p811 = svmla_x (pg, p89, f2, p1011); ++ svfloat64_t p1215 = svmla_x (pg, p1213, f2, p1415); ++ svfloat64_t p815 = svmla_x (pg, p811, f4, p1215); ++ ++ svfloat64_t p015 = svmla_x (pg, p07, f8, p815); ++ svfloat64_t p1617 = svmla_lane (sv_f64 (d->c16), f, c1718, 0); ++ svfloat64_t p1618 = svmla_lane (p1617, f2, c1718, 1); ++ svfloat64_t p = svmla_x (pg, p015, f16, p1618); + + svfloat64_t ylo = svmla_x (pg, cm, k, d->ln2_lo); + svfloat64_t yhi = svmla_x (pg, f, k, d->ln2_hi); +- svfloat64_t y = svmla_x (pg, svadd_x (pg, ylo, yhi), f2, p); + + if (__glibc_unlikely (svptest_any (pg, special))) +- return special_case (special, x, y); +- +- return y; ++ return special_case ( ++ x, svmla_x (svptrue_b64 (), svadd_x (svptrue_b64 (), ylo, yhi), f2, p), ++ special); ++ return svmla_x (svptrue_b64 (), svadd_x (svptrue_b64 (), ylo, yhi), f2, p); + } + + strong_alias (SV_NAME_D1 (log1p), SV_NAME_D1 (logp1)) diff --git a/sysdeps/aarch64/fpu/pow_sve.c b/sysdeps/aarch64/fpu/pow_sve.c index 42d551ca92..b8c1b39dca 100644 --- a/sysdeps/aarch64/fpu/pow_sve.c @@ -3533,25 +6426,601 @@ return ret; } +diff --git a/sysdeps/aarch64/fpu/sinh_sve.c b/sysdeps/aarch64/fpu/sinh_sve.c +index 963453f812..072ba8fca9 100644 +--- a/sysdeps/aarch64/fpu/sinh_sve.c ++++ b/sysdeps/aarch64/fpu/sinh_sve.c +@@ -18,90 +18,153 @@ + . */ + + #include "sv_math.h" +-#include "poly_sve_f64.h" + + static const struct data + { +- float64_t poly[11]; +- float64_t inv_ln2, m_ln2_hi, m_ln2_lo, shift; + uint64_t halff; +- int64_t onef; +- uint64_t large_bound; ++ double c2, c4; ++ double inv_ln2; ++ double ln2_hi, ln2_lo; ++ double c0, c1, c3; ++ double shift, special_bound, bound; ++ uint64_t expm1_data[20]; + } data = { +- /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. 
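The order-18 evaluation above replaces the generic sv_estrin_18_f64_x helper with an explicit Estrin tree so coefficient pairs can be fed to svmla_lane from quad-word loads. In scalar form, with the coefficients in a plain array c[0..18], the same tree is:

    static double
    estrin18 (double f, const double c[19])
    {
      double f2 = f * f, f4 = f2 * f2, f8 = f4 * f4, f16 = f8 * f8;
      double p01 = c[0] + c[1] * f, p23 = c[2] + c[3] * f;
      double p45 = c[4] + c[5] * f, p67 = c[6] + c[7] * f;
      double p03 = p01 + f2 * p23, p47 = p45 + f2 * p67;
      double p07 = p03 + f4 * p47;
      double p89 = c[8] + c[9] * f, p1011 = c[10] + c[11] * f;
      double p1213 = c[12] + c[13] * f, p1415 = c[14] + c[15] * f;
      double p811 = p89 + f2 * p1011, p1215 = p1213 + f2 * p1415;
      double p815 = p811 + f4 * p1215;
      double p015 = p07 + f8 * p815;
      double p1618 = (c[16] + c[17] * f) + f2 * c[18];
      return p015 + f16 * p1618;   /* log(1 + f) ~= f + f^2 * (this value) */
    }

Pairing by f^2, f^4, f^8 and f^16 keeps the dependency chains short, which suits the vector FMA pipelines; up to rounding it evaluates the same polynomial as a Horner scheme.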
*/ +- .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5, +- 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10, +- 0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16, +- 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22, +- 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, }, +- +- .inv_ln2 = 0x1.71547652b82fep0, +- .m_ln2_hi = -0x1.62e42fefa39efp-1, +- .m_ln2_lo = -0x1.abc9e3b39803fp-56, +- .shift = 0x1.8p52, +- ++ /* Table lookup of 2^(i/64) - 1, for values of i from 0..19. */ ++ .expm1_data = { ++ 0x0000000000000000, 0x3f864d1f3bc03077, 0x3f966c34c5615d0f, 0x3fa0e8a30eb37901, ++ 0x3fa6ab0d9f3121ec, 0x3fac7d865a7a3440, 0x3fb1301d0125b50a, 0x3fb429aaea92ddfb, ++ 0x3fb72b83c7d517ae, 0x3fba35beb6fcb754, 0x3fbd4873168b9aa8, 0x3fc031dc431466b2, ++ 0x3fc1c3d373ab11c3, 0x3fc35a2b2f13e6e9, 0x3fc4f4efa8fef709, 0x3fc6942d3720185a, ++ 0x3fc837f0518db8a9, 0x3fc9e0459320b7fa, 0x3fcb8d39b9d54e55, 0x3fcd3ed9a72cffb7, ++ }, ++ ++ /* Generated using Remez, in [-log(2)/128, log(2)/128]. */ ++ .c0 = 0x1p-1, ++ .c1 = 0x1.55555555548f9p-3, ++ .c2 = 0x1.5555555554c22p-5, ++ .c3 = 0x1.111123aaa2fb2p-7, ++ .c4 = 0x1.6c16d77d98e5bp-10, ++ .ln2_hi = 0x1.62e42fefa3800p-1, ++ .ln2_lo = 0x1.ef35793c76730p-45, ++ .inv_ln2 = 0x1.71547652b82fep+0, ++ .shift = 0x1.800000000ffc0p+46, /* 1.5*2^46+1023. */ + .halff = 0x3fe0000000000000, +- .onef = 0x3ff0000000000000, +- /* 2^9. expm1 helper overflows for large input. */ +- .large_bound = 0x4080000000000000, ++ .special_bound = 0x1.62e37e7d8ba72p+9, /* ln(2^(1024 - 1/128)). */ ++ .bound = 0x1.a56ef8ec924ccp-3 /* 19*ln2/64. */ + }; + ++/* A specialised FEXPA expm1 that is only valid for positive inputs and ++ has no special cases. Based off the full FEXPA expm1 implementated for ++ _ZGVsMxv_expm1, with a slightly modified file to keep sinh under 3.5ULP. */ + static inline svfloat64_t +-expm1_inline (svfloat64_t x, svbool_t pg) ++expm1_inline (svbool_t pg, svfloat64_t x) + { + const struct data *d = ptr_barrier (&data); + +- /* Reduce argument: +- exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 +- where i = round(x / ln2) +- and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */ +- svfloat64_t j +- = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift); +- svint64_t i = svcvt_s64_x (pg, j); +- svfloat64_t f = svmla_x (pg, x, j, d->m_ln2_hi); +- f = svmla_x (pg, f, j, d->m_ln2_lo); +- /* Approximate expm1(f) using polynomial. */ +- svfloat64_t f2 = svmul_x (pg, f, f); +- svfloat64_t f4 = svmul_x (pg, f2, f2); +- svfloat64_t f8 = svmul_x (pg, f4, f4); +- svfloat64_t p +- = svmla_x (pg, f, f2, sv_estrin_10_f64_x (pg, f, f2, f4, f8, d->poly)); +- /* t = 2^i. */ +- svfloat64_t t = svscale_x (pg, sv_f64 (1), i); +- /* expm1(x) ~= p * t + (t - 1). */ +- return svmla_x (pg, svsub_x (pg, t, 1.0), p, t); ++ svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2); ++ svuint64_t u = svreinterpret_u64 (z); ++ svfloat64_t n = svsub_x (pg, z, d->shift); ++ ++ svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi); ++ svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2); ++ ++ svfloat64_t r = x; ++ r = svmls_lane (r, n, ln2, 0); ++ r = svmls_lane (r, n, ln2, 1); ++ ++ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); ++ ++ svfloat64_t p; ++ svfloat64_t c12 = svmla_lane (sv_f64 (d->c1), r, c24, 0); ++ svfloat64_t c34 = svmla_lane (sv_f64 (d->c3), r, c24, 1); ++ p = svmad_x (pg, c34, r2, c12); ++ p = svmad_x (pg, p, r, sv_f64 (d->c0)); ++ p = svmad_x (pg, p, r2, r); ++ ++ svfloat64_t scale = svexpa (u); ++ ++ /* We want to construct expm1(x) = (scale - 1) + scale * poly. 
++ However, for values of scale close to 1, scale-1 causes large ULP errors ++ due to cancellation. ++ ++ This can be circumvented by using a small lookup for scale-1 ++ when our input is below a certain bound, otherwise we can use FEXPA. */ ++ svbool_t is_small = svaclt (pg, x, d->bound); ++ ++ /* Index via the input of FEXPA, but we only care about the lower 5 bits. */ ++ svuint64_t base_idx = svand_x (pg, u, 0x1f); ++ ++ /* Compute scale - 1 from FEXPA, and lookup values where this fails. */ ++ svfloat64_t scalem1_estimate = svsub_x (pg, scale, sv_f64 (1.0)); ++ svuint64_t scalem1_lookup ++ = svld1_gather_index (is_small, d->expm1_data, base_idx); ++ ++ /* Select the appropriate scale - 1 value based on x. */ ++ svfloat64_t scalem1 ++ = svsel (is_small, svreinterpret_f64 (scalem1_lookup), scalem1_estimate); ++ ++ /* return expm1 = scale - 1 + (scale * poly). */ ++ return svmla_x (pg, scalem1, scale, p); + } + ++/* Vectorised special case to handle values past where exp_inline overflows. ++ Halves the input value and uses the identity exp(x) = exp(x/2)^2 to double ++ the valid range of inputs, and returns inf for anything past that. */ + static svfloat64_t NOINLINE +-special_case (svfloat64_t x, svbool_t pg) ++special_case (svbool_t pg, svbool_t special, svfloat64_t ax, ++ svfloat64_t halfsign, const struct data *d) + { +- return sv_call_f64 (sinh, x, x, pg); ++ /* Halves input value, and then check if any cases ++ are still going to overflow. */ ++ ax = svmul_x (special, ax, 0.5); ++ svbool_t is_safe = svaclt (special, ax, d->special_bound); ++ ++ svfloat64_t t = expm1_inline (pg, ax); ++ ++ /* Finish fastpass to compute values for non-special cases. */ ++ svfloat64_t y = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0))); ++ y = svmul_x (pg, y, halfsign); ++ ++ /* Computes special lane, and set remaining overflow lanes to inf. */ ++ svfloat64_t half_special_y = svmul_x (svptrue_b64 (), t, halfsign); ++ svfloat64_t special_y = svmul_x (svptrue_b64 (), half_special_y, t); ++ ++ svuint64_t signed_inf ++ = svorr_x (svptrue_b64 (), svreinterpret_u64 (halfsign), ++ sv_u64 (0x7ff0000000000000)); ++ special_y = svsel (is_safe, special_y, svreinterpret_f64 (signed_inf)); ++ ++ /* Join resulting vectors together and return. */ ++ return svsel (special, special_y, y); + } + +-/* Approximation for SVE double-precision sinh(x) using expm1. +- sinh(x) = (exp(x) - exp(-x)) / 2. +- The greatest observed error is 2.57 ULP: +- _ZGVsMxv_sinh (0x1.a008538399931p-2) got 0x1.ab929fc64bd66p-2 +- want 0x1.ab929fc64bd63p-2. */ ++/* Approximation for SVE double-precision sinh(x) using FEXPA expm1. ++ Uses sinh(x) = e^2x - 1 / 2e^x, rewritten for accuracy. ++ The greatest observed error in the non-special region is 2.63 + 0.5 ULP: ++ _ZGVsMxv_sinh (0x1.b5e0e13ba88aep-2) got 0x1.c3587faf97b0cp-2 ++ want 0x1.c3587faf97b09p-2 ++ ++ The greatest observed error in the special region is 2.65 + 0.5 ULP: ++ _ZGVsMxv_sinh (0x1.633ce847dab1ap+9) got 0x1.fffd30eea0066p+1023 ++ want 0x1.fffd30eea0063p+1023. */ + svfloat64_t SV_NAME_D1 (sinh) (svfloat64_t x, svbool_t pg) + { + const struct data *d = ptr_barrier (&data); + ++ svbool_t special = svacge (pg, x, d->special_bound); + svfloat64_t ax = svabs_x (pg, x); + svuint64_t sign + = sveor_x (pg, svreinterpret_u64 (x), svreinterpret_u64 (ax)); + svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, d->halff)); + +- svbool_t special = svcmpge (pg, svreinterpret_u64 (ax), d->large_bound); +- + /* Fall back to scalar variant for all lanes if any are special. 
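Two identities carry the new sinh: on the fast path, sinh(x) = sign(x)/2 * (t + t/(t+1)) with t = e^|x| - 1, and on the special path e^|x| = (e^(|x|/2))^2 doubles the range over which the expm1 helper stays finite. A scalar sketch of both, with libm's expm1 standing in for the SVE helper (the function name and the use of double-precision libm are illustrative):

    #include <math.h>

    static double
    sinh_sketch (double x)
    {
      double halfsign = copysign (0.5, x);
      double ax = fabs (x);

      if (ax >= 0x1.62e37e7d8ba72p+9)    /* special_bound: expm1 (ax) would overflow */
        {
          /* e^ax = (e^(ax/2))^2: halve the argument and square.  Folding in
             halfsign before the second multiply keeps the intermediate
             representable right up to where sinh itself overflows.  */
          double t = expm1 (ax * 0.5);
          return (t * halfsign) * t;
        }

      /* With t = e^ax - 1:  (e^ax - e^-ax)/2 = ((t+1) - 1/(t+1))/2,
         which rearranges to (t + t/(t+1)) * 0.5 and stays accurate for
         tiny inputs because t -> ax instead of e^ax -> 1.  */
      double t = expm1 (ax);
      return (t + t / (t + 1.0)) * halfsign;
    }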
*/ + if (__glibc_unlikely (svptest_any (pg, special))) +- return special_case (x, pg); ++ return special_case (pg, special, ax, halfsign, d); + + /* Up to the point that expm1 overflows, we can use it to calculate sinh + using a slight rearrangement of the definition of sinh. This allows us to + retain acceptable accuracy for very small inputs. */ +- svfloat64_t t = expm1_inline (ax, pg); ++ svfloat64_t t = expm1_inline (pg, ax); + t = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0))); + return svmul_x (pg, t, halfsign); + } diff --git a/sysdeps/aarch64/fpu/sv_expf_inline.h b/sysdeps/aarch64/fpu/sv_expf_inline.h -index f208d33896..16b81fc738 100644 +index f208d33896..e2d2e906bd 100644 --- a/sysdeps/aarch64/fpu/sv_expf_inline.h +++ b/sysdeps/aarch64/fpu/sv_expf_inline.h -@@ -61,7 +61,7 @@ expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d) +@@ -24,52 +24,41 @@ + + struct sv_expf_data + { +- float c1, c3, inv_ln2; +- float ln2_lo, c0, c2, c4; +- float ln2_hi, shift; ++ float ln2_hi, ln2_lo, c1, null; ++ float inv_ln2, shift; + }; + +-/* Coefficients copied from the polynomial in AdvSIMD variant, reversed for +- compatibility with polynomial helpers. Shift is 1.5*2^17 + 127. */ ++/* Shift is 1.5*2^17 + 127. */ + #define SV_EXPF_DATA \ + { \ +- /* Coefficients copied from the polynomial in AdvSIMD variant. */ \ +- .c0 = 0x1.ffffecp-1f, .c1 = 0x1.fffdb6p-2f, .c2 = 0x1.555e66p-3f, \ +- .c3 = 0x1.573e2ep-5f, .c4 = 0x1.0e4020p-7f, .inv_ln2 = 0x1.715476p+0f, \ +- .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \ +- .shift = 0x1.803f8p17f, \ ++ .c1 = 0.5f, .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \ ++ .ln2_lo = 0x1.7f7d1cp-20f, .shift = 0x1.803f8p17f, \ + } + +-#define C(i) sv_f32 (d->poly[i]) +- + static inline svfloat32_t + expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d) + { + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ + +- svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->ln2_lo); ++ svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->ln2_hi); + + /* n = round(x/(ln2/N)). */ + svfloat32_t z = svmad_x (pg, sv_f32 (d->inv_ln2), x, d->shift); + svfloat32_t n = svsub_x (pg, z, d->shift); + + /* r = x - n*ln2/N. */ +- svfloat32_t r = svmsb_x (pg, sv_f32 (d->ln2_hi), n, x); ++ svfloat32_t r = x; + r = svmls_lane (r, n, lane_consts, 0); ++ r = svmls_lane (r, n, lane_consts, 1); + /* scale = 2^(n/N). */ svfloat32_t scale = svexpa (svreinterpret_u32 (z)); - /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */ -+ /* poly(r) = exp(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4 + C4 r^5. */ - svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2); - svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3); +- svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2); +- svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3); ++ /* poly(r) = exp(r) - 1 ~= r + 0.5 r^2. 
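With FEXPA absorbing the 2^(i/64) factor, the reduced argument in expf_inline satisfies |r| <= ln2/128, which is why the polynomial can stop at r + r^2/2: the first neglected term is already only a fraction of a float ulp at 1.0. A quick check of that bound (the patch quotes the measured worst-case ULP error; this is only the motivating estimate):

    #include <float.h>
    #include <stdio.h>

    int
    main (void)
    {
      double ln2 = 0x1.62e42fefa39efp-1;
      double rmax = ln2 / 128.0;                /* half-width of the reduction interval */
      double cubic = rmax * rmax * rmax / 6.0;  /* first neglected Taylor term, r^3/6 */
      printf ("max |r| = %a\n", rmax);
      printf ("r^3/6   = %a  (float ulp at 1.0 is %a)\n", cubic, (double) FLT_EPSILON);
      return 0;
    }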
*/ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); -@@ -71,5 +71,4 @@ expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d) +- svfloat32_t p14 = svmla_x (pg, p12, p34, r2); +- svfloat32_t p0 = svmul_lane (r, lane_consts, 1); +- svfloat32_t poly = svmla_x (pg, p0, r2, p14); ++ svfloat32_t poly = svmla_lane (r, r2, lane_consts, 2); return svmla_x (pg, scale, scale, poly); } - #endif +diff --git a/sysdeps/aarch64/fpu/sv_log1p_inline.h b/sysdeps/aarch64/fpu/sv_log1p_inline.h +index 71f88e02de..c2b196f35b 100644 +--- a/sysdeps/aarch64/fpu/sv_log1p_inline.h ++++ b/sysdeps/aarch64/fpu/sv_log1p_inline.h +@@ -21,11 +21,12 @@ + #define AARCH64_FPU_SV_LOG1P_INLINE_H + + #include "sv_math.h" +-#include "poly_sve_f64.h" + + static const struct sv_log1p_data + { +- double poly[19], ln2[2]; ++ double c0, c2, c4, c6, c8, c10, c12, c14, c16; ++ double c1, c3, c5, c7, c9, c11, c13, c15, c17, c18; ++ double ln2_lo, ln2_hi; + uint64_t hf_rt2_top; + uint64_t one_m_hf_rt2_top; + uint32_t bottom_mask; +@@ -33,15 +34,30 @@ static const struct sv_log1p_data + } sv_log1p_data = { + /* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. + */ +- .poly = { -0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2, +- 0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3, +- -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4, +- 0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4, +- -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5, +- 0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4, +- -0x1.cfa7385bdb37ep-6 }, +- .ln2 = { 0x1.62e42fefa3800p-1, 0x1.ef35793c76730p-45 }, ++ .c0 = -0x1.ffffffffffffbp-2, ++ .c1 = 0x1.55555555551a9p-2, ++ .c2 = -0x1.00000000008e3p-2, ++ .c3 = 0x1.9999999a32797p-3, ++ .c4 = -0x1.555555552fecfp-3, ++ .c5 = 0x1.249248e071e5ap-3, ++ .c6 = -0x1.ffffff8bf8482p-4, ++ .c7 = 0x1.c71c8f07da57ap-4, ++ .c8 = -0x1.9999ca4ccb617p-4, ++ .c9 = 0x1.7459ad2e1dfa3p-4, ++ .c10 = -0x1.554d2680a3ff2p-4, ++ .c11 = 0x1.3b4c54d487455p-4, ++ .c12 = -0x1.2548a9ffe80e6p-4, ++ .c13 = 0x1.0f389a24b2e07p-4, ++ .c14 = -0x1.eee4db15db335p-5, ++ .c15 = 0x1.e95b494d4a5ddp-5, ++ .c16 = -0x1.15fdf07cb7c73p-4, ++ .c17 = 0x1.0310b70800fcfp-4, ++ .c18 = -0x1.cfa7385bdb37ep-6, ++ .ln2_lo = 0x1.62e42fefa3800p-1, ++ .ln2_hi = 0x1.ef35793c76730p-45, ++ /* top32(asuint64(sqrt(2)/2)) << 32. */ + .hf_rt2_top = 0x3fe6a09e00000000, ++ /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) << 32. */ + .one_m_hf_rt2_top = 0x00095f6200000000, + .bottom_mask = 0xffffffff, + .one_top = 0x3ff +@@ -51,14 +67,14 @@ static inline svfloat64_t + sv_log1p_inline (svfloat64_t x, const svbool_t pg) + { + /* Helper for calculating log(x + 1). Adapted from v_log1p_inline.h, which +- differs from v_log1p_2u5.c by: ++ differs from advsimd/log1p.c by: + - No special-case handling - this should be dealt with by the caller. + - Pairwise Horner polynomial evaluation for improved accuracy. + - Optionally simulate the shortcut for k=0, used in the scalar routine, + using svsel, for improved accuracy when the argument to log1p is close + to 0. This feature is enabled by defining WANT_SV_LOG1P_K0_SHORTCUT as 1 + in the source of the caller before including this file. +- See sv_log1p_2u1.c for details of the algorithm. */ ++ See sve/log1p.c for details of the algorithm. 
*/ + const struct sv_log1p_data *d = ptr_barrier (&sv_log1p_data); + svfloat64_t m = svadd_x (pg, x, 1); + svuint64_t mi = svreinterpret_u64 (m); +@@ -79,7 +95,7 @@ sv_log1p_inline (svfloat64_t x, const svbool_t pg) + svfloat64_t cm; + + #ifndef WANT_SV_LOG1P_K0_SHORTCUT +-#error \ ++#error \ + "Cannot use sv_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0" + #elif WANT_SV_LOG1P_K0_SHORTCUT + /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is +@@ -96,14 +112,46 @@ sv_log1p_inline (svfloat64_t x, const svbool_t pg) + #endif + + /* Approximate log1p(f) on the reduced input using a polynomial. */ +- svfloat64_t f2 = svmul_x (pg, f, f); +- svfloat64_t p = sv_pw_horner_18_f64_x (pg, f, f2, d->poly); ++ svfloat64_t f2 = svmul_x (svptrue_b64 (), f, f), ++ f4 = svmul_x (svptrue_b64 (), f2, f2), ++ f8 = svmul_x (svptrue_b64 (), f4, f4), ++ f16 = svmul_x (svptrue_b64 (), f8, f8); ++ ++ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1); ++ svfloat64_t c57 = svld1rq (svptrue_b64 (), &d->c5); ++ svfloat64_t c911 = svld1rq (svptrue_b64 (), &d->c9); ++ svfloat64_t c1315 = svld1rq (svptrue_b64 (), &d->c13); ++ svfloat64_t c1718 = svld1rq (svptrue_b64 (), &d->c17); ++ ++ /* Order-18 Estrin scheme. */ ++ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), f, c13, 0); ++ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), f, c13, 1); ++ svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), f, c57, 0); ++ svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), f, c57, 1); ++ ++ svfloat64_t p03 = svmla_x (pg, p01, f2, p23); ++ svfloat64_t p47 = svmla_x (pg, p45, f2, p67); ++ svfloat64_t p07 = svmla_x (pg, p03, f4, p47); ++ ++ svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), f, c911, 0); ++ svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), f, c911, 1); ++ svfloat64_t p1213 = svmla_lane (sv_f64 (d->c12), f, c1315, 0); ++ svfloat64_t p1415 = svmla_lane (sv_f64 (d->c14), f, c1315, 1); ++ ++ svfloat64_t p811 = svmla_x (pg, p89, f2, p1011); ++ svfloat64_t p1215 = svmla_x (pg, p1213, f2, p1415); ++ svfloat64_t p815 = svmla_x (pg, p811, f4, p1215); ++ ++ svfloat64_t p015 = svmla_x (pg, p07, f8, p815); ++ svfloat64_t p1617 = svmla_lane (sv_f64 (d->c16), f, c1718, 0); ++ svfloat64_t p1618 = svmla_lane (p1617, f2, c1718, 1); ++ svfloat64_t p = svmla_x (pg, p015, f16, p1618); + + /* Assemble log1p(x) = k * log2 + log1p(f) + c/m. */ +- svfloat64_t ylo = svmla_x (pg, cm, k, d->ln2[0]); +- svfloat64_t yhi = svmla_x (pg, f, k, d->ln2[1]); ++ svfloat64_t ln2_lo_hi = svld1rq (svptrue_b64 (), &d->ln2_lo); ++ svfloat64_t ylo = svmla_lane (cm, k, ln2_lo_hi, 0); ++ svfloat64_t yhi = svmla_lane (f, k, ln2_lo_hi, 1); + +- return svmla_x (pg, svadd_x (pg, ylo, yhi), f2, p); ++ return svmad_x (pg, p, f2, svadd_x (pg, ylo, yhi)); + } +- + #endif +diff --git a/sysdeps/aarch64/fpu/tanh_sve.c b/sysdeps/aarch64/fpu/tanh_sve.c +index 789cc6854f..5869419010 100644 +--- a/sysdeps/aarch64/fpu/tanh_sve.c ++++ b/sysdeps/aarch64/fpu/tanh_sve.c +@@ -18,83 +18,117 @@ + . */ + + #include "sv_math.h" +-#include "poly_sve_f64.h" + + static const struct data + { +- float64_t poly[11]; +- float64_t inv_ln2, ln2_hi, ln2_lo, shift; +- uint64_t thresh, tiny_bound; ++ double ln2_hi, ln2_lo; ++ double c2, c4; ++ double c0, c1, c3; ++ double two_over_ln2, shift; ++ uint64_t tiny_bound; ++ double large_bound, fexpa_bound; ++ uint64_t e2xm1_data[20]; + } data = { +- /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. 
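The correction term and the k = 0 shortcut above both deal with the rounding of m = 1 + x: c = x - (m - 1) is exactly the part of x lost in that addition (for |x| < 1 this is the standard Fast2Sum error term), and log1p(x) = log(m) + log1p(c/m) ~= log(m) + c/m. A small scalar demonstration of how much that correction is worth (the input is arbitrary, picked so that 1 + x visibly drops bits):

    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      double x = 0x1.0000001p-30 + 0x1p-60;  /* low bits disappear in 1 + x */
      double m = 1.0 + x;
      double c = x - (m - 1.0);              /* exactly the rounding error of 1 + x */
      printf ("log (1 + x), naive = %a\n", log (m));
      printf ("with c/m term      = %a\n", log (m) + c / m);
      printf ("log1p (x)          = %a\n", log1p (x));
      return 0;
    }

The k = 0 shortcut goes the other way: when no scaling by 2^k happened, keeping f = x and dropping the correction avoids introducing the 1 + x rounding in the first place.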
*/ +- .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5, +- 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10, +- 0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16, +- 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22, +- 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, }, +- +- .inv_ln2 = 0x1.71547652b82fep0, +- .ln2_hi = -0x1.62e42fefa39efp-1, +- .ln2_lo = -0x1.abc9e3b39803fp-56, +- .shift = 0x1.8p52, +- ++ /* Generated using Remez, in [-log(2)/128, log(2)/128]. */ ++ .c0 = 0x1p-1, ++ .c1 = 0x1.55555555548f9p-3, ++ .c2 = 0x1.5555555554c22p-5, ++ .c3 = 0x1.111123aaa2fb2p-7, ++ .c4 = 0x1.6c16d77d98e5bp-10, ++ .ln2_hi = 0x1.62e42fefa3800p-1, ++ .ln2_lo = 0x1.ef35793c76730p-45, ++ .two_over_ln2 = 0x1.71547652b82fep+1, ++ .shift = 0x1.800000000ffc0p+46, /* 1.5*2^46+1023. */ + .tiny_bound = 0x3e40000000000000, /* asuint64 (0x1p-27). */ +- /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */ +- .thresh = 0x01f241bf835f9d5f, ++ .large_bound = 0x1.30fc1931f09cap+4, /* arctanh(1 - 2^-54). */ ++ .fexpa_bound = 0x1.a56ef8ec924ccp-4, /* 19/64 * ln2/2. */ ++ /* Table lookup of 2^(i/64) - 1, for values of i from 0..19. */ ++ .e2xm1_data = { ++ 0x0000000000000000, 0x3f864d1f3bc03077, 0x3f966c34c5615d0f, 0x3fa0e8a30eb37901, ++ 0x3fa6ab0d9f3121ec, 0x3fac7d865a7a3440, 0x3fb1301d0125b50a, 0x3fb429aaea92ddfb, ++ 0x3fb72b83c7d517ae, 0x3fba35beb6fcb754, 0x3fbd4873168b9aa8, 0x3fc031dc431466b2, ++ 0x3fc1c3d373ab11c3, 0x3fc35a2b2f13e6e9, 0x3fc4f4efa8fef709, 0x3fc6942d3720185a, ++ 0x3fc837f0518db8a9, 0x3fc9e0459320b7fa, 0x3fcb8d39b9d54e55, 0x3fcd3ed9a72cffb7, ++ }, + }; + ++/* An expm1 inspired, FEXPA based helper function that returns an ++ accurate estimate for e^2x - 1. With no special case or support for ++ negative inputs of x. */ + static inline svfloat64_t +-expm1_inline (svfloat64_t x, const svbool_t pg, const struct data *d) +-{ +- /* Helper routine for calculating exp(x) - 1. Vector port of the helper from +- the scalar variant of tanh. */ +- +- /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ +- svfloat64_t j +- = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift); +- svint64_t i = svcvt_s64_x (pg, j); +- svfloat64_t f = svmla_x (pg, x, j, d->ln2_hi); +- f = svmla_x (pg, f, j, d->ln2_lo); +- +- /* Approximate expm1(f) using polynomial. */ +- svfloat64_t f2 = svmul_x (pg, f, f); +- svfloat64_t f4 = svmul_x (pg, f2, f2); +- svfloat64_t p = svmla_x ( +- pg, f, f2, +- sv_estrin_10_f64_x (pg, f, f2, f4, svmul_x (pg, f4, f4), d->poly)); +- +- /* t = 2 ^ i. */ +- svfloat64_t t = svscale_x (pg, sv_f64 (1), i); +- /* expm1(x) = p * t + (t - 1). */ +- return svmla_x (pg, svsub_x (pg, t, 1), p, t); +-} +- +-static svfloat64_t NOINLINE +-special_case (svfloat64_t x, svfloat64_t y, svbool_t special) ++e2xm1_inline (const svbool_t pg, svfloat64_t x, const struct data *d) + { +- return sv_call_f64 (tanh, x, y, special); ++ svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->two_over_ln2); ++ svuint64_t u = svreinterpret_u64 (z); ++ svfloat64_t n = svsub_x (pg, z, d->shift); ++ ++ /* r = x - n * ln2/2, r is in [-ln2/(2N), ln2/(2N)]. */ ++ svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi); ++ svfloat64_t r = svadd_x (pg, x, x); ++ r = svmls_lane (r, n, ln2, 0); ++ r = svmls_lane (r, n, ln2, 1); ++ ++ /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. 
*/ ++ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); ++ svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2); ++ ++ svfloat64_t p; ++ svfloat64_t c12 = svmla_lane (sv_f64 (d->c1), r, c24, 0); ++ svfloat64_t c34 = svmla_lane (sv_f64 (d->c3), r, c24, 1); ++ p = svmad_x (pg, c34, r2, c12); ++ p = svmad_x (pg, p, r, sv_f64 (d->c0)); ++ p = svmad_x (pg, p, r2, r); ++ ++ svfloat64_t scale = svexpa (u); ++ ++ /* We want to construct e2xm1(x) = (scale - 1) + scale * poly. ++ However, for values of scale close to 1, scale-1 causes large ULP errors ++ due to cancellation. ++ ++ This can be circumvented by using a small lookup for scale-1 ++ when our input is below a certain bound, otherwise we can use FEXPA. */ ++ svbool_t is_small = svaclt (pg, x, d->fexpa_bound); ++ ++ /* Index via the input of FEXPA, but we only care about the lower 5 bits. */ ++ svuint64_t base_idx = svand_x (pg, u, 0x1f); ++ ++ /* Compute scale - 1 from FEXPA, and lookup values where this fails. */ ++ svfloat64_t scalem1_estimate = svsub_x (pg, scale, sv_f64 (1.0)); ++ svuint64_t scalem1_lookup ++ = svld1_gather_index (is_small, d->e2xm1_data, base_idx); ++ ++ /* Select the appropriate scale - 1 value based on x. */ ++ svfloat64_t scalem1 ++ = svsel (is_small, svreinterpret_f64 (scalem1_lookup), scalem1_estimate); ++ return svmla_x (pg, scalem1, scale, p); + } + +-/* SVE approximation for double-precision tanh(x), using a simplified +- version of expm1. The greatest observed error is 2.77 ULP: +- _ZGVsMxv_tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3 +- want -0x1.bd6a21a163624p-3. */ ++/* SVE approximation for double-precision tanh(x), using a modified version of ++ FEXPA expm1 to calculate e^2x - 1. ++ The greatest observed error is 2.79 + 0.5 ULP: ++ _ZGVsMxv_tanh (0x1.fff868eb3c223p-9) got 0x1.fff7be486cae6p-9 ++ want 0x1.fff7be486cae9p-9. */ + svfloat64_t SV_NAME_D1 (tanh) (svfloat64_t x, svbool_t pg) + { + const struct data *d = ptr_barrier (&data); + +- svuint64_t ia = svreinterpret_u64 (svabs_x (pg, x)); ++ svbool_t large = svacge (pg, x, d->large_bound); + +- /* Trigger special-cases for tiny, boring and infinity/NaN. */ +- svbool_t special = svcmpgt (pg, svsub_x (pg, ia, d->tiny_bound), d->thresh); ++ /* We can use tanh(x) = (e^2x - 1) / (e^2x + 1) to approximate tanh. ++ As an additional optimisation, we can ensure more accurate values of e^x ++ by only using positive inputs. So we calculate tanh(|x|), and restore the ++ sign of the input before returning. */ ++ svfloat64_t ax = svabs_x (pg, x); ++ svuint64_t sign_bit ++ = sveor_x (pg, svreinterpret_u64 (x), svreinterpret_u64 (ax)); + +- svfloat64_t u = svadd_x (pg, x, x); ++ svfloat64_t p = e2xm1_inline (pg, ax, d); ++ svfloat64_t q = svadd_x (pg, p, 2); + +- /* tanh(x) = (e^2x - 1) / (e^2x + 1). */ +- svfloat64_t q = expm1_inline (u, pg, d); +- svfloat64_t qp2 = svadd_x (pg, q, 2); ++ /* For sufficiently high inputs, the result of tanh(|x|) is 1 when correctly ++ rounded, at this point we can return 1 directly, with sign correction. ++ This will also act as a guard against our approximation overflowing. 
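The rewritten tanh reduces to a single expm1-style evaluation through tanh(|x|) = (e^(2|x|) - 1) / (e^(2|x|) + 1) = p / (p + 2) with p = e^(2|x|) - 1, working on |x| for accuracy and pasting the sign back at the end; beyond large_bound = arctanh(1 - 2^-54) the correctly rounded result is exactly +/-1. The scalar equivalent, with libm expm1 in place of the FEXPA helper (illustrative only):

    #include <math.h>

    static double
    tanh_sketch (double x)
    {
      double ax = fabs (x);
      if (ax >= 0x1.30fc1931f09cap+4)   /* large_bound from the data block */
        return copysign (1.0, x);
      double p = expm1 (2.0 * ax);      /* e^(2|x|) - 1, finite below the bound */
      return copysign (p / (p + 2.0), x);
    }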
*/ ++ svfloat64_t y = svsel (large, sv_f64 (1.0), svdiv_x (pg, p, q)); + +- if (__glibc_unlikely (svptest_any (pg, special))) +- return special_case (x, svdiv_x (pg, q, qp2), special); +- return svdiv_x (pg, q, qp2); ++ return svreinterpret_f64 (svorr_x (pg, sign_bit, svreinterpret_u64 (y))); + } diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile index 772b16a358..1c3c392513 100644 --- a/sysdeps/aarch64/multiarch/Makefile @@ -3727,6 +7196,27 @@ + +END (__memset_sve_zva64) +#endif +diff --git a/sysdeps/arm/find_exidx.c b/sysdeps/arm/find_exidx.c +index 60021a072c..468e016214 100644 +--- a/sysdeps/arm/find_exidx.c ++++ b/sysdeps/arm/find_exidx.c +@@ -15,6 +15,7 @@ + License along with the GNU C Library. If not, see + . */ + ++#include + #include + + /* Find the exception index table containing PC. */ +@@ -23,7 +24,7 @@ _Unwind_Ptr + __gnu_Unwind_Find_exidx (_Unwind_Ptr pc, int * pcount) + { + struct dl_find_object data; +- if (__dl_find_object ((void *) pc, &data) < 0) ++ if (GLRO(dl_find_object) ((void *) pc, &data) < 0) + return 0; + *pcount = data.dlfo_eh_count; + return (_Unwind_Ptr) data.dlfo_eh_frame; diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h index e871f27ff2..ddb34a1588 100644 --- a/sysdeps/generic/ldsodefs.h @@ -5474,6 +8964,18 @@ ifeq ($(subdir),stdlib) gen-as-const-headers += ucontext_i.sym +diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c +index 6d63c8a9ec..1acc82d077 100644 +--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c ++++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c +@@ -23,6 +23,7 @@ + #include + #include + #include ++#include + + #define DCZID_DZP_MASK (1 << 4) + #define DCZID_BS_MASK (0xf) diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-aarch64-pkey.c b/sysdeps/unix/sysv/linux/aarch64/tst-aarch64-pkey.c index 3ff33ef72a..c884efc3b4 100644 --- a/sysdeps/unix/sysv/linux/aarch64/tst-aarch64-pkey.c
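The find_exidx.c hunk above now reaches the lookup through the GLRO(dl_find_object) hook instead of calling __dl_find_object directly; the data it consumes is the same struct dl_find_object that glibc has exposed publicly since 2.35. A minimal standalone use of that public interface, for orientation only (dlfo_eh_count is specific to ARM-style exidx targets, so it is not touched here):

    #define _GNU_SOURCE
    #include <dlfcn.h>
    #include <stdio.h>

    int
    main (void)
    {
      struct dl_find_object dlfo;
      /* Look up the object that contains this function's code address.  */
      if (_dl_find_object ((void *) &main, &dlfo) != 0)
        return 1;
      printf ("mapped %p..%p, EH data at %p\n",
              dlfo.dlfo_map_start, dlfo.dlfo_map_end, dlfo.dlfo_eh_frame);
      return 0;
    }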