加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
LoongArch-Add-ifunc-support-for-strcpy-stpcpy-aligne.patch 32.06 KB
一键复制 编辑 原始数据 按行查看 历史
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099
From 351086591d938aaf884d475261ae96ec5da00384 Mon Sep 17 00:00:00 2001
From: dengjianbo <dengjianbo@loongson.cn>
Date: Wed, 13 Sep 2023 15:34:59 +0800
Subject: [PATCH 22/29] LoongArch: Add ifunc support for strcpy,
stpcpy{aligned, unaligned, lsx, lasx}
According to the glibc strcpy and stpcpy microbenchmark results (with the
benchmark changed to use generic_strcpy and generic_stpcpy instead of
strlen + memcpy), this implementation reduces the runtime compared with the
generic version as follows:
Name Percent of runtime reduced
strcpy-aligned 8%-45%
strcpy-unaligned 8%-48%; compared with the aligned version, the unaligned
version takes fewer instructions to copy a data tail
shorter than 8 bytes. It also performs better
when src and dest cannot both be aligned to 8 bytes
strcpy-lsx 20%-80%
strcpy-lasx 15%-86%
stpcpy-aligned 6%-43%
stpcpy-unaligned 8%-48%
stpcpy-lsx 10%-80%
stpcpy-lasx 10%-87%
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
sysdeps/loongarch/lp64/multiarch/Makefile | 8 +
.../lp64/multiarch/ifunc-impl-list.c | 18 ++
.../loongarch/lp64/multiarch/stpcpy-aligned.S | 27 +++
.../loongarch/lp64/multiarch/stpcpy-lasx.S | 22 ++
sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S | 22 ++
.../lp64/multiarch/stpcpy-unaligned.S | 22 ++
sysdeps/loongarch/lp64/multiarch/stpcpy.c | 42 ++++
.../loongarch/lp64/multiarch/strcpy-aligned.S | 202 ++++++++++++++++
.../loongarch/lp64/multiarch/strcpy-lasx.S | 215 ++++++++++++++++++
sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S | 212 +++++++++++++++++
.../lp64/multiarch/strcpy-unaligned.S | 138 +++++++++++
sysdeps/loongarch/lp64/multiarch/strcpy.c | 35 +++
12 files changed, 963 insertions(+)
create mode 100644 sysdeps/loongarch/lp64/multiarch/stpcpy-aligned.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/stpcpy-lasx.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/stpcpy-unaligned.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/stpcpy.c
create mode 100644 sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/strcpy-lasx.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/strcpy.c
diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
index 360a6718..39550bea 100644
--- a/sysdeps/loongarch/lp64/multiarch/Makefile
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
@@ -16,6 +16,14 @@ sysdep_routines += \
strcmp-lsx \
strncmp-aligned \
strncmp-lsx \
+ strcpy-aligned \
+ strcpy-unaligned \
+ strcpy-lsx \
+ strcpy-lasx \
+ stpcpy-aligned \
+ stpcpy-unaligned \
+ stpcpy-lsx \
+ stpcpy-lasx \
memcpy-aligned \
memcpy-unaligned \
memmove-unaligned \
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
index e397d58c..39a14f1d 100644
--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
@@ -76,6 +76,24 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_aligned)
)
+ IFUNC_IMPL (i, name, strcpy,
+#if !defined __loongarch_soft_float
+ IFUNC_IMPL_ADD (array, i, strcpy, SUPPORT_LASX, __strcpy_lasx)
+ IFUNC_IMPL_ADD (array, i, strcpy, SUPPORT_LSX, __strcpy_lsx)
+#endif
+ IFUNC_IMPL_ADD (array, i, strcpy, SUPPORT_UAL, __strcpy_unaligned)
+ IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_aligned)
+ )
+
+ IFUNC_IMPL (i, name, stpcpy,
+#if !defined __loongarch_soft_float
+ IFUNC_IMPL_ADD (array, i, stpcpy, SUPPORT_LASX, __stpcpy_lasx)
+ IFUNC_IMPL_ADD (array, i, stpcpy, SUPPORT_LSX, __stpcpy_lsx)
+#endif
+ IFUNC_IMPL_ADD (array, i, stpcpy, SUPPORT_UAL, __stpcpy_unaligned)
+ IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_aligned)
+ )
+
IFUNC_IMPL (i, name, memcpy,
#if !defined __loongarch_soft_float
IFUNC_IMPL_ADD (array, i, memcpy, SUPPORT_LASX, __memcpy_lasx)
diff --git a/sysdeps/loongarch/lp64/multiarch/stpcpy-aligned.S b/sysdeps/loongarch/lp64/multiarch/stpcpy-aligned.S
new file mode 100644
index 00000000..1f763db6
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/stpcpy-aligned.S
@@ -0,0 +1,27 @@
+/* stpcpy-aligned implementation is in strcpy-aligned.S.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc) /* Pick the symbol name under which the stpcpy variant of strcpy-aligned.S is emitted. */
+# define STPCPY __stpcpy_aligned
+#else /* Not building for libc: use the plain __stpcpy name. */
+# define STPCPY __stpcpy
+#endif
+
+#define USE_AS_STPCPY /* Makes strcpy-aligned.S set a0 to the address of the copied NUL before returning. */
+#define STRCPY STPCPY /* strcpy-aligned.S names its entry point STRCPY. */
+#include "strcpy-aligned.S" /* The shared strcpy/stpcpy body. */
diff --git a/sysdeps/loongarch/lp64/multiarch/stpcpy-lasx.S b/sysdeps/loongarch/lp64/multiarch/stpcpy-lasx.S
new file mode 100644
index 00000000..13d6c953
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/stpcpy-lasx.S
@@ -0,0 +1,22 @@
+/* stpcpy-lasx implementation is in strcpy-lasx.S.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define STPCPY __stpcpy_lasx /* Symbol name of the LASX stpcpy variant. */
+#define USE_AS_STPCPY /* Makes strcpy-lasx.S alias dstend to a0, so the address of the copied NUL is returned. */
+#define STRCPY STPCPY /* strcpy-lasx.S names its entry point STRCPY. */
+#include "strcpy-lasx.S" /* The shared strcpy/stpcpy body. */
diff --git a/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S b/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S
new file mode 100644
index 00000000..e0f17ab5
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S
@@ -0,0 +1,22 @@
+/* stpcpy-lsx implementation is in strcpy-lsx.S.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define STPCPY __stpcpy_lsx /* Symbol name of the LSX stpcpy variant. */
+#define USE_AS_STPCPY /* Enables the blocks in strcpy-lsx.S that set a0 to the address of the copied NUL. */
+#define STRCPY STPCPY /* strcpy-lsx.S names its entry point STRCPY. */
+#include "strcpy-lsx.S" /* The shared strcpy/stpcpy body. */
diff --git a/sysdeps/loongarch/lp64/multiarch/stpcpy-unaligned.S b/sysdeps/loongarch/lp64/multiarch/stpcpy-unaligned.S
new file mode 100644
index 00000000..cc2f9712
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/stpcpy-unaligned.S
@@ -0,0 +1,22 @@
+/* stpcpy-unaligned implementation is in strcpy-unaligned.S.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define STPCPY __stpcpy_unaligned /* Symbol name of the unaligned-access stpcpy variant. */
+#define USE_AS_STPCPY /* Makes strcpy-unaligned.S alias dstend to a0, so the address of the copied NUL is returned. */
+#define STRCPY STPCPY /* strcpy-unaligned.S names its entry point STRCPY. */
+#include "strcpy-unaligned.S" /* The shared strcpy/stpcpy body. */
diff --git a/sysdeps/loongarch/lp64/multiarch/stpcpy.c b/sysdeps/loongarch/lp64/multiarch/stpcpy.c
new file mode 100644
index 00000000..d4860d7a
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/stpcpy.c
@@ -0,0 +1,42 @@
+/* Multiple versions of stpcpy.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc) /* Everything below is compiled only for libc itself. */
+# define stpcpy __redirect_stpcpy /* While <string.h> is read, its stpcpy declaration is renamed to __redirect_stpcpy. */
+# define __stpcpy __redirect___stpcpy /* Likewise for the __stpcpy declaration. */
+# define NO_MEMPCPY_STPCPY_REDIRECT /* Consumed by headers outside this file; presumably disables a header-level stpcpy redirect - TODO confirm. */
+# define __NO_STRING_INLINES /* Consumed by headers outside this file; presumably disables inline string helpers - TODO confirm. */
+# include <string.h>
+# undef stpcpy /* Free the real names again for the definitions below. */
+# undef __stpcpy
+
+# define SYMBOL_NAME stpcpy /* Read by ifunc-lasx.h; presumably used to build the candidate symbol names - verify there. */
+# include "ifunc-lasx.h" /* Not visible here; presumably supplies the IFUNC_SELECTOR used below. */
+
+libc_ifunc_redirected (__redirect_stpcpy, __stpcpy, IFUNC_SELECTOR ()); /* Defines __stpcpy via the selector result, typed like __redirect_stpcpy (macro defined elsewhere). */
+
+weak_alias (__stpcpy, stpcpy) /* Also provide the public stpcpy name for the same symbol. */
+# ifdef SHARED /* Shared builds additionally declare hidden __GI_* names for the two symbols. */
+__hidden_ver1 (__stpcpy, __GI___stpcpy, __redirect___stpcpy)
+ __attribute__ ((visibility ("hidden"))) __attribute_copy__ (stpcpy);
+__hidden_ver1 (stpcpy, __GI_stpcpy, __redirect_stpcpy)
+ __attribute__ ((visibility ("hidden"))) __attribute_copy__ (stpcpy);
+# endif
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S b/sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S
new file mode 100644
index 00000000..4ed539fd
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S
@@ -0,0 +1,202 @@
+/* Optimized strcpy stpcpy aligned implementation using basic LoongArch
+ instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) /* Choose the entry-point name unless an including file (e.g. stpcpy-aligned.S) already set STRCPY. */
+# ifndef STRCPY
+# define STRCPY __strcpy_aligned
+# endif
+#else
+# ifndef STRCPY
+# define STRCPY strcpy
+# endif
+#endif
+
+LEAF(STRCPY, 6) /* Copy the NUL-terminated string at a1 to a0 using only aligned 8-byte loads/stores; a0 is rewritten only under USE_AS_STPCPY. */
+ andi a3, a0, 0x7 /* a3 = dest & 7 */
+ move a2, a0 /* a2 = running destination pointer */
+ beqz a3, L(dest_align) /* dest already 8-byte aligned */
+ sub.d a5, a1, a3
+ addi.d a5, a5, 8 /* a5 = src + (8 - a3): src value at which dest becomes 8-byte aligned */
+
+L(make_dest_align): /* Byte-by-byte copy until dest is aligned or the NUL has been copied. */
+ ld.b t0, a1, 0
+ addi.d a1, a1, 1
+ st.b t0, a2, 0
+ addi.d a2, a2, 1
+ beqz t0, L(al_out) /* the byte just stored was the terminator */
+
+ bne a1, a5, L(make_dest_align)
+
+L(dest_align): /* dest (a2) is 8-byte aligned from here on. */
+ andi a4, a1, 7 /* a4 = src & 7 */
+ bstrins.d a1, zero, 2, 0 /* clear bits 2..0: round src down to an 8-byte boundary */
+
+ lu12i.w t5, 0x1010
+ ld.d t0, a1, 0 /* aligned word containing the next source byte */
+ ori t5, t5, 0x101
+ bstrins.d t5, t5, 63, 32 /* t5 = 0x0101010101010101 */
+
+ slli.d t6, t5, 0x7 /* t6 = 0x8080808080808080 */
+ bnez a4, L(unalign) /* src and dest are not mutually aligned */
+ sub.d t1, t0, t5
+ andn t2, t6, t0
+
+ and t3, t1, t2 /* t3 = (t0 - t5) & ~t0 & t6: nonzero iff t0 contains a zero byte */
+ bnez t3, L(al_end)
+
+L(al_loop): /* Both pointers aligned: copy one word per iteration until a word holds a NUL. */
+ st.d t0, a2, 0
+ ld.d t0, a1, 8
+
+ addi.d a1, a1, 8
+ addi.d a2, a2, 8
+ sub.d t1, t0, t5
+ andn t2, t6, t0
+
+ and t3, t1, t2 /* zero-byte test on the newly loaded word */
+ beqz t3, L(al_loop)
+
+L(al_end): /* t0 holds the final word, t3 its zero-byte mask. */
+ ctz.d t1, t3
+ srli.d t1, t1, 3 /* t1 = byte index of the first NUL in t0 */
+ addi.d t1, t1, 1 /* t1 = bytes to store including the NUL (1..8) */
+
+ andi a3, t1, 8 /* split the count into 8/4/2/1-byte pieces */
+ andi a4, t1, 4
+ andi a5, t1, 2
+ andi a6, t1, 1
+
+L(al_end_8):
+ beqz a3, L(al_end_4)
+ st.d t0, a2, 0 /* NUL is the 8th byte: store the whole word */
+#ifdef USE_AS_STPCPY
+ addi.d a0, a2, 7 /* stpcpy result: the NUL sits at a2 + 7 */
+#endif
+ jr ra
+L(al_end_4):
+ beqz a4, L(al_end_2)
+ st.w t0, a2, 0
+ addi.d a2, a2, 4
+ srli.d t0, t0, 32 /* drop the four bytes just stored */
+L(al_end_2):
+ beqz a5, L(al_end_1)
+ st.h t0, a2, 0
+ addi.d a2, a2, 2
+ srli.d t0, t0, 16 /* drop the two bytes just stored */
+L(al_end_1):
+ beqz a6, L(al_out)
+ st.b t0, a2, 0
+ addi.d a2, a2, 1
+L(al_out): /* a2 points one past the stored NUL. */
+#ifdef USE_AS_STPCPY
+ addi.d a0, a2, -1 /* stpcpy result: address of the NUL */
+#endif
+ jr ra
+
+ .align 4
+L(unalign): /* dest aligned, src offset by a4 bytes: build each dest word from two aligned src words. */
+ slli.d a5, a4, 3 /* a5 = 8 * a4: right-shift amount in bits */
+ li.d t1, -1
+ sub.d a6, zero, a5 /* a6 = -a5, used as a left shift by 64 - a5 (assumes shifts take the count modulo 64 - see ISA manual) */
+
+ srl.d a7, t0, a5 /* discard the bytes of the first word that precede src */
+ sll.d t7, t1, a6 /* all-ones in the vacated top a4 bytes */
+
+ or t0, a7, t7 /* pad with 0xff so the padding is never seen as a NUL */
+ sub.d t1, t0, t5
+ andn t2, t6, t0
+ and t3, t1, t2 /* zero-byte test on the valid bytes of the first word */
+
+ bnez t3, L(un_end)
+
+ ld.d t4, a1, 8 /* next aligned source word */
+
+ sub.d t1, t4, t5
+ andn t2, t6, t4
+ sll.d t0, t4, a6 /* low a4 bytes of t4 moved to the top of the dest word */
+ and t3, t1, t2 /* zero-byte test on t4 */
+
+ or t0, t0, a7 /* t0 = 8 - a4 bytes from the previous word + a4 bytes from t4 */
+ bnez t3, L(un_end_with_remaining)
+
+L(un_loop): /* Steady state: store the merged word, load the next source word, merge again. */
+ srl.d a7, t4, a5 /* leftover high bytes of the current source word */
+
+ ld.d t4, a1, 16
+ addi.d a1, a1, 8
+
+ st.d t0, a2, 0
+ addi.d a2, a2, 8
+
+ sub.d t1, t4, t5
+ andn t2, t6, t4
+ sll.d t0, t4, a6
+ and t3, t1, t2 /* zero-byte test on the new source word */
+
+ or t0, t0, a7
+ beqz t3, L(un_loop)
+
+L(un_end_with_remaining): /* t4 contains the NUL; t0 is the pending merged word. */
+ ctz.d t1, t3
+ srli.d t1, t1, 3
+ addi.d t1, t1, 1 /* bytes of t4 up to and including the NUL */
+ sub.d t1, t1, a4 /* minus the a4 bytes of t4 already merged into t0 */
+
+ blt t1, zero, L(un_end_less_8) /* NUL lies inside t0: 8 + t1 bytes remain, and their count equals the low 3 bits of t1 */
+ st.d t0, a2, 0 /* t0 is needed in full */
+ addi.d a2, a2, 8
+ beqz t1, L(un_out) /* the NUL was the last byte of t0 */
+ srl.d t0, t4, a5 /* remaining t1 (1..7) bytes come from the top of t4 */
+ b L(un_end_less_8)
+
+L(un_end): /* NUL found in the very first (padded) word. */
+ ctz.d t1, t3
+ srli.d t1, t1, 3
+ addi.d t1, t1, 1 /* bytes to store including the NUL */
+
+L(un_end_less_8): /* Store the low 3 bits of t1 worth of bytes from t0 as 4/2/1-byte pieces. */
+ andi a4, t1, 4
+ andi a5, t1, 2
+ andi a6, t1, 1
+L(un_end_4):
+ beqz a4, L(un_end_2)
+ st.w t0, a2, 0
+ addi.d a2, a2, 4
+ srli.d t0, t0, 32
+L(un_end_2):
+ beqz a5, L(un_end_1)
+ st.h t0, a2, 0
+ addi.d a2, a2, 2
+ srli.d t0, t0, 16
+L(un_end_1):
+ beqz a6, L(un_out)
+ st.b t0, a2, 0
+ addi.d a2, a2, 1
+L(un_out): /* a2 points one past the stored NUL. */
+#ifdef USE_AS_STPCPY
+ addi.d a0, a2, -1 /* stpcpy result: address of the NUL */
+#endif
+ jr ra
+END(STRCPY)
+
+libc_hidden_builtin_def (STRCPY)
diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-lasx.S b/sysdeps/loongarch/lp64/multiarch/strcpy-lasx.S
new file mode 100644
index 00000000..c2825612
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strcpy-lasx.S
@@ -0,0 +1,215 @@
+/* Optimized strcpy stpcpy implementation using LoongArch LASX instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float /* Only built for libc and not for soft-float configurations. */
+
+# ifndef STRCPY
+# define STRCPY __strcpy_lasx
+# endif
+
+# ifdef USE_AS_STPCPY
+# define dstend a0 /* stpcpy: the computed NUL address is written straight into the return register. */
+# else
+# define dstend a4 /* strcpy: a0 keeps dest, the NUL address lives in a scratch register. */
+# endif
+
+LEAF(STRCPY, 6) /* Copy the NUL-terminated string at a1 to a0 with 32-byte LASX vectors; the tail is finished with an overlapping store. */
+ ori t8, zero, 0xfe0 /* t8 = 0xfe0: largest 4 KiB-page offset from which a 32-byte load stays in the page */
+ andi t0, a1, 0xfff /* src offset within a 4 KiB page */
+ li.d t7, -1 /* all-ones reference for the "no NUL seen" mask comparisons */
+ move a2, a0 /* a2 = running destination pointer */
+
+ bltu t8, t0, L(page_cross_start) /* a 32-byte load from src would run past the page boundary */
+L(start_entry):
+ xvld xr0, a1, 0 /* first 32 source bytes */
+ li.d t0, 32
+ andi t1, a2, 0x1f /* dest & 31 */
+
+ xvsetanyeqz.b fcc0, xr0 /* fcc0 = some byte of xr0 is zero */
+ sub.d t0, t0, t1 /* t0 = 32 - (dest & 31): step that makes dest 32-byte aligned */
+ bcnez fcc0, L(end) /* string is shorter than 32 bytes */
+ add.d a1, a1, t0
+
+ xvst xr0, a2, 0 /* store all 32 bytes; the next store overlaps by 32 - t0 bytes */
+ andi a3, a1, 0x1f /* a3 = src & 31 once dest is aligned */
+ add.d a2, a2, t0
+ bnez a3, L(unaligned)
+
+
+ xvld xr0, a1, 0
+ xvsetanyeqz.b fcc0, xr0
+ bcnez fcc0, L(al_end)
+L(al_loop): /* src and dest both 32-byte aligned: one vector per iteration. */
+ xvst xr0, a2, 0
+
+ xvld xr0, a1, 32
+ addi.d a2, a2, 32
+ addi.d a1, a1, 32
+ xvsetanyeqz.b fcc0, xr0
+
+ bceqz fcc0, L(al_loop)
+L(al_end): /* xr0 contains the NUL. */
+ xvmsknz.b xr0, xr0 /* per-byte "nonzero" bits, one 16-bit mask per 128-bit half */
+ xvpickve.w xr1, xr0, 4 /* fetch the upper half's mask word */
+ vilvl.h vr0, vr1, vr0 /* combine both halves into one 32-bit mask in the low word */
+
+ movfr2gr.s t0, fa0
+ cto.w t0, t0 /* trailing ones = index of the first NUL byte */
+ add.d a1, a1, t0 /* a1 -> NUL in src */
+ xvld xr0, a1, -31 /* the 32 bytes ending with the NUL */
+
+
+ add.d dstend, a2, t0 /* dstend -> where the NUL goes in dest */
+ xvst xr0, dstend, -31 /* overlapping store finishes the copy */
+ jr ra
+ nop /* padding; presumably keeps the following code aligned */
+
+L(page_cross_start): /* Examine the start of the string via an aligned load that cannot cross the page. */
+ move a4, a1
+ bstrins.d a4, zero, 4, 0 /* a4 = src rounded down to 32 bytes */
+ xvld xr0, a4, 0
+ xvmsknz.b xr0, xr0
+
+ xvpickve.w xr1, xr0, 4
+ vilvl.h vr0, vr1, vr0
+ movfr2gr.s t0, fa0 /* 32-bit nonzero-byte mask of the aligned block */
+ sra.w t0, t0, a1 /* drop the bits for bytes before src (shift by src & 31, assuming sra.w uses the low 5 bits) */
+
+ beq t0, t7, L(start_entry) /* mask is all ones: no NUL before the page end - NOTE(review): relies on sign fill when byte 31 is nonzero */
+ b L(tail) /* NUL found: t0 already holds the shifted mask */
+L(unaligned): /* dest is 32-byte aligned, src is not: every load must be checked for page crossing. */
+ andi t0, a1, 0xfff
+ bltu t8, t0, L(un_page_cross)
+
+
+L(un_start_entry):
+ xvld xr0, a1, 0
+ xvsetanyeqz.b fcc0, xr0
+ bcnez fcc0, L(un_end)
+ addi.d a1, a1, 32
+
+L(un_loop):
+ xvst xr0, a2, 0
+ andi t0, a1, 0xfff
+ addi.d a2, a2, 32
+ bltu t8, t0, L(page_cross_loop) /* next unaligned load would cross a page boundary */
+
+L(un_loop_entry):
+ xvld xr0, a1, 0
+ addi.d a1, a1, 32
+ xvsetanyeqz.b fcc0, xr0
+ bceqz fcc0, L(un_loop)
+
+ addi.d a1, a1, -32 /* undo the increment: a1 -> block that contains the NUL */
+L(un_end):
+ xvmsknz.b xr0, xr0
+ xvpickve.w xr1, xr0, 4
+ vilvl.h vr0, vr1, vr0
+
+
+ movfr2gr.s t0, fa0 /* 32-bit nonzero-byte mask */
+L(un_tail): /* t0 = mask whose trailing-ones count is the NUL offset from a1. */
+ cto.w t0, t0
+ add.d a1, a1, t0 /* a1 -> NUL in src */
+ xvld xr0, a1, -31 /* the 32 bytes ending with the NUL */
+
+ add.d dstend, a2, t0
+ xvst xr0, dstend, -31
+ jr ra
+L(un_page_cross):
+ sub.d a4, a1, a3 /* a4 = src rounded down to 32 bytes (a3 = src & 31) */
+
+ xvld xr0, a4, 0
+ xvmsknz.b xr0, xr0
+ xvpickve.w xr1, xr0, 4
+ vilvl.h vr0, vr1, vr0
+
+ movfr2gr.s t0, fa0
+ sra.w t0, t0, a1 /* discard mask bits for bytes before src */
+ beq t0, t7, L(un_start_entry) /* no NUL before the page end: the unaligned load is safe */
+ b L(un_tail)
+
+
+L(page_cross_loop): /* Same check as L(un_page_cross), for loads inside the loop. */
+ sub.d a4, a1, a3 /* a3 is still src & 31 because src advances in steps of 32 */
+ xvld xr0, a4, 0
+ xvmsknz.b xr0, xr0
+ xvpickve.w xr1, xr0, 4
+
+ vilvl.h vr0, vr1, vr0
+ movfr2gr.s t0, fa0
+ sra.w t0, t0, a1
+ beq t0, t7, L(un_loop_entry)
+
+ b L(un_tail)
+L(end): /* NUL found in the very first vector: string length is below 32. */
+ xvmsknz.b xr0, xr0
+ xvpickve.w xr1, xr0, 4
+ vilvl.h vr0, vr1, vr0
+
+ movfr2gr.s t0, fa0
+L(tail):
+ cto.w t0, t0 /* t0 = string length (offset of the NUL) */
+ add.d dstend, a2, t0 /* dstend -> NUL position in dest */
+ add.d a5, a1, t0 /* a5 -> NUL in src */
+
+L(less_32): /* Short copies: load head and tail pieces first, then store both (they may overlap). */
+ srli.d t1, t0, 4
+ beqz t1, L(less_16)
+ vld vr0, a1, 0 /* length 16..31: first 16 bytes */
+ vld vr1, a5, -15 /* and the 16 bytes ending with the NUL */
+
+ vst vr0, a2, 0
+ vst vr1, dstend, -15
+ jr ra
+L(less_16):
+ srli.d t1, t0, 3
+
+ beqz t1, L(less_8)
+ ld.d t2, a1, 0 /* length 8..15: first 8 bytes */
+ ld.d t3, a5, -7 /* and the 8 bytes ending with the NUL */
+ st.d t2, a2, 0
+
+ st.d t3, dstend, -7
+ jr ra
+L(less_8):
+ li.d t1, 3
+ bltu t0, t1, L(less_3)
+
+ ld.w t2, a1, 0 /* length 3..7: first 4 bytes */
+ ld.w t3, a5, -3 /* and the 4 bytes ending with the NUL */
+ st.w t2, a2, 0
+ st.w t3, dstend, -3
+
+ jr ra
+L(less_3):
+ beqz t0, L(zero_byte) /* empty string: only the terminator is written */
+ ld.h t2, a1, 0 /* length 1 or 2: copy the first two bytes */
+
+ st.h t2, a2, 0
+L(zero_byte):
+ st.b zero, dstend, 0 /* write the terminating NUL */
+ jr ra
+END(STRCPY)
+
+libc_hidden_builtin_def (STRCPY)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S b/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S
new file mode 100644
index 00000000..fc2498f7
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S
@@ -0,0 +1,212 @@
+/* Optimized strcpy stpcpy implementation using LoongArch LSX instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float /* Only built for libc and not for soft-float configurations. */
+
+# ifndef STRCPY
+# define STRCPY __strcpy_lsx
+# endif
+
+LEAF(STRCPY, 6) /* Copy the NUL-terminated string at a1 to a0 with 16-byte LSX vectors; a0 is rewritten only under USE_AS_STPCPY. */
+ pcalau12i t0, %pc_hi20(L(INDEX))
+ andi a4, a1, 0xf /* a4 = src & 15 */
+ vld vr1, t0, %pc_lo12(L(INDEX)) /* vr1 = byte indices 0..15 from the constant below */
+ move a2, a0 /* a2 = running destination pointer */
+
+ beqz a4, L(load_start) /* src already 16-byte aligned */
+ xor t0, a1, a4 /* t0 = src with its low 4 bits cleared */
+ vld vr0, t0, 0 /* aligned block containing the first source byte */
+ vreplgr2vr.b vr2, a4
+
+ vadd.b vr2, vr2, vr1 /* vr2 = indices a4 .. a4 + 15 */
+ vshuf.b vr0, vr2, vr0, vr2 /* move the byte at src to element 0; indices past 15 appear to pick vr2's own nonzero bytes - NOTE(review): confirm with the vshuf.b definition */
+ vsetanyeqz.b fcc0, vr0 /* fcc0 = some byte of vr0 is zero */
+ bcnez fcc0, L(end) /* NUL lies before the next 16-byte boundary */
+
+L(load_start):
+ vld vr0, a1, 0 /* first 16 source bytes, loaded directly from src */
+ li.d t1, 16
+ andi a3, a2, 0xf /* dest & 15 */
+ vsetanyeqz.b fcc0, vr0
+
+
+ sub.d t0, t1, a3 /* t0 = 16 - (dest & 15): step that makes dest 16-byte aligned */
+ bcnez fcc0, L(end)
+ add.d a1, a1, t0
+ vst vr0, a2, 0 /* store all 16 bytes; the next store overlaps by 16 - t0 bytes */
+
+ andi a3, a1, 0xf /* a3 = src & 15 once dest is aligned */
+ add.d a2, a2, t0
+ bnez a3, L(unaligned)
+ vld vr0, a1, 0
+
+ vsetanyeqz.b fcc0, vr0
+ bcnez fcc0, L(al_end)
+L(al_loop): /* src and dest both 16-byte aligned: one vector per iteration. */
+ vst vr0, a2, 0
+ vld vr0, a1, 16
+
+ addi.d a2, a2, 16
+ addi.d a1, a1, 16
+ vsetanyeqz.b fcc0, vr0
+ bceqz fcc0, L(al_loop)
+
+
+L(al_end): /* vr0 contains the NUL. */
+ vmsknz.b vr1, vr0 /* per-byte "nonzero" bit mask */
+ movfr2gr.s t0, fa1
+ cto.w t0, t0 /* trailing ones = index of the first NUL byte */
+ add.d a1, a1, t0 /* a1 -> NUL in src */
+
+ vld vr0, a1, -15 /* the 16 bytes ending with the NUL */
+# ifdef USE_AS_STPCPY
+ add.d a0, a2, t0 /* stpcpy result: address of the NUL in dest */
+ vst vr0, a0, -15
+# else
+ add.d a2, a2, t0
+ vst vr0, a2, -15 /* overlapping store finishes the copy */
+# endif
+ jr ra
+
+L(end): /* NUL found in the first vector; vr0 holds the string starting at element 0. */
+ vmsknz.b vr1, vr0
+ movfr2gr.s t0, fa1
+ cto.w t0, t0 /* index of the NUL */
+ addi.d t0, t0, 1 /* bytes to store including the NUL (1..16) */
+
+L(end_16):
+ andi t1, t0, 16
+ beqz t1, L(end_8)
+ vst vr0, a2, 0 /* exactly 16 bytes: store the whole vector */
+# ifdef USE_AS_STPCPY
+ addi.d a0, a2, 15 /* stpcpy result: the NUL sits at a2 + 15 */
+# endif
+ jr ra
+
+L(end_8): /* Store the count in t0 as 8/4/2/1-byte pieces, shifting the vector down after each. */
+ andi t2, t0, 8
+ andi t3, t0, 4
+ andi t4, t0, 2
+ andi t5, t0, 1
+
+ beqz t2, L(end_4)
+ vstelm.d vr0, a2, 0, 0 /* store the low doubleword */
+ addi.d a2, a2, 8
+ vbsrl.v vr0, vr0, 8 /* drop the 8 bytes just stored */
+
+L(end_4):
+ beqz t3, L(end_2)
+ vstelm.w vr0, a2, 0, 0
+ addi.d a2, a2, 4
+ vbsrl.v vr0, vr0, 4
+
+L(end_2):
+ beqz t4, L(end_1)
+ vstelm.h vr0, a2, 0, 0
+ addi.d a2, a2, 2
+ vbsrl.v vr0, vr0, 2
+
+
+L(end_1):
+ beqz t5, L(out)
+ vstelm.b vr0, a2, 0, 0
+ addi.d a2, a2, 1
+L(out): /* a2 points one past the stored NUL. */
+# ifdef USE_AS_STPCPY
+ addi.d a0, a2, -1 /* stpcpy result: address of the NUL */
+# endif
+ jr ra
+
+ .align 4
+L(unaligned): /* dest aligned, src offset by a3 bytes: use aligned loads and shuffle adjacent blocks together. */
+ bstrins.d a1, zero, 3, 0 /* round src down to a 16-byte boundary */
+ vld vr2, a1, 0
+ vreplgr2vr.b vr3, a3
+ vslt.b vr4, vr1, vr3 /* all-ones in elements whose index is below a3 (bytes before src) */
+
+ vor.v vr0, vr2, vr4 /* force those leading bytes nonzero before testing */
+ vsetanyeqz.b fcc0, vr0
+ bcnez fcc0, L(un_first_end) /* NUL is inside the first aligned block */
+ vld vr0, a1, 16
+
+ vadd.b vr3, vr3, vr1 /* vr3 = shuffle indices a3 .. a3 + 15 */
+ vshuf.b vr4, vr0, vr2, vr3 /* vr4 = 16 bytes starting at byte a3 of the pair {vr2 low, vr0 high} - per vshuf.b semantics, verify */
+ vsetanyeqz.b fcc0, vr0
+ bcnez fcc0, L(un_end)
+
+
+ vor.v vr2, vr0, vr0 /* vr2 = vr0 (register copy) */
+ addi.d a1, a1, 16
+L(un_loop): /* Unrolled twice; vr0 and vr2 alternate as "previous" and "current" block. */
+ vld vr0, a1, 16
+ vst vr4, a2, 0
+
+ addi.d a2, a2, 16
+ vshuf.b vr4, vr0, vr2, vr3
+ vsetanyeqz.b fcc0, vr0
+ bcnez fcc0, L(un_end)
+
+ vld vr2, a1, 32
+ vst vr4, a2, 0
+ addi.d a1, a1, 32
+ addi.d a2, a2, 16
+
+ vshuf.b vr4, vr2, vr0, vr3
+ vsetanyeqz.b fcc0, vr2
+ bceqz fcc0, L(un_loop)
+ vor.v vr0, vr2, vr2 /* the NUL block is in vr2: move it to vr0 for the common exit */
+
+
+ addi.d a1, a1, -16 /* make a1 + 16 the address of the NUL block, as L(un_end) expects */
+L(un_end): /* vr0 = block with the NUL (loaded from a1 + 16), vr4 = pending merged vector. */
+ vsetanyeqz.b fcc0, vr4
+ bcnez fcc0, 1f /* merged vector already contains the NUL: skip it, the final store below covers the tail */
+ vst vr4, a2, 0
+
+1:
+ vmsknz.b vr1, vr0
+ movfr2gr.s t0, fa1
+ cto.w t0, t0 /* index of the NUL inside vr0 */
+ add.d a1, a1, t0
+
+ vld vr0, a1, 1 /* a1 + 1 = NUL address - 15: the 16 bytes ending with the NUL */
+ add.d a2, a2, t0
+ sub.d a2, a2, a3 /* a2 + 1 = matching destination address */
+ vst vr0, a2, 1
+# ifdef USE_AS_STPCPY
+ addi.d a0, a2, 16 /* stpcpy result: address of the NUL in dest */
+# endif
+ jr ra
+L(un_first_end): /* NUL in the first aligned block: bias both pointers by -16 so the code at 1: applies unchanged. */
+ addi.d a2, a2, -16
+ addi.d a1, a1, -16
+ b 1b
+END(STRCPY)
+
+ .section .rodata.cst16,"M",@progbits,16
+ .align 4
+L(INDEX): /* Byte values 0..15, loaded into vr1 at function entry. */
+ .dword 0x0706050403020100
+ .dword 0x0f0e0d0c0b0a0908
+
+libc_hidden_builtin_def (STRCPY)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S
new file mode 100644
index 00000000..9e31883b
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S
@@ -0,0 +1,138 @@
+/* Optimized strcpy unaligned implementation using basic LoongArch
+ instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) /* Only built as part of libc. */
+
+# ifndef STRCPY
+# define STRCPY __strcpy_unaligned
+# endif
+
+# ifdef USE_AS_STPCPY
+# define dstend a0 /* stpcpy: the computed NUL address is written straight into the return register. */
+# else
+# define dstend a4 /* strcpy: a0 keeps dest, the NUL address lives in a scratch register. */
+# endif
+
+LEAF(STRCPY, 6) /* Copy the NUL-terminated string at a1 to a0 with 8-byte accesses that need not be aligned; the tail uses overlapping stores. */
+ lu12i.w t5, 0x01010
+ li.w t0, 0xff8 /* largest 4 KiB-page offset from which an 8-byte load stays in the page */
+ ori t5, t5, 0x101
+ andi t1, a1, 0xfff /* src offset within a 4 KiB page */
+
+ bstrins.d t5, t5, 63, 32 /* t5 = 0x0101010101010101 */
+ move a2, a0 /* a2 = running destination pointer */
+ slli.d t6, t5, 7 /* t6 = 0x8080808080808080 */
+ bltu t0, t1, L(page_cross) /* an 8-byte load from src would run past the page boundary */
+
+L(start_entry):
+ ld.d t0, a1, 0 /* first 8 source bytes */
+ li.d t3, 8
+ andi a3, a1, 0x7
+ sub.d t1, t0, t5
+
+ andn t2, t6, t0
+ sub.d t3, t3, a3 /* t3 = 8 - (src & 7): step that makes src 8-byte aligned */
+ and t1, t1, t2 /* t1 = (t0 - t5) & ~t0 & t6: nonzero iff t0 contains a zero byte */
+ bnez t1, L(end) /* string is shorter than 8 bytes */
+
+
+ add.d a1, a1, t3
+ st.d t0, a2, 0 /* store all 8 bytes; the next store overlaps by 8 - t3 bytes */
+ add.d a2, a2, t3 /* dest advances by the same step, so it may stay unaligned */
+ ld.d t0, a1, 0
+
+ sub.d t1, t0, t5
+ andn t2, t6, t0
+ and t1, t1, t2
+ bnez t1, L(long_end)
+
+L(loop): /* src is 8-byte aligned: one word per iteration until a word holds a NUL. */
+ st.d t0, a2, 0
+ ld.d t0, a1, 8
+ addi.d a2, a2, 8
+ addi.d a1, a1, 8
+
+ sub.d t1, t0, t5
+ andn t2, t6, t0
+ and t1, t1, t2 /* zero-byte test on the newly loaded word */
+ beqz t1, L(loop)
+
+
+L(long_end): /* At least 8 bytes were copied already, so an 8-byte window ending at the NUL is inside the string. */
+ ctz.d t1, t1
+ srli.d t1, t1, 3 /* t1 = byte index of the NUL in t0 */
+ add.d a1, a1, t1 /* a1 -> NUL in src */
+ ld.d t0, a1, -7 /* the 8 bytes ending with the NUL */
+
+ add.d dstend, a2, t1 /* dstend -> where the NUL goes in dest */
+ st.d t0, dstend, -7
+ jr ra
+ nop /* padding; presumably keeps the following code aligned */
+
+L(end): /* NUL within the first 8 bytes: t1 is its zero-byte mask. */
+ ctz.d t1, t1
+ srli.d t1, t1, 3 /* t1 = string length (0..7) */
+ add.d a3, a1, t1 /* a3 -> NUL in src */
+ add.d dstend, a2, t1 /* dstend -> NUL position in dest */
+
+L(less_8):
+ li.d t0, 3
+ bltu t1, t0, L(less_3)
+ ld.w t1, a1, 0 /* length 3..7: first 4 bytes */
+ ld.w t2, a3, -3 /* and the 4 bytes ending with the NUL */
+
+
+ st.w t1, a2, 0
+ st.w t2, dstend, -3
+ jr ra
+L(less_3):
+ beqz t1, L(zero_bytes) /* empty string: only the terminator is written */
+
+ ld.h t1, a1, 0 /* length 1 or 2: copy the first two bytes */
+ st.h t1, a2, 0
+L(zero_bytes):
+ st.b zero, dstend, 0 /* write the terminating NUL */
+ jr ra
+
+L(page_cross): /* Examine the start of the string via an aligned load that cannot cross the page. */
+ move a4, a1
+ bstrins.d a4, zero, 2, 0 /* a4 = src rounded down to 8 bytes */
+ ld.d t0, a4, 0
+ li.d t3, -1
+
+ slli.d t4, a1, 3 /* shift count (src & 7) * 8, assuming srl.d uses only the low 6 bits - see ISA manual */
+ srl.d t3, t3, t4 /* ones over the bytes that remain valid after the shift */
+ srl.d t0, t0, t4 /* drop the bytes that precede src */
+ orn t0, t0, t3 /* t0 | ~t3: fill the vacated high bytes with 0xff so they are not seen as a NUL */
+
+
+ sub.d t1, t0, t5
+ andn t2, t6, t0
+ and t1, t1, t2 /* zero-byte test on the valid bytes */
+ beqz t1, L(start_entry) /* no NUL before the page end: the 8-byte load at src is safe */
+
+ b L(end) /* short string that ends before the page boundary */
+END(STRCPY)
+
+libc_hidden_builtin_def (STRCPY)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy.c b/sysdeps/loongarch/lp64/multiarch/strcpy.c
new file mode 100644
index 00000000..46afd068
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strcpy.c
@@ -0,0 +1,35 @@
+/* Multiple versions of strcpy.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc) /* Everything below is compiled only for libc itself. */
+# define strcpy __redirect_strcpy /* While <string.h> is read, its strcpy declaration is renamed to __redirect_strcpy. */
+# include <string.h>
+# undef strcpy /* Free the real name again for the definition below. */
+
+# define SYMBOL_NAME strcpy /* Read by ifunc-lasx.h; presumably used to build the candidate symbol names - verify there. */
+# include "ifunc-lasx.h" /* Not visible here; presumably supplies the IFUNC_SELECTOR used below. */
+
+libc_ifunc_redirected (__redirect_strcpy, strcpy, IFUNC_SELECTOR ()); /* Defines strcpy via the selector result, typed like __redirect_strcpy (macro defined elsewhere). */
+
+# ifdef SHARED /* Shared builds additionally declare the hidden __GI_strcpy name. */
+__hidden_ver1 (strcpy, __GI_strcpy, __redirect_strcpy)
+ __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strcpy);
+# endif
+#endif
--
2.33.0
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化