summaryrefslogtreecommitdiff
path: root/arch/arc/lib/strcmp.S
diff options
context:
space:
mode:
authorAlexey Brodkin <Alexey.Brodkin@synopsys.com>2014-02-04 12:56:15 +0400
committerTom Rini <trini@ti.com>2014-02-07 08:14:32 -0500
commit2272382879d93d37e7964554cea5b0583c94c247 (patch)
treea17d27c3e4fcaaa9d192cbc0a9d5a2bc7cec8da9 /arch/arc/lib/strcmp.S
parent2f16ac9df4d898ab7266f768f125a638c7c5add8 (diff)
arc: add library functions
These are library functions used by ARC700 architecture. Signed-off-by: Alexey Brodkin <abrodkin@synopsys.com> Cc: Vineet Gupta <vgupta@synopsys.com> Cc: Francois Bedard <fbedard@synopsys.com> Cc: Wolfgang Denk <wd@denx.de> Cc: Heiko Schocher <hs@denx.de>
Diffstat (limited to 'arch/arc/lib/strcmp.S')
-rw-r--r--arch/arc/lib/strcmp.S97
1 files changed, 97 insertions, 0 deletions
diff --git a/arch/arc/lib/strcmp.S b/arch/arc/lib/strcmp.S
new file mode 100644
index 0000000000..8cb7d2f18c
--- /dev/null
+++ b/arch/arc/lib/strcmp.S
@@ -0,0 +1,97 @@
+/*
+ * Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0+
+ */
+
+/*
+ * This is optimized primarily for the ARC700.
+ * It would be possible to speed up the loops by one cycle / word
+ * respective one cycle / byte by forcing double source 1 alignment, unrolling
+ * by a factor of two, and speculatively loading the second word / byte of
+ * source 1; however, that would increase the overhead for loop setup / finish,
+ * and strcmp might often terminate early.
+ */
+
+.global strcmp
+.align 4
+strcmp:
+ or %r2, %r0, %r1
+ bmsk_s %r2, %r2, 1
+ brne %r2, 0, .Lcharloop
+ mov_s %r12, 0x01010101
+ ror %r5, %r12
+.Lwordloop:
+ ld.ab %r2, [%r0, 4]
+ ld.ab %r3, [%r1, 4]
+ nop_s
+ sub %r4, %r2, %r12
+ bic %r4, %r4, %r2
+ and %r4, %r4, %r5
+ brne %r4, 0, .Lfound0
+ breq %r2 ,%r3, .Lwordloop
+#ifdef __LITTLE_ENDIAN__
+ xor %r0, %r2, %r3 /* mask for difference */
+ sub_s %r1, %r0, 1
+ bic_s %r0, %r0, %r1 /* mask for least significant difference bit */
+ sub %r1, %r5, %r0
+ xor %r0, %r5, %r1 /* mask for least significant difference byte */
+ and_s %r2, %r2, %r0
+ and_s %r3, %r3, %r0
+#endif /* _ENDIAN__ */
+ cmp_s %r2, %r3
+ mov_s %r0, 1
+ j_s.d [%blink]
+ bset.lo %r0, %r0, 31
+
+ .balign 4
+#ifdef __LITTLE_ENDIAN__
+.Lfound0:
+ xor %r0, %r2, %r3 /* mask for difference */
+ or %r0, %r0, %r4 /* or in zero indicator */
+ sub_s %r1, %r0, 1
+ bic_s %r0, %r0, %r1 /* mask for least significant difference bit */
+ sub %r1, %r5, %r0
+ xor %r0, %r5, %r1 /* mask for least significant difference byte */
+ and_s %r2, %r2, %r0
+ and_s %r3, %r3, %r0
+ sub.f %r0, %r2, %r3
+ mov.hi %r0, 1
+ j_s.d [%blink]
+ bset.lo %r0, %r0, 31
+#else /* __BIG_ENDIAN__ */
+ /*
+ * The zero-detection above can mis-detect 0x01 bytes as zeroes
+ * because of carry-propagateion from a lower significant zero byte.
+ * We can compensate for this by checking that bit0 is zero.
+ * This compensation is not necessary in the step where we
+ * get a low estimate for r2, because in any affected bytes
+ * we already have 0x00 or 0x01, which will remain unchanged
+ * when bit 7 is cleared.
+ */
+ .balign 4
+.Lfound0:
+ lsr %r0, %r4, 8
+ lsr_s %r1, %r2
+ bic_s %r2, %r2, %r0 /* get low estimate for r2 and get ... */
+ bic_s %r0, %r0, %r1 /* <this is the adjusted mask for zeros> */
+ or_s %r3, %r3, %r0 /* ... high estimate r3 so that r2 > r3 will */
+ cmp_s %r3, %r2 /* ... be independent of trailing garbage */
+ or_s %r2, %r2, %r0 /* likewise for r3 > r2 */
+ bic_s %r3, %r3, %r0
+ rlc %r0, 0 /* r0 := r2 > r3 ? 1 : 0 */
+ cmp_s %r2, %r3
+ j_s.d [%blink]
+ bset.lo %r0, %r0, 31
+#endif /* _ENDIAN__ */
+
+ .balign 4
+.Lcharloop:
+ ldb.ab %r2,[%r0,1]
+ ldb.ab %r3,[%r1,1]
+ nop_s
+ breq %r2, 0, .Lcmpend
+ breq %r2, %r3, .Lcharloop
+.Lcmpend:
+ j_s.d [%blink]
+ sub %r0, %r2, %r3