author | Alexey Brodkin <Alexey.Brodkin@synopsys.com> | 2014-02-04 12:56:15 +0400
---|---|---
committer | Tom Rini <trini@ti.com> | 2014-02-07 08:14:32 -0500
commit | 2272382879d93d37e7964554cea5b0583c94c247 | (patch)
tree | a17d27c3e4fcaaa9d192cbc0a9d5a2bc7cec8da9 | /arch/arc/lib/strcmp.S
parent | 2f16ac9df4d898ab7266f768f125a638c7c5add8 | (diff)
arc: add library functions
These are library functions used by the ARC700 architecture.
Signed-off-by: Alexey Brodkin <abrodkin@synopsys.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Francois Bedard <fbedard@synopsys.com>
Cc: Wolfgang Denk <wd@denx.de>
Cc: Heiko Schocher <hs@denx.de>
Diffstat (limited to 'arch/arc/lib/strcmp.S')
-rw-r--r-- | arch/arc/lib/strcmp.S | 97
1 file changed, 97 insertions, 0 deletions
diff --git a/arch/arc/lib/strcmp.S b/arch/arc/lib/strcmp.S
new file mode 100644
index 0000000000..8cb7d2f18c
--- /dev/null
+++ b/arch/arc/lib/strcmp.S
@@ -0,0 +1,97 @@
+/*
+ * Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0+
+ */
+
+/*
+ * This is optimized primarily for the ARC700.
+ * It would be possible to speed up the loops by one cycle / word
+ * respective one cycle / byte by forcing double source 1 alignment, unrolling
+ * by a factor of two, and speculatively loading the second word / byte of
+ * source 1; however, that would increase the overhead for loop setup / finish,
+ * and strcmp might often terminate early.
+ */
+
+.global strcmp
+.align 4
+strcmp:
+	or	%r2, %r0, %r1
+	bmsk_s	%r2, %r2, 1
+	brne	%r2, 0, .Lcharloop
+	mov_s	%r12, 0x01010101
+	ror	%r5, %r12
+.Lwordloop:
+	ld.ab	%r2, [%r0, 4]
+	ld.ab	%r3, [%r1, 4]
+	nop_s
+	sub	%r4, %r2, %r12
+	bic	%r4, %r4, %r2
+	and	%r4, %r4, %r5
+	brne	%r4, 0, .Lfound0
+	breq	%r2, %r3, .Lwordloop
+#ifdef __LITTLE_ENDIAN__
+	xor	%r0, %r2, %r3	/* mask for difference */
+	sub_s	%r1, %r0, 1
+	bic_s	%r0, %r0, %r1	/* mask for least significant difference bit */
+	sub	%r1, %r5, %r0
+	xor	%r0, %r5, %r1	/* mask for least significant difference byte */
+	and_s	%r2, %r2, %r0
+	and_s	%r3, %r3, %r0
+#endif /* _ENDIAN__ */
+	cmp_s	%r2, %r3
+	mov_s	%r0, 1
+	j_s.d	[%blink]
+	bset.lo	%r0, %r0, 31
+
+	.balign	4
+#ifdef __LITTLE_ENDIAN__
+.Lfound0:
+	xor	%r0, %r2, %r3	/* mask for difference */
+	or	%r0, %r0, %r4	/* or in zero indicator */
+	sub_s	%r1, %r0, 1
+	bic_s	%r0, %r0, %r1	/* mask for least significant difference bit */
+	sub	%r1, %r5, %r0
+	xor	%r0, %r5, %r1	/* mask for least significant difference byte */
+	and_s	%r2, %r2, %r0
+	and_s	%r3, %r3, %r0
+	sub.f	%r0, %r2, %r3
+	mov.hi	%r0, 1
+	j_s.d	[%blink]
+	bset.lo	%r0, %r0, 31
+#else /* __BIG_ENDIAN__ */
+	/*
+	 * The zero-detection above can mis-detect 0x01 bytes as zeroes
+	 * because of carry-propagation from a lower significant zero byte.
+	 * We can compensate for this by checking that bit0 is zero.
+	 * This compensation is not necessary in the step where we
+	 * get a low estimate for r2, because in any affected bytes
+	 * we already have 0x00 or 0x01, which will remain unchanged
+	 * when bit 7 is cleared.
+	 */
+	.balign	4
+.Lfound0:
+	lsr	%r0, %r4, 8
+	lsr_s	%r1, %r2
+	bic_s	%r2, %r2, %r0	/* get low estimate for r2 and get ... */
+	bic_s	%r0, %r0, %r1	/* <this is the adjusted mask for zeros> */
+	or_s	%r3, %r3, %r0	/* ... high estimate r3 so that r2 > r3 will */
+	cmp_s	%r3, %r2	/* ... be independent of trailing garbage */
+	or_s	%r2, %r2, %r0	/* likewise for r3 > r2 */
+	bic_s	%r3, %r3, %r0
+	rlc	%r0, 0	/* r0 := r2 > r3 ? 1 : 0 */
+	cmp_s	%r2, %r3
+	j_s.d	[%blink]
+	bset.lo	%r0, %r0, 31
+#endif /* _ENDIAN__ */
+
+	.balign	4
+.Lcharloop:
+	ldb.ab	%r2, [%r0, 1]
+	ldb.ab	%r3, [%r1, 1]
+	nop_s
+	breq	%r2, 0, .Lcmpend
+	breq	%r2, %r3, .Lcharloop
+.Lcmpend:
+	j_s.d	[%blink]
+	sub	%r0, %r2, %r3
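For readers unfamiliar with the idiom, the word loop above is the classic word-at-a-time zero-byte test: with %r12 = 0x01010101 and %r5 = 0x80808080 (0x01010101 rotated right by one), the sub/bic/and sequence computes (v - 0x01010101) & ~v & 0x80808080, which is non-zero whenever some byte of v is 0x00. Below is a minimal C sketch of just that test; the function name, test values, and driver are illustrative only, not code from the patch.

/* Hypothetical illustration of the zero-byte test computed into %r4
 * in .Lwordloop above; not part of the patch. */
#include <stdint.h>
#include <stdio.h>

static uint32_t zero_byte_mask(uint32_t v)
{
	/* Subtracting 0x01 from every byte borrows out of each byte that
	 * was 0x00; "& ~v" keeps only bits that flipped from 0 to 1, and
	 * "& 0x80808080" keeps one flag bit per byte. This mirrors
	 * sub %r4,%r2,%r12 / bic %r4,%r4,%r2 / and %r4,%r4,%r5. */
	return (v - 0x01010101u) & ~v & 0x80808080u;
}

int main(void)
{
	/* A genuine zero byte (byte 2, counting from the least
	 * significant) is flagged: prints 00800000. */
	printf("%08x\n", (unsigned)zero_byte_mask(0x64006261u));

	/* The mis-detection the big-endian comment describes: byte 1 is
	 * 0x01 and the borrow from the zero in byte 0 propagates into
	 * it, so both bytes are flagged: prints 00008080. */
	printf("%08x\n", (unsigned)zero_byte_mask(0x64620100u));

	return 0;
}

The second case is exactly the false positive the big-endian .Lfound0 path compensates for by additionally requiring bit 0 of a suspect byte to be zero, as its comment explains. Note also the return-value construction: mov_s %r0, 1 followed by the conditional bset.lo %r0, %r0, 31 yields +1 when the first string compares higher and the negative value 0x80000001 when it compares lower, satisfying strcmp's sign contract without computing an exact byte difference.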