summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- arch/arm/config.mk 4
-rw-r--r-- arch/arm/include/asm/assembler.h 33
-rw-r--r-- arch/arm/lib/memcpy.S 80
-rw-r--r-- arch/arm/lib/memset.S 112
4 files changed, 142 insertions, 87 deletions
diff --git a/arch/arm/config.mk b/arch/arm/config.mk
index c339e6dc8c..0667984b69 100644
--- a/arch/arm/config.mk
+++ b/arch/arm/config.mk
@@ -26,7 +26,9 @@ PLATFORM_CPPFLAGS += -D__ARM__
# Choose between ARM/Thumb instruction sets
ifeq ($(CONFIG_SYS_THUMB_BUILD),y)
-PF_CPPFLAGS_ARM := $(call cc-option, -mthumb -mthumb-interwork,\
+AFLAGS_IMPLICIT_IT := $(call as-option,-Wa$(comma)-mimplicit-it=always)
+PF_CPPFLAGS_ARM := $(AFLAGS_IMPLICIT_IT) \
+ $(call cc-option, -mthumb -mthumb-interwork,\
$(call cc-option,-marm,)\
$(call cc-option,-mno-thumb-interwork,)\
)
diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
index 5e4789b145..11b80fb190 100644
--- a/arch/arm/include/asm/assembler.h
+++ b/arch/arm/include/asm/assembler.h
@@ -14,12 +14,14 @@
* assembler source.
*/
+#include <config.h>
+
/*
* Endian independent macros for shifting bytes within registers.
*/
#ifndef __ARMEB__
-#define pull lsr
-#define push lsl
+#define lspull lsr
+#define lspush lsl
#define get_byte_0 lsl #0
#define get_byte_1 lsr #8
#define get_byte_2 lsr #16
@@ -29,8 +31,8 @@
#define put_byte_2 lsl #16
#define put_byte_3 lsl #24
#else
-#define pull lsl
-#define push lsr
+#define lspull lsl
+#define lspush lsr
#define get_byte_0 lsr #24
#define get_byte_1 lsr #16
#define get_byte_2 lsr #8
@@ -54,7 +56,28 @@
#define PLD(code...)
#endif
+ .irp c,,eq,ne,cs,cc,mi,pl,vs,vc,hi,ls,ge,lt,gt,le,hs,lo
+ .macro ret\c, reg
+#if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__)
+ mov\c pc, \reg
+#else
+ .ifeqs "\reg", "lr"
+ bx\c \reg
+ .else
+ mov\c pc, \reg
+ .endif
+#endif
+ .endm
+ .endr
+
/*
- * Cache alligned
+ * Cache aligned, used for optimized memcpy/memset
+ * In the kernel this is only enabled for Feroceon CPUs.
+ * We disable it for Thumb builds in particular, since those instructions
+ * are not written in a Thumb-ready way.
*/
+#ifdef CONFIG_SYS_THUMB_BUILD
+#define CALGN(code...)
+#else
#define CALGN(code...) code
+#endif
diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S
index f655256b5d..eeaf003529 100644
--- a/arch/arm/lib/memcpy.S
+++ b/arch/arm/lib/memcpy.S
@@ -10,9 +10,14 @@
* published by the Free Software Foundation.
*/
+#include <linux/linkage.h>
#include <asm/assembler.h>
+#ifdef CONFIG_SYS_THUMB_BUILD
+#define W(instr) instr.w
+#else
#define W(instr) instr
+#endif
#define LDR1W_SHIFT 0
#define STR1W_SHIFT 0
@@ -30,7 +35,7 @@
.endm
.macro ldr1b ptr reg cond=al abort
- ldr\cond\()b \reg, [\ptr], #1
+ ldrb\cond\() \reg, [\ptr], #1
.endm
.macro str1w ptr reg abort
@@ -42,7 +47,7 @@
.endm
.macro str1b ptr reg cond=al abort
- str\cond\()b \reg, [\ptr], #1
+ strb\cond\() \reg, [\ptr], #1
.endm
.macro enter reg1 reg2
@@ -56,10 +61,12 @@
.text
/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */
-
-.globl memcpy
-memcpy:
-
+ .syntax unified
+#ifdef CONFIG_SYS_THUMB_BUILD
+ .thumb
+ .thumb_func
+#endif
+ENTRY(memcpy)
cmp r0, r1
moveq pc, lr
@@ -79,7 +86,7 @@ memcpy:
CALGN( ands ip, r0, #31 )
CALGN( rsb r3, ip, #32 )
- CALGN( sbcnes r4, r3, r2 ) @ C is always set here
+ CALGN( sbcsne r4, r3, r2 ) @ C is always set here
CALGN( bcs 2f )
CALGN( adr r4, 6f )
CALGN( subs r2, r2, r3 ) @ C gets set
@@ -178,7 +185,7 @@ memcpy:
CALGN( ands ip, r0, #31 )
CALGN( rsb ip, ip, #32 )
- CALGN( sbcnes r4, ip, r2 ) @ C is always set here
+ CALGN( sbcsne r4, ip, r2 ) @ C is always set here
CALGN( subcc r2, r2, ip )
CALGN( bcc 15f )
@@ -193,24 +200,24 @@ memcpy:
12: PLD( pld [r1, #124] )
13: ldr4w r1, r4, r5, r6, r7, abort=19f
- mov r3, lr, pull #\pull
+ mov r3, lr, lspull #\pull
subs r2, r2, #32
ldr4w r1, r8, r9, ip, lr, abort=19f
- orr r3, r3, r4, push #\push
- mov r4, r4, pull #\pull
- orr r4, r4, r5, push #\push
- mov r5, r5, pull #\pull
- orr r5, r5, r6, push #\push
- mov r6, r6, pull #\pull
- orr r6, r6, r7, push #\push
- mov r7, r7, pull #\pull
- orr r7, r7, r8, push #\push
- mov r8, r8, pull #\pull
- orr r8, r8, r9, push #\push
- mov r9, r9, pull #\pull
- orr r9, r9, ip, push #\push
- mov ip, ip, pull #\pull
- orr ip, ip, lr, push #\push
+ orr r3, r3, r4, lspush #\push
+ mov r4, r4, lspull #\pull
+ orr r4, r4, r5, lspush #\push
+ mov r5, r5, lspull #\pull
+ orr r5, r5, r6, lspush #\push
+ mov r6, r6, lspull #\pull
+ orr r6, r6, r7, lspush #\push
+ mov r7, r7, lspull #\pull
+ orr r7, r7, r8, lspush #\push
+ mov r8, r8, lspull #\pull
+ orr r8, r8, r9, lspush #\push
+ mov r9, r9, lspull #\pull
+ orr r9, r9, ip, lspush #\push
+ mov ip, ip, lspull #\pull
+ orr ip, ip, lr, lspush #\push
str8w r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f
bge 12b
PLD( cmn r2, #96 )
@@ -221,10 +228,10 @@ memcpy:
14: ands ip, r2, #28
beq 16f
-15: mov r3, lr, pull #\pull
+15: mov r3, lr, lspull #\pull
ldr1w r1, lr, abort=21f
subs ip, ip, #4
- orr r3, r3, lr, push #\push
+ orr r3, r3, lr, lspush #\push
str1w r0, r3, abort=21f
bgt 15b
CALGN( cmp r2, #0 )
@@ -241,3 +248,24 @@ memcpy:
17: forward_copy_shift pull=16 push=16
18: forward_copy_shift pull=24 push=8
+
+
+/*
+ * Abort preamble and completion macros.
+ * If a fixup handler is required then those macros must surround it.
+ * It is assumed that the fixup code will handle the private part of
+ * the exit macro.
+ */
+
+ .macro copy_abort_preamble
+19: ldmfd sp!, {r5 - r9}
+ b 21f
+20: ldmfd sp!, {r5 - r8}
+21:
+ .endm
+
+ .macro copy_abort_end
+ ldmfd sp!, {r4, pc}
+ .endm
+
+ENDPROC(memcpy)
diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S
index 0cdf89535a..7208f20dda 100644
--- a/arch/arm/lib/memset.S
+++ b/arch/arm/lib/memset.S
@@ -9,32 +9,25 @@
*
* ASM optimised string functions
*/
+#include <linux/linkage.h>
#include <asm/assembler.h>
.text
.align 5
- .word 0
-1: subs r2, r2, #4 @ 1 do we have enough
- blt 5f @ 1 bytes to align with?
- cmp r3, #2 @ 1
- strltb r1, [r0], #1 @ 1
- strleb r1, [r0], #1 @ 1
- strb r1, [r0], #1 @ 1
- add r2, r2, r3 @ 1 (r2 = r2 - (4 - r3))
-/*
- * The pointer is now aligned and the length is adjusted. Try doing the
- * memset again.
- */
-
-.globl memset
-memset:
+ .syntax unified
+#ifdef CONFIG_SYS_THUMB_BUILD
+ .thumb
+ .thumb_func
+#endif
+ENTRY(memset)
ands r3, r0, #3 @ 1 unaligned?
- bne 1b @ 1
+ mov ip, r0 @ preserve r0 as return value
+ bne 6f @ 1
/*
- * we know that the pointer in r0 is aligned to a word boundary.
+ * we know that the pointer in ip is aligned to a word boundary.
*/
- orr r1, r1, r1, lsl #8
+1: orr r1, r1, r1, lsl #8
orr r1, r1, r1, lsl #16
mov r3, r1
cmp r2, #16
@@ -43,29 +36,28 @@ memset:
#if ! CALGN(1)+0
/*
- * We need an extra register for this loop - save the return address and
- * use the LR
+ * We need 2 extra registers for this loop - use r8 and the LR
*/
- str lr, [sp, #-4]!
- mov ip, r1
+ stmfd sp!, {r8, lr}
+ mov r8, r1
mov lr, r1
2: subs r2, r2, #64
- stmgeia r0!, {r1, r3, ip, lr} @ 64 bytes at a time.
- stmgeia r0!, {r1, r3, ip, lr}
- stmgeia r0!, {r1, r3, ip, lr}
- stmgeia r0!, {r1, r3, ip, lr}
+ stmiage ip!, {r1, r3, r8, lr} @ 64 bytes at a time.
+ stmiage ip!, {r1, r3, r8, lr}
+ stmiage ip!, {r1, r3, r8, lr}
+ stmiage ip!, {r1, r3, r8, lr}
bgt 2b
- ldmeqfd sp!, {pc} @ Now <64 bytes to go.
+ ldmfdeq sp!, {r8, pc} @ Now <64 bytes to go.
/*
* No need to correct the count; we're only testing bits from now on
*/
tst r2, #32
- stmneia r0!, {r1, r3, ip, lr}
- stmneia r0!, {r1, r3, ip, lr}
+ stmiane ip!, {r1, r3, r8, lr}
+ stmiane ip!, {r1, r3, r8, lr}
tst r2, #16
- stmneia r0!, {r1, r3, ip, lr}
- ldr lr, [sp], #4
+ stmiane ip!, {r1, r3, r8, lr}
+ ldmfd sp!, {r8, lr}
#else
@@ -74,53 +66,63 @@ memset:
* whole cache lines at once.
*/
- stmfd sp!, {r4-r7, lr}
+ stmfd sp!, {r4-r8, lr}
mov r4, r1
mov r5, r1
mov r6, r1
mov r7, r1
- mov ip, r1
+ mov r8, r1
mov lr, r1
cmp r2, #96
- tstgt r0, #31
+ tstgt ip, #31
ble 3f
- and ip, r0, #31
- rsb ip, ip, #32
- sub r2, r2, ip
- movs ip, ip, lsl #(32 - 4)
- stmcsia r0!, {r4, r5, r6, r7}
- stmmiia r0!, {r4, r5}
- tst ip, #(1 << 30)
- mov ip, r1
- strne r1, [r0], #4
+ and r8, ip, #31
+ rsb r8, r8, #32
+ sub r2, r2, r8
+ movs r8, r8, lsl #(32 - 4)
+ stmiacs ip!, {r4, r5, r6, r7}
+ stmiami ip!, {r4, r5}
+ tst r8, #(1 << 30)
+ mov r8, r1
+ strne r1, [ip], #4
3: subs r2, r2, #64
- stmgeia r0!, {r1, r3-r7, ip, lr}
- stmgeia r0!, {r1, r3-r7, ip, lr}
+ stmiage ip!, {r1, r3-r8, lr}
+ stmiage ip!, {r1, r3-r8, lr}
bgt 3b
- ldmeqfd sp!, {r4-r7, pc}
+ ldmfdeq sp!, {r4-r8, pc}
tst r2, #32
- stmneia r0!, {r1, r3-r7, ip, lr}
+ stmiane ip!, {r1, r3-r8, lr}
tst r2, #16
- stmneia r0!, {r4-r7}
- ldmfd sp!, {r4-r7, lr}
+ stmiane ip!, {r4-r7}
+ ldmfd sp!, {r4-r8, lr}
#endif
4: tst r2, #8
- stmneia r0!, {r1, r3}
+ stmiane ip!, {r1, r3}
tst r2, #4
- strne r1, [r0], #4
+ strne r1, [ip], #4
/*
* When we get here, we've got less than 4 bytes to zero. We
* may have an unaligned pointer as well.
*/
5: tst r2, #2
- strneb r1, [r0], #1
- strneb r1, [r0], #1
+ strbne r1, [ip], #1
+ strbne r1, [ip], #1
tst r2, #1
- strneb r1, [r0], #1
- mov pc, lr
+ strbne r1, [ip], #1
+ ret lr
+
+6: subs r2, r2, #4 @ 1 do we have enough
+ blt 5b @ 1 bytes to align with?
+ cmp r3, #2 @ 1
+ strblt r1, [ip], #1 @ 1
+ strble r1, [ip], #1 @ 1
+ strb r1, [ip], #1 @ 1
+ add r2, r2, r3 @ 1 (r2 = r2 - (4 - r3))
+ b 1b
+ENDPROC(memset)