; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc --arm-memtransfer-tploop=allow -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve --verify-machineinstrs %s -o - | FileCheck %s

; Check that WLSTP loop is not generated for alignment < 4
; void test1(char* dest, char* src, int n){
;    memcpy(dest, src, n);
; }

declare void @llvm.memcpy.p0.p0.i32(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i32, i1 immarg)
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
declare void @llvm.memset.p0.i32(ptr nocapture writeonly, i8, i32, i1 immarg)

define void @test1(ptr noalias nocapture %X, ptr noalias nocapture readonly %Y, i32 %n){
; CHECK-LABEL: test1:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    bl __aeabi_memcpy
; CHECK-NEXT:    pop {r7, pc}
entry:
  call void @llvm.memcpy.p0.p0.i32(ptr align 1 %X, ptr align 1 %Y, i32 %n, i1 false)
  ret void
}


; Check that WLSTP loop is generated for alignment >= 4
; void test2(int* restrict X, int* restrict Y, int n){
;     memcpy(X, Y, n);
; }

define void @test2(ptr noalias %X, ptr noalias readonly %Y, i32 %n){
; CHECK-LABEL: test2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    wlstp.8 lr, r2, .LBB1_2
; CHECK-NEXT:  .LBB1_1: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
; CHECK-NEXT:    vstrb.8 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB1_1
; CHECK-NEXT:  .LBB1_2: @ %entry
; CHECK-NEXT:    pop {r7, pc}
entry:
  call void @llvm.memcpy.p0.p0.i32(ptr align 4 %X, ptr align 4 %Y, i32 %n, i1 false)
  ret void
}


; Checks that transform handles some arithmetic on the input arguments.
; void test3(int* restrict X, int* restrict Y, int n)
; {
;     memcpy(X+2, Y+3, (n*2)+10);
; }

define void @test3(ptr noalias nocapture %X, ptr noalias nocapture readonly %Y, i32 %n) {
; CHECK-LABEL: test3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    movs r3, #10
; CHECK-NEXT:    add.w r2, r3, r2, lsl #1
; CHECK-NEXT:    adds r1, #12
; CHECK-NEXT:    adds r0, #8
; CHECK-NEXT:    wlstp.8 lr, r2, .LBB2_2
; CHECK-NEXT:  .LBB2_1: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
; CHECK-NEXT:    vstrb.8 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB2_1
; CHECK-NEXT:  .LBB2_2: @ %entry
; CHECK-NEXT:    pop {r7, pc}
entry:
  %add.ptr = getelementptr inbounds i32, ptr %X, i32 2
  %add.ptr1 = getelementptr inbounds i32, ptr %Y, i32 3
  %mul = shl nsw i32 %n, 1
  %add = add nsw i32 %mul, 10
  call void @llvm.memcpy.p0.p0.i32(ptr nonnull align 4 %add.ptr, ptr nonnull align 4 %add.ptr1, i32 %add, i1 false)
  ret void
}


; Checks that transform handles for loops that are implicitly converted to mempcy
; void test4(int* restrict X, int* restrict Y, int n){
;     for(int i = 0; i < n; ++i){
;         X[i] = Y[i];
;     }
; }

define void @test4(ptr noalias %X, ptr noalias readonly %Y, i32 %n) {
; CHECK-LABEL: test4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB3_1: @ %for.body.preheader
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    wlstp.8 lr, r2, .LBB3_3
; CHECK-NEXT:  .LBB3_2: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
; CHECK-NEXT:    vstrb.8 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB3_2
; CHECK-NEXT:  .LBB3_3: @ %for.body.preheader
; CHECK-NEXT:    pop.w {r7, lr}
; CHECK-NEXT:    bx lr
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  call void @llvm.memcpy.p0.p0.i32(ptr align 4 %X, ptr align 4 %Y, i32 %n, i1 false)
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.body.preheader, %entry
  ret void
}

; Checks that transform can handle > i32 size inputs
define void @test5(ptr noalias %X, ptr noalias %Y, i64 %n){
; CHECK-LABEL: test5:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    wlstp.8 lr, r2, .LBB4_2
; CHECK-NEXT:  .LBB4_1: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
; CHECK-NEXT:    vstrb.8 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB4_1
; CHECK-NEXT:  .LBB4_2:
; CHECK-NEXT:    pop {r7, pc}
    call void @llvm.memcpy.p0.p0.i64(ptr align 4 %X, ptr align 4 %Y, i64 %n, i1 false)
    ret void
}

; Checks the transform is applied for constant size inputs below a certain threshold (128 in this case)
define void @test6(ptr noalias nocapture %X, ptr noalias nocapture readonly %Y, i32 %n) {
; CHECK-LABEL: test6:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    movs r2, #127
; CHECK-NEXT:    wlstp.8 lr, r2, .LBB5_2
; CHECK-NEXT:  .LBB5_1: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
; CHECK-NEXT:    vstrb.8 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB5_1
; CHECK-NEXT:  .LBB5_2: @ %entry
; CHECK-NEXT:    pop {r7, pc}
entry:
  call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 4 dereferenceable(127) %X, ptr noundef nonnull align 4 dereferenceable(127) %Y, i32 127, i1 false)
  ret void
}

; Checks the transform is NOT applied for constant size inputs above a certain threshold (128 in this case)
define void @test7(ptr noalias nocapture %X, ptr noalias nocapture readonly %Y, i32 %n) {
; CHECK-LABEL: test7:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    movs r2, #128
; CHECK-NEXT:    bl __aeabi_memcpy4
; CHECK-NEXT:    pop {r7, pc}
entry:
  call void @llvm.memcpy.p0.p0.i32(ptr align 4 %X, ptr align 4 %Y, i32 128, i1 false)
  ret void
}

; Checks the transform is NOT applied for constant size inputs below a certain threshold (64 in this case)
define void @test8(ptr noalias nocapture %X, ptr noalias nocapture readonly %Y, i32 %n) {
; CHECK-LABEL: test8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    ldm.w r1!, {r2, r3, r4, r12, lr}
; CHECK-NEXT:    stm.w r0!, {r2, r3, r4, r12, lr}
; CHECK-NEXT:    ldm.w r1!, {r2, r3, r4, r12, lr}
; CHECK-NEXT:    stm.w r0!, {r2, r3, r4, r12, lr}
; CHECK-NEXT:    ldm.w r1, {r2, r3, r4, r12, lr}
; CHECK-NEXT:    stm.w r0, {r2, r3, r4, r12, lr}
; CHECK-NEXT:    pop {r4, pc}
entry:
  call void @llvm.memcpy.p0.p0.i32(ptr align 4 %X, ptr align 4 %Y, i32 60, i1 false)
  ret void
}

; Checks the transform is NOT applied (regardless of alignment) when optimizations are disabled
define void @test9(ptr noalias nocapture %X, ptr noalias nocapture readonly %Y, i32 %n) #0 {
; CHECK-LABEL: test9:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    bl __aeabi_memcpy4
; CHECK-NEXT:    pop {r7, pc}
entry:
  call void @llvm.memcpy.p0.p0.i32(ptr align 4 %X, ptr align 4 %Y, i32 %n, i1 false)
  ret void
}

; Checks the transform is NOT applied (regardless of alignment) when optimization for size is on (-Os or -Oz)
define void @test10(ptr noalias nocapture %X, ptr noalias nocapture readonly %Y, i32 %n) #1 {
; CHECK-LABEL: test10:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    bl __aeabi_memcpy4
; CHECK-NEXT:    pop {r7, pc}
entry:
  call void @llvm.memcpy.p0.p0.i32(ptr align 4 %X, ptr align 4 %Y, i32 %n, i1 false)
  ret void
}

define void @test11(ptr nocapture %x, ptr nocapture %y, i32 %n) {
; CHECK-LABEL: test11:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp.w r2, #-1
; CHECK-NEXT:    it gt
; CHECK-NEXT:    bxgt lr
; CHECK-NEXT:  .LBB10_1: @ %prehead
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    mov r12, r1
; CHECK-NEXT:    mov r4, r0
; CHECK-NEXT:    wlstp.8 lr, r2, .LBB10_3
; CHECK-NEXT:  .LBB10_2: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r12