diff --git llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll index 424388a30e99..d1a303b41dee 100644 --- llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx940 < %s | FileCheck -check-prefix=GFX940 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s @@ -24,12 +24,12 @@ define float @local_atomic_fmax_ret_f32(ptr addrspace(3) %ptr, float %val) { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: local_atomic_fmax_ret_f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_max_rtn_f32 v0, v0, v1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: local_atomic_fmax_ret_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ds_max_rtn_f32 v0, v0, v1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmax_ret_f32: ; GFX11: ; %bb.0: @@ -96,12 +96,12 @@ define void @local_atomic_fmax_noret_f32(ptr addrspace(3) %ptr, float %val) { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: local_atomic_fmax_noret_f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_max_f32 v0, v1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: local_atomic_fmax_noret_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ds_max_f32 v0, v1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmax_noret_f32: ; GFX11: ; %bb.0: @@ -168,14 +168,14 @@ define double @local_atomic_fmax_ret_f64(ptr addrspace(3) %ptr, double %val) { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: local_atomic_fmax_ret_f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v1 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 -; GFX940-NEXT: ds_max_rtn_f64 v[0:1], v0, v[4:5] -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: local_atomic_fmax_ret_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: ds_max_rtn_f64 v[0:1], v0, v[4:5] +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmax_ret_f64: ; GFX11: ; %bb.0: @@ -244,14 +244,14 @@ define void @local_atomic_fmax_noret_f64(ptr addrspace(3) %ptr, double %val) { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: local_atomic_fmax_noret_f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v1 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 -; GFX940-NEXT: ds_max_f64 v0, v[4:5] -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: local_atomic_fmax_noret_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; 
GFX942-NEXT: ds_max_f64 v0, v[4:5] +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmax_noret_f64: ; GFX11: ; %bb.0: @@ -320,30 +320,30 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB4_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, 
v[0:1], v[4:5], off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB4_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -466,29 +466,29 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB5_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 
+; GFX942-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB5_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -626,14 +626,14 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -781,14 +781,14 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -911,30 +911,30 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB8_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; 
GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB8_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1053,29 +1053,29 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 -; 
GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB9_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB9_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1212,14 +1212,14 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1365,14 +1365,14 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1497,32 +1497,32 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s16 -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: 
buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f32_e32 v3, v1, v1 -; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v0, v3 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB12_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, s16 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX942-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-NEXT: v_max_f32_e32 v3, v1, v1 +; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v0, v3 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_cbranch_execnz .LBB12_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] +; 
GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1653,31 +1653,31 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s16 -; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f32_e32 v3, v0, v0 -; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX940-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, s16 +; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX942-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-NEXT: v_max_f32_e32 v3, v0, v0 +; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX942-NEXT: v_max_f32_e32 v0, v0, v3 +; 
GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_cbranch_execnz .LBB13_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1826,15 +1826,15 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s16 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, s16 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1996,15 +1996,15 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s16 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, s16 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: diff --git llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll index b52a39f1a55c..b8538cbf254f 100644 --- llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx940 < %s | FileCheck -check-prefix=GFX940 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s @@ -24,12 +24,12 @@ define float @local_atomic_fmin_ret_f32(ptr 
addrspace(3) %ptr, float %val) { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: local_atomic_fmin_ret_f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_min_rtn_f32 v0, v0, v1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: local_atomic_fmin_ret_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ds_min_rtn_f32 v0, v0, v1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmin_ret_f32: ; GFX11: ; %bb.0: @@ -96,12 +96,12 @@ define void @local_atomic_fmin_noret_f32(ptr addrspace(3) %ptr, float %val) { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: local_atomic_fmin_noret_f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_min_f32 v0, v1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: local_atomic_fmin_noret_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ds_min_f32 v0, v1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmin_noret_f32: ; GFX11: ; %bb.0: @@ -168,14 +168,14 @@ define double @local_atomic_fmin_ret_f64(ptr addrspace(3) %ptr, double %val) { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: local_atomic_fmin_ret_f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v1 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 -; GFX940-NEXT: ds_min_rtn_f64 v[0:1], v0, v[4:5] -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: local_atomic_fmin_ret_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; 
GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: ds_min_rtn_f64 v[0:1], v0, v[4:5] +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmin_ret_f64: ; GFX11: ; %bb.0: @@ -244,14 +244,14 @@ define void @local_atomic_fmin_noret_f64(ptr addrspace(3) %ptr, double %val) { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: local_atomic_fmin_noret_f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v1 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 -; GFX940-NEXT: ds_min_f64 v0, v[4:5] -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: local_atomic_fmin_noret_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: ds_min_f64 v0, v[4:5] +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmin_noret_f64: ; GFX11: ; %bb.0: @@ -320,30 +320,30 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, 
v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB4_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB4_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -466,29 +466,29 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; 
GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB5_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB5_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -626,14 +626,14 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -781,14 +781,14 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -911,30 +911,30 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB8_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB8_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; 
GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1053,29 +1053,29 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB9_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
+; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB9_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1212,14 +1212,14 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1365,14 +1365,14 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv 
sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1497,32 +1497,32 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s16 -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f32_e32 v3, v1, v1 -; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v0, v3 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB12_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX942-NEXT: v_mov_b32_e32 v2, s16 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX942-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-NEXT: v_max_f32_e32 v3, v1, v1 +; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v0, v3 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_cbranch_execnz .LBB12_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1653,31 +1653,31 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s16 -; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f32_e32 v3, v0, v0 -; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX940-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt 
vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, s16 +; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX942-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-NEXT: v_max_f32_e32 v3, v0, v0 +; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX942-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_cbranch_execnz .LBB13_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1826,15 +1826,15 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s16 -; 
GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, s16 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1996,15 +1996,15 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s16 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, s16 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: diff --git llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-no-rtn.ll llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-no-rtn.ll index 
0816eae28f61..714328a42d67 100644 --- llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-no-rtn.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-no-rtn.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908_GFX11 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908_GFX11 %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX12 %s @@ -20,19 +20,19 @@ define amdgpu_ps void @buffer_atomic_fadd_f32_offset_no_rtn(float %val, <4 x i32 ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_offset_no_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY 
$sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f32_offset_no_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 ; ; GFX12-LABEL: name: buffer_atomic_fadd_f32_offset_no_rtn ; GFX12: bb.1 (%ir-block.0): @@ -67,20 +67,20 @@ define amdgpu_ps void @buffer_atomic_fadd_f32_offen_no_rtn(float %val, <4 x i32> ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), 
align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_offen_no_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f32_offen_no_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: 
BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 ; ; GFX12-LABEL: name: buffer_atomic_fadd_f32_offen_no_rtn ; GFX12: bb.1 (%ir-block.0): @@ -116,20 +116,20 @@ define amdgpu_ps void @buffer_atomic_fadd_f32_idxen_no_rtn(float %val, <4 x i32> ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_idxen_no_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f32_idxen_no_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: 
[[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 ; ; GFX12-LABEL: name: buffer_atomic_fadd_f32_idxen_no_rtn ; GFX12: bb.1 (%ir-block.0): @@ -167,22 +167,22 @@ define amdgpu_ps void @buffer_atomic_fadd_f32_bothen_no_rtn(float %val, <4 x i32 ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_bothen_no_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: 
[[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f32_bothen_no_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 ; ; GFX12-LABEL: name: buffer_atomic_fadd_f32_bothen_no_rtn ; GFX12: bb.1 (%ir-block.0): @@ -219,19 +219,19 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_offset_no_rtn(float %val, ptr ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET 
[[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_no_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_no_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: 
BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 ; ; GFX12-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_no_rtn ; GFX12: bb.1 (%ir-block.0): @@ -266,20 +266,20 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_offen_no_rtn(float %val, ptr a ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_no_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_no_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: 
[[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 ; ; GFX12-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_no_rtn ; GFX12: bb.1 (%ir-block.0): @@ -315,20 +315,20 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_idxen_no_rtn(float %val, ptr a ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_no_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = 
REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_no_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 ; ; GFX12-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_no_rtn ; GFX12: bb.1 (%ir-block.0): @@ -366,22 +366,22 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_bothen_no_rtn(float %val, ptr ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_no_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; 
GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_no_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = 
REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 ; ; GFX12-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_no_rtn ; GFX12: bb.1 (%ir-block.0): diff --git llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-rtn.ll llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-rtn.ll index c0b84c914ce5..fb95d99e9f65 100644 --- llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-rtn.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-rtn.ll @@ -1,24 +1,24 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX11 %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX12 %s define amdgpu_ps float @buffer_atomic_fadd_f32_offset_rtn(float %val, <4 x i32> 
inreg %rsrc, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_offset_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f32_offset_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: 
[[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; ; GFX11-LABEL: name: buffer_atomic_fadd_f32_offset_rtn ; GFX11: bb.1 (%ir-block.0): @@ -54,21 +54,21 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_offset_rtn(float %val, <4 x i32> } define amdgpu_ps float @buffer_atomic_fadd_f32_offen_rtn(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_offen_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f32_offen_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; 
GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; ; GFX11-LABEL: name: buffer_atomic_fadd_f32_offen_rtn ; GFX11: bb.1 (%ir-block.0): @@ -106,21 +106,21 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_offen_rtn(float %val, <4 x i32> i } define amdgpu_ps float @buffer_atomic_fadd_f32_idxen_rtn(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_idxen_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f32_idxen_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; ; GFX11-LABEL: name: buffer_atomic_fadd_f32_idxen_rtn ; GFX11: bb.1 (%ir-block.0): @@ -158,23 +158,23 @@ define 
amdgpu_ps float @buffer_atomic_fadd_f32_idxen_rtn(float %val, <4 x i32> i } define amdgpu_ps float @buffer_atomic_fadd_f32_bothen_rtn(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_bothen_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f32_bothen_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 
= COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; ; GFX11-LABEL: name: buffer_atomic_fadd_f32_bothen_rtn ; GFX11: bb.1 (%ir-block.0): @@ -216,20 +216,20 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_bothen_rtn(float %val, <4 x i32> } define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offset_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = 
COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; ; GFX11-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_rtn ; GFX11: bb.1 (%ir-block.0): @@ -265,21 +265,21 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offset_rtn(float %val, ptr ad } define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offen_rtn(float 
%val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; 
GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; ; GFX11-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_rtn ; GFX11: bb.1 (%ir-block.0): @@ -317,21 +317,21 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offen_rtn(float %val, ptr add } define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_idxen_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 
8) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; ; GFX11-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_rtn ; GFX11: bb.1 (%ir-block.0): @@ -369,23 +369,23 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_idxen_rtn(float %val, ptr add } define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_bothen_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: 
[[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 
= REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; ; GFX11-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_rtn ; GFX11: bb.1 (%ir-block.0): diff --git llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f64.ll llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f64.ll index 7c4069b4b313..f71f573e5a79 100644 --- llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f64.ll @@ -1,391 +1,391 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s define amdgpu_ps void @buffer_atomic_fadd_f64_offset_no_rtn(double %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f64_offset_no_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; 
GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F64_OFFSET [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f64_offset_no_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + 
; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F64_OFFSET [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) ret void } define amdgpu_ps void @buffer_atomic_fadd_f64_offen_no_rtn(double %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f64_offen_no_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F64_OFFEN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f64_offen_no_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, 
$sgpr4, $vgpr0, $vgpr1, $vgpr2 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F64_OFFEN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } define amdgpu_ps void @buffer_atomic_fadd_f64_idxen_no_rtn(double %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f64_idxen_no_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; 
GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F64_IDXEN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f64_idxen_no_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F64_IDXEN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call double 
@llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void } define amdgpu_ps void @buffer_atomic_fadd_f64_bothen_no_rtn(double %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f64_bothen_no_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F64_BOTHEN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f64_bothen_no_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A_GFX942-NEXT: {{ 
$}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F64_BOTHEN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void } define amdgpu_ps double @buffer_atomic_fadd_f64_offset_rtn(double %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f64_offset_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - 
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f64_offset_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; 
GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1 + ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) ret double %ret } define amdgpu_ps double @buffer_atomic_fadd_f64_offen_rtn(double %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f64_offen_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; 
GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f64_offen_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; 
GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0 + ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1 + ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret double %ret } define amdgpu_ps double @buffer_atomic_fadd_f64_idxen_rtn(double %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f64_idxen_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = 
COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f64_idxen_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0 + ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1 + ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret double %ret } define amdgpu_ps double @buffer_atomic_fadd_f64_bothen_rtn(double %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f64_bothen_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, 
$sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit 
$sgpr1 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f64_bothen_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0 + ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1 + ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 
[[COPY10]], implicit $exec + ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret double %ret } define amdgpu_ps void @buffer_ptr_atomic_fadd_f64_offset_no_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f64_offset_no_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F64_OFFSET [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_offset_no_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: 
[[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F64_OFFSET [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) ret void } define amdgpu_ps void @buffer_ptr_atomic_fadd_f64_offen_no_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f64_offen_no_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; 
GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F64_OFFEN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_offen_no_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F64_OFFEN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } define amdgpu_ps void @buffer_ptr_atomic_fadd_f64_idxen_no_rtn(double %val, 
ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f64_idxen_no_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F64_IDXEN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_idxen_no_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; 
GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F64_IDXEN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void } define amdgpu_ps void @buffer_ptr_atomic_fadd_f64_bothen_no_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f64_bothen_no_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = 
REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F64_BOTHEN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_bothen_no_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F64_BOTHEN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: 
S_ENDPGM 0 %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void } define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_offset_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f64_offset_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = 
V_READFIRSTLANE_B32 [[COPY8]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_offset_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1 + ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; 
GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) ret double %ret } define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_offen_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f64_offen_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec - ; 
GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_offen_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0 + ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1 + ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GFX90A_GFX942-NEXT: $sgpr0 = COPY 
[[V_READFIRSTLANE_B32_]] + ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret double %ret } define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_idxen_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f64_idxen_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0 - ; 
GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_idxen_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0 + ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = 
COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1 + ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret double %ret } define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_bothen_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f64_bothen_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE 
[[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_bothen_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], 
%subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0 + ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1 + ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret double %ret } diff --git llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-no-rtn.ll llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-no-rtn.ll index 9514bea86e4d..3ef735ddb763 100644 --- llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-no-rtn.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-no-rtn.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < 
%s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s define amdgpu_ps void @buffer_atomic_fadd_v2f16_offset_no_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { ; GFX908-LABEL: name: buffer_atomic_fadd_v2f16_offset_no_rtn @@ -18,19 +18,19 @@ define amdgpu_ps void @buffer_atomic_fadd_v2f16_offset_no_rtn(<2 x half> %val, < ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_offset_no_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) - ; 
GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_v2f16_offset_no_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void } @@ -51,20 +51,20 @@ define amdgpu_ps void @buffer_atomic_fadd_v2f16_offen_no_rtn(<2 x half> %val, <4 ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_offen_no_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; 
GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_v2f16_offen_no_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } @@ -85,20 +85,20 @@ define amdgpu_ps void @buffer_atomic_fadd_v2f16_idxen_no_rtn(<2 x half> %val, <4 ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY]], [[COPY5]], 
[[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_idxen_no_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_v2f16_idxen_no_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: 
[[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void } @@ -121,22 +121,22 @@ define amdgpu_ps void @buffer_atomic_fadd_v2f16_bothen_no_rtn(<2 x half> %val, < ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_bothen_no_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, 
implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_v2f16_bothen_no_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void } @@ -156,19 +156,19 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_v2f16_offset_no_rtn(<2 x half> %va ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offset_no_rtn - ; GFX90A_GFX940: 
bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offset_no_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call <2 x half> 
@llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 4095, i32 %soffset, i32 0) ret void } @@ -189,20 +189,20 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_v2f16_offen_no_rtn(<2 x half> %val ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offen_no_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offen_no_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; 
GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } @@ -223,20 +223,20 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_v2f16_idxen_no_rtn(<2 x half> %val ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_v2f16_idxen_no_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], 
%subreg.sub3 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_idxen_no_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void } @@ -259,22 +259,22 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_v2f16_bothen_no_rtn(<2 x half> %va ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_v2f16_bothen_no_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; 
GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_bothen_no_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void } diff --git llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-rtn.ll llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-rtn.ll index 632ad55fdf89..756f287b7798 100644 --- llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-rtn.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-rtn.ll @@ -1,165 +1,165 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_offset_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_offset_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; 
GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_v2f16_offset_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, implicit 
$exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) ret <2 x half> %ret } define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_offen_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_offen_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_v2f16_offen_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: 
[[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret <2 x half> %ret } define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_idxen_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_idxen_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], 
%subreg.sub3 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_v2f16_idxen_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret <2 x half> %ret } define amdgpu_ps <2 x half> 
@buffer_atomic_fadd_v2f16_bothen_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_bothen_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 3, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_v2f16_bothen_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY 
$sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 3, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret <2 x half> %ret } define amdgpu_ps <2 x half> @buffer_ptr_atomic_fadd_v2f16_offset_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offset_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offset_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) ret <2 x half> %ret } define amdgpu_ps <2 x half> @buffer_ptr_atomic_fadd_v2f16_offen_rtn(<2 x half> %val, ptr addrspace(8) inreg 
%rsrc, i32 %voffset, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offen_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offen_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret <2 x half> %ret } define amdgpu_ps <2 x half> @buffer_ptr_atomic_fadd_v2f16_idxen_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_v2f16_idxen_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) - 
; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_idxen_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret <2 x half> %ret } define amdgpu_ps <2 x half> @buffer_ptr_atomic_fadd_v2f16_bothen_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_v2f16_bothen_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: 
[[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 3, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_bothen_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 3, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret <2 x half> %ret } diff --git llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll index 7a97ac8211f6..340e293cda7b 100644 --- llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll @@ -1,19 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX940 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX942 %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX11 %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX11 %s define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %data) { - ; GFX940-LABEL: name: 
flat_atomic_fadd_f32_no_rtn_intrinsic - ; GFX940: bb.1 (%ir-block.0): - ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX940-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr) - ; GFX940-NEXT: S_ENDPGM 0 + ; GFX942-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic + ; GFX942: bb.1 (%ir-block.0): + ; GFX942-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX942-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr) + ; GFX942-NEXT: S_ENDPGM 0 ; ; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic ; GFX11: bb.1 (%ir-block.0): @@ -30,17 +30,17 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %da } define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data) { - ; GFX940-LABEL: name: flat_atomic_fadd_f32_rtn_intrinsic - ; GFX940: bb.1 (%ir-block.0): - ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = 
COPY $vgpr2 - ; GFX940-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr) - ; GFX940-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] - ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX942-LABEL: name: flat_atomic_fadd_f32_rtn_intrinsic + ; GFX942: bb.1 (%ir-block.0): + ; GFX942-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX942-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr) + ; GFX942-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] + ; GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; ; GFX11-LABEL: name: flat_atomic_fadd_f32_rtn_intrinsic ; GFX11: bb.1 (%ir-block.0): @@ -58,16 +58,16 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data } define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_atomicrmw(ptr %ptr, float %data) { - ; GFX940-LABEL: name: flat_atomic_fadd_f32_no_rtn_atomicrmw - ; GFX940: bb.1 (%ir-block.0): - ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX940-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store 
syncscope("wavefront") monotonic (s32) on %ir.ptr) - ; GFX940-NEXT: S_ENDPGM 0 + ; GFX942-LABEL: name: flat_atomic_fadd_f32_no_rtn_atomicrmw + ; GFX942: bb.1 (%ir-block.0): + ; GFX942-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX942-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) + ; GFX942-NEXT: S_ENDPGM 0 ; ; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_atomicrmw ; GFX11: bb.1 (%ir-block.0): @@ -84,17 +84,17 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_atomicrmw(ptr %ptr, float %da } define amdgpu_ps float @flat_atomic_fadd_f32_rtn_atomicrmw(ptr %ptr, float %data) { - ; GFX940-LABEL: name: flat_atomic_fadd_f32_rtn_atomicrmw - ; GFX940: bb.1 (%ir-block.0): - ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX940-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) - ; GFX940-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] - ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX942-LABEL: name: flat_atomic_fadd_f32_rtn_atomicrmw + ; GFX942: bb.1 (%ir-block.0): + ; GFX942-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: 
[[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX942-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) + ; GFX942-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] + ; GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; ; GFX11-LABEL: name: flat_atomic_fadd_f32_rtn_atomicrmw ; GFX11: bb.1 (%ir-block.0): diff --git llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll index c9ab351f9401..a3562a18631d 100644 --- llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll @@ -1,43 +1,43 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(ptr %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_no_rtn_atomicrmw - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: 
[[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: FLAT_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: flat_atomic_fadd_f64_no_rtn_atomicrmw + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: FLAT_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_rtn_atomicrmw - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; 
GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr) - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX90A_GFX942-LABEL: name: flat_atomic_fadd_f64_rtn_atomicrmw + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A_GFX942-NEXT: 
[[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr) + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret double %ret } diff --git llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.v2f16.ll llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.v2f16.ll index 0896e4dc7af1..5909fe3d3694 100644 --- llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.v2f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.v2f16.ll @@ -1,66 +1,66 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX940 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX942 %s define amdgpu_ps <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { - ; GFX940-LABEL: name: flat_atomic_fadd_v2f16_rtn - ; GFX940: bb.1 (%ir-block.0): - ; GFX940-NEXT: 
liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX940-NEXT: [[FLAT_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_PK_ADD_F16_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (<2 x s16>) on %ir.ptr) - ; GFX940-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_PK_ADD_F16_RTN]] - ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX942-LABEL: name: flat_atomic_fadd_v2f16_rtn + ; GFX942: bb.1 (%ir-block.0): + ; GFX942-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX942-NEXT: [[FLAT_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_PK_ADD_F16_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (<2 x s16>) on %ir.ptr) + ; GFX942-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_PK_ADD_F16_RTN]] + ; GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = atomicrmw fadd ptr %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %ret } define amdgpu_ps <2 x half> @flat_atomic_fadd_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> %data) { - ; GFX940-LABEL: name: flat_atomic_fadd_v2f16_saddr_rtn - ; GFX940: bb.1 (%ir-block.0): - ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; 
GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX940-NEXT: [[FLAT_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_PK_ADD_F16_RTN [[COPY3]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (<2 x s16>) on %ir.ptr) - ; GFX940-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_PK_ADD_F16_RTN]] - ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX942-LABEL: name: flat_atomic_fadd_v2f16_saddr_rtn + ; GFX942: bb.1 (%ir-block.0): + ; GFX942-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX942-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX942-NEXT: [[FLAT_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_PK_ADD_F16_RTN [[COPY3]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (<2 x s16>) on %ir.ptr) + ; GFX942-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_PK_ADD_F16_RTN]] + ; GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = atomicrmw fadd ptr %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %ret } define amdgpu_ps void @flat_atomic_fadd_v2f16_no_rtn(ptr %ptr, <2 x half> %data) { - ; GFX940-LABEL: name: flat_atomic_fadd_v2f16_no_rtn - ; GFX940: bb.1 (%ir-block.0): - ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = 
REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX940-NEXT: FLAT_ATOMIC_PK_ADD_F16 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (<2 x s16>) on %ir.ptr) - ; GFX940-NEXT: S_ENDPGM 0 + ; GFX942-LABEL: name: flat_atomic_fadd_v2f16_no_rtn + ; GFX942: bb.1 (%ir-block.0): + ; GFX942-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX942-NEXT: FLAT_ATOMIC_PK_ADD_F16 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (<2 x s16>) on %ir.ptr) + ; GFX942-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } define amdgpu_ps void @flat_atomic_fadd_v2f16_saddr_no_rtn(ptr inreg %ptr, <2 x half> %data) { - ; GFX940-LABEL: name: flat_atomic_fadd_v2f16_saddr_no_rtn - ; GFX940: bb.1 (%ir-block.0): - ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX940-NEXT: FLAT_ATOMIC_PK_ADD_F16 [[COPY3]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (<2 x s16>) on %ir.ptr) - ; GFX940-NEXT: S_ENDPGM 0 + ; GFX942-LABEL: name: flat_atomic_fadd_v2f16_saddr_no_rtn + ; GFX942: bb.1 
(%ir-block.0): + ; GFX942-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX942-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX942-NEXT: FLAT_ATOMIC_PK_ADD_F16 [[COPY3]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (<2 x s16>) on %ir.ptr) + ; GFX942-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } diff --git llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll index 9b9249b62b0b..00c44c27257b 100644 --- llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll @@ -1,6 +1,6 @@ ; RUN: opt -passes=amdgpu-attributor -mcpu=gfx900 < %s | llc -mcpu=gfx900 | FileCheck -check-prefixes=GCN,RW-FLAT %s ; RUN: opt -passes=amdgpu-attributor -mcpu=gfx900 -mattr=+architected-flat-scratch < %s | llc | FileCheck -check-prefixes=GCN,RO-FLAT %s -; RUN: opt -passes=amdgpu-attributor -mcpu=gfx940 < %s | llc | FileCheck -check-prefixes=GCN,RO-FLAT %s +; RUN: opt -passes=amdgpu-attributor -mcpu=gfx942 < %s | llc | FileCheck -check-prefixes=GCN,RO-FLAT %s target triple = "amdgcn-amd-amdhsa" diff --git llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index 7cafa2f608a4..6a2b696a7fe7 100644 --- llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -1,13 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 
-global-isel -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -global-isel -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -global-isel -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -global-isel -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GFX942 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -global-isel -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=UNALIGNED_GFX9 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=UNALIGNED_GFX10 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=UNALIGNED_GFX940 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=UNALIGNED_GFX942 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=UNALIGNED_GFX11 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel 
-mattr=-unaligned-access-mode -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=UNALIGNED_GFX12 %s @@ -46,19 +46,19 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm ; -; GFX940-LABEL: store_load_sindex_kernel: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX940-NEXT: v_mov_b32_e32 v0, 15 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_lshl_b32 s1, s0, 2 -; GFX940-NEXT: s_and_b32 s0, s0, 15 -; GFX940-NEXT: s_lshl_b32 s0, s0, 2 -; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: store_load_sindex_kernel: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NEXT: v_mov_b32_e32 v0, 15 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_lshl_b32 s1, s0, 2 +; GFX942-NEXT: s_and_b32 s0, s0, 15 +; GFX942-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_sindex_kernel: ; GFX11: ; %bb.0: ; %bb @@ -122,19 +122,19 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: s_endpgm ; -; UNALIGNED_GFX940-LABEL: store_load_sindex_kernel: -; UNALIGNED_GFX940: ; %bb.0: ; %bb -; UNALIGNED_GFX940-NEXT: s_load_dword s0, s[4:5], 0x0 -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v0, 15 -; UNALIGNED_GFX940-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED_GFX940-NEXT: s_lshl_b32 s1, s0, 2 -; UNALIGNED_GFX940-NEXT: s_and_b32 s0, s0, 15 -; UNALIGNED_GFX940-NEXT: s_lshl_b32 s0, s0, 2 -; UNALIGNED_GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) 
-; UNALIGNED_GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: s_endpgm +; UNALIGNED_GFX942-LABEL: store_load_sindex_kernel: +; UNALIGNED_GFX942: ; %bb.0: ; %bb +; UNALIGNED_GFX942-NEXT: s_load_dword s0, s[4:5], 0x0 +; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 15 +; UNALIGNED_GFX942-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED_GFX942-NEXT: s_lshl_b32 s1, s0, 2 +; UNALIGNED_GFX942-NEXT: s_and_b32 s0, s0, 15 +; UNALIGNED_GFX942-NEXT: s_lshl_b32 s0, s0, 2 +; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: s_endpgm ; ; UNALIGNED_GFX11-LABEL: store_load_sindex_kernel: ; UNALIGNED_GFX11: ; %bb.0: ; %bb @@ -204,18 +204,18 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm ; -; GFX940-LABEL: store_load_vindex_kernel: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX940-NEXT: v_mov_b32_e32 v2, 15 -; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0 -; GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: store_load_vindex_kernel: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, 15 +; GFX942-NEXT: v_sub_u32_e32 v0, 0, v0 +; GFX942-NEXT: scratch_store_dword v1, v2, off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX942-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_endpgm ; ; 
GFX11-LABEL: store_load_vindex_kernel: ; GFX11: ; %bb.0: ; %bb @@ -274,18 +274,18 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: s_endpgm ; -; UNALIGNED_GFX940-LABEL: store_load_vindex_kernel: -; UNALIGNED_GFX940: ; %bb.0: ; %bb -; UNALIGNED_GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; UNALIGNED_GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v2, 15 -; UNALIGNED_GFX940-NEXT: v_sub_u32_e32 v0, 0, v0 -; UNALIGNED_GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; UNALIGNED_GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: s_endpgm +; UNALIGNED_GFX942-LABEL: store_load_vindex_kernel: +; UNALIGNED_GFX942: ; %bb.0: ; %bb +; UNALIGNED_GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; UNALIGNED_GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v2, 15 +; UNALIGNED_GFX942-NEXT: v_sub_u32_e32 v0, 0, v0 +; UNALIGNED_GFX942-NEXT: scratch_store_dword v1, v2, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; UNALIGNED_GFX942-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: s_endpgm ; ; UNALIGNED_GFX11-LABEL: store_load_vindex_kernel: ; UNALIGNED_GFX11: ; %bb.0: ; %bb @@ -356,19 +356,19 @@ define void @store_load_vindex_foo(i32 %idx) { ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: store_load_vindex_foo: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX940-NEXT: v_add_u32_e32 v1, s32, v1 -; GFX940-NEXT: v_mov_b32_e32 v2, 15 -; GFX940-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX940-NEXT: scratch_store_dword 
v1, v2, off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-NEXT: scratch_load_dword v0, v0, s32 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: store_load_vindex_foo: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX942-NEXT: v_add_u32_e32 v1, s32, v1 +; GFX942-NEXT: v_mov_b32_e32 v2, 15 +; GFX942-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX942-NEXT: scratch_store_dword v1, v2, off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX942-NEXT: scratch_load_dword v0, v0, s32 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: store_load_vindex_foo: ; GFX11: ; %bb.0: ; %bb @@ -432,19 +432,19 @@ define void @store_load_vindex_foo(i32 %idx) { ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: s_setpc_b64 s[30:31] ; -; UNALIGNED_GFX940-LABEL: store_load_vindex_foo: -; UNALIGNED_GFX940: ; %bb.0: ; %bb -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v1, s32, v1 -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v2, 15 -; UNALIGNED_GFX940-NEXT: v_and_b32_e32 v0, 15, v0 -; UNALIGNED_GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; UNALIGNED_GFX940-NEXT: scratch_load_dword v0, v0, s32 sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: s_setpc_b64 s[30:31] +; UNALIGNED_GFX942-LABEL: store_load_vindex_foo: +; UNALIGNED_GFX942: ; %bb.0: ; %bb +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; UNALIGNED_GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v1, s32, v1 +; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v2, 15 +; 
UNALIGNED_GFX942-NEXT: v_and_b32_e32 v0, 15, v0 +; UNALIGNED_GFX942-NEXT: scratch_store_dword v1, v2, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; UNALIGNED_GFX942-NEXT: scratch_load_dword v0, v0, s32 sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX11-LABEL: store_load_vindex_foo: ; UNALIGNED_GFX11: ; %bb.0: ; %bb @@ -503,13 +503,13 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) { ; GFX10-NEXT: scratch_store_dword v0, v1, off offset:4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: private_ptr_foo: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, 0x41200000 -; GFX940-NEXT: scratch_store_dword v0, v1, off offset:4 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: private_ptr_foo: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, 0x41200000 +; GFX942-NEXT: scratch_store_dword v0, v1, off offset:4 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: private_ptr_foo: ; GFX11: ; %bb.0: @@ -544,13 +544,13 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX10-NEXT: scratch_store_dword v0, v1, off offset:4 ; UNALIGNED_GFX10-NEXT: s_setpc_b64 s[30:31] ; -; UNALIGNED_GFX940-LABEL: private_ptr_foo: -; UNALIGNED_GFX940: ; %bb.0: -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v1, 0x41200000 -; UNALIGNED_GFX940-NEXT: scratch_store_dword v0, v1, off offset:4 sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: s_setpc_b64 s[30:31] +; UNALIGNED_GFX942-LABEL: private_ptr_foo: +; UNALIGNED_GFX942: ; %bb.0: +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; UNALIGNED_GFX942-NEXT: 
v_mov_b32_e32 v1, 0x41200000 +; UNALIGNED_GFX942-NEXT: scratch_store_dword v0, v1, off offset:4 sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX11-LABEL: private_ptr_foo: ; UNALIGNED_GFX11: ; %bb.0: @@ -617,23 +617,23 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm ; -; GFX940-LABEL: store_load_sindex_small_offset_kernel: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX940-NEXT: scratch_load_dword v0, off, off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, 15 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_lshl_b32 s1, s0, 2 -; GFX940-NEXT: s_and_b32 s0, s0, 15 -; GFX940-NEXT: s_addk_i32 s1, 0x100 -; GFX940-NEXT: s_lshl_b32 s0, s0, 2 -; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_addk_i32 s0, 0x100 -; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: store_load_sindex_small_offset_kernel: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NEXT: scratch_load_dword v0, off, off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 15 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_lshl_b32 s1, s0, 2 +; GFX942-NEXT: s_and_b32 s0, s0, 15 +; GFX942-NEXT: s_addk_i32 s1, 0x100 +; GFX942-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_addk_i32 s0, 0x100 +; GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_sindex_small_offset_kernel: ; GFX11: ; %bb.0: ; %bb @@ -713,23 +713,23 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; UNALIGNED_GFX10-NEXT: 
s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: s_endpgm ; -; UNALIGNED_GFX940-LABEL: store_load_sindex_small_offset_kernel: -; UNALIGNED_GFX940: ; %bb.0: ; %bb -; UNALIGNED_GFX940-NEXT: s_load_dword s0, s[4:5], 0x0 -; UNALIGNED_GFX940-NEXT: scratch_load_dword v0, off, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v0, 15 -; UNALIGNED_GFX940-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED_GFX940-NEXT: s_lshl_b32 s1, s0, 2 -; UNALIGNED_GFX940-NEXT: s_and_b32 s0, s0, 15 -; UNALIGNED_GFX940-NEXT: s_addk_i32 s1, 0x100 -; UNALIGNED_GFX940-NEXT: s_lshl_b32 s0, s0, 2 -; UNALIGNED_GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: s_addk_i32 s0, 0x100 -; UNALIGNED_GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: s_endpgm +; UNALIGNED_GFX942-LABEL: store_load_sindex_small_offset_kernel: +; UNALIGNED_GFX942: ; %bb.0: ; %bb +; UNALIGNED_GFX942-NEXT: s_load_dword s0, s[4:5], 0x0 +; UNALIGNED_GFX942-NEXT: scratch_load_dword v0, off, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 15 +; UNALIGNED_GFX942-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED_GFX942-NEXT: s_lshl_b32 s1, s0, 2 +; UNALIGNED_GFX942-NEXT: s_and_b32 s0, s0, 15 +; UNALIGNED_GFX942-NEXT: s_addk_i32 s1, 0x100 +; UNALIGNED_GFX942-NEXT: s_lshl_b32 s0, s0, 2 +; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: s_addk_i32 s0, 0x100 +; UNALIGNED_GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: s_endpgm ; ; UNALIGNED_GFX11-LABEL: store_load_sindex_small_offset_kernel: ; UNALIGNED_GFX11: ; %bb.0: ; %bb @@ -819,21 +819,21 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: 
s_endpgm ; -; GFX940-LABEL: store_load_vindex_small_offset_kernel: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: scratch_load_dword v1, off, off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0 -; GFX940-NEXT: v_mov_b32_e32 v2, 15 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-NEXT: scratch_store_dword v1, v2, off offset:256 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v0, 0x100, v0 -; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: store_load_vindex_small_offset_kernel: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: scratch_load_dword v1, off, off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX942-NEXT: v_sub_u32_e32 v0, 0, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, 15 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX942-NEXT: scratch_store_dword v1, v2, off offset:256 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_add_u32_e32 v0, 0x100, v0 +; GFX942-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_vindex_small_offset_kernel: ; GFX11: ; %bb.0: ; %bb @@ -906,21 +906,21 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: s_endpgm ; -; UNALIGNED_GFX940-LABEL: store_load_vindex_small_offset_kernel: -; UNALIGNED_GFX940: ; %bb.0: ; %bb -; UNALIGNED_GFX940-NEXT: scratch_load_dword v1, off, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; UNALIGNED_GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; UNALIGNED_GFX940-NEXT: v_sub_u32_e32 v0, 0, v0 -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v2, 15 -; 
UNALIGNED_GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; UNALIGNED_GFX940-NEXT: scratch_store_dword v1, v2, off offset:256 sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v0, 0x100, v0 -; UNALIGNED_GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: s_endpgm +; UNALIGNED_GFX942-LABEL: store_load_vindex_small_offset_kernel: +; UNALIGNED_GFX942: ; %bb.0: ; %bb +; UNALIGNED_GFX942-NEXT: scratch_load_dword v1, off, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; UNALIGNED_GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; UNALIGNED_GFX942-NEXT: v_sub_u32_e32 v0, 0, v0 +; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v2, 15 +; UNALIGNED_GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; UNALIGNED_GFX942-NEXT: scratch_store_dword v1, v2, off offset:256 sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v0, 0x100, v0 +; UNALIGNED_GFX942-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: s_endpgm ; ; UNALIGNED_GFX11-LABEL: store_load_vindex_small_offset_kernel: ; UNALIGNED_GFX11: ; %bb.0: ; %bb @@ -1007,22 +1007,22 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: store_load_vindex_small_offset_foo: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: scratch_load_dword v1, off, s32 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX940-NEXT: v_add_u32_e32 v1, s32, v1 -; GFX940-NEXT: v_add_u32_e32 v1, 0x100, v1 -; GFX940-NEXT: v_mov_b32_e32 v2, 15 -; GFX940-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 
v0, 2, v0 -; GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:256 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: store_load_vindex_small_offset_foo: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: scratch_load_dword v1, off, s32 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX942-NEXT: v_add_u32_e32 v1, s32, v1 +; GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1 +; GFX942-NEXT: v_mov_b32_e32 v2, 15 +; GFX942-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX942-NEXT: scratch_store_dword v1, v2, off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX942-NEXT: scratch_load_dword v0, v0, s32 offset:256 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: store_load_vindex_small_offset_foo: ; GFX11: ; %bb.0: ; %bb @@ -1098,22 +1098,22 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: s_setpc_b64 s[30:31] ; -; UNALIGNED_GFX940-LABEL: store_load_vindex_small_offset_foo: -; UNALIGNED_GFX940: ; %bb.0: ; %bb -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_dword v1, off, s32 sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v1, s32, v1 -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v1, 0x100, v1 -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v2, 15 -; UNALIGNED_GFX940-NEXT: v_and_b32_e32 v0, 15, v0 -; UNALIGNED_GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; UNALIGNED_GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:256 sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: s_setpc_b64 s[30:31] +; UNALIGNED_GFX942-LABEL: 
store_load_vindex_small_offset_foo: +; UNALIGNED_GFX942: ; %bb.0: ; %bb +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_dword v1, off, s32 sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v1, s32, v1 +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1 +; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v2, 15 +; UNALIGNED_GFX942-NEXT: v_and_b32_e32 v0, 15, v0 +; UNALIGNED_GFX942-NEXT: scratch_store_dword v1, v2, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; UNALIGNED_GFX942-NEXT: scratch_load_dword v0, v0, s32 offset:256 sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX11-LABEL: store_load_vindex_small_offset_foo: ; UNALIGNED_GFX11: ; %bb.0: ; %bb @@ -1206,23 +1206,23 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm ; -; GFX940-LABEL: store_load_sindex_large_offset_kernel: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, 15 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_lshl_b32 s1, s0, 2 -; GFX940-NEXT: s_and_b32 s0, s0, 15 -; GFX940-NEXT: s_addk_i32 s1, 0x4004 -; GFX940-NEXT: s_lshl_b32 s0, s0, 2 -; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_addk_i32 s0, 0x4004 -; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: store_load_sindex_large_offset_kernel: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 +; GFX942-NEXT: 
s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 15 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_lshl_b32 s1, s0, 2 +; GFX942-NEXT: s_and_b32 s0, s0, 15 +; GFX942-NEXT: s_addk_i32 s1, 0x4004 +; GFX942-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_addk_i32 s0, 0x4004 +; GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_sindex_large_offset_kernel: ; GFX11: ; %bb.0: ; %bb @@ -1302,23 +1302,23 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: s_endpgm ; -; UNALIGNED_GFX940-LABEL: store_load_sindex_large_offset_kernel: -; UNALIGNED_GFX940: ; %bb.0: ; %bb -; UNALIGNED_GFX940-NEXT: s_load_dword s0, s[4:5], 0x0 -; UNALIGNED_GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v0, 15 -; UNALIGNED_GFX940-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED_GFX940-NEXT: s_lshl_b32 s1, s0, 2 -; UNALIGNED_GFX940-NEXT: s_and_b32 s0, s0, 15 -; UNALIGNED_GFX940-NEXT: s_addk_i32 s1, 0x4004 -; UNALIGNED_GFX940-NEXT: s_lshl_b32 s0, s0, 2 -; UNALIGNED_GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: s_addk_i32 s0, 0x4004 -; UNALIGNED_GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: s_endpgm +; UNALIGNED_GFX942-LABEL: store_load_sindex_large_offset_kernel: +; UNALIGNED_GFX942: ; %bb.0: ; %bb +; UNALIGNED_GFX942-NEXT: s_load_dword s0, s[4:5], 0x0 +; UNALIGNED_GFX942-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 15 +; UNALIGNED_GFX942-NEXT: s_waitcnt lgkmcnt(0) +; 
UNALIGNED_GFX942-NEXT: s_lshl_b32 s1, s0, 2 +; UNALIGNED_GFX942-NEXT: s_and_b32 s0, s0, 15 +; UNALIGNED_GFX942-NEXT: s_addk_i32 s1, 0x4004 +; UNALIGNED_GFX942-NEXT: s_lshl_b32 s0, s0, 2 +; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: s_addk_i32 s0, 0x4004 +; UNALIGNED_GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: s_endpgm ; ; UNALIGNED_GFX11-LABEL: store_load_sindex_large_offset_kernel: ; UNALIGNED_GFX11: ; %bb.0: ; %bb @@ -1408,22 +1408,22 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm ; -; GFX940-LABEL: store_load_vindex_large_offset_kernel: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0 -; GFX940-NEXT: v_mov_b32_e32 v2, 15 -; GFX940-NEXT: s_movk_i32 s0, 0x4004 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-NEXT: scratch_store_dword v1, v2, s0 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v0, 0x4004, v0 -; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: store_load_vindex_large_offset_kernel: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX942-NEXT: v_sub_u32_e32 v0, 0, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, 15 +; GFX942-NEXT: s_movk_i32 s0, 0x4004 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX942-NEXT: scratch_store_dword v1, v2, s0 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_add_u32_e32 v0, 0x4004, v0 +; 
GFX942-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_vindex_large_offset_kernel: ; GFX11: ; %bb.0: ; %bb @@ -1497,22 +1497,22 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: s_endpgm ; -; UNALIGNED_GFX940-LABEL: store_load_vindex_large_offset_kernel: -; UNALIGNED_GFX940: ; %bb.0: ; %bb -; UNALIGNED_GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; UNALIGNED_GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; UNALIGNED_GFX940-NEXT: v_sub_u32_e32 v0, 0, v0 -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v2, 15 -; UNALIGNED_GFX940-NEXT: s_movk_i32 s0, 0x4004 -; UNALIGNED_GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; UNALIGNED_GFX940-NEXT: scratch_store_dword v1, v2, s0 sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v0, 0x4004, v0 -; UNALIGNED_GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: s_endpgm +; UNALIGNED_GFX942-LABEL: store_load_vindex_large_offset_kernel: +; UNALIGNED_GFX942: ; %bb.0: ; %bb +; UNALIGNED_GFX942-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; UNALIGNED_GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; UNALIGNED_GFX942-NEXT: v_sub_u32_e32 v0, 0, v0 +; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v2, 15 +; UNALIGNED_GFX942-NEXT: s_movk_i32 s0, 0x4004 +; UNALIGNED_GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; UNALIGNED_GFX942-NEXT: scratch_store_dword v1, v2, s0 sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v0, 0x4004, v0 +; UNALIGNED_GFX942-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 +; 
UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: s_endpgm ; ; UNALIGNED_GFX11-LABEL: store_load_vindex_large_offset_kernel: ; UNALIGNED_GFX11: ; %bb.0: ; %bb @@ -1600,23 +1600,23 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: store_load_vindex_large_offset_foo: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: scratch_load_dword v1, off, s32 offset:4 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX940-NEXT: v_add_u32_e32 v1, s32, v1 -; GFX940-NEXT: v_add_u32_e32 v1, 0x4004, v1 -; GFX940-NEXT: v_mov_b32_e32 v2, 15 -; GFX940-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-NEXT: s_add_i32 s0, s32, 0x4004 -; GFX940-NEXT: scratch_load_dword v0, v0, s0 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: store_load_vindex_large_offset_foo: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: scratch_load_dword v1, off, s32 offset:4 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX942-NEXT: v_add_u32_e32 v1, s32, v1 +; GFX942-NEXT: v_add_u32_e32 v1, 0x4004, v1 +; GFX942-NEXT: v_mov_b32_e32 v2, 15 +; GFX942-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX942-NEXT: scratch_store_dword v1, v2, off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX942-NEXT: s_add_i32 s0, s32, 0x4004 +; GFX942-NEXT: scratch_load_dword v0, v0, s0 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: store_load_vindex_large_offset_foo: ; GFX11: ; %bb.0: ; %bb @@ -1693,23 +1693,23 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; 
UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: s_setpc_b64 s[30:31] ; -; UNALIGNED_GFX940-LABEL: store_load_vindex_large_offset_foo: -; UNALIGNED_GFX940: ; %bb.0: ; %bb -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_dword v1, off, s32 offset:4 sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v1, s32, v1 -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v1, 0x4004, v1 -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v2, 15 -; UNALIGNED_GFX940-NEXT: v_and_b32_e32 v0, 15, v0 -; UNALIGNED_GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; UNALIGNED_GFX940-NEXT: s_add_i32 s0, s32, 0x4004 -; UNALIGNED_GFX940-NEXT: scratch_load_dword v0, v0, s0 sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: s_setpc_b64 s[30:31] +; UNALIGNED_GFX942-LABEL: store_load_vindex_large_offset_foo: +; UNALIGNED_GFX942: ; %bb.0: ; %bb +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_dword v1, off, s32 offset:4 sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v1, s32, v1 +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v1, 0x4004, v1 +; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v2, 15 +; UNALIGNED_GFX942-NEXT: v_and_b32_e32 v0, 15, v0 +; UNALIGNED_GFX942-NEXT: scratch_store_dword v1, v2, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; UNALIGNED_GFX942-NEXT: s_add_i32 s0, s32, 0x4004 +; UNALIGNED_GFX942-NEXT: scratch_load_dword v0, v0, s0 sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX11-LABEL: store_load_vindex_large_offset_foo: ; 
UNALIGNED_GFX11: ; %bb.0: ; %bb @@ -1796,19 +1796,19 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm ; -; GFX940-LABEL: store_load_large_imm_offset_kernel: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: v_mov_b32_e32 v0, 13 -; GFX940-NEXT: s_movk_i32 s0, 0x3e80 -; GFX940-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, 15 -; GFX940-NEXT: s_add_i32 s0, s0, 4 -; GFX940-NEXT: scratch_store_dword off, v0, s0 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: store_load_large_imm_offset_kernel: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: v_mov_b32_e32 v0, 13 +; GFX942-NEXT: s_movk_i32 s0, 0x3e80 +; GFX942-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 15 +; GFX942-NEXT: s_add_i32 s0, s0, 4 +; GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_large_imm_offset_kernel: ; GFX11: ; %bb.0: ; %bb @@ -1870,19 +1870,19 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: s_endpgm ; -; UNALIGNED_GFX940-LABEL: store_load_large_imm_offset_kernel: -; UNALIGNED_GFX940: ; %bb.0: ; %bb -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v0, 13 -; UNALIGNED_GFX940-NEXT: s_movk_i32 s0, 0x3e80 -; UNALIGNED_GFX940-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v0, 15 -; UNALIGNED_GFX940-NEXT: s_add_i32 s0, s0, 4 -; UNALIGNED_GFX940-NEXT: scratch_store_dword off, v0, s0 sc0 sc1 -; UNALIGNED_GFX940-NEXT: 
s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: s_endpgm +; UNALIGNED_GFX942-LABEL: store_load_large_imm_offset_kernel: +; UNALIGNED_GFX942: ; %bb.0: ; %bb +; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 13 +; UNALIGNED_GFX942-NEXT: s_movk_i32 s0, 0x3e80 +; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 15 +; UNALIGNED_GFX942-NEXT: s_add_i32 s0, s0, 4 +; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: s_endpgm ; ; UNALIGNED_GFX11-LABEL: store_load_large_imm_offset_kernel: ; UNALIGNED_GFX11: ; %bb.0: ; %bb @@ -1952,21 +1952,21 @@ define void @store_load_large_imm_offset_foo() { ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: store_load_large_imm_offset_foo: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0x3e80 -; GFX940-NEXT: v_mov_b32_e32 v0, 13 -; GFX940-NEXT: s_add_i32 s1, s32, s0 -; GFX940-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, 15 -; GFX940-NEXT: s_add_i32 s0, s1, 4 -; GFX940-NEXT: scratch_store_dword off, v0, s0 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: store_load_large_imm_offset_foo: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_movk_i32 s0, 0x3e80 +; GFX942-NEXT: v_mov_b32_e32 v0, 13 +; GFX942-NEXT: s_add_i32 s1, s32, s0 +; GFX942-NEXT: scratch_store_dword off, v0, s32 
offset:4 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 15 +; GFX942-NEXT: s_add_i32 s0, s1, 4 +; GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: store_load_large_imm_offset_foo: ; GFX11: ; %bb.0: ; %bb @@ -2033,21 +2033,21 @@ define void @store_load_large_imm_offset_foo() { ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: s_setpc_b64 s[30:31] ; -; UNALIGNED_GFX940-LABEL: store_load_large_imm_offset_foo: -; UNALIGNED_GFX940: ; %bb.0: ; %bb -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX940-NEXT: s_movk_i32 s0, 0x3e80 -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v0, 13 -; UNALIGNED_GFX940-NEXT: s_add_i32 s1, s32, s0 -; UNALIGNED_GFX940-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v0, 15 -; UNALIGNED_GFX940-NEXT: s_add_i32 s0, s1, 4 -; UNALIGNED_GFX940-NEXT: scratch_store_dword off, v0, s0 sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: s_setpc_b64 s[30:31] +; UNALIGNED_GFX942-LABEL: store_load_large_imm_offset_foo: +; UNALIGNED_GFX942: ; %bb.0: ; %bb +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; UNALIGNED_GFX942-NEXT: s_movk_i32 s0, 0x3e80 +; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 13 +; UNALIGNED_GFX942-NEXT: s_add_i32 s1, s32, s0 +; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 15 +; UNALIGNED_GFX942-NEXT: s_add_i32 s0, s1, 4 +; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; 
UNALIGNED_GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX11-LABEL: store_load_large_imm_offset_foo: ; UNALIGNED_GFX11: ; %bb.0: ; %bb @@ -2123,18 +2123,18 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm ; -; GFX940-LABEL: store_load_vidx_sidx_offset: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-NEXT: v_mov_b32_e32 v1, 15 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_add_lshl_u32 v0, s0, v0, 2 -; GFX940-NEXT: scratch_store_dword v0, v1, off offset:1024 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: scratch_load_dword v0, v0, off offset:1024 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: store_load_vidx_sidx_offset: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-NEXT: v_mov_b32_e32 v1, 15 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_add_lshl_u32 v0, s0, v0, 2 +; GFX942-NEXT: scratch_store_dword v0, v1, off offset:1024 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: scratch_load_dword v0, v0, off offset:1024 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_vidx_sidx_offset: ; GFX11: ; %bb.0: ; %bb @@ -2192,18 +2192,18 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: s_endpgm ; -; UNALIGNED_GFX940-LABEL: store_load_vidx_sidx_offset: -; UNALIGNED_GFX940: ; %bb.0: ; %bb -; UNALIGNED_GFX940-NEXT: s_load_dword s0, s[4:5], 0x0 -; UNALIGNED_GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v1, 15 -; UNALIGNED_GFX940-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED_GFX940-NEXT: v_add_lshl_u32 v0, 
s0, v0, 2 -; UNALIGNED_GFX940-NEXT: scratch_store_dword v0, v1, off offset:1024 sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_dword v0, v0, off offset:1024 sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: s_endpgm +; UNALIGNED_GFX942-LABEL: store_load_vidx_sidx_offset: +; UNALIGNED_GFX942: ; %bb.0: ; %bb +; UNALIGNED_GFX942-NEXT: s_load_dword s0, s[4:5], 0x0 +; UNALIGNED_GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v1, 15 +; UNALIGNED_GFX942-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED_GFX942-NEXT: v_add_lshl_u32 v0, s0, v0, 2 +; UNALIGNED_GFX942-NEXT: scratch_store_dword v0, v1, off offset:1024 sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_dword v0, v0, off offset:1024 sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: s_endpgm ; ; UNALIGNED_GFX11-LABEL: store_load_vidx_sidx_offset: ; UNALIGNED_GFX11: ; %bb.0: ; %bb @@ -2264,15 +2264,15 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) { ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: store_load_i64_aligned: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[2:3], 15 -; GFX940-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: store_load_i64_aligned: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[2:3], 15 +; GFX942-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
store_load_i64_aligned: ; GFX11: ; %bb.0: ; %bb @@ -2323,15 +2323,15 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: s_setpc_b64 s[30:31] ; -; UNALIGNED_GFX940-LABEL: store_load_i64_aligned: -; UNALIGNED_GFX940: ; %bb.0: ; %bb -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX940-NEXT: v_mov_b64_e32 v[2:3], 15 -; UNALIGNED_GFX940-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: s_setpc_b64 s[30:31] +; UNALIGNED_GFX942-LABEL: store_load_i64_aligned: +; UNALIGNED_GFX942: ; %bb.0: ; %bb +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; UNALIGNED_GFX942-NEXT: v_mov_b64_e32 v[2:3], 15 +; UNALIGNED_GFX942-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX11-LABEL: store_load_i64_aligned: ; UNALIGNED_GFX11: ; %bb.0: ; %bb @@ -2388,15 +2388,15 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) { ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: store_load_i64_unaligned: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[2:3], 15 -; GFX940-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: store_load_i64_unaligned: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: 
v_mov_b64_e32 v[2:3], 15 +; GFX942-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: store_load_i64_unaligned: ; GFX11: ; %bb.0: ; %bb @@ -2525,59 +2525,59 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: s_setpc_b64 s[30:31] ; -; UNALIGNED_GFX940-LABEL: store_load_i64_unaligned: -; UNALIGNED_GFX940: ; %bb.0: ; %bb -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v4, 15 -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v1, 4, v0 -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v2, 2, v0 -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v3, 1, v0 -; UNALIGNED_GFX940-NEXT: scratch_store_byte v0, v4, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v4, 0 -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v6, 6, v0 -; UNALIGNED_GFX940-NEXT: scratch_store_byte v3, v4, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v5, 3, v0 -; UNALIGNED_GFX940-NEXT: scratch_store_byte v2, v4, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_store_byte v5, v4, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v7, 5, v0 -; UNALIGNED_GFX940-NEXT: scratch_store_byte v1, v4, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_store_byte v7, v4, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v8, 7, v0 -; UNALIGNED_GFX940-NEXT: scratch_store_byte v6, v4, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_store_byte v8, v4, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; 
UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v0, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr7 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr2 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr6 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr1 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr3 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr5 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr8 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr0 -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v3, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v2, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v5, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v1, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v7, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v6, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v8, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: s_setpc_b64 s[30:31] +; UNALIGNED_GFX942-LABEL: store_load_i64_unaligned: +; UNALIGNED_GFX942: ; %bb.0: ; %bb +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v4, 15 +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v1, 4, v0 +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v2, 2, v0 +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v3, 1, v0 +; UNALIGNED_GFX942-NEXT: scratch_store_byte v0, v4, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v4, 0 +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v6, 6, v0 +; UNALIGNED_GFX942-NEXT: scratch_store_byte v3, v4, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; 
UNALIGNED_GFX942-NEXT: v_add_u32_e32 v5, 3, v0 +; UNALIGNED_GFX942-NEXT: scratch_store_byte v2, v4, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_store_byte v5, v4, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v7, 5, v0 +; UNALIGNED_GFX942-NEXT: scratch_store_byte v1, v4, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_store_byte v7, v4, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v8, 7, v0 +; UNALIGNED_GFX942-NEXT: scratch_store_byte v6, v4, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_store_byte v8, v4, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v4, v0, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr7 +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr6 +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr3 +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr0 +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v4, v3, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v4, v2, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v4, v5, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v4, v1, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v4, v7, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v4, v6, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: 
scratch_load_ubyte v4, v8, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX11-LABEL: store_load_i64_unaligned: ; UNALIGNED_GFX11: ; %bb.0: ; %bb @@ -2703,20 +2703,20 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) { ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: store_load_v3i32_unaligned: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b32 s2, 3 -; GFX940-NEXT: s_mov_b32 s1, 2 -; GFX940-NEXT: s_mov_b32 s0, 1 -; GFX940-NEXT: v_mov_b32_e32 v4, s2 -; GFX940-NEXT: v_mov_b32_e32 v3, s1 -; GFX940-NEXT: v_mov_b32_e32 v2, s0 -; GFX940-NEXT: scratch_store_dwordx3 v0, v[2:4], off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: scratch_load_dwordx3 v[0:2], v0, off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: store_load_v3i32_unaligned: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b32 s2, 3 +; GFX942-NEXT: s_mov_b32 s1, 2 +; GFX942-NEXT: s_mov_b32 s0, 1 +; GFX942-NEXT: v_mov_b32_e32 v4, s2 +; GFX942-NEXT: v_mov_b32_e32 v3, s1 +; GFX942-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NEXT: scratch_store_dwordx3 v0, v[2:4], off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: scratch_load_dwordx3 v[0:2], v0, off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: store_load_v3i32_unaligned: ; GFX11: ; %bb.0: ; %bb @@ -2900,85 +2900,85 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: s_setpc_b64 s[30:31] ; -; UNALIGNED_GFX940-LABEL: store_load_v3i32_unaligned: -; UNALIGNED_GFX940: ; %bb.0: ; %bb -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v3, 1 -; 
UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v1, 2 -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v2, 2, v0 -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v4, 1, v0 -; UNALIGNED_GFX940-NEXT: scratch_store_byte v0, v3, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v3, 0 -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v6, 4, v0 -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v7, 6, v0 -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v9, 8, v0 -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v10, 10, v0 -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v12, 3 -; UNALIGNED_GFX940-NEXT: scratch_store_byte v4, v3, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v5, 3, v0 -; UNALIGNED_GFX940-NEXT: scratch_store_byte v2, v3, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_store_byte v5, v3, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v8, 5, v0 -; UNALIGNED_GFX940-NEXT: scratch_store_byte v6, v1, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_store_byte v8, v3, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v1, 7, v0 -; UNALIGNED_GFX940-NEXT: scratch_store_byte v7, v3, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_store_byte v1, v3, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v11, 9, v0 -; UNALIGNED_GFX940-NEXT: scratch_store_byte v9, v12, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_store_byte v11, v3, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v12, 11, v0 -; UNALIGNED_GFX940-NEXT: scratch_store_byte v10, v3, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_store_byte v12, v3, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: 
s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v0, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr12 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr4 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr11 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr7 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr6 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr10 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr5 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr9 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr1 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr8 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr2 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr0 -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v4, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v2, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v5, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v6, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v8, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v7, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v1, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v9, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v11, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v10, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v12, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: s_setpc_b64 s[30:31] +; UNALIGNED_GFX942-LABEL: 
store_load_v3i32_unaligned: +; UNALIGNED_GFX942: ; %bb.0: ; %bb +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v3, 1 +; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v1, 2 +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v2, 2, v0 +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v4, 1, v0 +; UNALIGNED_GFX942-NEXT: scratch_store_byte v0, v3, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v3, 0 +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v6, 4, v0 +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v7, 6, v0 +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v9, 8, v0 +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v10, 10, v0 +; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v12, 3 +; UNALIGNED_GFX942-NEXT: scratch_store_byte v4, v3, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v5, 3, v0 +; UNALIGNED_GFX942-NEXT: scratch_store_byte v2, v3, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_store_byte v5, v3, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v8, 5, v0 +; UNALIGNED_GFX942-NEXT: scratch_store_byte v6, v1, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_store_byte v8, v3, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v1, 7, v0 +; UNALIGNED_GFX942-NEXT: scratch_store_byte v7, v3, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_store_byte v1, v3, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v11, 9, v0 +; UNALIGNED_GFX942-NEXT: scratch_store_byte v9, v12, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_store_byte v11, v3, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v12, 11, v0 +; UNALIGNED_GFX942-NEXT: 
scratch_store_byte v10, v3, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_store_byte v12, v3, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v3, v0, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr12 +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr4 +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr11 +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr7 +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr6 +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr10 +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr9 +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr0 +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v3, v4, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v3, v2, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v3, v5, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v3, v6, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v3, v8, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v3, v7, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v3, v1, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v3, v9, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v3, v11, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v3, v10, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; 
UNALIGNED_GFX942-NEXT: scratch_load_ubyte v3, v12, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX11-LABEL: store_load_v3i32_unaligned: ; UNALIGNED_GFX11: ; %bb.0: ; %bb @@ -3146,20 +3146,20 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) { ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: store_load_v4i32_unaligned: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b32 s3, 4 -; GFX940-NEXT: s_mov_b32 s2, 3 -; GFX940-NEXT: s_mov_b32 s1, 2 -; GFX940-NEXT: s_mov_b32 s0, 1 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], s[2:3] -; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX940-NEXT: scratch_store_dwordx4 v0, v[2:5], off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: scratch_load_dwordx4 v[0:3], v0, off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: store_load_v4i32_unaligned: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b32 s3, 4 +; GFX942-NEXT: s_mov_b32 s2, 3 +; GFX942-NEXT: s_mov_b32 s1, 2 +; GFX942-NEXT: s_mov_b32 s0, 1 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX942-NEXT: scratch_store_dwordx4 v0, v[2:5], off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: scratch_load_dwordx4 v[0:3], v0, off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: store_load_v4i32_unaligned: ; GFX11: ; %bb.0: ; %bb @@ -3390,109 +3390,109 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: s_setpc_b64 s[30:31] ; -; UNALIGNED_GFX940-LABEL: store_load_v4i32_unaligned: -; UNALIGNED_GFX940: ; %bb.0: ; %bb -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v3, 1 -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v1, 2 -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v2, 2, v0 -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v4, 1, v0 -; UNALIGNED_GFX940-NEXT: scratch_store_byte v0, v3, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v3, 0 -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v6, 4 -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v7, 4, v0 -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v8, 6, v0 -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v10, 8, v0 -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v11, 10, v0 -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v13, 3 -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v14, 12, v0 -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v15, 14, v0 -; UNALIGNED_GFX940-NEXT: scratch_store_byte v4, v3, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v5, 3, v0 -; UNALIGNED_GFX940-NEXT: scratch_store_byte v2, v3, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_store_byte v5, v3, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v9, 5, v0 -; UNALIGNED_GFX940-NEXT: scratch_store_byte v7, v1, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_store_byte v9, v3, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v1, 7, v0 -; UNALIGNED_GFX940-NEXT: scratch_store_byte v8, v3, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_store_byte v1, v3, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v12, 9, v0 -; UNALIGNED_GFX940-NEXT: scratch_store_byte v10, v13, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_store_byte v12, v3, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v13, 11, v0 -; 
UNALIGNED_GFX940-NEXT: scratch_store_byte v11, v3, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_store_byte v13, v3, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v16, 13, v0 -; UNALIGNED_GFX940-NEXT: scratch_store_byte v14, v6, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_store_byte v16, v3, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v6, 15, v0 -; UNALIGNED_GFX940-NEXT: scratch_store_byte v15, v3, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_store_byte v6, v3, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v0, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v4, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v2, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v5, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v7, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v9, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v8, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v1, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v10, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v12, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v11, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; 
UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v13, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v14, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v16, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v15, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr2 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr1 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr9 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr16 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr11 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr4 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr15 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr10 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr7 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr13 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr5 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr14 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr12 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr8 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr0 -; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v0, v6, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: s_setpc_b64 s[30:31] +; UNALIGNED_GFX942-LABEL: store_load_v4i32_unaligned: +; UNALIGNED_GFX942: ; %bb.0: ; %bb +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v3, 1 +; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v1, 2 +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v2, 2, v0 +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v4, 1, v0 +; UNALIGNED_GFX942-NEXT: scratch_store_byte v0, v3, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v3, 0 +; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v6, 4 +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v7, 4, v0 +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v8, 6, v0 +; 
UNALIGNED_GFX942-NEXT: v_add_u32_e32 v10, 8, v0 +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v11, 10, v0 +; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v13, 3 +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v14, 12, v0 +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v15, 14, v0 +; UNALIGNED_GFX942-NEXT: scratch_store_byte v4, v3, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v5, 3, v0 +; UNALIGNED_GFX942-NEXT: scratch_store_byte v2, v3, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_store_byte v5, v3, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v9, 5, v0 +; UNALIGNED_GFX942-NEXT: scratch_store_byte v7, v1, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_store_byte v9, v3, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v1, 7, v0 +; UNALIGNED_GFX942-NEXT: scratch_store_byte v8, v3, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_store_byte v1, v3, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v12, 9, v0 +; UNALIGNED_GFX942-NEXT: scratch_store_byte v10, v13, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_store_byte v12, v3, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v13, 11, v0 +; UNALIGNED_GFX942-NEXT: scratch_store_byte v11, v3, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_store_byte v13, v3, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v16, 13, v0 +; UNALIGNED_GFX942-NEXT: scratch_store_byte v14, v6, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_store_byte v16, v3, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; 
UNALIGNED_GFX942-NEXT: v_add_u32_e32 v6, 15, v0 +; UNALIGNED_GFX942-NEXT: scratch_store_byte v15, v3, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_store_byte v6, v3, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v3, v0, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v3, v4, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v3, v2, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v3, v5, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v3, v7, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v3, v9, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v3, v8, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v3, v1, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v3, v10, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v3, v12, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v3, v11, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v3, v13, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v3, v14, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v3, v16, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v3, v15, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX942-NEXT: 
; kill: killed $vgpr1 +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr9 +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr16 +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr11 +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr4 +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr15 +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr10 +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr7 +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr13 +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr14 +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr12 +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX942-NEXT: ; kill: killed $vgpr0 +; UNALIGNED_GFX942-NEXT: scratch_load_ubyte v0, v6, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX11-LABEL: store_load_v4i32_unaligned: ; UNALIGNED_GFX11: ; %bb.0: ; %bb @@ -3685,13 +3685,13 @@ define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspa ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm ; -; GFX940-LABEL: sgpr_base_large_offset: -; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_add_u32 s0, s0, 0xffe8 -; GFX940-NEXT: scratch_load_dword v2, off, s0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: sgpr_base_large_offset: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_add_u32 s0, s0, 0xffe8 +; GFX942-NEXT: scratch_load_dword v2, off, s0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dword v[0:1], v2, off sc0 sc1 +; GFX942-NEXT: s_endpgm ; ; GFX11-LABEL: sgpr_base_large_offset: ; GFX11: ; %bb.0: ; %entry @@ -3730,13 +3730,13 @@ define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspa ; UNALIGNED_GFX10-NEXT: global_store_dword v[0:1], v2, off ; UNALIGNED_GFX10-NEXT: s_endpgm ; -; UNALIGNED_GFX940-LABEL: sgpr_base_large_offset: -; UNALIGNED_GFX940: ; %bb.0: ; %entry -; 
UNALIGNED_GFX940-NEXT: s_add_u32 s0, s0, 0xffe8 -; UNALIGNED_GFX940-NEXT: scratch_load_dword v2, off, s0 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_endpgm +; UNALIGNED_GFX942-LABEL: sgpr_base_large_offset: +; UNALIGNED_GFX942: ; %bb.0: ; %entry +; UNALIGNED_GFX942-NEXT: s_add_u32 s0, s0, 0xffe8 +; UNALIGNED_GFX942-NEXT: scratch_load_dword v2, off, s0 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: global_store_dword v[0:1], v2, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_endpgm ; ; UNALIGNED_GFX11-LABEL: sgpr_base_large_offset: ; UNALIGNED_GFX11: ; %bb.0: ; %entry @@ -3784,14 +3784,14 @@ define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr a ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm ; -; GFX940-LABEL: sgpr_base_large_offset_split: -; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_and_b32 s0, s0, -4 -; GFX940-NEXT: s_add_u32 s0, s0, 0x100ffe8 -; GFX940-NEXT: scratch_load_dword v2, off, s0 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: sgpr_base_large_offset_split: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_and_b32 s0, s0, -4 +; GFX942-NEXT: s_add_u32 s0, s0, 0x100ffe8 +; GFX942-NEXT: scratch_load_dword v2, off, s0 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dword v[0:1], v2, off sc0 sc1 +; GFX942-NEXT: s_endpgm ; ; GFX11-LABEL: sgpr_base_large_offset_split: ; GFX11: ; %bb.0: ; %entry @@ -3837,14 +3837,14 @@ define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr a ; UNALIGNED_GFX10-NEXT: global_store_dword v[0:1], v2, off ; UNALIGNED_GFX10-NEXT: s_endpgm ; -; UNALIGNED_GFX940-LABEL: sgpr_base_large_offset_split: -; UNALIGNED_GFX940: ; %bb.0: ; %entry -; UNALIGNED_GFX940-NEXT: s_and_b32 s0, s0, -4 -; UNALIGNED_GFX940-NEXT: s_add_u32 s0, s0, 0x100ffe8 
-; UNALIGNED_GFX940-NEXT: scratch_load_dword v2, off, s0 sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_endpgm +; UNALIGNED_GFX942-LABEL: sgpr_base_large_offset_split: +; UNALIGNED_GFX942: ; %bb.0: ; %entry +; UNALIGNED_GFX942-NEXT: s_and_b32 s0, s0, -4 +; UNALIGNED_GFX942-NEXT: s_add_u32 s0, s0, 0x100ffe8 +; UNALIGNED_GFX942-NEXT: scratch_load_dword v2, off, s0 sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: global_store_dword v[0:1], v2, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_endpgm ; ; UNALIGNED_GFX11-LABEL: sgpr_base_large_offset_split: ; UNALIGNED_GFX11: ; %bb.0: ; %entry @@ -3902,15 +3902,15 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset(ptr a ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; -; GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: v_add_u32_e32 v0, s1, v0 -; GFX940-NEXT: v_mov_b32_e32 v1, 0xffe8 -; GFX940-NEXT: v_add3_u32 v0, s0, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, 15 -; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: v_add_u32_e32 v0, s1, v0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0xffe8 +; GFX942-NEXT: v_add3_u32 v0, s0, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v1, 15 +; GFX942-NEXT: scratch_store_dword v0, v1, off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_endpgm ; ; GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: ; GFX11: ; %bb.0: ; %bb @@ -3955,15 +3955,15 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset(ptr a ; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; UNALIGNED_GFX10-NEXT: s_endpgm ; -; UNALIGNED_GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: 
-; UNALIGNED_GFX940: ; %bb.0: ; %bb -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v0, s1, v0 -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v1, 0xffe8 -; UNALIGNED_GFX940-NEXT: v_add3_u32 v0, s0, v0, v1 -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v1, 15 -; UNALIGNED_GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: s_endpgm +; UNALIGNED_GFX942-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: +; UNALIGNED_GFX942: ; %bb.0: ; %bb +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v0, s1, v0 +; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v1, 0xffe8 +; UNALIGNED_GFX942-NEXT: v_add3_u32 v0, s0, v0, v1 +; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v1, 15 +; UNALIGNED_GFX942-NEXT: scratch_store_dword v0, v1, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: s_endpgm ; ; UNALIGNED_GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: ; UNALIGNED_GFX11: ; %bb.0: ; %bb @@ -4015,14 +4015,14 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset(pt ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; -; GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: v_add_u32_e32 v0, s1, v0 -; GFX940-NEXT: v_add3_u32 v0, s0, v0, -16 -; GFX940-NEXT: v_mov_b32_e32 v1, 15 -; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: v_add_u32_e32 v0, s1, v0 +; GFX942-NEXT: v_add3_u32 v0, s0, v0, -16 +; GFX942-NEXT: v_mov_b32_e32 v1, 15 +; GFX942-NEXT: scratch_store_dword v0, v1, off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_endpgm ; ; GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: ; GFX11: ; %bb.0: ; %bb @@ -4066,14 +4066,14 @@ define amdgpu_gs void 
@sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset(pt ; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; UNALIGNED_GFX10-NEXT: s_endpgm ; -; UNALIGNED_GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: -; UNALIGNED_GFX940: ; %bb.0: ; %bb -; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v0, s1, v0 -; UNALIGNED_GFX940-NEXT: v_add3_u32 v0, s0, v0, -16 -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v1, 15 -; UNALIGNED_GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: s_endpgm +; UNALIGNED_GFX942-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: +; UNALIGNED_GFX942: ; %bb.0: ; %bb +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v0, s1, v0 +; UNALIGNED_GFX942-NEXT: v_add3_u32 v0, s0, v0, -16 +; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v1, 15 +; UNALIGNED_GFX942-NEXT: scratch_store_dword v0, v1, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: s_endpgm ; ; UNALIGNED_GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: ; UNALIGNED_GFX11: ; %bb.0: ; %bb @@ -4122,13 +4122,13 @@ define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addr ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm ; -; GFX940-LABEL: sgpr_base_negative_offset: -; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_add_u32 s0, s0, 0xffffffe8 -; GFX940-NEXT: scratch_load_dword v2, off, s0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: sgpr_base_negative_offset: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_add_u32 s0, s0, 0xffffffe8 +; GFX942-NEXT: scratch_load_dword v2, off, s0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dword v[0:1], v2, off sc0 sc1 +; GFX942-NEXT: s_endpgm ; ; GFX11-LABEL: sgpr_base_negative_offset: ; GFX11: ; %bb.0: ; %entry @@ -4165,13 +4165,13 @@ define amdgpu_gs void @sgpr_base_negative_offset(ptr 
addrspace(1) %out, ptr addr ; UNALIGNED_GFX10-NEXT: global_store_dword v[0:1], v2, off ; UNALIGNED_GFX10-NEXT: s_endpgm ; -; UNALIGNED_GFX940-LABEL: sgpr_base_negative_offset: -; UNALIGNED_GFX940: ; %bb.0: ; %entry -; UNALIGNED_GFX940-NEXT: s_add_u32 s0, s0, 0xffffffe8 -; UNALIGNED_GFX940-NEXT: scratch_load_dword v2, off, s0 -; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1 -; UNALIGNED_GFX940-NEXT: s_endpgm +; UNALIGNED_GFX942-LABEL: sgpr_base_negative_offset: +; UNALIGNED_GFX942: ; %bb.0: ; %entry +; UNALIGNED_GFX942-NEXT: s_add_u32 s0, s0, 0xffffffe8 +; UNALIGNED_GFX942-NEXT: scratch_load_dword v2, off, s0 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: global_store_dword v[0:1], v2, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_endpgm ; ; UNALIGNED_GFX11-LABEL: sgpr_base_negative_offset: ; UNALIGNED_GFX11: ; %bb.0: ; %entry diff --git llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll deleted file mode 100644 index 676298670f1f..000000000000 --- llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll +++ /dev/null @@ -1,132 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 < %s | FileCheck %s -check-prefix=GFX940 - -define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { -; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_endpgm - %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0 - ret void -} - -define amdgpu_kernel 
void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { -; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_endpgm - %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0 - ret void -} - -define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { -; GFX940-LABEL: flat_atomic_fadd_f32_rtn_pat: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] - %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0 - ret float %ret -} - -define <2 x half> @local_atomic_fadd_ret_v2f16_offset(ptr addrspace(3) %ptr, <2 x half> %val) { -; GFX940-LABEL: local_atomic_fadd_ret_v2f16_offset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 offset:65532 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 - %result = atomicrmw fadd ptr addrspace(3) %gep, <2 x half> %val seq_cst - ret <2 x half> %result -} - -define void @local_atomic_fadd_noret_v2f16_offset(ptr addrspace(3) %ptr, <2 x half> %val) { -; GFX940-LABEL: local_atomic_fadd_noret_v2f16_offset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_pk_add_f16 v0, v1 offset:65532 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: 
s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 - %unused = atomicrmw fadd ptr addrspace(3) %gep, <2 x half> %val seq_cst - ret void -} - -define <2 x half> @global_atomic_fadd_ret_v2f16_agent_offset(ptr addrspace(1) %ptr, <2 x half> %val) { -; GFX940-LABEL: global_atomic_fadd_ret_v2f16_agent_offset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:1024 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst - ret <2 x half> %result -} - -define void @global_atomic_fadd_noret_v2f16_agent_offset(ptr addrspace(1) %ptr, <2 x half> %val) { -; GFX940-LABEL: global_atomic_fadd_noret_v2f16_agent_offset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:1024 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256 - %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst - ret void -} - -define <2 x half> @flat_atomic_fadd_ret_v2f16_agent_offset(ptr %ptr, <2 x half> %val) { -; GFX940-LABEL: flat_atomic_fadd_ret_v2f16_agent_offset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:1024 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i32 256 - %result = atomicrmw fadd ptr %gep, <2 x half> %val 
syncscope("agent") seq_cst - ret <2 x half> %result -} - -define void @flat_atomic_fadd_noret_v2f16_agent_offset(ptr %ptr, <2 x half> %val) { -; GFX940-LABEL: flat_atomic_fadd_noret_v2f16_agent_offset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:1024 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i32 256 - %unused = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst - ret void -} - -attributes #0 = { "denormal-fp-math-f32"="ieee,ieee" } - -!0 = !{} diff --git llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx942.ll llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx942.ll new file mode 100644 index 000000000000..6792612ded36 --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx942.ll @@ -0,0 +1,132 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s -check-prefix=GFX942 + +define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { +; GFX942-LABEL: flat_atomic_fadd_f32_noret_pat: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_endpgm + %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0 + ret void +} + +define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { +; GFX942-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v2, 4.0 +; 
GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_endpgm + %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0 + ret void +} + +define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { +; GFX942-LABEL: flat_atomic_fadd_f32_rtn_pat: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] + %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0 + ret float %ret +} + +define <2 x half> @local_atomic_fadd_ret_v2f16_offset(ptr addrspace(3) %ptr, <2 x half> %val) { +; GFX942-LABEL: local_atomic_fadd_ret_v2f16_offset: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 offset:65532 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 + %result = atomicrmw fadd ptr addrspace(3) %gep, <2 x half> %val seq_cst + ret <2 x half> %result +} + +define void @local_atomic_fadd_noret_v2f16_offset(ptr addrspace(3) %ptr, <2 x half> %val) { +; GFX942-LABEL: local_atomic_fadd_noret_v2f16_offset: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ds_pk_add_f16 v0, v1 offset:65532 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 + %unused = atomicrmw fadd ptr addrspace(3) %gep, <2 x half> %val seq_cst + ret void +} + +define <2 x half> 
@global_atomic_fadd_ret_v2f16_agent_offset(ptr addrspace(1) %ptr, <2 x half> %val) { +; GFX942-LABEL: global_atomic_fadd_ret_v2f16_agent_offset: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:1024 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + ret <2 x half> %result +} + +define void @global_atomic_fadd_noret_v2f16_agent_offset(ptr addrspace(1) %ptr, <2 x half> %val) { +; GFX942-LABEL: global_atomic_fadd_noret_v2f16_agent_offset: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:1024 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256 + %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + ret void +} + +define <2 x half> @flat_atomic_fadd_ret_v2f16_agent_offset(ptr %ptr, <2 x half> %val) { +; GFX942-LABEL: flat_atomic_fadd_ret_v2f16_agent_offset: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:1024 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr %ptr, i32 256 + %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst + ret <2 x half> %result +} + +define void @flat_atomic_fadd_noret_v2f16_agent_offset(ptr %ptr, <2 x half> %val) { +; GFX942-LABEL: flat_atomic_fadd_noret_v2f16_agent_offset: +; GFX942: ; 
%bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:1024 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr %ptr, i32 256 + %unused = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst + ret void +} + +attributes #0 = { "denormal-fp-math-f32"="ieee,ieee" } + +!0 = !{} diff --git llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll index 8ff2f59964ab..7c54a8abe989 100644 --- llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX90A -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX940 +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX942 declare double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32, i32 immarg) declare double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg) @@ -27,16 +27,16 @@ define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, doub ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_buffer_atomic_add_noret_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; 
GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_buffer_atomic_add_noret_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) ret void @@ -50,12 +50,12 @@ define amdgpu_ps void @raw_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, doub ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_buffer_atomic_add_rtn_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_buffer_atomic_add_rtn_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) store double %ret, ptr undef @@ -78,20 +78,20 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_load_dwordx2 
s[8:9], s[4:5], 0x44 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -110,16 +110,16 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_ptr_buffer_atomic_add_noret_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_ptr_buffer_atomic_add_noret_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: 
s_load_dword s8, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) ret void @@ -133,12 +133,12 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inreg ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_ptr_buffer_atomic_add_rtn_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_ptr_buffer_atomic_add_rtn_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) store double %ret, ptr undef @@ -161,20 +161,20 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen 
sc0 nt -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -193,16 +193,16 @@ define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, d ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_buffer_atomic_add_noret_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: struct_buffer_atomic_add_noret_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, 
s8 +; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) ret void @@ -216,12 +216,12 @@ define amdgpu_ps void @struct_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, d ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_buffer_atomic_add_rtn_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: struct_buffer_atomic_add_rtn_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) store double %ret, ptr undef @@ -244,20 +244,20 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> % ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: 
struct_buffer_atomic_add_rtn_f64_off4_slc: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -276,16 +276,16 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace( ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_ptr_buffer_atomic_add_noret_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: struct_ptr_buffer_atomic_add_noret_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen +; GFX942-NEXT: s_endpgm main_body: %ret = call double 
@llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) ret void @@ -299,12 +299,12 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inr ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_ptr_buffer_atomic_add_rtn_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: struct_ptr_buffer_atomic_add_rtn_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) store double %ret, ptr undef @@ -327,20 +327,20 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: 
s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -359,16 +359,16 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, doub ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_buffer_atomic_min_noret_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_buffer_atomic_min_noret_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) ret void @@ -382,12 +382,12 @@ define 
amdgpu_ps void @raw_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, doub ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_buffer_atomic_min_rtn_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_buffer_atomic_min_rtn_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) store double %ret, ptr undef @@ -410,20 +410,20 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 +; GFX942-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -442,16 +442,16 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_ptr_buffer_atomic_min_noret_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_ptr_buffer_atomic_min_noret_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) ret void @@ -465,12 +465,12 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_min_rtn_f64(ptr addrspace(8) inreg ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_ptr_buffer_atomic_min_rtn_f64: -; GFX940: ; %bb.0: ; 
%main_body -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_ptr_buffer_atomic_min_rtn_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) store double %ret, ptr undef @@ -493,20 +493,20 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v2, 0 
+; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -525,16 +525,16 @@ define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, d ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_buffer_atomic_min_noret_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: struct_buffer_atomic_min_noret_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) ret void @@ -548,12 +548,12 @@ define amdgpu_ps void @struct_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, d ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_buffer_atomic_min_rtn_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 -; GFX940-NEXT: s_endpgm +; 
GFX942-LABEL: struct_buffer_atomic_min_rtn_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) store double %ret, ptr undef @@ -576,20 +576,20 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> % ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double 
@llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -608,16 +608,16 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace( ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_ptr_buffer_atomic_min_noret_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: struct_ptr_buffer_atomic_min_noret_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) ret void @@ -631,12 +631,12 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_min_rtn_f64(ptr addrspace(8) inr ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_ptr_buffer_atomic_min_rtn_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: struct_ptr_buffer_atomic_min_rtn_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], 
v2, s[0:3], 0 idxen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) store double %ret, ptr undef @@ -659,20 +659,20 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2) store double %ret, ptr 
addrspace(1) %out, align 8 @@ -691,16 +691,16 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, doub ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_buffer_atomic_max_noret_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_buffer_atomic_max_noret_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) ret void @@ -714,12 +714,12 @@ define amdgpu_ps void @raw_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, doub ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_buffer_atomic_max_rtn_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_buffer_atomic_max_rtn_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double 
@llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) store double %ret, ptr undef @@ -742,20 +742,20 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -774,16 +774,16 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: 
raw_ptr_buffer_atomic_max_noret_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_ptr_buffer_atomic_max_noret_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) ret void @@ -797,12 +797,12 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_max_rtn_f64(ptr addrspace(8) inreg ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_ptr_buffer_atomic_max_rtn_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_ptr_buffer_atomic_max_rtn_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) store double %ret, ptr undef @@ -825,20 +825,20 @@ define amdgpu_kernel void 
@raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -857,16 +857,16 @@ define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, d ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_buffer_atomic_max_noret_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s8, s[4:5], 
0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: struct_buffer_atomic_max_noret_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) ret void @@ -880,12 +880,12 @@ define amdgpu_ps void @struct_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, d ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_buffer_atomic_max_rtn_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: struct_buffer_atomic_max_rtn_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) store double %ret, ptr undef @@ -908,20 +908,20 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> % ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: -; 
GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -940,16 +940,16 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace( ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_ptr_buffer_atomic_max_noret_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; 
GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: struct_ptr_buffer_atomic_max_noret_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) ret void @@ -963,12 +963,12 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_max_rtn_f64(ptr addrspace(8) inr ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_ptr_buffer_atomic_max_rtn_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: struct_ptr_buffer_atomic_max_rtn_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) store double %ret, ptr undef @@ -991,20 +991,20 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c -; GFX940-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -1035,27 +1035,27 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX90A-NEXT: .LBB36_2: ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: global_atomic_fadd_f64_noret_pat: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX940-NEXT: s_cbranch_execz .LBB36_2 -; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX940-NEXT: 
v_mul_f64 v[0:1], v[0:1], 4.0 -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: .LBB36_2: -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: global_atomic_fadd_f64_noret_pat: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_mov_b64 s[0:1], exec +; GFX942-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX942-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-NEXT: s_cbranch_execz .LBB36_2 +; GFX942-NEXT: ; %bb.1: +; GFX942-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX942-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX942-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX942-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: .LBB36_2: +; GFX942-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -1083,27 +1083,27 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX90A-NEXT: .LBB37_2: ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX940-NEXT: s_cbranch_execz .LBB37_2 -; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX940-NEXT: v_mul_f64 v[0:1], 
v[0:1], 4.0 -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: .LBB37_2: -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: global_atomic_fadd_f64_noret_pat_agent: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_mov_b64 s[0:1], exec +; GFX942-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX942-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-NEXT: s_cbranch_execz .LBB37_2 +; GFX942-NEXT: ; %bb.1: +; GFX942-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX942-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX942-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX942-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: .LBB37_2: +; GFX942-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -1133,27 +1133,27 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX90A-NEXT: .LBB38_2: ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_system: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX940-NEXT: s_cbranch_execz .LBB38_2 -; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 -; 
GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: .LBB38_2: -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: global_atomic_fadd_f64_noret_pat_system: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_mov_b64 s[0:1], exec +; GFX942-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX942-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-NEXT: s_cbranch_execz .LBB38_2 +; GFX942-NEXT: ; %bb.1: +; GFX942-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX942-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX942-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX942-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: .LBB38_2: +; GFX942-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -1181,27 +1181,27 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX90A-NEXT: .LBB39_2: ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_flush: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX940-NEXT: s_cbranch_execz .LBB39_2 -; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 
4.0 -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: .LBB39_2: -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: global_atomic_fadd_f64_noret_pat_flush: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_mov_b64 s[0:1], exec +; GFX942-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX942-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-NEXT: s_cbranch_execz .LBB39_2 +; GFX942-NEXT: ; %bb.1: +; GFX942-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX942-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX942-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX942-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: .LBB39_2: +; GFX942-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -1220,15 +1220,15 @@ define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %dat ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_atomic_fadd_f64_rtn_pat: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_atomic_fadd_f64_rtn_pat: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: 
v_mov_b64_e32 v[2:3], 4.0 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %ret @@ -1245,15 +1245,15 @@ define double @global_atomic_fadd_f64_rtn_pat_agent(ptr addrspace(1) %ptr, doubl ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_atomic_fadd_f64_rtn_pat_agent: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_atomic_fadd_f64_rtn_pat_agent: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %ret @@ -1272,15 +1272,15 @@ define double @global_atomic_fadd_f64_rtn_pat_system(ptr addrspace(1) %ptr, doub ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_atomic_fadd_f64_rtn_pat_system: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: 
buffer_inv sc0 sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_atomic_fadd_f64_rtn_pat_system: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %ret @@ -1308,27 +1308,27 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX90A-NEXT: .LBB43_2: ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX940-NEXT: s_cbranch_execz .LBB43_2 -; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: .LBB43_2: -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_mov_b64 s[0:1], exec +; GFX942-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX942-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-NEXT: s_cbranch_execz .LBB43_2 +; GFX942-NEXT: ; %bb.1: +; 
GFX942-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX942-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX942-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX942-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: .LBB43_2: +; GFX942-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -1349,17 +1349,17 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: flat_atomic_fadd_f64_noret_pat: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void @@ -1378,17 +1378,17 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: 
flat_atomic_fadd_f64_noret_pat_agent: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: flat_atomic_fadd_f64_noret_pat_agent: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void @@ -1409,17 +1409,17 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_system: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: flat_atomic_fadd_f64_noret_pat_system: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] sc1 
+; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void @@ -1438,15 +1438,15 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_atomic_fadd_f64_rtn_pat: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_atomic_fadd_f64_rtn_pat: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret double %ret @@ -1463,15 +1463,15 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_atomic_fadd_f64_rtn_pat_agent: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_atomic_fadd_f64_rtn_pat_agent: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret double %ret @@ -1491,16 +1491,16 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_atomic_fadd_f64_rtn_pat_system: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_atomic_fadd_f64_rtn_pat_system: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret double %ret @@ -1519,17 +1519,17 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 -; GFX940-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void @@ -1556,25 +1556,25 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; GFX90A-NEXT: .LBB51_2: ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: local_atomic_fadd_f64_noret_pat: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX940-NEXT: s_cbranch_execz .LBB51_2 -; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dword s2, s[4:5], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-NEXT: ds_add_f64 v2, v[0:1] -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB51_2: -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: local_atomic_fadd_f64_noret_pat: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_mov_b64 s[0:1], exec +; GFX942-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX942-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 
0, v0 +; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-NEXT: s_cbranch_execz .LBB51_2 +; GFX942-NEXT: ; %bb.1: +; GFX942-NEXT: s_load_dword s2, s[4:5], 0x24 +; GFX942-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX942-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX942-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NEXT: ds_add_f64 v2, v[0:1] +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: .LBB51_2: +; GFX942-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -1601,25 +1601,25 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; GFX90A-NEXT: .LBB52_2: ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX940-NEXT: s_cbranch_execz .LBB52_2 -; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dword s2, s[4:5], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-NEXT: ds_add_f64 v2, v[0:1] -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB52_2: -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: local_atomic_fadd_f64_noret_pat_flush: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_mov_b64 s[0:1], exec +; GFX942-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX942-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-NEXT: s_cbranch_execz .LBB52_2 +; GFX942-NEXT: ; %bb.1: +; GFX942-NEXT: s_load_dword s2, s[4:5], 0x24 +; GFX942-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; 
GFX942-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX942-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NEXT: ds_add_f64 v2, v[0:1] +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: .LBB52_2: +; GFX942-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -1646,25 +1646,25 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; GFX90A-NEXT: .LBB53_2: ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX940-NEXT: s_cbranch_execz .LBB53_2 -; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dword s2, s[4:5], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-NEXT: ds_add_f64 v2, v[0:1] -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB53_2: -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_mov_b64 s[0:1], exec +; GFX942-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX942-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-NEXT: s_cbranch_execz .LBB53_2 +; GFX942-NEXT: ; %bb.1: +; GFX942-NEXT: s_load_dword s2, s[4:5], 0x24 +; GFX942-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX942-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX942-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NEXT: ds_add_f64 v2, v[0:1] +; 
GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: .LBB53_2: +; GFX942-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -1680,13 +1680,13 @@ define double @local_atomic_fadd_f64_rtn_pat(ptr addrspace(3) %ptr, double %data ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: local_atomic_fadd_f64_rtn_pat: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 -; GFX940-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: local_atomic_fadd_f64_rtn_pat: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX942-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst ret double %ret diff --git llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll index 6459110dd8bb..dbfb4e6ebe7e 100644 --- llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx908 -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908 %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX940 %s 
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX942 %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX11 %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX11 %s @@ -28,16 +28,16 @@ define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_atomicrmw(ptr addrspace(1) ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) ; GFX90A-NEXT: S_ENDPGM 0 ; - ; GFX940-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw - ; GFX940: bb.1 (%ir-block.0): - ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) - ; GFX940-NEXT: S_ENDPGM 0 + ; GFX942-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw + ; GFX942: bb.1 (%ir-block.0): + ; GFX942-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX942-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (load 
store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX942-NEXT: S_ENDPGM 0 ; ; GFX11-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw ; GFX11: bb.1 (%ir-block.0): @@ -198,77 +198,77 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX90A-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0 ; - ; GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw - ; GFX940: bb.1 (%ir-block.0): - ; GFX940-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) - ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX940-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE - ; GFX940-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX940-NEXT: S_BRANCH %bb.2 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: bb.2 (%ir-block.5): - ; GFX940-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $exec - ; GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0 - ; GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0 - ; GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub1 - ; GFX940-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GFX940-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY7]], [[COPY8]], implicit $exec - ; GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX940-NEXT: 
[[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY9]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec - ; GFX940-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 - ; GFX940-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; GFX940-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY10]], [[DEF]], implicit $exec - ; GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY11]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec - ; GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec - ; GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; GFX940-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec - ; GFX940-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec - ; GFX940-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; GFX940-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY13]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec - ; GFX940-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec - ; GFX940-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; GFX940-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY14]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec - ; GFX940-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec - ; GFX940-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; GFX940-NEXT: 
[[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY15]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec - ; GFX940-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec - ; GFX940-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; GFX940-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY16]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec - ; GFX940-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec - ; GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63 - ; GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_2]] - ; GFX940-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]] - ; GFX940-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY17]], implicit $exec - ; GFX940-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY18]], implicit $exec - ; GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX940-NEXT: S_BRANCH %bb.3 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: bb.3 (%ir-block.31): - ; GFX940-NEXT: successors: %bb.4(0x80000000) - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: bb.4.Flow: - ; GFX940-NEXT: successors: %bb.5(0x80000000) - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX940-NEXT: 
{{ $}} - ; GFX940-NEXT: bb.5 (%ir-block.33): - ; GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX940-NEXT: S_ENDPGM 0 + ; GFX942-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw + ; GFX942: bb.1 (%ir-block.0): + ; GFX942-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) + ; GFX942-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX942-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE + ; GFX942-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX942-NEXT: S_BRANCH %bb.2 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: bb.2 (%ir-block.5): + ; GFX942-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0 + ; GFX942-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0 + ; GFX942-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub1 + ; GFX942-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; GFX942-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX942-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY7]], [[COPY8]], implicit $exec + ; GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; GFX942-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY9]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX942-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 + ; GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; 
GFX942-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GFX942-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY10]], [[DEF]], implicit $exec + ; GFX942-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX942-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY11]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec + ; GFX942-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX942-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX942-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX942-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY13]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX942-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX942-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY14]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX942-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX942-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY15]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX942-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, 
implicit $exec + ; GFX942-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX942-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY16]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX942-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX942-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_2]] + ; GFX942-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]] + ; GFX942-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY17]], implicit $exec + ; GFX942-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX942-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY18]], implicit $exec + ; GFX942-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX942-NEXT: S_BRANCH %bb.3 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: bb.3 (%ir-block.31): + ; GFX942-NEXT: successors: %bb.4(0x80000000) + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: bb.4.Flow: + ; GFX942-NEXT: successors: %bb.5(0x80000000) + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: bb.5 (%ir-block.33): + ; GFX942-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0 ; ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw ; GFX11: bb.1 (%ir-block.0): 
diff --git llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll index e935245e30f1..6cd03a2ecaff 100644 --- llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX90A %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX940 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX942 %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s @@ -17,17 +17,17 @@ define amdgpu_ps float @global_atomic_fadd_f32_rtn_atomicrmw(ptr addrspace(1) %p ; GFX90A-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] ; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; - ; GFX940-LABEL: name: global_atomic_fadd_f32_rtn_atomicrmw - ; GFX940: bb.1 (%ir-block.0): - ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN 
[[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) - ; GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] - ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX942-LABEL: name: global_atomic_fadd_f32_rtn_atomicrmw + ; GFX942: bb.1 (%ir-block.0): + ; GFX942-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX942-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX942-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] + ; GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; ; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_atomicrmw ; GFX11: bb.1 (%ir-block.0): @@ -136,96 +136,96 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX90A-NEXT: $vgpr0 = COPY [[PHI]] ; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; - ; GFX940-LABEL: name: global_atomic_fadd_f32_saddr_rtn_atomicrmw - ; GFX940: bb.1 (%ir-block.0): - ; GFX940-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) - ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX940-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; GFX940-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE - ; GFX940-NEXT: 
[[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX940-NEXT: S_BRANCH %bb.2 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: bb.2 (%ir-block.5): - ; GFX940-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $exec - ; GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0 - ; GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0 - ; GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub1 - ; GFX940-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GFX940-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY7]], [[COPY8]], implicit $exec - ; GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY9]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec - ; GFX940-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 - ; GFX940-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY10]], [[DEF1]], implicit $exec - ; GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY11]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec - ; GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec - ; GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; GFX940-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec - ; GFX940-NEXT: 
[[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec - ; GFX940-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; GFX940-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY13]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec - ; GFX940-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec - ; GFX940-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; GFX940-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY14]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec - ; GFX940-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec - ; GFX940-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; GFX940-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY15]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec - ; GFX940-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec - ; GFX940-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; GFX940-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY16]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec - ; GFX940-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec - ; GFX940-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; GFX940-NEXT: [[V_MOV_B32_dpp6:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY17]], [[V_ADD_F32_e64_5]], 312, 15, 15, 0, implicit $exec - ; GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63 - ; GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_2]] - ; GFX940-NEXT: 
[[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]] - ; GFX940-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY18]], implicit $exec - ; GFX940-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY19]], implicit $exec - ; GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX940-NEXT: S_BRANCH %bb.3 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: bb.3 (%ir-block.32): - ; GFX940-NEXT: successors: %bb.5(0x80000000) - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) - ; GFX940-NEXT: S_BRANCH %bb.5 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: bb.4.Flow: - ; GFX940-NEXT: successors: %bb.6(0x80000000) - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI %42, %bb.5, [[DEF]], %bb.1 - ; GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX940-NEXT: S_BRANCH %bb.6 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: bb.5 (%ir-block.35): - ; GFX940-NEXT: successors: %bb.4(0x80000000) - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.3, [[DEF]], %bb.2 - ; GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec - ; GFX940-NEXT: [[STRICT_WWM1:%[0-9]+]]:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec - ; GFX940-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX940-NEXT: 
[[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY20]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec - ; GFX940-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX940-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_ADD_F32_e64_6]], 0, [[COPY21]], [[V_CMP_EQ_U32_e64_]], implicit $exec - ; GFX940-NEXT: S_BRANCH %bb.4 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: bb.6 (%ir-block.41): - ; GFX940-NEXT: $vgpr0 = COPY [[PHI]] - ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX942-LABEL: name: global_atomic_fadd_f32_saddr_rtn_atomicrmw + ; GFX942: bb.1 (%ir-block.0): + ; GFX942-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; GFX942-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX942-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GFX942-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE + ; GFX942-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX942-NEXT: S_BRANCH %bb.2 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: bb.2 (%ir-block.5): + ; GFX942-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0 + ; GFX942-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0 + ; GFX942-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub1 + ; GFX942-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; GFX942-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX942-NEXT: 
[[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY7]], [[COPY8]], implicit $exec + ; GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; GFX942-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY9]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX942-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 + ; GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX942-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GFX942-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY10]], [[DEF1]], implicit $exec + ; GFX942-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX942-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY11]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec + ; GFX942-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX942-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX942-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX942-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY13]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX942-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX942-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY14]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX942-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept 
V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX942-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY15]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX942-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX942-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY16]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX942-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX942-NEXT: [[V_MOV_B32_dpp6:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY17]], [[V_ADD_F32_e64_5]], 312, 15, 15, 0, implicit $exec + ; GFX942-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX942-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_2]] + ; GFX942-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]] + ; GFX942-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY18]], implicit $exec + ; GFX942-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX942-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY19]], implicit $exec + ; GFX942-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX942-NEXT: S_BRANCH %bb.3 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: bb.3 (%ir-block.32): + ; GFX942-NEXT: successors: %bb.5(0x80000000) + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: 
[[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX942-NEXT: S_BRANCH %bb.5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: bb.4.Flow: + ; GFX942-NEXT: successors: %bb.6(0x80000000) + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI %42, %bb.5, [[DEF]], %bb.1 + ; GFX942-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX942-NEXT: S_BRANCH %bb.6 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: bb.5 (%ir-block.35): + ; GFX942-NEXT: successors: %bb.4(0x80000000) + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.3, [[DEF]], %bb.2 + ; GFX942-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec + ; GFX942-NEXT: [[STRICT_WWM1:%[0-9]+]]:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec + ; GFX942-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX942-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY20]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX942-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_ADD_F32_e64_6]], 0, [[COPY21]], [[V_CMP_EQ_U32_e64_]], implicit $exec + ; GFX942-NEXT: S_BRANCH %bb.4 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: bb.6 (%ir-block.41): + ; GFX942-NEXT: $vgpr0 = COPY [[PHI]] + ; GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_atomicrmw ; GFX11: bb.1 (%ir-block.0): diff --git llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll 
llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll index 49c5dc7ed5a9..e3bd3d4f2258 100644 --- llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=GFX90A_GFX940,GFX90A %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=GFX90A_GFX940,GFX940 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX90A %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX942 %s define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_atomicrmw(ptr addrspace(1) %ptr, double %data) { ; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw @@ -35,18 +35,18 @@ define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_atomicrmw(ptr addrspace(1) ; GFX90A-NEXT: SI_END_CF [[PHI2]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0 ; - ; GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw - ; GFX940: bb.1 (%ir-block.0): - ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX940-NEXT: 
[[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) - ; GFX940-NEXT: S_ENDPGM 0 + ; GFX942-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw + ; GFX942: bb.1 (%ir-block.0): + ; GFX942-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX942-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX942-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX942-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic ret void } @@ -91,66 +91,66 @@ define amdgpu_ps double @global_atomic_fadd_f64_rtn_atomicrmw(ptr addrspace(1) % ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 ; - ; GFX940-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw - ; GFX940: bb.1 (%ir-block.0): - ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = 
COPY $vgpr3 - ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) - ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX942-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw + ; GFX942: bb.1 (%ir-block.0): + ; GFX942-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX942-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX942-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX942-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX942-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic ret double %ret } define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY 
$sgpr1 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_GFX942-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0 ret void } define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: 
(load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX90A_GFX942-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_GFX942-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 + ; 
GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0 ret double %ret } diff --git llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-no-rtn.ll llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-no-rtn.ll index 1317770ad834..9c0db4cd162f 100644 --- llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-no-rtn.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-no-rtn.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx908 -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn(ptr addrspace(1) %ptr, <2 x half> %data) { ; GFX908-LABEL: name: global_atomic_fadd_v2f16_no_rtn @@ -15,16 +15,16 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn(ptr addrspace(1) %ptr, <2 ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 [[REG_SEQUENCE]], [[COPY2]], 0, 0, 
implicit $exec :: (load store syncscope("agent") seq_cst (<2 x s16>) on %ir.ptr, addrspace 1) ; GFX908-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_no_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (<2 x s16>) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: global_atomic_fadd_v2f16_no_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (<2 x s16>) on %ir.ptr, addrspace 1) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } @@ -42,17 +42,17 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn(ptr addrspace(1) in ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (<2 x s16>) on %ir.ptr, addrspace 
1) ; GFX908-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (<2 x s16>) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_GFX942-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (<2 x s16>) on %ir.ptr, addrspace 1) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } diff --git llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-rtn.ll 
llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-rtn.ll index a65fc6c0c4cf..62620a8875a3 100644 --- llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-rtn.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-rtn.ll @@ -1,36 +1,36 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX942 %s define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn(ptr addrspace(1) %ptr, <2 x half> %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (load store syncscope("agent") seq_cst (<2 x s16>) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX90A_GFX942-LABEL: name: global_atomic_fadd_v2f16_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $vgpr1, 
$vgpr2 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (load store syncscope("agent") seq_cst (<2 x s16>) on %ir.ptr, addrspace 1) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %ret } define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn(ptr addrspace(1) inreg %ptr, <2 x half> %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("agent") seq_cst (<2 x s16>) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + 
; GFX90A_GFX942-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn + ; GFX90A_GFX942: bb.1 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_GFX942-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("agent") seq_cst (<2 x s16>) on %ir.ptr, addrspace 1) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %ret } diff --git llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll index be0c9e2a602f..4ee658666a1b 100644 --- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -O0 -stop-after=irtranslator -o - %s | FileCheck %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -O0 -stop-after=irtranslator -o - %s | FileCheck %s define float @test_atomicrmw_fadd(ptr addrspace(3) %addr) { ; CHECK-LABEL: name: test_atomicrmw_fadd diff --git llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx940.mir 
llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx942.mir similarity index 99% rename from llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx940.mir rename to llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx942.mir index 2944bb4ebbd1..e11586e464fb 100644 --- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx940.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx942.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx940 -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=FAST -# RUN: llc -mtriple=amdgcn -mcpu=gfx940 -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GREEDY +# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=FAST +# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GREEDY --- name: mfma_i32_16x16x32_i8_vva diff --git llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir index e9a8248ef4e9..94fde7c4733a 100644 --- llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir +++ llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass postrapseudos -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX908 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass postrapseudos -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX90A %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx940 -run-pass postrapseudos -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX940 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass postrapseudos -verify-machineinstrs -o - %s | 
FileCheck -check-prefix=GFX942 %s --- | define amdgpu_kernel void @a_to_v() #0 { ret void } @@ -64,11 +64,11 @@ body: | ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0 ; - ; GFX940-LABEL: name: a_to_v - ; GFX940: liveins: $agpr0 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit $exec - ; GFX940-NEXT: S_ENDPGM 0, implicit $vgpr0 + ; GFX942-LABEL: name: a_to_v + ; GFX942: liveins: $agpr0 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0, implicit $vgpr0 $vgpr0 = COPY killed $agpr0, implicit $exec S_ENDPGM 0, implicit $vgpr0 ... @@ -94,12 +94,12 @@ body: | ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit killed $agpr0_agpr1, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1 ; - ; GFX940-LABEL: name: a2_to_v2 - ; GFX940: liveins: $agpr0_agpr1 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr0_agpr1 - ; GFX940-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit killed $agpr0_agpr1, implicit $exec - ; GFX940-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1 + ; GFX942-LABEL: name: a2_to_v2 + ; GFX942: liveins: $agpr0_agpr1 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr0_agpr1 + ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit killed $agpr0_agpr1, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1 $vgpr0_vgpr1 = COPY killed $agpr0_agpr1, implicit $exec S_ENDPGM 0, implicit $vgpr0_vgpr1 ... 
@@ -127,13 +127,13 @@ body: | ; GFX90A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit killed $agpr0_agpr1_agpr2, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2 ; - ; GFX940-LABEL: name: a3_to_v3 - ; GFX940: liveins: $agpr0_agpr1_agpr2 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $agpr0_agpr1_agpr2 - ; GFX940-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2 - ; GFX940-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit killed $agpr0_agpr1_agpr2, implicit $exec - ; GFX940-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2 + ; GFX942-LABEL: name: a3_to_v3 + ; GFX942: liveins: $agpr0_agpr1_agpr2 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $agpr0_agpr1_agpr2 + ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2 + ; GFX942-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit killed $agpr0_agpr1_agpr2, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 = COPY killed $agpr0_agpr1_agpr2, implicit $exec S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2 ... 
@@ -162,14 +162,14 @@ body: | ; GFX90A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3 ; - ; GFX940-LABEL: name: a4_to_v4 - ; GFX940: liveins: $agpr0_agpr1_agpr2_agpr3 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX940-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX940-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX940-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, implicit $exec - ; GFX940-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX942-LABEL: name: a4_to_v4 + ; GFX942: liveins: $agpr0_agpr1_agpr2_agpr3 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX942-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX942-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed $agpr0_agpr1_agpr2_agpr3, implicit $exec S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3 ... 
@@ -207,18 +207,18 @@ body: | ; GFX90A-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; - ; GFX940-LABEL: name: a8_to_v8 - ; GFX940: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX940-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX940-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX940-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX940-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX940-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX940-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX940-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec - ; GFX940-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX942-LABEL: name: a8_to_v8 + ; GFX942: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, 
implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX942-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX942-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX942-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX942-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX942-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX942-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ... 
@@ -271,26 +271,26 @@ body: | ; GFX90A-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; - ; GFX940-LABEL: name: a16_to_v16 - ; GFX940: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX940-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX940-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX940-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX940-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX940-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX940-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX940-NEXT: $vgpr7 = 
V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX940-NEXT: $vgpr8 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX940-NEXT: $vgpr9 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX940-NEXT: $vgpr10 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX940-NEXT: $vgpr11 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX940-NEXT: $vgpr12 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX940-NEXT: $vgpr13 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX940-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX940-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec - ; GFX940-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX942-LABEL: name: a16_to_v16 + ; GFX942: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-NEXT: {{ $}} + 
; GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-NEXT: $vgpr8 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-NEXT: $vgpr9 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-NEXT: $vgpr10 = 
V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-NEXT: $vgpr11 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-NEXT: $vgpr12 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-NEXT: $vgpr13 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ... 
@@ -313,11 +313,11 @@ body: | ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0 ; - ; GFX940-LABEL: name: v_to_a - ; GFX940: liveins: $vgpr0 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec - ; GFX940-NEXT: S_ENDPGM 0, implicit $agpr0 + ; GFX942-LABEL: name: v_to_a + ; GFX942: liveins: $vgpr0 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0 $agpr0 = COPY killed $vgpr0, implicit $exec S_ENDPGM 0, implicit $agpr0 ... @@ -342,12 +342,12 @@ body: | ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1 ; - ; GFX940-LABEL: name: v2_to_a2 - ; GFX940: liveins: $vgpr0_vgpr1 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $vgpr0_vgpr1 - ; GFX940-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1, implicit $exec - ; GFX940-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1 + ; GFX942-LABEL: name: v2_to_a2 + ; GFX942: liveins: $vgpr0_vgpr1 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $vgpr0_vgpr1 + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1 $agpr0_agpr1 = COPY killed $vgpr0_vgpr1, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1 ... 
@@ -374,13 +374,13 @@ body: | ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 ; - ; GFX940-LABEL: name: v3_to_a3 - ; GFX940: liveins: $vgpr0_vgpr1_vgpr2 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $vgpr0_vgpr1_vgpr2 - ; GFX940-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 - ; GFX940-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2, implicit $exec - ; GFX940-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 + ; GFX942-LABEL: name: v3_to_a3 + ; GFX942: liveins: $vgpr0_vgpr1_vgpr2 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $vgpr0_vgpr1_vgpr2 + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 $agpr0_agpr1_agpr2 = COPY killed $vgpr0_vgpr1_vgpr2, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 ... 
@@ -409,14 +409,14 @@ body: | ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 ; - ; GFX940-LABEL: name: v4_to_a4 - ; GFX940: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX940-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX940-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX940-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec - ; GFX940-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX942-LABEL: name: v4_to_a4 + ; GFX942: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 $agpr0_agpr1_agpr2_agpr3 = COPY killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 ... 
@@ -453,18 +453,18 @@ body: | ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; - ; GFX940-LABEL: name: v8_to_a8 - ; GFX940: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX940-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX940-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX940-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX940-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX940-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX940-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX940-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX940-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX942-LABEL: name: v8_to_a8 + ; GFX942: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX942-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX942-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX942-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX942-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = COPY killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ... 
@@ -517,26 +517,26 @@ body: | ; GFX90A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; - ; GFX940-LABEL: name: v16_to_a16 - ; GFX940: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX940-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX940-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX940-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX940-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX940-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX940-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX940-NEXT: 
$agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX940-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX940-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX940-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX940-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX940-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX940-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX940-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX940-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $exec - ; GFX940-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-LABEL: name: v16_to_a16 + ; GFX942: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; 
GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX942-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX942-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX942-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX942-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX942-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX942-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + 
; GFX942-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX942-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX942-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX942-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX942-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX942-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ... 
@@ -560,11 +560,11 @@ body: | ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr0, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0 ; - ; GFX940-LABEL: name: s_to_a - ; GFX940: liveins: $sgpr0 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr0, implicit $exec, implicit $exec - ; GFX940-NEXT: S_ENDPGM 0, implicit $agpr0 + ; GFX942-LABEL: name: s_to_a + ; GFX942: liveins: $sgpr0 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr0, implicit $exec, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0 $agpr0 = COPY killed $sgpr0, implicit $exec S_ENDPGM 0, implicit $agpr0 ... @@ -591,12 +591,12 @@ body: | ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit killed $sgpr0_sgpr1, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1 ; - ; GFX940-LABEL: name: s2_to_a2 - ; GFX940: liveins: $sgpr0_sgpr1 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $sgpr0_sgpr1 - ; GFX940-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit killed $sgpr0_sgpr1, implicit $exec - ; GFX940-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1 + ; GFX942-LABEL: name: s2_to_a2 + ; GFX942: liveins: $sgpr0_sgpr1 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $sgpr0_sgpr1 + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit killed $sgpr0_sgpr1, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1 $agpr0_agpr1 = COPY killed $sgpr0_sgpr1, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1 ... 
@@ -626,13 +626,13 @@ body: | ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 ; - ; GFX940-LABEL: name: s3_to_a3 - ; GFX940: liveins: $sgpr0_sgpr1_sgpr2 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $sgpr0_sgpr1_sgpr2 - ; GFX940-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX940-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec - ; GFX940-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 + ; GFX942-LABEL: name: s3_to_a3 + ; GFX942: liveins: $sgpr0_sgpr1_sgpr2 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $sgpr0_sgpr1_sgpr2 + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 $agpr0_agpr1_agpr2 = COPY killed $sgpr0_sgpr1_sgpr2, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 ... 
@@ -665,14 +665,14 @@ body: | ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 ; - ; GFX940-LABEL: name: s4_to_a4 - ; GFX940: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX940-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX940-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX940-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec - ; GFX940-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX942-LABEL: name: s4_to_a4 + ; GFX942: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 $agpr0_agpr1_agpr2_agpr3 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 ... 
@@ -711,16 +711,16 @@ body: | ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 ; - ; GFX940-LABEL: name: s6_to_a6 - ; GFX940: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX940-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX940-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX940-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX940-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX940-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5, implicit $exec - ; GFX940-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX942-LABEL: name: s6_to_a6 + ; GFX942: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX942-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; 
GFX942-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 ... @@ -765,18 +765,18 @@ body: | ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; - ; GFX940-LABEL: name: s8_to_a8 - ; GFX940: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX940-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX940-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX940-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX940-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX940-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX940-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX940-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec - ; GFX940-NEXT: S_ENDPGM 0, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX942-LABEL: name: s8_to_a8 + ; GFX942: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX942-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX942-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX942-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX942-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ... 
@@ -845,26 +845,26 @@ body: | ; GFX90A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $sgpr15, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; - ; GFX940-LABEL: name: s16_to_a16 - ; GFX940: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX940-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX940-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX940-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX940-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX940-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX940-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX940-NEXT: 
$agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX940-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX940-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX940-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX940-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX940-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX940-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX940-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX940-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $sgpr15, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec - ; GFX940-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-LABEL: name: s16_to_a16 + ; GFX942: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; 
GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + 
; GFX942-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $sgpr15, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ... 
@@ -885,10 +885,10 @@ body: | ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 killed $agpr1, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0 ; - ; GFX940-LABEL: name: a_to_a - ; GFX940: $agpr1 = IMPLICIT_DEF - ; GFX940-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 killed $agpr1, implicit $exec, implicit $exec - ; GFX940-NEXT: S_ENDPGM 0, implicit $agpr0 + ; GFX942-LABEL: name: a_to_a + ; GFX942: $agpr1 = IMPLICIT_DEF + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 killed $agpr1, implicit $exec, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0 $agpr1 = IMPLICIT_DEF $agpr0 = COPY killed $agpr1, implicit $exec S_ENDPGM 0, implicit $agpr0 @@ -918,13 +918,13 @@ body: | ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3 ; - ; GFX940-LABEL: name: a2_to_a2 - ; GFX940: liveins: $agpr0_agpr1 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit-def $agpr1_agpr2, implicit $agpr0_agpr1 - ; GFX940-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1, implicit $exec - ; GFX940-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec - ; GFX940-NEXT: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3 + ; GFX942-LABEL: name: a2_to_a2 + ; GFX942: liveins: $agpr0_agpr1 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit-def $agpr1_agpr2, implicit $agpr0_agpr1 + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1, implicit $exec + ; GFX942-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3 $agpr1_agpr2 = COPY $agpr0_agpr1, implicit $exec $agpr3 = COPY $agpr2 S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3 @@ -954,13 +954,13 @@ body: | ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec ; GFX90A-NEXT: 
S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3 ; - ; GFX940-LABEL: name: a2_to_a2_kill - ; GFX940: liveins: $agpr0_agpr1 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit-def $agpr1_agpr2, implicit $agpr0_agpr1 - ; GFX940-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1, implicit $exec - ; GFX940-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec - ; GFX940-NEXT: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3 + ; GFX942-LABEL: name: a2_to_a2_kill + ; GFX942: liveins: $agpr0_agpr1 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit-def $agpr1_agpr2, implicit $agpr0_agpr1 + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1, implicit $exec + ; GFX942-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3 $agpr1_agpr2 = COPY killed $agpr0_agpr1, implicit $exec $agpr3 = COPY $agpr2 S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3 @@ -994,15 +994,15 @@ body: | ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit-def $agpr3_agpr4, implicit $agpr1_agpr2 ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit killed $agpr1_agpr2, implicit $exec ; - ; GFX940-LABEL: name: a2_to_a2_implicit_defs - ; GFX940: liveins: $agpr0_agpr1 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1 - ; GFX940-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr1_agpr2 - ; GFX940-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1 - ; GFX940-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec, implicit-def $agpr1_agpr2 - ; GFX940-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit-def 
$agpr3_agpr4, implicit $agpr1_agpr2 - ; GFX940-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit killed $agpr1_agpr2, implicit $exec + ; GFX942-LABEL: name: a2_to_a2_implicit_defs + ; GFX942: liveins: $agpr0_agpr1 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1 + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr1_agpr2 + ; GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1 + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec, implicit-def $agpr1_agpr2 + ; GFX942-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit-def $agpr3_agpr4, implicit $agpr1_agpr2 + ; GFX942-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit killed $agpr1_agpr2, implicit $exec $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1 $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr1_agpr2 $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1 @@ -1035,13 +1035,13 @@ body: | ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr6, implicit $exec, implicit killed $agpr4_agpr5_agpr6 ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 ; - ; GFX940-LABEL: name: a3_to_a3_nonoverlap_kill - ; GFX940: liveins: $agpr4_agpr5_agpr6 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 $agpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr4_agpr5_agpr6 - ; GFX940-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr5, implicit $exec, implicit $agpr4_agpr5_agpr6 - ; GFX940-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr6, implicit $exec, implicit killed $agpr4_agpr5_agpr6 - ; GFX940-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 + ; GFX942-LABEL: name: a3_to_a3_nonoverlap_kill + ; GFX942: liveins: $agpr4_agpr5_agpr6 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 $agpr4, implicit 
$exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr4_agpr5_agpr6 + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr5, implicit $exec, implicit $agpr4_agpr5_agpr6 + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr6, implicit $exec, implicit killed $agpr4_agpr5_agpr6 + ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 $agpr0_agpr1_agpr2 = COPY killed $agpr4_agpr5_agpr6 S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 ... @@ -1073,14 +1073,14 @@ body: | ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2, implicit $vgpr1 ; - ; GFX940-LABEL: name: a3_to_a3_overlap_kill - ; GFX940: liveins: $agpr1_agpr2_agpr3 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr1_agpr2_agpr3 - ; GFX940-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr1_agpr2_agpr3 - ; GFX940-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit $agpr1_agpr2_agpr3 - ; GFX940-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec - ; GFX940-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2, implicit $vgpr1 + ; GFX942-LABEL: name: a3_to_a3_overlap_kill + ; GFX942: liveins: $agpr1_agpr2_agpr3 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr1_agpr2_agpr3 + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr1_agpr2_agpr3 + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit $agpr1_agpr2_agpr3 + ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2, implicit $vgpr1 $agpr0_agpr1_agpr2 = COPY killed $agpr1_agpr2_agpr3 $vgpr1 = COPY $agpr1 S_ENDPGM 0, implicit $agpr0_agpr1_agpr2, implicit $vgpr1 @@ -1111,13 +1111,13 @@ body: | ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr2_agpr3_agpr4_agpr5 ; - ; GFX940-LABEL: name: a4_to_a4 - ; GFX940: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF - ; GFX940-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX940-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX940-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX940-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit $exec - ; GFX940-NEXT: S_ENDPGM 0, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX942-LABEL: name: a4_to_a4 + ; GFX942: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF + ; GFX942-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX942-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX942-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr2_agpr3_agpr4_agpr5 $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF $agpr2_agpr3_agpr4_agpr5 = COPY killed $agpr0_agpr1_agpr2_agpr3, implicit $exec S_ENDPGM 0, implicit $agpr2_agpr3_agpr4_agpr5 @@ -1151,14 +1151,14 @@ body: | ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0, implicit $agpr1, implicit $agpr2, implicit $agpr3, implicit $agpr4, implicit $agpr5 ; - ; GFX940-LABEL: name: a4_to_a4_overlap - ; GFX940: liveins: $agpr0_agpr1_agpr2_agpr3 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5, implicit 
$agpr0_agpr1_agpr2_agpr3 - ; GFX940-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX940-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX940-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit $exec - ; GFX940-NEXT: S_ENDPGM 0, implicit $agpr0, implicit $agpr1, implicit $agpr2, implicit $agpr3, implicit $agpr4, implicit $agpr5 + ; GFX942-LABEL: name: a4_to_a4_overlap + ; GFX942: liveins: $agpr0_agpr1_agpr2_agpr3 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX942-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX942-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0, implicit $agpr1, implicit $agpr2, implicit $agpr3, implicit $agpr4, implicit $agpr5 $agpr2_agpr3_agpr4_agpr5 = COPY $agpr0_agpr1_agpr2_agpr3, implicit $exec S_ENDPGM 0, implicit $agpr0, implicit $agpr1, implicit $agpr2, implicit $agpr3, implicit $agpr4, implicit $agpr5 ... 
@@ -1200,17 +1200,17 @@ body: | ; GFX90A-NEXT: $agpr8 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; - ; GFX940-LABEL: name: a8_to_a8 - ; GFX940: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF - ; GFX940-NEXT: $agpr15 = V_ACCVGPR_MOV_B32 $agpr7, implicit $exec, implicit-def $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX940-NEXT: $agpr14 = V_ACCVGPR_MOV_B32 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX940-NEXT: $agpr13 = V_ACCVGPR_MOV_B32 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX940-NEXT: $agpr12 = V_ACCVGPR_MOV_B32 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX940-NEXT: $agpr11 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX940-NEXT: $agpr10 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX940-NEXT: $agpr9 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX940-NEXT: $agpr8 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec - ; GFX940-NEXT: S_ENDPGM 0, implicit $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-LABEL: name: a8_to_a8 + ; GFX942: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF + ; GFX942-NEXT: $agpr15 = V_ACCVGPR_MOV_B32 $agpr7, implicit $exec, implicit-def $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX942-NEXT: $agpr14 = V_ACCVGPR_MOV_B32 $agpr6, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX942-NEXT: $agpr13 = V_ACCVGPR_MOV_B32 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX942-NEXT: $agpr12 = V_ACCVGPR_MOV_B32 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX942-NEXT: $agpr11 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX942-NEXT: $agpr10 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX942-NEXT: $agpr9 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX942-NEXT: $agpr8 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec S_ENDPGM 0, implicit $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 @@ -1278,25 +1278,25 @@ body: | ; GFX90A-NEXT: $agpr16 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 ; - ; GFX940-LABEL: name: a16_to_a16 - ; GFX940: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF - ; GFX940-NEXT: $agpr31 = V_ACCVGPR_MOV_B32 $agpr15, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX940-NEXT: $agpr30 = V_ACCVGPR_MOV_B32 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX940-NEXT: $agpr29 = V_ACCVGPR_MOV_B32 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX940-NEXT: $agpr28 = V_ACCVGPR_MOV_B32 $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX940-NEXT: $agpr27 = V_ACCVGPR_MOV_B32 $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX940-NEXT: $agpr26 = V_ACCVGPR_MOV_B32 $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX940-NEXT: $agpr25 = V_ACCVGPR_MOV_B32 $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX940-NEXT: $agpr24 = V_ACCVGPR_MOV_B32 $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX940-NEXT: $agpr23 = V_ACCVGPR_MOV_B32 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX940-NEXT: $agpr22 = V_ACCVGPR_MOV_B32 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX940-NEXT: $agpr21 = V_ACCVGPR_MOV_B32 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX940-NEXT: $agpr20 = V_ACCVGPR_MOV_B32 $agpr4, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX940-NEXT: $agpr19 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX940-NEXT: $agpr18 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX940-NEXT: $agpr17 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX940-NEXT: $agpr16 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec - ; GFX940-NEXT: S_ENDPGM 0, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX942-LABEL: name: a16_to_a16 + ; GFX942: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF + ; GFX942-NEXT: $agpr31 = V_ACCVGPR_MOV_B32 $agpr15, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-NEXT: $agpr30 = V_ACCVGPR_MOV_B32 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-NEXT: $agpr29 = V_ACCVGPR_MOV_B32 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-NEXT: $agpr28 = V_ACCVGPR_MOV_B32 $agpr12, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-NEXT: $agpr27 = V_ACCVGPR_MOV_B32 $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-NEXT: $agpr26 = V_ACCVGPR_MOV_B32 $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-NEXT: $agpr25 = V_ACCVGPR_MOV_B32 $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-NEXT: $agpr24 = V_ACCVGPR_MOV_B32 $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-NEXT: $agpr23 = V_ACCVGPR_MOV_B32 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-NEXT: $agpr22 = V_ACCVGPR_MOV_B32 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-NEXT: $agpr21 = V_ACCVGPR_MOV_B32 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-NEXT: $agpr20 = V_ACCVGPR_MOV_B32 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-NEXT: $agpr19 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-NEXT: $agpr18 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-NEXT: $agpr17 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX942-NEXT: $agpr16 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = COPY killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec S_ENDPGM 0, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 @@ -1326,12 +1326,12 @@ body: | ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 killed $agpr1, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0 ; - ; GFX940-LABEL: name: a_to_a_spill - ; GFX940: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, 
$vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $agpr1 = IMPLICIT_DEF - ; GFX940-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 killed $agpr1, implicit $exec, implicit $exec - ; GFX940-NEXT: S_ENDPGM 0, implicit $agpr0 + ; GFX942-LABEL: name: a_to_a_spill + ; GFX942: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, 
$vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $agpr1 = IMPLICIT_DEF + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 killed $agpr1, implicit $exec, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0 $agpr1 = IMPLICIT_DEF $agpr0 = COPY killed $agpr1, implicit $exec S_ENDPGM 0, implicit $agpr0 @@ -1368,15 +1368,15 @@ body: | ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; - ; GFX940-LABEL: name: copy_sgpr_to_agpr_tuple 
- ; GFX940: liveins: $agpr0, $sgpr2_sgpr3 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: S_NOP 0, implicit-def dead $sgpr0_sgpr1 - ; GFX940-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX940-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX940-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX940-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec - ; GFX940-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX942-LABEL: name: copy_sgpr_to_agpr_tuple + ; GFX942: liveins: $agpr0, $sgpr2_sgpr3 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: S_NOP 0, implicit-def dead $sgpr0_sgpr1 + ; GFX942-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX942-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX942-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX942-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 S_NOP 0, implicit-def dead $sgpr0_sgpr1 renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 @@ -1412,15 +1412,15 @@ body: | ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 ; - ; GFX940-LABEL: name: copy_sgpr_to_agpr_tuple_kill - ; GFX940: liveins: $agpr0, $sgpr2_sgpr3 - 
; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: S_NOP 0, implicit-def dead $sgpr0_sgpr1 - ; GFX940-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX940-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX940-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX940-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec - ; GFX940-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 + ; GFX942-LABEL: name: copy_sgpr_to_agpr_tuple_kill + ; GFX942: liveins: $agpr0, $sgpr2_sgpr3 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: S_NOP 0, implicit-def dead $sgpr0_sgpr1 + ; GFX942-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX942-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX942-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX942-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 S_NOP 0, implicit-def dead $sgpr0_sgpr1 renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 @@ -1457,15 +1457,15 @@ body: | ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 ; - ; GFX940-LABEL: name: copy_agpr_to_agpr_tuple - ; GFX940: liveins: $agpr0, $agpr2_agpr3 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: S_NOP 0, implicit-def dead $agpr0_agpr1 - ; GFX940-NEXT: $agpr7 = 
V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX940-NEXT: $agpr6 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX940-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX940-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit $exec - ; GFX940-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX942-LABEL: name: copy_agpr_to_agpr_tuple + ; GFX942: liveins: $agpr0, $agpr2_agpr3 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: S_NOP 0, implicit-def dead $agpr0_agpr1 + ; GFX942-NEXT: $agpr7 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX942-NEXT: $agpr6 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX942-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX942-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 S_NOP 0, implicit-def dead $agpr0_agpr1 renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 @@ -1502,15 +1502,15 @@ body: | ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 ; - ; GFX940-LABEL: name: copy_agpr_to_agpr_tuple_kill - ; GFX940: liveins: $agpr0, $agpr2_agpr3 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: S_NOP 0, implicit-def dead $agpr0_agpr1 - ; GFX940-NEXT: $agpr7 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit 
$agpr0_agpr1_agpr2_agpr3 - ; GFX940-NEXT: $agpr6 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX940-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX940-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, implicit $exec - ; GFX940-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 + ; GFX942-LABEL: name: copy_agpr_to_agpr_tuple_kill + ; GFX942: liveins: $agpr0, $agpr2_agpr3 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: S_NOP 0, implicit-def dead $agpr0_agpr1 + ; GFX942-NEXT: $agpr7 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX942-NEXT: $agpr6 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX942-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX942-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 S_NOP 0, implicit-def dead $agpr0_agpr1 renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable killed $agpr0_agpr1_agpr2_agpr3, implicit $exec S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 diff --git llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll index 835e5e5f06ef..0114de738ce8 100644 --- llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll +++ llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefix=OBJDUMP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck --check-prefix=ASM %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefix=OBJDUMP 
%s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck --check-prefix=ASM %s ; OBJDUMP: Contents of section .rodata: ; OBJDUMP-NEXT: 0000 00000000 00000000 10010000 00000000 ................ diff --git llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll index f5c9b1a79b47..5f56568ef88e 100644 --- llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll +++ llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefix=GFX90A %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX942 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX1100 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX1200 %s @@ -80,14 +80,14 @@ define float @syncscope_system(ptr %addr, float %val) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: syncscope_system: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: syncscope_system: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-LABEL: 
syncscope_system: ; GFX1100: ; %bb.0: @@ -187,12 +187,12 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 { ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: syncscope_workgroup_rtn: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: syncscope_workgroup_rtn: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-LABEL: syncscope_workgroup_rtn: ; GFX1100: ; %bb.0: @@ -320,12 +320,12 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: syncscope_workgroup_nortn: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: syncscope_workgroup_nortn: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-LABEL: syncscope_workgroup_nortn: ; GFX1100: ; %bb.0: @@ -396,12 +396,12 @@ define float @no_unsafe(ptr %addr, float %val) { ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: no_unsafe: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: no_unsafe: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-LABEL: no_unsafe: ; GFX1100: ; %bb.0: diff --git llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll index 632f7dbc5337..80fb20f93b3e 100644 --- llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll +++ llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-NO-BACKOFF %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-BACKOFF %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX940-BACKOFF %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX942-BACKOFF %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-back-off-barrier -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-NO-BACKOFF %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-BACKOFF %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-BACKOFF %s @@ -30,15 +30,15 @@ define void @back_off_barrier_no_fence(ptr %in, ptr %out) #0 { ; GFX9-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-BACKOFF-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-BACKOFF-LABEL: back_off_barrier_no_fence: -; GFX940-BACKOFF: ; %bb.0: -; GFX940-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-BACKOFF-NEXT: flat_load_dword v0, v[0:1] -; GFX940-BACKOFF-NEXT: s_barrier -; GFX940-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-BACKOFF-NEXT: flat_store_dword 
v[2:3], v0 sc0 sc1 -; GFX940-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-BACKOFF-NEXT: s_setpc_b64 s[30:31] +; GFX942-BACKOFF-LABEL: back_off_barrier_no_fence: +; GFX942-BACKOFF: ; %bb.0: +; GFX942-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-BACKOFF-NEXT: flat_load_dword v0, v[0:1] +; GFX942-BACKOFF-NEXT: s_barrier +; GFX942-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-BACKOFF-NEXT: flat_store_dword v[2:3], v0 sc0 sc1 +; GFX942-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-BACKOFF-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-BACKOFF-LABEL: back_off_barrier_no_fence: ; GFX10-BACKOFF: ; %bb.0: @@ -88,16 +88,16 @@ define void @back_off_barrier_with_fence(ptr %in, ptr %out) #0 { ; GFX9-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-BACKOFF-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-BACKOFF-LABEL: back_off_barrier_with_fence: -; GFX940-BACKOFF: ; %bb.0: -; GFX940-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-BACKOFF-NEXT: flat_load_dword v0, v[0:1] -; GFX940-BACKOFF-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-BACKOFF-NEXT: s_barrier -; GFX940-BACKOFF-NEXT: s_waitcnt vmcnt(0) -; GFX940-BACKOFF-NEXT: flat_store_dword v[2:3], v0 sc0 sc1 -; GFX940-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-BACKOFF-NEXT: s_setpc_b64 s[30:31] +; GFX942-BACKOFF-LABEL: back_off_barrier_with_fence: +; GFX942-BACKOFF: ; %bb.0: +; GFX942-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-BACKOFF-NEXT: flat_load_dword v0, v[0:1] +; GFX942-BACKOFF-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-BACKOFF-NEXT: s_barrier +; GFX942-BACKOFF-NEXT: s_waitcnt vmcnt(0) +; GFX942-BACKOFF-NEXT: flat_store_dword v[2:3], v0 sc0 sc1 +; GFX942-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-BACKOFF-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-BACKOFF-LABEL: back_off_barrier_with_fence: ; GFX10-BACKOFF: ; %bb.0: diff --git llvm/test/CodeGen/AMDGPU/bf16-conversions.ll llvm/test/CodeGen/AMDGPU/bf16-conversions.ll index 59e47cbc12b2..74164016f9a9 100644 --- 
llvm/test/CodeGen/AMDGPU/bf16-conversions.ll +++ llvm/test/CodeGen/AMDGPU/bf16-conversions.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 < %s | FileCheck --check-prefixes=GCN,GFX-940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX-942 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GCN,GFX-950 %s ; TODO: Add global-isel when it can support bf16 @@ -24,24 +24,24 @@ define amdgpu_ps float @v_test_cvt_bf16_f32_s(bfloat inreg %v) { } define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_v(<2 x float> %src) { -; GFX-940-LABEL: v_test_cvt_v2f32_v2bf16_v: -; GFX-940: ; %bb.0: -; GFX-940-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX-940-NEXT: s_movk_i32 s0, 0x7fff -; GFX-940-NEXT: v_add3_u32 v2, v2, v0, s0 -; GFX-940-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX-940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX-940-NEXT: s_nop 1 -; GFX-940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX-940-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX-940-NEXT: v_add3_u32 v2, v2, v1, s0 -; GFX-940-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX-940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX-940-NEXT: s_mov_b32 s0, 0x7060302 -; GFX-940-NEXT: s_nop 0 -; GFX-940-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX-940-NEXT: v_perm_b32 v0, v1, v0, s0 -; GFX-940-NEXT: ; return to shader part epilog +; GFX-942-LABEL: v_test_cvt_v2f32_v2bf16_v: +; GFX-942: ; %bb.0: +; GFX-942-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX-942-NEXT: s_movk_i32 s0, 0x7fff +; GFX-942-NEXT: v_add3_u32 v2, v2, v0, s0 +; GFX-942-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX-942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX-942-NEXT: s_nop 1 +; GFX-942-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX-942-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX-942-NEXT: v_add3_u32 v2, v2, v1, s0 +; GFX-942-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX-942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX-942-NEXT: s_mov_b32 s0, 0x7060302 
+; GFX-942-NEXT: s_nop 0 +; GFX-942-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX-942-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX-942-NEXT: ; return to shader part epilog ; ; GFX-950-LABEL: v_test_cvt_v2f32_v2bf16_v: ; GFX-950: ; %bb.0: @@ -53,27 +53,27 @@ define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_v(<2 x float> %src) { } define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_s(<2 x float> inreg %src) { -; GFX-940-LABEL: v_test_cvt_v2f32_v2bf16_s: -; GFX-940: ; %bb.0: -; GFX-940-NEXT: s_bfe_u32 s2, s1, 0x10010 -; GFX-940-NEXT: s_add_i32 s2, s2, s1 -; GFX-940-NEXT: s_or_b32 s4, s1, 0x400000 -; GFX-940-NEXT: s_add_i32 s5, s2, 0x7fff -; GFX-940-NEXT: v_cmp_u_f32_e64 s[2:3], s1, s1 -; GFX-940-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX-940-NEXT: s_cselect_b32 s1, s4, s5 -; GFX-940-NEXT: s_lshr_b32 s2, s1, 16 -; GFX-940-NEXT: s_bfe_u32 s1, s0, 0x10010 -; GFX-940-NEXT: s_add_i32 s1, s1, s0 -; GFX-940-NEXT: s_or_b32 s3, s0, 0x400000 -; GFX-940-NEXT: s_add_i32 s4, s1, 0x7fff -; GFX-940-NEXT: v_cmp_u_f32_e64 s[0:1], s0, s0 -; GFX-940-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX-940-NEXT: s_cselect_b32 s0, s3, s4 -; GFX-940-NEXT: s_lshr_b32 s0, s0, 16 -; GFX-940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX-940-NEXT: v_mov_b32_e32 v0, s0 -; GFX-940-NEXT: ; return to shader part epilog +; GFX-942-LABEL: v_test_cvt_v2f32_v2bf16_s: +; GFX-942: ; %bb.0: +; GFX-942-NEXT: s_bfe_u32 s2, s1, 0x10010 +; GFX-942-NEXT: s_add_i32 s2, s2, s1 +; GFX-942-NEXT: s_or_b32 s4, s1, 0x400000 +; GFX-942-NEXT: s_add_i32 s5, s2, 0x7fff +; GFX-942-NEXT: v_cmp_u_f32_e64 s[2:3], s1, s1 +; GFX-942-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX-942-NEXT: s_cselect_b32 s1, s4, s5 +; GFX-942-NEXT: s_lshr_b32 s2, s1, 16 +; GFX-942-NEXT: s_bfe_u32 s1, s0, 0x10010 +; GFX-942-NEXT: s_add_i32 s1, s1, s0 +; GFX-942-NEXT: s_or_b32 s3, s0, 0x400000 +; GFX-942-NEXT: s_add_i32 s4, s1, 0x7fff +; GFX-942-NEXT: v_cmp_u_f32_e64 s[0:1], s0, s0 +; GFX-942-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX-942-NEXT: s_cselect_b32 s0, s3, s4 
+; GFX-942-NEXT: s_lshr_b32 s0, s0, 16 +; GFX-942-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX-942-NEXT: v_mov_b32_e32 v0, s0 +; GFX-942-NEXT: ; return to shader part epilog ; ; GFX-950-LABEL: v_test_cvt_v2f32_v2bf16_s: ; GFX-950: ; %bb.0: @@ -86,17 +86,17 @@ define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_s(<2 x float> inreg %src) { } define amdgpu_ps float @v_test_cvt_f32_bf16_v(float %src) { -; GFX-940-LABEL: v_test_cvt_f32_bf16_v: -; GFX-940: ; %bb.0: -; GFX-940-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX-940-NEXT: s_movk_i32 s0, 0x7fff -; GFX-940-NEXT: v_add3_u32 v1, v1, v0, s0 -; GFX-940-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX-940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX-940-NEXT: s_nop 1 -; GFX-940-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX-940-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX-940-NEXT: ; return to shader part epilog +; GFX-942-LABEL: v_test_cvt_f32_bf16_v: +; GFX-942: ; %bb.0: +; GFX-942-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX-942-NEXT: s_movk_i32 s0, 0x7fff +; GFX-942-NEXT: v_add3_u32 v1, v1, v0, s0 +; GFX-942-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX-942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX-942-NEXT: s_nop 1 +; GFX-942-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX-942-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX-942-NEXT: ; return to shader part epilog ; ; GFX-950-LABEL: v_test_cvt_f32_bf16_v: ; GFX-950: ; %bb.0: @@ -109,47 +109,47 @@ define amdgpu_ps float @v_test_cvt_f32_bf16_v(float %src) { } define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) { -; GFX-940-LABEL: v_test_cvt_v2f64_v2bf16_v: -; GFX-940: ; %bb.0: -; GFX-940-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]| -; GFX-940-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; GFX-940-NEXT: v_and_b32_e32 v7, 1, v6 -; GFX-940-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] -; GFX-940-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] -; GFX-940-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GFX-940-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] -; GFX-940-NEXT: v_add_u32_e32 v4, v6, v4 -; GFX-940-NEXT: s_or_b64 
vcc, s[0:1], vcc -; GFX-940-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX-940-NEXT: s_brev_b32 s4, 1 -; GFX-940-NEXT: v_and_or_b32 v5, v1, s4, v4 -; GFX-940-NEXT: v_bfe_u32 v4, v4, 16, 1 -; GFX-940-NEXT: s_movk_i32 s5, 0x7fff -; GFX-940-NEXT: v_add3_u32 v4, v4, v5, s5 -; GFX-940-NEXT: v_or_b32_e32 v5, 0x400000, v5 -; GFX-940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] -; GFX-940-NEXT: s_nop 1 -; GFX-940-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; GFX-940-NEXT: v_cvt_f32_f64_e64 v5, |v[2:3]| -; GFX-940-NEXT: v_cvt_f64_f32_e32 v[0:1], v5 -; GFX-940-NEXT: v_and_b32_e32 v6, 1, v5 -; GFX-940-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, v[0:1] -; GFX-940-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[2:3]|, v[0:1] -; GFX-940-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX-940-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3] -; GFX-940-NEXT: v_add_u32_e32 v0, v5, v0 -; GFX-940-NEXT: s_or_b64 vcc, s[0:1], vcc -; GFX-940-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GFX-940-NEXT: v_and_or_b32 v1, v3, s4, v0 -; GFX-940-NEXT: v_bfe_u32 v0, v0, 16, 1 -; GFX-940-NEXT: v_add3_u32 v0, v0, v1, s5 -; GFX-940-NEXT: v_or_b32_e32 v1, 0x400000, v1 -; GFX-940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3] -; GFX-940-NEXT: s_mov_b32 s0, 0x7060302 -; GFX-940-NEXT: s_nop 0 -; GFX-940-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX-940-NEXT: v_perm_b32 v0, v0, v4, s0 -; GFX-940-NEXT: ; return to shader part epilog +; GFX-942-LABEL: v_test_cvt_v2f64_v2bf16_v: +; GFX-942: ; %bb.0: +; GFX-942-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]| +; GFX-942-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; GFX-942-NEXT: v_and_b32_e32 v7, 1, v6 +; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] +; GFX-942-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] +; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX-942-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] +; GFX-942-NEXT: v_add_u32_e32 v4, v6, v4 +; GFX-942-NEXT: s_or_b64 vcc, s[0:1], vcc +; GFX-942-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX-942-NEXT: s_brev_b32 s4, 1 +; GFX-942-NEXT: v_and_or_b32 v5, v1, 
s4, v4 +; GFX-942-NEXT: v_bfe_u32 v4, v4, 16, 1 +; GFX-942-NEXT: s_movk_i32 s5, 0x7fff +; GFX-942-NEXT: v_add3_u32 v4, v4, v5, s5 +; GFX-942-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX-942-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX-942-NEXT: s_nop 1 +; GFX-942-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX-942-NEXT: v_cvt_f32_f64_e64 v5, |v[2:3]| +; GFX-942-NEXT: v_cvt_f64_f32_e32 v[0:1], v5 +; GFX-942-NEXT: v_and_b32_e32 v6, 1, v5 +; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, v[0:1] +; GFX-942-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[2:3]|, v[0:1] +; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX-942-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3] +; GFX-942-NEXT: v_add_u32_e32 v0, v5, v0 +; GFX-942-NEXT: s_or_b64 vcc, s[0:1], vcc +; GFX-942-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GFX-942-NEXT: v_and_or_b32 v1, v3, s4, v0 +; GFX-942-NEXT: v_bfe_u32 v0, v0, 16, 1 +; GFX-942-NEXT: v_add3_u32 v0, v0, v1, s5 +; GFX-942-NEXT: v_or_b32_e32 v1, 0x400000, v1 +; GFX-942-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3] +; GFX-942-NEXT: s_mov_b32 s0, 0x7060302 +; GFX-942-NEXT: s_nop 0 +; GFX-942-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX-942-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX-942-NEXT: ; return to shader part epilog ; ; GFX-950-LABEL: v_test_cvt_v2f64_v2bf16_v: ; GFX-950: ; %bb.0: @@ -163,24 +163,24 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) { } define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16(float %a, float %b) { -; GFX-940-LABEL: fptrunc_f32_f32_to_v2bf16: -; GFX-940: ; %bb.0: ; %entry -; GFX-940-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX-940-NEXT: s_movk_i32 s0, 0x7fff -; GFX-940-NEXT: v_add3_u32 v2, v2, v0, s0 -; GFX-940-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX-940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX-940-NEXT: s_nop 1 -; GFX-940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX-940-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX-940-NEXT: v_add3_u32 v2, v2, v1, s0 -; GFX-940-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX-940-NEXT: v_cmp_u_f32_e32 vcc, 
v1, v1 -; GFX-940-NEXT: s_mov_b32 s0, 0x7060302 -; GFX-940-NEXT: s_nop 0 -; GFX-940-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX-940-NEXT: v_perm_b32 v0, v1, v0, s0 -; GFX-940-NEXT: ; return to shader part epilog +; GFX-942-LABEL: fptrunc_f32_f32_to_v2bf16: +; GFX-942: ; %bb.0: ; %entry +; GFX-942-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX-942-NEXT: s_movk_i32 s0, 0x7fff +; GFX-942-NEXT: v_add3_u32 v2, v2, v0, s0 +; GFX-942-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX-942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX-942-NEXT: s_nop 1 +; GFX-942-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX-942-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX-942-NEXT: v_add3_u32 v2, v2, v1, s0 +; GFX-942-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX-942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX-942-NEXT: s_mov_b32 s0, 0x7060302 +; GFX-942-NEXT: s_nop 0 +; GFX-942-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX-942-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX-942-NEXT: ; return to shader part epilog ; ; GFX-950-LABEL: fptrunc_f32_f32_to_v2bf16: ; GFX-950: ; %bb.0: ; %entry @@ -196,26 +196,26 @@ entry: } define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16_mods(float %a, float %b) { -; GFX-940-LABEL: fptrunc_f32_f32_to_v2bf16_mods: -; GFX-940: ; %bb.0: ; %entry -; GFX-940-NEXT: v_xor_b32_e32 v2, 0x80000000, v0 -; GFX-940-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX-940-NEXT: s_movk_i32 s0, 0x7fff -; GFX-940-NEXT: v_add3_u32 v3, v3, v2, s0 -; GFX-940-NEXT: v_or_b32_e32 v2, 0x400000, v2 -; GFX-940-NEXT: v_cmp_u_f32_e64 vcc, -v0, -v0 -; GFX-940-NEXT: s_nop 1 -; GFX-940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX-940-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 -; GFX-940-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX-940-NEXT: v_add3_u32 v3, v3, v2, s0 -; GFX-940-NEXT: v_or_b32_e32 v2, 0x400000, v2 -; GFX-940-NEXT: v_cmp_u_f32_e64 vcc, |v1|, |v1| -; GFX-940-NEXT: s_mov_b32 s0, 0x7060302 -; GFX-940-NEXT: s_nop 0 -; GFX-940-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc -; GFX-940-NEXT: v_perm_b32 v0, v1, v0, s0 -; GFX-940-NEXT: ; return to shader 
part epilog +; GFX-942-LABEL: fptrunc_f32_f32_to_v2bf16_mods: +; GFX-942: ; %bb.0: ; %entry +; GFX-942-NEXT: v_xor_b32_e32 v2, 0x80000000, v0 +; GFX-942-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX-942-NEXT: s_movk_i32 s0, 0x7fff +; GFX-942-NEXT: v_add3_u32 v3, v3, v2, s0 +; GFX-942-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX-942-NEXT: v_cmp_u_f32_e64 vcc, -v0, -v0 +; GFX-942-NEXT: s_nop 1 +; GFX-942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX-942-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 +; GFX-942-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX-942-NEXT: v_add3_u32 v3, v3, v2, s0 +; GFX-942-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX-942-NEXT: v_cmp_u_f32_e64 vcc, |v1|, |v1| +; GFX-942-NEXT: s_mov_b32 s0, 0x7060302 +; GFX-942-NEXT: s_nop 0 +; GFX-942-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX-942-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX-942-NEXT: ; return to shader part epilog ; ; GFX-950-LABEL: fptrunc_f32_f32_to_v2bf16_mods: ; GFX-950: ; %bb.0: ; %entry @@ -233,19 +233,19 @@ entry: } define amdgpu_ps void @fptrunc_f32_to_bf16(float %a, ptr %out) { -; GFX-940-LABEL: fptrunc_f32_to_bf16: -; GFX-940: ; %bb.0: ; %entry -; GFX-940-NEXT: v_mov_b32_e32 v3, v2 -; GFX-940-NEXT: v_mov_b32_e32 v2, v1 -; GFX-940-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX-940-NEXT: s_movk_i32 s0, 0x7fff -; GFX-940-NEXT: v_add3_u32 v1, v1, v0, s0 -; GFX-940-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX-940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX-940-NEXT: s_nop 1 -; GFX-940-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc -; GFX-940-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 -; GFX-940-NEXT: s_endpgm +; GFX-942-LABEL: fptrunc_f32_to_bf16: +; GFX-942: ; %bb.0: ; %entry +; GFX-942-NEXT: v_mov_b32_e32 v3, v2 +; GFX-942-NEXT: v_mov_b32_e32 v2, v1 +; GFX-942-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX-942-NEXT: s_movk_i32 s0, 0x7fff +; GFX-942-NEXT: v_add3_u32 v1, v1, v0, s0 +; GFX-942-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX-942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX-942-NEXT: s_nop 1 +; GFX-942-NEXT: v_cndmask_b32_e32 v0, v1, 
v4, vcc +; GFX-942-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 +; GFX-942-NEXT: s_endpgm ; ; GFX-950-LABEL: fptrunc_f32_to_bf16: ; GFX-950: ; %bb.0: ; %entry @@ -261,20 +261,20 @@ entry: } define amdgpu_ps void @fptrunc_f32_to_bf16_abs(float %a, ptr %out) { -; GFX-940-LABEL: fptrunc_f32_to_bf16_abs: -; GFX-940: ; %bb.0: ; %entry -; GFX-940-NEXT: v_mov_b32_e32 v3, v2 -; GFX-940-NEXT: v_mov_b32_e32 v2, v1 -; GFX-940-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0 -; GFX-940-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX-940-NEXT: s_movk_i32 s0, 0x7fff -; GFX-940-NEXT: v_add3_u32 v4, v4, v1, s0 -; GFX-940-NEXT: v_or_b32_e32 v1, 0x400000, v1 -; GFX-940-NEXT: v_cmp_u_f32_e64 vcc, |v0|, |v0| -; GFX-940-NEXT: s_nop 1 -; GFX-940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX-940-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 -; GFX-940-NEXT: s_endpgm +; GFX-942-LABEL: fptrunc_f32_to_bf16_abs: +; GFX-942: ; %bb.0: ; %entry +; GFX-942-NEXT: v_mov_b32_e32 v3, v2 +; GFX-942-NEXT: v_mov_b32_e32 v2, v1 +; GFX-942-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0 +; GFX-942-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX-942-NEXT: s_movk_i32 s0, 0x7fff +; GFX-942-NEXT: v_add3_u32 v4, v4, v1, s0 +; GFX-942-NEXT: v_or_b32_e32 v1, 0x400000, v1 +; GFX-942-NEXT: v_cmp_u_f32_e64 vcc, |v0|, |v0| +; GFX-942-NEXT: s_nop 1 +; GFX-942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX-942-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 +; GFX-942-NEXT: s_endpgm ; ; GFX-950-LABEL: fptrunc_f32_to_bf16_abs: ; GFX-950: ; %bb.0: ; %entry @@ -291,20 +291,20 @@ entry: } define amdgpu_ps void @fptrunc_f32_to_bf16_neg(float %a, ptr %out) { -; GFX-940-LABEL: fptrunc_f32_to_bf16_neg: -; GFX-940: ; %bb.0: ; %entry -; GFX-940-NEXT: v_mov_b32_e32 v3, v2 -; GFX-940-NEXT: v_mov_b32_e32 v2, v1 -; GFX-940-NEXT: v_xor_b32_e32 v1, 0x80000000, v0 -; GFX-940-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX-940-NEXT: s_movk_i32 s0, 0x7fff -; GFX-940-NEXT: v_add3_u32 v4, v4, v1, s0 -; GFX-940-NEXT: v_or_b32_e32 v1, 0x400000, v1 -; GFX-940-NEXT: v_cmp_u_f32_e64 
vcc, -v0, -v0 -; GFX-940-NEXT: s_nop 1 -; GFX-940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX-940-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 -; GFX-940-NEXT: s_endpgm +; GFX-942-LABEL: fptrunc_f32_to_bf16_neg: +; GFX-942: ; %bb.0: ; %entry +; GFX-942-NEXT: v_mov_b32_e32 v3, v2 +; GFX-942-NEXT: v_mov_b32_e32 v2, v1 +; GFX-942-NEXT: v_xor_b32_e32 v1, 0x80000000, v0 +; GFX-942-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX-942-NEXT: s_movk_i32 s0, 0x7fff +; GFX-942-NEXT: v_add3_u32 v4, v4, v1, s0 +; GFX-942-NEXT: v_or_b32_e32 v1, 0x400000, v1 +; GFX-942-NEXT: v_cmp_u_f32_e64 vcc, -v0, -v0 +; GFX-942-NEXT: s_nop 1 +; GFX-942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX-942-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 +; GFX-942-NEXT: s_endpgm ; ; GFX-950-LABEL: fptrunc_f32_to_bf16_neg: ; GFX-950: ; %bb.0: ; %entry @@ -321,29 +321,29 @@ entry: } define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) { -; GFX-940-LABEL: fptrunc_f64_to_bf16: -; GFX-940: ; %bb.0: ; %entry -; GFX-940-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]| -; GFX-940-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; GFX-940-NEXT: v_and_b32_e32 v7, 1, v6 -; GFX-940-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] -; GFX-940-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] -; GFX-940-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GFX-940-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] -; GFX-940-NEXT: v_add_u32_e32 v4, v6, v4 -; GFX-940-NEXT: s_or_b64 vcc, s[0:1], vcc -; GFX-940-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX-940-NEXT: s_brev_b32 s0, 1 -; GFX-940-NEXT: v_and_or_b32 v5, v1, s0, v4 -; GFX-940-NEXT: v_bfe_u32 v4, v4, 16, 1 -; GFX-940-NEXT: s_movk_i32 s0, 0x7fff -; GFX-940-NEXT: v_add3_u32 v4, v4, v5, s0 -; GFX-940-NEXT: v_or_b32_e32 v5, 0x400000, v5 -; GFX-940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] -; GFX-940-NEXT: s_nop 1 -; GFX-940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX-940-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 -; GFX-940-NEXT: s_endpgm +; GFX-942-LABEL: fptrunc_f64_to_bf16: +; GFX-942: ; 
%bb.0: ; %entry +; GFX-942-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]| +; GFX-942-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; GFX-942-NEXT: v_and_b32_e32 v7, 1, v6 +; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] +; GFX-942-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] +; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX-942-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] +; GFX-942-NEXT: v_add_u32_e32 v4, v6, v4 +; GFX-942-NEXT: s_or_b64 vcc, s[0:1], vcc +; GFX-942-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX-942-NEXT: s_brev_b32 s0, 1 +; GFX-942-NEXT: v_and_or_b32 v5, v1, s0, v4 +; GFX-942-NEXT: v_bfe_u32 v4, v4, 16, 1 +; GFX-942-NEXT: s_movk_i32 s0, 0x7fff +; GFX-942-NEXT: v_add3_u32 v4, v4, v5, s0 +; GFX-942-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX-942-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX-942-NEXT: s_nop 1 +; GFX-942-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX-942-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 +; GFX-942-NEXT: s_endpgm ; ; GFX-950-LABEL: fptrunc_f64_to_bf16: ; GFX-950: ; %bb.0: ; %entry @@ -358,30 +358,30 @@ entry: } define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) { -; GFX-940-LABEL: fptrunc_f64_to_bf16_neg: -; GFX-940: ; %bb.0: ; %entry -; GFX-940-NEXT: v_cvt_f32_f64_e64 v7, |v[0:1]| -; GFX-940-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 -; GFX-940-NEXT: v_and_b32_e32 v8, 1, v7 -; GFX-940-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] -; GFX-940-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] -; GFX-940-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 -; GFX-940-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] -; GFX-940-NEXT: v_add_u32_e32 v4, v7, v4 -; GFX-940-NEXT: s_or_b64 vcc, s[0:1], vcc -; GFX-940-NEXT: s_brev_b32 s4, 1 -; GFX-940-NEXT: v_xor_b32_e32 v6, 0x80000000, v1 -; GFX-940-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; GFX-940-NEXT: v_and_or_b32 v5, v6, s4, v4 -; GFX-940-NEXT: v_bfe_u32 v4, v4, 16, 1 -; GFX-940-NEXT: s_movk_i32 s0, 0x7fff -; GFX-940-NEXT: v_add3_u32 v4, v4, v5, s0 -; GFX-940-NEXT: v_or_b32_e32 v5, 0x400000, v5 
-; GFX-940-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[0:1] -; GFX-940-NEXT: s_nop 1 -; GFX-940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX-940-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 -; GFX-940-NEXT: s_endpgm +; GFX-942-LABEL: fptrunc_f64_to_bf16_neg: +; GFX-942: ; %bb.0: ; %entry +; GFX-942-NEXT: v_cvt_f32_f64_e64 v7, |v[0:1]| +; GFX-942-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 +; GFX-942-NEXT: v_and_b32_e32 v8, 1, v7 +; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] +; GFX-942-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] +; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX-942-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] +; GFX-942-NEXT: v_add_u32_e32 v4, v7, v4 +; GFX-942-NEXT: s_or_b64 vcc, s[0:1], vcc +; GFX-942-NEXT: s_brev_b32 s4, 1 +; GFX-942-NEXT: v_xor_b32_e32 v6, 0x80000000, v1 +; GFX-942-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX-942-NEXT: v_and_or_b32 v5, v6, s4, v4 +; GFX-942-NEXT: v_bfe_u32 v4, v4, 16, 1 +; GFX-942-NEXT: s_movk_i32 s0, 0x7fff +; GFX-942-NEXT: v_add3_u32 v4, v4, v5, s0 +; GFX-942-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX-942-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[0:1] +; GFX-942-NEXT: s_nop 1 +; GFX-942-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX-942-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 +; GFX-942-NEXT: s_endpgm ; ; GFX-950-LABEL: fptrunc_f64_to_bf16_neg: ; GFX-950: ; %bb.0: ; %entry @@ -397,30 +397,30 @@ entry: } define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) { -; GFX-940-LABEL: fptrunc_f64_to_bf16_abs: -; GFX-940: ; %bb.0: ; %entry -; GFX-940-NEXT: v_cvt_f32_f64_e64 v7, |v[0:1]| -; GFX-940-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 -; GFX-940-NEXT: v_and_b32_e32 v8, 1, v7 -; GFX-940-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] -; GFX-940-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] -; GFX-940-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 -; GFX-940-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] -; GFX-940-NEXT: v_add_u32_e32 v4, v7, v4 -; GFX-940-NEXT: s_or_b64 vcc, s[0:1], vcc -; GFX-940-NEXT: 
v_and_b32_e32 v6, 0x7fffffff, v1 -; GFX-940-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; GFX-940-NEXT: s_brev_b32 s0, 1 -; GFX-940-NEXT: v_and_or_b32 v5, v6, s0, v4 -; GFX-940-NEXT: v_bfe_u32 v4, v4, 16, 1 -; GFX-940-NEXT: s_movk_i32 s0, 0x7fff -; GFX-940-NEXT: v_add3_u32 v4, v4, v5, s0 -; GFX-940-NEXT: v_or_b32_e32 v5, 0x400000, v5 -; GFX-940-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[0:1]| -; GFX-940-NEXT: s_nop 1 -; GFX-940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX-940-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 -; GFX-940-NEXT: s_endpgm +; GFX-942-LABEL: fptrunc_f64_to_bf16_abs: +; GFX-942: ; %bb.0: ; %entry +; GFX-942-NEXT: v_cvt_f32_f64_e64 v7, |v[0:1]| +; GFX-942-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 +; GFX-942-NEXT: v_and_b32_e32 v8, 1, v7 +; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] +; GFX-942-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] +; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX-942-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] +; GFX-942-NEXT: v_add_u32_e32 v4, v7, v4 +; GFX-942-NEXT: s_or_b64 vcc, s[0:1], vcc +; GFX-942-NEXT: v_and_b32_e32 v6, 0x7fffffff, v1 +; GFX-942-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX-942-NEXT: s_brev_b32 s0, 1 +; GFX-942-NEXT: v_and_or_b32 v5, v6, s0, v4 +; GFX-942-NEXT: v_bfe_u32 v4, v4, 16, 1 +; GFX-942-NEXT: s_movk_i32 s0, 0x7fff +; GFX-942-NEXT: v_add3_u32 v4, v4, v5, s0 +; GFX-942-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX-942-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[0:1]| +; GFX-942-NEXT: s_nop 1 +; GFX-942-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX-942-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 +; GFX-942-NEXT: s_endpgm ; ; GFX-950-LABEL: fptrunc_f64_to_bf16_abs: ; GFX-950: ; %bb.0: ; %entry diff --git llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll index 85a701b23a6c..a14114358433 100644 --- llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll +++ llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll @@ 
-1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908_GFX11 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908_GFX11 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908_GFX11 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908_GFX11 %s define amdgpu_ps void @buffer_atomic_fadd_f32_offset_no_rtn(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { @@ -23,19 +23,19 @@ define amdgpu_ps void @buffer_atomic_fadd_f32_offset_no_rtn(float %val, <4 x i32 ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec 
:: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_offset_no_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f32_offset_no_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: 
(volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) ret void } @@ -56,20 +56,20 @@ define amdgpu_ps void @buffer_atomic_fadd_f32_offen_no_rtn(float %val, <4 x i32> ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_offen_no_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f32_offen_no_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; 
GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } @@ -90,20 +90,20 @@ define amdgpu_ps void @buffer_atomic_fadd_f32_idxen_no_rtn(float %val, <4 x i32> ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_idxen_no_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], 
%subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f32_idxen_no_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void } @@ -126,22 +126,22 @@ define amdgpu_ps void @buffer_atomic_fadd_f32_bothen_no_rtn(float %val, <4 x i32 ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_bothen_no_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, 
$sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f32_bothen_no_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], 
%subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void } @@ -167,25 +167,25 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_offset_no_rtn(float %val, ptr ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_no_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY 
[[REG_SEQUENCE1]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_no_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY5]], killed [[REG_SEQUENCE2]], 
[[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) ret void } @@ -212,26 +212,26 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_offen_no_rtn(float %val, ptr a ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_no_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], 
%subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_no_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: 
S_ENDPGM 0 %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } @@ -258,26 +258,26 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_idxen_no_rtn(float %val, ptr a ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_no_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN 
[[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_no_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 
0, i32 %soffset, i32 0) ret void } @@ -306,28 +306,28 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_bothen_no_rtn(float %val, ptr ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_no_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE 
[[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_no_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: 
BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void } diff --git llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll index 417dee573c5d..eb452dc4b874 100644 --- llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll +++ llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll @@ -1,26 +1,26 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX11 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 
-enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX11 %s define amdgpu_ps float @buffer_atomic_fadd_f32_offset_rtn(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_offset_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f32_offset_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = 
REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; ; GFX11-LABEL: name: buffer_atomic_fadd_f32_offset_rtn ; GFX11: bb.0 (%ir-block.0): @@ -41,21 +41,21 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_offset_rtn(float %val, <4 x i32> } define amdgpu_ps float @buffer_atomic_fadd_f32_offen_rtn(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_offen_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]] - ; GFX90A_GFX940-NEXT: 
SI_RETURN_TO_EPILOG $vgpr0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f32_offen_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; ; GFX11-LABEL: name: buffer_atomic_fadd_f32_offen_rtn ; GFX11: bb.0 (%ir-block.0): @@ -77,21 +77,21 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_offen_rtn(float %val, <4 x i32> i } define amdgpu_ps float @buffer_atomic_fadd_f32_idxen_rtn(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_idxen_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; 
GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f32_idxen_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]] + ; 
GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; ; GFX11-LABEL: name: buffer_atomic_fadd_f32_idxen_rtn ; GFX11: bb.0 (%ir-block.0): @@ -113,23 +113,23 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_idxen_rtn(float %val, <4 x i32> i } define amdgpu_ps float @buffer_atomic_fadd_f32_bothen_rtn(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_bothen_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f32_bothen_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, 
$vgpr1, $vgpr2, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; ; GFX11-LABEL: name: buffer_atomic_fadd_f32_bothen_rtn ; GFX11: bb.0 (%ir-block.0): @@ -153,26 +153,26 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_bothen_rtn(float %val, <4 x i32> } define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offset_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; 
GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = 
REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; ; GFX11-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_rtn ; GFX11: bb.0 (%ir-block.0): @@ -199,27 +199,27 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offset_rtn(float %val, ptr ad } define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offen_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: 
[[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = 
REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; ; GFX11-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_rtn ; GFX11: bb.0 (%ir-block.0): @@ -247,27 +247,27 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offen_rtn(float %val, ptr add } define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_idxen_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; 
GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; ; GFX11-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_rtn ; GFX11: bb.0 (%ir-block.0): @@ -295,29 +295,29 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_idxen_rtn(float %val, ptr add } define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_bothen_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: 
[[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: 
[[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; ; GFX11-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_rtn ; GFX11: bb.0 (%ir-block.0): diff --git llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll index ff087715e0fc..6885657bbfa3 100644 --- llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll +++ llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll @@ -1,441 +1,441 @@ ; NOTE: Assertions 
have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s define amdgpu_ps void @buffer_atomic_fadd_f64_offset_no_rtn(double %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f64_offset_no_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, 
[[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F64_OFFSET killed [[COPY7]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f64_offset_no_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F64_OFFSET killed [[COPY7]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) ret void } define amdgpu_ps void @buffer_atomic_fadd_f64_offen_no_rtn(double %val, <4 x i32> 
inreg %rsrc, i32 %voffset, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f64_offen_no_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F64_OFFEN killed [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f64_offen_no_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: 
[[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F64_OFFEN killed [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } define amdgpu_ps void @buffer_atomic_fadd_f64_idxen_no_rtn(double %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f64_idxen_no_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: 
[[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F64_IDXEN killed [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f64_idxen_no_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F64_IDXEN killed [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void } define amdgpu_ps void @buffer_atomic_fadd_f64_bothen_no_rtn(double %val, 
<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f64_bothen_no_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $vgpr3, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F64_BOTHEN killed [[COPY9]], killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f64_bothen_no_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $vgpr3, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: 
[[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F64_BOTHEN killed [[COPY9]], killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void } define amdgpu_ps double @buffer_atomic_fadd_f64_offset_rtn(double %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f64_offset_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; 
GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY7]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY8]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY9]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f64_offset_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, 
[[COPY1]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY7]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0 + ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1 + ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY8]] + ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY9]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) ret double %ret } define amdgpu_ps double @buffer_atomic_fadd_f64_offen_rtn(double %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f64_offen_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, 
[[COPY2]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY9]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY10]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f64_offen_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A_GFX942-NEXT: 
[[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0 + ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1 + ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY9]] + ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY10]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret double %ret } define amdgpu_ps double @buffer_atomic_fadd_f64_idxen_rtn(double %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f64_idxen_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; 
GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY9]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY10]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f64_idxen_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; 
GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0 + ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1 + ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY9]] + ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY10]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret double %ret } define amdgpu_ps double @buffer_atomic_fadd_f64_bothen_rtn(double %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f64_bothen_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $vgpr3, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: 
[[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY9]], killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY10]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY11]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f64_bothen_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $vgpr3, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A_GFX942-NEXT: 
[[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY9]], killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0 + ; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1 + ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY10]] + ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY11]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret double %ret } define amdgpu_ps void @buffer_ptr_atomic_fadd_f64_offset_no_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f64_offset_no_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; 
GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F64_OFFSET killed [[COPY11]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_offset_no_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX942-NEXT: 
[[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F64_OFFSET killed [[COPY11]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) ret void } define amdgpu_ps void @buffer_ptr_atomic_fadd_f64_offen_no_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f64_offen_no_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY 
[[REG_SEQUENCE]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F64_OFFEN killed [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_offen_no_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; 
GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F64_OFFEN killed [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } define amdgpu_ps void @buffer_ptr_atomic_fadd_f64_idxen_no_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f64_idxen_no_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; 
GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F64_IDXEN killed [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_idxen_no_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F64_IDXEN killed [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void } define amdgpu_ps void @buffer_ptr_atomic_fadd_f64_bothen_no_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f64_bothen_no_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $vgpr3, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 
= COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY12]], %subreg.sub0, killed [[COPY11]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F64_BOTHEN killed [[COPY13]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_bothen_no_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $vgpr3, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: 
[[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY12]], %subreg.sub0, killed [[COPY11]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F64_BOTHEN killed [[COPY13]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double 
%val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void } define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_offset_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f64_offset_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY11]], killed 
[[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY12]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY13]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_offset_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE 
[[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY11]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0 + ; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1 + ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY12]] + ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY13]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) ret double %ret } define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_offen_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f64_offen_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 - ; 
GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY13]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY14]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_offen_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: 
[[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0 + ; GFX90A_GFX942-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1 + ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY13]] + ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY14]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret double %ret } define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_idxen_rtn(double %val, 
ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f64_idxen_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile 
dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY13]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY14]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_idxen_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = 
REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0 + ; GFX90A_GFX942-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1 + ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY13]] + ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY14]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret double %ret } define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_bothen_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f64_bothen_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $vgpr3, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], 
%subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY12]], %subreg.sub0, killed [[COPY11]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY13]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY14]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY15]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_bothen_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $vgpr3, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; 
GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY12]], %subreg.sub0, killed [[COPY11]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY13]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY 
[[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0 + ; GFX90A_GFX942-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1 + ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY14]] + ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY15]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret double %ret } diff --git llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll index 44fddc372293..790cd8ef9ecc 100644 --- llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll +++ llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | 
FileCheck -check-prefix=GFX90A_GFX942 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s define amdgpu_ps void @buffer_atomic_fadd_v2f16_offset_no_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { ; GFX908-LABEL: name: buffer_atomic_fadd_v2f16_offset_no_rtn @@ -21,19 +21,19 @@ define amdgpu_ps void @buffer_atomic_fadd_v2f16_offset_no_rtn(<2 x half> %val, < ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_offset_no_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_v2f16_offset_no_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: 
[[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void } @@ -54,20 +54,20 @@ define amdgpu_ps void @buffer_atomic_fadd_v2f16_offen_no_rtn(<2 x half> %val, <4 ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_offen_no_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], 
%subreg.sub3 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_v2f16_offen_no_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } @@ -88,20 +88,20 @@ define amdgpu_ps void @buffer_atomic_fadd_v2f16_idxen_no_rtn(<2 x half> %val, <4 ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_idxen_no_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 - ; 
GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_v2f16_idxen_no_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call <2 x half> 
@llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void } @@ -124,22 +124,22 @@ define amdgpu_ps void @buffer_atomic_fadd_v2f16_bothen_no_rtn(<2 x half> %val, < ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_bothen_no_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_v2f16_bothen_no_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4 + ; 
GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void } @@ -165,25 +165,25 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_v2f16_offset_no_rtn(<2 x half> %va ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offset_no_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - 
; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offset_no_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY 
[[REG_SEQUENCE]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 4095, i32 %soffset, i32 0) ret void } @@ -210,26 +210,26 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_v2f16_offen_no_rtn(<2 x half> %val ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offen_no_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; 
GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offen_no_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX942-NEXT: 
[[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } @@ -256,26 +256,26 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_v2f16_idxen_no_rtn(<2 x half> %val ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_v2f16_idxen_no_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], 
%subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_idxen_no_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; 
GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void } @@ -304,28 +304,28 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_v2f16_bothen_no_rtn(<2 x half> %va ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_v2f16_bothen_no_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 
- ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_bothen_no_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; 
GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void } diff --git llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-rtn.ll llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-rtn.ll index c4ef1390a288..89e1a4be4e16 100644 --- llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-rtn.ll +++ llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-rtn.ll @@ -1,191 +1,191 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -enable-new-pm -stop-after=amdgpu-isel < %s | 
FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_offset_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_offset_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_v2f16_offset_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, 
$sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) ret <2 x half> %ret } define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_offen_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_offen_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_v2f16_offen_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret <2 x half> %ret } define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_idxen_rtn(<2 x 
half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_idxen_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_v2f16_idxen_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret <2 x half> %ret } define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_bothen_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_bothen_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = 
BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_v2f16_bothen_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret <2 x half> %ret } define amdgpu_ps <2 x half> 
@buffer_ptr_atomic_fadd_v2f16_offset_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offset_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offset_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; 
GFX90A_GFX942-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) ret <2 x half> %ret } define amdgpu_ps <2 x half> @buffer_ptr_atomic_fadd_v2f16_offen_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: 
buffer_ptr_atomic_fadd_v2f16_offen_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offen_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, 
$vgpr1, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret <2 x half> %ret } define amdgpu_ps <2 x half> @buffer_ptr_atomic_fadd_v2f16_idxen_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) { - ; 
GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_v2f16_idxen_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_idxen_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $sgpr0, 
$sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret <2 x half> %ret } define amdgpu_ps <2 x half> @buffer_ptr_atomic_fadd_v2f16_bothen_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 
%voffset, i32 inreg %soffset) { - ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_v2f16_bothen_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A_GFX940-NEXT: $vgpr0 
= COPY [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_bothen_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32) 
on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret <2 x half> %ret } diff --git llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index 0ea73ad4c501..ba2694fca99f 100644 --- llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx940 < %s | FileCheck -check-prefix=GFX940 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s @@ -28,15 +28,15 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s16 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: @@ -216,15 +216,15 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s16 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: @@ -403,33 +403,33 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f32 v5, v4, s[4:7], 0 offen offset:1024 sc0 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB2_1 -; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[2:3], exec +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_readfirstlane_b32 s4, v0 +; GFX942-NEXT: v_readfirstlane_b32 s5, v1 +; GFX942-NEXT: v_readfirstlane_b32 s6, v2 +; GFX942-NEXT: v_readfirstlane_b32 s7, v3 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; 
GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_atomic_add_f32 v5, v4, s[4:7], 0 offen offset:1024 sc0 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: ; implicit-def: $vgpr4 +; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB2_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_mov_b64 exec, s[2:3] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: @@ -799,15 +799,15 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s16 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1004,15 +1004,15 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s16 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1203,15 +1203,15 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s16 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: ; GFX11: ; 
%bb.0: @@ -1427,15 +1427,15 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s16 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -1651,15 +1651,15 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, 
s16 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: @@ -1900,15 +1900,15 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s16 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, s16 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -2149,15 +2149,15 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s16 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, 
s[0:3], 0 offen offset:2048 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, s16 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -2428,36 +2428,36 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v6 -; GFX940-NEXT: v_mov_b32_e32 v6, v5 -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB10_1 -; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: 
s_mov_b64 exec, s[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v6 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: s_mov_b64 s[2:3], exec +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_readfirstlane_b32 s4, v0 +; GFX942-NEXT: v_readfirstlane_b32 s5, v1 +; GFX942-NEXT: v_readfirstlane_b32 s6, v2 +; GFX942-NEXT: v_readfirstlane_b32 s7, v3 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_atomic_add_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: ; implicit-def: $vgpr4 +; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB10_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_mov_b64 exec, s[2:3] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v7 +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -2920,15 +2920,15 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s16 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, s16 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -3189,15 +3189,15 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s16 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, s16 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -3456,39 +3456,39 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s4, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s6, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX940-NEXT: s_not_b32 s7, s4 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v2, s6, v3 -; GFX940-NEXT: v_add_f16_e32 v2, v2, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, s6, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, s7, v2 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v4 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_addk_i32 s16, 0x200 +; GFX942-NEXT: s_and_b32 s4, s16, -4 +; 
GFX942-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GFX942-NEXT: s_and_b32 s4, s16, 3 +; GFX942-NEXT: s_lshl_b32 s6, s4, 3 +; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX942-NEXT: s_not_b32 s7, s4 +; GFX942-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_e32 v2, s6, v3 +; GFX942-NEXT: v_add_f16_e32 v2, v2, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, s6, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_cbranch_execnz .LBB13_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v4 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -3804,38 +3804,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s4, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s6, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX940-NEXT: s_not_b32 s7, s4 -; GFX940-NEXT: 
s_mov_b64 s[4:5], 0 -; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v2, s6, v3 -; GFX940-NEXT: v_add_f16_e32 v2, v2, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, s6, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, s7, v2 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB14_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_addk_i32 s16, 0x200 +; GFX942-NEXT: s_and_b32 s4, s16, -4 +; GFX942-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GFX942-NEXT: s_and_b32 s4, s16, 3 +; GFX942-NEXT: s_lshl_b32 s6, s4, 3 +; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX942-NEXT: s_not_b32 s7, s4 +; GFX942-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_e32 v2, s6, v3 +; GFX942-NEXT: v_add_f16_e32 v2, v2, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, s6, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_cbranch_execnz .LBB14_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -4179,72 +4179,72 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX940-NEXT: v_and_b32_e32 v10, -4, v4 -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v6, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v11, v6 -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v10, s[4:7], 0 offen -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 -; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: .LBB15_3: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB15_4 Depth 2 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: 
v_lshrrev_b32_e32 v6, v4, v7 -; GFX940-NEXT: v_add_f16_e32 v6, v6, v5 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX940-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[6:7] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 -; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[4:7], 0 offen sc0 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB15_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX940-NEXT: s_mov_b64 exec, s[8:9] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v7, v8 -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB15_3 -; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v4, v8 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_u32_e32 v4, 0x200, v4 +; GFX942-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v6, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v11, v6 
+; GFX942-NEXT: s_mov_b64 s[2:3], exec +; GFX942-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_readfirstlane_b32 s4, v0 +; GFX942-NEXT: v_readfirstlane_b32 s5, v1 +; GFX942-NEXT: v_readfirstlane_b32 s6, v2 +; GFX942-NEXT: v_readfirstlane_b32 s7, v3 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: buffer_load_dword v7, v10, s[4:7], 0 offen +; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB15_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_mov_b64 exec, s[2:3] +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Loop Header: Depth=1 +; GFX942-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX942-NEXT: v_add_f16_e32 v6, v6, v5 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX942-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX942-NEXT: s_mov_b64 s[8:9], exec +; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[6:7] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX942-NEXT: v_readfirstlane_b32 s4, v0 +; GFX942-NEXT: v_readfirstlane_b32 s5, v1 +; GFX942-NEXT: v_readfirstlane_b32 s6, v2 +; GFX942-NEXT: v_readfirstlane_b32 s7, v3 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[4:7], 0 offen sc0 +; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB15_4 +; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX942-NEXT: s_mov_b64 
exec, s[8:9] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v7, v8 +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB15_3 +; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -4798,47 +4798,47 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s4, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s6, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX940-NEXT: s_not_b32 s7, s4 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX940-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v2, v2, v0, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB16_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_addk_i32 s16, 0x200 +; GFX942-NEXT: s_and_b32 s4, s16, -4 +; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen +; GFX942-NEXT: s_and_b32 s4, s16, 3 +; GFX942-NEXT: s_lshl_b32 s6, s4, 3 +; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX942-NEXT: s_not_b32 s7, s4 +; GFX942-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX942-NEXT: s_movk_i32 s8, 0x7fff +; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX942-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX942-NEXT: v_add3_u32 v2, v2, v0, s8 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v0, 
v1, s7, v0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_cbranch_execnz .LBB16_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -5202,46 +5202,46 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s4, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s6, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX940-NEXT: s_not_b32 s7, s4 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX940-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v4, v4, v0, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; 
GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_addk_i32 s16, 0x200 +; GFX942-NEXT: s_and_b32 s4, s16, -4 +; GFX942-NEXT: v_mov_b32_e32 v2, s4 +; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX942-NEXT: s_and_b32 s4, s16, 3 +; GFX942-NEXT: s_lshl_b32 s6, s4, 3 +; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX942-NEXT: s_not_b32 s7, s4 +; GFX942-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX942-NEXT: s_movk_i32 s8, 0x7fff +; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX942-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX942-NEXT: v_add3_u32 v4, v4, v0, s8 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_cbranch_execnz .LBB17_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -5634,80 +5634,80 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX940-NEXT: v_and_b32_e32 v9, -4, v4 -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 -; GFX940-NEXT: v_not_b32_e32 v10, v4 -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen -; GFX940-NEXT: s_xor_b64 exec, exec, 
s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_1 -; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v11, 16, v5 -; GFX940-NEXT: s_movk_i32 s10, 0x7fff -; GFX940-NEXT: .LBB18_3: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: v_add_f32_e32 v4, v4, v11 -; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 -; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX940-NEXT: s_mov_b64 exec, s[8:9] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, 
s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB18_3 -; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_u32_e32 v4, 0x200, v4 +; GFX942-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v8, s0 +; GFX942-NEXT: v_not_b32_e32 v10, v4 +; GFX942-NEXT: s_mov_b64 s[2:3], exec +; GFX942-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_readfirstlane_b32 s4, v0 +; GFX942-NEXT: v_readfirstlane_b32 s5, v1 +; GFX942-NEXT: v_readfirstlane_b32 s6, v2 +; GFX942-NEXT: v_readfirstlane_b32 s7, v3 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen +; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB18_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_mov_b64 exec, s[2:3] +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX942-NEXT: s_movk_i32 s10, 0x7fff +; GFX942-NEXT: .LBB18_3: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Loop Header: Depth=1 +; GFX942-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_mov_b64 s[8:9], exec +; GFX942-NEXT: v_add_f32_e32 v4, v4, 
v11 +; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX942-NEXT: v_add3_u32 v5, v5, v4, s10 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v6, v7, v10, v4 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX942-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 +; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX942-NEXT: v_readfirstlane_b32 s4, v0 +; GFX942-NEXT: v_readfirstlane_b32 s5, v1 +; GFX942-NEXT: v_readfirstlane_b32 s6, v2 +; GFX942-NEXT: v_readfirstlane_b32 s7, v3 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 +; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB18_4 +; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX942-NEXT: s_mov_b64 exec, s[8:9] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB18_3 +; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -6256,15 +6256,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: global_inv 
scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s16 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -6505,15 +6505,15 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s16 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_waitcnt 
vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -6756,33 +6756,33 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], 0 offen offset:1024 sc0 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB21_1 -; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[2:3], exec +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_readfirstlane_b32 s4, v0 +; GFX942-NEXT: 
v_readfirstlane_b32 s5, v1 +; GFX942-NEXT: v_readfirstlane_b32 s6, v2 +; GFX942-NEXT: v_readfirstlane_b32 s7, v3 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], 0 offen offset:1024 sc0 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: ; implicit-def: $vgpr4 +; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB21_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_mov_b64 exec, s[2:3] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -7235,15 +7235,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s16 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: ; GFX11: ; %bb.0: @@ -7501,15 +7501,15 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s16 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: ; GFX11: ; %bb.0: @@ -7764,15 +7764,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s16 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: 
buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -8030,15 +8030,15 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s16 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -8297,50 +8297,50 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s16 -; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s16, 0x400 -; GFX940-NEXT: s_mov_b64 
s[6:7], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX940-NEXT: s_mov_b32 s9, 0x7060302 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] -; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7] -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 -; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB26_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, s16 +; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_add_i32 s4, s16, 0x400 +; GFX942-NEXT: s_mov_b64 s[6:7], 0 +; GFX942-NEXT: 
v_lshlrev_b32_e32 v2, 16, v1 +; GFX942-NEXT: s_movk_i32 s8, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX942-NEXT: s_mov_b32 s9, 0x7060302 +; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX942-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX942-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX942-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX942-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX942-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] +; GFX942-NEXT: v_perm_b32 v6, v1, v0, s9 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[6:7] +; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX942-NEXT: s_cbranch_execnz .LBB26_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -8680,49 +8680,49 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s16, 0x400 -; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX940-NEXT: s_mov_b32 s9, 0x7060302 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB27_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s16 +; 
GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_add_i32 s4, s16, 0x400 +; GFX942-NEXT: s_mov_b64 s[6:7], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX942-NEXT: s_movk_i32 s8, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX942-NEXT: s_mov_b32 s9, 0x7060302 +; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX942-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX942-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX942-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX942-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] +; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX942-NEXT: s_cbranch_execnz .LBB27_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -9075,83 +9075,83 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 
s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 -; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB28_1 -; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v9, 16, v5 -; GFX940-NEXT: s_movk_i32 s10, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX940-NEXT: s_mov_b32 s11, 0x7060302 -; GFX940-NEXT: .LBB28_3: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB28_4 Depth 2 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX940-NEXT: v_add_f32_e32 v4, v4, v9 -; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v10 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s10 -; GFX940-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; 
GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc -; GFX940-NEXT: v_perm_b32 v6, v5, v4, s11 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 -; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB28_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 -; GFX940-NEXT: s_mov_b64 exec, s[8:9] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB28_3 -; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX942-NEXT: s_mov_b64 s[2:3], exec +; GFX942-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_readfirstlane_b32 s4, v0 +; GFX942-NEXT: v_readfirstlane_b32 s5, v1 +; GFX942-NEXT: v_readfirstlane_b32 s6, v2 +; GFX942-NEXT: v_readfirstlane_b32 s7, v3 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], 
v[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 +; GFX942-NEXT: ; implicit-def: $vgpr4 +; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB28_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_mov_b64 exec, s[2:3] +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; GFX942-NEXT: s_movk_i32 s10, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 +; GFX942-NEXT: s_mov_b32 s11, 0x7060302 +; GFX942-NEXT: .LBB28_3: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Loop Header: Depth=1 +; GFX942-NEXT: ; Child Loop BB28_4 Depth 2 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX942-NEXT: v_add_f32_e32 v4, v4, v9 +; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX942-NEXT: v_add3_u32 v5, v5, v4, s10 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: s_mov_b64 s[8:9], exec +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX942-NEXT: v_add_f32_e32 v5, v5, v10 +; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX942-NEXT: v_add3_u32 v6, v6, v5, s10 +; GFX942-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc +; GFX942-NEXT: v_perm_b32 v6, v5, v4, s11 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX942-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 +; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX942-NEXT: v_readfirstlane_b32 s4, v0 +; GFX942-NEXT: v_readfirstlane_b32 s5, v1 +; GFX942-NEXT: v_readfirstlane_b32 s6, v2 +; GFX942-NEXT: v_readfirstlane_b32 s7, v3 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX942-NEXT: s_nop 0 +; 
GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 +; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB28_4 +; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 +; GFX942-NEXT: s_mov_b64 exec, s[8:9] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB28_3 +; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -9713,50 +9713,50 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s16 -; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s16, 0x400 -; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX940-NEXT: s_mov_b32 s9, 0x7060302 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX940-NEXT: 
v_and_b32_e32 v1, 0xffff0000, v7 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] -; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7] -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 -; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB29_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, s16 +; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_add_i32 s4, s16, 0x400 +; GFX942-NEXT: s_mov_b64 s[6:7], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX942-NEXT: s_movk_i32 s8, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX942-NEXT: s_mov_b32 s9, 0x7060302 +; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX942-NEXT: v_add_f32_e32 v0, v0, 
v2 +; GFX942-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX942-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX942-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX942-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] +; GFX942-NEXT: v_perm_b32 v6, v1, v0, s9 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[6:7] +; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX942-NEXT: s_cbranch_execnz .LBB29_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX11: ; %bb.0: @@ -10096,49 +10096,49 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s16, 0x400 -; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX940-NEXT: s_mov_b32 s9, 0x7060302 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) 
-; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB30_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s16 +; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_add_i32 s4, s16, 0x400 +; GFX942-NEXT: s_mov_b64 s[6:7], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX942-NEXT: s_movk_i32 s8, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX942-NEXT: s_mov_b32 s9, 0x7060302 +; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; 
GFX942-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX942-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX942-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX942-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] +; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX942-NEXT: s_cbranch_execnz .LBB30_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX11: ; %bb.0: @@ -10470,50 +10470,50 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s16 -; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s16, 0x400 -; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX940-NEXT: s_mov_b32 s9, 0x7060302 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 
-; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] -; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7] -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 -; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB31_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, s16 +; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_add_i32 s4, s16, 0x400 +; GFX942-NEXT: s_mov_b64 s[6:7], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX942-NEXT: s_movk_i32 s8, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX942-NEXT: s_mov_b32 s9, 0x7060302 +; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: .LBB31_1: ; %atomicrmw.start 
+; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX942-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX942-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX942-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX942-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX942-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] +; GFX942-NEXT: v_perm_b32 v6, v1, v0, s9 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[6:7] +; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX942-NEXT: s_cbranch_execnz .LBB31_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -10853,49 +10853,49 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s16, 0x400 -; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 
v2, 16, v0 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX940-NEXT: s_mov_b32 s9, 0x7060302 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB32_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s16 +; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_add_i32 s4, s16, 0x400 +; GFX942-NEXT: s_mov_b64 s[6:7], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX942-NEXT: s_movk_i32 s8, 0x7fff +; GFX942-NEXT: v_and_b32_e32 
v3, 0xffff0000, v0 +; GFX942-NEXT: s_mov_b32 s9, 0x7060302 +; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX942-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX942-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX942-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX942-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] +; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX942-NEXT: s_cbranch_execnz .LBB32_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -11227,49 +11227,49 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; 
GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s16, 0x400 -; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX940-NEXT: s_mov_b32 s9, 0x7060302 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB33_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s16 +; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 
offen offset:1024 +; GFX942-NEXT: s_add_i32 s4, s16, 0x400 +; GFX942-NEXT: s_mov_b64 s[6:7], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX942-NEXT: s_movk_i32 s8, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX942-NEXT: s_mov_b32 s9, 0x7060302 +; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX942-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX942-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX942-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX942-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] +; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX942-NEXT: s_cbranch_execnz .LBB33_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -11606,15 +11606,15 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX940-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s16 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: diff --git llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index 7f06d169a6b1..38adf60888ec 100644 --- llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx940 < %s | FileCheck -check-prefix=GFX940 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s @@ -28,34 +28,34 @@ define float 
@buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s16 -; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s6, s16, 0x400 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB0_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, s16 +; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_add_i32 s6, s16, 0x400 +; GFX942-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; 
GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_cbranch_execnz .LBB0_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -203,33 +203,33 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s6, s16, 0x400 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX940-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz 
.LBB1_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s16 +; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_add_i32 s6, s16, 0x400 +; GFX942-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX942-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_cbranch_execnz .LBB1_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -395,66 +395,66 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_readfirstlane_b32 
s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 -; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB2_1 -; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_max_f32_e32 v9, v5, v5 -; GFX940-NEXT: .LBB2_3: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB2_4 Depth 2 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v7, v7 -; GFX940-NEXT: v_max_f32_e32 v6, v4, v9 -; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 -; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB2_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 -; GFX940-NEXT: s_mov_b64 exec, s[8:9] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; 
GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB2_3 -; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX942-NEXT: s_mov_b64 s[2:3], exec +; GFX942-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_readfirstlane_b32 s4, v0 +; GFX942-NEXT: v_readfirstlane_b32 s5, v1 +; GFX942-NEXT: v_readfirstlane_b32 s6, v2 +; GFX942-NEXT: v_readfirstlane_b32 s7, v3 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 +; GFX942-NEXT: ; implicit-def: $vgpr4 +; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB2_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_mov_b64 exec, s[2:3] +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_max_f32_e32 v9, v5, v5 +; GFX942-NEXT: .LBB2_3: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Loop Header: Depth=1 +; GFX942-NEXT: ; Child Loop BB2_4 Depth 2 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v4, v7, v7 +; GFX942-NEXT: v_max_f32_e32 v6, v4, v9 +; GFX942-NEXT: s_mov_b64 s[8:9], exec +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 +; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX942-NEXT: v_readfirstlane_b32 s4, v0 +; GFX942-NEXT: v_readfirstlane_b32 s5, v1 +; GFX942-NEXT: v_readfirstlane_b32 s6, v2 +; GFX942-NEXT: 
v_readfirstlane_b32 s7, v3 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 +; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB2_4 +; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 +; GFX942-NEXT: s_mov_b64 exec, s[8:9] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB2_3 +; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -766,34 +766,34 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s16 -; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s6, s16, 0x400 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_max_f32_e32 
v4, v0, v2 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB3_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, s16 +; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_add_i32 s6, s16, 0x400 +; GFX942-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_cbranch_execnz .LBB3_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -1023,34 +1023,34 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX12-NEXT: global_inv 
scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s16 -; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s6, s16, 0x400 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB4_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, s16 +; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_add_i32 s6, s16, 0x400 +; GFX942-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: 
v_max_f32_e32 v0, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_cbranch_execnz .LBB4_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -1226,15 +1226,15 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s16 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, s16 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1417,15 +1417,15 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: 
s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s16 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, s16 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1643,36 +1643,36 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v6 -; GFX940-NEXT: v_mov_b32_e32 v6, v5 -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: 
s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_max_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB7_1 -; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v6 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: s_mov_b64 s[2:3], exec +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_readfirstlane_b32 s4, v0 +; GFX942-NEXT: v_readfirstlane_b32 s5, v1 +; GFX942-NEXT: v_readfirstlane_b32 s6, v2 +; GFX942-NEXT: v_readfirstlane_b32 s7, v3 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_atomic_max_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: ; implicit-def: $vgpr4 +; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB7_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_mov_b64 exec, s[2:3] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v7 +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -2033,15 +2033,15 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s16 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, s16 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -2320,15 +2320,15 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s16 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, s16 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -2529,41 +2529,41 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s4, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s6, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX940-NEXT: s_not_b32 s7, s4 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX940-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX940-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; 
GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB10_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_addk_i32 s16, 0x200 +; GFX942-NEXT: s_and_b32 s4, s16, -4 +; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen +; GFX942-NEXT: s_and_b32 s4, s16, 3 +; GFX942-NEXT: s_lshl_b32 s6, s4, 3 +; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX942-NEXT: s_not_b32 s7, s4 +; GFX942-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-NEXT: v_max_f16_e32 v5, v0, v0 +; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX942-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX942-NEXT: v_max_f16_e32 v0, v0, v5 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_cbranch_execnz .LBB10_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -2892,40 +2892,40 @@ define void 
@buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s4, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s6, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX940-NEXT: s_not_b32 s7, s4 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX940-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX940-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB11_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_addk_i32 s16, 0x200 +; GFX942-NEXT: s_and_b32 s4, s16, -4 +; GFX942-NEXT: v_mov_b32_e32 v2, s4 +; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX942-NEXT: s_and_b32 s4, 
s16, 3 +; GFX942-NEXT: s_lshl_b32 s6, s4, 3 +; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX942-NEXT: s_not_b32 s7, s4 +; GFX942-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX942-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX942-NEXT: v_max_f16_e32 v0, v0, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_cbranch_execnz .LBB11_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -3283,74 +3283,74 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX940-NEXT: v_and_b32_e32 v9, -4, v4 -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 -; GFX940-NEXT: v_not_b32_e32 v10, v4 -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: 
v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB12_1 -; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_max_f16_e32 v11, v5, v5 -; GFX940-NEXT: .LBB12_3: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB12_4 Depth 2 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v8, v7 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v11 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4 -; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 -; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB12_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 -; GFX940-NEXT: s_mov_b64 exec, s[8:9] -; GFX940-NEXT: s_waitcnt 
vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB12_3 -; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_u32_e32 v4, 0x200, v4 +; GFX942-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v8, s0 +; GFX942-NEXT: v_not_b32_e32 v10, v4 +; GFX942-NEXT: s_mov_b64 s[2:3], exec +; GFX942-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_readfirstlane_b32 s4, v0 +; GFX942-NEXT: v_readfirstlane_b32 s5, v1 +; GFX942-NEXT: v_readfirstlane_b32 s6, v2 +; GFX942-NEXT: v_readfirstlane_b32 s7, v3 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen +; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB12_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_mov_b64 exec, s[2:3] +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_max_f16_e32 v11, v5, v5 +; GFX942-NEXT: .LBB12_3: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Loop Header: Depth=1 +; GFX942-NEXT: ; Child Loop BB12_4 Depth 2 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v8, v7 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v11 +; 
GFX942-NEXT: v_lshlrev_b32_e32 v4, v8, v4 +; GFX942-NEXT: v_and_or_b32 v6, v7, v10, v4 +; GFX942-NEXT: s_mov_b64 s[8:9], exec +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 +; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX942-NEXT: v_readfirstlane_b32 s4, v0 +; GFX942-NEXT: v_readfirstlane_b32 s5, v1 +; GFX942-NEXT: v_readfirstlane_b32 s6, v2 +; GFX942-NEXT: v_readfirstlane_b32 s7, v3 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 +; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB12_4 +; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 +; GFX942-NEXT: s_mov_b64 exec, s[8:9] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB12_3 +; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -3915,47 +3915,47 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: 
s_and_b32 s4, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s6, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX940-NEXT: s_not_b32 s7, s4 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_max_f32_e32 v0, v0, v5 -; GFX940-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v2, v2, v0, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_addk_i32 s16, 0x200 +; GFX942-NEXT: s_and_b32 s4, s16, -4 +; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: buffer_load_dword v1, 
v4, s[0:3], 0 offen +; GFX942-NEXT: s_and_b32 s4, s16, 3 +; GFX942-NEXT: s_lshl_b32 s6, s4, 3 +; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX942-NEXT: s_not_b32 s7, s4 +; GFX942-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX942-NEXT: s_movk_i32 s8, 0x7fff +; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: v_max_f32_e32 v0, v0, v5 +; GFX942-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX942-NEXT: v_add3_u32 v2, v2, v0, s8 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_cbranch_execnz .LBB13_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -4321,46 +4321,46 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: 
-; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s4, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s6, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX940-NEXT: s_not_b32 s7, s4 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX940-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v4, v4, v0, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB14_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_addk_i32 s16, 0x200 +; GFX942-NEXT: s_and_b32 s4, s16, -4 +; 
GFX942-NEXT: v_mov_b32_e32 v2, s4 +; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX942-NEXT: s_and_b32 s4, s16, 3 +; GFX942-NEXT: s_lshl_b32 s6, s4, 3 +; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX942-NEXT: s_not_b32 s7, s4 +; GFX942-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX942-NEXT: s_movk_i32 s8, 0x7fff +; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX942-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX942-NEXT: v_add3_u32 v4, v4, v0, s8 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_cbranch_execnz .LBB14_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -4755,80 +4755,80 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX940-NEXT: v_and_b32_e32 v9, -4, v4 -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 -; GFX940-NEXT: v_not_b32_e32 v10, v4 -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 -; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v11, 16, v5 -; GFX940-NEXT: s_movk_i32 s10, 0x7fff -; GFX940-NEXT: .LBB15_3: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB15_4 Depth 2 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: v_max_f32_e32 v4, v4, v11 -; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 -; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB15_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX940-NEXT: s_mov_b64 exec, s[8:9] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB15_3 -; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_u32_e32 v4, 0x200, v4 +; GFX942-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v8, s0 +; GFX942-NEXT: v_not_b32_e32 v10, v4 +; GFX942-NEXT: s_mov_b64 s[2:3], exec +; GFX942-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: 
v_readfirstlane_b32 s4, v0 +; GFX942-NEXT: v_readfirstlane_b32 s5, v1 +; GFX942-NEXT: v_readfirstlane_b32 s6, v2 +; GFX942-NEXT: v_readfirstlane_b32 s7, v3 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen +; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB15_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_mov_b64 exec, s[2:3] +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX942-NEXT: s_movk_i32 s10, 0x7fff +; GFX942-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Loop Header: Depth=1 +; GFX942-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_mov_b64 s[8:9], exec +; GFX942-NEXT: v_max_f32_e32 v4, v4, v11 +; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX942-NEXT: v_add3_u32 v5, v5, v4, s10 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v6, v7, v10, v4 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX942-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX942-NEXT: v_readfirstlane_b32 s4, v0 +; GFX942-NEXT: v_readfirstlane_b32 s5, v1 +; GFX942-NEXT: v_readfirstlane_b32 s6, v2 +; GFX942-NEXT: v_readfirstlane_b32 s7, v3 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX942-NEXT: s_and_b64 
s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 +; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB15_4 +; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX942-NEXT: s_mov_b64 exec, s[8:9] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB15_3 +; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -5402,35 +5402,35 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s16 -; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s6, s16, 0x400 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_pk_max_f16 v4, v0, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, 
s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB16_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, s16 +; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_add_i32 s6, s16, 0x400 +; GFX942-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: v_pk_max_f16 v4, v0, v2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] +; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_cbranch_execnz .LBB16_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -5720,34 +5720,34 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s6, s16, 0x400 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s16 +; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_add_i32 s6, s16, 0x400 +; GFX942-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-NEXT: v_pk_max_f16 v2, v0, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX942-NEXT: 
buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_cbranch_execnz .LBB17_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -6071,66 +6071,66 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 -; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_1 -; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_pk_max_f16 v9, v5, v5 -; GFX940-NEXT: .LBB18_3: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; 
GFX940-NEXT: v_pk_max_f16 v4, v7, v7 -; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: v_pk_max_f16 v6, v4, v9 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 -; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX940-NEXT: s_mov_b64 exec, s[8:9] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB18_3 -; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX942-NEXT: s_mov_b64 s[2:3], exec +; GFX942-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_readfirstlane_b32 s4, v0 +; GFX942-NEXT: v_readfirstlane_b32 s5, v1 +; GFX942-NEXT: v_readfirstlane_b32 s6, v2 +; GFX942-NEXT: v_readfirstlane_b32 s7, v3 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; 
GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 +; GFX942-NEXT: ; implicit-def: $vgpr4 +; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB18_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_mov_b64 exec, s[2:3] +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_pk_max_f16 v9, v5, v5 +; GFX942-NEXT: .LBB18_3: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Loop Header: Depth=1 +; GFX942-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_pk_max_f16 v4, v7, v7 +; GFX942-NEXT: s_mov_b64 s[8:9], exec +; GFX942-NEXT: v_pk_max_f16 v6, v4, v9 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX942-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 +; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX942-NEXT: v_readfirstlane_b32 s4, v0 +; GFX942-NEXT: v_readfirstlane_b32 s5, v1 +; GFX942-NEXT: v_readfirstlane_b32 s6, v2 +; GFX942-NEXT: v_readfirstlane_b32 s7, v3 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 +; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB18_4 +; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX942-NEXT: s_mov_b64 exec, s[8:9] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB18_3 +; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end +; 
GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -6669,50 +6669,50 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s16 -; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s16, 0x400 -; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX940-NEXT: s_mov_b32 s9, 0x7060302 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX940-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] -; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7] -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], 
v4, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 -; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB19_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, s16 +; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_add_i32 s4, s16, 0x400 +; GFX942-NEXT: s_mov_b64 s[6:7], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX942-NEXT: s_movk_i32 s8, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX942-NEXT: s_mov_b32 s9, 0x7060302 +; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX942-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX942-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX942-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX942-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX942-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] +; GFX942-NEXT: v_perm_b32 v6, v1, v0, s9 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[6:7] +; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: 
s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX942-NEXT: s_cbranch_execnz .LBB19_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -7089,49 +7089,49 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s16, 0x400 -; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX940-NEXT: s_mov_b32 s9, 0x7060302 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX940-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_max_f32_e32 v5, v5, v3 -; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX940-NEXT: 
v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB20_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s16 +; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_add_i32 s4, s16, 0x400 +; GFX942-NEXT: s_mov_b64 s[6:7], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX942-NEXT: s_movk_i32 s8, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX942-NEXT: s_mov_b32 s9, 0x7060302 +; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX942-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX942-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX942-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX942-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX942-NEXT: v_perm_b32 v0, v5, 
v0, s9 +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] +; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX942-NEXT: s_cbranch_execnz .LBB20_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -7542,83 +7542,83 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 -; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB21_1 -; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v9, 16, v5 -; GFX940-NEXT: s_movk_i32 s10, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; 
GFX940-NEXT: s_mov_b32 s11, 0x7060302 -; GFX940-NEXT: .LBB21_3: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v9 -; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_max_f32_e32 v5, v5, v10 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s10 -; GFX940-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc -; GFX940-NEXT: v_perm_b32 v6, v5, v4, s11 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 -; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB21_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX940-NEXT: s_mov_b64 exec, s[8:9] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: 
buffer_inv sc1 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB21_3 -; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX942-NEXT: s_mov_b64 s[2:3], exec +; GFX942-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_readfirstlane_b32 s4, v0 +; GFX942-NEXT: v_readfirstlane_b32 s5, v1 +; GFX942-NEXT: v_readfirstlane_b32 s6, v2 +; GFX942-NEXT: v_readfirstlane_b32 s7, v3 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 +; GFX942-NEXT: ; implicit-def: $vgpr4 +; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB21_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_mov_b64 exec, s[2:3] +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; GFX942-NEXT: s_movk_i32 s10, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 +; GFX942-NEXT: s_mov_b32 s11, 0x7060302 +; GFX942-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Loop Header: Depth=1 +; GFX942-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX942-NEXT: v_max_f32_e32 v4, v4, v9 +; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX942-NEXT: v_add3_u32 v5, v5, v4, s10 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: s_mov_b64 s[8:9], exec +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: 
v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX942-NEXT: v_max_f32_e32 v5, v5, v10 +; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX942-NEXT: v_add3_u32 v6, v6, v5, s10 +; GFX942-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc +; GFX942-NEXT: v_perm_b32 v6, v5, v4, s11 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX942-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX942-NEXT: v_readfirstlane_b32 s4, v0 +; GFX942-NEXT: v_readfirstlane_b32 s5, v1 +; GFX942-NEXT: v_readfirstlane_b32 s6, v2 +; GFX942-NEXT: v_readfirstlane_b32 s7, v3 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 +; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB21_4 +; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX942-NEXT: s_mov_b64 exec, s[8:9] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB21_3 +; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -8185,34 +8185,34 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s16 -; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s6, s16, 0x400 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB22_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, s16 +; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_add_i32 s6, s16, 0x400 +; GFX942-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v0, v2 
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_cbranch_execnz .LBB22_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: diff --git llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index a6eb81fcbf51..2b8cea9068d8 100644 --- llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx940 < %s | FileCheck -check-prefix=GFX940 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s @@ -28,34 +28,34 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 
v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s16 -; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s6, s16, 0x400 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB0_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, s16 +; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_add_i32 s6, s16, 0x400 +; GFX942-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: 
v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_cbranch_execnz .LBB0_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -203,33 +203,33 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s6, s16, 0x400 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX940-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB1_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s16 
+; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_add_i32 s6, s16, 0x400 +; GFX942-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX942-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_cbranch_execnz .LBB1_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -395,66 +395,66 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: 
s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 -; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB2_1 -; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_max_f32_e32 v9, v5, v5 -; GFX940-NEXT: .LBB2_3: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB2_4 Depth 2 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v7, v7 -; GFX940-NEXT: v_min_f32_e32 v6, v4, v9 -; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 -; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB2_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 -; GFX940-NEXT: s_mov_b64 exec, s[8:9] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB2_3 -; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX942-NEXT: s_mov_b64 s[2:3], exec +; GFX942-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_readfirstlane_b32 s4, v0 +; GFX942-NEXT: v_readfirstlane_b32 s5, v1 +; GFX942-NEXT: v_readfirstlane_b32 s6, v2 +; GFX942-NEXT: v_readfirstlane_b32 s7, v3 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 +; GFX942-NEXT: ; implicit-def: $vgpr4 +; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB2_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_mov_b64 exec, s[2:3] +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_max_f32_e32 v9, v5, v5 +; GFX942-NEXT: .LBB2_3: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Loop Header: Depth=1 +; GFX942-NEXT: ; Child Loop BB2_4 Depth 2 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v4, v7, v7 +; GFX942-NEXT: v_min_f32_e32 v6, v4, v9 +; GFX942-NEXT: s_mov_b64 s[8:9], exec +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 +; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX942-NEXT: v_readfirstlane_b32 s4, v0 +; GFX942-NEXT: v_readfirstlane_b32 s5, v1 +; GFX942-NEXT: v_readfirstlane_b32 s6, v2 +; GFX942-NEXT: v_readfirstlane_b32 s7, v3 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_atomic_cmpswap 
v[4:5], v8, s[4:7], 0 offen sc0 +; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB2_4 +; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 +; GFX942-NEXT: s_mov_b64 exec, s[8:9] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB2_3 +; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -766,34 +766,34 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s16 -; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s6, s16, 0x400 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: 
s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB3_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, s16 +; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_add_i32 s6, s16, 0x400 +; GFX942-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_cbranch_execnz .LBB3_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -1023,34 +1023,34 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s16 -; 
GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s6, s16, 0x400 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB4_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, s16 +; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_add_i32 s6, s16, 0x400 +; GFX942-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; 
GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_cbranch_execnz .LBB4_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -1226,15 +1226,15 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s16 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, s16 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1417,15 +1417,15 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s16 -; GFX940-NEXT: buffer_wbl2 sc1 -; 
GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, s16 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1643,36 +1643,36 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v6 -; GFX940-NEXT: v_mov_b32_e32 v6, v5 -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_min_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB7_1 
-; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v6 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: s_mov_b64 s[2:3], exec +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_readfirstlane_b32 s4, v0 +; GFX942-NEXT: v_readfirstlane_b32 s5, v1 +; GFX942-NEXT: v_readfirstlane_b32 s6, v2 +; GFX942-NEXT: v_readfirstlane_b32 s7, v3 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_atomic_min_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: ; implicit-def: $vgpr4 +; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB7_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_mov_b64 exec, s[2:3] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v7 +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -2033,15 +2033,15 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: 
-; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s16 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, s16 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -2320,15 +2320,15 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s16 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, s16 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -2529,41 +2529,41 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s4, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s6, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX940-NEXT: s_not_b32 s7, s4 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX940-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX940-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB10_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX942-NEXT: s_addk_i32 s16, 0x200 +; GFX942-NEXT: s_and_b32 s4, s16, -4 +; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen +; GFX942-NEXT: s_and_b32 s4, s16, 3 +; GFX942-NEXT: s_lshl_b32 s6, s4, 3 +; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX942-NEXT: s_not_b32 s7, s4 +; GFX942-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-NEXT: v_max_f16_e32 v5, v0, v0 +; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX942-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX942-NEXT: v_min_f16_e32 v0, v0, v5 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_cbranch_execnz .LBB10_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -2892,40 +2892,40 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s4, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen -; 
GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s6, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX940-NEXT: s_not_b32 s7, s4 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX940-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX940-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB11_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_addk_i32 s16, 0x200 +; GFX942-NEXT: s_and_b32 s4, s16, -4 +; GFX942-NEXT: v_mov_b32_e32 v2, s4 +; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX942-NEXT: s_and_b32 s4, s16, 3 +; GFX942-NEXT: s_lshl_b32 s6, s4, 3 +; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX942-NEXT: s_not_b32 s7, s4 +; GFX942-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX942-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX942-NEXT: v_min_f16_e32 v0, v0, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, s6, v0 
+; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_cbranch_execnz .LBB11_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -3283,74 +3283,74 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX940-NEXT: v_and_b32_e32 v9, -4, v4 -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 -; GFX940-NEXT: v_not_b32_e32 v10, v4 -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: 
s_cbranch_execnz .LBB12_1 -; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_max_f16_e32 v11, v5, v5 -; GFX940-NEXT: .LBB12_3: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB12_4 Depth 2 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v8, v7 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_min_f16_e32 v4, v4, v11 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4 -; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 -; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB12_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 -; GFX940-NEXT: s_mov_b64 exec, s[8:9] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB12_3 -; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_u32_e32 v4, 0x200, v4 +; GFX942-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v8, s0 +; GFX942-NEXT: v_not_b32_e32 v10, v4 +; GFX942-NEXT: s_mov_b64 s[2:3], exec +; GFX942-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_readfirstlane_b32 s4, v0 +; GFX942-NEXT: v_readfirstlane_b32 s5, v1 +; GFX942-NEXT: v_readfirstlane_b32 s6, v2 +; GFX942-NEXT: v_readfirstlane_b32 s7, v3 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen +; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB12_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_mov_b64 exec, s[2:3] +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_max_f16_e32 v11, v5, v5 +; GFX942-NEXT: .LBB12_3: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Loop Header: Depth=1 +; GFX942-NEXT: ; Child Loop BB12_4 Depth 2 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v8, v7 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX942-NEXT: v_min_f16_e32 v4, v4, v11 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v8, v4 +; GFX942-NEXT: v_and_or_b32 v6, v7, v10, v4 +; GFX942-NEXT: s_mov_b64 s[8:9], exec +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 +; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX942-NEXT: v_readfirstlane_b32 s4, v0 +; GFX942-NEXT: v_readfirstlane_b32 s5, v1 +; GFX942-NEXT: v_readfirstlane_b32 s6, v2 +; 
GFX942-NEXT: v_readfirstlane_b32 s7, v3 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 +; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB12_4 +; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 +; GFX942-NEXT: s_mov_b64 exec, s[8:9] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB12_3 +; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -3915,47 +3915,47 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s4, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s6, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX940-NEXT: s_not_b32 s7, s4 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX940-NEXT: ; 
=>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_min_f32_e32 v0, v0, v5 -; GFX940-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v2, v2, v0, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_addk_i32 s16, 0x200 +; GFX942-NEXT: s_and_b32 s4, s16, -4 +; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen +; GFX942-NEXT: s_and_b32 s4, s16, 3 +; GFX942-NEXT: s_lshl_b32 s6, s4, 3 +; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX942-NEXT: s_not_b32 s7, s4 +; GFX942-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX942-NEXT: s_movk_i32 s8, 0x7fff +; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: 
v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: v_min_f32_e32 v0, v0, v5 +; GFX942-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX942-NEXT: v_add3_u32 v2, v2, v0, s8 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_cbranch_execnz .LBB13_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -4321,46 +4321,46 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s4, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s6, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX940-NEXT: s_not_b32 s7, s4 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: 
v_lshlrev_b32_e32 v3, 16, v0 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX940-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v4, v4, v0, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB14_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_addk_i32 s16, 0x200 +; GFX942-NEXT: s_and_b32 s4, s16, -4 +; GFX942-NEXT: v_mov_b32_e32 v2, s4 +; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX942-NEXT: s_and_b32 s4, s16, 3 +; GFX942-NEXT: s_lshl_b32 s6, s4, 3 +; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX942-NEXT: s_not_b32 s7, s4 +; GFX942-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX942-NEXT: s_movk_i32 s8, 0x7fff +; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner 
Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX942-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX942-NEXT: v_add3_u32 v4, v4, v0, s8 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_cbranch_execnz .LBB14_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -4755,80 +4755,80 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX940-NEXT: v_and_b32_e32 v9, -4, v4 -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 -; GFX940-NEXT: v_not_b32_e32 v10, v4 -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: 
.LBB15_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 -; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v11, 16, v5 -; GFX940-NEXT: s_movk_i32 s10, 0x7fff -; GFX940-NEXT: .LBB15_3: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB15_4 Depth 2 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: v_min_f32_e32 v4, v4, v11 -; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 -; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: 
v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB15_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX940-NEXT: s_mov_b64 exec, s[8:9] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB15_3 -; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_u32_e32 v4, 0x200, v4 +; GFX942-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v8, s0 +; GFX942-NEXT: v_not_b32_e32 v10, v4 +; GFX942-NEXT: s_mov_b64 s[2:3], exec +; GFX942-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_readfirstlane_b32 s4, v0 +; GFX942-NEXT: v_readfirstlane_b32 s5, v1 +; GFX942-NEXT: v_readfirstlane_b32 s6, v2 +; GFX942-NEXT: v_readfirstlane_b32 s7, v3 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen +; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB15_1 +; 
GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_mov_b64 exec, s[2:3] +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX942-NEXT: s_movk_i32 s10, 0x7fff +; GFX942-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Loop Header: Depth=1 +; GFX942-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_mov_b64 s[8:9], exec +; GFX942-NEXT: v_min_f32_e32 v4, v4, v11 +; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX942-NEXT: v_add3_u32 v5, v5, v4, s10 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v6, v7, v10, v4 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX942-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX942-NEXT: v_readfirstlane_b32 s4, v0 +; GFX942-NEXT: v_readfirstlane_b32 s5, v1 +; GFX942-NEXT: v_readfirstlane_b32 s6, v2 +; GFX942-NEXT: v_readfirstlane_b32 s7, v3 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 +; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB15_4 +; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX942-NEXT: s_mov_b64 exec, s[8:9] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; 
GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB15_3 +; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -5402,35 +5402,35 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s16 -; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s6, s16, 0x400 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_pk_min_f16 v4, v0, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB16_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, s16 +; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_add_i32 s6, s16, 0x400 +; GFX942-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: v_pk_min_f16 v4, v0, v2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] +; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_cbranch_execnz .LBB16_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -5720,34 +5720,34 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s6, s16, 0x400 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt 
vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s16 +; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_add_i32 s6, s16, 0x400 +; GFX942-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-NEXT: v_pk_max_f16 v2, v0, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_cbranch_execnz .LBB17_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; 
GFX11: ; %bb.0: @@ -6071,66 +6071,66 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 -; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_1 -; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_pk_max_f16 v9, v5, v5 -; GFX940-NEXT: .LBB18_3: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v7, v7 -; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: v_pk_min_f16 v6, v4, v9 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 -; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; 
GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX940-NEXT: s_mov_b64 exec, s[8:9] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB18_3 -; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX942-NEXT: s_mov_b64 s[2:3], exec +; GFX942-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_readfirstlane_b32 s4, v0 +; GFX942-NEXT: v_readfirstlane_b32 s5, v1 +; GFX942-NEXT: v_readfirstlane_b32 s6, v2 +; GFX942-NEXT: v_readfirstlane_b32 s7, v3 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 +; GFX942-NEXT: ; implicit-def: $vgpr4 +; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB18_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_mov_b64 exec, s[2:3] +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_pk_max_f16 v9, v5, v5 +; GFX942-NEXT: .LBB18_3: ; %atomicrmw.start +; 
GFX942-NEXT: ; =>This Loop Header: Depth=1 +; GFX942-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_pk_max_f16 v4, v7, v7 +; GFX942-NEXT: s_mov_b64 s[8:9], exec +; GFX942-NEXT: v_pk_min_f16 v6, v4, v9 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX942-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 +; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX942-NEXT: v_readfirstlane_b32 s4, v0 +; GFX942-NEXT: v_readfirstlane_b32 s5, v1 +; GFX942-NEXT: v_readfirstlane_b32 s6, v2 +; GFX942-NEXT: v_readfirstlane_b32 s7, v3 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 +; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB18_4 +; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX942-NEXT: s_mov_b64 exec, s[8:9] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB18_3 +; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -6669,50 +6669,50 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s16 -; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s16, 0x400 -; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX940-NEXT: s_mov_b32 s9, 0x7060302 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX940-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] -; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7] -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 -; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB19_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, s16 +; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_add_i32 s4, s16, 0x400 +; GFX942-NEXT: s_mov_b64 s[6:7], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX942-NEXT: s_movk_i32 s8, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX942-NEXT: s_mov_b32 s9, 0x7060302 +; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX942-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX942-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX942-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX942-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX942-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] +; GFX942-NEXT: v_perm_b32 v6, v1, v0, s9 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[6:7] +; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX942-NEXT: s_cbranch_execnz .LBB19_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -7089,49 +7089,49 @@ define void 
@buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s16, 0x400 -; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX940-NEXT: s_mov_b32 s9, 0x7060302 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX940-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_min_f32_e32 v5, v5, v3 -; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB20_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX940-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s16 +; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_add_i32 s4, s16, 0x400 +; GFX942-NEXT: s_mov_b64 s[6:7], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX942-NEXT: s_movk_i32 s8, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX942-NEXT: s_mov_b32 s9, 0x7060302 +; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX942-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX942-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX942-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX942-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] +; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX942-NEXT: s_cbranch_execnz .LBB20_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX942-NEXT: s_setpc_b64 
s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -7542,83 +7542,83 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 -; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB21_1 -; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v9, 16, v5 -; GFX940-NEXT: s_movk_i32 s10, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX940-NEXT: s_mov_b32 s11, 0x7060302 -; GFX940-NEXT: .LBB21_3: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v9 -; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: 
buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_min_f32_e32 v5, v5, v10 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s10 -; GFX940-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc -; GFX940-NEXT: v_perm_b32 v6, v5, v4, s11 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 -; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB21_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX940-NEXT: s_mov_b64 exec, s[8:9] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB21_3 -; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX942-NEXT: s_mov_b64 s[2:3], 
exec +; GFX942-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_readfirstlane_b32 s4, v0 +; GFX942-NEXT: v_readfirstlane_b32 s5, v1 +; GFX942-NEXT: v_readfirstlane_b32 s6, v2 +; GFX942-NEXT: v_readfirstlane_b32 s7, v3 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 +; GFX942-NEXT: ; implicit-def: $vgpr4 +; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB21_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_mov_b64 exec, s[2:3] +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; GFX942-NEXT: s_movk_i32 s10, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 +; GFX942-NEXT: s_mov_b32 s11, 0x7060302 +; GFX942-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Loop Header: Depth=1 +; GFX942-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX942-NEXT: v_min_f32_e32 v4, v4, v9 +; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX942-NEXT: v_add3_u32 v5, v5, v4, s10 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: s_mov_b64 s[8:9], exec +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX942-NEXT: v_min_f32_e32 v5, v5, v10 +; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX942-NEXT: v_add3_u32 v6, v6, v5, s10 +; GFX942-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc +; GFX942-NEXT: v_perm_b32 v6, v5, v4, s11 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX942-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX942-NEXT: ; => This 
Inner Loop Header: Depth=2 +; GFX942-NEXT: v_readfirstlane_b32 s4, v0 +; GFX942-NEXT: v_readfirstlane_b32 s5, v1 +; GFX942-NEXT: v_readfirstlane_b32 s6, v2 +; GFX942-NEXT: v_readfirstlane_b32 s7, v3 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 +; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB21_4 +; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX942-NEXT: s_mov_b64 exec, s[8:9] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB21_3 +; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -8185,34 +8185,34 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s16 -; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s6, s16, 0x400 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start -; 
GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB22_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, s16 +; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_add_i32 s6, s16, 0x400 +; GFX942-NEXT: s_mov_b64 s[4:5], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_cbranch_execnz .LBB22_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: diff --git llvm/test/CodeGen/AMDGPU/build_vector.ll llvm/test/CodeGen/AMDGPU/build_vector.ll index 4ab940288e8c..84bd682f623e 100644 --- llvm/test/CodeGen/AMDGPU/build_vector.ll +++ llvm/test/CodeGen/AMDGPU/build_vector.ll @@ -3,7 +3,7 @@ ; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s --check-prefixes=GFX8,GCN ; RUN: llc < %s -mtriple=amdgcn-amd-amdpal -mcpu=gfx1030 | FileCheck %s --check-prefixes=GFX10,GCN ; RUN: llc < %s -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-vopd=0 | FileCheck %s --check-prefixes=GFX11,GCN -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 | FileCheck %s --check-prefixes=GFX940,GCN +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx942 | FileCheck %s --check-prefixes=GFX942,GCN define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) { ; GFX6-LABEL: build_vector2: @@ -48,15 +48,15 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) { ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; -; GFX940-LABEL: build_vector2: -; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, 5 -; GFX940-NEXT: v_mov_b32_e32 v1, 6 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: build_vector2: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, 5 +; GFX942-NEXT: v_mov_b32_e32 v1, 6 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX942-NEXT: s_endpgm entry: store <2 x i32> <i32 5, i32 6>, ptr addrspace(1) %out ret void @@ -113,17 +113,17 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) { ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: 
s_endpgm ; -; GFX940-LABEL: build_vector4: -; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, 5 -; GFX940-NEXT: v_mov_b32_e32 v1, 6 -; GFX940-NEXT: v_mov_b32_e32 v2, 7 -; GFX940-NEXT: v_mov_b32_e32 v3, 8 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: build_vector4: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, 5 +; GFX942-NEXT: v_mov_b32_e32 v1, 6 +; GFX942-NEXT: v_mov_b32_e32 v2, 7 +; GFX942-NEXT: v_mov_b32_e32 v3, 8 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX942-NEXT: s_endpgm entry: store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, ptr addrspace(1) %out ret void @@ -168,14 +168,14 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) { ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm ; -; GFX940-LABEL: build_vector_v2i16: -; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NEXT: v_mov_b32_e32 v1, 0x60005 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: build_vector_v2i16: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0x60005 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX942-NEXT: s_endpgm entry: store <2 x i16> <i16 5, i16 6>, ptr addrspace(1) %out ret void @@ -232,17 +232,17 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm ; -; GFX940-LABEL: 
build_vector_v2i16_trunc: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dword s2, s[4:5], 0x2c -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_lshr_b32 s2, s2, 16 -; GFX940-NEXT: s_pack_ll_b32_b16 s2, s2, 5 -; GFX940-NEXT: v_mov_b32_e32 v1, s2 -; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: build_vector_v2i16_trunc: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_lshr_b32 s2, s2, 16 +; GFX942-NEXT: s_pack_ll_b32_b16 s2, s2, 5 +; GFX942-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX942-NEXT: s_endpgm %srl = lshr i32 %a, 16 %trunc = trunc i32 %srl to i16 %ins.0 = insertelement <2 x i16> undef, i16 %trunc, i32 0 @@ -304,17 +304,17 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; -; GFX940-LABEL: build_v2i32_from_v4i16_shuffle: -; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_lshl_b32 s3, s3, 16 -; GFX940-NEXT: s_lshl_b32 s2, s2, 16 -; GFX940-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: build_v2i32_from_v4i16_shuffle: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_lshl_b32 s3, s3, 16 +; GFX942-NEXT: s_lshl_b32 s2, s2, 16 +; GFX942-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NEXT: v_mov_b32_e32 v1, s3 +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 
sc1 +; GFX942-NEXT: s_endpgm entry: %shuf = shufflevector <4 x i16> %in, <4 x i16> zeroinitializer, <2 x i32> <i32 0, i32 2> %zextended = zext <2 x i16> %shuf to <2 x i32> diff --git llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir index 7c21b3e08580..d5dfb5dd0848 100644 --- llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir +++ llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass postrapseudos -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX908 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass postrapseudos -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX90A %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx940 -run-pass postrapseudos -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX940 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass postrapseudos -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX942 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass postrapseudos -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX10 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass postrapseudos -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX10 %s @@ -22,10 +22,10 @@ body: | ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec ; - ; GFX940-LABEL: name: copy_v64_to_v64 - ; GFX940: liveins: $vgpr2_vgpr3 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $vgpr2_vgpr3, implicit $exec, implicit $exec + ; GFX942-LABEL: name: copy_v64_to_v64 + ; GFX942: liveins: $vgpr2_vgpr3 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $vgpr2_vgpr3, implicit $exec, implicit $exec ; ; GFX10-LABEL: name: copy_v64_to_v64 ; GFX10: liveins: $vgpr2_vgpr3 @@ -52,10 +52,10 @@ body: | ; 
GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $sgpr2_sgpr3, 12, $sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $sgpr2_sgpr3, implicit $exec ; - ; GFX940-LABEL: name: copy_s64_to_v64 - ; GFX940: liveins: $sgpr2_sgpr3 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $sgpr2_sgpr3, implicit $exec, implicit $exec + ; GFX942-LABEL: name: copy_s64_to_v64 + ; GFX942: liveins: $sgpr2_sgpr3 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $sgpr2_sgpr3, implicit $exec, implicit $exec ; ; GFX10-LABEL: name: copy_s64_to_v64 ; GFX10: liveins: $sgpr2_sgpr3 @@ -83,11 +83,11 @@ body: | ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3 ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec ; - ; GFX940-LABEL: name: copy_a64_to_v64 - ; GFX940: liveins: $agpr2_agpr3 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3 - ; GFX940-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec + ; GFX942-LABEL: name: copy_a64_to_v64 + ; GFX942: liveins: $agpr2_agpr3 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3 + ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec ; ; GFX10-LABEL: name: copy_a64_to_v64 ; GFX10: liveins: $agpr2_agpr3 @@ -117,11 +117,11 @@ body: | ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $vgpr4_vgpr5, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit 
$vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec ; - ; GFX940-LABEL: name: copy_v128_to_v128_fwd - ; GFX940: liveins: $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $vgpr2_vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX940-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $vgpr4_vgpr5, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec + ; GFX942-LABEL: name: copy_v128_to_v128_fwd + ; GFX942: liveins: $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $vgpr2_vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $vgpr4_vgpr5, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec ; ; GFX10-LABEL: name: copy_v128_to_v128_fwd ; GFX10: liveins: $vgpr2_vgpr3_vgpr4_vgpr5 @@ -153,11 +153,11 @@ body: | ; GFX90A-NEXT: $vgpr4_vgpr5 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $vgpr0_vgpr1, 12, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec ; - ; GFX940-LABEL: name: copy_v128_to_v128_back - ; GFX940: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $vgpr2_vgpr3, implicit $exec, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX940-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $vgpr0_vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + ; GFX942-LABEL: name: copy_v128_to_v128_back + ; GFX942: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $vgpr2_vgpr3, implicit $exec, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 
$vgpr0_vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec ; ; GFX10-LABEL: name: copy_v128_to_v128_back ; GFX10: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 @@ -189,12 +189,12 @@ body: | ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec ; - ; GFX940-LABEL: name: copy_v96_to_v96 - ; GFX940: liveins: $vgpr4_vgpr5_vgpr6 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6 - ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6 - ; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec + ; GFX942-LABEL: name: copy_v96_to_v96 + ; GFX942: liveins: $vgpr4_vgpr5_vgpr6 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6 + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6 + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec ; ; GFX10-LABEL: name: copy_v96_to_v96 ; GFX10: liveins: $vgpr4_vgpr5_vgpr6 @@ -222,10 +222,10 @@ body: | ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec ; - ; GFX940-LABEL: name: copy_v64_to_v64_undef_sub0 - ; GFX940: liveins: $vgpr3 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $vgpr2_vgpr3, implicit $exec, implicit $exec + ; GFX942-LABEL: name: copy_v64_to_v64_undef_sub0 + ; GFX942: liveins: $vgpr3 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $vgpr2_vgpr3, implicit $exec, implicit $exec ; ; GFX10-LABEL: name: 
copy_v64_to_v64_undef_sub0 ; GFX10: liveins: $vgpr3 @@ -252,10 +252,10 @@ body: | ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec ; - ; GFX940-LABEL: name: copy_v64_to_v64_undef_sub1 - ; GFX940: liveins: $vgpr2 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $vgpr2_vgpr3, implicit $exec, implicit $exec + ; GFX942-LABEL: name: copy_v64_to_v64_undef_sub1 + ; GFX942: liveins: $vgpr2 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $vgpr2_vgpr3, implicit $exec, implicit $exec ; ; GFX10-LABEL: name: copy_v64_to_v64_undef_sub1 ; GFX10: liveins: $vgpr2 @@ -285,11 +285,11 @@ body: | ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $sgpr4_sgpr5, 12, $sgpr4_sgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $sgpr6_sgpr7, 12, $sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7 ; - ; GFX940-LABEL: name: copy_s128_to_v128_killed - ; GFX940: liveins: $sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr4_sgpr5, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX940-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $sgpr6_sgpr7, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX942-LABEL: name: copy_s128_to_v128_killed + ; GFX942: liveins: $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr4_sgpr5, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $sgpr6_sgpr7, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7 ; ; GFX10-LABEL: name: copy_s128_to_v128_killed ; GFX10: liveins: $sgpr4_sgpr5_sgpr6_sgpr7 @@ -319,11 +319,11 @@ body: | ; 
GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $vgpr2_vgpr3 ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3, implicit $exec ; - ; GFX940-LABEL: name: copy_v64_to_v64_unaligned - ; GFX940: liveins: $vgpr2_vgpr3 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $vgpr2_vgpr3 - ; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3, implicit $exec + ; GFX942-LABEL: name: copy_v64_to_v64_unaligned + ; GFX942: liveins: $vgpr2_vgpr3 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $vgpr2_vgpr3 + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3, implicit $exec ; ; GFX10-LABEL: name: copy_v64_to_v64_unaligned ; GFX10: liveins: $vgpr2_vgpr3 @@ -351,11 +351,11 @@ body: | ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr3_vgpr4 ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit killed $vgpr3_vgpr4, implicit $exec ; - ; GFX940-LABEL: name: copy_v64_unaligned_to_v64 - ; GFX940: liveins: $vgpr3_vgpr4 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr3_vgpr4 - ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit killed $vgpr3_vgpr4, implicit $exec + ; GFX942-LABEL: name: copy_v64_unaligned_to_v64 + ; GFX942: liveins: $vgpr3_vgpr4 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr3_vgpr4 + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit killed $vgpr3_vgpr4, implicit $exec ; ; GFX10-LABEL: name: copy_v64_unaligned_to_v64 ; GFX10: liveins: $vgpr3_vgpr4 @@ -387,13 +387,13 @@ body: | ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 
$vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec ; - ; GFX940-LABEL: name: copy_v128_to_v128_unaligned - ; GFX940: liveins: $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX940-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX940-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec + ; GFX942-LABEL: name: copy_v128_to_v128_unaligned + ; GFX942: liveins: $vgpr8_vgpr9_vgpr10_vgpr11 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr8_vgpr9_vgpr10_vgpr11 + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 + ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 + ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec ; ; GFX10-LABEL: name: copy_v128_to_v128_unaligned ; GFX10: liveins: $vgpr8_vgpr9_vgpr10_vgpr11 @@ -427,13 +427,13 @@ body: | ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec ; - ; GFX940-LABEL: name: copy_v128_unaligned_to_v128 - ; GFX940: liveins: $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX940-NEXT: $vgpr1 = 
V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX940-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec + ; GFX942-LABEL: name: copy_v128_unaligned_to_v128 + ; GFX942: liveins: $vgpr7_vgpr8_vgpr9_vgpr10 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr7_vgpr8_vgpr9_vgpr10 + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 + ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec ; ; GFX10-LABEL: name: copy_v128_unaligned_to_v128 ; GFX10: liveins: $vgpr7_vgpr8_vgpr9_vgpr10 @@ -463,11 +463,11 @@ body: | ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr8_sgpr9 ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit killed $sgpr8_sgpr9, implicit $exec ; - ; GFX940-LABEL: name: copy_s64_to_v64_unaligned - ; GFX940: liveins: $sgpr8_sgpr9 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr8_sgpr9 - ; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit killed $sgpr8_sgpr9, implicit $exec + ; GFX942-LABEL: name: copy_s64_to_v64_unaligned + ; GFX942: liveins: $sgpr8_sgpr9 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr8_sgpr9 + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit killed $sgpr8_sgpr9, implicit $exec ; ; GFX10-LABEL: name: copy_s64_to_v64_unaligned ; GFX10: liveins: $sgpr8_sgpr9 @@ -499,13 +499,13 @@ body: | ; 
GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec ; - ; GFX940-LABEL: name: copy_s128_to_v128_unaligned - ; GFX940: liveins: $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX940-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX940-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec + ; GFX942-LABEL: name: copy_s128_to_v128_unaligned + ; GFX942: liveins: $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec ; ; GFX10-LABEL: name: copy_s128_to_v128_unaligned ; GFX10: liveins: $sgpr8_sgpr9_sgpr10_sgpr11 @@ -537,12 +537,12 @@ body: | ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10 ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10, implicit $exec ; - ; GFX940-LABEL: name: copy_v96_to_v96_unaligned - ; GFX940: liveins: $vgpr8_vgpr9_vgpr10 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $vgpr8_vgpr9_vgpr10 - ; GFX940-NEXT: $vgpr2 = 
V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10 - ; GFX940-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10, implicit $exec + ; GFX942-LABEL: name: copy_v96_to_v96_unaligned + ; GFX942: liveins: $vgpr8_vgpr9_vgpr10 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $vgpr8_vgpr9_vgpr10 + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10 + ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10, implicit $exec ; ; GFX10-LABEL: name: copy_v96_to_v96_unaligned ; GFX10: liveins: $vgpr8_vgpr9_vgpr10 @@ -573,12 +573,12 @@ body: | ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9 ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9, implicit $exec ; - ; GFX940-LABEL: name: copy_v96_unaligned_to_v96 - ; GFX940: liveins: $vgpr7_vgpr8_vgpr9 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr7_vgpr8_vgpr9 - ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9 - ; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9, implicit $exec + ; GFX942-LABEL: name: copy_v96_unaligned_to_v96 + ; GFX942: liveins: $vgpr7_vgpr8_vgpr9 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr7_vgpr8_vgpr9 + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9 + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9, implicit $exec ; ; GFX10-LABEL: name: copy_v96_unaligned_to_v96 ; GFX10: liveins: $vgpr7_vgpr8_vgpr9 @@ -609,12 +609,12 @@ body: | ; GFX90A-NEXT: $vgpr1 = 
V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec ; - ; GFX940-LABEL: name: copy_s96_to_v96 - ; GFX940: liveins: $sgpr0_sgpr1_sgpr2 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $sgpr0_sgpr1_sgpr2 - ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX942-LABEL: name: copy_s96_to_v96 + ; GFX942: liveins: $sgpr0_sgpr1_sgpr2 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $sgpr0_sgpr1_sgpr2 + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec ; ; GFX10-LABEL: name: copy_s96_to_v96 ; GFX10: liveins: $sgpr0_sgpr1_sgpr2 @@ -645,12 +645,12 @@ body: | ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec ; - ; GFX940-LABEL: name: copy_s96_to_v96_unaligned - ; GFX940: liveins: $sgpr0_sgpr1_sgpr2 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $sgpr0_sgpr1_sgpr2 - ; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX942-LABEL: name: copy_s96_to_v96_unaligned + ; GFX942: liveins: $sgpr0_sgpr1_sgpr2 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, 
implicit-def $vgpr1_vgpr2_vgpr3, implicit $sgpr0_sgpr1_sgpr2 + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec ; ; GFX10-LABEL: name: copy_s96_to_v96_unaligned ; GFX10: liveins: $sgpr0_sgpr1_sgpr2 diff --git llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll index b64968c9336b..c954e1fe124e 100644 --- llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll +++ llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll @@ -71,12 +71,6 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90c < %s | FileCheck --check-prefixes=GFX90C %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90c -mattr=-xnack < %s | FileCheck --check-prefixes=GFX90C-NOXNACK %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90c -mattr=+xnack < %s | FileCheck --check-prefixes=GFX90C-XNACK %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck --check-prefixes=GFX940 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=-xnack < %s | FileCheck --check-prefixes=GFX940-NOXNACK %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+xnack < %s | FileCheck --check-prefixes=GFX940-XNACK %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx941 < %s | FileCheck --check-prefixes=GFX941 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx941 -mattr=-xnack < %s | FileCheck --check-prefixes=GFX941-NOXNACK %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx941 -mattr=+xnack < %s | FileCheck --check-prefixes=GFX941-XNACK %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -mattr=-xnack < %s | FileCheck --check-prefixes=GFX942-NOXNACK %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -mattr=+xnack < %s | FileCheck --check-prefixes=GFX942-XNACK %s @@ -174,12 +168,6 @@ ; GFX90C: .amdgcn_target 
"amdgcn-amd-amdhsa--gfx90c" ; GFX90C-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx90c:xnack-" ; GFX90C-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx90c:xnack+" -; GFX940: .amdgcn_target "amdgcn-amd-amdhsa--gfx940" -; GFX940-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx940:xnack-" -; GFX940-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx940:xnack+" -; GFX941: .amdgcn_target "amdgcn-amd-amdhsa--gfx941" -; GFX941-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx941:xnack-" -; GFX941-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx941:xnack+" ; GFX942: .amdgcn_target "amdgcn-amd-amdhsa--gfx942" ; GFX942-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx942:xnack-" ; GFX942-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx942:xnack+" diff --git llvm/test/CodeGen/AMDGPU/dpp64_combine.ll llvm/test/CodeGen/AMDGPU/dpp64_combine.ll index c7422786d344..34b794705e98 100644 --- llvm/test/CodeGen/AMDGPU/dpp64_combine.ll +++ llvm/test/CodeGen/AMDGPU/dpp64_combine.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP64,GFX90A -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP64,DPPMOV64,GFX940 +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP64,DPPMOV64,GFX942 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS @@ -74,7 +74,7 @@ define amdgpu_kernel void @dpp64_div(ptr addrspace(1) %arg, i64 %in1) { ; DPP64: v_mov_b32_dpp ; GFX90A: v_add_co_u32_e32 ; GFX90A: v_addc_co_u32_e32 -; GFX940: v_lshl_add_u64 +; GFX942: v_lshl_add_u64 ; GFX10PLUS: v_mov_b32_dpp ; GFX10PLUS: v_add_co_u32 ; GFX10PLUS: v_add_co_ci_u32_e32 diff --git llvm/test/CodeGen/AMDGPU/dpp64_combine.mir llvm/test/CodeGen/AMDGPU/dpp64_combine.mir index 
9a6a54bbc4e4..84da231c95a6 100644 --- llvm/test/CodeGen/AMDGPU/dpp64_combine.mir +++ llvm/test/CodeGen/AMDGPU/dpp64_combine.mir @@ -1,5 +1,5 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s --check-prefix=GCN -# RUN: llc -mtriple=amdgcn -mcpu=gfx940 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s --check-prefix=GCN +# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s --check-prefix=GCN --- # GCN-LABEL: name: dpp64_old_impdef diff --git llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll index 99344f16d4cd..65039b471694 100644 --- llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll +++ llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll @@ -54,8 +54,6 @@ ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx909 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX909 %s ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx90a < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX90A %s ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx90c < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX90C %s -; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx940 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX940 %s -; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx941 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX941 %s ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx942 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX942 %s ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx950 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX950 %s ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx1010 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1010 %s @@ 
-137,8 +135,6 @@ ; GFX909: EF_AMDGPU_MACH_AMDGCN_GFX909 (0x31) ; GFX90A: EF_AMDGPU_MACH_AMDGCN_GFX90A (0x3F) ; GFX90C: EF_AMDGPU_MACH_AMDGCN_GFX90C (0x32) -; GFX940: EF_AMDGPU_MACH_AMDGCN_GFX940 (0x40) -; GFX941: EF_AMDGPU_MACH_AMDGCN_GFX941 (0x4B) ; GFX942: EF_AMDGPU_MACH_AMDGCN_GFX942 (0x4C) ; GFX950: EF_AMDGPU_MACH_AMDGCN_GFX950 (0x4F) ; GFX1010: EF_AMDGPU_MACH_AMDGCN_GFX1010 (0x33) diff --git llvm/test/CodeGen/AMDGPU/elf-header-flags-sramecc.ll llvm/test/CodeGen/AMDGPU/elf-header-flags-sramecc.ll index 3ad2a9df764b..cf69c10bcef7 100644 --- llvm/test/CodeGen/AMDGPU/elf-header-flags-sramecc.ll +++ llvm/test/CodeGen/AMDGPU/elf-header-flags-sramecc.ll @@ -9,8 +9,8 @@ ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx90a < %s | llvm-readobj --file-header - | FileCheck --check-prefix=SRAM-ECC-GFX90A %s ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx90a < %s | llvm-readobj --file-header - | FileCheck --check-prefix=SRAM-ECC-GFX90A %s -; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx940 < %s | llvm-readobj --file-header - | FileCheck --check-prefix=SRAM-ECC-GFX940 %s -; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx940 -mattr=+sramecc < %s | llvm-readobj --file-header - | FileCheck --check-prefix=SRAM-ECC-GFX940 %s +; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx942 < %s | llvm-readobj --file-header - | FileCheck --check-prefix=SRAM-ECC-GFX942 %s +; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx942 -mattr=+sramecc < %s | llvm-readobj --file-header - | FileCheck --check-prefix=SRAM-ECC-GFX942 %s ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx950 < %s | llvm-readobj --file-header - | FileCheck --check-prefix=SRAM-ECC-GFX950 %s ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx950 -mattr=+sramecc < %s | llvm-readobj --file-header - | FileCheck --check-prefix=SRAM-ECC-GFX950 %s @@ -42,10 +42,10 @@ ; SRAM-ECC-GFX90A: EF_AMDGPU_MACH_AMDGCN_GFX90A (0x3F) ; SRAM-ECC-GFX90A: ] -; SRAM-ECC-GFX940: Flags [ -; SRAM-ECC-GFX940: EF_AMDGPU_FEATURE_SRAMECC_V3 (0x200) -; 
SRAM-ECC-GFX940: EF_AMDGPU_MACH_AMDGCN_GFX940 (0x40) -; SRAM-ECC-GFX940: ] +; SRAM-ECC-GFX942: Flags [ +; SRAM-ECC-GFX942: EF_AMDGPU_FEATURE_SRAMECC_V3 (0x200) +; SRAM-ECC-GFX942: EF_AMDGPU_MACH_AMDGCN_GFX942 (0x4C) +; SRAM-ECC-GFX942: ] ; SRAM-ECC-GFX950: Flags [ ; SRAM-ECC-GFX950: EF_AMDGPU_FEATURE_SRAMECC_V3 (0x200) diff --git llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-add-i32.mir llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-add-i32.mir index 001a72e36097..2f8ad7f56478 100644 --- llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-add-i32.mir +++ llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-add-i32.mir @@ -5,7 +5,7 @@ # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=MUBUFW64 %s # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=MUBUFW32 %s -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=FLATSCRW64 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=FLATSCRW64 %s # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=FLATSCRW32 %s # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=FLATSCRW32 %s diff --git llvm/test/CodeGen/AMDGPU/eliminate-frame-index-scalar-bit-ops.mir llvm/test/CodeGen/AMDGPU/eliminate-frame-index-scalar-bit-ops.mir index 1456bbc369b6..aecff1b13171 100644 --- llvm/test/CodeGen/AMDGPU/eliminate-frame-index-scalar-bit-ops.mir +++ llvm/test/CodeGen/AMDGPU/eliminate-frame-index-scalar-bit-ops.mir @@ -5,7 +5,7 @@ # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=MUBUFW64 %s # RUN: llc
-mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=MUBUFW32 %s -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=FLATSCRW64 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=FLATSCRW64 %s # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=FLATSCRW32 %s # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=FLATSCRW32 %s diff --git llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir index b5a9f0271101..12e8d24cb367 100644 --- llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir +++ llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir @@ -6,7 +6,7 @@ # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=MUBUFW64,GFX10 %s # FIXME: Test in wave32 -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=FLATSCRW64,GFX940 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=FLATSCRW64,GFX942 %s # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=FLATSCRW64,GFX11 %s # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=FLATSCRW64,GFX12 %s @@ -512,14 +512,14 @@ body: | ; GFX10-NEXT: 
renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 128, killed $vgpr0, 0, implicit $exec ; GFX10-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; - ; GFX940-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr - ; GFX940: liveins: $sgpr8 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr32, implicit $exec - ; GFX940-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 $sgpr8, killed $vgpr1, 0, implicit $exec - ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 128, implicit $exec - ; GFX940-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, killed $vgpr0, 0, implicit $exec - ; GFX940-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; GFX942-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr + ; GFX942: liveins: $sgpr8 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr32, implicit $exec + ; GFX942-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 $sgpr8, killed $vgpr1, 0, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 128, implicit $exec + ; GFX942-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, killed $vgpr0, 0, implicit $exec + ; GFX942-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; ; GFX11-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr ; GFX11: liveins: $sgpr8 @@ -596,14 +596,14 @@ body: | ; GFX10-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 128, killed $vgpr0, 1, implicit $exec ; GFX10-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; - ; GFX940-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr_clamp - ; GFX940: liveins: $sgpr8 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr32, implicit $exec - ; GFX940-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 $sgpr8, killed $vgpr1, 0, implicit $exec - ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 128, implicit $exec - ; GFX940-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, killed $vgpr0, 1, implicit $exec - ; GFX940-NEXT: SI_RETURN implicit 
$vgpr0, implicit $vcc + ; GFX942-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr_clamp + ; GFX942: liveins: $sgpr8 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr32, implicit $exec + ; GFX942-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 $sgpr8, killed $vgpr1, 0, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 128, implicit $exec + ; GFX942-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, killed $vgpr0, 1, implicit $exec + ; GFX942-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; ; GFX11-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr_clamp ; GFX11: liveins: $sgpr8 @@ -681,13 +681,13 @@ body: | ; GFX10-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 128, killed $vgpr0, 0, implicit $exec ; GFX10-NEXT: SI_RETURN implicit $vgpr0 ; - ; GFX940-LABEL: name: v_add_co_u32_e64__fi_literal_offset__vgpr - ; GFX940: liveins: $vgpr8 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 $sgpr32, $vgpr8, 0, implicit $exec - ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 128, implicit $exec - ; GFX940-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, killed $vgpr0, 0, implicit $exec - ; GFX940-NEXT: SI_RETURN implicit $vgpr0 + ; GFX942-LABEL: name: v_add_co_u32_e64__fi_literal_offset__vgpr + ; GFX942: liveins: $vgpr8 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 $sgpr32, $vgpr8, 0, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 128, implicit $exec + ; GFX942-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, killed $vgpr0, 0, implicit $exec + ; GFX942-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX11-LABEL: name: v_add_co_u32_e64__fi_literal_offset__vgpr ; GFX11: liveins: $vgpr8 @@ -765,13 +765,13 @@ body: | ; GFX10-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 128, killed $vgpr0, 1, implicit $exec ; GFX10-NEXT: SI_RETURN implicit 
$vgpr0 ; - ; GFX940-LABEL: name: v_add_co_u32_e64__fi_literal_offset__vgpr__clamp - ; GFX940: liveins: $vgpr8 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 $sgpr32, $vgpr8, 0, implicit $exec - ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 128, implicit $exec - ; GFX940-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, killed $vgpr0, 1, implicit $exec - ; GFX940-NEXT: SI_RETURN implicit $vgpr0 + ; GFX942-LABEL: name: v_add_co_u32_e64__fi_literal_offset__vgpr__clamp + ; GFX942: liveins: $vgpr8 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 $sgpr32, $vgpr8, 0, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 128, implicit $exec + ; GFX942-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, killed $vgpr0, 1, implicit $exec + ; GFX942-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX11-LABEL: name: v_add_co_u32_e64__fi_literal_offset__vgpr__clamp ; GFX11: liveins: $vgpr8 @@ -849,13 +849,13 @@ body: | ; GFX10-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 128, killed $vgpr0, 0, implicit $exec ; GFX10-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; - ; GFX940-LABEL: name: v_add_co_u32_e64__fi_literal_offset__vgpr__live_vcc - ; GFX940: liveins: $vgpr8 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 $sgpr32, $vgpr8, 0, implicit $exec - ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 128, implicit $exec - ; GFX940-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, killed $vgpr0, 0, implicit $exec - ; GFX940-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; GFX942-LABEL: name: v_add_co_u32_e64__fi_literal_offset__vgpr__live_vcc + ; GFX942: liveins: $vgpr8 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 $sgpr32, $vgpr8, 0, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 128, implicit $exec + ; GFX942-NEXT: 
renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, killed $vgpr0, 0, implicit $exec + ; GFX942-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; ; GFX11-LABEL: name: v_add_co_u32_e64__fi_literal_offset__vgpr__live_vcc ; GFX11: liveins: $vgpr8 @@ -1019,10 +1019,10 @@ body: | ; GFX10-NEXT: renamable $vgpr0, dead $vcc = V_ADD_CO_U32_e64 84, 0, 1, implicit $exec ; GFX10-NEXT: SI_RETURN implicit $vgpr0 ; - ; GFX940-LABEL: name: v_add_co_u32_e64__inline_imm__fi_offset_literal__kernel__clamp - ; GFX940: $sgpr4 = S_MOV_B32 72 - ; GFX940-NEXT: renamable $vgpr0, dead $vcc = V_ADD_CO_U32_e64 12, killed $sgpr4, 1, implicit $exec - ; GFX940-NEXT: SI_RETURN implicit $vgpr0 + ; GFX942-LABEL: name: v_add_co_u32_e64__inline_imm__fi_offset_literal__kernel__clamp + ; GFX942: $sgpr4 = S_MOV_B32 72 + ; GFX942-NEXT: renamable $vgpr0, dead $vcc = V_ADD_CO_U32_e64 12, killed $sgpr4, 1, implicit $exec + ; GFX942-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX11-LABEL: name: v_add_co_u32_e64__inline_imm__fi_offset_literal__kernel__clamp ; GFX11: renamable $vgpr0, dead $vcc = V_ADD_CO_U32_e64 84, 0, 1, implicit $exec @@ -1109,17 +1109,17 @@ body: | ; GFX10-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit 
$vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, implicit $vgpr248_vgpr249_vgpr250_vgpr251, implicit $vgpr252_vgpr253, implicit $vgpr254, implicit $vgpr255 ; GFX10-NEXT: SI_RETURN implicit $vgpr0 ; - ; GFX940-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr__scavenge_spill_required - ; GFX940: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 
$vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254, $vgpr255, $sgpr8 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 132, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) - ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr32, implicit $exec - ; GFX940-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 $sgpr8, killed $vgpr1, 0, implicit 
$exec - ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 128, implicit $exec - ; GFX940-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, killed $vgpr0, 0, implicit $exec - ; GFX940-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 132, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5) - ; GFX940-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit 
$vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, implicit $vgpr248_vgpr249_vgpr250_vgpr251, implicit $vgpr252_vgpr253, implicit $vgpr254, implicit $vgpr255 - ; GFX940-NEXT: SI_RETURN implicit $vgpr0 + ; GFX942-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr__scavenge_spill_required + ; GFX942: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254, $vgpr255, $sgpr8 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 132, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr32, implicit $exec + ; GFX942-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 $sgpr8, killed $vgpr1, 0, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 128, implicit $exec + ; GFX942-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, killed $vgpr0, 0, implicit $exec + ; GFX942-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 132, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5) + ; GFX942-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit 
$vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, implicit $vgpr248_vgpr249_vgpr250_vgpr251, implicit $vgpr252_vgpr253, implicit $vgpr254, implicit $vgpr255 + ; GFX942-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX11-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr__scavenge_spill_required ; GFX11: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254, $vgpr255, $sgpr8 @@ -1714,12 +1714,12 @@ body: | ; MUBUFW64-NEXT: renamable $vgpr0, dead renamable $sgpr4_sgpr5 = V_ADD_CO_U32_e64 
killed $sgpr4, killed $vgpr1, 0, implicit $exec ; MUBUFW64-NEXT: SI_RETURN implicit $vgpr0 ; - ; GFX940-LABEL: name: v_add_co_u32_e64__fi_sgpr_func - ; GFX940: liveins: $sgpr4 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr32, implicit $exec - ; GFX940-NEXT: renamable $vgpr0, dead renamable $sgpr4_sgpr5 = V_ADD_CO_U32_e64 killed $sgpr4, killed $vgpr1, 0, implicit $exec - ; GFX940-NEXT: SI_RETURN implicit $vgpr0 + ; GFX942-LABEL: name: v_add_co_u32_e64__fi_sgpr_func + ; GFX942: liveins: $sgpr4 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr32, implicit $exec + ; GFX942-NEXT: renamable $vgpr0, dead renamable $sgpr4_sgpr5 = V_ADD_CO_U32_e64 killed $sgpr4, killed $vgpr1, 0, implicit $exec + ; GFX942-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX11-LABEL: name: v_add_co_u32_e64__fi_sgpr_func ; GFX11: liveins: $sgpr4 @@ -1902,14 +1902,14 @@ body: | ; GFX10-NEXT: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed renamable $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 5) ; GFX10-NEXT: S_ENDPGM 0 ; - ; GFX940-LABEL: name: v_add_co_u32_e64_fi_sgpr_clobbered_register - ; GFX940: liveins: $sgpr2_sgpr3, $sgpr4_sgpr5_sgpr6_sgpr7:0x000000000000003C - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: renamable $sgpr0 = S_LSHL_B32 renamable $sgpr6, 2, implicit-def dead $scc - ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 32772, implicit $exec - ; GFX940-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, killed $sgpr0, 0, implicit $exec - ; GFX940-NEXT: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed renamable $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 5) - ; GFX940-NEXT: S_ENDPGM 0 + ; GFX942-LABEL: name: v_add_co_u32_e64_fi_sgpr_clobbered_register + ; GFX942: liveins: $sgpr2_sgpr3, $sgpr4_sgpr5_sgpr6_sgpr7:0x000000000000003C + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: renamable $sgpr0 = S_LSHL_B32 renamable $sgpr6, 2, implicit-def dead $scc + ; 
GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 32772, implicit $exec + ; GFX942-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, killed $sgpr0, 0, implicit $exec + ; GFX942-NEXT: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed renamable $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 5) + ; GFX942-NEXT: S_ENDPGM 0 ; ; GFX11-LABEL: name: v_add_co_u32_e64_fi_sgpr_clobbered_register ; GFX11: liveins: $sgpr2_sgpr3, $sgpr4_sgpr5_sgpr6_sgpr7:0x000000000000003C @@ -2006,14 +2006,14 @@ body: | ; GFX10-NEXT: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed renamable $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 5) ; GFX10-NEXT: S_ENDPGM 0 ; - ; GFX940-LABEL: name: v_add_co_u32_e64_sgpr_fi_clobbered_register - ; GFX940: liveins: $sgpr2_sgpr3, $sgpr4_sgpr5_sgpr6_sgpr7:0x000000000000003C - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: renamable $sgpr0 = S_LSHL_B32 renamable $sgpr6, 2, implicit-def dead $scc - ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 32772, implicit $exec - ; GFX940-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, killed $sgpr0, 0, implicit $exec - ; GFX940-NEXT: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed renamable $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 5) - ; GFX940-NEXT: S_ENDPGM 0 + ; GFX942-LABEL: name: v_add_co_u32_e64_sgpr_fi_clobbered_register + ; GFX942: liveins: $sgpr2_sgpr3, $sgpr4_sgpr5_sgpr6_sgpr7:0x000000000000003C + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: renamable $sgpr0 = S_LSHL_B32 renamable $sgpr6, 2, implicit-def dead $scc + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 32772, implicit $exec + ; GFX942-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, killed $sgpr0, 0, implicit $exec + ; GFX942-NEXT: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed renamable $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, 0, 0, 0, 0, implicit $exec :: (load (s32), 
addrspace 5) + ; GFX942-NEXT: S_ENDPGM 0 ; ; GFX11-LABEL: name: v_add_co_u32_e64_sgpr_fi_clobbered_register ; GFX11: liveins: $sgpr2_sgpr3, $sgpr4_sgpr5_sgpr6_sgpr7:0x000000000000003C @@ -2137,21 +2137,21 @@ body: | ; GFX10-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX10-NEXT: SI_RETURN implicit $vgpr0 ; - ; GFX940-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc - ; GFX940: liveins: $sgpr4, $sgpr5 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $sgpr4 = frame-setup COPY $sgpr33 - ; GFX940-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc - ; GFX940-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc - ; GFX940-NEXT: $sgpr5 = frame-setup COPY $sgpr34 - ; GFX940-NEXT: $sgpr34 = frame-setup COPY $sgpr32 - ; GFX940-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc - ; GFX940-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc - ; GFX940-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $sgpr6, 0, implicit $exec - ; GFX940-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 - ; GFX940-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 - ; GFX940-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 - ; GFX940-NEXT: SI_RETURN implicit $vgpr0 + ; GFX942-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc + ; GFX942: liveins: $sgpr4, $sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $sgpr4 = frame-setup COPY $sgpr33 + ; GFX942-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc + ; GFX942-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX942-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX942-NEXT: $sgpr34 = frame-setup COPY $sgpr32 + ; GFX942-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc + ; GFX942-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc + ; GFX942-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $sgpr6, 0, 
implicit $exec + ; GFX942-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX942-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 + ; GFX942-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 + ; GFX942-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX11-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc ; GFX11: liveins: $sgpr4, $sgpr5 @@ -2290,21 +2290,21 @@ body: | ; GFX10-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX10-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8_sgpr9 ; - ; GFX940-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc_live - ; GFX940: liveins: $sgpr4, $sgpr5 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $sgpr4 = frame-setup COPY $sgpr33 - ; GFX940-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc - ; GFX940-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc - ; GFX940-NEXT: $sgpr5 = frame-setup COPY $sgpr34 - ; GFX940-NEXT: $sgpr34 = frame-setup COPY $sgpr32 - ; GFX940-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc - ; GFX940-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc - ; GFX940-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $sgpr6, 0, implicit $exec - ; GFX940-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 - ; GFX940-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 - ; GFX940-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 - ; GFX940-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8_sgpr9 + ; GFX942-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc_live + ; GFX942: liveins: $sgpr4, $sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $sgpr4 = frame-setup COPY $sgpr33 + ; GFX942-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc + ; GFX942-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX942-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX942-NEXT: $sgpr34 = frame-setup COPY $sgpr32 + ; GFX942-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def 
dead $scc + ; GFX942-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc + ; GFX942-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $sgpr6, 0, implicit $exec + ; GFX942-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX942-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 + ; GFX942-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 + ; GFX942-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8_sgpr9 ; ; GFX11-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc_live ; GFX11: liveins: $sgpr4, $sgpr5 @@ -2444,21 +2444,21 @@ body: | ; GFX10-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX10-NEXT: SI_RETURN implicit $vgpr0 ; - ; GFX940-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc - ; GFX940: liveins: $sgpr4, $sgpr5 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $sgpr4 = frame-setup COPY $sgpr33 - ; GFX940-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc - ; GFX940-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc - ; GFX940-NEXT: $sgpr5 = frame-setup COPY $sgpr34 - ; GFX940-NEXT: $sgpr34 = frame-setup COPY $sgpr32 - ; GFX940-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc - ; GFX940-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc - ; GFX940-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 64, killed $sgpr6, 0, implicit $exec - ; GFX940-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 - ; GFX940-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 - ; GFX940-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 - ; GFX940-NEXT: SI_RETURN implicit $vgpr0 + ; GFX942-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc + ; GFX942: liveins: $sgpr4, $sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $sgpr4 = frame-setup COPY $sgpr33 + ; GFX942-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc + ; GFX942-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX942-NEXT: $sgpr5 = frame-setup 
COPY $sgpr34 + ; GFX942-NEXT: $sgpr34 = frame-setup COPY $sgpr32 + ; GFX942-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc + ; GFX942-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc + ; GFX942-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 64, killed $sgpr6, 0, implicit $exec + ; GFX942-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX942-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 + ; GFX942-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 + ; GFX942-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX11-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc ; GFX11: liveins: $sgpr4, $sgpr5 @@ -2597,21 +2597,21 @@ body: | ; GFX10-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX10-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; - ; GFX940-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc_live - ; GFX940: liveins: $sgpr4, $sgpr5 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: $sgpr4 = frame-setup COPY $sgpr33 - ; GFX940-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc - ; GFX940-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc - ; GFX940-NEXT: $sgpr5 = frame-setup COPY $sgpr34 - ; GFX940-NEXT: $sgpr34 = frame-setup COPY $sgpr32 - ; GFX940-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc - ; GFX940-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc - ; GFX940-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 64, killed $sgpr6, 0, implicit $exec - ; GFX940-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 - ; GFX940-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 - ; GFX940-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 - ; GFX940-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; GFX942-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc_live + ; GFX942: liveins: $sgpr4, $sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $sgpr4 = frame-setup COPY $sgpr33 + ; GFX942-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def 
$scc + ; GFX942-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX942-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX942-NEXT: $sgpr34 = frame-setup COPY $sgpr32 + ; GFX942-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc + ; GFX942-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc + ; GFX942-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 64, killed $sgpr6, 0, implicit $exec + ; GFX942-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX942-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 + ; GFX942-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 + ; GFX942-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; ; GFX11-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc_live ; GFX11: liveins: $sgpr4, $sgpr5 diff --git llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-u32.mir llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-u32.mir index b7a5cf963138..6a4671058dc0 100644 --- llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-u32.mir +++ llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-u32.mir @@ -2,7 +2,7 @@ # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=MUBUF %s # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=MUBUF %s # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=MUBUFW32 %s -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=FLATSCRW64 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=FLATSCRW64 %s # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=FLATSCRW32 %s # RUN: llc 
-mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=FLATSCRW32 %s diff --git llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll index ef180cef7ed2..997432de6528 100644 --- llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll +++ llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll @@ -1,21 +1,21 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX942 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX11 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX942 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX11 %s define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %data) { - ; GFX940-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic - ; GFX940: bb.0 (%ir-block.0): - ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX940-NEXT: FLAT_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on 
%ir.ptr) - ; GFX940-NEXT: S_ENDPGM 0 + ; GFX942-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX942-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX942-NEXT: FLAT_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr) + ; GFX942-NEXT: S_ENDPGM 0 ; ; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic ; GFX11: bb.0 (%ir-block.0): @@ -33,18 +33,18 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %da } define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data) { - ; GFX940-LABEL: name: flat_atomic_fadd_f32_rtn_intrinsic - ; GFX940: bb.0 (%ir-block.0): - ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX940-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr) - ; GFX940-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] - ; GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX942-LABEL: name: flat_atomic_fadd_f32_rtn_intrinsic + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: liveins: $vgpr0, 
$vgpr1, $vgpr2 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX942-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX942-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr) + ; GFX942-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] + ; GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; ; GFX11-LABEL: name: flat_atomic_fadd_f32_rtn_intrinsic ; GFX11: bb.0 (%ir-block.0): @@ -63,17 +63,17 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data } define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_atomicrmw(ptr %ptr, float %data) #0 { - ; GFX940-LABEL: name: flat_atomic_fadd_f32_no_rtn_atomicrmw - ; GFX940: bb.0 (%ir-block.0): - ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX940-NEXT: FLAT_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) - ; GFX940-NEXT: S_ENDPGM 0 + ; GFX942-LABEL: name: flat_atomic_fadd_f32_no_rtn_atomicrmw + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 
+ ; GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX942-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX942-NEXT: FLAT_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) + ; GFX942-NEXT: S_ENDPGM 0 ; ; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_atomicrmw ; GFX11: bb.0 (%ir-block.0): @@ -91,18 +91,18 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_atomicrmw(ptr %ptr, float %da } define amdgpu_ps float @flat_atomic_fadd_f32_rtn_atomicrmw(ptr %ptr, float %data) #0 { - ; GFX940-LABEL: name: flat_atomic_fadd_f32_rtn_atomicrmw - ; GFX940: bb.0 (%ir-block.0): - ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX940-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) - ; GFX940-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] - ; GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX942-LABEL: name: flat_atomic_fadd_f32_rtn_atomicrmw + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, 
[[COPY1]], %subreg.sub1 + ; GFX942-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX942-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) + ; GFX942-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] + ; GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; ; GFX11-LABEL: name: flat_atomic_fadd_f32_rtn_atomicrmw ; GFX11: bb.0 (%ir-block.0): diff --git llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll index d64becc74ddc..36714b386e7e 100644 --- llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll +++ llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll @@ -1,153 +1,153 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=finalize-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=finalize-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=finalize-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=finalize-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_intrinsic(ptr %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_no_rtn_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: 
[[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; GFX90A_GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; GFX90A_GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: FLAT_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s64) on %ir.ptr) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: flat_atomic_fadd_f64_no_rtn_intrinsic + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX942-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX942-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + 
; GFX90A_GFX942-NEXT: FLAT_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s64) on %ir.ptr) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr %ptr, double %data) ret void } define amdgpu_ps double @flat_atomic_fadd_f64_rtn_intrinsic(ptr %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_rtn_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; GFX90A_GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; GFX90A_GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s64) on %ir.ptr) - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]] - ; 
GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY7]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX90A_GFX942-LABEL: name: flat_atomic_fadd_f64_rtn_intrinsic + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX942-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX942-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A_GFX942-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s64) on %ir.ptr) + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY6]] + ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY7]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr %ptr, double %data) ret double %ret } define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(ptr %ptr, double %data) #0 { - 
; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_no_rtn_atomicrmw - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; GFX90A_GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; GFX90A_GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: FLAT_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: flat_atomic_fadd_f64_no_rtn_atomicrmw + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX942-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX942-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX942-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A_GFX942-NEXT: FLAT_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw_noprivate(ptr %ptr, double %data) #0 { - ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_no_rtn_atomicrmw_noprivate - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; GFX90A_GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; GFX90A_GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: 
[[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: FLAT_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: flat_atomic_fadd_f64_no_rtn_atomicrmw_noprivate + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX942-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX942-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A_GFX942-NEXT: FLAT_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %data) #0 { - ; GFX90A_GFX940-LABEL: name: 
flat_atomic_fadd_f64_rtn_atomicrmw - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; GFX90A_GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; GFX90A_GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr) - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY7]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX90A_GFX942-LABEL: name: flat_atomic_fadd_f64_rtn_atomicrmw + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY 
$vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX942-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX942-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A_GFX942-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr) + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY6]] + ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY7]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret double %ret } define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw__noprivate(ptr %ptr, double %data) #0 { - ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_rtn_atomicrmw__noprivate - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; 
GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; GFX90A_GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; GFX90A_GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr) - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY7]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX90A_GFX942-LABEL: name: flat_atomic_fadd_f64_rtn_atomicrmw__noprivate + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX942-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX942-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX942-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A_GFX942-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr) + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY6]] + ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY7]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret double %ret } diff --git llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index 707cae953483..2825d9048660 100644 --- llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx940 < %s | FileCheck -check-prefix=GFX940 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s ; RUN: 
llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s @@ -26,14 +26,14 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: @@ -202,14 +202,14 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX942: ; %bb.0: +; 
GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: @@ -388,17 +388,17 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: @@ -582,14 +582,14 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: @@ -788,14 +788,14 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: @@ -1005,17 +1005,17 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: @@ -1228,14 +1228,14 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 
sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: @@ -1417,14 +1417,14 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: @@ -1636,14 +1636,14 @@ define void @flat_agent_atomic_fadd_noret_f32_maybe_remote(ptr %ptr, float %val) ; GFX12-NEXT: global_inv 
scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote: ; GFX11: ; %bb.0: @@ -1800,14 +1800,14 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1951,14 +1951,14 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__a ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: @@ -2168,14 +2168,14 @@ define void @flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr %p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: @@ -2336,14 +2336,14 @@ define float 
@flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -2512,14 +2512,14 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -2698,17 +2698,17 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -2892,14 +2892,14 @@ define void @flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -3098,14 +3098,14 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -3315,17 +3315,17 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, 
-1, v1, vcc -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -3538,14 +3538,14 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -3727,14 +3727,14 @@ define void 
@flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -3947,14 +3947,14 @@ define float @flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memor ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: @@ -4136,14 +4136,14 @@ define void @flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memo ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: @@ -4355,14 +4355,14 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ig ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: @@ -4519,14 +4519,14 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_i ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: @@ -4676,14 +4676,14 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -4840,14 +4840,14 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -4997,14 +4997,14 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: @@ -5173,14 +5173,14 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: @@ -5379,14 +5379,14 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -5527,14 +5527,14 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -5717,43 +5717,43 @@ define double 
@flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB30_3 -; GFX940-NEXT: ; %bb.1: ; %Flow -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB30_4 -; GFX940-NEXT: .LBB30_2: ; %atomicrmw.phi -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; GFX940-NEXT: .LBB30_3: ; %atomicrmw.global -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execz .LBB30_2 -; GFX940-NEXT: .LBB30_4: ; %atomicrmw.private -; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] -; GFX940-NEXT: scratch_store_dwordx2 v4, v[2:3], off sc0 sc1 -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, 
v1 +; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB30_3 +; GFX942-NEXT: ; %bb.1: ; %Flow +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB30_4 +; GFX942-NEXT: .LBB30_2: ; %atomicrmw.phi +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: .LBB30_3: ; %atomicrmw.global +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB30_2 +; GFX942-NEXT: .LBB30_4: ; %atomicrmw.private +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v4, v[2:3], off sc0 sc1 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -6115,43 +6115,43 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7f8 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 
s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB31_3 -; GFX940-NEXT: ; %bb.1: ; %Flow -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB31_4 -; GFX940-NEXT: .LBB31_2: ; %atomicrmw.phi -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; GFX940-NEXT: .LBB31_3: ; %atomicrmw.global -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execz .LBB31_2 -; GFX940-NEXT: .LBB31_4: ; %atomicrmw.private -; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] -; GFX940-NEXT: scratch_store_dwordx2 v4, v[2:3], off sc0 sc1 -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7f8 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB31_3 +; GFX942-NEXT: ; %bb.1: ; %Flow 
+; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB31_4 +; GFX942-NEXT: .LBB31_2: ; %atomicrmw.phi +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: .LBB31_3: ; %atomicrmw.global +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB31_2 +; GFX942-NEXT: .LBB31_4: ; %atomicrmw.private +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v4, v[2:3], off sc0 sc1 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -6536,44 +6536,44 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB32_3 -; GFX940-NEXT: ; 
%bb.1: ; %Flow -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB32_4 -; GFX940-NEXT: .LBB32_2: ; %atomicrmw.phi -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; GFX940-NEXT: .LBB32_3: ; %atomicrmw.global -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execz .LBB32_2 -; GFX940-NEXT: .LBB32_4: ; %atomicrmw.private -; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] -; GFX940-NEXT: scratch_store_dwordx2 v4, v[2:3], off sc0 sc1 -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB32_3 +; GFX942-NEXT: ; %bb.1: ; %Flow +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB32_4 +; GFX942-NEXT: .LBB32_2: ; %atomicrmw.phi +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: .LBB32_3: ; %atomicrmw.global +; 
GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB32_2 +; GFX942-NEXT: .LBB32_4: ; %atomicrmw.private +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v4, v[2:3], off sc0 sc1 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -6951,40 +6951,40 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB33_3 -; GFX940-NEXT: ; %bb.1: ; %Flow -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB33_4 -; GFX940-NEXT: .LBB33_2: ; %atomicrmw.phi -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; GFX940-NEXT: .LBB33_3: ; %atomicrmw.global -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: ; 
implicit-def: $vgpr0_vgpr1 -; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execz .LBB33_2 -; GFX940-NEXT: .LBB33_4: ; %atomicrmw.private -; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] -; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB33_3 +; GFX942-NEXT: ; %bb.1: ; %Flow +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB33_4 +; GFX942-NEXT: .LBB33_2: ; %atomicrmw.phi +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: .LBB33_3: ; %atomicrmw.global +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB33_2 +; GFX942-NEXT: .LBB33_4: ; %atomicrmw.private +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; 
GFX942-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -7346,42 +7346,42 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7f8 -; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB34_3 -; GFX940-NEXT: ; %bb.1: ; %Flow -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB34_4 -; GFX940-NEXT: .LBB34_2: ; %atomicrmw.phi -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; GFX940-NEXT: .LBB34_3: ; %atomicrmw.global -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execz .LBB34_2 -; GFX940-NEXT: .LBB34_4: ; %atomicrmw.private -; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] -; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 -; GFX940-NEXT: s_or_b64 
exec, exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7f8 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB34_3 +; GFX942-NEXT: ; %bb.1: ; %Flow +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB34_4 +; GFX942-NEXT: .LBB34_2: ; %atomicrmw.phi +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: .LBB34_3: ; %atomicrmw.global +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB34_2 +; GFX942-NEXT: .LBB34_4: ; %atomicrmw.private +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -7757,43 +7757,43 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB35_3 -; GFX940-NEXT: ; %bb.1: ; %Flow -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB35_4 -; GFX940-NEXT: .LBB35_2: ; %atomicrmw.phi -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; GFX940-NEXT: .LBB35_3: ; %atomicrmw.global -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execz .LBB35_2 -; GFX940-NEXT: .LBB35_4: ; %atomicrmw.private -; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] -; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: 
s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB35_3 +; GFX942-NEXT: ; %bb.1: ; %Flow +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB35_4 +; GFX942-NEXT: .LBB35_2: ; %atomicrmw.phi +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: .LBB35_3: ; %atomicrmw.global +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB35_2 +; GFX942-NEXT: .LBB35_4: ; %atomicrmw.private +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -8155,38 +8155,38 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: flat_load_dword v4, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, 
v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB36_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB36_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX942-NEXT: flat_load_dword v4, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX942-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 +; 
GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB36_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -8445,40 +8445,40 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: flat_load_dword v4, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB37_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; 
GFX940-NEXT: s_cbranch_execnz .LBB37_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v7 +; GFX942-NEXT: flat_load_dword v4, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX942-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB37_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -8744,41 +8744,41 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 
s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: flat_load_dword v4, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB38_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB38_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v7 +; GFX942-NEXT: flat_load_dword v4, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v6 
+; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX942-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB38_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -9040,37 +9040,37 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB39_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB39_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB39_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 
exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -9319,39 +9319,39 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB40_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB40_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 
s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB40_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -9607,40 +9607,40 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: 
v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB41_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB41_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: 
v_and_or_b32 v4, v5, v6, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB41_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -9883,29 +9883,29 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2046 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX940-NEXT: .LBB42_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f16_e32 v3, v5, v2 -; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB42_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2046 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 +; GFX942-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_f16_e32 v3, v5, v2 +; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB42_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -10099,30 +10099,30 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX940-NEXT: .LBB43_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_add_f16_e32 v3, v5, v2 -; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, 
v[0:1], v[4:5] offset:2046 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB43_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 +; GFX942-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_add_f16_e32 v3, v5, v2 +; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB43_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -10335,40 +10335,40 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: flat_load_dword v4, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB44_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v7 +; GFX942-NEXT: flat_load_dword v4, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 
+; GFX942-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX942-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB44_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -10635,39 +10635,39 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB45_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, 
v5 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB45_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB45_1 +; 
GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -10938,47 +10938,47 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; 
GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB46_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz 
.LBB46_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -11283,49 +11283,49 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: 
flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB47_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: 
flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB47_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -11638,50 +11638,50 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 -; 
GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB48_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX942-NEXT: 
v_add3_u32 v6, v6, v5, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB48_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -11992,48 +11992,48 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: 
v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB49_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB49_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -12336,49 +12336,49 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; 
GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB50_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: 
v_lshlrev_b32_e32 v6, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB50_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -12670,40 +12670,40 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 -; 
GFX940-NEXT: .LBB51_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX940-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX940-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v4, v4, v3, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX940-NEXT: v_and_or_b32 v4, v5, s3, v3 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB51_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 +; GFX942-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX942-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX942-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v4, v4, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: 
v_cndmask_b32_e32 v3, v4, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX942-NEXT: v_and_or_b32 v4, v5, s3, v3 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB51_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -12953,39 +12953,39 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v5, v5, v2, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, s3, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] 
offset:2046 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB52_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 +; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v5, v5, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB52_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -13240,46 +13240,46 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v7, v7, v4, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz 
.LBB53_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX942-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v7, v7, v4, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB53_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -13577,49 +13577,49 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 
s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB54_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: 
s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB54_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -13933,48 +13933,48 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: 
v_and_or_b32 v2, v3, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB55_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; 
GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB55_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -14242,14 +14242,14 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -14431,14 +14431,14 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
-; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -14623,17 +14623,17 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -14829,14 +14829,14 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -15010,14 +15010,14 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: 
buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -15198,17 +15198,17 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -15402,14 +15402,14 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
-; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -15597,14 +15597,14 @@ define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -15787,14 +15787,14 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -15976,14 +15976,14 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr %pt ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -16157,14 +16157,14 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -16346,14 +16346,14 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -16531,14 +16531,14 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -16806,14 +16806,14 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -17084,17 +17084,17 @@ define <2 x bfloat> 
@flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -17376,14 +17376,14 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -17643,14 +17643,14 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -17917,17 +17917,17 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; 
GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -18207,14 +18207,14 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -18488,14 +18488,14 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX940-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -18764,14 +18764,14 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -19039,14 +19039,14 @@ define void 
@flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -19306,14 +19306,14 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -19581,14 +19581,14 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: diff --git llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index 5aa9be627594..f2a69b999fe6 100644 --- llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx940 < %s | FileCheck -check-prefix=GFX940 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck 
-check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s @@ -26,30 +26,30 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB0_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB0_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -168,30 +168,30 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB1_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 
s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB1_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -316,37 +316,37 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX940-NEXT: flat_load_dword v0, v[0:1] -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX940-NEXT: 
v_max_f32_e32 v2, v0, v1 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB2_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc +; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_max_f32_e32 v0, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB2_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -479,29 +479,29 @@ define void 
@flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB3_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] 
+; GFX942-NEXT: s_cbranch_execnz .LBB3_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -619,29 +619,29 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB4_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: 
v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB4_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -766,35 +766,35 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v3, v[4:5] -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz 
.LBB5_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v3, v[4:5] +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB5_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -930,30 +930,30 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: 
v_max_f32_e32 v2, v2, v2 -; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB6_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB6_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1081,29 +1081,29 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB7_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX942-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB7_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1230,30 +1230,30 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB8_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 
v2, v2, v2 +; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB8_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -1422,30 +1422,30 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB9_1 -; GFX940-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB9_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -1568,30 +1568,30 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 
v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB10_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB10_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1710,30 +1710,30 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; 
GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB11_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB11_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, 
s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1858,37 +1858,37 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX940-NEXT: flat_load_dword v0, v[0:1] -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v0, v1 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB12_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 
+; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc +; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_max_f32_e32 v0, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB12_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -2021,29 +2021,29 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, 
v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB13_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -2161,29 +2161,29 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 
s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB14_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB14_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -2308,35 +2308,35 @@ define 
void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v3, v[4:5] -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v3, v[4:5] +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 
v4, v2, v2 +; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB15_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -2472,30 +2472,30 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB16_1 -; 
GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB16_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -2623,29 +2623,29 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB17_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -2822,45 +2822,45 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_3 -; GFX940-NEXT: ; %bb.1: ; %Flow -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_4 -; GFX940-NEXT: .LBB18_2: ; %atomicrmw.phi -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; GFX940-NEXT: .LBB18_3: ; %atomicrmw.global -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execz .LBB18_2 -; GFX940-NEXT: .LBB18_4: ; %atomicrmw.private -; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v6, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX940-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: s_mov_b64 
s[0:1], src_private_base +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB18_3 +; GFX942-NEXT: ; %bb.1: ; %Flow +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB18_4 +; GFX942-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB18_2 +; GFX942-NEXT: .LBB18_4: ; %atomicrmw.private +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX942-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX942-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -3220,45 +3220,45 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: 
s_mov_b64 s[0:1], 0x7f8 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB19_3 -; GFX940-NEXT: ; %bb.1: ; %Flow -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB19_4 -; GFX940-NEXT: .LBB19_2: ; %atomicrmw.phi -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; GFX940-NEXT: .LBB19_3: ; %atomicrmw.global -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execz .LBB19_2 -; GFX940-NEXT: .LBB19_4: ; %atomicrmw.private -; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v6, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX940-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7f8 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX942-NEXT: ; implicit-def: 
$vgpr0_vgpr1 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB19_3 +; GFX942-NEXT: ; %bb.1: ; %Flow +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB19_4 +; GFX942-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB19_2 +; GFX942-NEXT: .LBB19_4: ; %atomicrmw.private +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX942-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX942-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -3633,46 +3633,46 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: 
s_mov_b64 s[0:1], src_private_base -; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB20_3 -; GFX940-NEXT: ; %bb.1: ; %Flow -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB20_4 -; GFX940-NEXT: .LBB20_2: ; %atomicrmw.phi -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; GFX940-NEXT: .LBB20_3: ; %atomicrmw.global -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execz .LBB20_2 -; GFX940-NEXT: .LBB20_4: ; %atomicrmw.private -; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v6, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX940-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: 
s_xor_b64 s[0:1], exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB20_3 +; GFX942-NEXT: ; %bb.1: ; %Flow +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB20_4 +; GFX942-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB20_2 +; GFX942-NEXT: .LBB20_4: ; %atomicrmw.private +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX942-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX942-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -4041,42 +4041,42 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB21_3 -; 
GFX940-NEXT: ; %bb.1: ; %Flow -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB21_4 -; GFX940-NEXT: .LBB21_2: ; %atomicrmw.phi -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; GFX940-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execz .LBB21_2 -; GFX940-NEXT: .LBB21_4: ; %atomicrmw.private -; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB21_3 +; GFX942-NEXT: ; %bb.1: ; %Flow +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB21_4 +; GFX942-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: .LBB21_3: ; %atomicrmw.global +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] +; 
GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB21_2 +; GFX942-NEXT: .LBB21_4: ; %atomicrmw.private +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX942-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX942-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -4431,44 +4431,44 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7f8 -; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB22_3 -; GFX940-NEXT: ; %bb.1: ; %Flow -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB22_4 -; GFX940-NEXT: .LBB22_2: ; %atomicrmw.phi -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; GFX940-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_max_f64 v[0:1], 
v[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execz .LBB22_2 -; GFX940-NEXT: .LBB22_4: ; %atomicrmw.private -; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7f8 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB22_3 +; GFX942-NEXT: ; %bb.1: ; %Flow +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB22_4 +; GFX942-NEXT: .LBB22_2: ; %atomicrmw.phi +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: .LBB22_3: ; %atomicrmw.global +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB22_2 +; 
GFX942-NEXT: .LBB22_4: ; %atomicrmw.private +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX942-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX942-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -4837,45 +4837,45 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB23_3 -; GFX940-NEXT: ; %bb.1: ; %Flow -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB23_4 -; GFX940-NEXT: .LBB23_2: ; %atomicrmw.phi -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; GFX940-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX940-NEXT: s_andn2_saveexec_b64 
s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execz .LBB23_2 -; GFX940-NEXT: .LBB23_4: ; %atomicrmw.private -; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB23_3 +; GFX942-NEXT: ; %bb.1: ; %Flow +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB23_4 +; GFX942-NEXT: .LBB23_2: ; %atomicrmw.phi +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: .LBB23_3: ; %atomicrmw.global +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB23_2 +; GFX942-NEXT: .LBB23_4: ; %atomicrmw.private +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX942-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: 
v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX942-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -5236,45 +5236,45 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB24_3 -; GFX940-NEXT: ; %bb.1: ; %Flow -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB24_4 -; GFX940-NEXT: .LBB24_2: ; %atomicrmw.phi -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; GFX940-NEXT: .LBB24_3: ; %atomicrmw.global -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execz .LBB24_2 -; GFX940-NEXT: .LBB24_4: ; %atomicrmw.private -; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX940-NEXT: v_max_f64 v[2:3], 
v[2:3], v[2:3] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v6, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX940-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB24_3 +; GFX942-NEXT: ; %bb.1: ; %Flow +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB24_4 +; GFX942-NEXT: .LBB24_2: ; %atomicrmw.phi +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: .LBB24_3: ; %atomicrmw.global +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB24_2 +; GFX942-NEXT: .LBB24_4: ; %atomicrmw.private +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX942-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX942-NEXT: v_max_f64 v[2:3], 
v[4:5], v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -5659,45 +5659,45 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB25_3 -; GFX940-NEXT: ; %bb.1: ; %Flow -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB25_4 -; GFX940-NEXT: .LBB25_2: ; %atomicrmw.phi -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; GFX940-NEXT: .LBB25_3: ; %atomicrmw.global -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execz .LBB25_2 -; GFX940-NEXT: .LBB25_4: ; %atomicrmw.private -; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v6, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: 
v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX940-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB25_3 +; GFX942-NEXT: ; %bb.1: ; %Flow +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB25_4 +; GFX942-NEXT: .LBB25_2: ; %atomicrmw.phi +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: .LBB25_3: ; %atomicrmw.global +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB25_2 +; GFX942-NEXT: .LBB25_4: ; %atomicrmw.private +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX942-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX942-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; 
GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -6039,40 +6039,40 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB26_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 
v0 +; GFX942-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX942-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB26_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -6343,42 +6343,42 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; 
GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB27_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; 
GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX942-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB27_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -6656,43 +6656,43 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 -; 
GFX940-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB28_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX942-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; 
GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB28_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -6967,39 +6967,39 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB29_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB29_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -7262,41 +7262,41 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, 
-4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB30_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX942-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: 
Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB30_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -7566,42 +7566,42 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX940-NEXT: v_max_f16_e32 
v2, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB31_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX942-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 
v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB31_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -7859,32 +7859,32 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f16_e32 v3, v3, v2 -; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB32_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 
+; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 +; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f16_e32 v3, v5, v5 +; GFX942-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB32_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -8094,31 +8094,31 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v4, v2, v2 -; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX940-NEXT: v_and_or_b32 v2, v3, s2, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB33_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f16_e32 v4, v2, v2 +; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 +; GFX942-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v4 +; GFX942-NEXT: v_and_or_b32 v2, v3, s2, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB33_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -8339,42 +8339,42 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; 
GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB34_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: s_mov_b64 
s[0:1], 0 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX942-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB34_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -8654,41 +8654,41 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start -; 
GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB35_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX942-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB35_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -8970,47 +8970,47 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB36_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB36_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: 
flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB36_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -9316,49 +9316,49 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB37_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; 
GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB37_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; 
GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB37_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -9672,50 +9672,50 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB38_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_sdwa 
v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB38_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: 
v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB38_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -10024,46 +10024,46 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: 
.LBB39_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v7, v7, v4, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB39_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_sdwa v4, 
v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX942-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v7, v7, v4, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB39_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -10359,48 +10359,48 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: 
v_lshlrev_b32_e32 v6, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB40_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB40_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX942-NEXT: 
s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB40_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -10704,49 +10704,49 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: 
v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB41_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB41_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB41_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -11039,40 +11039,40 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX940-NEXT: .LBB42_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX940-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX940-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v4, v4, v3, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX940-NEXT: v_and_or_b32 v4, v5, s3, v3 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB42_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 +; GFX942-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX942-NEXT: v_max_f32_e32 v3, v3, v2 +; 
GFX942-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v4, v4, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX942-NEXT: v_and_or_b32 v4, v5, s3, v3 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB42_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -11323,39 +11323,39 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX940-NEXT: .LBB43_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v5, v5, v2, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: 
v_cndmask_b32_e32 v2, v5, v6, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, s3, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB43_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 +; GFX942-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v5, v5, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB43_1 
+; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -11617,49 +11617,49 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] 
sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB44_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], 
v[6:7] sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB44_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -11974,48 +11974,48 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB45_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: 
v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB45_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; 
GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB45_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -12303,31 +12303,31 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB46_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 
-; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX942-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB46_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -12538,31 +12538,31 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 
sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB47_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX942-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB47_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -12776,38 +12776,38 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: 
; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX940-NEXT: flat_load_dword v0, v[0:1] -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v1, v2, v2 -; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_pk_max_f16 v0, v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v2, v0, v1 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB48_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc +; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_pk_max_f16 v1, v2, v2 +; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: 
Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_pk_max_f16 v0, v3, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_max_f16 v2, v0, v1 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB48_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -13031,30 +13031,30 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB49_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 
s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX942-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB49_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -13257,30 +13257,30 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 -; 
GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB50_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX942-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB50_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -13490,36 +13490,36 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 
v4, vcc, 0xfffff800, v0 -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v3, v[4:5] -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB51_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v3, v[4:5] +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX942-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 
+; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB51_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -13743,31 +13743,31 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB52_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB52_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -13983,30 +13983,30 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv 
sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB53_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX942-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB53_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -14242,47 +14242,47 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: 
v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB54_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: s_movk_i32 s4, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: s_mov_b32 s5, 0x7060302 +; GFX942-NEXT: .LBB54_1: ; %atomicrmw.start +; 
GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX942-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX942-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB54_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -14588,47 +14588,47 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 
0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB55_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: s_movk_i32 s4, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: s_mov_b32 s5, 0x7060302 +; GFX942-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX942-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX942-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB55_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -14937,54 +14937,54 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; 
GFX940-NEXT: flat_load_dword v0, v[0:1] -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; GFX940-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX940-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v0, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v3, v0, s5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB56_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; 
GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc +; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX942-NEXT: s_movk_i32 s4, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: s_mov_b32 s5, 0x7060302 +; GFX942-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX942-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX942-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v6, v6, v0, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v3, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v6, v3, v0, s5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB56_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -15303,46 +15303,46 @@ define void 
@flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB57_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: +; 
GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: s_movk_i32 s4, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX942-NEXT: s_mov_b32 s5, 0x7060302 +; GFX942-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB57_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -15639,46 +15639,46 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB58_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB58_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] 
offset:2044 +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: s_movk_i32 s4, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX942-NEXT: s_mov_b32 s5, 0x7060302 +; GFX942-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB58_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -15982,52 +15982,52 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; 
%bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v3, v[4:5] -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB59_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB59_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v3, v[4:5] +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: s_movk_i32 s4, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX942-NEXT: s_mov_b32 s5, 0x7060302 +; GFX942-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB59_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -16345,47 +16345,47 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB60_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz 
.LBB60_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: s_movk_i32 s4, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: s_mov_b32 s5, 0x7060302 +; GFX942-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX942-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX942-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB60_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -16696,46 +16696,46 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB61_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: 
s_cbranch_execnz .LBB61_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: s_movk_i32 s4, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX942-NEXT: s_mov_b32 s5, 0x7060302 +; GFX942-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB61_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: diff --git llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index 065596776cf7..723505af6cc6 100644 --- llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx940 < %s | FileCheck -check-prefix=GFX940 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s @@ -26,30 +26,30 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; 
GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB0_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB0_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -168,30 +168,30 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB1_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB1_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -316,37 +316,37 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX940-NEXT: flat_load_dword v0, v[0:1] -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v0, v1 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB2_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc +; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: 
v_max_f32_e32 v1, v2, v2 +; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_max_f32_e32 v0, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB2_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -479,29 +479,29 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB3_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: 
s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB3_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -619,29 +619,29 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 -; 
GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB4_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB4_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -766,35 +766,35 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; 
GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v3, v[4:5] -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB5_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v3, v[4:5] +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv 
sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB5_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -930,30 +930,30 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB6_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; 
GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB6_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1081,29 +1081,29 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: 
s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB7_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB7_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1230,30 +1230,30 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, 
v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB8_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB8_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -1422,30 +1422,30 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB9_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB9_1 +; GFX942-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -1568,30 +1568,30 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB10_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 +; 
GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB10_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1710,30 +1710,30 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB11_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB11_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1858,37 +1858,37 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX940-NEXT: flat_load_dword v0, v[0:1] -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v1, 
v2, v2 -; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v0, v1 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB12_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc +; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_max_f32_e32 v0, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB12_1 +; GFX942-NEXT: ; 
%bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -2021,29 +2021,29 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap 
v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB13_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -2161,29 +2161,29 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB14_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, 
v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB14_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -2308,35 +2308,35 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v3, v[4:5] -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap 
v2, v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v3, v[4:5] +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB15_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -2472,30 +2472,30 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX12-NEXT: global_inv scope:SCOPE_SYS 
; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB16_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; 
GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB16_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -2623,29 +2623,29 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This 
Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB17_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -2822,45 +2822,45 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_3 -; GFX940-NEXT: ; %bb.1: ; %Flow -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_4 -; GFX940-NEXT: .LBB18_2: ; %atomicrmw.phi -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; GFX940-NEXT: .LBB18_3: ; %atomicrmw.global -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; 
GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execz .LBB18_2 -; GFX940-NEXT: .LBB18_4: ; %atomicrmw.private -; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v6, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX940-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX940-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB18_3 +; GFX942-NEXT: ; %bb.1: ; %Flow +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB18_4 +; GFX942-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB18_2 +; GFX942-NEXT: .LBB18_4: ; %atomicrmw.private +; GFX942-NEXT: 
v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX942-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX942-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -3220,45 +3220,45 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7f8 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB19_3 -; GFX940-NEXT: ; %bb.1: ; %Flow -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB19_4 -; GFX940-NEXT: .LBB19_2: ; %atomicrmw.phi -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; GFX940-NEXT: .LBB19_3: ; %atomicrmw.global -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execz .LBB19_2 
-; GFX940-NEXT: .LBB19_4: ; %atomicrmw.private -; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v6, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX940-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX940-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7f8 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB19_3 +; GFX942-NEXT: ; %bb.1: ; %Flow +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB19_4 +; GFX942-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB19_2 +; GFX942-NEXT: .LBB19_4: ; %atomicrmw.private +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX942-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; 
GFX942-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX942-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -3633,46 +3633,46 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB20_3 -; GFX940-NEXT: ; %bb.1: ; %Flow -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB20_4 -; GFX940-NEXT: .LBB20_2: ; %atomicrmw.phi -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; GFX940-NEXT: .LBB20_3: ; %atomicrmw.global -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execz .LBB20_2 -; GFX940-NEXT: .LBB20_4: ; %atomicrmw.private -; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX940-NEXT: 
v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v6, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX940-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX940-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB20_3 +; GFX942-NEXT: ; %bb.1: ; %Flow +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB20_4 +; GFX942-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB20_2 +; GFX942-NEXT: .LBB20_4: ; %atomicrmw.private +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX942-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX942-NEXT: s_waitcnt 
vmcnt(0) +; GFX942-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX942-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -4041,42 +4041,42 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB21_3 -; GFX940-NEXT: ; %bb.1: ; %Flow -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB21_4 -; GFX940-NEXT: .LBB21_2: ; %atomicrmw.phi -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; GFX940-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execz .LBB21_2 -; GFX940-NEXT: .LBB21_4: ; %atomicrmw.private -; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX940-NEXT: v_min_f64 
v[0:1], v[0:1], v[2:3] -; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB21_3 +; GFX942-NEXT: ; %bb.1: ; %Flow +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB21_4 +; GFX942-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: .LBB21_3: ; %atomicrmw.global +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB21_2 +; GFX942-NEXT: .LBB21_4: ; %atomicrmw.private +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX942-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX942-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -4431,44 +4431,44 @@ define void 
@flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7f8 -; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB22_3 -; GFX940-NEXT: ; %bb.1: ; %Flow -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB22_4 -; GFX940-NEXT: .LBB22_2: ; %atomicrmw.phi -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; GFX940-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execz .LBB22_2 -; GFX940-NEXT: .LBB22_4: ; %atomicrmw.private -; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; 
GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7f8 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB22_3 +; GFX942-NEXT: ; %bb.1: ; %Flow +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB22_4 +; GFX942-NEXT: .LBB22_2: ; %atomicrmw.phi +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: .LBB22_3: ; %atomicrmw.global +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB22_2 +; GFX942-NEXT: .LBB22_4: ; %atomicrmw.private +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX942-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX942-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -4837,45 +4837,45 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB23_3 -; GFX940-NEXT: ; %bb.1: ; %Flow -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB23_4 -; GFX940-NEXT: .LBB23_2: ; %atomicrmw.phi -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; GFX940-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execz .LBB23_2 -; GFX940-NEXT: .LBB23_4: ; %atomicrmw.private -; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: 
v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB23_3 +; GFX942-NEXT: ; %bb.1: ; %Flow +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB23_4 +; GFX942-NEXT: .LBB23_2: ; %atomicrmw.phi +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: .LBB23_3: ; %atomicrmw.global +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB23_2 +; GFX942-NEXT: .LBB23_4: ; %atomicrmw.private +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX942-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX942-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -5236,45 +5236,45 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v1 
-; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB24_3 -; GFX940-NEXT: ; %bb.1: ; %Flow -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB24_4 -; GFX940-NEXT: .LBB24_2: ; %atomicrmw.phi -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; GFX940-NEXT: .LBB24_3: ; %atomicrmw.global -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execz .LBB24_2 -; GFX940-NEXT: .LBB24_4: ; %atomicrmw.private -; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v6, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX940-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX940-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_xor_b64 s[0:1], exec, 
s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB24_3 +; GFX942-NEXT: ; %bb.1: ; %Flow +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB24_4 +; GFX942-NEXT: .LBB24_2: ; %atomicrmw.phi +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: .LBB24_3: ; %atomicrmw.global +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB24_2 +; GFX942-NEXT: .LBB24_4: ; %atomicrmw.private +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX942-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX942-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -5659,45 +5659,45 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX940-NEXT: 
s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB25_3 -; GFX940-NEXT: ; %bb.1: ; %Flow -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB25_4 -; GFX940-NEXT: .LBB25_2: ; %atomicrmw.phi -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; GFX940-NEXT: .LBB25_3: ; %atomicrmw.global -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execz .LBB25_2 -; GFX940-NEXT: .LBB25_4: ; %atomicrmw.private -; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v6, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX940-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX940-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB25_3 +; GFX942-NEXT: ; %bb.1: ; %Flow +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: 
s_cbranch_execnz .LBB25_4 +; GFX942-NEXT: .LBB25_2: ; %atomicrmw.phi +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: .LBB25_3: ; %atomicrmw.global +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB25_2 +; GFX942-NEXT: .LBB25_4: ; %atomicrmw.private +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX942-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX942-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -6039,40 +6039,40 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 
-; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX940-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB26_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX942-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB26_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -6343,42 +6343,42 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX940-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; 
GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB27_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX942-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB27_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -6656,43 +6656,43 @@ define half 
@flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX940-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB28_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_mov_b32 s1, 
-1 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX942-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB28_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -6967,39 +6967,39 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 -; 
GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_min_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB29_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX942-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: 
v_and_or_b32 v4, v5, v6, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB29_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -7262,41 +7262,41 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; 
GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB30_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX942-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB30_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -7566,42 +7566,42 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB31_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX942-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB31_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -7859,32 +7859,32 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; 
GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f16_e32 v3, v3, v2 -; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB32_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 +; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f16_e32 v3, v5, v5 +; GFX942-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB32_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, 
s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -8094,31 +8094,31 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v4, v2, v2 -; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX940-NEXT: v_and_or_b32 v2, v3, s2, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB33_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f16_e32 v4, v2, v2 +; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 +; GFX942-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f16_e32 v2, v2, v4 +; GFX942-NEXT: v_and_or_b32 v2, v3, s2, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB33_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -8339,42 +8339,42 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX940-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v5, 
v3, v5 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB34_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX942-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz 
.LBB34_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -8654,41 +8654,41 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB35_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; 
GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX942-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB35_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -8970,47 +8970,47 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB36_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB36_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_and_b32_e32 v0, 
-4, v3 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB36_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -9316,49 +9316,49 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB37_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB37_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB37_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -9672,50 +9672,50 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB38_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB38_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: 
v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB38_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -10024,46 +10024,46 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB39_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v7, v7, v4, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; 
GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB39_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX942-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v7, v7, v4, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 
vcc, v4, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB39_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -10359,48 +10359,48 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB40_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; 
GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB40_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; 
GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB40_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -10704,49 +10704,49 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB41_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, 
s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB41_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: 
v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB41_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -11039,40 +11039,40 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX940-NEXT: .LBB42_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX940-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX940-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v4, v4, v3, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc -; GFX940-NEXT: 
v_lshrrev_b32_e32 v3, 16, v3 -; GFX940-NEXT: v_and_or_b32 v4, v5, s3, v3 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB42_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 +; GFX942-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX942-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX942-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v4, v4, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX942-NEXT: v_and_or_b32 v4, v5, s3, v3 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB42_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -11323,39 +11323,39 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX940-NEXT: .LBB43_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v5, v5, v2, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, s3, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB43_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 +; GFX942-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v5, v5, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB43_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -11617,49 +11617,49 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; 
GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB44_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe 
+; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB44_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -11974,48 +11974,48 @@ define void 
@flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB45_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB45_1 -; 
GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB45_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -12303,31 +12303,31 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB46_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX942-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_min_f16 v4, v3, 
v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB46_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -12538,31 +12538,31 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB47_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; 
GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX942-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB47_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -12776,38 +12776,38 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX940-NEXT: flat_load_dword v0, v[0:1] -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v1, v2, v2 -; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start -; GFX940-NEXT: ; 
=>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_pk_max_f16 v0, v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v2, v0, v1 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB48_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc +; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_pk_max_f16 v1, v2, v2 +; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_pk_max_f16 v0, v3, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_min_f16 v2, v0, v1 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB48_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -13031,30 +13031,30 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB49_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX942-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB49_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -13257,30 +13257,30 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB50_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX942-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB50_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -13490,36 +13490,36 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v3, v[4:5] -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v2, 
v2, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB51_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v3, v[4:5] +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX942-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB51_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -13743,31 +13743,31 @@ define <2 x half> 
@flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB52_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB52_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -13983,30 +13983,30 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB53_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] 
offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX942-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB53_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -14242,47 +14242,47 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, 
v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB54_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: s_movk_i32 s4, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: s_mov_b32 s5, 0x7060302 +; GFX942-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX942-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX942-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX942-NEXT: 
v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB54_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -14588,47 +14588,47 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB55_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: s_movk_i32 s4, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: s_mov_b32 s5, 0x7060302 +; GFX942-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX942-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX942-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], 
v3, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB55_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -14937,54 +14937,54 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX940-NEXT: flat_load_dword v0, v[0:1] -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX940-NEXT: 
v_and_b32_e32 v3, 0xffff0000, v7 -; GFX940-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX940-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v0, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v3, v0, s5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB56_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc +; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX942-NEXT: s_movk_i32 s4, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: s_mov_b32 s5, 0x7060302 +; GFX942-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX942-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX942-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v6, v6, v0, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v3, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v6, v3, v0, s5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB56_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -15303,46 +15303,46 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This 
Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB57_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: s_movk_i32 s4, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX942-NEXT: s_mov_b32 s5, 0x7060302 +; GFX942-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v6, 
v6, v5 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB57_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -15639,46 +15639,46 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB58_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 
-; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB58_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: s_movk_i32 s4, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX942-NEXT: s_mov_b32 s5, 0x7060302 +; GFX942-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, 
v2 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB58_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -15982,52 +15982,52 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v3, v[4:5] -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB59_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: 
Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB59_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v3, v[4:5] +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: s_movk_i32 s4, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX942-NEXT: s_mov_b32 s5, 0x7060302 +; GFX942-NEXT: .LBB59_1: ; %atomicrmw.start +; 
GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB59_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -16345,47 +16345,47 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; 
GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB60_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB60_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: s_movk_i32 s4, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: s_mov_b32 s5, 0x7060302 +; GFX942-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX942-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX942-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB60_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -16696,46 +16696,46 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; 
GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB61_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB61_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: s_movk_i32 s4, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX942-NEXT: s_mov_b32 s5, 0x7060302 +; GFX942-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB61_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: diff --git llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index cd1a16134666..35771e49f387 100644 --- llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx940 < %s | FileCheck -check-prefix=GFX940 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s ; RUN: llc 
-mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s @@ -43,28 +43,28 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_sub_f32_e32 v4, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB0_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; 
GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB0_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_f32: ; GFX11: ; %bb.0: @@ -238,28 +238,28 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_sub_f32_e32 v4, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB1_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: 
flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB1_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos: ; GFX11: ; %bb.0: @@ -437,35 +437,35 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX940-NEXT: flat_load_dword v0, v[0:1] -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB2_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
flat_agent_atomic_fsub_ret_f32__offset12b_neg: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc +; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB2_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg: ; GFX11: ; %bb.0: @@ -652,27 +652,27 @@ define void @flat_agent_atomic_fsub_noret_f32(ptr %ptr, float %val) #0 { ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_sub_f32_e32 v4, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 -; GFX940-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB3_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_noret_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB3_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_f32: ; GFX11: ; %bb.0: @@ -837,27 +837,27 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %va ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_sub_f32_e32 v4, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; 
GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB4_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB4_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos: ; GFX11: ; %bb.0: @@ -1029,33 +1029,33 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, 
vcc -; GFX940-NEXT: flat_load_dword v5, v[4:5] -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_sub_f32_e32 v4, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB5_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v5, v[4:5] +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB5_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] 
+; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg: ; GFX11: ; %bb.0: @@ -1242,28 +1242,28 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_sub_f32_e32 v4, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB6_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 
s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB6_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos: ; GFX11: ; %bb.0: @@ -1442,27 +1442,27 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %v ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_sub_f32_e32 v4, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB7_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] 
offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB7_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos: ; GFX11: ; %bb.0: @@ -1642,28 +1642,28 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__ftz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_sub_f32_e32 v4, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB8_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32__ftz: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: 
v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB8_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_f32__ftz: ; GFX11: ; %bb.0: @@ -1837,28 +1837,28 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_sub_f32_e32 v4, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB9_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; 
GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB9_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: ; GFX11: ; %bb.0: @@ -2036,35 +2036,35 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX940-NEXT: flat_load_dword v0, v[0:1] -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; 
GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB10_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc +; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB10_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: ; GFX11: ; %bb.0: @@ -2251,27 +2251,27 @@ define void @flat_agent_atomic_fsub_noret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__ftz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; 
GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_sub_f32_e32 v4, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB11_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_noret_f32__ftz: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB11_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_f32__ftz: ; GFX11: ; %bb.0: @@ -2436,27 +2436,27 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: -; GFX940: ; %bb.0: -; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_sub_f32_e32 v4, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB12_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB12_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: ; GFX11: ; %bb.0: @@ -2628,33 +2628,33 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa ; GFX12-NEXT: s_or_b32 
exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v5, v[4:5] -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_sub_f32_e32 v4, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v5, v[4:5] +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] 
sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB13_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: ; GFX11: ; %bb.0: @@ -2841,28 +2841,28 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_sub_f32_e32 v4, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB14_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB14_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz: ; GFX11: ; %bb.0: @@ -3041,27 +3041,27 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, flo ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_sub_f32_e32 v4, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz: +; GFX942: ; %bb.0: +; 
GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB15_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz: ; GFX11: ; %bb.0: @@ -3266,52 +3266,52 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_ret_f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 -; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX940-NEXT: s_cbranch_execz .LBB16_4 -; GFX940-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: .LBB16_2: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX940-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap_x2 v[4:5], 
v[0:1], v[4:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB16_2 -; GFX940-NEXT: ; %bb.3: ; %Flow -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX940-NEXT: .LBB16_4: ; %Flow3 -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execz .LBB16_6 -; GFX940-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX940-NEXT: scratch_load_dwordx2 v[4:5], v6, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] -; GFX940-NEXT: scratch_store_dwordx2 v6, v[0:1], off sc0 sc1 -; GFX940-NEXT: .LBB16_6: ; %atomicrmw.phi -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_ret_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB16_4 +; GFX942-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX942-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] +; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: 
flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB16_2 +; GFX942-NEXT: ; %bb.3: ; %Flow +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: .LBB16_4: ; %Flow3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB16_6 +; GFX942-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[4:5], v6, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v6, v[0:1], off sc0 sc1 +; GFX942-NEXT: .LBB16_6: ; %atomicrmw.phi +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_f64: ; GFX11: ; %bb.0: @@ -3682,56 +3682,56 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7f8 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_3 -; GFX940-NEXT: ; 
%bb.1: ; %Flow3 -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_6 -; GFX940-NEXT: .LBB17_2: ; %atomicrmw.phi -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; GFX940-NEXT: .LBB17_3: ; %atomicrmw.global -; GFX940-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: .LBB17_4: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[0:1] -; GFX940-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB17_4 -; GFX940-NEXT: ; %bb.5: ; %Flow -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execz .LBB17_2 -; GFX940-NEXT: .LBB17_6: ; %atomicrmw.private -; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] -; GFX940-NEXT: scratch_store_dwordx2 v4, v[2:3], off sc0 sc1 -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7f8 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base 
+; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB17_3 +; GFX942-NEXT: ; %bb.1: ; %Flow3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB17_6 +; GFX942-NEXT: .LBB17_2: ; %atomicrmw.phi +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: .LBB17_3: ; %atomicrmw.global +; GFX942-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: .LBB17_4: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[0:1] +; GFX942-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB17_4 +; GFX942-NEXT: ; %bb.5: ; %Flow +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB17_2 +; GFX942-NEXT: .LBB17_6: ; %atomicrmw.private +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v4, v[2:3], off sc0 sc1 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
flat_agent_atomic_fsub_ret_f64__offset12b_pos: ; GFX11: ; %bb.0: @@ -4129,57 +4129,57 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_3 -; GFX940-NEXT: ; %bb.1: ; %Flow3 -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_6 -; GFX940-NEXT: .LBB18_2: ; %atomicrmw.phi -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; GFX940-NEXT: .LBB18_3: ; %atomicrmw.global -; GFX940-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: .LBB18_4: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[0:1] -; GFX940-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB18_4 -; GFX940-NEXT: ; %bb.5: ; %Flow -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 
-; GFX940-NEXT: s_cbranch_execz .LBB18_2 -; GFX940-NEXT: .LBB18_6: ; %atomicrmw.private -; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] -; GFX940-NEXT: scratch_store_dwordx2 v4, v[2:3], off sc0 sc1 -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB18_3 +; GFX942-NEXT: ; %bb.1: ; %Flow3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB18_6 +; GFX942-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX942-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: .LBB18_4: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[0:1] +; GFX942-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: s_andn2_b64 exec, 
exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB18_4 +; GFX942-NEXT: ; %bb.5: ; %Flow +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB18_2 +; GFX942-NEXT: .LBB18_6: ; %atomicrmw.private +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v4, v[2:3], off sc0 sc1 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: ; GFX11: ; %bb.0: @@ -4570,53 +4570,53 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB19_3 -; GFX940-NEXT: ; %bb.1: ; %Flow3 -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB19_6 -; GFX940-NEXT: .LBB19_2: ; %atomicrmw.phi -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; GFX940-NEXT: .LBB19_3: ; %atomicrmw.global -; GFX940-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: .LBB19_4: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; 
GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB19_4 -; GFX940-NEXT: ; %bb.5: ; %Flow -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execz .LBB19_2 -; GFX940-NEXT: .LBB19_6: ; %atomicrmw.private -; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] -; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_noret_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB19_3 +; GFX942-NEXT: ; %bb.1: ; %Flow3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB19_6 +; GFX942-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX942-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: .LBB19_4: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB19_4 +; GFX942-NEXT: ; %bb.5: ; %Flow +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB19_2 +; GFX942-NEXT: .LBB19_6: ; %atomicrmw.private +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_f64: ; GFX11: ; %bb.0: @@ -4991,55 +4991,55 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7f8 -; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB20_3 -; GFX940-NEXT: ; %bb.1: ; 
%Flow3 -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB20_6 -; GFX940-NEXT: .LBB20_2: ; %atomicrmw.phi -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; GFX940-NEXT: .LBB20_3: ; %atomicrmw.global -; GFX940-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: .LBB20_4: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB20_4 -; GFX940-NEXT: ; %bb.5: ; %Flow -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execz .LBB20_2 -; GFX940-NEXT: .LBB20_6: ; %atomicrmw.private -; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] -; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7f8 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; 
GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB20_3 +; GFX942-NEXT: ; %bb.1: ; %Flow3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB20_6 +; GFX942-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX942-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: .LBB20_4: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB20_4 +; GFX942-NEXT: ; %bb.5: ; %Flow +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB20_2 +; GFX942-NEXT: .LBB20_6: ; %atomicrmw.private +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos: ; GFX11: ; %bb.0: @@ 
-5428,56 +5428,56 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB21_3 -; GFX940-NEXT: ; %bb.1: ; %Flow3 -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB21_6 -; GFX940-NEXT: .LBB21_2: ; %atomicrmw.phi -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; GFX940-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX940-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: .LBB21_4: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB21_4 -; GFX940-NEXT: ; %bb.5: ; %Flow -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_cbranch_execz .LBB21_2 -; GFX940-NEXT: .LBB21_6: ; %atomicrmw.private -; GFX940-NEXT: 
v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] -; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB21_3 +; GFX942-NEXT: ; %bb.1: ; %Flow3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB21_6 +; GFX942-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: .LBB21_3: ; %atomicrmw.global +; GFX942-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: .LBB21_4: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB21_4 +; GFX942-NEXT: ; %bb.5: ; %Flow +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: 
; implicit-def: $vgpr0_vgpr1 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB21_2 +; GFX942-NEXT: .LBB21_6: ; %atomicrmw.private +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX942-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg: ; GFX11: ; %bb.0: @@ -5852,38 +5852,38 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: flat_load_dword v4, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_sub_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[0:1], 
vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB22_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_ret_f16: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX942-NEXT: flat_load_dword v4, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX942-NEXT: v_sub_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB22_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_f16: ; GFX11: ; %bb.0: @@ -6142,40 +6142,40 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: flat_load_dword v4, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB23_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_sub_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB23_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v7 +; GFX942-NEXT: flat_load_dword v4, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB23_1: ; %atomicrmw.start +; 
GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX942-NEXT: v_sub_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB23_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos: ; GFX11: ; %bb.0: @@ -6441,41 +6441,41 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: flat_load_dword v4, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB24_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_sub_f16_e32 v4, v4, v2 -; GFX940-NEXT: 
v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB24_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v7 +; GFX942-NEXT: flat_load_dword v4, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX942-NEXT: v_sub_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB24_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, 
s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg: ; GFX11: ; %bb.0: @@ -6737,37 +6737,37 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB25_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_sub_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB25_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_noret_f16: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; 
GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_sub_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB25_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_f16: ; GFX11: ; %bb.0: @@ -7016,39 +7016,39 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: 
v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_sub_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB26_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_sub_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB26_1 +; GFX942-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos: ; GFX11: ; %bb.0: @@ -7304,40 +7304,40 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_sub_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB27_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_movk_i32 s0, 
0xf800 +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_sub_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB27_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg: ; GFX11: ; %bb.0: @@ -7582,30 +7582,30 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, 
v3 -; GFX940-NEXT: v_sub_f16_e32 v3, v5, v2 -; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB28_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 +; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_sub_f16_e32 v3, v5, v2 +; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB28_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4: ; GFX11: ; %bb.0: @@ -7802,29 +7802,29 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2046 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_sub_f16_e32 v3, v5, v2 -; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB29_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2046 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 +; GFX942-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_sub_f16_e32 v3, v5, v2 +; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB29_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, 
s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: ; GFX11: ; %bb.0: @@ -8032,40 +8032,40 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: flat_load_dword v4, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_sub_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB30_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: 
v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v7 +; GFX942-NEXT: flat_load_dword v4, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX942-NEXT: v_sub_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB30_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos: ; GFX11: ; %bb.0: @@ -8332,39 +8332,39 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; 
GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_sub_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB31_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_sub_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] 
sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB31_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos: ; GFX11: ; %bb.0: @@ -8635,47 +8635,47 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: 
v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB32_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_ret_bf16: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB32_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_bf16: ; GFX11: ; %bb.0: @@ -8980,49 +8980,49 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB33_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB33_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos: ; GFX11: ; %bb.0: @@ -9335,50 +9335,50 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 
-; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB34_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; 
GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB34_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg: ; GFX11: ; %bb.0: @@ -9686,46 +9686,46 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX940-NEXT: 
v_bfe_u32 v7, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v7, v7, v4, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB35_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_noret_bf16: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX942-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v7, v7, v4, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: 
v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB35_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_bf16: ; GFX11: ; %bb.0: @@ -10020,48 +10020,48 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB36_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB36_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, 
s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB36_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos: ; GFX11: ; %bb.0: @@ -10364,49 +10364,49 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB37_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB37_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 
+; GFX942-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB37_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg: ; GFX11: ; %bb.0: @@ -10698,40 +10698,40 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX940-NEXT: .LBB38_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX940-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX940-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v4, v4, v3, s2 -; GFX940-NEXT: 
v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX940-NEXT: v_and_or_b32 v4, v5, s3, v3 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB38_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 +; GFX942-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX942-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX942-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v4, v4, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX942-NEXT: v_and_or_b32 v4, v5, s3, v3 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; 
GFX942-NEXT: s_cbranch_execnz .LBB38_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: ; GFX11: ; %bb.0: @@ -10981,39 +10981,39 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX940-NEXT: .LBB39_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v5, v5, v2, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, s3, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB39_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 +; GFX942-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v5, v5, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB39_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: ; GFX11: ; %bb.0: @@ -11274,49 +11274,49 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; 
GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB40_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB40_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 
v1, v5 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB40_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos: ; GFX11: ; %bb.0: @@ -11630,48 +11630,48 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos: 
-; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB41_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB41_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB41_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos: ; GFX11: ; %bb.0: @@ -11956,28 +11956,28 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; 
GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB42_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB42_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_ret_v2f16: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB42_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX11-LABEL: flat_agent_atomic_fsub_ret_v2f16: ; GFX11: ; %bb.0: @@ -12176,28 +12176,28 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_pos: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB43_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB43_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_pos: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; 
GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB43_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_pos: ; GFX11: ; %bb.0: @@ -12399,35 +12399,35 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX940-NEXT: flat_load_dword v0, v[0:1] -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_pk_add_f16 v0, v1, v2 neg_lo:[0,1] neg_hi:[0,1] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB44_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 
+; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc +; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_pk_add_f16 v0, v1, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB44_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg: ; GFX11: ; %bb.0: @@ -12638,27 +12638,27 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 { ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB45_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, 
s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB45_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_noret_v2f16: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB45_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2f16: ; GFX11: ; %bb.0: @@ -12847,27 +12847,27 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_pos: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 -; 
GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB46_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_pos: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB46_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_pos: ; GFX11: ; %bb.0: @@ -13063,33 +13063,33 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_neg: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword 
v5, v[4:5] -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB47_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_neg: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v5, v[4:5] +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB47_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 
exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_neg: ; GFX11: ; %bb.0: @@ -13300,28 +13300,28 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fsub_ret_v2f16__offset12b_pos: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB48_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fsub_ret_v2f16__offset12b_pos: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 
sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB48_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fsub_ret_v2f16__offset12b_pos: ; GFX11: ; %bb.0: @@ -13524,27 +13524,27 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fsub_noret_v2f16__offset12b_pos: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB49_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fsub_noret_v2f16__offset12b_pos: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_add_f16 
v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB49_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fsub_noret_v2f16__offset12b_pos: ; GFX11: ; %bb.0: @@ -13769,47 +13769,47 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2bf16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; 
GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB50_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_ret_v2bf16: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: s_movk_i32 s4, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: s_mov_b32 s5, 0x7060302 +; GFX942-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX942-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB50_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_v2bf16: ; GFX11: ; %bb.0: @@ -14115,47 +14115,47 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB51_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: s_movk_i32 s4, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: s_mov_b32 s5, 0x7060302 +; GFX942-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX942-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: s_andn2_b64 exec, 
exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB51_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos: ; GFX11: ; %bb.0: @@ -14464,54 +14464,54 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX940-NEXT: flat_load_dword v0, v[0:1] -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; GFX940-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX940-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v0, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; 
GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v3, v0, s5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB52_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc +; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX942-NEXT: s_movk_i32 s4, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: s_mov_b32 s5, 0x7060302 +; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX942-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX942-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v6, v6, v0, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v3, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 
+; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v6, v3, v0, s5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB52_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg: ; GFX11: ; %bb.0: @@ -14830,46 +14830,46 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 -; GFX940-NEXT: s_nop 0 -; 
GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB53_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_noret_v2bf16: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: s_movk_i32 s4, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX942-NEXT: s_mov_b32 s5, 0x7060302 +; GFX942-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX942-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB53_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2bf16: ; GFX11: ; %bb.0: @@ -15166,46 +15166,46 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 -; 
GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB54_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: s_movk_i32 s4, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX942-NEXT: s_mov_b32 s5, 0x7060302 +; GFX942-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX942-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, 
exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB54_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX11: ; %bb.0: @@ -15509,52 +15509,52 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v3, v[4:5] -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB55_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v3, v[4:5] +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: s_movk_i32 s4, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX942-NEXT: s_mov_b32 s5, 0x7060302 +; GFX942-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX942-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: buffer_wbl2 sc1 +; 
GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB55_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg: ; GFX11: ; %bb.0: @@ -15872,47 +15872,47 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: 
v_perm_b32 v6, v5, v3, s5 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB56_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: s_movk_i32 s4, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: s_mov_b32 s5, 0x7060302 +; GFX942-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX942-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB56_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos: ; GFX11: ; %bb.0: @@ -16223,46 +16223,46 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 
sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB57_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: s_mov_b64 s[2:3], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: s_movk_i32 s4, 0x7fff +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX942-NEXT: s_mov_b32 s5, 0x7060302 +; GFX942-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX942-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; 
GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB57_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX11: ; %bb.0: diff --git llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll index ef3657433e8b..346b69c362c0 100644 --- llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll +++ llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx940 < %s | FileCheck %s -check-prefixes=GFX940-SDAG -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx940 < %s | FileCheck %s -check-prefixes=GFX940-GISEL +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s -check-prefixes=GFX942-SDAG +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s -check-prefixes=GFX942-GISEL ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GFX11-SDAG ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GFX11-GISEL ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s -check-prefixes=GFX12-SDAG @@ -12,46 +12,46 @@ declare i32 @llvm.amdgcn.workitem.id.x() define amdgpu_kernel void @soff1_voff1(i32 %soff) { -; GFX940-SDAG-LABEL: soff1_voff1: -; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-SDAG-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX940-SDAG-NEXT: v_add_u32_e32 v2, 1, v0 -; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 -; GFX940-SDAG-NEXT: scratch_store_byte v2, v1, off sc0 sc1 -; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 -; 
GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 -; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 -; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: soff1_voff1: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX942-SDAG-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX942-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 +; GFX942-SDAG-NEXT: scratch_store_byte v2, v1, off sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX942-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: s_endpgm ; -; GFX940-GISEL-LABEL: soff1_voff1: -; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-GISEL-NEXT: s_add_u32 s0, 0, s0 -; GFX940-GISEL-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX940-GISEL-NEXT: v_add_u32_e32 v2, 1, v0 -; GFX940-GISEL-NEXT: v_add_u32_e32 v3, 2, v0 -; GFX940-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1 -; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 2 -; GFX940-GISEL-NEXT: scratch_store_byte v3, v1, off sc0 sc1 -; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-GISEL-NEXT: scratch_store_byte v0, v1, off sc0 sc1 -; 
GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: s_endpgm +; GFX942-GISEL-LABEL: soff1_voff1: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: s_add_u32 s0, 0, s0 +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX942-GISEL-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX942-GISEL-NEXT: v_add_u32_e32 v3, 2, v0 +; GFX942-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GFX942-GISEL-NEXT: scratch_store_byte v3, v1, off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX942-GISEL-NEXT: scratch_store_byte v0, v1, off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: soff1_voff1: ; GFX11-SDAG: ; %bb.0: ; %bb @@ -140,48 +140,48 @@ bb: } define amdgpu_kernel void @soff1_voff2(i32 %soff) { -; GFX940-SDAG-LABEL: soff1_voff2: -; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v2 -; GFX940-SDAG-NEXT: v_add_u32_e32 v2, 1, v0 -; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 -; GFX940-SDAG-NEXT: scratch_store_byte v2, v1, off sc0 sc1 -; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 -; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 -; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 -; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: 
s_endpgm +; GFX942-SDAG-LABEL: soff1_voff2: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v2 +; GFX942-SDAG-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX942-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 +; GFX942-SDAG-NEXT: scratch_store_byte v2, v1, off sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX942-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: s_endpgm ; -; GFX940-GISEL-LABEL: soff1_voff2: -; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-GISEL-NEXT: s_add_u32 s0, 0, s0 -; GFX940-GISEL-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX940-GISEL-NEXT: v_add_u32_e32 v2, 1, v0 -; GFX940-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1 -; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 2 -; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 -; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-GISEL-NEXT: scratch_store_byte v0, v1, off sc0 sc1 -; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: s_endpgm +; GFX942-GISEL-LABEL: soff1_voff2: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX942-GISEL-NEXT: 
v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: s_add_u32 s0, 0, s0 +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX942-GISEL-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX942-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: v_add_u32_e32 v1, 2, v0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 2 +; GFX942-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX942-GISEL-NEXT: scratch_store_byte v0, v1, off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: soff1_voff2: ; GFX11-SDAG: ; %bb.0: ; %bb @@ -276,48 +276,48 @@ bb: } define amdgpu_kernel void @soff1_voff4(i32 %soff) { -; GFX940-SDAG-LABEL: soff1_voff4: -; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v2 -; GFX940-SDAG-NEXT: v_add_u32_e32 v2, 1, v0 -; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 -; GFX940-SDAG-NEXT: scratch_store_byte v2, v1, off sc0 sc1 -; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 -; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 -; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 -; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: soff1_voff4: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 
0x3ff, v0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX942-SDAG-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX942-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 +; GFX942-SDAG-NEXT: scratch_store_byte v2, v1, off sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX942-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: s_endpgm ; -; GFX940-GISEL-LABEL: soff1_voff4: -; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-GISEL-NEXT: s_add_u32 s0, 0, s0 -; GFX940-GISEL-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX940-GISEL-NEXT: v_add_u32_e32 v2, 1, v0 -; GFX940-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1 -; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 2 -; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 -; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-GISEL-NEXT: scratch_store_byte v0, v1, off sc0 sc1 -; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: s_endpgm +; GFX942-GISEL-LABEL: soff1_voff4: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX942-GISEL-NEXT: s_add_u32 s0, 0, s0 +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX942-GISEL-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX942-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: v_add_u32_e32 v1, 2, v0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 2 +; GFX942-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX942-GISEL-NEXT: scratch_store_byte v0, v1, off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: soff1_voff4: ; GFX11-SDAG: ; %bb.0: ; %bb @@ -412,48 +412,48 @@ bb: } define amdgpu_kernel void @soff2_voff1(i32 %soff) { -; GFX940-SDAG-LABEL: soff2_voff1: -; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX940-SDAG-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX940-SDAG-NEXT: v_add_u32_e32 v2, 1, v0 -; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 -; GFX940-SDAG-NEXT: scratch_store_byte v2, v1, off sc0 sc1 -; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 -; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 -; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 -; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: soff2_voff1: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 1 +; GFX942-SDAG-NEXT: v_add_u32_e32 
v0, s0, v0 +; GFX942-SDAG-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX942-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 +; GFX942-SDAG-NEXT: scratch_store_byte v2, v1, off sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX942-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: s_endpgm ; -; GFX940-GISEL-LABEL: soff2_voff1: -; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-GISEL-NEXT: s_lshl_b32 s0, s0, 1 -; GFX940-GISEL-NEXT: s_add_u32 s0, 0, s0 -; GFX940-GISEL-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX940-GISEL-NEXT: v_add_u32_e32 v2, 1, v0 -; GFX940-GISEL-NEXT: v_add_u32_e32 v3, 2, v0 -; GFX940-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1 -; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 2 -; GFX940-GISEL-NEXT: scratch_store_byte v3, v1, off sc0 sc1 -; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-GISEL-NEXT: scratch_store_byte v0, v1, off sc0 sc1 -; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: s_endpgm +; GFX942-GISEL-LABEL: soff2_voff1: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: s_lshl_b32 s0, s0, 1 +; GFX942-GISEL-NEXT: s_add_u32 s0, 0, s0 +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX942-GISEL-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX942-GISEL-NEXT: v_add_u32_e32 v3, 2, v0 +; 
GFX942-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GFX942-GISEL-NEXT: scratch_store_byte v3, v1, off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX942-GISEL-NEXT: scratch_store_byte v0, v1, off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: soff2_voff1: ; GFX11-SDAG: ; %bb.0: ; %bb @@ -547,49 +547,49 @@ bb: } define amdgpu_kernel void @soff2_voff2(i32 %soff) { -; GFX940-SDAG-LABEL: soff2_voff2: -; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v2 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1 -; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 -; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 -; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: soff2_voff2: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 1 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v2 +; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; 
GFX942-SDAG-NEXT: v_add_u32_e32 v1, 2, v0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX942-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: s_endpgm ; -; GFX940-GISEL-LABEL: soff2_voff2: -; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-GISEL-NEXT: s_lshl_b32 s0, s0, 1 -; GFX940-GISEL-NEXT: s_add_u32 s0, 0, s0 -; GFX940-GISEL-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX940-GISEL-NEXT: v_add_u32_e32 v2, 1, v0 -; GFX940-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1 -; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 2 -; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 -; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-GISEL-NEXT: scratch_store_byte v0, v1, off sc0 sc1 -; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: s_endpgm +; GFX942-GISEL-LABEL: soff2_voff2: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: s_lshl_b32 s0, s0, 1 +; GFX942-GISEL-NEXT: s_add_u32 s0, 0, s0 +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX942-GISEL-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX942-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; 
GFX942-GISEL-NEXT: v_add_u32_e32 v1, 2, v0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 2 +; GFX942-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX942-GISEL-NEXT: scratch_store_byte v0, v1, off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: soff2_voff2: ; GFX11-SDAG: ; %bb.0: ; %bb @@ -688,49 +688,49 @@ bb: } define amdgpu_kernel void @soff2_voff4(i32 %soff) { -; GFX940-SDAG-LABEL: soff2_voff4: -; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v2 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1 -; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 -; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 -; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: soff2_voff4: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 1 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: v_add_u32_e32 v1, 2, v0 +; 
GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX942-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: s_endpgm ; -; GFX940-GISEL-LABEL: soff2_voff4: -; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-GISEL-NEXT: s_lshl_b32 s0, s0, 1 -; GFX940-GISEL-NEXT: s_add_u32 s0, 0, s0 -; GFX940-GISEL-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX940-GISEL-NEXT: v_add_u32_e32 v2, 1, v0 -; GFX940-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1 -; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 2 -; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 -; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-GISEL-NEXT: scratch_store_byte v0, v1, off sc0 sc1 -; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: s_endpgm +; GFX942-GISEL-LABEL: soff2_voff4: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: s_lshl_b32 s0, s0, 1 +; GFX942-GISEL-NEXT: s_add_u32 s0, 0, s0 +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX942-GISEL-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX942-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: v_add_u32_e32 v1, 2, v0 +; 
GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 2 +; GFX942-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX942-GISEL-NEXT: scratch_store_byte v0, v1, off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: soff2_voff4: ; GFX11-SDAG: ; %bb.0: ; %bb @@ -829,48 +829,48 @@ bb: } define amdgpu_kernel void @soff4_voff1(i32 %soff) { -; GFX940-SDAG-LABEL: soff4_voff1: -; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX940-SDAG-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX940-SDAG-NEXT: v_add_u32_e32 v2, 1, v0 -; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 -; GFX940-SDAG-NEXT: scratch_store_byte v2, v1, off sc0 sc1 -; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 -; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 -; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 -; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: soff4_voff1: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-SDAG-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX942-SDAG-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX942-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 +; GFX942-SDAG-NEXT: scratch_store_byte v2, v1, off sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX942-SDAG-NEXT: scratch_store_byte 
v3, v1, off sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: s_endpgm ; -; GFX940-GISEL-LABEL: soff4_voff1: -; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-GISEL-NEXT: s_lshl_b32 s0, s0, 2 -; GFX940-GISEL-NEXT: s_add_u32 s0, 0, s0 -; GFX940-GISEL-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX940-GISEL-NEXT: v_add_u32_e32 v2, 1, v0 -; GFX940-GISEL-NEXT: v_add_u32_e32 v3, 2, v0 -; GFX940-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1 -; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 2 -; GFX940-GISEL-NEXT: scratch_store_byte v3, v1, off sc0 sc1 -; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-GISEL-NEXT: scratch_store_byte v0, v1, off sc0 sc1 -; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: s_endpgm +; GFX942-GISEL-LABEL: soff4_voff1: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-GISEL-NEXT: s_add_u32 s0, 0, s0 +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX942-GISEL-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX942-GISEL-NEXT: v_add_u32_e32 v3, 2, v0 +; GFX942-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GFX942-GISEL-NEXT: scratch_store_byte v3, v1, off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 +; 
GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX942-GISEL-NEXT: scratch_store_byte v0, v1, off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: soff4_voff1: ; GFX11-SDAG: ; %bb.0: ; %bb @@ -964,49 +964,49 @@ bb: } define amdgpu_kernel void @soff4_voff2(i32 %soff) { -; GFX940-SDAG-LABEL: soff4_voff2: -; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v2 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1 -; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 -; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 -; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: soff4_voff2: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v2 +; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: v_add_u32_e32 v1, 2, v0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX942-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX942-SDAG-NEXT: 
scratch_store_byte v0, v1, off sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: s_endpgm ; -; GFX940-GISEL-LABEL: soff4_voff2: -; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-GISEL-NEXT: s_lshl_b32 s0, s0, 2 -; GFX940-GISEL-NEXT: s_add_u32 s0, 0, s0 -; GFX940-GISEL-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX940-GISEL-NEXT: v_add_u32_e32 v2, 1, v0 -; GFX940-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1 -; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 2 -; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 -; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-GISEL-NEXT: scratch_store_byte v0, v1, off sc0 sc1 -; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: s_endpgm +; GFX942-GISEL-LABEL: soff4_voff2: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-GISEL-NEXT: s_add_u32 s0, 0, s0 +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX942-GISEL-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX942-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: v_add_u32_e32 v1, 2, v0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 2 +; GFX942-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX942-GISEL-NEXT: 
scratch_store_byte v0, v1, off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: soff4_voff2: ; GFX11-SDAG: ; %bb.0: ; %bb @@ -1105,48 +1105,48 @@ bb: } define amdgpu_kernel void @soff4_voff4(i32 %soff) { -; GFX940-SDAG-LABEL: soff4_voff4: -; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v3, s0 -; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v3 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1 -; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, off offset:2 sc0 sc1 -; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 -; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: soff4_voff4: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, s0 +; GFX942-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v3 +; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: scratch_store_byte v0, v2, off offset:2 sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: s_endpgm ; -; GFX940-GISEL-LABEL: 
soff4_voff4: -; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-GISEL-NEXT: s_lshl_b32 s0, s0, 2 -; GFX940-GISEL-NEXT: s_add_u32 s0, 0, s0 -; GFX940-GISEL-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX940-GISEL-NEXT: v_add_u32_e32 v2, 1, v0 -; GFX940-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1 -; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 2 -; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 -; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 -; GFX940-GISEL-NEXT: scratch_store_byte v0, v1, off sc0 sc1 -; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: s_endpgm +; GFX942-GISEL-LABEL: soff4_voff4: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-GISEL-NEXT: s_add_u32 s0, 0, s0 +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX942-GISEL-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX942-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: v_add_u32_e32 v1, 2, v0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 2 +; GFX942-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX942-GISEL-NEXT: scratch_store_byte v0, v1, off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: 
soff4_voff4: ; GFX11-SDAG: ; %bb.0: ; %bb @@ -1245,29 +1245,29 @@ bb: } define amdgpu_kernel void @soff1_voff1_negative(i32 %soff) { -; GFX940-SDAG-LABEL: soff1_voff1_negative: -; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-SDAG-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX940-SDAG-NEXT: v_add_u32_e32 v0, -1, v0 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 -; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: soff1_voff1_negative: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX942-SDAG-NEXT: v_add_u32_e32 v0, -1, v0 +; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX942-SDAG-NEXT: s_endpgm ; -; GFX940-GISEL-LABEL: soff1_voff1_negative: -; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-GISEL-NEXT: s_add_u32 s0, 0, s0 -; GFX940-GISEL-NEXT: v_add3_u32 v0, s0, v0, -1 -; GFX940-GISEL-NEXT: scratch_store_byte v0, v1, off sc0 sc1 -; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: s_endpgm +; GFX942-GISEL-LABEL: soff1_voff1_negative: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: s_add_u32 s0, 0, s0 +; GFX942-GISEL-NEXT: v_add3_u32 v0, s0, v0, -1 +; GFX942-GISEL-NEXT: scratch_store_byte v0, v1, off sc0 sc1 +; 
GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: soff1_voff1_negative: ; GFX11-SDAG: ; %bb.0: ; %bb diff --git llvm/test/CodeGen/AMDGPU/flat-scratch.ll llvm/test/CodeGen/AMDGPU/flat-scratch.ll index 5415af02ef89..d44422e3ee35 100644 --- llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -4,7 +4,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX11 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX12 %s ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX9-PAL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -mattr=-promote-alloca < %s | FileCheck -check-prefixes=GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -mattr=-promote-alloca < %s | FileCheck -check-prefixes=GFX942 %s ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1010-PAL %s ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1030-PAL %s ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX11-PAL %s @@ -104,19 +104,19 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; GFX9-PAL-NEXT: s_endpgm ; -; GFX940-LABEL: zero_init_kernel: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_mov_b32 s0, 0 -; GFX940-NEXT: s_mov_b32 s1, s0 -; GFX940-NEXT: s_mov_b32 s2, s0 -; GFX940-NEXT: s_mov_b32 s3, s0 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:48 sc0 sc1 -; GFX940-NEXT: 
scratch_store_dwordx4 off, v[0:3], off offset:32 sc0 sc1 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:16 sc0 sc1 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: zero_init_kernel: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_mov_b32 s0, 0 +; GFX942-NEXT: s_mov_b32 s1, s0 +; GFX942-NEXT: s_mov_b32 s2, s0 +; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:48 sc0 sc1 +; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32 sc0 sc1 +; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:16 sc0 sc1 +; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], off sc0 sc1 +; GFX942-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: zero_init_kernel: ; GFX1010-PAL: ; %bb.0: @@ -297,21 +297,21 @@ define void @zero_init_foo() { ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: zero_init_foo: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b32 s0, 0 -; GFX940-NEXT: s_mov_b32 s1, s0 -; GFX940-NEXT: s_mov_b32 s2, s0 -; GFX940-NEXT: s_mov_b32 s3, s0 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48 sc0 sc1 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 sc0 sc1 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 sc0 sc1 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: zero_init_foo: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b32 s0, 0 +; GFX942-NEXT: s_mov_b32 s1, s0 +; GFX942-NEXT: s_mov_b32 s2, s0 +; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[2:3], 
s[2:3] +; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48 sc0 sc1 +; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 sc0 sc1 +; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 sc0 sc1 +; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], s32 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-PAL-LABEL: zero_init_foo: ; GFX10-PAL: ; %bb.0: @@ -456,19 +456,19 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; -; GFX940-LABEL: store_load_sindex_kernel: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v0, 15 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_lshl_b32 s1, s0, 2 -; GFX940-NEXT: s_and_b32 s0, s0, 15 -; GFX940-NEXT: s_lshl_b32 s0, s0, 2 -; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: store_load_sindex_kernel: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v0, 15 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_lshl_b32 s1, s0, 2 +; GFX942-NEXT: s_and_b32 s0, s0, 15 +; GFX942-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_endpgm ; ; GFX10-PAL-LABEL: store_load_sindex_kernel: ; GFX10-PAL: ; %bb.0: ; %bb @@ -604,17 +604,17 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; -; GFX940-LABEL: store_load_sindex_foo: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_lshl_b32 s1, s0, 2 -; GFX940-NEXT: v_mov_b32_e32 v0, 15 -; GFX940-NEXT: s_and_b32 s0, s0, 15 -; GFX940-NEXT: 
scratch_store_dword off, v0, s1 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_lshl_b32 s0, s0, 2 -; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: store_load_sindex_foo: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_lshl_b32 s1, s0, 2 +; GFX942-NEXT: v_mov_b32_e32 v0, 15 +; GFX942-NEXT: s_and_b32 s0, s0, 15 +; GFX942-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_endpgm ; ; GFX10-PAL-LABEL: store_load_sindex_foo: ; GFX10-PAL: ; %bb.0: ; %bb @@ -744,17 +744,17 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; -; GFX940-LABEL: store_load_vindex_kernel: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-NEXT: v_and_b32_e32 v0, 0xffc, v0 -; GFX940-NEXT: v_mov_b32_e32 v1, 15 -; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0 -; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: store_load_vindex_kernel: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX942-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX942-NEXT: v_mov_b32_e32 v1, 15 +; GFX942-NEXT: scratch_store_dword v0, v1, off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_sub_u32_e32 v0, 0, v0 +; GFX942-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_endpgm ; ; GFX10-PAL-LABEL: store_load_vindex_kernel: ; GFX10-PAL: ; %bb.0: ; %bb @@ -885,19 +885,19 @@ define void @store_load_vindex_foo(i32 %idx) { ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
store_load_vindex_foo: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s32 -; GFX940-NEXT: v_lshl_add_u32 v1, v0, 2, v1 -; GFX940-NEXT: v_mov_b32_e32 v2, 15 -; GFX940-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-NEXT: scratch_load_dword v0, v0, s32 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: store_load_vindex_foo: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s32 +; GFX942-NEXT: v_lshl_add_u32 v1, v0, 2, v1 +; GFX942-NEXT: v_mov_b32_e32 v2, 15 +; GFX942-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX942-NEXT: scratch_store_dword v1, v2, off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX942-NEXT: scratch_load_dword v0, v0, s32 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-PAL-LABEL: store_load_vindex_foo: ; GFX10-PAL: ; %bb.0: ; %bb @@ -994,13 +994,13 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) { ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: private_ptr_foo: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, 0x41200000 -; GFX940-NEXT: scratch_store_dword v0, v1, off offset:4 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: private_ptr_foo: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, 0x41200000 +; GFX942-NEXT: scratch_store_dword v0, v1, off offset:4 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-PAL-LABEL: private_ptr_foo: ; GFX10-PAL: ; %bb.0: @@ -1135,21 +1135,21 @@ define amdgpu_kernel 
void @zero_init_small_offset_kernel() { ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:304 ; GFX9-PAL-NEXT: s_endpgm ; -; GFX940-LABEL: zero_init_small_offset_kernel: -; GFX940: ; %bb.0: -; GFX940-NEXT: scratch_load_dword v0, off, off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_mov_b32 s0, 0 -; GFX940-NEXT: s_mov_b32 s1, s0 -; GFX940-NEXT: s_mov_b32 s2, s0 -; GFX940-NEXT: s_mov_b32 s3, s0 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256 sc0 sc1 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272 sc0 sc1 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288 sc0 sc1 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304 sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: zero_init_small_offset_kernel: +; GFX942: ; %bb.0: +; GFX942-NEXT: scratch_load_dword v0, off, off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_mov_b32 s0, 0 +; GFX942-NEXT: s_mov_b32 s1, s0 +; GFX942-NEXT: s_mov_b32 s2, s0 +; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256 sc0 sc1 +; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272 sc0 sc1 +; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288 sc0 sc1 +; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304 sc0 sc1 +; GFX942-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: zero_init_small_offset_kernel: ; GFX1010-PAL: ; %bb.0: @@ -1351,23 +1351,23 @@ define void @zero_init_small_offset_foo() { ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: zero_init_small_offset_foo: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: scratch_load_dword v0, off, s32 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_mov_b32 s0, 
0 -; GFX940-NEXT: s_mov_b32 s1, s0 -; GFX940-NEXT: s_mov_b32 s2, s0 -; GFX940-NEXT: s_mov_b32 s3, s0 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 sc0 sc1 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 sc0 sc1 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 sc0 sc1 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: zero_init_small_offset_foo: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: scratch_load_dword v0, off, s32 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_mov_b32 s0, 0 +; GFX942-NEXT: s_mov_b32 s1, s0 +; GFX942-NEXT: s_mov_b32 s2, s0 +; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 sc0 sc1 +; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 sc0 sc1 +; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 sc0 sc1 +; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-PAL-LABEL: zero_init_small_offset_foo: ; GFX10-PAL: ; %bb.0: @@ -1542,23 +1542,23 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; -; GFX940-LABEL: store_load_sindex_small_offset_kernel: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-NEXT: scratch_load_dword v0, off, off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, 15 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_lshl_b32 s1, s0, 2 -; GFX940-NEXT: s_and_b32 s0, s0, 15 -; GFX940-NEXT: s_addk_i32 s1, 0x100 
-; GFX940-NEXT: s_lshl_b32 s0, s0, 2 -; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_addk_i32 s0, 0x100 -; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: store_load_sindex_small_offset_kernel: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX942-NEXT: scratch_load_dword v0, off, off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 15 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_lshl_b32 s1, s0, 2 +; GFX942-NEXT: s_and_b32 s0, s0, 15 +; GFX942-NEXT: s_addk_i32 s1, 0x100 +; GFX942-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_addk_i32 s0, 0x100 +; GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: store_load_sindex_small_offset_kernel: ; GFX1010-PAL: ; %bb.0: ; %bb @@ -1759,21 +1759,21 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; -; GFX940-LABEL: store_load_sindex_small_offset_foo: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: scratch_load_dword v0, off, off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_lshl_b32 s1, s0, 2 -; GFX940-NEXT: s_and_b32 s0, s0, 15 -; GFX940-NEXT: s_addk_i32 s1, 0x100 -; GFX940-NEXT: v_mov_b32_e32 v0, 15 -; GFX940-NEXT: s_lshl_b32 s0, s0, 2 -; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_addk_i32 s0, 0x100 -; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: store_load_sindex_small_offset_foo: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: scratch_load_dword v0, off, off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; 
GFX942-NEXT: s_lshl_b32 s1, s0, 2 +; GFX942-NEXT: s_and_b32 s0, s0, 15 +; GFX942-NEXT: s_addk_i32 s1, 0x100 +; GFX942-NEXT: v_mov_b32_e32 v0, 15 +; GFX942-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_addk_i32 s0, 0x100 +; GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: store_load_sindex_small_offset_foo: ; GFX1010-PAL: ; %bb.0: ; %bb @@ -1956,19 +1956,19 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; -; GFX940-LABEL: store_load_vindex_small_offset_kernel: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: scratch_load_dword v1, off, off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-NEXT: v_and_b32_e32 v0, 0xffc, v0 -; GFX940-NEXT: v_mov_b32_e32 v1, 15 -; GFX940-NEXT: scratch_store_dword v0, v1, off offset:256 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_sub_u32_e32 v0, 0x100, v0 -; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: store_load_vindex_small_offset_kernel: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: scratch_load_dword v1, off, off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX942-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX942-NEXT: v_mov_b32_e32 v1, 15 +; GFX942-NEXT: scratch_store_dword v0, v1, off offset:256 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_sub_u32_e32 v0, 0x100, v0 +; GFX942-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: store_load_vindex_small_offset_kernel: ; GFX1010-PAL: ; %bb.0: ; %bb @@ -2145,22 +2145,22 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; 
GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: store_load_vindex_small_offset_foo: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: scratch_load_dword v1, off, s32 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_add_i32 s0, s32, 0x100 -; GFX940-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-NEXT: v_lshl_add_u32 v1, v0, 2, v1 -; GFX940-NEXT: v_mov_b32_e32 v2, 15 -; GFX940-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:256 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: store_load_vindex_small_offset_foo: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: scratch_load_dword v1, off, s32 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_add_i32 s0, s32, 0x100 +; GFX942-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NEXT: v_lshl_add_u32 v1, v0, 2, v1 +; GFX942-NEXT: v_mov_b32_e32 v2, 15 +; GFX942-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX942-NEXT: scratch_store_dword v1, v2, off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX942-NEXT: scratch_load_dword v0, v0, s32 offset:256 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-PAL-LABEL: store_load_vindex_small_offset_foo: ; GFX10-PAL: ; %bb.0: ; %bb @@ -2333,22 +2333,22 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 ; GFX9-PAL-NEXT: s_endpgm ; -; GFX940-LABEL: zero_init_large_offset_kernel: -; GFX940: ; %bb.0: -; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_mov_b32 s0, 0 -; GFX940-NEXT: s_mov_b32 s1, s0 -; GFX940-NEXT: s_mov_b32 s2, 
s0 -; GFX940-NEXT: s_mov_b32 s3, s0 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX940-NEXT: s_movk_i32 s0, 0x4004 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 sc0 sc1 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 sc0 sc1 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 sc0 sc1 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: zero_init_large_offset_kernel: +; GFX942: ; %bb.0: +; GFX942-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_mov_b32 s0, 0 +; GFX942-NEXT: s_mov_b32 s1, s0 +; GFX942-NEXT: s_mov_b32 s2, s0 +; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-NEXT: s_movk_i32 s0, 0x4004 +; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], s0 sc0 sc1 +; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 sc0 sc1 +; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 sc0 sc1 +; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 sc0 sc1 +; GFX942-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: zero_init_large_offset_kernel: ; GFX1010-PAL: ; %bb.0: @@ -2568,27 +2568,27 @@ define void @zero_init_large_offset_foo() { ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: zero_init_large_offset_foo: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: scratch_load_dword v0, off, s32 offset:4 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_mov_b32 s0, 0 -; GFX940-NEXT: s_mov_b32 s1, s0 -; GFX940-NEXT: s_mov_b32 s2, s0 -; GFX940-NEXT: s_mov_b32 s3, s0 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX940-NEXT: s_add_i32 s0, s32, 0x4004 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 sc0 sc1 -; GFX940-NEXT: s_add_i32 s0, s32, 
0x4004 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 sc0 sc1 -; GFX940-NEXT: s_add_i32 s0, s32, 0x4004 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 sc0 sc1 -; GFX940-NEXT: s_add_i32 s0, s32, 0x4004 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: zero_init_large_offset_foo: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: scratch_load_dword v0, off, s32 offset:4 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_mov_b32 s0, 0 +; GFX942-NEXT: s_mov_b32 s1, s0 +; GFX942-NEXT: s_mov_b32 s2, s0 +; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-NEXT: s_add_i32 s0, s32, 0x4004 +; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], s0 sc0 sc1 +; GFX942-NEXT: s_add_i32 s0, s32, 0x4004 +; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 sc0 sc1 +; GFX942-NEXT: s_add_i32 s0, s32, 0x4004 +; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 sc0 sc1 +; GFX942-NEXT: s_add_i32 s0, s32, 0x4004 +; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX1010-PAL-LABEL: zero_init_large_offset_foo: ; GFX1010-PAL: ; %bb.0: @@ -2796,23 +2796,23 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; -; GFX940-LABEL: store_load_sindex_large_offset_kernel: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, 15 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_lshl_b32 s1, s0, 2 -; GFX940-NEXT: s_and_b32 s0, s0, 15 -; GFX940-NEXT: s_addk_i32 s1, 0x4004 -; 
GFX940-NEXT: s_lshl_b32 s0, s0, 2 -; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_addk_i32 s0, 0x4004 -; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: store_load_sindex_large_offset_kernel: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX942-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 15 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_lshl_b32 s1, s0, 2 +; GFX942-NEXT: s_and_b32 s0, s0, 15 +; GFX942-NEXT: s_addk_i32 s1, 0x4004 +; GFX942-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_addk_i32 s0, 0x4004 +; GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: store_load_sindex_large_offset_kernel: ; GFX1010-PAL: ; %bb.0: ; %bb @@ -3013,21 +3013,21 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) { ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; -; GFX940-LABEL: store_load_sindex_large_offset_foo: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_lshl_b32 s1, s0, 2 -; GFX940-NEXT: s_and_b32 s0, s0, 15 -; GFX940-NEXT: s_addk_i32 s1, 0x4004 -; GFX940-NEXT: v_mov_b32_e32 v0, 15 -; GFX940-NEXT: s_lshl_b32 s0, s0, 2 -; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_addk_i32 s0, 0x4004 -; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: store_load_sindex_large_offset_foo: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 +; GFX942-NEXT: 
s_waitcnt vmcnt(0) +; GFX942-NEXT: s_lshl_b32 s1, s0, 2 +; GFX942-NEXT: s_and_b32 s0, s0, 15 +; GFX942-NEXT: s_addk_i32 s1, 0x4004 +; GFX942-NEXT: v_mov_b32_e32 v0, 15 +; GFX942-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_addk_i32 s0, 0x4004 +; GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: store_load_sindex_large_offset_foo: ; GFX1010-PAL: ; %bb.0: ; %bb @@ -3211,20 +3211,20 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; -; GFX940-LABEL: store_load_vindex_large_offset_kernel: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-NEXT: v_and_b32_e32 v0, 0xffc, v0 -; GFX940-NEXT: v_mov_b32_e32 v1, 15 -; GFX940-NEXT: s_movk_i32 s0, 0x4004 -; GFX940-NEXT: scratch_store_dword v0, v1, s0 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_sub_u32_e32 v0, 0x4004, v0 -; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: store_load_vindex_large_offset_kernel: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX942-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX942-NEXT: v_mov_b32_e32 v1, 15 +; GFX942-NEXT: s_movk_i32 s0, 0x4004 +; GFX942-NEXT: scratch_store_dword v0, v1, s0 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_sub_u32_e32 v0, 0x4004, v0 +; GFX942-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: store_load_vindex_large_offset_kernel: ; GFX1010-PAL: ; %bb.0: ; %bb 
@@ -3403,23 +3403,23 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: store_load_vindex_large_offset_foo: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: scratch_load_dword v1, off, s32 offset:4 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_add_i32 s0, s32, 0x4004 -; GFX940-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-NEXT: v_lshl_add_u32 v1, v0, 2, v1 -; GFX940-NEXT: v_mov_b32_e32 v2, 15 -; GFX940-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-NEXT: s_add_i32 s0, s32, 0x4004 -; GFX940-NEXT: scratch_load_dword v0, v0, s0 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: store_load_vindex_large_offset_foo: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: scratch_load_dword v1, off, s32 offset:4 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_add_i32 s0, s32, 0x4004 +; GFX942-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NEXT: v_lshl_add_u32 v1, v0, 2, v1 +; GFX942-NEXT: v_mov_b32_e32 v2, 15 +; GFX942-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX942-NEXT: scratch_store_dword v1, v2, off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX942-NEXT: s_add_i32 s0, s32, 0x4004 +; GFX942-NEXT: scratch_load_dword v0, v0, s0 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-PAL-LABEL: store_load_vindex_large_offset_foo: ; GFX10-PAL: ; %bb.0: ; %bb @@ -3566,18 +3566,18 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; -; GFX940-LABEL: store_load_large_imm_offset_kernel: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: v_mov_b32_e32 v0, 13 -; 
GFX940-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, 0x3000 -; GFX940-NEXT: v_mov_b32_e32 v1, 15 -; GFX940-NEXT: scratch_store_dword v0, v1, off offset:3716 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: scratch_load_dword v0, v0, off offset:3716 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: store_load_large_imm_offset_kernel: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: v_mov_b32_e32 v0, 13 +; GFX942-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0x3000 +; GFX942-NEXT: v_mov_b32_e32 v1, 15 +; GFX942-NEXT: scratch_store_dword v0, v1, off offset:3716 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: scratch_load_dword v0, v0, off offset:3716 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: store_load_large_imm_offset_kernel: ; GFX1010-PAL: ; %bb.0: ; %bb @@ -3738,19 +3738,19 @@ define void @store_load_large_imm_offset_foo() { ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: store_load_large_imm_offset_foo: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, 13 -; GFX940-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, 0x3000 -; GFX940-NEXT: v_mov_b32_e32 v1, 15 -; GFX940-NEXT: scratch_store_dword v0, v1, s32 offset:3716 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:3716 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: store_load_large_imm_offset_foo: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 13 +; GFX942-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1 +; 
GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0x3000 +; GFX942-NEXT: v_mov_b32_e32 v1, 15 +; GFX942-NEXT: scratch_store_dword v0, v1, s32 offset:3716 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: scratch_load_dword v0, v0, s32 offset:3716 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-PAL-LABEL: store_load_large_imm_offset_foo: ; GFX10-PAL: ; %bb.0: ; %bb @@ -3889,20 +3889,20 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; -; GFX940-LABEL: store_load_vidx_sidx_offset: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX940-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, 15 -; GFX940-NEXT: scratch_store_dword v0, v1, off offset:1024 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: scratch_load_dword v0, v0, off offset:1024 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: store_load_vidx_sidx_offset: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX942-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX942-NEXT: v_mov_b32_e32 v1, 15 +; GFX942-NEXT: scratch_store_dword v0, v1, off offset:1024 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: scratch_load_dword v0, v0, off offset:1024 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_endpgm ; ; GFX10-PAL-LABEL: store_load_vidx_sidx_offset: ; GFX10-PAL: ; %bb.0: ; %bb @@ -4022,16 +4022,16 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) { ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX940-LABEL: store_load_i64_aligned: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, 15 -; GFX940-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: store_load_i64_aligned: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 15 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-PAL-LABEL: store_load_i64_aligned: ; GFX10-PAL: ; %bb.0: ; %bb @@ -4133,16 +4133,16 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) { ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: store_load_i64_unaligned: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, 15 -; GFX940-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: store_load_i64_unaligned: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 15 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-PAL-LABEL: 
store_load_i64_unaligned: ; GFX10-PAL: ; %bb.0: ; %bb @@ -4249,17 +4249,17 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) { ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: store_load_v3i32_unaligned: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, 1 -; GFX940-NEXT: v_mov_b32_e32 v3, 2 -; GFX940-NEXT: v_mov_b32_e32 v4, 3 -; GFX940-NEXT: scratch_store_dwordx3 v0, v[2:4], off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: scratch_load_dwordx3 v[0:2], v0, off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: store_load_v3i32_unaligned: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 1 +; GFX942-NEXT: v_mov_b32_e32 v3, 2 +; GFX942-NEXT: v_mov_b32_e32 v4, 3 +; GFX942-NEXT: scratch_store_dwordx3 v0, v[2:4], off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: scratch_load_dwordx3 v[0:2], v0, off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-PAL-LABEL: store_load_v3i32_unaligned: ; GFX10-PAL: ; %bb.0: ; %bb @@ -4372,18 +4372,18 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) { ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: store_load_v4i32_unaligned: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, 1 -; GFX940-NEXT: v_mov_b32_e32 v3, 2 -; GFX940-NEXT: v_mov_b32_e32 v4, 3 -; GFX940-NEXT: v_mov_b32_e32 v5, 4 -; GFX940-NEXT: scratch_store_dwordx4 v0, v[2:5], off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: scratch_load_dwordx4 v[0:3], v0, off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: store_load_v4i32_unaligned: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 1 +; GFX942-NEXT: v_mov_b32_e32 v3, 2 +; GFX942-NEXT: v_mov_b32_e32 v4, 3 +; GFX942-NEXT: v_mov_b32_e32 v5, 4 +; GFX942-NEXT: scratch_store_dwordx4 v0, v[2:5], off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: scratch_load_dwordx4 v[0:3], v0, off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-PAL-LABEL: store_load_v4i32_unaligned: ; GFX10-PAL: ; %bb.0: ; %bb @@ -4488,16 +4488,16 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg) ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: store_load_i32_negative_unaligned: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v0, -1, v0 -; GFX940-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-NEXT: scratch_store_byte v0, v1, off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: scratch_load_ubyte v0, v0, off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: store_load_i32_negative_unaligned: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_u32_e32 v0, -1, v0 +; GFX942-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-NEXT: scratch_store_byte v0, v1, off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: scratch_load_ubyte v0, v0, off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX1010-PAL-LABEL: store_load_i32_negative_unaligned: ; GFX1010-PAL: ; %bb.0: ; %bb @@ -4610,16 +4610,16 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: store_load_i32_large_negative_unaligned: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v0, 0xffffef7f, v0 -; GFX940-NEXT: 
v_mov_b32_e32 v1, 1 -; GFX940-NEXT: scratch_store_byte v0, v1, off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: scratch_load_ubyte v0, v0, off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: store_load_i32_large_negative_unaligned: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_u32_e32 v0, 0xffffef7f, v0 +; GFX942-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-NEXT: scratch_store_byte v0, v1, off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: scratch_load_ubyte v0, v0, off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX1010-PAL-LABEL: store_load_i32_large_negative_unaligned: ; GFX1010-PAL: ; %bb.0: ; %bb @@ -4792,25 +4792,25 @@ define amdgpu_ps void @large_offset() { ; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; -; GFX940-LABEL: large_offset: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v2, v0 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:3024 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: scratch_load_dwordx4 v[0:3], off, off offset:3024 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_mov_b32 s0, 16 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s0 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_movk_i32 s0, 0x810 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s0 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: large_offset: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:3024 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: scratch_load_dwordx4 v[0:3], off, off offset:3024 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; 
GFX942-NEXT: s_mov_b32 s0, 16 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s0 +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_movk_i32 s0, 0x810 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s0 +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: large_offset: ; GFX1010-PAL: ; %bb.0: ; %bb @@ -4977,13 +4977,13 @@ define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspa ; GFX9-PAL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-PAL-NEXT: s_endpgm ; -; GFX940-LABEL: sgpr_base_large_offset: -; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_add_i32 s0, s0, 0xffe8 -; GFX940-NEXT: scratch_load_dword v2, off, s0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: sgpr_base_large_offset: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_add_i32 s0, s0, 0xffe8 +; GFX942-NEXT: scratch_load_dword v2, off, s0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dword v[0:1], v2, off sc0 sc1 +; GFX942-NEXT: s_endpgm ; ; GFX10-PAL-LABEL: sgpr_base_large_offset: ; GFX10-PAL: ; %bb.0: ; %entry @@ -5082,14 +5082,14 @@ define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr a ; GFX9-PAL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-PAL-NEXT: s_endpgm ; -; GFX940-LABEL: sgpr_base_large_offset_split: -; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_and_b32 s0, s0, -4 -; GFX940-NEXT: v_mov_b32_e32 v2, 0x100f000 -; GFX940-NEXT: scratch_load_dword v2, v2, s0 offset:4072 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: sgpr_base_large_offset_split: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_and_b32 s0, s0, -4 +; GFX942-NEXT: v_mov_b32_e32 v2, 0x100f000 +; GFX942-NEXT: scratch_load_dword v2, v2, s0 offset:4072 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dword v[0:1], v2, off sc0 sc1 +; GFX942-NEXT: 
s_endpgm ; ; GFX10-PAL-LABEL: sgpr_base_large_offset_split: ; GFX10-PAL: ; %bb.0: ; %entry @@ -5197,15 +5197,15 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset(ptr a ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; -; GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_add_i32 s0, s0, s1 -; GFX940-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX940-NEXT: v_add_u32_e32 v0, 0xffe8, v0 -; GFX940-NEXT: v_mov_b32_e32 v1, 15 -; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_add_i32 s0, s0, s1 +; GFX942-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX942-NEXT: v_add_u32_e32 v0, 0xffe8, v0 +; GFX942-NEXT: v_mov_b32_e32 v1, 15 +; GFX942-NEXT: scratch_store_dword v0, v1, off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_endpgm ; ; GFX10-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: ; GFX10-PAL: ; %bb.0: ; %bb @@ -5307,15 +5307,15 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset(pt ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; -; GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: -; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_add_i32 s0, s0, s1 -; GFX940-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX940-NEXT: v_add_u32_e32 v0, -16, v0 -; GFX940-NEXT: v_mov_b32_e32 v1, 15 -; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: +; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_add_i32 s0, s0, s1 +; GFX942-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX942-NEXT: v_add_u32_e32 v0, -16, v0 +; GFX942-NEXT: v_mov_b32_e32 v1, 15 +; GFX942-NEXT: scratch_store_dword v0, v1, off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; 
GFX942-NEXT: s_endpgm ; ; GFX10-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: ; GFX10-PAL: ; %bb.0: ; %bb @@ -5408,13 +5408,13 @@ define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addr ; GFX9-PAL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-PAL-NEXT: s_endpgm ; -; GFX940-LABEL: sgpr_base_negative_offset: -; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_addk_i32 s0, 0xffe8 -; GFX940-NEXT: scratch_load_dword v2, off, s0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: sgpr_base_negative_offset: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_addk_i32 s0, 0xffe8 +; GFX942-NEXT: scratch_load_dword v2, off, s0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dword v[0:1], v2, off sc0 sc1 +; GFX942-NEXT: s_endpgm ; ; GFX10-PAL-LABEL: sgpr_base_negative_offset: ; GFX10-PAL: ; %bb.0: ; %entry diff --git llvm/test/CodeGen/AMDGPU/fmaximum3.ll llvm/test/CodeGen/AMDGPU/fmaximum3.ll index c26e2911ab3e..66de7d535db4 100644 --- llvm/test/CodeGen/AMDGPU/fmaximum3.ll +++ llvm/test/CodeGen/AMDGPU/fmaximum3.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX942 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX9,GFX950 %s define float @v_fmaximum3_f32(float %a, float %b, float %c) { @@ -14,19 +14,19 @@ define float @v_fmaximum3_f32(float %a, float %b, float %c) { ; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v3, v0, v1 -; 
GFX940-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX940-NEXT: v_max_f32_e32 v1, v0, v2 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f32_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_f32: ; GFX950: ; %bb.0: @@ -49,19 +49,19 @@ define float @v_fmaximum3_f32_commute(float %a, float %b, float %c) { ; GFX12-NEXT: v_maximum3_f32 v0, v2, v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_f32_commute: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v3, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX940-NEXT: v_max_f32_e32 v1, v2, v0 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f32_commute: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f32_e32 v1, v2, v0 +; GFX942-NEXT: 
v_cmp_o_f32_e32 vcc, v2, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_f32_commute: ; GFX950: ; %bb.0: @@ -83,21 +83,21 @@ define amdgpu_ps i32 @s_fmaximum3_f32(float inreg %a, float inreg %b, float inre ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: ; return to shader part epilog ; -; GFX940-LABEL: s_fmaximum3_f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: v_mov_b32_e32 v0, s1 -; GFX940-NEXT: v_max_f32_e32 v1, s0, v0 -; GFX940-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX940-NEXT: v_max_f32_e32 v1, s2, v0 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, s2, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_readfirstlane_b32 s0, v0 -; GFX940-NEXT: ; return to shader part epilog +; GFX942-LABEL: s_fmaximum3_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NEXT: v_max_f32_e32 v1, s0, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX942-NEXT: v_max_f32_e32 v1, s2, v0 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, s2, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_readfirstlane_b32 s0, v0 +; GFX942-NEXT: ; return to shader part epilog ; ; GFX950-LABEL: s_fmaximum3_f32: ; GFX950: ; %bb.0: @@ -125,19 +125,19 @@ define float @v_fmaximum3_f32_fabs0(float %a, float %b, float %c) { ; GFX12-NEXT: v_maximum3_f32 v0, |v0|, v1, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_f32_fabs0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e64 v3, |v0|, v1 -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, |v0|, v1 -; 
GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX940-NEXT: v_max_f32_e32 v1, v0, v2 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f32_fabs0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e64 v3, |v0|, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, |v0|, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f32_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_f32_fabs0: ; GFX950: ; %bb.0: @@ -161,19 +161,19 @@ define float @v_fmaximum3_f32_fabs1(float %a, float %b, float %c) { ; GFX12-NEXT: v_maximum3_f32 v0, v0, |v1|, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_f32_fabs1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e64 v3, v0, |v1| -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v0, |v1| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX940-NEXT: v_max_f32_e32 v1, v0, v2 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f32_fabs1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e64 v3, v0, |v1| +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v0, |v1| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f32_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: 
v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_f32_fabs1: ; GFX950: ; %bb.0: @@ -197,19 +197,19 @@ define float @v_fmaximum3_f32_fabs2(float %a, float %b, float %c) { ; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, |v2| ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_f32_fabs2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v3, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX940-NEXT: v_max_f32_e64 v1, v0, |v2| -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v0, |v2| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f32_fabs2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f32_e64 v1, v0, |v2| +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v0, |v2| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_f32_fabs2: ; GFX950: ; %bb.0: @@ -233,19 +233,19 @@ define float @v_fmaximum3_f32_fabs_all(float %a, float %b, float %c) { ; GFX12-NEXT: v_maximum3_f32 v0, |v0|, |v1|, |v2| ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_f32_fabs_all: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e64 v3, |v0|, |v1| -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v1| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX940-NEXT: v_max_f32_e64 v1, v0, |v2| -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v0, 
|v2| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f32_fabs_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e64 v3, |v0|, |v1| +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v1| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f32_e64 v1, v0, |v2| +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v0, |v2| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_f32_fabs_all: ; GFX950: ; %bb.0: @@ -271,19 +271,19 @@ define float @v_fmaximum3_f32_fneg_all(float %a, float %b, float %c) { ; GFX12-NEXT: v_maximum3_f32 v0, -v0, -v1, -v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_f32_fneg_all: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e64 v3, -v0, -v1 -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, -v0, -v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX940-NEXT: v_max_f32_e64 v1, v0, -v2 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v0, -v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f32_fneg_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e64 v3, -v0, -v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, -v0, -v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f32_e64 v1, v0, -v2 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v0, -v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_f32_fneg_all: ; GFX950: ; %bb.0: 
@@ -309,19 +309,19 @@ define float @v_fmaximum3_f32_fneg_fabs_all(float %a, float %b, float %c) { ; GFX12-NEXT: v_maximum3_f32 v0, -|v0|, -|v1|, -|v2| ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_f32_fneg_fabs_all: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e64 v3, -|v0|, -|v1| -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, -|v0|, -|v1| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX940-NEXT: v_max_f32_e64 v1, v0, -|v2| -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v0, -|v2| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f32_fneg_fabs_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e64 v3, -|v0|, -|v1| +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, -|v0|, -|v1| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f32_e64 v1, v0, -|v2| +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v0, -|v2| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_f32_fneg_fabs_all: ; GFX950: ; %bb.0: @@ -350,19 +350,19 @@ define float @v_fmaximum3_f32_fneg0(float %a, float %b, float %c) { ; GFX12-NEXT: v_maximum3_f32 v0, -v0, v1, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_f32_fneg0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e64 v3, -v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, -v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX940-NEXT: v_max_f32_e32 v1, v0, v2 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; 
GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f32_fneg0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e64 v3, -v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, -v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f32_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_f32_fneg0: ; GFX950: ; %bb.0: @@ -386,19 +386,19 @@ define float @v_fmaximum3_f32_fneg1(float %a, float %b, float %c) { ; GFX12-NEXT: v_maximum3_f32 v0, v0, -v1, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_f32_fneg1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e64 v3, v0, -v1 -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v0, -v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX940-NEXT: v_max_f32_e32 v1, v0, v2 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f32_fneg1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e64 v3, v0, -v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v0, -v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f32_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_f32_fneg1: ; GFX950: ; %bb.0: @@ -422,19 +422,19 @@ define float @v_fmaximum3_f32_fneg2(float %a, float %b, float %c) { ; GFX12-NEXT: v_maximum3_f32 
v0, v0, v1, -v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_f32_fneg2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v3, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX940-NEXT: v_max_f32_e64 v1, v0, -v2 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v0, -v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f32_fneg2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f32_e64 v1, v0, -v2 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v0, -v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_f32_fneg2: ; GFX950: ; %bb.0: @@ -458,19 +458,19 @@ define float @v_fmaximum3_f32_const0(float %b, float %c) { ; GFX12-NEXT: v_maximum3_f32 v0, v0, 0x41000000, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_f32_const0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, 0x41000000, v0 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: v_max_f32_e32 v2, v0, v1 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f32_const0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v2, 
0x41000000, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_f32_const0: ; GFX950: ; %bb.0: @@ -494,19 +494,19 @@ define float @v_fmaximum3_f32__const2(float %a, float %b) { ; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, 0x41000000 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_f32__const2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: v_max_f32_e32 v1, 0x41000000, v0 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f32__const2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_max_f32_e32 v1, 0x41000000, v0 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_f32__const2: ; GFX950: ; %bb.0: @@ -530,19 +530,19 @@ define float @v_fmaximum3_f32_inlineimm0(float %b, float %c) { ; GFX12-NEXT: v_maximum3_f32 v0, v0, 4.0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_f32_inlineimm0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, 4.0, v0 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: v_max_f32_e32 v2, v0, v1 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f32_inlineimm0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v2, 4.0, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_f32_inlineimm0: ; GFX950: ; %bb.0: @@ -565,19 +565,19 @@ define float @v_fmaximum3_f32__inlineimm(float %a, float %b) { ; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, 4.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_f32__inlineimm: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: v_max_f32_e32 v1, 4.0, v0 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f32__inlineimm: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 
v0, v3, v2, vcc +; GFX942-NEXT: v_max_f32_e32 v1, 4.0, v0 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_f32__inlineimm: ; GFX950: ; %bb.0: @@ -602,19 +602,19 @@ define float @v_fmaximum3_f32_const1_const2(float %a) { ; GFX12-NEXT: v_maximum3_f32 v0, v0, s0, 0x41800000 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_f32_const1_const2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v1, 0x41000000, v0 -; GFX940-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX940-NEXT: v_max_f32_e32 v1, 0x41800000, v0 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f32_const1_const2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v1, 0x41000000, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX942-NEXT: v_max_f32_e32 v1, 0x41800000, v0 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_f32_const1_const2: ; GFX950: ; %bb.0: @@ -640,27 +640,27 @@ define <2 x float> @v_fmaximum3_v2f32(<2 x float> %a, <2 x float> %b, <2 x float ; GFX12-NEXT: v_maximum3_f32 v1, v5, v1, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_v2f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v6, v1, v3 -; GFX940-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX940-NEXT: 
v_cmp_o_f32_e32 vcc, v1, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v0, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc -; GFX940-NEXT: v_max_f32_e32 v2, v4, v0 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v4, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX940-NEXT: v_max_f32_e32 v2, v5, v1 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v5, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_v2f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v6, v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX942-NEXT: v_max_f32_e32 v3, v0, v2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc +; GFX942-NEXT: v_max_f32_e32 v2, v4, v0 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v4, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_max_f32_e32 v2, v5, v1 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v5, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v2f32: ; GFX950: ; %bb.0: @@ -685,27 +685,27 @@ define <2 x float> @v_fmaximum3_v2f32_commute(<2 x float> %a, <2 x float> %b, <2 ; GFX12-NEXT: v_maximum3_f32 v1, v1, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_v2f32_commute: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v6, v1, v3 -; GFX940-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v0, v2 -; GFX940-NEXT: s_nop 
0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc -; GFX940-NEXT: v_max_f32_e32 v2, v0, v4 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX940-NEXT: v_max_f32_e32 v2, v1, v5 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_v2f32_commute: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v6, v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX942-NEXT: v_max_f32_e32 v3, v0, v2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc +; GFX942-NEXT: v_max_f32_e32 v2, v0, v4 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_max_f32_e32 v2, v1, v5 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v2f32_commute: ; GFX950: ; %bb.0: @@ -730,27 +730,27 @@ define <2 x float> @v_fmaximum3_v2f32__fabs_all(<2 x float> %a, <2 x float> %b, ; GFX12-NEXT: v_maximum3_f32 v1, |v1|, |v3|, |v5| ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_v2f32__fabs_all: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e64 v6, |v1|, |v3| -; GFX940-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, |v1|, |v3| -; GFX940-NEXT: v_max_f32_e64 v3, |v0|, |v2| -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc -; 
GFX940-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v2| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc -; GFX940-NEXT: v_max_f32_e64 v2, v0, |v4| -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v0, |v4| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX940-NEXT: v_max_f32_e64 v2, v1, |v5| -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v1, |v5| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_v2f32__fabs_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e64 v6, |v1|, |v3| +; GFX942-NEXT: v_mov_b32_e32 v7, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, |v1|, |v3| +; GFX942-NEXT: v_max_f32_e64 v3, |v0|, |v2| +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v2| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc +; GFX942-NEXT: v_max_f32_e64 v2, v0, |v4| +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v0, |v4| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_max_f32_e64 v2, v1, |v5| +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v1, |v5| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v2f32__fabs_all: ; GFX950: ; %bb.0: @@ -778,27 +778,27 @@ define <2 x float> @v_fmaximum3_v2f32__fneg_all(<2 x float> %a, <2 x float> %b, ; GFX12-NEXT: v_maximum3_f32 v1, -v1, -v3, -v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_v2f32__fneg_all: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e64 v6, -v1, -v3 -; GFX940-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, -v1, -v3 -; GFX940-NEXT: v_max_f32_e64 v3, -v0, -v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc -; GFX940-NEXT: 
v_cmp_o_f32_e64 vcc, -v0, -v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc -; GFX940-NEXT: v_max_f32_e64 v2, v0, -v4 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v0, -v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX940-NEXT: v_max_f32_e64 v2, v1, -v5 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v1, -v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_v2f32__fneg_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e64 v6, -v1, -v3 +; GFX942-NEXT: v_mov_b32_e32 v7, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, -v1, -v3 +; GFX942-NEXT: v_max_f32_e64 v3, -v0, -v2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, -v0, -v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc +; GFX942-NEXT: v_max_f32_e64 v2, v0, -v4 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v0, -v4 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_max_f32_e64 v2, v1, -v5 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v1, -v5 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v2f32__fneg_all: ; GFX950: ; %bb.0: @@ -826,27 +826,27 @@ define <2 x float> @v_fmaximum3_v2f32__inlineimm1(<2 x float> %a, <2 x float> %c ; GFX12-NEXT: v_maximum3_f32 v1, v1, 2.0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_v2f32__inlineimm1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, 2.0, v1 -; GFX940-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX940-NEXT: v_max_f32_e32 v4, 2.0, v0 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 -; GFX940-NEXT: 
s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX940-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX940-NEXT: v_max_f32_e32 v2, v1, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_v2f32__inlineimm1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v4, 2.0, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX942-NEXT: v_max_f32_e32 v4, 2.0, v0 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX942-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v1, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v2f32__inlineimm1: ; GFX950: ; %bb.0: @@ -871,27 +871,27 @@ define <2 x float> @v_fmaximum3_v2f32__inlineimm2(<2 x float> %a, <2 x float> %b ; GFX12-NEXT: v_maximum3_f32 v1, v1, v3, 4.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_v2f32__inlineimm2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v1, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v0, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; 
GFX940-NEXT: v_max_f32_e32 v2, 4.0, v0 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX940-NEXT: v_max_f32_e32 v2, 4.0, v1 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_v2f32__inlineimm2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v4, v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX942-NEXT: v_max_f32_e32 v3, v0, v2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX942-NEXT: v_max_f32_e32 v2, 4.0, v0 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX942-NEXT: v_max_f32_e32 v2, 4.0, v1 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v2f32__inlineimm2: ; GFX950: ; %bb.0: @@ -917,35 +917,35 @@ define <3 x float> @v_fmaximum3_v3f32(<3 x float> %a, <3 x float> %b, <3 x float ; GFX12-NEXT: v_maximum3_f32 v2, v8, v2, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_v3f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v9, v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v10, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX940-NEXT: v_max_f32_e32 v5, v1, v4 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX940-NEXT: v_max_f32_e32 v4, v0, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc -; GFX940-NEXT: 
v_cmp_o_f32_e32 vcc, v0, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc -; GFX940-NEXT: v_max_f32_e32 v3, v6, v0 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v6, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc -; GFX940-NEXT: v_max_f32_e32 v3, v7, v1 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v7, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc -; GFX940-NEXT: v_max_f32_e32 v3, v8, v2 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v8, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_v3f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v9, v2, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 +; GFX942-NEXT: v_max_f32_e32 v5, v1, v4 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v0, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc +; GFX942-NEXT: v_max_f32_e32 v3, v6, v0 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v6, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc +; GFX942-NEXT: v_max_f32_e32 v3, v7, v1 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v7, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc +; GFX942-NEXT: v_max_f32_e32 v3, v8, v2 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v8, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v3f32: ; GFX950: ; %bb.0: @@ -972,35 +972,35 @@ define <3 x float> @v_fmaximum3_v3f32_commute(<3 x float> %a, <3 x float> %b, <3 ; GFX12-NEXT: v_maximum3_f32 v2, v2, v5, v8 ; GFX12-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_v3f32_commute: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v9, v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v10, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX940-NEXT: v_max_f32_e32 v5, v1, v4 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX940-NEXT: v_max_f32_e32 v4, v0, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc -; GFX940-NEXT: v_max_f32_e32 v3, v0, v6 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v6 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc -; GFX940-NEXT: v_max_f32_e32 v3, v1, v7 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v7 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc -; GFX940-NEXT: v_max_f32_e32 v3, v2, v8 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v8 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_v3f32_commute: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v9, v2, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 +; GFX942-NEXT: v_max_f32_e32 v5, v1, v4 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v0, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc +; GFX942-NEXT: v_max_f32_e32 v3, v0, v6 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v6 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc +; 
GFX942-NEXT: v_max_f32_e32 v3, v1, v7 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v1, v7 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc +; GFX942-NEXT: v_max_f32_e32 v3, v2, v8 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v2, v8 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v3f32_commute: ; GFX950: ; %bb.0: @@ -1027,35 +1027,35 @@ define <3 x float> @v_fmaximum3_v3f32__fabs_all(<3 x float> %a, <3 x float> %b, ; GFX12-NEXT: v_maximum3_f32 v2, |v2|, |v5|, |v8| ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_v3f32__fabs_all: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e64 v9, |v2|, |v5| -; GFX940-NEXT: v_mov_b32_e32 v10, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, |v2|, |v5| -; GFX940-NEXT: v_max_f32_e64 v5, |v1|, |v4| -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, |v1|, |v4| -; GFX940-NEXT: v_max_f32_e64 v4, |v0|, |v3| -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v3| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc -; GFX940-NEXT: v_max_f32_e64 v3, v0, |v6| -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v0, |v6| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc -; GFX940-NEXT: v_max_f32_e64 v3, v1, |v7| -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v1, |v7| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc -; GFX940-NEXT: v_max_f32_e64 v3, v2, |v8| -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v2, |v8| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_v3f32__fabs_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e64 v9, |v2|, |v5| +; GFX942-NEXT: 
v_mov_b32_e32 v10, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, |v2|, |v5| +; GFX942-NEXT: v_max_f32_e64 v5, |v1|, |v4| +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, |v1|, |v4| +; GFX942-NEXT: v_max_f32_e64 v4, |v0|, |v3| +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v3| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc +; GFX942-NEXT: v_max_f32_e64 v3, v0, |v6| +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v0, |v6| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc +; GFX942-NEXT: v_max_f32_e64 v3, v1, |v7| +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v1, |v7| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc +; GFX942-NEXT: v_max_f32_e64 v3, v2, |v8| +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v2, |v8| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v3f32__fabs_all: ; GFX950: ; %bb.0: @@ -1085,35 +1085,35 @@ define <3 x float> @v_fmaximum3_v3f32__fneg_all(<3 x float> %a, <3 x float> %b, ; GFX12-NEXT: v_maximum3_f32 v2, -v2, -v5, -v8 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_v3f32__fneg_all: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e64 v9, -v2, -v5 -; GFX940-NEXT: v_mov_b32_e32 v10, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, -v2, -v5 -; GFX940-NEXT: v_max_f32_e64 v5, -v1, -v4 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, -v1, -v4 -; GFX940-NEXT: v_max_f32_e64 v4, -v0, -v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, -v0, -v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc -; GFX940-NEXT: v_max_f32_e64 v3, v0, -v6 -; GFX940-NEXT: 
v_cmp_o_f32_e64 vcc, v0, -v6 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc -; GFX940-NEXT: v_max_f32_e64 v3, v1, -v7 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v1, -v7 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc -; GFX940-NEXT: v_max_f32_e64 v3, v2, -v8 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v2, -v8 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_v3f32__fneg_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e64 v9, -v2, -v5 +; GFX942-NEXT: v_mov_b32_e32 v10, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, -v2, -v5 +; GFX942-NEXT: v_max_f32_e64 v5, -v1, -v4 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, -v1, -v4 +; GFX942-NEXT: v_max_f32_e64 v4, -v0, -v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, -v0, -v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc +; GFX942-NEXT: v_max_f32_e64 v3, v0, -v6 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v0, -v6 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc +; GFX942-NEXT: v_max_f32_e64 v3, v1, -v7 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v1, -v7 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc +; GFX942-NEXT: v_max_f32_e64 v3, v2, -v8 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v2, -v8 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v3f32__fneg_all: ; GFX950: ; %bb.0: @@ -1143,35 +1143,35 @@ define <3 x float> @v_fmaximum3_v3f32__inlineimm1(<3 x float> %a, <3 x float> %c ; GFX12-NEXT: v_maximum3_f32 v2, v2, 2.0, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_v3f32__inlineimm1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v6, 2.0, v2 -; GFX940-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v6, vcc -; GFX940-NEXT: v_max_f32_e32 v6, 2.0, v1 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc -; GFX940-NEXT: v_max_f32_e32 v6, 2.0, v0 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX940-NEXT: v_max_f32_e32 v6, v0, v3 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v1, v4 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX940-NEXT: v_max_f32_e32 v3, v2, v5 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_v3f32__inlineimm1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v6, 2.0, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v2, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v6, vcc +; GFX942-NEXT: v_max_f32_e32 v6, 2.0, v1 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX942-NEXT: v_max_f32_e32 v6, 2.0, v0 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_max_f32_e32 v6, v0, v3 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX942-NEXT: v_max_f32_e32 v3, v1, v4 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX942-NEXT: s_nop 1 +; 
GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; GFX942-NEXT: v_max_f32_e32 v3, v2, v5 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v3f32__inlineimm1: ; GFX950: ; %bb.0: @@ -1198,35 +1198,35 @@ define <3 x float> @v_fmaximum3_v3f32__inlineimm2(<3 x float> %a, <3 x float> %b ; GFX12-NEXT: v_maximum3_f32 v2, v2, v5, 4.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_v3f32__inlineimm2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v6, v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX940-NEXT: v_max_f32_e32 v5, v1, v4 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v6, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX940-NEXT: v_max_f32_e32 v4, v0, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v4, vcc -; GFX940-NEXT: v_max_f32_e32 v3, 4.0, v0 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc -; GFX940-NEXT: v_max_f32_e32 v3, 4.0, v1 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX940-NEXT: v_max_f32_e32 v3, 4.0, v2 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_v3f32__inlineimm2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v6, v2, v5 +; GFX942-NEXT: v_mov_b32_e32 v7, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 +; GFX942-NEXT: v_max_f32_e32 v5, v1, v4 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: 
v_cndmask_b32_e32 v2, v7, v6, vcc +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v0, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v4, vcc +; GFX942-NEXT: v_max_f32_e32 v3, 4.0, v0 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc +; GFX942-NEXT: v_max_f32_e32 v3, 4.0, v1 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; GFX942-NEXT: v_max_f32_e32 v3, 4.0, v2 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v2, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v3f32__inlineimm2: ; GFX950: ; %bb.0: @@ -1774,30 +1774,30 @@ define <2 x half> @v_fmaximum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c ; GFX12-NEXT: v_pk_maximum_f16 v0, v2, v0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_v2f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v3, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX940-NEXT: v_perm_b32 v1, v0, v5, s0 -; GFX940-NEXT: v_pk_max_f16 v1, v2, v1 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v2, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v2, v0 src0_sel:WORD_1 src1_sel:DWORD -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: 
v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v3, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_v2f16: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_max_f16 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_perm_b32 v1, v0, v5, s0 +; GFX942-NEXT: v_pk_max_f16 v1, v2, v1 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v2, v5 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v2, v0 src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v2f16: ; GFX950: ; %bb.0: @@ -1824,30 +1824,30 @@ define <2 x half> @v_fmaximum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x ; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_v2f16_commute: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v3, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX940-NEXT: v_perm_b32 v1, v0, v5, s0 -; GFX940-NEXT: v_pk_max_f16 
v1, v1, v2 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v3, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_v2f16_commute: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_max_f16 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_perm_b32 v1, v0, v5, s0 +; GFX942-NEXT: v_pk_max_f16 v1, v1, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v2f16_commute: ; GFX950: ; %bb.0: @@ -1877,32 +1877,32 @@ define <2 x half> @v_fmaximum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2 ; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_v2f16__fabs_all: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0 -; GFX940-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v1 -; GFX940-NEXT: v_pk_max_f16 v3, v3, v4 -; GFX940-NEXT: v_mov_b32_e32 v6, 0x7e00 -; GFX940-NEXT: 
v_lshrrev_b32_e32 v4, 16, v3 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc -; GFX940-NEXT: v_perm_b32 v1, v4, v0, s0 -; GFX940-NEXT: v_pk_max_f16 v1, v1, v5 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v4, |v2| src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc -; GFX940-NEXT: v_perm_b32 v0, v3, v0, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_v2f16__fabs_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0 +; GFX942-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v1 +; GFX942-NEXT: v_pk_max_f16 v3, v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v6, 0x7e00 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc +; GFX942-NEXT: v_perm_b32 v1, v4, v0, s0 +; GFX942-NEXT: v_pk_max_f16 v1, v1, v5 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v4, |v2| src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc +; GFX942-NEXT: v_perm_b32 v0, v3, v0, s0 +; GFX942-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v2f16__fabs_all: ; GFX950: ; %bb.0: @@ -1935,30 +1935,30 @@ define <2 x half> @v_fmaximum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2 ; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_v2f16__fneg_all: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v3, v0, v1 neg_lo:[1,1] neg_hi:[1,1] -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX940-NEXT: v_perm_b32 v1, v0, v5, s0 -; GFX940-NEXT: v_pk_max_f16 v1, v1, v2 neg_lo:[0,1] neg_hi:[0,1] -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v5, -v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v3, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_v2f16__fneg_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_max_f16 v3, v0, v1 neg_lo:[1,1] neg_hi:[1,1] +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_perm_b32 
v1, v0, v5, s0 +; GFX942-NEXT: v_pk_max_f16 v1, v1, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v5, -v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v2f16__fneg_all: ; GFX950: ; %bb.0: @@ -1988,30 +1988,30 @@ define <2 x half> @v_fmaximum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) { ; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_v2f16__inlineimm1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v0, 2.0 op_sel_hi:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX940-NEXT: v_perm_b32 v2, v3, v0, s0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v1 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v3, v1 src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX940-NEXT: v_perm_b32 v0, v3, v0, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_v2f16__inlineimm1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_max_f16 v2, v0, 2.0 op_sel_hi:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; 
GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX942-NEXT: v_perm_b32 v2, v3, v0, s0 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v3, v1 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX942-NEXT: v_perm_b32 v0, v3, v0, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v2f16__inlineimm1: ; GFX950: ; %bb.0: @@ -2038,30 +2038,30 @@ define <2 x half> @v_fmaximum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b) { ; GFX12-NEXT: v_pk_maximum_f16 v0, v0, 4.0 op_sel_hi:[1,0] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_v2f16__inlineimm2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: v_perm_b32 v1, v0, v4, s0 -; GFX940-NEXT: v_pk_max_f16 v1, v1, 4.0 op_sel_hi:[1,0] -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: 
v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v2, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_v2f16__inlineimm2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_max_f16 v2, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_perm_b32 v1, v0, v4, s0 +; GFX942-NEXT: v_pk_max_f16 v1, v1, 4.0 op_sel_hi:[1,0] +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GFX942-NEXT: v_perm_b32 v0, v0, v2, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v2f16__inlineimm2: ; GFX950: ; %bb.0: @@ -2090,42 +2090,42 @@ define <3 x half> @v_fmaximum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c ; GFX12-NEXT: v_pk_maximum_f16 v1, v5, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_v3f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v6, v0, v2 -; GFX940-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v2, v1, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX940-NEXT: 
v_cmp_o_f16_e32 vcc, v1, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX940-NEXT: v_perm_b32 v1, v1, v6, s0 -; GFX940-NEXT: v_pk_max_f16 v1, v5, v1 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 -; GFX940-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX940-NEXT: v_pk_max_f16 v2, v4, v2 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v3, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_v3f16: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_max_f16 v6, v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0 +; GFX942-NEXT: v_pk_max_f16 v1, v5, v1 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 +; GFX942-NEXT: 
v_perm_b32 v2, v0, v8, s0 +; GFX942-NEXT: v_pk_max_f16 v2, v4, v2 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v3f16: ; GFX950: ; %bb.0: @@ -2155,42 +2155,42 @@ define <3 x half> @v_fmaximum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x ; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_v3f16_commute: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v6, v0, v2 -; GFX940-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v2, v1, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX940-NEXT: v_perm_b32 v1, v1, v6, s0 -; GFX940-NEXT: v_pk_max_f16 v1, v1, v5 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 -; GFX940-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: 
v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v3, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_v3f16_commute: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_max_f16 v6, v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0 +; GFX942-NEXT: v_pk_max_f16 v1, v1, v5 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v3f16_commute: ; GFX950: ; %bb.0: @@ -2227,46 +2227,46 @@ define <3 x half> 
@v_fmaximum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3 ; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_v3f16__fabs_all: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v1 -; GFX940-NEXT: v_and_b32_e32 v9, 0x7fff7fff, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v0 -; GFX940-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v2 -; GFX940-NEXT: v_pk_max_f16 v7, v7, v9 -; GFX940-NEXT: v_mov_b32_e32 v12, 0x7e00 -; GFX940-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v6, v6, v8 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v4 -; GFX940-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v5 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v12, v7, vcc -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc -; GFX940-NEXT: v_perm_b32 v2, v8, v0, s0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v11 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v4| src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX940-NEXT: v_perm_b32 v6, v9, v1, s0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; GFX940-NEXT: v_pk_max_f16 v6, v6, v10 -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v12, v2, vcc -; GFX940-NEXT: v_perm_b32 v0, v3, v0, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
v_fmaximum3_v3f16__fabs_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v1 +; GFX942-NEXT: v_and_b32_e32 v9, 0x7fff7fff, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v0 +; GFX942-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v2 +; GFX942-NEXT: v_pk_max_f16 v7, v7, v9 +; GFX942-NEXT: v_mov_b32_e32 v12, 0x7e00 +; GFX942-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_max_f16 v6, v6, v8 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v4 +; GFX942-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v7, vcc +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc +; GFX942-NEXT: v_perm_b32 v2, v8, v0, s0 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v11 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v4| src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX942-NEXT: v_perm_b32 v6, v9, v1, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; GFX942-NEXT: v_pk_max_f16 v6, v6, v10 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v2, vcc +; GFX942-NEXT: v_perm_b32 v0, v3, v0, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v3f16__fabs_all: ; GFX950: ; %bb.0: @@ -2305,42 +2305,42 @@ define <3 x half> @v_fmaximum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3 ; 
GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_v3f16__fneg_all: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1] -; GFX940-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX940-NEXT: v_perm_b32 v1, v1, v6, s0 -; GFX940-NEXT: v_pk_max_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 -; GFX940-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v3, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_v3f16__fneg_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_max_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1] +; GFX942-NEXT: v_mov_b32_e32 
v7, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0 +; GFX942-NEXT: v_pk_max_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v3f16__fneg_all: ; GFX950: ; %bb.0: @@ -2373,39 +2373,39 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) { ; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_v3f16__inlineimm1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v6, 0x7e00 -; GFX940-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX940-NEXT: 
v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v7, v1, 2.0 -; GFX940-NEXT: s_mov_b32 s1, 0x5040100 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 -; GFX940-NEXT: s_movk_i32 s0, 0x7e00 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc -; GFX940-NEXT: v_perm_b32 v4, v5, v0, s1 -; GFX940-NEXT: v_pk_max_f16 v4, v4, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; GFX940-NEXT: v_pack_b32_f16 v7, v1, s0 -; GFX940-NEXT: v_pk_max_f16 v7, v7, v3 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc -; GFX940-NEXT: v_perm_b32 v0, v5, v0, s1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_v3f16__inlineimm1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v6, 0x7e00 +; GFX942-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_max_f16 v7, v1, 2.0 +; GFX942-NEXT: s_mov_b32 s1, 0x5040100 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 +; GFX942-NEXT: s_movk_i32 s0, 0x7e00 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX942-NEXT: v_perm_b32 v4, v5, v0, s1 +; GFX942-NEXT: v_pk_max_f16 v4, v4, v2 +; GFX942-NEXT: 
v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX942-NEXT: v_pack_b32_f16 v7, v1, s0 +; GFX942-NEXT: v_pk_max_f16 v7, v7, v3 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX942-NEXT: v_perm_b32 v0, v5, v0, s1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v3f16__inlineimm1: ; GFX950: ; %bb.0: @@ -2435,42 +2435,42 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) { ; GFX12-NEXT: v_pk_maximum_f16 v1, v1, 4.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_v3f16__inlineimm2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v0, v2 -; GFX940-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v2, v1, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX940-NEXT: v_perm_b32 v1, v1, v4, s0 -; GFX940-NEXT: v_pk_max_f16 v1, v1, 4.0 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 -; GFX940-NEXT: v_perm_b32 v2, v0, v6, s0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0] -; GFX940-NEXT: 
v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v3, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_v3f16__inlineimm2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_max_f16 v4, v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX942-NEXT: v_perm_b32 v1, v1, v4, s0 +; GFX942-NEXT: v_pk_max_f16 v1, v1, 4.0 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 +; GFX942-NEXT: v_perm_b32 v2, v0, v6, s0 +; GFX942-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0] +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v3f16__inlineimm2: ; 
GFX950: ; %bb.0: @@ -2500,48 +2500,48 @@ define <4 x half> @v_fmaximum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c ; GFX12-NEXT: v_pk_maximum_f16 v1, v5, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_v4f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v6, v0, v2 -; GFX940-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v2, v1, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX940-NEXT: v_perm_b32 v2, v1, v6, s0 -; GFX940-NEXT: v_pk_max_f16 v2, v5, v2 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:DWORD -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX940-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX940-NEXT: v_pk_max_f16 v2, v4, v2 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 -; GFX940-NEXT: v_perm_b32 v1, v1, v3, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v5, s0 -; GFX940-NEXT: 
s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_v4f16: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_max_f16 v6, v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 +; GFX942-NEXT: v_pk_max_f16 v2, v5, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX942-NEXT: v_pk_max_f16 v2, v4, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 +; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v4f16: ; GFX950: ; %bb.0: @@ -2571,48 +2571,48 @@ define <4 x half> @v_fmaximum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x ; 
GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_v4f16_commute: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v6, v0, v2 -; GFX940-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v2, v1, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX940-NEXT: v_perm_b32 v2, v1, v6, s0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v5 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX940-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 -; GFX940-NEXT: v_perm_b32 v1, v1, v3, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v5, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_v4f16_commute: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_max_f16 v6, v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v5 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 +; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v4f16_commute: ; GFX950: ; %bb.0: @@ -2649,52 +2649,52 @@ define <4 x half> @v_fmaximum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4 ; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
v_fmaximum3_v4f16__fabs_all: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v0 -; GFX940-NEXT: v_and_b32_e32 v9, 0x7fff7fff, v2 -; GFX940-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v1 -; GFX940-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v3 -; GFX940-NEXT: v_pk_max_f16 v7, v7, v9 -; GFX940-NEXT: v_mov_b32_e32 v12, 0x7e00 -; GFX940-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v6, v6, v8 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v5 -; GFX940-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v4 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v12, v7, vcc -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc -; GFX940-NEXT: v_perm_b32 v2, v8, v1, s0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v11 -; GFX940-NEXT: v_perm_b32 v6, v9, v0, s0 -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v6, v6, v10 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| -; GFX940-NEXT: v_perm_b32 v1, v3, v1, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, 
v12, v6, vcc -; GFX940-NEXT: v_perm_b32 v0, v7, v0, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_v4f16__fabs_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v0 +; GFX942-NEXT: v_and_b32_e32 v9, 0x7fff7fff, v2 +; GFX942-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v1 +; GFX942-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v3 +; GFX942-NEXT: v_pk_max_f16 v7, v7, v9 +; GFX942-NEXT: v_mov_b32_e32 v12, 0x7e00 +; GFX942-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_max_f16 v6, v6, v8 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v5 +; GFX942-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v7, vcc +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc +; GFX942-NEXT: v_perm_b32 v2, v8, v1, s0 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v11 +; GFX942-NEXT: v_perm_b32 v6, v9, v0, s0 +; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_pk_max_f16 v6, v6, v10 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, 
|v4| +; GFX942-NEXT: v_perm_b32 v1, v3, v1, s0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc +; GFX942-NEXT: v_perm_b32 v0, v7, v0, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v4f16__fabs_all: ; GFX950: ; %bb.0: @@ -2733,48 +2733,48 @@ define <4 x half> @v_fmaximum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4 ; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_v4f16__fneg_all: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1] -; GFX940-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX940-NEXT: v_perm_b32 v2, v1, v6, s0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1] -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, -v5 src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX940-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] -; GFX940-NEXT: 
v_cmp_o_f16_e64 vcc, v8, -v4 -; GFX940-NEXT: v_perm_b32 v1, v1, v3, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v5, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_v4f16__fneg_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_max_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1] +; GFX942-NEXT: v_mov_b32_e32 v7, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1] +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, -v5 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 +; GFX942-NEXT: v_perm_b32 v1, 
v1, v3, s0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v4f16__fneg_all: ; GFX950: ; %bb.0: @@ -2807,46 +2807,46 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) { ; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_v4f16__inlineimm1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v6, 0x7e00 -; GFX940-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v7, v1, 2.0 op_sel_hi:[1,0] -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v6, v8, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc -; GFX940-NEXT: v_perm_b32 v4, v8, v1, s0 -; GFX940-NEXT: v_pk_max_f16 v4, v4, v3 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v8, v3 src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_perm_b32 v8, v5, v0, s0 -; GFX940-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX940-NEXT: v_pk_max_f16 v8, v8, v2 -; GFX940-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1 -; 
GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v9, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX940-NEXT: v_perm_b32 v1, v7, v1, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc -; GFX940-NEXT: v_perm_b32 v0, v5, v0, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_v4f16__inlineimm1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v6, 0x7e00 +; GFX942-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_max_f16 v7, v1, 2.0 op_sel_hi:[1,0] +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v8, v6, v8, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; GFX942-NEXT: v_perm_b32 v4, v8, v1, s0 +; GFX942-NEXT: v_pk_max_f16 v4, v4, v3 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, v3 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_perm_b32 v8, v5, v0, s0 +; GFX942-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX942-NEXT: v_pk_max_f16 v8, v8, v2 +; GFX942-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v9, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc 
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: v_perm_b32 v1, v7, v1, s0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc +; GFX942-NEXT: v_perm_b32 v0, v5, v0, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v4f16__inlineimm1: ; GFX950: ; %bb.0: @@ -2876,48 +2876,48 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) { ; GFX12-NEXT: v_pk_maximum_f16 v1, v1, 4.0 op_sel_hi:[1,0] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fmaximum3_v4f16__inlineimm2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v0, v2 -; GFX940-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v2, v1, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX940-NEXT: v_perm_b32 v2, v1, v4, s0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0] -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX940-NEXT: v_perm_b32 v2, v0, v6, s0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0] -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 -; GFX940-NEXT: v_perm_b32 v1, v1, v3, s0 -; 
GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v4, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_v4f16__inlineimm2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_max_f16 v4, v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX942-NEXT: v_perm_b32 v2, v1, v4, s0 +; GFX942-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0] +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX942-NEXT: v_perm_b32 v2, v0, v6, s0 +; GFX942-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0] +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 +; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: 
v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX942-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v4f16__inlineimm2: ; GFX950: ; %bb.0: @@ -3538,19 +3538,19 @@ define <2 x float> @v_no_fmaximum3_f32__multi_use(float %a, float %b, float %c) ; GFX12-NEXT: v_maximum_f32 v1, v0, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_no_fmaximum3_f32__multi_use: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v3, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX940-NEXT: v_max_f32_e32 v1, v0, v2 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_no_fmaximum3_f32__multi_use: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f32_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_no_fmaximum3_f32__multi_use: ; GFX950: ; %bb.0: @@ -3573,22 +3573,22 @@ define amdgpu_ps <2 x i32> @s_no_fmaximum3_f32__multi_use(float inreg %a, float ; GFX12-NEXT: s_maximum_f32 s1, s0, s2 ; GFX12-NEXT: ; return to shader part epilog ; -; GFX940-LABEL: s_no_fmaximum3_f32__multi_use: -; GFX940: ; %bb.0: -; GFX940-NEXT: v_mov_b32_e32 v0, s1 -; GFX940-NEXT: v_max_f32_e32 v1, s0, v0 -; GFX940-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX940-NEXT: 
v_max_f32_e32 v1, s2, v0 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, s2, v0 -; GFX940-NEXT: v_readfirstlane_b32 s0, v0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_readfirstlane_b32 s1, v1 -; GFX940-NEXT: ; return to shader part epilog +; GFX942-LABEL: s_no_fmaximum3_f32__multi_use: +; GFX942: ; %bb.0: +; GFX942-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NEXT: v_max_f32_e32 v1, s0, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX942-NEXT: v_max_f32_e32 v1, s2, v0 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, s2, v0 +; GFX942-NEXT: v_readfirstlane_b32 s0, v0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_readfirstlane_b32 s1, v1 +; GFX942-NEXT: ; return to shader part epilog ; ; GFX950-LABEL: s_no_fmaximum3_f32__multi_use: ; GFX950: ; %bb.0: @@ -3697,30 +3697,30 @@ define <4 x half> @v_no_fmaximum3_v2f16__multi_use(<2 x half> %a, <2 x half> %b, ; GFX12-NEXT: v_pk_maximum_f16 v1, v0, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_no_fmaximum3_v2f16__multi_use: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v3, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GFX940-NEXT: v_perm_b32 v0, v1, v5, s0 -; GFX940-NEXT: v_pk_max_f16 v3, v0, v2 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX940-NEXT: 
v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GFX940-NEXT: v_perm_b32 v1, v1, v5, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_no_fmaximum3_v2f16__multi_use: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_max_f16 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX942-NEXT: v_perm_b32 v0, v1, v5, s0 +; GFX942-NEXT: v_pk_max_f16 v3, v0, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX942-NEXT: v_perm_b32 v1, v1, v5, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_no_fmaximum3_v2f16__multi_use: ; GFX950: ; %bb.0: diff --git llvm/test/CodeGen/AMDGPU/fminimum3.ll llvm/test/CodeGen/AMDGPU/fminimum3.ll index 234a07849a91..56e0b2c2f06c 100644 --- llvm/test/CodeGen/AMDGPU/fminimum3.ll +++ llvm/test/CodeGen/AMDGPU/fminimum3.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX942 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX9,GFX950 %s define 
float @v_fminimum3_f32(float %a, float %b, float %c) { @@ -14,19 +14,19 @@ define float @v_fminimum3_f32(float %a, float %b, float %c) { ; GFX12-NEXT: v_minimum3_f32 v0, v0, v1, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v3, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX940-NEXT: v_min_f32_e32 v1, v0, v2 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f32_e32 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f32_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_f32: ; GFX950: ; %bb.0: @@ -49,19 +49,19 @@ define float @v_fminimum3_f32_commute(float %a, float %b, float %c) { ; GFX12-NEXT: v_minimum3_f32 v0, v2, v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_f32_commute: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v3, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX940-NEXT: v_min_f32_e32 v1, v2, v0 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
v_fminimum3_f32_commute: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f32_e32 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f32_e32 v1, v2, v0 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v2, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_f32_commute: ; GFX950: ; %bb.0: @@ -83,21 +83,21 @@ define amdgpu_ps i32 @s_fminimum3_f32(float inreg %a, float inreg %b, float inre ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: ; return to shader part epilog ; -; GFX940-LABEL: s_fminimum3_f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: v_mov_b32_e32 v0, s1 -; GFX940-NEXT: v_min_f32_e32 v1, s0, v0 -; GFX940-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX940-NEXT: v_min_f32_e32 v1, s2, v0 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, s2, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_readfirstlane_b32 s0, v0 -; GFX940-NEXT: ; return to shader part epilog +; GFX942-LABEL: s_fminimum3_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NEXT: v_min_f32_e32 v1, s0, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX942-NEXT: v_min_f32_e32 v1, s2, v0 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, s2, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_readfirstlane_b32 s0, v0 +; GFX942-NEXT: ; return to shader part epilog ; ; GFX950-LABEL: s_fminimum3_f32: ; GFX950: ; %bb.0: @@ -125,19 +125,19 @@ define float @v_fminimum3_f32_fabs0(float 
%a, float %b, float %c) { ; GFX12-NEXT: v_minimum3_f32 v0, |v0|, v1, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_f32_fabs0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e64 v3, |v0|, v1 -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, |v0|, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX940-NEXT: v_min_f32_e32 v1, v0, v2 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f32_fabs0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f32_e64 v3, |v0|, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, |v0|, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f32_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_f32_fabs0: ; GFX950: ; %bb.0: @@ -161,19 +161,19 @@ define float @v_fminimum3_f32_fabs1(float %a, float %b, float %c) { ; GFX12-NEXT: v_minimum3_f32 v0, v0, |v1|, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_f32_fabs1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e64 v3, v0, |v1| -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v0, |v1| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX940-NEXT: v_min_f32_e32 v1, v0, v2 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f32_fabs1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f32_e64 v3, v0, |v1| +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v0, |v1| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f32_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_f32_fabs1: ; GFX950: ; %bb.0: @@ -197,19 +197,19 @@ define float @v_fminimum3_f32_fabs2(float %a, float %b, float %c) { ; GFX12-NEXT: v_minimum3_f32 v0, v0, v1, |v2| ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_f32_fabs2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v3, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX940-NEXT: v_min_f32_e64 v1, v0, |v2| -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v0, |v2| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f32_fabs2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f32_e32 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f32_e64 v1, v0, |v2| +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v0, |v2| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_f32_fabs2: ; GFX950: ; %bb.0: @@ -233,19 +233,19 @@ define float @v_fminimum3_f32_fabs_all(float %a, float %b, float %c) { ; GFX12-NEXT: v_minimum3_f32 v0, |v0|, |v1|, |v2| ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_f32_fabs_all: -; GFX940: ; 
%bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e64 v3, |v0|, |v1| -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v1| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX940-NEXT: v_min_f32_e64 v1, v0, |v2| -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v0, |v2| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f32_fabs_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f32_e64 v3, |v0|, |v1| +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v1| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f32_e64 v1, v0, |v2| +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v0, |v2| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_f32_fabs_all: ; GFX950: ; %bb.0: @@ -271,19 +271,19 @@ define float @v_fminimum3_f32_fneg_all(float %a, float %b, float %c) { ; GFX12-NEXT: v_minimum3_f32 v0, -v0, -v1, -v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_f32_fneg_all: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e64 v3, -v0, -v1 -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, -v0, -v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX940-NEXT: v_min_f32_e64 v1, v0, -v2 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v0, -v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f32_fneg_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f32_e64 v3, -v0, -v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX942-NEXT: 
v_cmp_o_f32_e64 vcc, -v0, -v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f32_e64 v1, v0, -v2 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v0, -v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_f32_fneg_all: ; GFX950: ; %bb.0: @@ -309,19 +309,19 @@ define float @v_fminimum3_f32_fneg_fabs_all(float %a, float %b, float %c) { ; GFX12-NEXT: v_minimum3_f32 v0, -|v0|, -|v1|, -|v2| ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_f32_fneg_fabs_all: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e64 v3, -|v0|, -|v1| -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, -|v0|, -|v1| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX940-NEXT: v_min_f32_e64 v1, v0, -|v2| -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v0, -|v2| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f32_fneg_fabs_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f32_e64 v3, -|v0|, -|v1| +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, -|v0|, -|v1| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f32_e64 v1, v0, -|v2| +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v0, -|v2| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_f32_fneg_fabs_all: ; GFX950: ; %bb.0: @@ -350,19 +350,19 @@ define float @v_fminimum3_f32_fneg0(float %a, float %b, float %c) { ; GFX12-NEXT: v_minimum3_f32 v0, -v0, v1, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_f32_fneg0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX940-NEXT: v_min_f32_e64 v3, -v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, -v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX940-NEXT: v_min_f32_e32 v1, v0, v2 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f32_fneg0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f32_e64 v3, -v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, -v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f32_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_f32_fneg0: ; GFX950: ; %bb.0: @@ -386,19 +386,19 @@ define float @v_fminimum3_f32_fneg1(float %a, float %b, float %c) { ; GFX12-NEXT: v_minimum3_f32 v0, v0, -v1, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_f32_fneg1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e64 v3, v0, -v1 -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v0, -v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX940-NEXT: v_min_f32_e32 v1, v0, v2 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f32_fneg1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f32_e64 v3, v0, -v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v0, -v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; 
GFX942-NEXT: v_min_f32_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_f32_fneg1: ; GFX950: ; %bb.0: @@ -422,19 +422,19 @@ define float @v_fminimum3_f32_fneg2(float %a, float %b, float %c) { ; GFX12-NEXT: v_minimum3_f32 v0, v0, v1, -v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_f32_fneg2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v3, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX940-NEXT: v_min_f32_e64 v1, v0, -v2 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v0, -v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f32_fneg2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f32_e32 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f32_e64 v1, v0, -v2 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v0, -v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_f32_fneg2: ; GFX950: ; %bb.0: @@ -458,19 +458,19 @@ define float @v_fminimum3_f32_const0(float %b, float %c) { ; GFX12-NEXT: v_minimum3_f32 v0, v0, 0x41000000, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_f32_const0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v2, 0x41000000, v0 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: 
v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: v_min_f32_e32 v2, v0, v1 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f32_const0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f32_e32 v2, 0x41000000, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_f32_const0: ; GFX950: ; %bb.0: @@ -494,19 +494,19 @@ define float @v_fminimum3_f32__const2(float %a, float %b) { ; GFX12-NEXT: v_minimum3_f32 v0, v0, v1, 0x41000000 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_f32__const2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: v_min_f32_e32 v1, 0x41000000, v0 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f32__const2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_min_f32_e32 v1, 0x41000000, v0 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v1, 
vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_f32__const2: ; GFX950: ; %bb.0: @@ -530,19 +530,19 @@ define float @v_fminimum3_f32_inlineimm0(float %b, float %c) { ; GFX12-NEXT: v_minimum3_f32 v0, v0, 4.0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_f32_inlineimm0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v2, 4.0, v0 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: v_min_f32_e32 v2, v0, v1 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f32_inlineimm0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f32_e32 v2, 4.0, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_f32_inlineimm0: ; GFX950: ; %bb.0: @@ -565,19 +565,19 @@ define float @v_fminimum3_f32__inlineimm(float %a, float %b) { ; GFX12-NEXT: v_minimum3_f32 v0, v0, v1, 4.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_f32__inlineimm: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: v_min_f32_e32 v1, 4.0, v0 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: 
v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f32__inlineimm: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_min_f32_e32 v1, 4.0, v0 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_f32__inlineimm: ; GFX950: ; %bb.0: @@ -602,19 +602,19 @@ define float @v_fminimum3_f32_const1_const2(float %a) { ; GFX12-NEXT: v_minimum3_f32 v0, v0, s0, 0x41800000 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_f32_const1_const2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v1, 0x41000000, v0 -; GFX940-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX940-NEXT: v_min_f32_e32 v1, 0x41800000, v0 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f32_const1_const2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f32_e32 v1, 0x41000000, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX942-NEXT: v_min_f32_e32 v1, 0x41800000, v0 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_f32_const1_const2: ; GFX950: ; %bb.0: @@ -640,27 +640,27 @@ 
define <2 x float> @v_fminimum3_v2f32(<2 x float> %a, <2 x float> %b, <2 x float ; GFX12-NEXT: v_minimum3_f32 v1, v5, v1, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_v2f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v6, v1, v3 -; GFX940-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX940-NEXT: v_min_f32_e32 v3, v0, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc -; GFX940-NEXT: v_min_f32_e32 v2, v4, v0 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v4, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX940-NEXT: v_min_f32_e32 v2, v5, v1 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v5, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_v2f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f32_e32 v6, v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX942-NEXT: v_min_f32_e32 v3, v0, v2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc +; GFX942-NEXT: v_min_f32_e32 v2, v4, v0 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v4, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_min_f32_e32 v2, v5, v1 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v5, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v2f32: ; GFX950: ; %bb.0: @@ -685,27 +685,27 @@ define <2 x float> @v_fminimum3_v2f32_commute(<2 x float> %a, <2 x float> %b, <2 ; GFX12-NEXT: 
v_minimum3_f32 v1, v1, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_v2f32_commute: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v6, v1, v3 -; GFX940-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX940-NEXT: v_min_f32_e32 v3, v0, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc -; GFX940-NEXT: v_min_f32_e32 v2, v0, v4 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX940-NEXT: v_min_f32_e32 v2, v1, v5 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_v2f32_commute: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f32_e32 v6, v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX942-NEXT: v_min_f32_e32 v3, v0, v2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc +; GFX942-NEXT: v_min_f32_e32 v2, v0, v4 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_min_f32_e32 v2, v1, v5 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v2f32_commute: ; GFX950: ; %bb.0: @@ -730,27 +730,27 @@ define <2 x float> @v_fminimum3_v2f32__fabs_all(<2 x float> %a, <2 x float> %b, ; GFX12-NEXT: v_minimum3_f32 v1, |v1|, |v3|, |v5| ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX940-LABEL: v_fminimum3_v2f32__fabs_all: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e64 v6, |v1|, |v3| -; GFX940-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, |v1|, |v3| -; GFX940-NEXT: v_min_f32_e64 v3, |v0|, |v2| -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v2| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc -; GFX940-NEXT: v_min_f32_e64 v2, v0, |v4| -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v0, |v4| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX940-NEXT: v_min_f32_e64 v2, v1, |v5| -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v1, |v5| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_v2f32__fabs_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f32_e64 v6, |v1|, |v3| +; GFX942-NEXT: v_mov_b32_e32 v7, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, |v1|, |v3| +; GFX942-NEXT: v_min_f32_e64 v3, |v0|, |v2| +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v2| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc +; GFX942-NEXT: v_min_f32_e64 v2, v0, |v4| +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v0, |v4| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_min_f32_e64 v2, v1, |v5| +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v1, |v5| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v2f32__fabs_all: ; GFX950: ; %bb.0: @@ -778,27 +778,27 @@ define <2 x float> @v_fminimum3_v2f32__fneg_all(<2 x float> %a, <2 x float> %b, ; GFX12-NEXT: v_minimum3_f32 v1, -v1, -v3, -v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
v_fminimum3_v2f32__fneg_all: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e64 v6, -v1, -v3 -; GFX940-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, -v1, -v3 -; GFX940-NEXT: v_min_f32_e64 v3, -v0, -v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, -v0, -v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc -; GFX940-NEXT: v_min_f32_e64 v2, v0, -v4 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v0, -v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX940-NEXT: v_min_f32_e64 v2, v1, -v5 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v1, -v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_v2f32__fneg_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f32_e64 v6, -v1, -v3 +; GFX942-NEXT: v_mov_b32_e32 v7, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, -v1, -v3 +; GFX942-NEXT: v_min_f32_e64 v3, -v0, -v2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, -v0, -v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc +; GFX942-NEXT: v_min_f32_e64 v2, v0, -v4 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v0, -v4 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_min_f32_e64 v2, v1, -v5 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v1, -v5 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v2f32__fneg_all: ; GFX950: ; %bb.0: @@ -826,27 +826,27 @@ define <2 x float> @v_fminimum3_v2f32__inlineimm1(<2 x float> %a, <2 x float> %c ; GFX12-NEXT: v_minimum3_f32 v1, v1, 2.0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_v2f32__inlineimm1: -; GFX940: ; 
%bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v4, 2.0, v1 -; GFX940-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX940-NEXT: v_min_f32_e32 v4, 2.0, v0 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX940-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX940-NEXT: v_min_f32_e32 v2, v1, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_v2f32__inlineimm1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f32_e32 v4, 2.0, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX942-NEXT: v_min_f32_e32 v4, 2.0, v0 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX942-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX942-NEXT: v_min_f32_e32 v2, v1, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v2f32__inlineimm1: ; GFX950: ; %bb.0: @@ -871,27 +871,27 @@ define <2 x float> @v_fminimum3_v2f32__inlineimm2(<2 x float> %a, <2 x float> %b ; GFX12-NEXT: v_minimum3_f32 v1, v1, v3, 4.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_v2f32__inlineimm2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v4, v1, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX940-NEXT: v_min_f32_e32 v3, v0, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX940-NEXT: v_min_f32_e32 v2, 4.0, v0 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX940-NEXT: v_min_f32_e32 v2, 4.0, v1 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_v2f32__inlineimm2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f32_e32 v4, v1, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX942-NEXT: v_min_f32_e32 v3, v0, v2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX942-NEXT: v_min_f32_e32 v2, 4.0, v0 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX942-NEXT: v_min_f32_e32 v2, 4.0, v1 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v2f32__inlineimm2: ; GFX950: ; %bb.0: @@ -917,35 +917,35 @@ define <3 x float> @v_fminimum3_v3f32(<3 x float> %a, <3 x float> %b, <3 x float ; GFX12-NEXT: v_minimum3_f32 v2, v8, v2, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_v3f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v9, v2, v5 -; GFX940-NEXT: 
v_mov_b32_e32 v10, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX940-NEXT: v_min_f32_e32 v5, v1, v4 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX940-NEXT: v_min_f32_e32 v4, v0, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc -; GFX940-NEXT: v_min_f32_e32 v3, v6, v0 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v6, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc -; GFX940-NEXT: v_min_f32_e32 v3, v7, v1 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v7, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc -; GFX940-NEXT: v_min_f32_e32 v3, v8, v2 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v8, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_v3f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f32_e32 v9, v2, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 +; GFX942-NEXT: v_min_f32_e32 v5, v1, v4 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX942-NEXT: v_min_f32_e32 v4, v0, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc +; GFX942-NEXT: v_min_f32_e32 v3, v6, v0 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v6, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc +; GFX942-NEXT: v_min_f32_e32 v3, v7, v1 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v7, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc +; GFX942-NEXT: v_min_f32_e32 v3, v8, v2 +; 
GFX942-NEXT: v_cmp_o_f32_e32 vcc, v8, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v3f32: ; GFX950: ; %bb.0: @@ -972,35 +972,35 @@ define <3 x float> @v_fminimum3_v3f32_commute(<3 x float> %a, <3 x float> %b, <3 ; GFX12-NEXT: v_minimum3_f32 v2, v2, v5, v8 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_v3f32_commute: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v9, v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v10, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX940-NEXT: v_min_f32_e32 v5, v1, v4 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX940-NEXT: v_min_f32_e32 v4, v0, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc -; GFX940-NEXT: v_min_f32_e32 v3, v0, v6 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v6 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc -; GFX940-NEXT: v_min_f32_e32 v3, v1, v7 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v7 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc -; GFX940-NEXT: v_min_f32_e32 v3, v2, v8 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v8 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_v3f32_commute: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f32_e32 v9, v2, v5 +; GFX942-NEXT: v_mov_b32_e32 v10, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 +; GFX942-NEXT: v_min_f32_e32 v5, v1, v4 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX942-NEXT: v_min_f32_e32 
v4, v0, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc +; GFX942-NEXT: v_min_f32_e32 v3, v0, v6 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v6 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc +; GFX942-NEXT: v_min_f32_e32 v3, v1, v7 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v1, v7 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc +; GFX942-NEXT: v_min_f32_e32 v3, v2, v8 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v2, v8 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v3f32_commute: ; GFX950: ; %bb.0: @@ -1027,35 +1027,35 @@ define <3 x float> @v_fminimum3_v3f32__fabs_all(<3 x float> %a, <3 x float> %b, ; GFX12-NEXT: v_minimum3_f32 v2, |v2|, |v5|, |v8| ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_v3f32__fabs_all: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e64 v9, |v2|, |v5| -; GFX940-NEXT: v_mov_b32_e32 v10, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, |v2|, |v5| -; GFX940-NEXT: v_min_f32_e64 v5, |v1|, |v4| -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, |v1|, |v4| -; GFX940-NEXT: v_min_f32_e64 v4, |v0|, |v3| -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v3| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc -; GFX940-NEXT: v_min_f32_e64 v3, v0, |v6| -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v0, |v6| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc -; GFX940-NEXT: v_min_f32_e64 v3, v1, |v7| -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v1, |v7| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc -; GFX940-NEXT: 
v_min_f32_e64 v3, v2, |v8| -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v2, |v8| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_v3f32__fabs_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f32_e64 v9, |v2|, |v5| +; GFX942-NEXT: v_mov_b32_e32 v10, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, |v2|, |v5| +; GFX942-NEXT: v_min_f32_e64 v5, |v1|, |v4| +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, |v1|, |v4| +; GFX942-NEXT: v_min_f32_e64 v4, |v0|, |v3| +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v3| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc +; GFX942-NEXT: v_min_f32_e64 v3, v0, |v6| +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v0, |v6| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc +; GFX942-NEXT: v_min_f32_e64 v3, v1, |v7| +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v1, |v7| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc +; GFX942-NEXT: v_min_f32_e64 v3, v2, |v8| +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v2, |v8| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v3f32__fabs_all: ; GFX950: ; %bb.0: @@ -1085,35 +1085,35 @@ define <3 x float> @v_fminimum3_v3f32__fneg_all(<3 x float> %a, <3 x float> %b, ; GFX12-NEXT: v_minimum3_f32 v2, -v2, -v5, -v8 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_v3f32__fneg_all: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e64 v9, -v2, -v5 -; GFX940-NEXT: v_mov_b32_e32 v10, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, -v2, -v5 -; GFX940-NEXT: v_min_f32_e64 v5, -v1, -v4 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 
v2, v10, v9, vcc -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, -v1, -v4 -; GFX940-NEXT: v_min_f32_e64 v4, -v0, -v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, -v0, -v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc -; GFX940-NEXT: v_min_f32_e64 v3, v0, -v6 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v0, -v6 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc -; GFX940-NEXT: v_min_f32_e64 v3, v1, -v7 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v1, -v7 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc -; GFX940-NEXT: v_min_f32_e64 v3, v2, -v8 -; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v2, -v8 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_v3f32__fneg_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f32_e64 v9, -v2, -v5 +; GFX942-NEXT: v_mov_b32_e32 v10, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, -v2, -v5 +; GFX942-NEXT: v_min_f32_e64 v5, -v1, -v4 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, -v1, -v4 +; GFX942-NEXT: v_min_f32_e64 v4, -v0, -v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, -v0, -v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc +; GFX942-NEXT: v_min_f32_e64 v3, v0, -v6 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v0, -v6 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc +; GFX942-NEXT: v_min_f32_e64 v3, v1, -v7 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v1, -v7 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc +; GFX942-NEXT: v_min_f32_e64 v3, v2, -v8 +; GFX942-NEXT: v_cmp_o_f32_e64 vcc, v2, -v8 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc +; GFX942-NEXT: s_setpc_b64 
s[30:31] ; ; GFX950-LABEL: v_fminimum3_v3f32__fneg_all: ; GFX950: ; %bb.0: @@ -1143,35 +1143,35 @@ define <3 x float> @v_fminimum3_v3f32__inlineimm1(<3 x float> %a, <3 x float> %c ; GFX12-NEXT: v_minimum3_f32 v2, v2, 2.0, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_v3f32__inlineimm1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v6, 2.0, v2 -; GFX940-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v6, vcc -; GFX940-NEXT: v_min_f32_e32 v6, 2.0, v1 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc -; GFX940-NEXT: v_min_f32_e32 v6, 2.0, v0 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX940-NEXT: v_min_f32_e32 v6, v0, v3 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX940-NEXT: v_min_f32_e32 v3, v1, v4 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX940-NEXT: v_min_f32_e32 v3, v2, v5 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_v3f32__inlineimm1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f32_e32 v6, 2.0, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v2, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v6, vcc +; GFX942-NEXT: v_min_f32_e32 v6, 2.0, v1 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX942-NEXT: v_min_f32_e32 v6, 2.0, v0 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 
+; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_min_f32_e32 v6, v0, v3 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX942-NEXT: v_min_f32_e32 v3, v1, v4 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; GFX942-NEXT: v_min_f32_e32 v3, v2, v5 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v3f32__inlineimm1: ; GFX950: ; %bb.0: @@ -1198,35 +1198,35 @@ define <3 x float> @v_fminimum3_v3f32__inlineimm2(<3 x float> %a, <3 x float> %b ; GFX12-NEXT: v_minimum3_f32 v2, v2, v5, 4.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_v3f32__inlineimm2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v6, v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX940-NEXT: v_min_f32_e32 v5, v1, v4 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v6, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX940-NEXT: v_min_f32_e32 v4, v0, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v4, vcc -; GFX940-NEXT: v_min_f32_e32 v3, 4.0, v0 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc -; GFX940-NEXT: v_min_f32_e32 v3, 4.0, v1 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX940-NEXT: v_min_f32_e32 v3, 4.0, v2 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc -; GFX940-NEXT: s_setpc_b64 
s[30:31] +; GFX942-LABEL: v_fminimum3_v3f32__inlineimm2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f32_e32 v6, v2, v5 +; GFX942-NEXT: v_mov_b32_e32 v7, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 +; GFX942-NEXT: v_min_f32_e32 v5, v1, v4 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v6, vcc +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX942-NEXT: v_min_f32_e32 v4, v0, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v4, vcc +; GFX942-NEXT: v_min_f32_e32 v3, 4.0, v0 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc +; GFX942-NEXT: v_min_f32_e32 v3, 4.0, v1 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; GFX942-NEXT: v_min_f32_e32 v3, 4.0, v2 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v2, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v3f32__inlineimm2: ; GFX950: ; %bb.0: @@ -1774,30 +1774,30 @@ define <2 x half> @v_fminimum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c ; GFX12-NEXT: v_pk_minimum_f16 v0, v2, v0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_v2f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v3, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX940-NEXT: v_perm_b32 
v1, v0, v5, s0 -; GFX940-NEXT: v_pk_min_f16 v1, v2, v1 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v2, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v2, v0 src0_sel:WORD_1 src1_sel:DWORD -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v3, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_v2f16: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_min_f16 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_perm_b32 v1, v0, v5, s0 +; GFX942-NEXT: v_pk_min_f16 v1, v2, v1 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v2, v5 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v2, v0 src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v2f16: ; GFX950: ; %bb.0: @@ -1824,30 +1824,30 @@ define <2 x half> @v_fminimum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x ; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_v2f16_commute: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v3, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; 
GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX940-NEXT: v_perm_b32 v1, v0, v5, s0 -; GFX940-NEXT: v_pk_min_f16 v1, v1, v2 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v3, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_v2f16_commute: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_min_f16 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_perm_b32 v1, v0, v5, s0 +; GFX942-NEXT: v_pk_min_f16 v1, v1, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v2f16_commute: ; GFX950: ; %bb.0: @@ -1877,32 +1877,32 @@ define <2 x half> @v_fminimum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2 ; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2 ; GFX12-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_v2f16__fabs_all: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0 -; GFX940-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v1 -; GFX940-NEXT: v_pk_min_f16 v3, v3, v4 -; GFX940-NEXT: v_mov_b32_e32 v6, 0x7e00 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc -; GFX940-NEXT: v_perm_b32 v1, v4, v0, s0 -; GFX940-NEXT: v_pk_min_f16 v1, v1, v5 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v4, |v2| src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc -; GFX940-NEXT: v_perm_b32 v0, v3, v0, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_v2f16__fabs_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0 +; GFX942-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v1 +; GFX942-NEXT: v_pk_min_f16 v3, v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v6, 0x7e00 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc +; GFX942-NEXT: v_perm_b32 v1, v4, v0, s0 +; GFX942-NEXT: v_pk_min_f16 v1, v1, v5 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, 
v4, |v2| src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc +; GFX942-NEXT: v_perm_b32 v0, v3, v0, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v2f16__fabs_all: ; GFX950: ; %bb.0: @@ -1935,30 +1935,30 @@ define <2 x half> @v_fminimum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2 ; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_v2f16__fneg_all: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v3, v0, v1 neg_lo:[1,1] neg_hi:[1,1] -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX940-NEXT: v_perm_b32 v1, v0, v5, s0 -; GFX940-NEXT: v_pk_min_f16 v1, v1, v2 neg_lo:[0,1] neg_hi:[0,1] -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v5, -v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v3, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_v2f16__fneg_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_min_f16 v3, v0, v1 neg_lo:[1,1] neg_hi:[1,1] +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1 +; 
GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_perm_b32 v1, v0, v5, s0 +; GFX942-NEXT: v_pk_min_f16 v1, v1, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v5, -v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v2f16__fneg_all: ; GFX950: ; %bb.0: @@ -1988,30 +1988,30 @@ define <2 x half> @v_fminimum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) { ; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_v2f16__inlineimm1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v2, v0, 2.0 op_sel_hi:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX940-NEXT: v_perm_b32 v2, v3, v0, s0 -; GFX940-NEXT: v_pk_min_f16 v2, v2, v1 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v3, v1 src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: 
v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX940-NEXT: v_perm_b32 v0, v3, v0, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_v2f16__inlineimm1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_min_f16 v2, v0, 2.0 op_sel_hi:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX942-NEXT: v_perm_b32 v2, v3, v0, s0 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v3, v1 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX942-NEXT: v_perm_b32 v0, v3, v0, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v2f16__inlineimm1: ; GFX950: ; %bb.0: @@ -2038,30 +2038,30 @@ define <2 x half> @v_fminimum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b) { ; GFX12-NEXT: v_pk_minimum_f16 v0, v0, 4.0 op_sel_hi:[1,0] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_v2f16__inlineimm2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; 
GFX940-NEXT: v_perm_b32 v1, v0, v4, s0 -; GFX940-NEXT: v_pk_min_f16 v1, v1, 4.0 op_sel_hi:[1,0] -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v2, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_v2f16__inlineimm2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_min_f16 v2, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_perm_b32 v1, v0, v4, s0 +; GFX942-NEXT: v_pk_min_f16 v1, v1, 4.0 op_sel_hi:[1,0] +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GFX942-NEXT: v_perm_b32 v0, v0, v2, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v2f16__inlineimm2: ; GFX950: ; %bb.0: @@ -2090,42 +2090,42 @@ define <3 x half> @v_fminimum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c ; GFX12-NEXT: v_pk_minimum_f16 v1, v5, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_v3f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v6, v0, v2 -; GFX940-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX940-NEXT: s_mov_b32 s0, 
0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v2, v1, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX940-NEXT: v_perm_b32 v1, v1, v6, s0 -; GFX940-NEXT: v_pk_min_f16 v1, v5, v1 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 -; GFX940-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX940-NEXT: v_pk_min_f16 v2, v4, v2 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v3, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_v3f16: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_min_f16 v6, v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; 
GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0 +; GFX942-NEXT: v_pk_min_f16 v1, v5, v1 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX942-NEXT: v_pk_min_f16 v2, v4, v2 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v3f16: ; GFX950: ; %bb.0: @@ -2155,42 +2155,42 @@ define <3 x half> @v_fminimum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x ; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_v3f16_commute: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v6, v0, v2 -; GFX940-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v2, v1, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; 
GFX940-NEXT: v_perm_b32 v1, v1, v6, s0 -; GFX940-NEXT: v_pk_min_f16 v1, v1, v5 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 -; GFX940-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v3, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_v3f16_commute: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_min_f16 v6, v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0 +; GFX942-NEXT: v_pk_min_f16 v1, v1, v5 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: 
v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v3f16_commute: ; GFX950: ; %bb.0: @@ -2227,46 +2227,46 @@ define <3 x half> @v_fminimum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3 ; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_v3f16__fabs_all: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v1 -; GFX940-NEXT: v_and_b32_e32 v9, 0x7fff7fff, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v0 -; GFX940-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v2 -; GFX940-NEXT: v_pk_min_f16 v7, v7, v9 -; GFX940-NEXT: v_mov_b32_e32 v12, 0x7e00 -; GFX940-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v6, v6, v8 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v4 -; GFX940-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v5 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v12, v7, vcc -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc -; GFX940-NEXT: v_perm_b32 v2, v8, v0, s0 -; GFX940-NEXT: v_pk_min_f16 v2, v2, v11 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v4| src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX940-NEXT: v_perm_b32 v6, v9, v1, s0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; GFX940-NEXT: v_pk_min_f16 v6, v6, v10 -; GFX940-NEXT: 
v_cmp_o_f16_e64 vcc, v1, |v5| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v12, v2, vcc -; GFX940-NEXT: v_perm_b32 v0, v3, v0, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_v3f16__fabs_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v1 +; GFX942-NEXT: v_and_b32_e32 v9, 0x7fff7fff, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v0 +; GFX942-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v2 +; GFX942-NEXT: v_pk_min_f16 v7, v7, v9 +; GFX942-NEXT: v_mov_b32_e32 v12, 0x7e00 +; GFX942-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_min_f16 v6, v6, v8 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v4 +; GFX942-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v7, vcc +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc +; GFX942-NEXT: v_perm_b32 v2, v8, v0, s0 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v11 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v4| src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX942-NEXT: v_perm_b32 v6, v9, v1, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; GFX942-NEXT: v_pk_min_f16 v6, v6, v10 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| +; 
GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v2, vcc +; GFX942-NEXT: v_perm_b32 v0, v3, v0, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v3f16__fabs_all: ; GFX950: ; %bb.0: @@ -2305,42 +2305,42 @@ define <3 x half> @v_fminimum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3 ; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_v3f16__fneg_all: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1] -; GFX940-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX940-NEXT: v_perm_b32 v1, v1, v6, s0 -; GFX940-NEXT: v_pk_min_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 -; GFX940-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: 
v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v3, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_v3f16__fneg_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_min_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1] +; GFX942-NEXT: v_mov_b32_e32 v7, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0 +; GFX942-NEXT: v_pk_min_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v3f16__fneg_all: ; GFX950: ; %bb.0: @@ -2373,39 +2373,39 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) { ; GFX12-NEXT: v_pk_minimum_f16 v1, v1, 
v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_v3f16__inlineimm1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v6, 0x7e00 -; GFX940-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v7, v1, 2.0 -; GFX940-NEXT: s_mov_b32 s1, 0x5040100 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 -; GFX940-NEXT: s_movk_i32 s0, 0x7e00 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc -; GFX940-NEXT: v_perm_b32 v4, v5, v0, s1 -; GFX940-NEXT: v_pk_min_f16 v4, v4, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; GFX940-NEXT: v_pack_b32_f16 v7, v1, s0 -; GFX940-NEXT: v_pk_min_f16 v7, v7, v3 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc -; GFX940-NEXT: v_perm_b32 v0, v5, v0, s1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_v3f16__inlineimm1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v6, 0x7e00 +; GFX942-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_min_f16 v7, v1, 2.0 +; GFX942-NEXT: s_mov_b32 s1, 0x5040100 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 +; GFX942-NEXT: 
s_movk_i32 s0, 0x7e00 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX942-NEXT: v_perm_b32 v4, v5, v0, s1 +; GFX942-NEXT: v_pk_min_f16 v4, v4, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX942-NEXT: v_pack_b32_f16 v7, v1, s0 +; GFX942-NEXT: v_pk_min_f16 v7, v7, v3 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX942-NEXT: v_perm_b32 v0, v5, v0, s1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v3f16__inlineimm1: ; GFX950: ; %bb.0: @@ -2435,42 +2435,42 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) { ; GFX12-NEXT: v_pk_minimum_f16 v1, v1, 4.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_v3f16__inlineimm2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v4, v0, v2 -; GFX940-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v2, v1, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: 
s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX940-NEXT: v_perm_b32 v1, v1, v4, s0 -; GFX940-NEXT: v_pk_min_f16 v1, v1, 4.0 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 -; GFX940-NEXT: v_perm_b32 v2, v0, v6, s0 -; GFX940-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0] -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v3, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_v3f16__inlineimm2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_min_f16 v4, v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX942-NEXT: v_perm_b32 v1, v1, v4, s0 +; GFX942-NEXT: v_pk_min_f16 v1, v1, 4.0 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 +; GFX942-NEXT: v_perm_b32 v2, v0, v6, s0 +; GFX942-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0] +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v2, 
vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v3f16__inlineimm2: ; GFX950: ; %bb.0: @@ -2500,48 +2500,48 @@ define <4 x half> @v_fminimum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c ; GFX12-NEXT: v_pk_minimum_f16 v1, v5, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_v4f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v6, v0, v2 -; GFX940-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v2, v1, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX940-NEXT: v_perm_b32 v2, v1, v6, s0 -; GFX940-NEXT: v_pk_min_f16 v2, v5, v2 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:DWORD -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX940-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX940-NEXT: v_pk_min_f16 v2, v4, v2 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 -; GFX940-NEXT: v_perm_b32 v1, v1, v3, s0 -; GFX940-NEXT: s_nop 0 -; 
GFX940-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v5, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_v4f16: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_min_f16 v6, v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 +; GFX942-NEXT: v_pk_min_f16 v2, v5, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX942-NEXT: v_pk_min_f16 v2, v4, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 +; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: 
s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v4f16: ; GFX950: ; %bb.0: @@ -2571,48 +2571,48 @@ define <4 x half> @v_fminimum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x ; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_v4f16_commute: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v6, v0, v2 -; GFX940-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v2, v1, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX940-NEXT: v_perm_b32 v2, v1, v6, s0 -; GFX940-NEXT: v_pk_min_f16 v2, v2, v5 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX940-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 -; GFX940-NEXT: v_perm_b32 v1, v1, v3, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa 
vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v5, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_v4f16_commute: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_min_f16 v6, v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v5 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 +; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 +; GFX942-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v4f16_commute: ; GFX950: ; %bb.0: @@ -2649,52 +2649,52 @@ define <4 x half> @v_fminimum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4 ; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_v4f16__fabs_all: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v0 -; GFX940-NEXT: v_and_b32_e32 v9, 0x7fff7fff, v2 -; GFX940-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v1 -; GFX940-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v3 -; GFX940-NEXT: v_pk_min_f16 v7, v7, v9 -; GFX940-NEXT: v_mov_b32_e32 v12, 0x7e00 -; GFX940-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v6, v6, v8 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v5 -; GFX940-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v4 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v12, v7, vcc -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc -; GFX940-NEXT: v_perm_b32 v2, v8, v1, s0 -; GFX940-NEXT: v_pk_min_f16 v2, v2, v11 -; GFX940-NEXT: v_perm_b32 v6, v9, v0, s0 -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v6, v6, v10 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v7, 
v12, v7, vcc -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| -; GFX940-NEXT: v_perm_b32 v1, v3, v1, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc -; GFX940-NEXT: v_perm_b32 v0, v7, v0, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_v4f16__fabs_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v0 +; GFX942-NEXT: v_and_b32_e32 v9, 0x7fff7fff, v2 +; GFX942-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v1 +; GFX942-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v3 +; GFX942-NEXT: v_pk_min_f16 v7, v7, v9 +; GFX942-NEXT: v_mov_b32_e32 v12, 0x7e00 +; GFX942-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_min_f16 v6, v6, v8 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v5 +; GFX942-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v7, vcc +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc +; GFX942-NEXT: v_perm_b32 v2, v8, v1, s0 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v11 +; GFX942-NEXT: v_perm_b32 v6, v9, v0, s0 +; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_pk_min_f16 v6, v6, v10 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GFX942-NEXT: v_cmp_o_f16_sdwa 
vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| +; GFX942-NEXT: v_perm_b32 v1, v3, v1, s0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc +; GFX942-NEXT: v_perm_b32 v0, v7, v0, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v4f16__fabs_all: ; GFX950: ; %bb.0: @@ -2733,48 +2733,48 @@ define <4 x half> @v_fminimum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4 ; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_v4f16__fneg_all: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1] -; GFX940-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX940-NEXT: v_perm_b32 v2, v1, v6, s0 -; GFX940-NEXT: v_pk_min_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1] -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, 
v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, -v5 src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX940-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] -; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 -; GFX940-NEXT: v_perm_b32 v1, v1, v3, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v5, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_v4f16__fneg_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_min_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1] +; GFX942-NEXT: v_mov_b32_e32 v7, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1] +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, -v5 
src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 +; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v4f16__fneg_all: ; GFX950: ; %bb.0: @@ -2807,46 +2807,46 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) { ; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_v4f16__inlineimm1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v6, 0x7e00 -; GFX940-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v7, v1, 2.0 op_sel_hi:[1,0] -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v6, v8, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc -; GFX940-NEXT: v_perm_b32 v4, v8, v1, s0 -; GFX940-NEXT: v_pk_min_f16 v4, v4, v3 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v8, v3 src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: 
v_perm_b32 v8, v5, v0, s0 -; GFX940-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX940-NEXT: v_pk_min_f16 v8, v8, v2 -; GFX940-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v9, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX940-NEXT: v_perm_b32 v1, v7, v1, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc -; GFX940-NEXT: v_perm_b32 v0, v5, v0, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_v4f16__inlineimm1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v6, 0x7e00 +; GFX942-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_min_f16 v7, v1, 2.0 op_sel_hi:[1,0] +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v8, v6, v8, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; GFX942-NEXT: v_perm_b32 v4, v8, v1, s0 +; GFX942-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, v3 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_perm_b32 v8, v5, v0, s0 +; GFX942-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX942-NEXT: v_pk_min_f16 v8, v8, v2 +; GFX942-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc +; GFX942-NEXT: 
v_lshrrev_b32_e32 v9, 16, v8 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v9, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: v_perm_b32 v1, v7, v1, s0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc +; GFX942-NEXT: v_perm_b32 v0, v5, v0, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v4f16__inlineimm1: ; GFX950: ; %bb.0: @@ -2876,48 +2876,48 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) { ; GFX12-NEXT: v_pk_minimum_f16 v1, v1, 4.0 op_sel_hi:[1,0] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_fminimum3_v4f16__inlineimm2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v4, v0, v2 -; GFX940-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v2, v1, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX940-NEXT: v_perm_b32 v2, v1, v4, s0 -; GFX940-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0] -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: 
v_cmp_o_f16_e32 vcc, v1, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX940-NEXT: v_perm_b32 v2, v0, v6, s0 -; GFX940-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0] -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 -; GFX940-NEXT: v_perm_b32 v1, v1, v3, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v4, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_v4f16__inlineimm2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_min_f16 v4, v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX942-NEXT: v_perm_b32 v2, v1, v4, s0 +; GFX942-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0] +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX942-NEXT: v_perm_b32 v2, v0, v6, s0 +; GFX942-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0] +; 
GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 +; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX942-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v4f16__inlineimm2: ; GFX950: ; %bb.0: @@ -3538,19 +3538,19 @@ define <2 x float> @v_no_fminimum3_f32__multi_use(float %a, float %b, float %c) ; GFX12-NEXT: v_minimum_f32 v1, v0, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_no_fminimum3_f32__multi_use: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v3, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX940-NEXT: v_min_f32_e32 v1, v0, v2 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_no_fminimum3_f32__multi_use: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f32_e32 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f32_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_no_fminimum3_f32__multi_use: ; GFX950: ; %bb.0: @@ -3573,22 +3573,22 @@ define amdgpu_ps <2 x i32> @s_no_fminimum3_f32__multi_use(float inreg %a, float ; GFX12-NEXT: s_minimum_f32 s1, s0, s2 ; GFX12-NEXT: ; return to shader part epilog ; -; GFX940-LABEL: 
s_no_fminimum3_f32__multi_use: -; GFX940: ; %bb.0: -; GFX940-NEXT: v_mov_b32_e32 v0, s1 -; GFX940-NEXT: v_min_f32_e32 v1, s0, v0 -; GFX940-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX940-NEXT: v_min_f32_e32 v1, s2, v0 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, s2, v0 -; GFX940-NEXT: v_readfirstlane_b32 s0, v0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_readfirstlane_b32 s1, v1 -; GFX940-NEXT: ; return to shader part epilog +; GFX942-LABEL: s_no_fminimum3_f32__multi_use: +; GFX942: ; %bb.0: +; GFX942-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NEXT: v_min_f32_e32 v1, s0, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX942-NEXT: v_min_f32_e32 v1, s2, v0 +; GFX942-NEXT: v_cmp_o_f32_e32 vcc, s2, v0 +; GFX942-NEXT: v_readfirstlane_b32 s0, v0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_readfirstlane_b32 s1, v1 +; GFX942-NEXT: ; return to shader part epilog ; ; GFX950-LABEL: s_no_fminimum3_f32__multi_use: ; GFX950: ; %bb.0: @@ -3697,30 +3697,30 @@ define <4 x half> @v_no_fminimum3_v2f16__multi_use(<2 x half> %a, <2 x half> %b, ; GFX12-NEXT: v_pk_minimum_f16 v1, v0, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_no_fminimum3_v2f16__multi_use: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v3, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; 
GFX940-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GFX940-NEXT: v_perm_b32 v0, v1, v5, s0 -; GFX940-NEXT: v_pk_min_f16 v3, v0, v2 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GFX940-NEXT: v_perm_b32 v1, v1, v5, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_no_fminimum3_v2f16__multi_use: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_pk_min_f16 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX942-NEXT: v_perm_b32 v0, v1, v5, s0 +; GFX942-NEXT: v_pk_min_f16 v3, v0, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX942-NEXT: v_perm_b32 v1, v1, v5, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_no_fminimum3_v2f16__multi_use: ; GFX950: ; %bb.0: diff --git llvm/test/CodeGen/AMDGPU/fold-agpr-phis.mir llvm/test/CodeGen/AMDGPU/fold-agpr-phis.mir index e94546fd5e8a..a9b3eaf4c33a 100644 --- llvm/test/CodeGen/AMDGPU/fold-agpr-phis.mir +++ llvm/test/CodeGen/AMDGPU/fold-agpr-phis.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx908 
-verify-machineinstrs -run-pass si-fold-operands %s -o - | FileCheck %s --check-prefixes=GFX908 # RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -run-pass si-fold-operands %s -o - | FileCheck %s --check-prefixes=GFX90A -# RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -run-pass si-fold-operands %s -o - | FileCheck %s --check-prefixes=GFX90A +# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -run-pass si-fold-operands %s -o - | FileCheck %s --check-prefixes=GFX90A --- name: test_sgpr_init_multiuse diff --git llvm/test/CodeGen/AMDGPU/fold-zero-high-bits-clear-kill-flags.mir llvm/test/CodeGen/AMDGPU/fold-zero-high-bits-clear-kill-flags.mir index baaca76bfd8a..b3658080aae0 100644 --- llvm/test/CodeGen/AMDGPU/fold-zero-high-bits-clear-kill-flags.mir +++ llvm/test/CodeGen/AMDGPU/fold-zero-high-bits-clear-kill-flags.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 -# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -verify-machineinstrs -run-pass si-fold-operands -o - %s | FileCheck -enable-var-scope -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 -verify-machineinstrs -run-pass si-fold-operands -o - %s | FileCheck -enable-var-scope -check-prefix=GCN %s --- name: fold_zero_high_bits_src1_alive diff --git llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll similarity index 65% rename from llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll rename to llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll index 4216bdf409ed..d683bf4f778b 100644 --- llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll +++ llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll @@ -1,22 +1,22 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 | FileCheck %s -check-prefix=GFX940 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx942 | FileCheck %s -check-prefix=GFX942 ; RUN: llc < %s 
-mtriple=amdgcn -mcpu=gfx1200 | FileCheck %s -check-prefix=GFX12 declare <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32, i32, i1) declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { -; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: flat_atomic_fadd_f32_noret_pat: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_endpgm ; ; GFX12-LABEL: flat_atomic_fadd_f32_noret_pat: ; GFX12: ; %bb.0: @@ -34,17 +34,17 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { } define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { -; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; 
GFX942-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_endpgm ; ; GFX12-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: ; GFX12: ; %bb.0: @@ -61,15 +61,15 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ret void } define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { -; GFX940-LABEL: flat_atomic_fadd_f32_rtn_pat: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_atomic_fadd_f32_rtn_pat: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: flat_atomic_fadd_f32_rtn_pat: ; GFX12: ; %bb.0: @@ -90,15 +90,15 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { } define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) { -; GFX940-LABEL: local_atomic_fadd_v2f16_noret: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: v_mov_b32_e32 v1, s1 -; GFX940-NEXT: ds_pk_add_f16 v0, v1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: local_atomic_fadd_v2f16_noret: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NEXT: ds_pk_add_f16 v0, v1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_endpgm ; ; GFX12-LABEL: local_atomic_fadd_v2f16_noret: ; GFX12: ; %bb.0: @@ -114,12 +114,12 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, } define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> %data) { -; GFX940-LABEL: local_atomic_fadd_v2f16_rtn: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: local_atomic_fadd_v2f16_rtn: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: local_atomic_fadd_v2f16_rtn: ; GFX12: ; %bb.0: @@ -138,15 +138,15 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> } define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) { -; GFX940-LABEL: local_atomic_fadd_v2bf16_noret: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: v_mov_b32_e32 v1, s1 -; GFX940-NEXT: ds_pk_add_bf16 v0, v1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: local_atomic_fadd_v2bf16_noret: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NEXT: ds_pk_add_bf16 v0, v1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_endpgm ; ; GFX12-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX12: ; %bb.0: @@ -162,12 +162,12 @@ define 
amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, } define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> %data) { -; GFX940-LABEL: local_atomic_fadd_v2bf16_rtn: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: local_atomic_fadd_v2bf16_rtn: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: local_atomic_fadd_v2bf16_rtn: ; GFX12: ; %bb.0: diff --git llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index 4aec2ffead43..694994e0a82b 100644 --- llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=None | FileCheck %s -check-prefix=GFX90A -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 -amdgpu-atomic-optimizer-strategy=None | FileCheck %s -check-prefix=GFX940 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx942 -amdgpu-atomic-optimizer-strategy=None | FileCheck %s -check-prefix=GFX942 declare double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32, i32 immarg) declare double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg) @@ -28,16 +28,16 @@ define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, doub ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_buffer_atomic_add_noret_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword 
s8, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_buffer_atomic_add_noret_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) ret void @@ -51,12 +51,12 @@ define amdgpu_ps void @raw_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, doub ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_buffer_atomic_add_rtn_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_buffer_atomic_add_rtn_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) store double %ret, ptr undef @@ -79,20 +79,20 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: -; GFX940: ; %bb.0: ; 
%main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -111,16 +111,16 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_ptr_buffer_atomic_add_noret_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 
offen -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_ptr_buffer_atomic_add_noret_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) ret void @@ -134,12 +134,12 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inreg ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_ptr_buffer_atomic_add_rtn_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_ptr_buffer_atomic_add_rtn_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) store double %ret, ptr undef @@ -162,20 +162,20 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 
-; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -194,16 +194,16 @@ define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, d ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_buffer_atomic_add_noret_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: struct_buffer_atomic_add_noret_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword 
s8, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) ret void @@ -217,12 +217,12 @@ define amdgpu_ps void @struct_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, d ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_buffer_atomic_add_rtn_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: struct_buffer_atomic_add_rtn_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) store double %ret, ptr undef @@ -245,20 +245,20 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> % ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt -; 
GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -277,16 +277,16 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace( ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_ptr_buffer_atomic_add_noret_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: struct_ptr_buffer_atomic_add_noret_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 
v2, s8 +; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) ret void @@ -300,12 +300,12 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inr ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_ptr_buffer_atomic_add_rtn_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: struct_ptr_buffer_atomic_add_rtn_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) store double %ret, ptr undef @@ -328,20 +328,20 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 -; 
GFX940-NEXT: s_endpgm +; GFX942-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -360,16 +360,16 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, doub ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_buffer_atomic_min_noret_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_buffer_atomic_min_noret_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen +; GFX942-NEXT: s_endpgm main_body: %ret = call double 
@llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) ret void @@ -383,12 +383,12 @@ define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, doub ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_buffer_atomic_min_rtn_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_buffer_atomic_min_rtn_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) store double %ret, ptr undef @@ -411,20 +411,20 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c 
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -443,16 +443,16 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_ptr_buffer_atomic_min_noret_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_ptr_buffer_atomic_min_noret_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) ret void @@ -466,12 +466,12 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_min_rtn_f64(ptr addrspace(8) inreg ; GFX90A-NEXT: 
flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_ptr_buffer_atomic_min_rtn_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_ptr_buffer_atomic_min_rtn_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) store double %ret, ptr undef @@ -494,20 +494,20 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; 
GFX942-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -526,16 +526,16 @@ define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, d ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_buffer_atomic_min_noret_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: struct_buffer_atomic_min_noret_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) ret void @@ -549,12 +549,12 @@ define amdgpu_ps void @struct_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, d ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_buffer_atomic_min_rtn_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], 
v2, s[0:3], 0 idxen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: struct_buffer_atomic_min_rtn_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) store double %ret, ptr undef @@ -577,20 +577,20 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> % ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; 
GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -609,16 +609,16 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace( ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_ptr_buffer_atomic_min_noret_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: struct_ptr_buffer_atomic_min_noret_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) ret void @@ -632,12 +632,12 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_min_rtn_f64(ptr addrspace(8) inr ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_ptr_buffer_atomic_min_rtn_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: 
struct_ptr_buffer_atomic_min_rtn_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) store double %ret, ptr undef @@ -660,20 +660,20 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double 
@llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -692,16 +692,16 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, doub ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_buffer_atomic_max_noret_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_buffer_atomic_max_noret_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) ret void @@ -715,12 +715,12 @@ define amdgpu_ps void @raw_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, doub ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_buffer_atomic_max_rtn_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_buffer_atomic_max_rtn_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: 
s_waitcnt vmcnt(0) +; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) store double %ret, ptr undef @@ -743,20 +743,20 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -775,16 +775,16 @@ define amdgpu_kernel void 
@raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_ptr_buffer_atomic_max_noret_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_ptr_buffer_atomic_max_noret_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) ret void @@ -798,12 +798,12 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_max_rtn_f64(ptr addrspace(8) inreg ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_ptr_buffer_atomic_max_rtn_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_ptr_buffer_atomic_max_rtn_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, 
ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) store double %ret, ptr undef @@ -826,20 +826,20 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -858,16 +858,16 @@ define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, d ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_buffer_atomic_max_noret_f64: 
-; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: struct_buffer_atomic_max_noret_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) ret void @@ -881,12 +881,12 @@ define amdgpu_ps void @struct_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, d ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_buffer_atomic_max_rtn_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: struct_buffer_atomic_max_rtn_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) store double %ret, ptr undef @@ -909,20 +909,20 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> % ; GFX90A-NEXT: 
global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -941,16 +941,16 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace( ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_ptr_buffer_atomic_max_noret_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 
0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: struct_ptr_buffer_atomic_max_noret_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) ret void @@ -964,12 +964,12 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_max_rtn_f64(ptr addrspace(8) inr ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_ptr_buffer_atomic_max_rtn_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: struct_ptr_buffer_atomic_max_rtn_f64: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) store double %ret, ptr undef @@ -992,20 +992,20 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc: -; GFX940: ; %bb.0: 
; %main_body -; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -1027,17 +1027,17 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: global_atomic_fadd_f64_noret_pat: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; 
GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: global_atomic_fadd_f64_noret_pat: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -1056,17 +1056,17 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: global_atomic_fadd_f64_noret_pat_agent: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -1087,17 +1087,17 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: 
global_atomic_fadd_f64_noret_pat_system: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: global_atomic_fadd_f64_noret_pat_system: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -1116,17 +1116,17 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_flush: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: global_atomic_fadd_f64_noret_pat_flush: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_waitcnt 
vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -1145,15 +1145,15 @@ define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %dat ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_atomic_fadd_f64_rtn_pat: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_atomic_fadd_f64_rtn_pat: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %ret @@ -1170,15 +1170,15 @@ define double @global_atomic_fadd_f64_rtn_pat_agent(ptr addrspace(1) %ptr, doubl ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_atomic_fadd_f64_rtn_pat_agent: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_atomic_fadd_f64_rtn_pat_agent: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %ret @@ -1197,15 +1197,15 @@ define double @global_atomic_fadd_f64_rtn_pat_system(ptr addrspace(1) %ptr, doub ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_atomic_fadd_f64_rtn_pat_system: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_atomic_fadd_f64_rtn_pat_system: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %ret @@ -1235,17 +1235,17 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst ret void @@ -1266,17 +1266,17 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: flat_atomic_fadd_f64_noret_pat: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void @@ -1295,17 +1295,17 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) 
#1 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: flat_atomic_fadd_f64_noret_pat_agent: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void @@ -1326,17 +1326,17 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_system: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: flat_atomic_fadd_f64_noret_pat_system: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; 
GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void @@ -1355,15 +1355,15 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_atomic_fadd_f64_rtn_pat: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_atomic_fadd_f64_rtn_pat: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret double %ret @@ -1380,15 +1380,15 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_atomic_fadd_f64_rtn_pat_agent: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
flat_atomic_fadd_f64_rtn_pat_agent: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret double %ret @@ -1408,16 +1408,16 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: flat_atomic_fadd_f64_rtn_pat_system: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: flat_atomic_fadd_f64_rtn_pat_system: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret double %ret @@ -1447,17 +1447,17 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1 ret void @@ -1475,16 +1475,16 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret(ptr addrspace(3) %ptr, do ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: local_atomic_fadd_f64_noret: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dword s2, s[4:5], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: ds_add_f64 v2, v[0:1] -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: local_atomic_fadd_f64_noret: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dword s2, s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NEXT: ds_add_f64 v2, v[0:1] +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0) ret void @@ 
-1500,14 +1500,14 @@ define double @local_atomic_fadd_f64_rtn(ptr addrspace(3) %ptr, double %data) { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: local_atomic_fadd_f64_rtn: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: local_atomic_fadd_f64_rtn: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] main_body: %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0) ret double %ret @@ -1525,15 +1525,15 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: local_atomic_fadd_f64_noret_pat: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s0 -; GFX940-NEXT: ds_add_f64 v2, v[0:1] -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: local_atomic_fadd_f64_noret_pat: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NEXT: ds_add_f64 v2, v[0:1] +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -1551,15 +1551,15 @@ define amdgpu_kernel void 
@local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s0 -; GFX940-NEXT: ds_add_f64 v2, v[0:1] -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: local_atomic_fadd_f64_noret_pat_flush: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NEXT: ds_add_f64 v2, v[0:1] +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -1577,15 +1577,15 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s0 -; GFX940-NEXT: ds_add_f64 v2, v[0:1] -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NEXT: ds_add_f64 v2, v[0:1] +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst ret void @@ -1601,13 +1601,13 @@ define double @local_atomic_fadd_f64_rtn_pat(ptr addrspace(3) %ptr, double 
%data ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: local_atomic_fadd_f64_rtn_pat: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 -; GFX940-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: local_atomic_fadd_f64_rtn_pat: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX942-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %ret @@ -1623,14 +1623,14 @@ define double @local_atomic_fadd_f64_rtn_ieee_unsafe(ptr addrspace(3) %ptr, doub ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: local_atomic_fadd_f64_rtn_ieee_unsafe: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: local_atomic_fadd_f64_rtn_ieee_unsafe: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] main_body: %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0) ret double %ret @@ -1646,14 +1646,14 @@ define double @local_atomic_fadd_f64_rtn_ieee_safe(ptr addrspace(3) %ptr, double ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX940-LABEL: local_atomic_fadd_f64_rtn_ieee_safe: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: local_atomic_fadd_f64_rtn_ieee_safe: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] main_body: %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0) ret double %ret diff --git llvm/test/CodeGen/AMDGPU/gfx940-hazards.mir llvm/test/CodeGen/AMDGPU/gfx942-hazards.mir similarity index 99% rename from llvm/test/CodeGen/AMDGPU/gfx940-hazards.mir rename to llvm/test/CodeGen/AMDGPU/gfx942-hazards.mir index 348beb7bcf3c..2c760baf8a5e 100644 --- llvm/test/CodeGen/AMDGPU/gfx940-hazards.mir +++ llvm/test/CodeGen/AMDGPU/gfx942-hazards.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GCN %s # GCN-LABEL: name: trans32_write_non_trans32_read # GCN: V_RCP_F32 diff --git llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll index 8a7762fb4b6c..8eb6cecd6ba9 100644 --- llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll +++ llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=amdgpu-isel 
-amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX908_GFX11_GFX12,GFX908 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX90A_GFX940 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX90A_GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX90A_GFX942 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX90A_GFX942 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX908_GFX11_GFX12,GFX11_GFX12 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX908_GFX11_GFX12,GFX11_GFX12 %s @@ -18,17 +18,17 @@ define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_atomicrmw(ptr addrspace(1) ; GFX908_GFX11_GFX12-NEXT: GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) ; GFX908_GFX11_GFX12-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], 
%subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A_GFX942-NEXT: GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret void } @@ -96,67 +96,67 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX908-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX908-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec_xnull = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE - ; GFX90A_GFX940-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; GFX90A_GFX940-NEXT: S_BRANCH %bb.1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: bb.1 (%ir-block.5): - ; GFX90A_GFX940-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 - ; GFX90A_GFX940-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GFX90A_GFX940-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY6]], [[COPY7]], implicit $exec - ; GFX90A_GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY5]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec - ; GFX90A_GFX940-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; GFX90A_GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF]], implicit $exec - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp 
[[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, killed [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, killed [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, killed [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, killed [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec - ; GFX90A_GFX940-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63 - ; GFX90A_GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_1]] - ; GFX90A_GFX940-NEXT: early-clobber %1:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec - ; GFX90A_GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec 
- ; GFX90A_GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; GFX90A_GFX940-NEXT: S_BRANCH %bb.2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: bb.2 (%ir-block.31): - ; GFX90A_GFX940-NEXT: successors: %bb.3(0x80000000) - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY %1 - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_1]], [[COPY8]], [[COPY3]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: bb.3.Flow: - ; GFX90A_GFX940-NEXT: successors: %bb.4(0x80000000) - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: bb.4 (%ir-block.33): - ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec_xnull = COPY [[REG_SEQUENCE]] + ; GFX90A_GFX942-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE + ; GFX90A_GFX942-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF 
killed [[SI_PS_LIVE]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_GFX942-NEXT: S_BRANCH %bb.1 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: bb.1 (%ir-block.5): + ; GFX90A_GFX942-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX90A_GFX942-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX90A_GFX942-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY6]], [[COPY7]], implicit $exec + ; GFX90A_GFX942-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY5]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX90A_GFX942-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec + ; GFX90A_GFX942-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GFX90A_GFX942-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF]], implicit $exec + ; GFX90A_GFX942-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec + ; GFX90A_GFX942-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_GFX942-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX90A_GFX942-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, killed [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_GFX942-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = 
V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX90A_GFX942-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, killed [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_GFX942-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX90A_GFX942-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, killed [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_GFX942-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX90A_GFX942-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_GFX942-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX90A_GFX942-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, killed [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_GFX942-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX90A_GFX942-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_1]] + ; GFX90A_GFX942-NEXT: early-clobber %1:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec + ; GFX90A_GFX942-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec + ; GFX90A_GFX942-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_GFX942-NEXT: S_BRANCH %bb.2 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: bb.2 (%ir-block.31): + ; GFX90A_GFX942-NEXT: 
successors: %bb.3(0x80000000) + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY %1 + ; GFX90A_GFX942-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_1]], [[COPY8]], [[COPY3]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: bb.3.Flow: + ; GFX90A_GFX942-NEXT: successors: %bb.4(0x80000000) + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: bb.4 (%ir-block.33): + ; GFX90A_GFX942-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 ; ; GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw ; GFX11_GFX12: bb.0 (%ir-block.0): diff --git llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll index 1fb34abb41a2..26e0ec9892c4 100644 --- llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll +++ llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll @@ -1,22 +1,22 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX90A_GFX940,GFX90A %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX90A_GFX940,GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX90A %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs 
-stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX942 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s define amdgpu_ps float @global_atomic_fadd_f32_rtn_atomicrmw(ptr addrspace(1) %ptr, float %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_rtn_atomicrmw - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX90A_GFX942-LABEL: name: global_atomic_fadd_f32_rtn_atomicrmw + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; 
GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A_GFX942-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; ; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_atomicrmw ; GFX11: bb.0 (%ir-block.0): @@ -117,87 +117,87 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX90A-NEXT: $vgpr0 = COPY [[PHI]] ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; - ; GFX940-LABEL: name: global_atomic_fadd_f32_saddr_rtn_atomicrmw - ; GFX940: bb.0 (%ir-block.0): - ; GFX940-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) - ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec_xnull = COPY [[REG_SEQUENCE]] - ; GFX940-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE - ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; GFX940-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; GFX940-NEXT: S_BRANCH %bb.1 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: bb.1 (%ir-block.5): - ; GFX940-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec - ; GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1 - ; GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 - ; GFX940-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; 
GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GFX940-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY6]], [[COPY7]], implicit $exec - ; GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY5]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec - ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec - ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF1]], implicit $exec - ; GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec - ; GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec - ; GFX940-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec - ; GFX940-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, killed [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec - ; GFX940-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec - ; GFX940-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, killed [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec - ; GFX940-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec - ; GFX940-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, killed [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec - ; GFX940-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], 
[[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec - ; GFX940-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec - ; GFX940-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec - ; GFX940-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, killed [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec - ; GFX940-NEXT: [[V_MOV_B32_dpp6:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_5]], 312, 15, 15, 0, implicit $exec - ; GFX940-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63 - ; GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_1]] - ; GFX940-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec - ; GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec - ; GFX940-NEXT: [[COPY8:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] - ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; GFX940-NEXT: S_BRANCH %bb.2 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: bb.2 (%ir-block.32): - ; GFX940-NEXT: successors: %bb.4(0x80000000) - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY %2 - ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY9]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) - ; GFX940-NEXT: S_BRANCH %bb.4 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: 
bb.3.Flow: - ; GFX940-NEXT: successors: %bb.5(0x80000000) - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, %8, %bb.4 - ; GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; GFX940-NEXT: S_BRANCH %bb.5 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: bb.4 (%ir-block.35): - ; GFX940-NEXT: successors: %bb.3(0x80000000) - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2 - ; GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec - ; GFX940-NEXT: early-clobber %45:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec - ; GFX940-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %45, 0, 0, implicit $mode, implicit $exec - ; GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_64_xexec = COPY [[COPY8]] - ; GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX940-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 0, [[COPY11]], [[COPY10]], implicit $exec - ; GFX940-NEXT: S_BRANCH %bb.3 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: bb.5 (%ir-block.41): - ; GFX940-NEXT: $vgpr0 = COPY [[PHI]] - ; GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX942-LABEL: name: global_atomic_fadd_f32_saddr_rtn_atomicrmw + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; GFX942-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX942-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX942-NEXT: 
[[COPY3:%[0-9]+]]:sreg_64_xexec_xnull = COPY [[REG_SEQUENCE]] + ; GFX942-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE + ; GFX942-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX942-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX942-NEXT: S_BRANCH %bb.1 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: bb.1 (%ir-block.5): + ; GFX942-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX942-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1 + ; GFX942-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX942-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX942-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY6]], [[COPY7]], implicit $exec + ; GFX942-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY5]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX942-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec + ; GFX942-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GFX942-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF1]], implicit $exec + ; GFX942-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec + ; GFX942-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX942-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, killed [[V_MOV_B32_dpp1]], 
0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX942-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, killed [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX942-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, killed [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX942-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX942-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, killed [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: [[V_MOV_B32_dpp6:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_5]], 312, 15, 15, 0, implicit $exec + ; GFX942-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX942-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_1]] + ; GFX942-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec + ; GFX942-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec + ; GFX942-NEXT: [[COPY8:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] + ; GFX942-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX942-NEXT: 
[[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX942-NEXT: S_BRANCH %bb.2 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: bb.2 (%ir-block.32): + ; GFX942-NEXT: successors: %bb.4(0x80000000) + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY %2 + ; GFX942-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY9]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX942-NEXT: S_BRANCH %bb.4 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: bb.3.Flow: + ; GFX942-NEXT: successors: %bb.5(0x80000000) + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, %8, %bb.4 + ; GFX942-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX942-NEXT: S_BRANCH %bb.5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: bb.4 (%ir-block.35): + ; GFX942-NEXT: successors: %bb.3(0x80000000) + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2 + ; GFX942-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec + ; GFX942-NEXT: early-clobber %45:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec + ; GFX942-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %45, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_64_xexec = COPY [[COPY8]] + ; GFX942-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX942-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 
0, [[COPY11]], [[COPY10]], implicit $exec + ; GFX942-NEXT: S_BRANCH %bb.3 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: bb.5 (%ir-block.41): + ; GFX942-NEXT: $vgpr0 = COPY [[PHI]] + ; GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_atomicrmw ; GFX11: bb.0 (%ir-block.0): diff --git llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll index ba94a53dff03..fa4e7f87853d 100644 --- llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll +++ llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll @@ -1,22 +1,22 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=GFX90A_GFX940,GFX90A %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=GFX90A_GFX940,GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX90A %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX942 %s define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_atomicrmw(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 
- ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A_GFX942-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0 ret void } @@ -67,66 +67,66 @@ define amdgpu_ps double @global_atomic_fadd_f64_rtn_atomicrmw(ptr addrspace(1) % ; GFX90A-NEXT: $sgpr1 = COPY [[COPY13]] ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 ; - ; GFX940-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw - ; GFX940: bb.0 
(%ir-block.0): - ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) - ; GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX940-NEXT: $sgpr0 = COPY [[COPY6]] - ; GFX940-NEXT: $sgpr1 = COPY [[COPY7]] - ; GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX942-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX942-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX942-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX942-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY 
[[REG_SEQUENCE]] + ; GFX942-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX942-NEXT: $sgpr0 = COPY [[COPY6]] + ; GFX942-NEXT: $sgpr1 = COPY [[COPY7]] + ; GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic ret double %ret } define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw + ; 
GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A_GFX942-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0 ret void } define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, 
[[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY5]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY6]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX90A_GFX942-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A_GFX942-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 
1) + ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 + ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 + ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY5]] + ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY6]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0 ret double %ret } diff --git llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll index 02e425e6d10a..82cec179e72d 100644 --- llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll +++ llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 
-enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn(ptr addrspace(1) %ptr, <2 x half> %data) { @@ -20,17 +20,17 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn(ptr addrspace(1) %ptr, <2 ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) ; GFX908-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_no_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: global_atomic_fadd_v2f16_no_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A_GFX942-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on 
%ir.ptr, addrspace 1) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } @@ -48,17 +48,17 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn(ptr addrspace(1) in ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) ; GFX908-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; 
GFX90A_GFX942-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } @@ -76,17 +76,17 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_flat(ptr addrspace(1) %pt ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) ; GFX908-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_no_rtn_flat - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: global_atomic_fadd_v2f16_no_rtn_flat + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: 
[[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A_GFX942-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } @@ -104,17 +104,17 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_flat(ptr addrspace( ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) ; GFX908-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_flat - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_GFX942-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_flat + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; 
GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_GFX942-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) + ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } diff --git llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-rtn.ll llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-rtn.ll index 794a52b6900e..b9e833a7105d 100644 --- llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-rtn.ll +++ llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-rtn.ll @@ -1,73 +1,73 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s define amdgpu_ps <2 x half> 
@global_atomic_fadd_v2f16_rtn(ptr addrspace(1) %ptr, <2 x half> %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX90A_GFX942-LABEL: name: global_atomic_fadd_v2f16_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A_GFX942-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data 
syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %ret } define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn(ptr addrspace(1) inreg %ptr, <2 x half> %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX90A_GFX942-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_GFX942-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec 
:: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %ret } define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn_flat(ptr addrspace(1) %ptr, <2 x half> %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_rtn_flat - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX90A_GFX942-LABEL: name: global_atomic_fadd_v2f16_rtn_flat + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; 
GFX90A_GFX942-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %ret } define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_flat(ptr addrspace(1) inreg %ptr, <2 x half> %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn_flat - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GFX90A_GFX942-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn_flat + ; GFX90A_GFX942: bb.0 (%ir-block.0): + ; GFX90A_GFX942-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX90A_GFX942-NEXT: {{ $}} + ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY 
$sgpr0 + ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_GFX942-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_GFX942-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) + ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN]] + ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %ret } diff --git llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 73b4428b03c8..904ef8a4b657 100644 --- llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx940 < %s | FileCheck -check-prefix=GFX940 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s @@ -27,14 +27,14 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -211,14 +211,14 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -397,14 +397,14 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -593,14 +593,14 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -771,14 +771,14 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -952,14 +952,14 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1142,14 +1142,14 @@ define float 
@global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1331,14 +1331,14 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1514,14 +1514,14 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: ; GFX11: ; %bb.0: @@ -1716,14 +1716,14 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv 
sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: @@ -1918,14 +1918,14 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_deno ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: @@ -2113,14 +2113,14 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off 
offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -2299,14 +2299,14 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: @@ -2471,14 +2471,14 @@ define float @global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr a ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: @@ -2673,14 +2673,14 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: ; GFX11: ; %bb.0: @@ -2868,14 +2868,14 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory( ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -3049,14 +3049,14 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: @@ -3204,14 +3204,14 @@ define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: @@ -3399,14 +3399,14 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -3599,14 +3599,14 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr add ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -3791,14 +3791,14 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: @@ -3991,14 +3991,14 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv 
sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: @@ -4183,14 +4183,14 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -4367,14 +4367,14 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: 
buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -4549,14 +4549,14 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -4719,14 +4719,14 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -4891,14 +4891,14 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -5073,14 +5073,14 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -5225,14 +5225,14 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -5380,14 +5380,14 @@ define void 
@global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -5544,14 +5544,14 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -5719,14 +5719,14 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -5876,14 +5876,14 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_f ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX942: ; %bb.0: +; 
GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: @@ -6048,14 +6048,14 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: @@ -6203,14 +6203,14 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off 
sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -6403,14 +6403,14 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(pt ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -6595,14 +6595,14 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, 
off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -6765,14 +6765,14 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -6938,14 +6938,14 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: -; GFX940: ; 
%bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -7159,14 +7159,14 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -7381,14 +7381,14 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -7609,14 +7609,14 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -7812,14 +7812,14 @@ define void 
@global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:2040 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:2040 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -8018,14 +8018,14 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:-2048 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:-2048 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -8252,38 +8252,38 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: global_load_dword v4, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB44_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX942-NEXT: global_load_dword v4, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v3 +; 
GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX942-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB44_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -8592,40 +8592,40 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: global_load_dword v4, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB45_1: 
; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB45_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v7 +; GFX942-NEXT: global_load_dword v4, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX942-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: 
v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB45_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -8943,41 +8943,41 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: global_load_dword v4, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB46_1 -; 
GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v7 +; GFX942-NEXT: global_load_dword v4, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX942-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB46_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -9291,37 +9291,37 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX940-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB47_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX942-NEXT: global_load_dword v5, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; 
GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB47_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -9619,39 +9619,39 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], 
off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB48_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: global_load_dword v5, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB48_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -9957,40 +9957,40 
@@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB49_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; 
GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: global_load_dword v5, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB49_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -10285,30 +10285,30 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_add_f16_e32 v3, v5, v2 -; GFX940-NEXT: v_and_or_b32 v4, v5, s2, 
v3 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB50_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 +; GFX942-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_add_f16_e32 v3, v5, v2 +; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB50_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -10544,29 +10544,29 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2046 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_add_f16_e32 v3, v5, v2 -; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB51_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2046 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 +; GFX942-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_add_f16_e32 v3, v5, v2 +; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB51_1 +; GFX942-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -10811,40 +10811,40 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: global_load_dword v4, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB52_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v7 +; GFX942-NEXT: global_load_dword v4, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX942-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB52_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -11163,39 +11163,39 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 
s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB53_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: global_load_dword v5, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB53_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -11516,47 +11516,47 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: 
v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB54_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX942-NEXT: global_load_dword v5, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 +; GFX942-NEXT: 
v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB54_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -11911,49 +11911,49 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB55_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: global_load_dword v5, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB55_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -12318,50 +12318,50 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 
s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB56_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: global_load_dword v5, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v4, v4 +; 
GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB56_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -12721,46 +12721,46 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 
v3, 3, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v7, v7, v4, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB57_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX942-NEXT: global_load_dword v5, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 
+; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX942-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v7, v7, v4, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB57_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -13104,48 +13104,48 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: 
global_load_dword v3, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB58_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB58_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: 
v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB58_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -13498,49 +13498,49 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB59_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB59_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: 
s_movk_i32 s0, 0xf800 +; GFX942-NEXT: s_mov_b32 s1, -1 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB59_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -13882,40 +13882,40 @@ define bfloat 
@global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX940-NEXT: .LBB60_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX940-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX940-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v4, v4, v3, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX940-NEXT: v_and_or_b32 v4, v5, s3, v3 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB60_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: s_movk_i32 
s2, 0x7fff +; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 +; GFX942-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX942-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX942-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v4, v4, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX942-NEXT: v_and_or_b32 v4, v5, s3, v3 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB60_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -14204,39 +14204,39 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX940-NEXT: .LBB61_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt 
vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v5, v5, v2, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, s3, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB61_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 +; GFX942-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v5, v5, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off 
offset:2046 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB61_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -14534,49 +14534,49 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB62_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 
-; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB62_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: global_load_dword v5, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: s_nop 1 +; 
GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB62_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -14942,48 +14942,48 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB63_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB63_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX942-NEXT: s_movk_i32 s2, 0x7fff +; GFX942-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; 
GFX942-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB63_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -15301,14 +15301,14 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -15533,14 +15533,14 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -15767,14 +15767,14 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, 
off offset:-2048 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -16005,14 +16005,14 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -16215,14 +16215,14 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: 
buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -16428,14 +16428,14 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -16650,14 +16650,14 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -16887,14 +16887,14 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -17102,14 +17102,14 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: 
global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -17348,14 +17348,14 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -17584,14 +17584,14 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: 
global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -17816,14 +17816,14 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -18026,14 +18026,14 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: -; GFX940: 
; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: ; GFX11: ; %bb.0: @@ -18272,14 +18272,14 @@ define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: ; GFX11: ; %bb.0: @@ -18512,14 +18512,14 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -18840,14 +18840,14 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -19170,14 +19170,14 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -19504,14 +19504,14 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -19822,14 +19822,14 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; 
GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -20143,14 +20143,14 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -20473,14 +20473,14 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -20806,14 +20806,14 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 
sc0 sc1 +; GFX942-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -21129,14 +21129,14 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -21457,14 +21457,14 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -21775,14 +21775,14 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -22103,14 +22103,14 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -22421,14 +22421,14 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: ; GFX11: ; %bb.0: @@ -22749,14 +22749,14 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
global_agent_atomic_fadd_noret_v2bf16__maybe_remote: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX11: ; %bb.0: @@ -23079,25 +23079,25 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX12-NEXT: .LBB92_2: ; GFX12-NEXT: s_endpgm ; -; GFX940-LABEL: infer_as_before_atomic: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX940-NEXT: s_cbranch_execz .LBB92_2 -; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f32 v0, v1, s[2:3] -; GFX940-NEXT: .LBB92_2: -; GFX940-NEXT: s_endpgm +; GFX942-LABEL: infer_as_before_atomic: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_mov_b64 s[0:1], exec +; GFX942-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX942-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-NEXT: s_cbranch_execz .LBB92_2 +; GFX942-NEXT: ; %bb.1: +; GFX942-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_atomic_add_f32 
v0, v1, s[2:3] +; GFX942-NEXT: .LBB92_2: +; GFX942-NEXT: s_endpgm ; ; GFX11-LABEL: infer_as_before_atomic: ; GFX11: ; %bb.0: diff --git llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index cd6ed1e6b98c..e8d73914ad30 100644 --- llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx940 < %s | FileCheck -check-prefix=GFX940 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s @@ -27,30 +27,30 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: 
s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB0_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB0_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -188,30 +188,30 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: 
Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB1_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB1_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -351,30 +351,30 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: global_inv 
scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB2_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, 
exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB2_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -514,29 +514,29 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB3_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: 
v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB3_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -669,29 +669,29 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB4_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB4_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -827,29 +827,29 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; 
GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB5_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB5_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -986,30 +986,30 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; 
GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB6_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB6_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1152,29 +1152,29 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; 
GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB7_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: 
s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB7_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1312,30 +1312,30 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB8_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: 
v_max_f32_e32 v3, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB8_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -1542,30 +1542,30 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB9_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB9_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: @@ -1707,30 +1707,30 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 -; 
GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB10_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB10_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -1868,30 +1868,30 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX940-NEXT: s_mov_b64 
s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB11_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB11_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -2031,30 +2031,30 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB12_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: 
global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB12_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -2194,29 +2194,29 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_load_dword v3, v[0:1], off +; 
GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB13_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -2349,29 +2349,29 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz 
.LBB14_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB14_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -2507,29 +2507,29 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: 
v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB15_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -2666,30 +2666,30 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 
-; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB16_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB16_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 
exec, exec, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -2832,29 +2832,29 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; 
GFX942-NEXT: buffer_wbl2 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB17_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -3015,14 +3015,14 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -3187,14 +3187,14 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: @@ -3360,14 +3360,14 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) ########## TRUNCATED ###########