LLVM / project / 834cc88 / [X86] X86FixupVectorConstantsPass - attempt to replace full width fp vector constant loads with broadcasts on AVX+ targets (REAPPLIED)

LLVM/project 834cc88 — llvm/lib/Target/X86 X86FixupVectorConstants.cpp, llvm/test/CodeGen/X86 recip-fastmath.ll vector-interleaved-load-i32-stride-6.ll

Jun 13, 2023 by Simon Pilgrim on ⎇

main

[X86] X86FixupVectorConstantsPass - attempt to replace full width fp vector constant loads with broadcasts on AVX+ targets (REAPPLIED)

lowerBuildVectorAsBroadcast will not broadcast splat constants in all cases. As a result, there are many situations where a full-width vector load that has failed to fold, but is loading splat constant values, could use a broadcast load instruction just as cheaply and save constant pool space.

NOTE: SSE3 targets can use MOVDDUP, but not all SSE-era CPUs can perform this as cheaply as a vector load; we will need to add scheduler model checks if we want to pursue this.

This is an updated version of commit 98061013e01207444cfd3980cde17b5e75764fbe, which was reverted in a279a09ab9524d1d74ef29b34618102d4b202e2f.

Delta		File
+41	-125	llvm/test/CodeGen/X86/recip-fastmath.ll
+69	-35	llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
+27	-55	llvm/test/CodeGen/X86/extractelement-load.ll
+31	-43	llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
+38	-32	llvm/test/CodeGen/X86/vector-trunc-math.ll
+42	-21	llvm/test/CodeGen/X86/vector-trunc-usat.ll
+30	-30	llvm/test/CodeGen/X86/fma_patterns_wide.ll
+40	-20	llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
+28	-28	llvm/test/CodeGen/X86/v8i1-masks.ll
+32	-24	llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
+20	-33	llvm/test/CodeGen/X86/sqrt-fastmath.ll
+17	-32	llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+24	-24	llvm/test/CodeGen/X86/recip-fastmath2.ll
+45	-0	llvm/lib/Target/X86/X86FixupVectorConstants.cpp
+22	-20	llvm/test/CodeGen/X86/fma_patterns.ll
+9	-29	llvm/test/CodeGen/X86/combine-and.ll
+15	-22	llvm/test/CodeGen/X86/sse2.ll
+12	-24	llvm/test/CodeGen/X86/fminimum-fmaximum.ll
+8	-28	llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll
+20	-10	llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
+20	-10	llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
+16	-8	llvm/test/CodeGen/X86/fold-vector-sext-zext.ll
+15	-8	llvm/test/CodeGen/X86/vec_int_to_fp.ll
+11	-11	llvm/test/CodeGen/X86/memset-nonzero.ll
+11	-7	llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
+4	-14	llvm/test/CodeGen/X86/combine-sdiv.ll
+4	-14	llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
+10	-5	llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+5	-10	llvm/test/CodeGen/X86/combine-udiv.ll
+6	-8	llvm/test/CodeGen/X86/win_cst_pool.ll
+7	-7	llvm/test/CodeGen/X86/vector-trunc.ll
+8	-5	llvm/test/CodeGen/X86/ssub_sat_vec.ll
+8	-5	llvm/test/CodeGen/X86/sadd_sat_vec.ll
+6	-6	llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
+8	-4	llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll
+7	-4	llvm/test/CodeGen/X86/psubus.ll
+5	-5	llvm/test/CodeGen/X86/masked_store_trunc.ll
+5	-4	llvm/test/CodeGen/X86/fma-intrinsics-fast-isel.ll
+4	-4	llvm/test/CodeGen/X86/vec_fabs.ll
+5	-3	llvm/test/CodeGen/X86/oddshuffles.ll
+4	-4	llvm/test/CodeGen/X86/known-bits-vector.ll
+4	-4	llvm/test/CodeGen/X86/fp-round.ll
+4	-4	llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
+4	-2	llvm/test/CodeGen/X86/vec_fp_to_int.ll
+4	-2	llvm/test/CodeGen/X86/insert-into-constant-vector.ll
+4	-2	llvm/test/CodeGen/X86/sshl_sat_vec.ll
+4	-2	llvm/test/CodeGen/X86/vec_anyext.ll
+4	-2	llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+4	-2	llvm/test/CodeGen/X86/avx2-conversions.ll
+4	-2	llvm/test/CodeGen/X86/vselect-zero.ll
+3	-2	llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
+3	-2	llvm/test/CodeGen/X86/pr38639.ll
+3	-2	llvm/test/CodeGen/X86/avx512-regcall-Mask.ll
+3	-2	llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
+2	-2	llvm/test/CodeGen/X86/merge-store-constants.ll
+2	-2	llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+2	-2	llvm/test/CodeGen/X86/paddus.ll
+2	-2	llvm/test/CodeGen/X86/fma-fneg-combine-2.ll
+2	-2	llvm/test/CodeGen/X86/cast-vsel.ll
+2	-2	llvm/test/CodeGen/X86/avx2-vbroadcast.ll
+2	-2	llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
+2	-2	llvm/test/CodeGen/X86/bitreverse.ll
+2	-2	llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll
+2	-2	llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
+2	-2	llvm/test/CodeGen/X86/vector-fshl-256.ll
+2	-2	llvm/test/CodeGen/X86/pr32368.ll
+2	-2	llvm/test/CodeGen/X86/avx-vbroadcast.ll
+2	-2	llvm/test/CodeGen/X86/vector-fshr-256.ll
+2	-1	llvm/test/CodeGen/X86/fold-vector-trunc-sitofp.ll
+2	-1	llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
+2	-1	llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
+2	-1	llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
+2	-1	llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
+2	-1	llvm/test/CodeGen/X86/sat-add.ll
+2	-1	llvm/test/CodeGen/X86/vector-trunc-ssat.ll
+1	-1	llvm/test/CodeGen/X86/pr30290.ll
+1	-1	llvm/test/CodeGen/X86/splat-const.ll
+1	-1	llvm/test/CodeGen/X86/avx-basic.ll
+1	-1	llvm/test/CodeGen/X86/vselect-avx.ll
+1	-1	llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
+1	-1	llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
+835	-852	81 files

Unified Split Raw