presentation-inlined.c

Line | Hotness | Pass | Source | Function / Inlining Chain
1
// Taken from Adam Nemet's November 2016 LLVM talk
2
3
#include "shared.h"
4
5
void accumulate (int x, int *a)
6
{
7
  *a += x;
8
}
9
10
int compute_sum_with_inlining (int arr[], int n)
11
{
12
  int sum = 0;
13
  for (int i = 0; i < n; ++i)
100.00 ldist
  ^Loop 1 not distributed.
  
  • compute_sum_with_inlining
100.00 vect

                        
^=== analyzing loop === Analyzing loop at presentation-inlined.c:13 === analyze_loop_nest === === vect_analyze_loop_form === === get_loop_niters === Symbolic number of iterations is (unsigned int) n_8(D) === vect_analyze_data_refs === got vectype for stmt: _4 = *_3;vector(4) int === vect_analyze_scalar_cycles === Analyze phi: i_17 = PHI <0(5), i_10(6)> Access function of PHI: {0, +, 1}_1 step: 1, init: 0 Detected induction. Analyze phi: sum_20 = PHI <0(5), _12(6)> Access function of PHI: {0, +, _4}_1 step: _4, init: 0 step unknown. Analyze phi: sum_20 = PHI <0(5), _12(6)> detected reduction: _12 = _4 + sum_20; Detected reduction. === vect_pattern_recog === vect_is_simple_use: operand _1 def_stmt: _1 = (long unsigned int) i_17; type of def: internal vect_is_simple_use: operand i_17 def_stmt: i_17 = PHI <0(5), i_10(6)> type of def: induction vect_is_simple_use: operand 4 vect_is_simple_use: operand _4 def_stmt: _4 = *_3; type of def: internal vect_is_simple_use: operand _4 def_stmt: _4 = *_3; type of def: internal vect_is_simple_use: operand _4 def_stmt: _4 = *_3; type of def: internal === vect_analyze_data_ref_accesses === === vect_mark_stmts_to_be_vectorized === init: phi relevant? i_17 = PHI <0(5), i_10(6)> init: phi relevant? sum_20 = PHI <0(5), _12(6)> init: stmt relevant? # DEBUG sum => sum_20 init: stmt relevant? # DEBUG i => i_17 init: stmt relevant? # DEBUG BEGIN_STMT init: stmt relevant? _1 = (long unsigned int) i_17; init: stmt relevant? _2 = _1 * 4; init: stmt relevant? _3 = arr_9(D) + _2; init: stmt relevant? _4 = *_3; init: stmt relevant? # DEBUG x => _4 init: stmt relevant? # DEBUG a => &sum init: stmt relevant? # DEBUG BEGIN_STMT init: stmt relevant? _12 = _4 + sum_20; vec_stmt_relevant_p: used out of loop. vect_is_simple_use: operand _4 def_stmt: _4 = *_3; type of def: internal vec_stmt_relevant_p: stmt live but not relevant. mark relevant 1, live 1: _12 = _4 + sum_20; init: stmt relevant? # DEBUG sum => _12 init: stmt relevant? 
# DEBUG x => NULL init: stmt relevant? # DEBUG a => NULL init: stmt relevant? i_10 = i_17 + 1; init: stmt relevant? # DEBUG i => i_10 init: stmt relevant? # DEBUG sum => _12 init: stmt relevant? # DEBUG i => i_10 init: stmt relevant? if (n_8(D) > i_10) worklist: examine stmt: _12 = _4 + sum_20; vect_is_simple_use: operand _4 def_stmt: _4 = *_3; type of def: internal mark relevant 1, live 0: _4 = *_3; vect_is_simple_use: operand sum_20 def_stmt: sum_20 = PHI <0(5), _12(6)> type of def: reduction mark relevant 1, live 0: sum_20 = PHI <0(5), _12(6)> worklist: examine stmt: sum_20 = PHI <0(5), _12(6)> vect_is_simple_use: operand 0 vect_is_simple_use: operand _12 def_stmt: _12 = _4 + sum_20; type of def: reduction reduc-stmt defining reduc-phi in the same nest. worklist: examine stmt: _4 = *_3; === vect_analyze_data_ref_dependences === === vect_determine_vectorization_factor === ==> examining phi: i_17 = PHI <0(5), i_10(6)> ==> examining phi: sum_20 = PHI <0(5), _12(6)> get vectype for scalar type: int vectype: vector(4) int nunits = 4 ==> examining statement: # DEBUG sum => sum_20 skip. ==> examining statement: # DEBUG i => i_17 skip. ==> examining statement: # DEBUG BEGIN_STMT skip. ==> examining statement: _1 = (long unsigned int) i_17; skip. ==> examining statement: _2 = _1 * 4; skip. ==> examining statement: _3 = arr_9(D) + _2; skip. ==> examining statement: _4 = *_3; get vectype for scalar type: int vectype: vector(4) int nunits = 4 ==> examining statement: # DEBUG x => _4 skip. ==> examining statement: # DEBUG a => &sum skip. ==> examining statement: # DEBUG BEGIN_STMT skip. ==> examining statement: _12 = _4 + sum_20; get vectype for scalar type: int vectype: vector(4) int get vectype for scalar type: int vectype: vector(4) int nunits = 4 ==> examining statement: # DEBUG sum => _12 skip. ==> examining statement: # DEBUG x => NULL skip. ==> examining statement: # DEBUG a => NULL skip. ==> examining statement: i_10 = i_17 + 1; skip. 
==> examining statement: # DEBUG i => i_10 skip. ==> examining statement: # DEBUG sum => _12 skip. ==> examining statement: # DEBUG i => i_10 skip. ==> examining statement: if (n_8(D) > i_10) skip. vectorization factor = 4 === vect_analyze_slp === === vect_make_slp_decision === === vect_analyze_data_refs_alignment === recording new base alignment for arr_9(D) alignment: 4 misalignment: 0 based on: _4 = *_3; vect_compute_data_ref_alignment: can't force alignment of ref: *_3 === vect_prune_runtime_alias_test_list === === vect_enhance_data_refs_alignment === Unknown misalignment, naturally aligned vect_can_advance_ivs_p: Analyze phi: i_17 = PHI <0(5), i_10(6)> Analyze phi: sum_20 = PHI <0(5), _12(6)> reduc or virtual phi. skip. vect_model_load_cost: aligned. vect_get_data_access_cost: inside_cost = 12, outside_cost = 0. cost model: epilogue peel iters set to vf/2 because loop iterations are unknown . vect_model_load_cost: unaligned supported by hardware. vect_get_data_access_cost: inside_cost = 12, outside_cost = 0. cost model: epilogue peel iters set to vf/2 because loop iterations are unknown . Vectorizing an unaligned access. === vect_analyze_loop_operations === examining phi: i_17 = PHI <0(5), i_10(6)> examining phi: sum_20 = PHI <0(5), _12(6)> ==> examining statement: # DEBUG sum => sum_20 irrelevant. ==> examining statement: # DEBUG i => i_17 irrelevant. ==> examining statement: # DEBUG BEGIN_STMT irrelevant. ==> examining statement: _1 = (long unsigned int) i_17; irrelevant. ==> examining statement: _2 = _1 * 4; irrelevant. ==> examining statement: _3 = arr_9(D) + _2; irrelevant. ==> examining statement: _4 = *_3; vect_is_simple_use: operand *_3 not ssa-name. use not simple. vect_is_simple_use: operand *_3 not ssa-name. use not simple. can't use a fully-masked loop because the target doesn't have the appropriate masked load or store. vect_model_load_cost: unaligned supported by hardware. vect_model_load_cost: inside_cost = 12, prologue_cost = 0 . 
==> examining statement: # DEBUG x => _4 irrelevant. ==> examining statement: # DEBUG a => &sum irrelevant. ==> examining statement: # DEBUG BEGIN_STMT irrelevant. ==> examining statement: _12 = _4 + sum_20; vect_is_simple_use: operand _4 def_stmt: _4 = *_3; type of def: internal vect_is_simple_use: operand sum_20 def_stmt: sum_20 = PHI <0(5), _12(6)> type of def: reduction reduc op not supported by target. vect_model_reduction_cost: inside_cost = 4, prologue_cost = 4, epilogue_cost = 20 . ==> examining statement: # DEBUG sum => _12 irrelevant. ==> examining statement: # DEBUG x => NULL irrelevant. ==> examining statement: # DEBUG a => NULL irrelevant. ==> examining statement: i_10 = i_17 + 1; irrelevant. ==> examining statement: # DEBUG i => i_10 irrelevant. ==> examining statement: # DEBUG sum => _12 irrelevant. ==> examining statement: # DEBUG i => i_10 irrelevant. ==> examining statement: if (n_8(D) > i_10) irrelevant. not using a fully-masked loop. cost model: epilogue peel iters set to vf/2 because loop iterations are unknown . Cost model analysis: Vector inside of loop cost: 16 Vector prologue cost: 36 Vector epilogue cost: 52 Scalar iteration cost: 16 Scalar outside cost: 32 Vector outside cost: 88 prologue iterations: 0 epilogue iterations: 2 Calculated minimum iters for profitability: 5 Runtime profitability threshold = 5 Static estimate profitability threshold = 9 epilog loop required vect_can_advance_ivs_p: Analyze phi: i_17 = PHI <0(5), i_10(6)> Analyze phi: sum_20 = PHI <0(5), _12(6)> reduc or virtual phi. skip. loop vectorized === vec_transform_loop === Profitability threshold is 5 loop iterations. vect_can_advance_ivs_p: Analyze phi: i_17 = PHI <i_10(6), 0(9)> Analyze phi: sum_20 = PHI <_12(6), 0(9)> reduc or virtual phi. skip. vect_update_ivs_after_vectorizer: phi: i_17 = PHI <i_10(6), 0(9)> vect_update_ivs_after_vectorizer: phi: sum_20 = PHI <_12(6), 0(9)> reduc or virtual phi. skip. 
------>vectorizing phi: i_17 = PHI <i_10(6), 0(16)> ------>vectorizing phi: sum_20 = PHI <_12(6), 0(16)> transform phi. ------>vectorizing phi: vect__12.4_33 = PHI <(6), (16)> ------>vectorizing statement: # DEBUG sum => sum_20 ------>vectorizing statement: # DEBUG i => i_17 ------>vectorizing statement: # DEBUG BEGIN_STMT ------>vectorizing statement: _1 = (long unsigned int) i_17; ------>vectorizing statement: _2 = _1 * 4; ------>vectorizing statement: _3 = arr_9(D) + _2; ------>vectorizing statement: _4 = *_3; transform statement. transform load. ncopies = 1 create vector_type-pointer variable to type: vector(4) int vectorizing a pointer ref: *arr_9(D) created arr_9(D) add new stmt: vect__4.7_36 = MEM[(int *)vectp_arr.5_34]; ------>vectorizing statement: # DEBUG x => _4 ------>vectorizing statement: # DEBUG a => &sum ------>vectorizing statement: # DEBUG BEGIN_STMT ------>vectorizing statement: _12 = _4 + sum_20; transform statement. vect_is_simple_use: operand _4 def_stmt: _4 = *_3; type of def: internal vect_is_simple_use: operand sum_20 def_stmt: sum_20 = PHI <_12(6), 0(16)> type of def: reduction reduc op not supported by target. transform reduction. 
vect_get_vec_def_for_operand: _4 vect_is_simple_use: operand _4 def_stmt: _4 = *_3; type of def: internal def_stmt = _4 = *_3; vect_get_vec_def_for_operand: sum_20 vect_is_simple_use: operand sum_20 def_stmt: sum_20 = PHI <_12(6), 0(16)> type of def: reduction def_stmt = sum_20 = PHI <_12(6), 0(16)> add new stmt: vect__12.8_37 = vect__4.7_36 + vect__12.4_33; vect_is_simple_use: operand 0 transform reduction: created def-use cycle: vect__12.4_33 = PHI <vect__12.8_37(6), { 0, 0, 0, 0 }(16)>vect__12.8_37 = vect__4.7_36 + vect__12.4_33; Reduce using vector shifts extract scalar result ------>vectorizing statement: # DEBUG sum => _12 ------>vectorizing statement: # DEBUG x => NULL ------>vectorizing statement: # DEBUG a => NULL ------>vectorizing statement: i_10 = i_17 + 1; ------>vectorizing statement: # DEBUG i => i_10 ------>vectorizing statement: # DEBUG sum => _12 ------>vectorizing statement: # DEBUG i => i_10 ------>vectorizing statement: vectp_arr.5_35 = vectp_arr.5_34 + 16; ------>vectorizing statement: if (n_8(D) > i_10) New loop exit condition: if (ivtmp_46 < bnd.1_29) LOOP VECTORIZED
  • compute_sum_with_inlining
14
    accumulate (arr[i], &sum);
einline
    ^ Inlining accumulate/0 into compute_sum_with_inlining/1.
    
  • compute_sum_with_inlining
einline
    ^Inlining accumulate/2 to compute_sum_with_inlining/1 with frequency 1.00
    
  • compute_sum_with_inlining
15
  return sum;
16
}