Parallel Programming Exercises Andreas Hollmann Technische Universität München June 06, 2014 Organization I Andreas Wilhelm will do the Dependcy Analysis I Tutorial will take place next Wednesday (June 11) instead of the lecture. No lecture next week. I Two lectures the week after: June 17 and 18 I MPI tutorial will start on June 24 Merge Sort with OpenMP Tasks and Sections 1 / 5 I The assignments consists of two parts I Use the provided OpenMP implementation. I Parallelize it with OpenMP tasks and try to optimize it. I Reduce the overhead. Use the variable threshold. I We accept only solution that do not create more tasks then necessary for a given threshold. I --- I Compare it to the parallelization with OpenMP Sections and nested parallelism. I You can define the nesting level with omp_set_max_active_levels(n) I Enable nesting with omp_set_nested(1) I Use the threshold for defining the nesting level. Merge Sort with OpenMP Tasks and Sections 2 / 5 Assignment: Merge Sort with OpenMP Tasks and Sections 3 / 5 1 2 3 4 5 void merge_sort ( int64_t * a , s i z e _ t num_elements , i n t num_threads , i n t threshold ) { s i z e _ t size = num_elements * sizeof ( int64_t ) ; int64_t * b = malloc ( size ) ; 6 i f (b == NULL) e x i t ( EXIT_FAILURE ) ; 7 8 9 memcpy(b , a , size ) ; 10 11 s p l i t ( a , b , 0 , num_elements , threshold ) ; 12 13 free (b ) ; 14 15 } Merge Sort with OpenMP Tasks and Sections 4 / 5 1 2 3 4 5 void s p l i t ( int64_t * a , int64_t * b , s i z e _ t begin , s i z e _ t end , i n t threshold ) { i f (end − begin < 2) return ; 6 swap(&a , &b ) ; 7 8 s i z e _ t mid = ( begin + end) / 2; 9 10 s p l i t ( a , b , begin , mid , threshold ) ; s p l i t ( a , b , mid , end , threshold ) ; 11 12 13 merge( a , b , begin , mid , end ) ; 14 15 } Merge Sort with OpenMP Tasks and Sections 5 / 5 1 2 3 4 void merge( int64_t * a , int64_t * b , s i z e _ t begin , s i z e _ t mid , s i z e _ t end) { s i z e _ t l = begin , r = mid , idx = begin ; 5 while ( l < mid && r < end) b[ idx++] = a[ l ] < a[ r ] ? a[ l++] : a[ r++]; 6 7 8 while ( l < mid) b[ idx++] = a[ l ++]; 9 10 11 while ( r < end) b[ idx++] = a[ r++]; 12 13 14 } Solution: Merge Sort with OpenMP Tasks 1 / 2 1 2 3 4 void merge_sort ( int64_t * a , s i z e _ t num_elements , i n t num_threads , i n t threshold ) { omp_set_num_threads ( num_threads ) ; 5 s i z e _ t size = num_elements * sizeof ( int64_t ) ; int64_t * b = malloc ( size ) ; 6 7 8 i f (b == NULL) e x i t ( EXIT_FAILURE ) ; 9 10 11 memcpy(b , a , size ) ; 12 13 #pragma omp p a r a l l e l #pragma omp single s p l i t ( a , b , 0 , num_elements , threshold ) ; 14 15 16 17 free (b ) ; 18 19 } Solution: Merge Sort with OpenMP Tasks 2 / 2 1 2 3 4 5 void s p l i t ( int64_t * a , int64_t * b , s i z e _ t begin , s i z e _ t end , i n t threshold ) { i f (end − begin < 2) return ; 6 swap(&a , &b ) ; 7 8 s i z e _ t mid = ( begin + end) / 2; 9 10 #pragma omp task i f (end − begin > threshold ) s p l i t ( a , b , begin , mid , threshold ) ; s p l i t ( a , b , mid , end , threshold ) ; 11 12 13 14 #pragma omp taskwait merge( a , b , begin , mid , end ) ; 15 16 17 } Solution: Merge Sort with OpenMP Sections 1 / 2 1 2 3 4 5 void merge_sort ( int64_t * a , s i z e _ t num_elements , i n t num_threads , i n t threshold ) { s i z e _ t size = num_elements * sizeof ( int64_t ) ; int64_t * b = malloc ( size ) ; 6 i f (b == NULL) e x i t ( EXIT_FAILURE ) ; 7 8 9 memcpy(b , a , size ) ; 10 11 omp_set_num_threads ( num_threads ) ; omp_set_nested ( 1 ) ; omp_set_max_active_levels ( threshold ) ; 12 13 14 15 s p l i t ( a , b , 0 , num_elements , threshold ) ; 16 17 free (b ) ; 18 19 } Solution: Merge Sort with OpenMP Sections 2 / 2 1 2 3 4 void s p l i t ( int64_t * a , int64_t * b , s i z e _ t begin , s i z e _ t end , i n t threshold ) { i f (end − begin < 2) return ; 5 swap(&a , &b ) ; 6 7 s i z e _ t mid = ( begin + end) / 2; 8 9 #pragma omp p a r a l l e l sections { #pragma omp section s p l i t ( a , b , begin , mid , threshold ) ; 10 11 12 13 14 #pragma omp section s p l i t ( a , b , mid , end , threshold ) ; 15 16 } merge( a , b , begin , mid , end ) ; 17 18 19 } Which level to parallelize? -> Dependency Analysis 1 2 3 4 5 6 7 8 9 10 f o r ( i n t i = 1; i < I_MAX ; i++) { f o r ( i n t j = 1; j < J_MAX ; j++) { f o r ( i n t k = 1; k < K_MAX; k++) { A[ i ] [ j ] [ k ] = 2 * A[ i −1][ j −1][k ] ; } } } Which level to parallelize? -> Dependency Analysis 1 2 3 4 5 6 7 8 9 10 11 f o r ( i n t i = 1; i < I_MAX ; i++) { f o r ( i n t j = 1; j < J_MAX ; j++) { #pragma omp p a r a l l e l f o r f o r ( i n t k = 1; k < K_MAX; k++) { A[ i ] [ j ] [ k ] = 2 * A[ i −1][ j −1][k ] ; } } } Which level to parallelize? -> Dependency Analysis 1 2 3 4 5 6 7 8 9 10 11 f o r ( i n t i = 1; i < I_MAX ; i++) { f o r ( i n t j = 1; j < J_MAX ; j++) { #pragma omp p a r a l l e l f o r / / <−−− What happens here? f o r ( i n t k = 1; k < K_MAX; k++) { A[ i ] [ j ] [ k ] = 2 * A[ i −1][ j −1][k ] ; } / / <−−− What happens here? } } Which level to parallelize? -> Dependency Analysis 1 2 3 4 5 6 7 8 9 10 11 12 #pragma omp p a r a l l e l f o r ( i n t i = 1; i < I_MAX ; i++) { f o r ( i n t j = 1; j < J_MAX ; j++) { #pragma omp f o r f o r ( i n t k = 1; k < K_MAX; k++) { A[ i ] [ j ] [ k ] = 2 * A[ i −1][ j −1][k ] ; } / / <−−− What happens here? } } Reducing the Overhead of inner Loop Parallelization I Starting a parallel region and synchronizing it in the end is costly I Use ‘nowait‘ clause reduces the overhead I ‘nowait‘ only works with static scheduling I Removes the synchronization at the end of a #pragma omp for Using ‘nowait‘ in inner Loop 1 2 3 4 5 6 7 8 9 10 11 12 #pragma omp p a r a l l e l f o r ( i n t i = 1; i < I_MAX ; i++) { f o r ( i n t j = 1; j < J_MAX ; j++) { #pragma omp f o r nowait f o r ( i n t k = 1; k < K_MAX; k++) { A[ i ] [ j ] [ k ] = 2 * A[ i −1][ j −1][k ] ; } } } Using ‘nowait‘ in inner Loop 1 2 3 #define I_MAX 1000 #define J_MAX 1000 #define K_MAX 10 4 5 6 7 $ . / omp_inner_loop Inner Loop with nowait takes : 0.121344 seconds Inner Loop without nowait takes : 1.163177 seconds
© Copyright 2025 ExpyDoc