Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <stdio.h>
- #include <string.h>
- #include <stdlib.h>
- #include <sys/time.h>
- #include <time.h>
- #include <math.h>
- #include "evaluate.h"
- #include <omp.h>
- void initialize(double** A, double** B, double** C, int* n);
- void* tune_matrix_multiply();
- void matrix_multiply(double* A, double* B, double* C, int n, void* tunning_params);
- void getUserID(char userID[20],char userName[30]);
- int main(int argc, char *argv[]){
- srand(time(NULL));
- void* tunning_params = tune_matrix_multiply();
- int n;
- double *A, *B, *C, tolerance = pow(10,-7);
- char userID[20];
- char userName[30];
- getUserID(userID,userName);
- if (debug_run(argc, argv,userID,userName)) { /* This is debug run for students to test their code. Do not edit this. */
- initialize(&A, &B, &C, &n);
- double etOpenMPst, etOpenMPsp;
- etOpenMPst = omp_get_wtime();
- matrix_multiply(A, B, C, n, tunning_params); /* Multiply B and C to store the results into A <- B * C */
- etOpenMPsp = omp_get_wtime();
- double total_flops = myCalcFlops(etOpenMPst, etOpenMPsp, n);
- printf("Flops : %lf\n",total_flops);
- int check = check_Correctness(A, B, C, n, tolerance);
- if(check) printf("Correct : true \n");
- else printf("Correct : false \n");
- free(A);
- free(B);
- free(C);
- } else{
- /* This is an evaluation run
- * and it will be used for evaluation of student answers
- * so dont wory about the code in the else block.
- */
- int sizes[4];
- find_matrix_sizes(argc, argv, sizes);
- double limit = 100.0, flops[4]= {0.0,0.0,0.0,0.0};
- int s, correctness[4] = {0,0,0,0};
- for(s=0;s<4;s++){
- int n0=sizes[s];
- //allocate memory of A, B, C
- A = (double *) malloc(n0*n0*sizeof(double));
- B = (double *) malloc(n0*n0*sizeof(double));
- C = (double *) malloc(n0*n0*sizeof(double));
- int j, i, k;
- for(j=0;j<5;j++){
- // Randomly initializing elements B & C from 0-limit
- /* B is a Lower Triangular Matrix and C is a n/16 banded Matrix */
- for(i=0;i<n0;i++){
- for(k=0;k<n0;k++){
- *(A + n0*i + k) = 0.0;
- if (k>i) *(B + n0*i + k) = 0.0;
- else *(B + n0*i + k) = (double)rand()/(double)(RAND_MAX/limit);
- if (k>i+n0/16-1 || i>k+n0/16-1) *(C + n0*i + k) = 0.0;
- else *(C + n0*i + k) = (double)rand()/(double)(RAND_MAX/limit);
- }
- }
- if(j==0){
- double etOpenMPst, etOpenMPsp;
- etOpenMPst = omp_get_wtime();
- matrix_multiply(A, B, C, n0, tunning_params);
- etOpenMPsp = omp_get_wtime();
- }
- else{
- double etOpenMPst, etOpenMPsp;
- etOpenMPst = omp_get_wtime();
- matrix_multiply(A, B, C, n0, tunning_params);
- etOpenMPsp = omp_get_wtime();
- flops[s] += myCalcFlops(etOpenMPst,etOpenMPsp, n0);
- int check = check_Correctness(A, B, C, n0, tolerance);
- if(check==1){
- correctness[s]++;
- }
- }
- }
- flops[s] = flops[s]/4.0;
- free(A);
- free(B);
- free(C);
- }
- double overall_flop_score = (flops[0] + 2 * flops[1] + 3 * flops[2] + 4 * flops[3])/10;
- int q,overall_correctness = 1;
- for(q=0;q<4;q++){
- if(correctness[q] != 4){
- overall_correctness = 0;
- }
- }
- printf("UserName:%s\tuserID:%s\n",userName,userID);
- printf("Size\tcorrectness\tflops\n");
- printf("small \t\t%d\t%lf\n",correctness[0],flops[0]);
- printf("medium \t\t%d\t%lf\n",correctness[1],flops[1]);
- printf("large \t\t%d\t%lf\n",correctness[2],flops[2]);
- printf("extralarge \t%d\t%lf\n",correctness[3],flops[3]);
- printf("\n");
- printf("Score = %lf\n",overall_correctness * overall_flop_score);
- FILE* outputfile=fopen("output.csv","w");
- fprintf(outputfile, "\"%s\",\"%s\",%d,%lf,%d,%lf,%d,%lf,%d,%lf,%lf",userID,userName,correctness[0],flops[0],correctness[1],flops[1],correctness[2],flops[2],correctness[3],flops[3],overall_correctness * overall_flop_score);
- fclose(outputfile);
- FILE* outputfile2=fopen("output2.csv","w");
- fprintf(outputfile, "\"%s\",%d,%lf,%d,%lf,%d,%lf,%d,%lf,%lf",userName,correctness[0],flops[0],correctness[1],flops[1],correctness[2],flops[2],correctness[3],flops[3],overall_correctness * overall_flop_score);
- fclose(outputfile2);
- }
- return 0;
- }
- //Add only code in the functions
- void getUserID(char userID[20],char userName[30]){
- //Enter your moodle userID here i.e. your entry number
- strcpy(userID,"cs1120202");
- //Enter your name here
- strcpy(userName,"Aashish Goyal");
- }
- void initialize(double** A, double** B, double** C, int* n){
- /*
- Initialize matrix A , B & C where each of them is a square matrix of size n*n
- Try 4 sizes of matrix:
- a) small - should fit in the L1 cache
- b) medium - should fit in the L2 cache
- b) large - should fit in the L3 cache
- b) extra-large - should fit in the memory
- */
- *n=1024;
- *A = (double *) malloc(*n**n*sizeof(double));
- *B = (double *) malloc(*n**n*sizeof(double));
- *C = (double *) malloc(*n**n*sizeof(double));
- int i, k;
- for(i=0;i<*n;i++){
- for(k=0;k<*n;k++){
- *(*A + *n*i + k) = 0.0;
- if(k>i) *(*B + *n*i + k) = 0.0;
- else *(*B + *n*i + k) = (double)rand()/(double)(RAND_MAX/100.0);
- if(k>i+*n/16-1 || i>k+*n/16-1) *(*C + *n*i + k) = 0.0;
- else *(*C + *n*i + k) = (double)rand()/(double)(RAND_MAX/100.0);
- }
- }
- }
- void* tune_matrix_multiply(){
- /* Write code to test various methods to optimize the matrix multiplication
- and store these parameters in a data structure(for ex, array of values ). This
- data structure can be returned by this function and will be passed to the
- matrix multiply function so that it can efficiently multiply the matrices.
- */
- printf("Number of procs %d \n",omp_get_num_procs() );
- // omp_set_num_threads(omp_get_num_procs() - 2);
- int n = 1024;
- int i,j,k;
- int* mapper = (int*)malloc(sizeof(int) *(n*(n+1)));
- #pragma omp parallel for private(i,j,k) shared(n, mapper) default(none) collapse(1)
- for(i = 0; i < n; i++){
- for(j = 0, k = (i*(i+1))/2 ; j < i+1; j++){
- mapper[(k+j)*2] = i;
- mapper[(k+j)*2 + 1] = j;
- }
- }
- mapper[0] = omp_get_num_procs() + 1;
- // return (void*)mapper;
- return mapper;
- }
- #define CR(x,y) ((x*n)+y)
- #define CR2(x,y) ((x<<d)+y)
- #define MIN(x,y) (((x)<(y)) ? (x):(y) )
- #define MAX(x,y) (((x)>(y)) ? (x):(y) )
- void matrix_multiply(double* A, double* B, double* C, int n, void* tunning_params){
- // Multiply matrix B & C and store the results in A
- // CORRECTNESS IS ENSURED AS PARALLELISM ONLY ON i
- // printf("number of threads %d \n", omp_get_num_threads());
- // on gcl computer able to get 23 Gflops with this code
- // register int i,j,k;
- // #pragma omp parallel private(i,j,k) shared(A,B,C,n) default(none)
- // {
- // #pragma omp for schedule(auto)
- // for(i = 0; i < n; i++){
- // for(k = 0; k < n; k++){
- // for(j = 0; j < n; j++){
- // A[CR(i,j)] += B[CR(i,k)] * C[CR(k,j)];
- // }
- // }
- // }
- // }
- // int i,j,k;
- // double *D, *E, *F;
- // printf("%d\n" , n);
- // #pragma omp parallel private(i,j,k,D,E,F) shared(A,B,C,n) default(none)
- // {
- // // #pragma omp for schedule(auto)
- // #pragma omp parallel for schedule(static) private(i,j,k,D,E,F) shared(A,B,C,n) default(none) collapse(2)
- // for(i = 0; i < n; i++){
- // for(k = 0, E = (B + (i*n)), F = C; k < n; k++, E++){
- // for(j = 0, D = (A + (i*n)); j < n; j++,D++,F++){
- // *D += (*E) * (*F);
- // }
- // }
- // }
- // }
- int *tune = tunning_params;
- // current best
- int i,j,k;
- int m = n;
- // fresh code
- switch (n){
- case 1024:{
- // #pragma omp parallel for private(i,j,k) shared(A,B,C,n) default(none) schedule(guided)
- // #pragma omp parallel for schedule(dynamic) num_threads(tune[0])
- // for(i = n-1; i >= 0; i--){
- // F = A + (i << d);
- // G = B + (i << d);
- // for(k = 0; k < i + 1; k++){
- // // double reg = B[CR2(i,k)];
- // double reg = *(G + k);
- // const int val1 = MAX(0, k-n_16+1);
- // const int val2 = MIN(n, k+n_16);
- // for(j = val1, E = F + val1, H = C + CR2(k,val1); j < val2; j++, E++, H++){
- // // *E += reg * C[CR2(k,j)];
- // *E += reg * (*H);
- // }
- // }
- // }
- int n_16 = 1024/16;
- const int n = 1024;
- #pragma omp parallel for schedule(dynamic) num_threads(omp_get_num_procs()+1)
- for(i = n-1; i >= 0; i--){
- for( k = 0; k <= (i); k++){
- const int val1 = MAX(0, k-n_16+1);
- const int val2 = MIN(n, k+n_16);
- // for( j = 0; j < (val2-val1); j++){
- for( j = val1; j < (val2); j++){
- A[(i*n)+(j)] = A[(i*n)+(j)] + B[(i*n)+k]*C[(k*n)+(j)];
- }
- }
- }
- return;
- }
- case 256:{
- // const int n = 256;
- int d = 8;
- // const int p = 256/16;
- #pragma omp parallel for collapse(1) num_threads(9) schedule(dynamic)
- for(i = 0; i < n; i++){
- for(k = 0; k < i+1; k++){
- for(j = MAX(0, k-n/16+1); j < MIN(n, k+n/16); j++){
- A[CR(i,j)] += B[CR(i,k)] * C[CR(k,j)];
- }
- }
- }
- return;
- }
- case 64:{
- int l , q , p;
- for(i = 0 ; i < n ; i++){
- for(j = 0 , l = MIN(i+2,n) ,q = MIN(i+3,n) , p = n*i ; j < i + 1 ; j++){
- A[p+j] += B[p + j] * C[n*j + j];
- if(i && j < i) A[p + j] += B[p + j + 1] * C[n*(j+1) + j];
- if(i-1 && j < i-1) A[p + j] += B[p + j + 2] * C[n*(j+2) + j];
- if(i-2 && j < i-2) A[p + j] += B[p + j + 3] * C[n*(j+3) + j];
- if(j + 1 && j + 1 < l) A[p + j + 1] += B[p + j]*C[n*j + j + 1];
- if(j + 2 && j + 2 < q) A[p + j + 2] += B[p + j]*C[n*j + j + 2];
- if(j + 3 && j + 3 < MIN(i+4,n)) A[p + j + 3] += B[p + j]*C[n*j + j + 3];
- }
- }
- return;
- }
- case 32:{
- for(i = 0 ; i < n ; i++){
- int l, p ;
- int d = 5;
- for(j = 0 , l = MIN(i+2,n) , p = i<<d ; j < i + 1 ; j++){
- A[p+j] += B[p + j] * C[n*j + j];
- if(i && j < i) A[p + j] += B[p + j + 1] * C[((j+1)<<d) + j];
- if(j + 1 && j + 1 < l) A[p + j + 1] += B[p + j]*C[(j<<d) + j + 1];
- }
- }
- return;
- }
- default:{
- const int p = n/16;
- #pragma omp parallel for private(i,j,k) shared(A,B,C,n) default(none) collapse(1) schedule(dynamic) num_threads(tune[0])
- for(i = 0; i < n; i++){
- for(k = 0; k < i+1; k++){
- for(j = MAX(0, k-p+1); j < MIN(n, k+p); j++){
- A[CR(i,j)] += B[CR(i,k)] * C[CR(k,j)];
- }
- }
- }
- return;
- }
- }
- // if (n > 300)
- // {
- // const int n = m;
- // const int n_16 = n/16;
- // // #pragma omp parallel for private(i,j,k) shared(A,B,C,n) default(none) schedule(guided)
- // #pragma omp parallel for schedule(dynamic) num_threads(tune[0])
- // for(i = n-1; i >= 0 ; i--){
- // for(k = 0; k < i+1; k++){
- // double register reg = B[CR(i,k)];
- // const int val1 = MAX(0, k-n_16+1);
- // const int val2 = MIN(n, k+n_16);
- // for(j = val1; j < val2; j++){
- // A[CR(i,j)] += reg * C[CR(k,j)];
- // }
- // }
- // }
- // }
- // else if (n > 100){
- // const int p = n/16;
- // #pragma omp parallel for private(i,j,k) shared(A,B,C,n) default(none) collapse(1) schedule(dynamic) num_threads(tune[0])
- // for(i = 0; i < n; i++){
- // for(k = 0; k < i+1; k++){
- // for(j = MAX(0, k-p+1); j < MIN(n, k+p); j++){
- // A[CR(i,j)] += B[CR(i,k)] * C[CR(k,j)];
- // }
- // }
- // }
- // }
- // else {
- // // #pragma omp parallel for private(i,j,k) shared(A,B,C,n) default(none) collapse(1) schedule(dynamic)
- // // for(i = 0; i < n; i++){
- // // for(k = 0; k < i+1; k++){
- // // for(j = MAX(0, k-n/16+1); j < MIN(n, k+n/16); j++){
- // // A[CR(i,j)] += B[CR(i,k)] * C[CR(k,j)];
- // // }
- // // }
- // // }
- // if(n == 32){
- // for(i = 0 ; i < n ; i++){
- // int l, p ;
- // int d = 0;
- // int m = 1;
- // while(m < n){
- // d += 1;
- // m *= 2;
- // }
- // for(j = 0 , l = MIN(i+2,n) , p = i<<d ; j < i + 1 ; j++){
- // A[p+j] += B[p + j] * C[n*j + j];
- // if(i && j < i) A[p + j] += B[p + j + 1] * C[((j+1)<<d) + j];
- // if(j + 1 && j + 1 < l) A[p + j + 1] += B[p + j]*C[(j<<d) + j + 1];
- // }
- // }
- // return;
- // }
- // else if(n == 64){
- // int l , q , p;
- // for(i = 0 ; i < n ; i++){
- // for(j = 0 , l = MIN(i+2,n) ,q = MIN(i+3,n) , p = n*i ; j < i + 1 ; j++){
- // A[p+j] += B[p + j] * C[n*j + j];
- // if(i && j < i) A[p + j] += B[p + j + 1] * C[n*(j+1) + j];
- // if(i-1 && j < i-1) A[p + j] += B[p + j + 2] * C[n*(j+2) + j];
- // if(i-2 && j < i-2) A[p + j] += B[p + j + 3] * C[n*(j+3) + j];
- // if(j + 1 && j + 1 < l) A[p + j + 1] += B[p + j]*C[n*j + j + 1];
- // if(j + 2 && j + 2 < q) A[p + j + 2] += B[p + j]*C[n*j + j + 2];
- // if(j + 3 && j + 3 < MIN(i+4,n)) A[p + j + 3] += B[p + j]*C[n*j + j + 3];
- // }
- // }
- // return;
- // }
- // else{
- // #pragma omp parallel for private(i,j,k) shared(A,B,C,n) default(none) collapse(1) schedule(dynamic)
- // for(i = 0; i < n; i++){
- // for(k = 0; k < i+1; k++){
- // for(j = MAX(0, k-n/16+1); j < MIN(n, k+n/16); j++){
- // A[CR(i,j)] += B[CR(i,k)] * C[CR(k,j)];
- // }
- // }
- // }
- // }
- // }
- // next
- // int i,j,k;
- // // int mapper[(n*(n+1))];
- // // #pragma omp parallel for private(i,j,k) shared(n, mapper) default(none) collapse(1)
- // // for(i = 0; i < n; i++){
- // // for(j = 0, k = (i*(i+1))/2 ; j < i+1; j++){
- // // mapper[(k+j)*2] = i;
- // // mapper[(k+j)*2 + 1] = j;
- // // }
- // // }
- // int *mapper = (int*) tunning_params;
- // // for(i = 0; i < 10; i++){
- // // printf("%d ", mapper[2*i + 1]);
- // // }printf("\n");
- // int l;
- // int m;
- // #pragma omp parallel for private(i,j,k,l) shared(A,B,C,n,mapper) default(none) collapse(1) schedule()
- // for(l = 0; l < n*(n+1)/2; l++){
- // i = mapper[2*(l)];
- // k = mapper[2*(l) + 1];
- // for(j = MAX(0, k-n/16+1); j < MIN(n, k+n/16); j++){
- // #pragma omp atomic
- // A[CR(i,j)] += B[CR(i,k)] * C[CR(k,j)];
- // }
- // }
- // int i,j,k;
- // for(i = 0; i < n; i++){
- // for(j = MAX(0, i - n/16 + 1); j < i; j++){
- // double tmp = C[CR(i,j)];
- // C[CR(i,j)] = C[CR(j,i)];
- // C[CR(j,i)] = tmp;
- // }
- // }
- // {
- // #pragma omp parallel for private(i,j,k) shared(A,B,C,n) default(none) collapse(2) schedule(dynamic)
- // for(i = 0; i < n; i++){
- // for( j = 0; j < (n); j++ ){
- // double register val = 0;
- // for( k = MAX(0, j-n/16+1); k < MIN(i+1, j+n/16); k++){
- // val += B[CR(i,k)] * C[CR(j,k)];
- // }
- // A[CR(i,j)] = val;
- // }
- // }
- // }
- // for(i = 0; i < n; i++){
- // for(j = MAX(0, i - n/16 + 1); j < i; j++){
- // double tmp = C[CR(i,j)];
- // C[CR(i,j)] = C[CR(j,i)];
- // C[CR(j,i)] = tmp;
- // }
- // }
- // int i,j,k;
- // int m = n;
- // {
- // const int n = m;
- // double *D, *E, *F;
- // #pragma omp parallel for default(none) private(i, j, k, D, E, F) shared(A, B, C)
- // for(i = 0; i < n ; i++){
- // for(k = 0, E = (B + (i*n)); k < i + 1; k++, E++){
- // for (j = MAX(0, k-n/16+1), D = (A + (i * n) + j), F = (C + (k * n) +j) ; j < MIN(n,k+n/1); j++, D++, F++){
- // *D += (*E) * (*F);
- // }
- // }
- // }
- // }
- }
- // #pragma GCC pop_options
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement