/*************************************************************************/ /* */ /* Copyright (c) 1994 Stanford University */ /* */ /* All rights reserved. */ /* */ /* Permission is given to use, copy, and modify this software for any */ /* non-commercial purpose as long as this copyright notice is not */ /* removed. All other uses, including redistribution in whole or in */ /* part, are forbidden without prior written permission. */ /* */ /* This software is provided with absolutely no warranty and no */ /* support. */ /* */ /*************************************************************************/ /*************************************************************************/ /* */ /* Perform 1D fast Fourier transform using six-step FFT method */ /* */ /* 1) Performs staggered, blocked transposes for cache-line reuse */ /* 2) Roots of unity rearranged and distributed for only local */ /* accesses during application of roots of unity */ /* 3) Small set of roots of unity elements replicated locally for */ /* 1D FFTs (less than root N elements replicated at each node) */ /* 4) Matrix data structures are padded to reduce cache mapping */ /* conflicts */ /* */ /* Command line options: */ /* */ /* -mM : M = even integer; 2**M total complex data points transformed. */ /* -pP : P = number of processors; Must be a power of 2. */ /* -nN : N = number of cache lines. */ /* -lL : L = Log base 2 of cache line length in bytes. */ /* -s : Print individual processor timing statistics. */ /* -t : Perform FFT and inverse FFT. Test output by comparing the */ /* integral of the original data to the integral of the data */ /* that results from performing the FFT and inverse FFT. */ /* -o : Print out complex data points. */ /* -h : Print out command line options. */ /* */ /* Note: This version works under both the FORK and SPROC models */ /* */ /*************************************************************************/ #include #include #include #define MAXRAND 2147483647 #define PAGE_SIZE 4096 #define NUM_CACHE_LINES 65536 #define LOG2_LINE_SIZE 4 #define PI 3.1416 #define DEFAULT_M 10 #define DEFAULT_P 1 #include /*---------------------------------------*/ #include #include #include #include #include #include #include #include #include #include #include #include int smp_cyg_test_main(int argc, char **argv); void smp_cyg_test_main_call(void *p) { smp_cyg_test_main(smp_cyg_test_argc, smp_cyg_test_argv); } /* #define STACK_SIZE 0x1b000 */ #define STACK_SIZE 8192 static cyg_thread smp_cyg_test_thread; static char smp_cyg_test_stack[STACK_SIZE]; static cyg_handle_t smp_cyg_test_threadh; cyg_handle_t threadh[64]; char *stack[64]; cyg_thread thread[64]; externC void cyg_user_start( void ) { CYG_TEST_INIT(); diag_printf("Starting test app\n"); cyg_thread_create(10, // Priority - just a number smp_cyg_test_main_call, // entry 0, // index "smp test", // Name smp_cyg_test_stack, // Stack STACK_SIZE, // Size &smp_cyg_test_threadh, // Handle &smp_cyg_test_thread // Thread data structure ); cyg_thread_resume( smp_cyg_test_threadh ); /*cyg_scheduler_start();*/ } /*---------------------------------------*/ #define SWAP(a,b) {double tmp; tmp=a; a=b; b=tmp;} struct GlobalMemory { int id; cyg_spinlock_t idlock; /*---------------------------------------*/ struct startTYP { cyg_spinlock_t lock;; int count[1]; cyg_sem_t queue[1];; } start[1]; /*---------------------------------------*/ int *transtimes; int *totaltimes; int starttime; int finishtime; int initdonetime; } *Global; int dbg_on = 0; int P = DEFAULT_P; int M = DEFAULT_M; int N; /* N = 2^M */ int rootN; /* rootN = N^1/2 */ double *x; /* x is the original time-domain data */ double *trans; /* trans is used as scratch space */ double *umain; /* umain is roots of unity for 1D FFTs */ double *umain2; /* umain2 is entire roots of unity matrix */ int test_result = 0; int doprint = 0; int dostats = 0; int transtime = 0; int transtime2 = 0; int avgtranstime = 0; int avgcomptime = 0; unsigned int transstart = 0; unsigned int transend = 0; int maxtotal=0; int mintotal=0; double maxfrac=0; double minfrac=0; double avgfractime=0; int orig_num_lines = NUM_CACHE_LINES; /* number of cache lines */ int num_cache_lines = NUM_CACHE_LINES; /* number of cache lines */ int log2_line_size = LOG2_LINE_SIZE; int line_size; int rowsperproc; double ck1; double ck3; /* checksums for testing answer */ int pad_length; void SlaveStart(); double TouchArray(double *,double *,double *,double *,int,int,int,int); void FFT1D(int,int,int,double *,double *,double *,double *,int,int *,int, int,int,int,int,int,int,struct GlobalMemory *); double CheckSum(); double drand48(); int log_2(int); void printerr(char *); volatile int thread_array[4] = {0, 0, 0, 0}; int smp_cyg_test_main(int argc, char **argv) { int i; int j; int c; extern char *optarg; int m1; int factor; int pages; unsigned int start; //memset (thread_array,0,sizeof(thread_array)); { (start) = cyg_current_time()*10; } ; // extern Cyg_Mempool_dlmalloc *cygmem_memalloc_heaps[ 2 ]; // cygmem_memalloc_heaps[0]. // printf(" thread %d Arguments (%d): ",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),argc); // for (i = 0;i < argc;i++){ // printf("%s ",argv[i]); // } // printf("\n"); while ((c = getopt(argc, argv, "p:m:n:l:stoh")) != -1) { switch(c) { case 'p': P = atoi(optarg); P = HAL_SMP_CPU_MAX; if (P < 1) { printerr("P must be >= 1\n"); exit(-1); } if (log_2(P) == -1) { printerr("P must be a power of 2\n"); exit(-1); } break; case 'm': M = atoi(optarg); m1 = M/2; if (2*m1 != M) { printerr("M must be even\n"); exit(-1); } break; case 'n': num_cache_lines = atoi(optarg); orig_num_lines = num_cache_lines; if (num_cache_lines < 1) { printerr("Number of cache lines must be >= 1\n"); exit(-1); } break; case 'l': log2_line_size = atoi(optarg); if (log2_line_size < 0) { printerr("Log base 2 of cache line length in bytes must be >= 0\n"); exit(-1); } break; case 's': dostats = !dostats; break; case 't': test_result = !test_result; break; case 'o': doprint = !doprint; break; case 'h': printf("Usage: FFT \n\n"); printf("options:\n"); printf(" -mM : M = even integer; 2**M total complex data points transformed.\n"); printf(" -pP : P = number of processors; Must be a power of 2.\n"); printf(" -nN : N = number of cache lines.\n"); printf(" -lL : L = Log base 2 of cache line length in bytes.\n"); printf(" -s : Print individual processor timing statistics.\n"); printf(" -t : Perform FFT and inverse FFT. Test output by comparing the\n"); printf(" integral of the original data to the integral of the data that\n"); printf(" results from performing the FFT and inverse FFT.\n"); printf(" -o : Print out complex data points.\n"); printf(" -h : Print out command line options.\n\n"); printf("Default: FFT -m%1d -p%1d -n%1d -l%1d\n", DEFAULT_M,DEFAULT_P,NUM_CACHE_LINES,LOG2_LINE_SIZE); exit(0); break; } } if (P > 64) { printf("Maximal 64 processes\n"); exit(0); } {;}; N = 1<= P\n"); exit(-1); } line_size = 1 << log2_line_size; if (line_size < 2*sizeof(double)) { printf("WARNING: Each element is a complex double (%d bytes)\n",2*sizeof(double)); printf(" => Less than one element per cache line\n"); printf(" Computing transpose blocking factor\n"); factor = (2*sizeof(double)) / line_size; num_cache_lines = orig_num_lines / factor; } if (line_size <= 2*sizeof(double)) { pad_length = 1; } else { pad_length = line_size / (2*sizeof(double)); } if (rowsperproc * rootN * 2 * sizeof(double) >= PAGE_SIZE) { pages = (2 * pad_length * sizeof(double) * rowsperproc) / PAGE_SIZE; if (pages * PAGE_SIZE != 2 * pad_length * sizeof(double) * rowsperproc) { pages ++; } pad_length = (pages * PAGE_SIZE) / (2 * sizeof(double) * rowsperproc); } else { pad_length = (PAGE_SIZE - (rowsperproc * rootN * 2 * sizeof(double))) / (2 * sizeof(double) * rowsperproc); if (pad_length * (2 * sizeof(double) * rowsperproc) != (PAGE_SIZE - (rowsperproc * rootN * 2 * sizeof(double)))) { printerr("Padding algorithm unsuccessful\n"); exit(-1); } } Global = (struct GlobalMemory *) calloc(sizeof(struct GlobalMemory), 1);; if (Global == NULL) { printf("Could not malloc memory for Global (%d)\n",sizeof(struct GlobalMemory)); exit(-1); } x = (double *) calloc(2*(N+rootN*pad_length)*sizeof(double)+PAGE_SIZE, 1);; trans = (double *) calloc(2*(N+rootN*pad_length)*sizeof(double)+PAGE_SIZE, 1);; umain = (double *) calloc(2*rootN*sizeof(double), 1);; umain2 = (double *) calloc(2*(N+rootN*pad_length)*sizeof(double)+PAGE_SIZE, 1);; Global->transtimes = (int *) calloc(P*sizeof(int), 1);; Global->totaltimes = (int *) calloc(P*sizeof(int), 1);; if (x == NULL) { printerr("Could not malloc memory for x\n"); exit(-1); } else if (trans == NULL) { printerr("Could not malloc memory for trans\n"); exit(-1); } else if (umain == NULL) { printerr("Could not malloc memory for umain\n"); exit(-1); } else if (umain2 == NULL) { printerr("Could not malloc memory for umain2\n"); exit(-1); } x = (double *) (((unsigned) x) + PAGE_SIZE - ((unsigned) x) % PAGE_SIZE); trans = (double *) (((unsigned) trans) + PAGE_SIZE - ((unsigned) trans) % PAGE_SIZE); umain2 = (double *) (((unsigned) umain2) + PAGE_SIZE - ((unsigned) umain2) % PAGE_SIZE); /* In order to optimize data distribution, the data structures x, trans, and umain2 have been aligned so that each begins on a page boundary. This ensures that the amount of padding calculated by the program is such that each processor's partition ends on a page boundary, thus ensuring that all data from these structures that are needed by a processor can be allocated to its local memory */ /* POSSIBLE ENHANCEMENT: Here is where one might distribute the x, trans, and umain2 data structures across physically distributed memories as desired. One way to place data is as follows: double *base; int i; i = ((N/P)+(rootN/P)*pad_length)*2; base = &(x[0]); for (j=0;jstart[mon_dum1].count[mon_dum2] = 0; { cyg_semaphore_init(&Global->start[mon_dum1].queue[mon_dum2],0);}; } for (mon_dum1=0; mon_dum1 < 1; mon_dum1++) { cyg_spinlock_init( &Global->start[mon_dum1].lock,0 );; } /*---------------------------------------*/ }}; cyg_spinlock_init( &Global->idlock,0 );; Global->id = 0; InitX(N, x); /* place random values in x */ if (test_result) { ck1 = CheckSum(N, x); } if (doprint) { printf("Original data values:\n"); PrintArray(N, x); } InitU(N,umain); /* initialize u arrays*/ InitU2(N,umain2,rootN); /* fire off P processes */ for (i=1; itranstimes[0]; printf("\n"); printf(" PROCESS STATISTICS\n"); printf(" Computation Transpose Transpose\n"); printf(" Proc Time Time Fraction\n"); printf(" 0 %10d %10d %8.5f\n", Global->totaltimes[0],Global->transtimes[0], ((double)Global->transtimes[0])/Global->totaltimes[0]); if (dostats) { transtime2 = Global->transtimes[0]; avgtranstime = Global->transtimes[0]; avgcomptime = Global->totaltimes[0]; maxtotal = Global->totaltimes[0]; mintotal = Global->totaltimes[0]; maxfrac = ((double)Global->transtimes[0])/Global->totaltimes[0]; minfrac = ((double)Global->transtimes[0])/Global->totaltimes[0]; avgfractime = ((double)Global->transtimes[0])/Global->totaltimes[0]; for (i=1;itranstimes[i] > transtime) { transtime = Global->transtimes[i]; } if (Global->transtimes[i] < transtime2) { transtime2 = Global->transtimes[i]; } if (Global->totaltimes[i] > maxtotal) { maxtotal = Global->totaltimes[i]; } if (Global->totaltimes[i] < mintotal) { mintotal = Global->totaltimes[i]; } if (((double)Global->transtimes[i])/Global->totaltimes[i] > maxfrac) { maxfrac = ((double)Global->transtimes[i])/Global->totaltimes[i]; } if (((double)Global->transtimes[i])/Global->totaltimes[i] < minfrac) { minfrac = ((double)Global->transtimes[i])/Global->totaltimes[i]; } printf(" %3d %10d %10d %f\n", i,Global->totaltimes[i],Global->transtimes[i], ((double)Global->transtimes[i])/Global->totaltimes[i]); avgtranstime += Global->transtimes[i]; avgcomptime += Global->totaltimes[i]; avgfractime += ((double)Global->transtimes[i])/Global->totaltimes[i]; } printf(" Avg %10f %10f %f\n", ((double) avgcomptime)/P,((double) avgtranstime)/P,avgfractime/P); printf(" Max %10d %10d %f\n", maxtotal,transtime,maxfrac); printf(" Min %10d %10d %f\n", mintotal,transtime2,minfrac); } Global->starttime = start; printf("\n"); printf(" TIMING INFORMATION\n"); printf("Start time : %16d\n", Global->starttime); printf("Initialization finish time : %16d\n", Global->initdonetime); printf("Overall finish time : %16d\n", Global->finishtime); printf("Total time with initialization : %16d\n", Global->finishtime-Global->starttime); printf("Total time without initialization : %16d\n", Global->finishtime-Global->initdonetime); printf("Overall transpose time : %16d\n", transtime); printf("Overall transpose fraction : %f\n", ((double) transtime)/(Global->finishtime-Global->initdonetime)); printf("\n"); if (test_result) { ck3 = CheckSum(N, x); printf(" INVERSE FFT TEST RESULTS\n"); printf("Checksum difference is %f (%f, %f)\n", ck1-ck3, ck1, ck3); if (fabs(ck1-ck3) < 0.001) { printf("TEST PASSED\n"); } else { printf("TEST FAILED\n"); } } { diag_printf("FP test end\n"); return 0; } ; } void SlaveStart() { int i; int j; int MyNum; double error; double *upriv; int initdone; int finish; int l_transtime=0; int MyFirst; int MyLast; cyg_spinlock_spin(&Global->idlock);; MyNum = Global->id; Global->id++; #ifdef FFT_DBG printf("Slave %d started\n",MyNum); #endif cyg_spinlock_clear(&Global->idlock);; /* POSSIBLE ENHANCEMENT: Here is where one might pin processes to processors to avoid migration */ { /*---------------------------------*/ cyg_spinlock_spin(&( Global->start[0].lock ) );; if (dbg_on) { diag_printf(" thread %d: >b (%d:%d:%d): Enter barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),1,Global->start[0].count[0],Global->start[0].queue[0].count); } if ( Global->start[0].count[0] < (P -1) ) { Global->start[0].count[0]++; if (dbg_on) { diag_printf(" thread %d: ?b (%d:%d:%d): Wait\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),1,Global->start[0].count[0],Global->start[0].queue[0].count); } cyg_spinlock_clear(&Global->start[0].lock );; { cyg_semaphore_wait(&Global->start[0].queue[0] );}; } if (dbg_on) { diag_printf(" thread %d: start[0].count[0],Global->start[0].queue[0].count); } if ( Global->start[0].count[0] == 0 ) { if (dbg_on) { diag_printf(" thread %d: ^b (%d:%d:%d): unlock barrier \n-----------\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),1,Global->start[0].count[0],Global->start[0].queue[0].count); } cyg_spinlock_clear(&Global->start[0].lock );; } else { ( Global->start[0].count[0] )-- ; //cyg_spinlock_clear(&Global->start[0].count_lock[0] );; if (dbg_on) { diag_printf(" thread %d: @b (%d:%d:%d): unlock queue\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),1,Global->start[0].count[0],Global->start[0].queue[0].count); } { cyg_semaphore_post(&Global->start[0].queue[0] );}; } /*---------------------------------*/ }; #ifdef FFT_DBG cyg_spinlock_spin(&Global->idlock);; printf("Slave %d left barrier 1\n",MyNum); cyg_spinlock_clear(&Global->idlock);; #endif upriv = (double *) malloc(2*(rootN-1)*sizeof(double)); if (upriv == NULL) { printf("Proc %d could not malloc memory for upriv\n",MyNum); exit(-1); } for (i=0;i<2*(rootN-1);i++) { upriv[i] = umain[i]; } MyFirst = rootN*MyNum/P; MyLast = rootN*(MyNum+1)/P; TouchArray(x, trans, umain2, upriv, N, MyNum, MyFirst, MyLast); { /*---------------------------------*/ cyg_spinlock_spin(&( Global->start[0].lock ) );; if (dbg_on) { diag_printf(" thread %d: >b (%d:%d:%d): Enter barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),2,Global->start[0].count[0],Global->start[0].queue[0].count); } if ( Global->start[0].count[0] < (P -1) ) { Global->start[0].count[0]++; if (dbg_on) { diag_printf(" thread %d: ?b (%d:%d:%d): Wait\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),2,Global->start[0].count[0],Global->start[0].queue[0].count); } cyg_spinlock_clear(&Global->start[0].lock );; { cyg_semaphore_wait(&Global->start[0].queue[0] );}; } if (dbg_on) { diag_printf(" thread %d: start[0].count[0],Global->start[0].queue[0].count); } if ( Global->start[0].count[0] == 0 ) { if (dbg_on) { diag_printf(" thread %d: ^b (%d:%d:%d): unlock barrier \n-----------\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),2,Global->start[0].count[0],Global->start[0].queue[0].count); } cyg_spinlock_clear(&Global->start[0].lock );; } else { ( Global->start[0].count[0] )-- ; //cyg_spinlock_clear(&Global->start[0].count_lock[0] );; if (dbg_on) { diag_printf(" thread %d: @b (%d:%d:%d): unlock queue\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),2,Global->start[0].count[0],Global->start[0].queue[0].count); } { cyg_semaphore_post(&Global->start[0].queue[0] );}; } /*---------------------------------*/ }; #ifdef FFT_DBG cyg_spinlock_spin(&Global->idlock);; printf("Slave %d left barrier 2\n",MyNum); cyg_spinlock_clear(&Global->idlock);; #endif /* POSSIBLE ENHANCEMENT: Here is where one might reset the statistics that one is measuring about the parallel execution */ if ((MyNum == 0) || (dostats)) { { (initdone) = cyg_current_time()*10; } ; } /* perform forward FFT */ FFT1D(1, M, N, x, trans, upriv, umain2, MyNum, &l_transtime, MyFirst, MyLast, pad_length, P, test_result, doprint, dostats, Global); /* perform backward FFT */ if (test_result) { FFT1D(-1, M, N, x, trans, upriv, umain2, MyNum, &l_transtime, MyFirst, MyLast, pad_length, P, test_result, doprint, dostats, Global); } if ((MyNum == 0) || (dostats)) { { (finish) = cyg_current_time()*10; } ; Global->transtimes[MyNum] = l_transtime; Global->totaltimes[MyNum] = finish-initdone; } if (MyNum == 0) { Global->finishtime = finish; Global->initdonetime = initdone; } thread_array[MyNum] = 1; } double TouchArray(x, scratch, u, upriv, N, MyNum, MyFirst, MyLast) double *x; double *scratch; double *u; double *upriv; int N; int MyNum; int MyFirst; int MyLast; { int i,j,k; double tot = 0.0; /* touch my data */ for (j=0;j<2*(rootN-1);j++) { tot += upriv[j]; } for (j=MyFirst; j rootN-1) { return; } u[2*(base+j)] = cos(2.0*PI*j/(2*n1)); u[2*(base+j)+1] = -sin(2.0*PI*j/(2*n1)); } } } InitU2(N, u, n1) int N; double *u; int n1; { int i,j,k; int base; for (j=0; j>1; } return(j); } void FFT1D(direction, M, N, x, scratch, upriv, umain2, MyNum, l_transtime, MyFirst, MyLast, pad_length, P, test_result, doprint, dostats, Global) int direction; int M; int N; int *l_transtime; double *x; double *upriv; double *scratch; double *umain2; int MyFirst; int MyLast; int pad_length; int P; int test_result; int doprint; int dostats; struct GlobalMemory *Global; { int i; int j; int m1; int n1; int flag = 0; unsigned int clocktime1; unsigned int clocktime2; m1 = M/2; n1 = 1<start[0].lock ) );; if (dbg_on) { diag_printf(" thread %d: >b (%d:%d:%d): Enter barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),3,Global->start[0].count[0],Global->start[0].queue[0].count); } if ( Global->start[0].count[0] < (P -1) ) { Global->start[0].count[0]++; if (dbg_on) { diag_printf(" thread %d: ?b (%d:%d:%d): Wait\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),3,Global->start[0].count[0],Global->start[0].queue[0].count); } cyg_spinlock_clear(&Global->start[0].lock );; { cyg_semaphore_wait(&Global->start[0].queue[0] );}; } if (dbg_on) { diag_printf(" thread %d: start[0].count[0],Global->start[0].queue[0].count); } if ( Global->start[0].count[0] == 0 ) { if (dbg_on) { diag_printf(" thread %d: ^b (%d:%d:%d): unlock barrier \n-----------\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),3,Global->start[0].count[0],Global->start[0].queue[0].count); } cyg_spinlock_clear(&Global->start[0].lock );; } else { ( Global->start[0].count[0] )-- ; //cyg_spinlock_clear(&Global->start[0].count_lock[0] );; if (dbg_on) { diag_printf(" thread %d: @b (%d:%d:%d): unlock queue\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),3,Global->start[0].count[0],Global->start[0].queue[0].count); } { cyg_semaphore_post(&Global->start[0].queue[0] );}; } /*---------------------------------*/ }; #ifdef FFT_DBG cyg_spinlock_spin(&Global->idlock);; printf("Slave %d left barrier 3\n",MyNum); cyg_spinlock_clear(&Global->idlock);; #endif if ((MyNum == 0) || (dostats)) { { (clocktime1) = cyg_current_time()*10; } ; } /* transpose from x into scratch */ Transpose(n1, x, scratch, MyNum, MyFirst, MyLast, pad_length); if ((MyNum == 0) || (dostats)) { { (clocktime2) = cyg_current_time()*10; } ; *l_transtime += (clocktime2-clocktime1); } /* do n1 1D FFTs on columns */ for (j=MyFirst; jstart[0].lock ) );; if (dbg_on) { diag_printf(" thread %d: >b (%d:%d:%d): Enter barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),4,Global->start[0].count[0],Global->start[0].queue[0].count); } if ( Global->start[0].count[0] < (P -1) ) { Global->start[0].count[0]++; if (dbg_on) { diag_printf(" thread %d: ?b (%d:%d:%d): Wait\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),4,Global->start[0].count[0],Global->start[0].queue[0].count); } cyg_spinlock_clear(&Global->start[0].lock );; { cyg_semaphore_wait(&Global->start[0].queue[0] );}; } if (dbg_on) { diag_printf(" thread %d: start[0].count[0],Global->start[0].queue[0].count); } if ( Global->start[0].count[0] == 0 ) { if (dbg_on) { diag_printf(" thread %d: ^b (%d:%d:%d): unlock barrier \n-----------\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),4,Global->start[0].count[0],Global->start[0].queue[0].count); } cyg_spinlock_clear(&Global->start[0].lock );; } else { ( Global->start[0].count[0] )-- ; //cyg_spinlock_clear(&Global->start[0].count_lock[0] );; if (dbg_on) { diag_printf(" thread %d: @b (%d:%d:%d): unlock queue\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),4,Global->start[0].count[0],Global->start[0].queue[0].count); } { cyg_semaphore_post(&Global->start[0].queue[0] );}; } /*---------------------------------*/ }; #ifdef FFT_DBG cyg_spinlock_spin(&Global->idlock);; printf("Slave %d left barrier 4\n",MyNum); cyg_spinlock_clear(&Global->idlock);; #endif if ((MyNum == 0) || (dostats)) { { (clocktime1) = cyg_current_time()*10; } ; } /* transpose */ Transpose(n1, scratch, x, MyNum, MyFirst, MyLast, pad_length); if ((MyNum == 0) || (dostats)) { { (clocktime2) = cyg_current_time()*10; } ; *l_transtime += (clocktime2-clocktime1); } /* do n1 1D FFTs on columns again */ for (j=MyFirst; jstart[0].lock ) );; if (dbg_on) { diag_printf(" thread %d: >b (%d:%d:%d): Enter barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),5,Global->start[0].count[0],Global->start[0].queue[0].count); } if ( Global->start[0].count[0] < (P -1) ) { Global->start[0].count[0]++; if (dbg_on) { diag_printf(" thread %d: ?b (%d:%d:%d): Wait\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),5,Global->start[0].count[0],Global->start[0].queue[0].count); } cyg_spinlock_clear(&Global->start[0].lock );; { cyg_semaphore_wait(&Global->start[0].queue[0] );}; } if (dbg_on) { diag_printf(" thread %d: start[0].count[0],Global->start[0].queue[0].count); } if ( Global->start[0].count[0] == 0 ) { if (dbg_on) { diag_printf(" thread %d: ^b (%d:%d:%d): unlock barrier \n-----------\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),5,Global->start[0].count[0],Global->start[0].queue[0].count); } cyg_spinlock_clear(&Global->start[0].lock );; } else { ( Global->start[0].count[0] )-- ; //cyg_spinlock_clear(&Global->start[0].count_lock[0] );; if (dbg_on) { diag_printf(" thread %d: @b (%d:%d:%d): unlock queue\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),5,Global->start[0].count[0],Global->start[0].queue[0].count); } { cyg_semaphore_post(&Global->start[0].queue[0] );}; } /*---------------------------------*/ }; #ifdef FFT_DBG cyg_spinlock_spin(&Global->idlock);; printf("Slave %d left barrier 5\n",MyNum); cyg_spinlock_clear(&Global->idlock);; #endif if ((MyNum == 0) || (dostats)) { { (clocktime1) = cyg_current_time()*10; } ; } /* transpose back */ Transpose(n1, x, scratch, MyNum, MyFirst, MyLast, pad_length); if ((MyNum == 0) || (dostats)) { { (clocktime2) = cyg_current_time()*10; } ; *l_transtime += (clocktime2-clocktime1); } { /*---------------------------------*/ cyg_spinlock_spin(&( Global->start[0].lock ) );; if (dbg_on) { diag_printf(" thread %d: >b (%d:%d:%d): Enter barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),6,Global->start[0].count[0],Global->start[0].queue[0].count); } if ( Global->start[0].count[0] < (P -1) ) { Global->start[0].count[0]++; if (dbg_on) { diag_printf(" thread %d: ?b (%d:%d:%d): Wait\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),6,Global->start[0].count[0],Global->start[0].queue[0].count); } cyg_spinlock_clear(&Global->start[0].lock );; { cyg_semaphore_wait(&Global->start[0].queue[0] );}; } if (dbg_on) { diag_printf(" thread %d: start[0].count[0],Global->start[0].queue[0].count); } if ( Global->start[0].count[0] == 0 ) { if (dbg_on) { diag_printf(" thread %d: ^b (%d:%d:%d): unlock barrier \n-----------\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),6,Global->start[0].count[0],Global->start[0].queue[0].count); } cyg_spinlock_clear(&Global->start[0].lock );; } else { ( Global->start[0].count[0] )-- ; //cyg_spinlock_clear(&Global->start[0].count_lock[0] );; if (dbg_on) { diag_printf(" thread %d: @b (%d:%d:%d): unlock queue\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),6,Global->start[0].count[0],Global->start[0].queue[0].count); } { cyg_semaphore_post(&Global->start[0].queue[0] );}; } /*---------------------------------*/ }; #ifdef FFT_DBG cyg_spinlock_spin(&Global->idlock);; printf("Slave %d left barrier 6\n",MyNum); cyg_spinlock_clear(&Global->idlock);; #endif /* copy columns from scratch to x */ if ((test_result) || (doprint)) { for (j=MyFirst; jstart[0].lock ) );; if (dbg_on) { diag_printf(" thread %d: >b (%d:%d:%d): Enter barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),7,Global->start[0].count[0],Global->start[0].queue[0].count); } if ( Global->start[0].count[0] < (P -1) ) { Global->start[0].count[0]++; if (dbg_on) { diag_printf(" thread %d: ?b (%d:%d:%d): Wait\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),7,Global->start[0].count[0],Global->start[0].queue[0].count); } cyg_spinlock_clear(&Global->start[0].lock );; { cyg_semaphore_wait(&Global->start[0].queue[0] );}; } if (dbg_on) { diag_printf(" thread %d: start[0].count[0],Global->start[0].queue[0].count); } if ( Global->start[0].count[0] == 0 ) { if (dbg_on) { diag_printf(" thread %d: ^b (%d:%d:%d): unlock barrier \n-----------\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),7,Global->start[0].count[0],Global->start[0].queue[0].count); } cyg_spinlock_clear(&Global->start[0].lock );; } else { ( Global->start[0].count[0] )-- ; //cyg_spinlock_clear(&Global->start[0].count_lock[0] );; if (dbg_on) { diag_printf(" thread %d: @b (%d:%d:%d): unlock queue\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),7,Global->start[0].count[0],Global->start[0].queue[0].count); } { cyg_semaphore_post(&Global->start[0].queue[0] );}; } /*---------------------------------*/ }; } TwiddleOneCol(direction, n1, N, j, u, x, pad_length) int direction; int n1; int N; int j; double *u; double *x; int pad_length; { int i; double omega_r; double omega_c; double x_r; double x_c; double r1; double c1; double r2; double c2; for (i=0; i k) { SWAP(x[2*j], x[2*k]); SWAP(x[2*j+1], x[2*k+1]); } } } FFT1DOnce(direction, M, N, u, x) int direction; int M; int N; double *u; double *x; { int j; int k; int q; int L; int r; int Lstar; double *u1; double *x1; double *x2; double omega_r; double omega_c; double tau_r; double tau_c; double x_r; double x_c; Reverse(N, M, x); for (q=1; q<=M; q++) { L = 1<