summaryrefslogtreecommitdiff
path: root/cesar/ecos/packages/kernel/current/tests_smp/fft.C
diff options
context:
space:
mode:
authorsave2008-04-07 14:17:42 +0000
committersave2008-04-07 14:17:42 +0000
commit3d58a62727346b7ac1a6cb36fed1a06ed72228dd (patch)
treed7788c3cf9f76426aef0286d0202e2097f0fa0eb /cesar/ecos/packages/kernel/current/tests_smp/fft.C
parent095dca4b0a8d4924093bab424f71f588fdd84613 (diff)
Moved the complete svn base into the cesar directory.
git-svn-id: svn+ssh://pessac/svn/cesar/trunk@1769 017c9cb6-072f-447c-8318-d5b54f68fe89
Diffstat (limited to 'cesar/ecos/packages/kernel/current/tests_smp/fft.C')
-rw-r--r--cesar/ecos/packages/kernel/current/tests_smp/fft.C1057
1 files changed, 1057 insertions, 0 deletions
diff --git a/cesar/ecos/packages/kernel/current/tests_smp/fft.C b/cesar/ecos/packages/kernel/current/tests_smp/fft.C
new file mode 100644
index 0000000000..e2b0f979af
--- /dev/null
+++ b/cesar/ecos/packages/kernel/current/tests_smp/fft.C
@@ -0,0 +1,1057 @@
+/*************************************************************************/
+/* */
+/* Copyright (c) 1994 Stanford University */
+/* */
+/* All rights reserved. */
+/* */
+/* Permission is given to use, copy, and modify this software for any */
+/* non-commercial purpose as long as this copyright notice is not */
+/* removed. All other uses, including redistribution in whole or in */
+/* part, are forbidden without prior written permission. */
+/* */
+/* This software is provided with absolutely no warranty and no */
+/* support. */
+/* */
+/*************************************************************************/
+
+/*************************************************************************/
+/* */
+/* Perform 1D fast Fourier transform using six-step FFT method */
+/* */
+/* 1) Performs staggered, blocked transposes for cache-line reuse */
+/* 2) Roots of unity rearranged and distributed for only local */
+/* accesses during application of roots of unity */
+/* 3) Small set of roots of unity elements replicated locally for */
+/* 1D FFTs (less than root N elements replicated at each node) */
+/* 4) Matrix data structures are padded to reduce cache mapping */
+/* conflicts */
+/* */
+/* Command line options: */
+/* */
+/* -mM : M = even integer; 2**M total complex data points transformed. */
+/* -pP : P = number of processors; Must be a power of 2. */
+/* -nN : N = number of cache lines. */
+/* -lL : L = Log base 2 of cache line length in bytes. */
+/* -s : Print individual processor timing statistics. */
+/* -t : Perform FFT and inverse FFT. Test output by comparing the */
+/* integral of the original data to the integral of the data */
+/* that results from performing the FFT and inverse FFT. */
+/* -o : Print out complex data points. */
+/* -h : Print out command line options. */
+/* */
+/* Note: This version works under both the FORK and SPROC models */
+/* */
+/*************************************************************************/
+
+#include <stdio.h>
+#include <math.h>
+#include <tests_smp/getopt.h>
+
+#define MAXRAND 2147483647
+#define PAGE_SIZE 4096
+#define NUM_CACHE_LINES 65536
+#define LOG2_LINE_SIZE 4
+#define PI 3.1416
+#define DEFAULT_M 10
+#define DEFAULT_P 1
+
+#include <tests_smp/fft_arg.c>
+
+MAIN_ENV
+
+#define SWAP(a,b) {double tmp; tmp=a; a=b; b=tmp;}
+
+struct GlobalMemory {
+ int id;
+ LOCKDEC(idlock)
+ BARDEC(start)
+ int *transtimes;
+ int *totaltimes;
+ int starttime;
+ int finishtime;
+ int initdonetime;
+} *Global;
+
+int dbg_on = 0;
+
+int P = DEFAULT_P;
+int M = DEFAULT_M;
+int N; /* N = 2^M */
+int rootN; /* rootN = N^1/2 */
+double *x; /* x is the original time-domain data */
+double *trans; /* trans is used as scratch space */
+double *umain; /* umain is roots of unity for 1D FFTs */
+double *umain2; /* umain2 is entire roots of unity matrix */
+int test_result = 0;
+int doprint = 0;
+int dostats = 0;
+int transtime = 0;
+int transtime2 = 0;
+int avgtranstime = 0;
+int avgcomptime = 0;
+unsigned int transstart = 0;
+unsigned int transend = 0;
+int maxtotal=0;
+int mintotal=0;
+double maxfrac=0;
+double minfrac=0;
+double avgfractime=0;
+int orig_num_lines = NUM_CACHE_LINES; /* number of cache lines */
+int num_cache_lines = NUM_CACHE_LINES; /* number of cache lines */
+int log2_line_size = LOG2_LINE_SIZE;
+int line_size;
+int rowsperproc;
+double ck1;
+double ck3; /* checksums for testing answer */
+int pad_length;
+
+void SlaveStart();
+double TouchArray(double *,double *,double *,double *,int,int,int,int);
+void FFT1D(int,int,int,double *,double *,double *,double *,int,int *,int,
+ int,int,int,int,int,int,struct GlobalMemory *);
+double CheckSum();
+double drand48();
+int log_2(int);
+void printerr(char *);
+volatile int thread_array[4] = {0, 0, 0, 0};
+
+int smp_cyg_test_main(int argc, char **argv)
+{
+ int i;
+ int j;
+ int c;
+ extern char *optarg;
+ int m1;
+ int factor;
+ int pages;
+ unsigned int start;
+ //memset (thread_array,0,sizeof(thread_array));
+
+ CLOCK(start);
+
+
+// extern Cyg_Mempool_dlmalloc *cygmem_memalloc_heaps[ 2 ];
+// cygmem_memalloc_heaps[0].
+
+
+ // printf("<cpu%d> thread %d Arguments (%d): ",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),argc);
+// for (i = 0;i < argc;i++){
+// printf("%s ",argv[i]);
+// }
+// printf("\n");
+
+ while ((c = getopt(argc, argv, "p:m:n:l:stoh")) != -1) {
+ switch(c) {
+ case 'p': P = atoi(optarg);
+ P = HAL_SMP_CPU_MAX;
+ if (P < 1) {
+ printerr("P must be >= 1\n");
+ exit(-1);
+ }
+ if (log_2(P) == -1) {
+ printerr("P must be a power of 2\n");
+ exit(-1);
+ }
+ break;
+ case 'm': M = atoi(optarg);
+ m1 = M/2;
+ if (2*m1 != M) {
+ printerr("M must be even\n");
+ exit(-1);
+ }
+ break;
+ case 'n': num_cache_lines = atoi(optarg);
+ orig_num_lines = num_cache_lines;
+ if (num_cache_lines < 1) {
+ printerr("Number of cache lines must be >= 1\n");
+ exit(-1);
+ }
+ break;
+ case 'l': log2_line_size = atoi(optarg);
+ if (log2_line_size < 0) {
+ printerr("Log base 2 of cache line length in bytes must be >= 0\n");
+ exit(-1);
+ }
+ break;
+ case 's': dostats = !dostats;
+ break;
+ case 't': test_result = !test_result;
+ break;
+ case 'o': doprint = !doprint;
+ break;
+ case 'h': printf("Usage: FFT <options>\n\n");
+ printf("options:\n");
+ printf(" -mM : M = even integer; 2**M total complex data points transformed.\n");
+ printf(" -pP : P = number of processors; Must be a power of 2.\n");
+ printf(" -nN : N = number of cache lines.\n");
+ printf(" -lL : L = Log base 2 of cache line length in bytes.\n");
+ printf(" -s : Print individual processor timing statistics.\n");
+ printf(" -t : Perform FFT and inverse FFT. Test output by comparing the\n");
+ printf(" integral of the original data to the integral of the data that\n");
+ printf(" results from performing the FFT and inverse FFT.\n");
+ printf(" -o : Print out complex data points.\n");
+ printf(" -h : Print out command line options.\n\n");
+ printf("Default: FFT -m%1d -p%1d -n%1d -l%1d\n",
+ DEFAULT_M,DEFAULT_P,NUM_CACHE_LINES,LOG2_LINE_SIZE);
+ exit(0);
+ break;
+ }
+ }
+
+ if (P > 64) {
+ printf("Maximal 64 processes\n");
+ exit(0);
+ }
+
+ MAIN_INITENV(,80000000);
+
+ N = 1<<M;
+ rootN = 1<<(M/2);
+ rowsperproc = rootN/P;
+ if (rowsperproc == 0) {
+ printerr("Matrix not large enough. 2**(M/2) must be >= P\n");
+ exit(-1);
+ }
+
+ line_size = 1 << log2_line_size;
+ if (line_size < 2*sizeof(double)) {
+ printf("WARNING: Each element is a complex double (%d bytes)\n",2*sizeof(double));
+ printf(" => Less than one element per cache line\n");
+ printf(" Computing transpose blocking factor\n");
+ factor = (2*sizeof(double)) / line_size;
+ num_cache_lines = orig_num_lines / factor;
+ }
+ if (line_size <= 2*sizeof(double)) {
+ pad_length = 1;
+ } else {
+ pad_length = line_size / (2*sizeof(double));
+ }
+
+ if (rowsperproc * rootN * 2 * sizeof(double) >= PAGE_SIZE) {
+ pages = (2 * pad_length * sizeof(double) * rowsperproc) / PAGE_SIZE;
+ if (pages * PAGE_SIZE != 2 * pad_length * sizeof(double) * rowsperproc) {
+ pages ++;
+ }
+ pad_length = (pages * PAGE_SIZE) / (2 * sizeof(double) * rowsperproc);
+ } else {
+ pad_length = (PAGE_SIZE - (rowsperproc * rootN * 2 * sizeof(double))) /
+
+ (2 * sizeof(double) * rowsperproc);
+ if (pad_length * (2 * sizeof(double) * rowsperproc) !=
+ (PAGE_SIZE - (rowsperproc * rootN * 2 * sizeof(double)))) {
+ printerr("Padding algorithm unsuccessful\n");
+ exit(-1);
+ }
+ }
+
+ Global = (struct GlobalMemory *) G_MALLOC(sizeof(struct GlobalMemory));
+ if (Global == NULL) {
+ printf("Could not malloc memory for Global (%d)\n",sizeof(struct GlobalMemory));
+ exit(-1);
+ }
+
+
+ x = (double *) G_MALLOC(2*(N+rootN*pad_length)*sizeof(double)+PAGE_SIZE);
+ trans = (double *) G_MALLOC(2*(N+rootN*pad_length)*sizeof(double)+PAGE_SIZE);
+ umain = (double *) G_MALLOC(2*rootN*sizeof(double));
+ umain2 = (double *) G_MALLOC(2*(N+rootN*pad_length)*sizeof(double)+PAGE_SIZE);
+
+
+ Global->transtimes = (int *) G_MALLOC(P*sizeof(int));
+ Global->totaltimes = (int *) G_MALLOC(P*sizeof(int));
+ if (x == NULL) {
+ printerr("Could not malloc memory for x\n");
+ exit(-1);
+ } else if (trans == NULL) {
+ printerr("Could not malloc memory for trans\n");
+ exit(-1);
+ } else if (umain == NULL) {
+ printerr("Could not malloc memory for umain\n");
+ exit(-1);
+ } else if (umain2 == NULL) {
+ printerr("Could not malloc memory for umain2\n");
+ exit(-1);
+ }
+
+ x = (double *) (((unsigned) x) + PAGE_SIZE - ((unsigned) x) % PAGE_SIZE);
+ trans = (double *) (((unsigned) trans) + PAGE_SIZE - ((unsigned) trans) % PAGE_SIZE);
+ umain2 = (double *) (((unsigned) umain2) + PAGE_SIZE - ((unsigned) umain2) % PAGE_SIZE);
+
+/* In order to optimize data distribution, the data structures x, trans,
+ and umain2 have been aligned so that each begins on a page boundary.
+ This ensures that the amount of padding calculated by the program is
+ such that each processor's partition ends on a page boundary, thus
+ ensuring that all data from these structures that are needed by a
+ processor can be allocated to its local memory */
+
+/* POSSIBLE ENHANCEMENT: Here is where one might distribute the x,
+ trans, and umain2 data structures across physically distributed
+ memories as desired.
+
+ One way to place data is as follows:
+
+ double *base;
+ int i;
+
+ i = ((N/P)+(rootN/P)*pad_length)*2;
+ base = &(x[0]);
+ for (j=0;j<P;j++) {
+ Place all addresses x such that (base <= x < base+i) on node j
+ base += i;
+ }
+
+ The trans and umain2 data structures can be placed in a similar manner.
+
+ */
+
+ printf("\n");
+ printf("FFT with Blocking Transpose\n");
+ printf(" %d Complex Doubles\n",N);
+ printf(" %d Threads\n",P);
+ if (num_cache_lines != orig_num_lines) {
+ printf(" %d Cache lines\n",orig_num_lines);
+ printf(" %d Cache lines for blocking transpose\n",num_cache_lines);
+ } else {
+ printf(" %d Cache lines\n",num_cache_lines);
+ }
+ printf(" %d Byte line size\n",(1 << log2_line_size));
+ printf(" %d Bytes per page\n",PAGE_SIZE);
+ printf("\n");
+
+ BARINIT(Global->start);
+ LOCKINIT(Global->idlock);
+ Global->id = 0;
+ InitX(N, x); /* place random values in x */
+
+ if (test_result) {
+ ck1 = CheckSum(N, x);
+ }
+ if (doprint) {
+ printf("Original data values:\n");
+ PrintArray(N, x);
+ }
+
+ InitU(N,umain); /* initialize u arrays*/
+ InitU2(N,umain2,rootN);
+
+ /* fire off P processes */
+ for (i=1; i<P; i++) {
+ CREATE(SlaveStart,thread_array[i]);
+ }
+
+ SlaveStart();
+
+ for (i=1; i<P; i++) {
+ while(!thread_array[i]) {};
+ }
+
+ if (doprint) {
+ if (test_result) {
+ printf("Data values after inverse FFT:\n");
+ } else {
+ printf("Data values after FFT:\n");
+ }
+ PrintArray(N, x);
+ }
+
+ transtime = Global->transtimes[0];
+ printf("\n");
+ printf(" PROCESS STATISTICS\n");
+ printf(" Computation Transpose Transpose\n");
+ printf(" Proc Time Time Fraction\n");
+ printf(" 0 %10d %10d %8.5f\n",
+ Global->totaltimes[0],Global->transtimes[0],
+ ((double)Global->transtimes[0])/Global->totaltimes[0]);
+ if (dostats) {
+ transtime2 = Global->transtimes[0];
+ avgtranstime = Global->transtimes[0];
+ avgcomptime = Global->totaltimes[0];
+ maxtotal = Global->totaltimes[0];
+ mintotal = Global->totaltimes[0];
+ maxfrac = ((double)Global->transtimes[0])/Global->totaltimes[0];
+ minfrac = ((double)Global->transtimes[0])/Global->totaltimes[0];
+ avgfractime = ((double)Global->transtimes[0])/Global->totaltimes[0];
+ for (i=1;i<P;i++) {
+ if (Global->transtimes[i] > transtime) {
+ transtime = Global->transtimes[i];
+ }
+ if (Global->transtimes[i] < transtime2) {
+ transtime2 = Global->transtimes[i];
+ }
+ if (Global->totaltimes[i] > maxtotal) {
+ maxtotal = Global->totaltimes[i];
+ }
+ if (Global->totaltimes[i] < mintotal) {
+ mintotal = Global->totaltimes[i];
+ }
+ if (((double)Global->transtimes[i])/Global->totaltimes[i] > maxfrac) {
+ maxfrac = ((double)Global->transtimes[i])/Global->totaltimes[i];
+ }
+ if (((double)Global->transtimes[i])/Global->totaltimes[i] < minfrac) {
+ minfrac = ((double)Global->transtimes[i])/Global->totaltimes[i];
+ }
+ printf(" %3d %10d %10d %f\n",
+ i,Global->totaltimes[i],Global->transtimes[i],
+ ((double)Global->transtimes[i])/Global->totaltimes[i]);
+ avgtranstime += Global->transtimes[i];
+ avgcomptime += Global->totaltimes[i];
+ avgfractime += ((double)Global->transtimes[i])/Global->totaltimes[i];
+ }
+ printf(" Avg %10f %10f %f\n",
+ ((double) avgcomptime)/P,((double) avgtranstime)/P,avgfractime/P);
+ printf(" Max %10d %10d %f\n",
+ maxtotal,transtime,maxfrac);
+ printf(" Min %10d %10d %f\n",
+ mintotal,transtime2,minfrac);
+ }
+ Global->starttime = start;
+ printf("\n");
+ printf(" TIMING INFORMATION\n");
+ printf("Start time : %16d\n",
+ Global->starttime);
+ printf("Initialization finish time : %16d\n",
+ Global->initdonetime);
+ printf("Overall finish time : %16d\n",
+ Global->finishtime);
+ printf("Total time with initialization : %16d\n",
+ Global->finishtime-Global->starttime);
+ printf("Total time without initialization : %16d\n",
+ Global->finishtime-Global->initdonetime);
+ printf("Overall transpose time : %16d\n",
+ transtime);
+ printf("Overall transpose fraction : %f\n",
+ ((double) transtime)/(Global->finishtime-Global->initdonetime));
+ printf("\n");
+
+ if (test_result) {
+ ck3 = CheckSum(N, x);
+ printf(" INVERSE FFT TEST RESULTS\n");
+ printf("Checksum difference is %f (%f, %f)\n",
+ ck1-ck3, ck1, ck3);
+ if (fabs(ck1-ck3) < 0.001) {
+ printf("TEST PASSED\n");
+ } else {
+ printf("TEST FAILED\n");
+ }
+ }
+
+ MAIN_END;
+}
+
+
+void SlaveStart()
+{
+ int i;
+ int j;
+ int MyNum;
+ double error;
+ double *upriv;
+ int initdone;
+ int finish;
+ int l_transtime=0;
+ int MyFirst;
+ int MyLast;
+
+ LOCK(Global->idlock);
+ MyNum = Global->id;
+ Global->id++;
+#ifdef FFT_DBG
+ printf("Slave %d started\n",MyNum);
+#endif
+ UNLOCK(Global->idlock);
+
+
+/* POSSIBLE ENHANCEMENT: Here is where one might pin processes to
+ processors to avoid migration */
+
+ BARRIER(Global->start, P, 1);
+#ifdef FFT_DBG
+ LOCK(Global->idlock); printf("Slave %d left barrier 1\n",MyNum); UNLOCK(Global->idlock);
+#endif
+
+ upriv = (double *) malloc(2*(rootN-1)*sizeof(double));
+ if (upriv == NULL) {
+ printf("Proc %d could not malloc memory for upriv\n",MyNum);
+ exit(-1);
+ }
+ for (i=0;i<2*(rootN-1);i++) {
+ upriv[i] = umain[i];
+ }
+
+ MyFirst = rootN*MyNum/P;
+ MyLast = rootN*(MyNum+1)/P;
+
+ TouchArray(x, trans, umain2, upriv, N, MyNum, MyFirst, MyLast);
+
+ BARRIER(Global->start, P, 2);
+#ifdef FFT_DBG
+ LOCK(Global->idlock); printf("Slave %d left barrier 2\n",MyNum); UNLOCK(Global->idlock);
+#endif
+
+/* POSSIBLE ENHANCEMENT: Here is where one might reset the
+ statistics that one is measuring about the parallel execution */
+
+ if ((MyNum == 0) || (dostats)) {
+ CLOCK(initdone);
+ }
+
+ /* perform forward FFT */
+ FFT1D(1, M, N, x, trans, upriv, umain2, MyNum, &l_transtime, MyFirst,
+ MyLast, pad_length, P, test_result, doprint, dostats, Global);
+
+ /* perform backward FFT */
+ if (test_result) {
+ FFT1D(-1, M, N, x, trans, upriv, umain2, MyNum, &l_transtime, MyFirst,
+ MyLast, pad_length, P, test_result, doprint, dostats, Global);
+ }
+
+ if ((MyNum == 0) || (dostats)) {
+ CLOCK(finish);
+ Global->transtimes[MyNum] = l_transtime;
+ Global->totaltimes[MyNum] = finish-initdone;
+ }
+ if (MyNum == 0) {
+ Global->finishtime = finish;
+ Global->initdonetime = initdone;
+ }
+
+ thread_array[MyNum] = 1;
+}
+
+
+double TouchArray(x, scratch, u, upriv, N, MyNum, MyFirst, MyLast)
+
+double *x;
+double *scratch;
+double *u;
+double *upriv;
+int N;
+int MyNum;
+int MyFirst;
+int MyLast;
+
+{
+ int i,j,k;
+ double tot = 0.0;
+
+ /* touch my data */
+ for (j=0;j<2*(rootN-1);j++) {
+ tot += upriv[j];
+ }
+ for (j=MyFirst; j<MyLast; j++) {
+ k = j * (rootN + pad_length);
+ for (i=0;i<rootN;i++) {
+ tot += x[2*(k+i)] + x[2*(k+i)+1] +
+ scratch[2*(k+i)] + scratch[2*(k+i)+1] +
+ u[2*(k+i)] + u[2*(k+i)+1];
+ }
+ }
+ return tot;
+}
+
+
+double CheckSum(N, x)
+
+int N;
+double *x;
+
+{
+ int i,j,k;
+ double cks;
+
+ cks = 0.0;
+ for (j=0; j<rootN; j++) {
+ k = j * (rootN + pad_length);
+ for (i=0;i<rootN;i++) {
+ cks += x[2*(k+i)] + x[2*(k+i)+1];
+ }
+ }
+
+ return(cks);
+}
+
+
+InitX(N, x)
+
+int N;
+double *x;
+
+{
+ int i,j,k;
+
+ srand(0);
+ for (j=0; j<rootN; j++) {
+ k = j * (rootN + pad_length);
+ for (i=0;i<rootN;i++) {
+ x[2*(k+i)] = rand()/MAXRAND;
+ x[2*(k+i)+1] = rand()/MAXRAND;
+ }
+ }
+}
+
+
+InitU(N, u)
+
+int N;
+double *u;
+
+{
+ int q;
+ int j;
+ int base;
+ int n1;
+
+ for (q=0; 1<<q<N; q++) {
+ n1 = 1<<q;
+ base = n1-1;
+ for (j=0; j<n1; j++) {
+ if (base+j > rootN-1) {
+ return;
+ }
+ u[2*(base+j)] = cos(2.0*PI*j/(2*n1));
+ u[2*(base+j)+1] = -sin(2.0*PI*j/(2*n1));
+ }
+ }
+}
+
+
+InitU2(N, u, n1)
+
+int N;
+double *u;
+int n1;
+
+{
+ int i,j,k;
+ int base;
+
+ for (j=0; j<n1; j++) {
+ k = j*(rootN+pad_length);
+ for (i=0; i<n1; i++) {
+ u[2*(k+i)] = cos(2.0*PI*i*j/(N));
+ u[2*(k+i)+1] = -sin(2.0*PI*i*j/(N));
+ }
+ }
+}
+
+
+BitReverse(M, k)
+
+int M;
+int k;
+
+{
+ int i;
+ int j;
+ int tmp;
+
+ j = 0;
+ tmp = k;
+ for (i=0; i<M; i++) {
+ j = 2*j + (tmp&0x1);
+ tmp = tmp>>1;
+ }
+ return(j);
+}
+
+
+void FFT1D(direction, M, N, x, scratch, upriv, umain2, MyNum, l_transtime,
+ MyFirst, MyLast, pad_length, P, test_result, doprint, dostats,
+ Global)
+
+int direction;
+int M;
+int N;
+int *l_transtime;
+double *x;
+double *upriv;
+double *scratch;
+double *umain2;
+int MyFirst;
+int MyLast;
+int pad_length;
+int P;
+int test_result;
+int doprint;
+int dostats;
+struct GlobalMemory *Global;
+
+{
+ int i;
+ int j;
+ int m1;
+ int n1;
+ int flag = 0;
+ unsigned int clocktime1;
+ unsigned int clocktime2;
+
+ m1 = M/2;
+ n1 = 1<<m1;
+
+ BARRIER(Global->start, P, 3);
+#ifdef FFT_DBG
+ LOCK(Global->idlock); printf("Slave %d left barrier 3\n",MyNum); UNLOCK(Global->idlock);
+#endif
+
+ if ((MyNum == 0) || (dostats)) {
+ CLOCK(clocktime1);
+ }
+
+ /* transpose from x into scratch */
+ Transpose(n1, x, scratch, MyNum, MyFirst, MyLast, pad_length);
+
+ if ((MyNum == 0) || (dostats)) {
+ CLOCK(clocktime2);
+ *l_transtime += (clocktime2-clocktime1);
+ }
+
+ /* do n1 1D FFTs on columns */
+ for (j=MyFirst; j<MyLast; j++) {
+ FFT1DOnce(direction, m1, n1, upriv, &scratch[2*j*(n1+pad_length)]);
+ TwiddleOneCol(direction, n1, N, j, umain2, &scratch[2*j*(n1+pad_length)],
+ pad_length);
+ }
+
+ BARRIER(Global->start, P, 4);
+#ifdef FFT_DBG
+ LOCK(Global->idlock); printf("Slave %d left barrier 4\n",MyNum); UNLOCK(Global->idlock);
+#endif
+
+ if ((MyNum == 0) || (dostats)) {
+ CLOCK(clocktime1);
+ }
+ /* transpose */
+ Transpose(n1, scratch, x, MyNum, MyFirst, MyLast, pad_length);
+
+ if ((MyNum == 0) || (dostats)) {
+ CLOCK(clocktime2);
+ *l_transtime += (clocktime2-clocktime1);
+ }
+
+ /* do n1 1D FFTs on columns again */
+ for (j=MyFirst; j<MyLast; j++) {
+ FFT1DOnce(direction, m1, n1, upriv, &x[2*j*(n1+pad_length)]);
+ if (direction == -1)
+ Scale(n1, N, &x[2*j*(n1+pad_length)]);
+ }
+
+ BARRIER(Global->start, P, 5);
+#ifdef FFT_DBG
+ LOCK(Global->idlock); printf("Slave %d left barrier 5\n",MyNum); UNLOCK(Global->idlock);
+#endif
+
+ if ((MyNum == 0) || (dostats)) {
+ CLOCK(clocktime1);
+ }
+
+ /* transpose back */
+ Transpose(n1, x, scratch, MyNum, MyFirst, MyLast, pad_length);
+
+ if ((MyNum == 0) || (dostats)) {
+ CLOCK(clocktime2);
+ *l_transtime += (clocktime2-clocktime1);
+ }
+
+ BARRIER(Global->start, P, 6);
+#ifdef FFT_DBG
+ LOCK(Global->idlock); printf("Slave %d left barrier 6\n",MyNum); UNLOCK(Global->idlock);
+#endif
+
+ /* copy columns from scratch to x */
+ if ((test_result) || (doprint)) {
+ for (j=MyFirst; j<MyLast; j++) {
+ CopyColumn(n1, &scratch[2*j*(n1+pad_length)], &x[2*j*(n1+pad_length)]);
+ }
+ }
+
+ BARRIER(Global->start, P, 7);
+}
+
+
+TwiddleOneCol(direction, n1, N, j, u, x, pad_length)
+
+int direction;
+int n1;
+int N;
+int j;
+double *u;
+double *x;
+int pad_length;
+
+{
+ int i;
+ double omega_r;
+ double omega_c;
+ double x_r;
+ double x_c;
+ double r1;
+ double c1;
+ double r2;
+ double c2;
+
+ for (i=0; i<n1; i++) {
+ omega_r = u[2*(j*(n1+pad_length)+i)];
+ omega_c = direction*u[2*(j*(n1+pad_length)+i)+1];
+ x_r = x[2*i];
+ x_c = x[2*i+1];
+ x[2*i] = omega_r*x_r - omega_c*x_c;
+ x[2*i+1] = omega_r*x_c + omega_c*x_r;
+ }
+}
+
+
+Scale(n1, N, x)
+
+int n1;
+int N;
+double *x;
+
+{
+ int i;
+
+ for (i=0; i<n1; i++) {
+ x[2*i] /= N;
+ x[2*i+1] /= N;
+ }
+}
+
+
+Transpose(n1, src, dest, MyNum, MyFirst, MyLast, pad_length)
+
+int n1;
+double *src;
+double *dest;
+int MyNum;
+int MyFirst;
+int MyLast;
+int pad_length;
+
+{
+ int i;
+ int j;
+ int k;
+ int l;
+ int m;
+ int blksize;
+ int numblks;
+ int firstfirst;
+ int h_off;
+ int v_off;
+ int v;
+ int h;
+ int n1p;
+ int row_count;
+
+ blksize = MyLast-MyFirst;
+ numblks = (2*blksize)/num_cache_lines;
+ if (numblks * num_cache_lines != 2 * blksize) {
+ numblks ++;
+ }
+ blksize = blksize / numblks;
+ firstfirst = MyFirst;
+ row_count = n1/P;
+ n1p = n1+pad_length;
+ for (l=MyNum+1;l<P;l++) {
+ v_off = l*row_count;
+ for (k=0; k<numblks; k++) {
+ h_off = firstfirst;
+ for (m=0; m<numblks; m++) {
+ for (i=0; i<blksize; i++) {
+ v = v_off + i;
+ for (j=0; j<blksize; j++) {
+ h = h_off + j;
+ dest[2*(h*n1p+v)] = src[2*(v*n1p+h)];
+ dest[2*(h*n1p+v)+1] = src[2*(v*n1p+h)+1];
+ }
+ }
+ h_off += blksize;
+ }
+ v_off+=blksize;
+ }
+ }
+
+ for (l=0;l<MyNum;l++) {
+ v_off = l*row_count;
+ for (k=0; k<numblks; k++) {
+ h_off = firstfirst;
+ for (m=0; m<numblks; m++) {
+ for (i=0; i<blksize; i++) {
+ v = v_off + i;
+ for (j=0; j<blksize; j++) {
+ h = h_off + j;
+ dest[2*(h*n1p+v)] = src[2*(v*n1p+h)];
+ dest[2*(h*n1p+v)+1] = src[2*(v*n1p+h)+1];
+ }
+ }
+ h_off += blksize;
+ }
+ v_off+=blksize;
+ }
+ }
+
+ v_off = MyNum*row_count;
+ for (k=0; k<numblks; k++) {
+ h_off = firstfirst;
+ for (m=0; m<numblks; m++) {
+ for (i=0; i<blksize; i++) {
+ v = v_off + i;
+ for (j=0; j<blksize; j++) {
+ h = h_off + j;
+ dest[2*(h*n1p+v)] = src[2*(v*n1p+h)];
+ dest[2*(h*n1p+v)+1] = src[2*(v*n1p+h)+1];
+ }
+ }
+ h_off += blksize;
+ }
+ v_off+=blksize;
+ }
+}
+
+
+CopyColumn(n1, src, dest)
+
+int n1;
+double *src;
+double *dest;
+
+{
+ int i;
+
+ for (i=0; i<n1; i++) {
+ dest[2*i] = src[2*i];
+ dest[2*i+1] = src[2*i+1];
+ }
+}
+
+
+Reverse(N, M, x)
+
+int N;
+int M;
+double *x;
+
+{
+ int j, k;
+
+ for (k=0; k<N; k++) {
+ j = BitReverse(M, k);
+ if (j > k) {
+ SWAP(x[2*j], x[2*k]);
+ SWAP(x[2*j+1], x[2*k+1]);
+ }
+ }
+}
+
+
+FFT1DOnce(direction, M, N, u, x)
+
+int direction;
+int M;
+int N;
+double *u;
+double *x;
+
+{
+ int j;
+ int k;
+ int q;
+ int L;
+ int r;
+ int Lstar;
+ double *u1;
+ double *x1;
+ double *x2;
+ double omega_r;
+ double omega_c;
+ double tau_r;
+ double tau_c;
+ double x_r;
+ double x_c;
+
+ Reverse(N, M, x);
+
+ for (q=1; q<=M; q++) {
+ L = 1<<q; r = N/L; Lstar = L/2;
+ u1 = &u[2*(Lstar-1)];
+ for (k=0; k<r; k++) {
+ x1 = &x[2*(k*L)];
+ x2 = &x[2*(k*L+Lstar)];
+ for (j=0; j<Lstar; j++) {
+ omega_r = u1[2*j];
+ omega_c = direction*u1[2*j+1];
+ x_r = x2[2*j];
+ x_c = x2[2*j+1];
+ tau_r = omega_r*x_r - omega_c*x_c;
+ tau_c = omega_r*x_c + omega_c*x_r;
+ x_r = x1[2*j];
+ x_c = x1[2*j+1];
+ x2[2*j] = x_r - tau_r;
+ x2[2*j+1] = x_c - tau_c;
+ x1[2*j] = x_r + tau_r;
+ x1[2*j+1] = x_c + tau_c;
+ }
+ }
+ }
+}
+
+
+PrintArray(N, x)
+
+int N;
+double *x;
+
+{
+ int i, j, k;
+
+ for (i=0; i<rootN; i++) {
+ k = i*(rootN+pad_length);
+ for (j=0; j<rootN; j++) {
+ printf(" %4.2f %4.2f", x[2*(k+j)], x[2*(k+j)+1]);
+ if (i*rootN+j != N-1) {
+ printf(",");
+ }
+ if ((i*rootN+j+1) % 8 == 0) {
+ printf("\n");
+ }
+ }
+ }
+ printf("\n");
+ printf("\n");
+}
+
+
+void printerr(s)
+
+char *s;
+
+{
+ printf("ERROR: %s\n",s);
+}
+
+
+int log_2(number)
+
+int number;
+
+{
+ int cumulative = 1;
+ int out = 0;
+ int done = 0;
+
+ while ((cumulative < number) && (!done) && (out < 50)) {
+ if (cumulative == number) {
+ done = 1;
+ } else {
+ cumulative = cumulative * 2;
+ out ++;
+ }
+ }
+
+ if (cumulative == number) {
+ return(out);
+ } else {
+ return(-1);
+ }
+}
+
+#include <tests_smp/getopt.c>