summaryrefslogtreecommitdiff
path: root/cesar/ecos/packages/kernel/current/tests_smp
diff options
context:
space:
mode:
Diffstat (limited to 'cesar/ecos/packages/kernel/current/tests_smp')
-rw-r--r--cesar/ecos/packages/kernel/current/tests_smp/README.SPLASH2349
-rw-r--r--cesar/ecos/packages/kernel/current/tests_smp/README.fft62
-rw-r--r--cesar/ecos/packages/kernel/current/tests_smp/README.lu55
-rw-r--r--cesar/ecos/packages/kernel/current/tests_smp/README.radix44
-rw-r--r--cesar/ecos/packages/kernel/current/tests_smp/SPLASH2.POSTING124
-rw-r--r--cesar/ecos/packages/kernel/current/tests_smp/c.m4.ecos290
-rw-r--r--cesar/ecos/packages/kernel/current/tests_smp/fft.C1057
-rw-r--r--cesar/ecos/packages/kernel/current/tests_smp/fft.c1460
-rw-r--r--cesar/ecos/packages/kernel/current/tests_smp/fft_arg.c18
-rw-r--r--cesar/ecos/packages/kernel/current/tests_smp/getopt.c654
-rw-r--r--cesar/ecos/packages/kernel/current/tests_smp/getopt.h129
-rw-r--r--cesar/ecos/packages/kernel/current/tests_smp/lu.C788
-rw-r--r--cesar/ecos/packages/kernel/current/tests_smp/lu.c1118
-rw-r--r--cesar/ecos/packages/kernel/current/tests_smp/lu_arg.c12
-rw-r--r--cesar/ecos/packages/kernel/current/tests_smp/makefile13
-rw-r--r--cesar/ecos/packages/kernel/current/tests_smp/radix.C909
-rw-r--r--cesar/ecos/packages/kernel/current/tests_smp/radix.c1346
-rw-r--r--cesar/ecos/packages/kernel/current/tests_smp/radix_arg.c12
18 files changed, 8440 insertions, 0 deletions
diff --git a/cesar/ecos/packages/kernel/current/tests_smp/README.SPLASH2 b/cesar/ecos/packages/kernel/current/tests_smp/README.SPLASH2
new file mode 100644
index 0000000000..7fee72c9f0
--- /dev/null
+++ b/cesar/ecos/packages/kernel/current/tests_smp/README.SPLASH2
@@ -0,0 +1,349 @@
+Date: Oct 19, 1994
+
+This is the directory for the second release of the Stanford Parallel
+Applications for Shared-Memory (SPLASH-2) programs. For further
+information contact splash@mojave.stanford.edu.
+
+PLEASE NOTE: Due to our limited resources, we will be unable to spend
+much time answering questions about the applications.
+
+splash.tar contains the tared version of all the files. Grabbing this
+file will get you everything you need. We also keep the files
+individually untared for partial retrieval. The splash.tar file is not
+compressed, but the large files in it are. We attempted to compress the
+splash.tar file to reduce the file size further, but this resulted in
+a negative compression ratio.
+
+
+DIFFERENCES BETWEEN SPLASH AND SPLASH-2:
+----------------------------------------
+
+The SPLASH-2 suite contains two types of codes: full applications and
+kernels. Each of the codes utilizes the Argonne National Laboratories
+(ANL) parmacs macros for parallel constructs. Unlike the codes in the
+original SPLASH release, each of the codes assumes the use of a
+"lightweight threads" model (which we hereafter refer to as the "threads"
+model) in which child processes share the same virtual address space as
+their parent process. In order for the codes to function correctly,
+the CREATE macro should call the proper Unix system routine (e.g. "sproc"
+in the Silicon Graphics IRIX operating system) instead of the "fork"
+routine that was used for SPLASH. The difference is that processes
+created with the Unix fork command receive their own private copies of
+all global variables. In the threads model, child processes share the
+same virtual address space, and hence all global data. Some of the
+codes function correctly when the Unix "fork" command is used for child
+process creation as well. Comments in the code header denote those
+applications which function correctly with "fork."
+
+
+MACROS:
+-------
+
+Macros for the previous release of the SPLASH application suite can be
+obtained via anonymous ftp to www-flash.stanford.edu. The macros are
+contained in the pub/old_splash/splash/macros subdirectory. HOWEVER,
+THE MACRO FILES MUST BE MODIFIED IN ORDER TO BE USED WITH SPLASH-2 CODES.
+The CREATE macros must be changed so that they call the proper process
+creation routine (See DIFFERENCES section above) instead of "fork."
+
+In this macros subdirectory, macros and sample makefiles are provided
+for three machines:
+
+Encore Multimax (CMU Mach 2.5: C and Fortran)
+SGI 4D/240 (IRIX System V Release 3.3: C only)
+Alliant FX/8 (Alliant Rev. 5.0: C and Fortran)
+
+These macros work for us with the above operating systems. Unfortunately,
+our limited resources prevent us from supporting them in any way or
+even fielding questions about them. If they don't work for you, please
+contact Argonne National Labs for a version that will. An e-mail address
+to try might be monitor-users-request@mcs.anl.gov. An excerpt from
+a message, received from Argonne, concerning obtaining the macros follows:
+
+"The parmacs package is in the public domain. Approximately 15 people at
+Argonne (or associated with Argonne or students) have worked on the
+parmacs package at one time or another. The parmacs package is
+implemented via macros using the M4 macropreprocessor (standard on most
+Unix systems). Current distribution of the software is somewhat ad hoc.
+Most C versions can be obtained from netlib (send electronic mail to
+netlib@ornl.gov with the message send index from parmacs). Fortran
+versions have been emailed directly or sent on tape. The primary
+documentation for the parmacs package is the book ``Portable Programs for
+Parallel Processors'' by Lusk, et al, Holt, Rinehart, and Winston 1987."
+
+The makefiles provided in the individual program directories specify
+a null macro set that will turn the parallel programs into sequential
+ones. Note that we do not have a null macro set for FORTRAN.
+
+
+CODE ENHANCEMENTS:
+------------------
+
+All of the codes are designed for shared address space multiprocessors
+with physically distributed main memory. For these types of machines,
+process migration and poor data distribution can decrease performance
+to suboptimal levels. In the applications, comments indicating potential
+enhancements can be found which will improve performance. Each potential
+enhancement is denoted by a comment beginning with "POSSIBLE ENHANCEMENT".
+The potential enhancements which we identify are:
+
+ (1) Data Distribution
+
+ Comments are placed in the code indicating where directives should
+ be placed so that data can be migrated to the local memories of
+ nodes, thus allowing for remote communication to be minimized.
+
+ (2) Process-to-Processor Assignment
+
+ Comments are placed in the code indicating where directives should
+ be placed so that processes can be "pinned" to processors,
+ preventing them from migrating from processor to processor.
+
+In addition, to facilitate simulation studies, we note points in the
+codes where statistics gathering routines should be turned on so that
+cold-start and initialization effects can be avoided.
+
+As previously mentioned, processes are assumed to be created through calls
+to a "threads" model creation routine. One important side effect is that
+this model causes all global variables to be shared (whereas the fork model
+causes all processes to get their own private copy of global variables).
+In order to mimic the behavior of global variables in the fork model, many
+of the applications provide arrays of structures that can be accessed by
+process ID, such as:
+
+ struct per_process_info {
+ char pad[PAD_LENGTH];
+ unsigned start_time;
+ unsigned end_time;
+ char pad[PAD_LENGTH];
+ } PPI[MAX_PROCS];
+
+In these structures, padding is inserted to ensure that the structure
+information associated with each process can be placed on a different
+page of memory, and can thus be explicitly migrated to that processor's
+local memory system. We follow this strategy for certain variables since
+these data really belong to a process and should be allocated in its local
+memory. A programming model that had the ability to declare global private
+data would have automatically ensured that these data were private, and
+that false sharing did not occur across different structures in the
+array. However, since the threads model does not provide this capability,
+it is provided by explicitly introducing arrays of structures with padding.
+The padding constants used in the programs (PAD_LENGTH in this example)
+can easily be changed to suit the particular characteristics of a given
+system. The actual data that is manipulated by individual applications
+(e.g. grid points, particle data, etc) is not padded, however.
+
+Finally, for some applications we provide less-optimized versions of the
+codes. The less-optimized versions utilize data structures that lead to
+simpler implementations, but which do not allow for optimal data
+distribution (and can thus generate false-sharing).
+
+
+REPORT:
+-------
+
+A report will be put together shortly describing the structure, function,
+and performance characteristics of each application. The report will be
+similar to the original SPLASH report (see the original report for the
+issues discussed). The report will provide quantitative data (for two
+different cache line size) for characteristics such as working set size
+and miss rates (local versus remote, etc.). In addition, the report
+will discuss cache behavior and synchronization behavior of the
+applications as well. In the mean time, each application directory has
+a README file that describes how to run each application. In addition,
+most applications have comments in their headers describing how to run
+each application.
+
+
+README FILES:
+-------------
+
+Each application has an associated README file. It is VERY important to
+read these files carefully, as they discuss the important parameters to
+supply for each application, as well as other issues involved in running
+the programs. In each README file, we discuss the impact of explicitly
+distributing data on the Stanford DASH Multiprocessor. Unless otherwise
+specified, we assume that the default data distribution mechanism is
+through round-robin page allocation.
+
+
+PROBLEM SIZES:
+--------------
+
+For each application, the README file describes a recommended problem
+size that is a reasonable base problem size that both can be simulated
+and is not too small for reality on a machine with up to 64 processors.
+For the purposes of studying algorithm performance, the parameters
+associated with each application can be varied. However, for the
+purposes of comparing machine architectures, the README files describe
+which parameters can be varied, and which should remain constant (or at
+their default values) for comparability. If the specific "base"
+parameters that are specified are not used, then results which are
+reported should explicitly state which parameters were changed, what
+their new values are, and address why they were changed.
+
+
+CORE PROGRAMS:
+--------------
+
+Since the number of programs has increased over SPLASH, and since not
+everyone may be able to use all the programs in a given study, we
+identify some of the programs as "core" programs that should be used
+in most studies for comparability. In the currently available set, these
+core programs include:
+
+(1) Ocean Simulation
+(2) Hierarchical Radiosity
+(3) Water Simulation with Spatial data structure
+(4) Barnes-Hut
+(5) FFT
+(6) Blocked Sparse Cholesky Factorization
+(7) Radix Sort
+
+The less optimized versions of the programs, when provided, should be
+used only in addition to these.
+
+
+MAILING LIST:
+-------------
+
+Please send a note to splash@mojave.stanford.edu if you have copied over
+the programs, so that we can put you on a mailing list for update reports.
+
+
+AUTHORSHIP:
+-----------
+
+The applications provided in the SPLASH-2 suite were developed by a number
+of people. The report lists authors primarily responsible for the
+development of each application code. The codes were made ready for
+distribution and the README files were prepared by Steven Cameron Woo and
+Jaswinder Pal Singh.
+
+
+CODE CHANGES:
+-------------
+
+If modifications are made to the codes which improve their performance,
+we would like to hear about them. Please send email to
+splash@mojave.stanford.edu detailing the changes.
+
+
+UPDATE REPORTS:
+---------------
+
+Watch this file for information regarding changes to codes and additions
+to the application suite.
+
+
+CHANGES:
+-------
+
+10-21-94: Ocean code, contiguous partitions, line 247 of slave1.C changed
+ from
+
+ t2a[0][0] = hh3*t2a[0][0]+hh1*psi[procid][1][0][0];
+
+ to
+
+ t2a[0][0] = hh3*t2a[0][0]+hh1*t2c[0][0];
+
+ This change does not affect correctness; it is an optimization
+ that was performed elsewhere in the code but overlooked here.
+
+11-01-94: Barnes, file code_io.C, line 55 changed from
+
+ in_real(instr, tnow);
+
+ to
+
+ in_real(instr, &tnow);
+
+11-01-94: Raytrace, file main.C, lines 216-223 changed from
+
+ if ((pid == 0) || (dostats))
+ CLOCK(end);
+
+ gm->partime[0] = (end - begin) & 0x7FFFFFFF;
+ if (pid == 0) gm->par_start_time = begin;
+
+/* printf("Process %ld elapsed time %lu.\n", pid, lapsed); */
+
+ }
+
+ to
+
+ if ((pid == 0) || (dostats)) {
+ CLOCK(end);
+ gm->partime[pid] = (end - begin) & 0x7FFFFFFF;
+ if (pid == 0) gm->par_start_time = begin;
+ }
+
+11-13-94: Raytrace, file memory.C
+
+ The use of the word MAIN_INITENV in a comment in memory.c causes
+ m4 to expand this macro, and some implementations may get confused
+ and generate the wrong C code.
+
+11-13-94: Radiosity, file rad_main.C
+
+ rad_main.C uses the macro CREATE_LITE. All three instances of
+ CREATE_LITE should be changed to CREATE.
+
+11-13-94: Water-spatial and Water-nsquared, file makefile
+
+ makefiles were changed so that the compilation phases included the
+ CFLAGS options instead of the CCOPTS options, which did not exist.
+
+11-17-94: FMM, file particle.C
+
+ Comment regarding data distribution of particle_array data
+ structure is incorrect. Round-robin allocation should be used.
+
+11-18-94: OCEAN, contiguous partitions, files main.C and linkup.C
+
+ Eliminated a problem which caused non-doubleword aligned
+ accesses to doublewords for the uniprocessor case.
+
+ main.C: Added lines 467-471:
+
+ if (nprocs%2 == 1) { /* To make sure that the actual data
+ starts double word aligned, add an extra
+ pointer */
+ d_size += sizeof(double ***);
+ }
+
+ Added same lines in file linkup.C at line numbers 100 and 159.
+
+07-30-95: RADIX has been changed. A tree-structured parallel prefix
+ computation is now used instead of a linear one.
+
+ LU had been modified. A comment describing how to distribute
+ data (one of the POSSIBLE ENHANCEMENTS) was incorrect for the
+ contiguous_blocks version of LU. Also, a modification was made
+ that reduces false sharing at line 206 of lu.C:
+
+ last_malloc[i] = (double *) (((unsigned) last_malloc[i]) + PAGE_SIZE -
+ ((unsigned) last_malloc[i]) % PAGE_SIZE);
+
+ A subdirectory shmem_files was added under the codes directory.
+ This directory contains a file that can be compiled on SGI machines
+ which replaces the libsgi.a file distributed in the original SPLASH
+ release.
+
+09-26-95: Fixed a bug in LU. Line 201 was changed from
+
+ last_malloc[i] = (double *) G_MALLOC(proc_bytes[i])
+
+ to
+
+ last_malloc[i] = (double *) G_MALLOC(proc_bytes[i] + PAGE_SIZE)
+
+ Fixed similar bugs in WATER-NSQUARED and WATER-SPATIAL. Both
+ codes needed a barrier added into the mdmain.C files. In both
+ codes, the line
+
+ BARRIER(gl->start, NumProcs);
+
+ was added. In WATER-NSQUARED, it was added in mdmain.C at line
+ 84. In WATER-SPATIAL, it was added in mdmain.C at line 107.
diff --git a/cesar/ecos/packages/kernel/current/tests_smp/README.fft b/cesar/ecos/packages/kernel/current/tests_smp/README.fft
new file mode 100644
index 0000000000..3de85e9239
--- /dev/null
+++ b/cesar/ecos/packages/kernel/current/tests_smp/README.fft
@@ -0,0 +1,62 @@
+GENERAL INFORMATION:
+
+The FFT program is a complex, one-dimensional version of the "Six-Step"
+FFT described in
+
+Bailey, D. H. FFTs in External or Hierarchical Memory.
+ Journal of Supercomputing, 4(1):23-35, March 1990.
+
+Specific optimizations in this implementation include: (1) Performing
+staggered, blocked transposes that exploit cache-line reuse; (2) The
+roots of unity data structure is arranged and distributed for only local
+accesses during application of the roots of unity step; (3) A small set of
+roots of unity elements are replicated locally at each processor for
+computation of the 1D FFTs; and (4) The matrix data structures are padded
+to reduce cache mapping conflicts. The algorithm used in this
+implementation is described in:
+
+Woo, S. C., Singh, J. P., and Hennessy, J. L. The Performance Advantages
+ of Integrating Block Data Transfer in Cache-Coherent Multiprocessors.
+ Proceedings of the 6th International Conference on Architectural
+ Support for Programming Languages and Operating Systems (ASPLOS-VI),
+ October 1994.
+
+This program works under both the Unix FORK and SPROC models.
+
+RUNNING THE PROGRAM:
+
+To see how to run the program, please see the comment at the top of the
+file fft.C, or run the application with the "-h" command line option.
+Four command-line parameters MUST be specified: The number of points
+to transform, the number of processors, the log base 2 of the cache
+line size, and the number of cache lines.
+
+The number of complex data points to be transformed and the number of
+processors being used can be varied according to the following rules.
+The number of data points must be an even power of 2 (2**16, 2**18,
+etc). The command-line option "-mM" specifies the even power of two (M),
+so for 65,536 data points, the command line option is "-m16." The number
+of processors must be a power of 2.
+
+The main data structures are padded to reduce cache mapping conflicts.
+The constant PAGE_SIZE should be changed to reflect the page size (in
+bytes) of the target system. In addition, the program uses a blocking
+technique to exploit cache line reuse. Both the log base 2 of the cache
+line size and the number of cache lines in the cache should be specified
+at the command line in order to allow the blocking algorithm to work
+effectively.
+
+BASE PROBLEM SIZE:
+
+The base problem size for an upto-64 processor machine is 65,536 complex
+data points (M=16). The PAGE_SIZE constant should be changed to reflect
+the page size (in bytes) of the target system. In addition, the log base
+2 of the cache line size, the number of cache lines, and the number of
+processors must be specified.
+
+DATA DISTRIBUTION:
+
+Our "POSSIBLE ENHANCEMENT" comments in the source code tell where one
+might want to distribute data and how. Data distribution has an impact
+on performance on the Stanford DASH multiprocessor.
+
diff --git a/cesar/ecos/packages/kernel/current/tests_smp/README.lu b/cesar/ecos/packages/kernel/current/tests_smp/README.lu
new file mode 100644
index 0000000000..cce3108fb6
--- /dev/null
+++ b/cesar/ecos/packages/kernel/current/tests_smp/README.lu
@@ -0,0 +1,55 @@
+GENERAL INFORMATION:
+
+The LU program factors a dense matrix into the product of a lower
+triangular and an upper triangular matrix. The factorization uses
+blocking to exploit temporal locality on individual submatrix elements.
+The algorithm used in this implementation is described in
+
+Woo, S. C., Singh, J. P., and Hennessy, J. L. The Performance Advantages
+ of Integrating Block Data Transfer in Cache-Coherent Multiprocessors.
+ Proceedings of the 6th International Conference on Architectural
+ Support for Programming Languages and Operating Systems (ASPLOS-VI),
+ October 1994.
+
+Two implementations are provided in the SPLASH-2 distribution:
+
+ (1) Non-contiguous block allocation
+
+ This implementation (contained in the non_contiguous_blocks
+ subdirectory) implements the matrix to be factored with a
+ two-dimensional array. This data structure prevents blocks from
+ being allocated contiguously, but leads to a conceptually simple
+ programming implementation.
+
+ (2) Contiguous block allocation
+
+ This implementation (contained in the contiguous_blocks
+ subdirectory) implements the matrix to be factored as an array
+ of blocks. This data structure allows blocks to be allocated
+ contiguously and entirely in the local memory of processors that
+ "own" them, thus enhancing data locality properties.
+
+These programs work under both the Unix FORK and SPROC models.
+
+RUNNING THE PROGRAM:
+
+To see how to run the program, please see the comment at the top of the
+file lu.C, or run the application with the "-h" command line option.
+Three parameters may be specified on the command line, of which the
+ones that are normally changed are the matrix size and the number of
+processors. It is suggested that the block size be kept at the value
+B=16, since this value works well in practice. If this parameter is
+changed, the new value should be reported in any results that are
+presented.
+
+BASE PROBLEM SIZE:
+
+The base problem size for an upto-64 processor machine is a 512x512 matrix
+with a block size of B=16.
+
+DATA DISTRIBUTION:
+
+Our "POSSIBLE ENHANCEMENT" comments in the source code tell where one
+might want to distribute data and how. Data distribution has a small
+impact on performance on the Stanford DASH multiprocessor.
+
diff --git a/cesar/ecos/packages/kernel/current/tests_smp/README.radix b/cesar/ecos/packages/kernel/current/tests_smp/README.radix
new file mode 100644
index 0000000000..3cd5b8026f
--- /dev/null
+++ b/cesar/ecos/packages/kernel/current/tests_smp/README.radix
@@ -0,0 +1,44 @@
+GENERAL INFORMATION:
+
+The RADIX program implements an integer radix sort based on the method
+described in:
+
+Blelloch, G. E., et. al. A Comparison of Sorting Algorithms for the
+ Connection Machine CM-2. In Symposium on Parallel Algorithms and
+ Architectures, pp. 3-16, July 1991.
+
+A description of this implementation can be found in:
+
+Woo, S. C., Singh, J. P., and Hennessy, J. L. The Performance Advantages
+ of Integrating Message Passing in Cache-Coherent Multiprocessors.
+ Technical Report CSL-TR-93-593, Stanford University, December 1993.
+
+This program works under both the Unix FORK and SPROC models.
+
+RUNNING THE PROGRAM:
+
+To see how to run the program, please see the comment at the top of the
+file radix.C, or run the application with the "-h" command line option.
+Four command line parameters can be specified, of which the ones which
+would normally be changed are the number of keys to sort, the radix
+for sorting, and the number of processors. The radix used for sorting
+must be a power of 2. Optional command line parameters allow timing
+information to be printed out at the end of the program, testing to
+make sure all keys are sorted correctly, and keys to be printed out
+in sorted order.
+
+BASE PROBLEM SIZE:
+
+The base problem size for an upto-64 processor machine is 256k (262,144)
+keys to be sorted and a radix of 1024. The default values should be
+used for other parameters (except the number of processors, which can
+be varied). For larger problems, the number of keys can also be
+increased by factors of 2. Any changes to these parameters should be
+reported when results are presented.
+
+DATA DISTRIBUTION:
+
+Our "POSSIBLE ENHANCEMENT" comments in the source code tell where one
+might want to distribute data and how. Data distribution has an impact
+on performance on the Stanford DASH multiprocessor.
+
diff --git a/cesar/ecos/packages/kernel/current/tests_smp/SPLASH2.POSTING b/cesar/ecos/packages/kernel/current/tests_smp/SPLASH2.POSTING
new file mode 100644
index 0000000000..29dc8aed27
--- /dev/null
+++ b/cesar/ecos/packages/kernel/current/tests_smp/SPLASH2.POSTING
@@ -0,0 +1,124 @@
+We are pleased to announce the release of the SPLASH-2 suite of
+multiprocessor applications. SPLASH-2 is the successor to the SPLASH
+suite that we previously released, and the programs in it are also
+written assuming a coherent shared address space communication model.
+SPLASH-2 contains several new applications, as well as improved versions
+of applications from SPLASH. The suite is currently available via
+anonymous ftp to
+
+ www-flash.stanford.edu (in the pub/splash2 subdirectory)
+
+and via the World-Wide-Web at
+
+ http://www-flash.stanford.edu/apps/SPLASH/
+
+Several programs are currently available, and a few others will be added
+shortly. The programs fall into two categories: full applications and
+kernels. Additionally, we designate some of these as "core programs"
+(see below). The applications and kernels currently available in the
+SPLASH-2 suite include:
+
+Applications:
+ Ocean Simulation
+ Ray Tracer
+ Hierarchical Radiosity
+ Volume Renderer
+ Water Simulation with Spatial Data Structure
+ Water Simulation without Spatial Data Structure
+ Barnes-Hut (gravitational N-body simulation)
+ Adaptive Fast Multipole (gravitational N-body simulation)
+
+Kernels:
+ FFT
+ Blocked LU Decomposition
+ Blocked Sparse Cholesky Factorization
+ Radix Sort
+
+Programs that will appear soon include:
+
+ PSIM4 - Particle Dynamics Simulation (full application)
+ Conjugate Gradient (kernel)
+ LocusRoute (standard cell router from SPLASH)
+ Protein Structure Prediction
+ Protein Sequencing
+ Parallel Probabilistic Inference
+
+In some cases, we provide both well-optimized and less-optimized versions
+of the programs. For both the Ocean simulation and the Blocked LU
+Decomposition kernel, less optimized versions of the codes are currently
+available.
+
+There are important differences between applications in the SPLASH-2 suite
+and applications in the SPLASH suite. These differences are noted in the
+README.SPLASH2 file in the pub/splash2 directory. It is *VERY IMPORTANT*
+that you read the README.SPLASH2 file, as well as the individual README
+files in the program directories, before using the SPLASH-2 programs.
+These files describe how to run the programs, provide commented annotations
+about how to distribute data on a machine with physically distributed main
+memory, and provides guidelines on the baseline problem sizes to use when
+studying architectural interactions through simulation.
+
+Complete documentation of SPLASH2, including a detailed characterization
+of performance as well as memory system interactions and synchronization
+behavior, will appear in the SPLASH2 report that is currently being
+written.
+
+
+OPTIMIZATION STRATEGY:
+----------------------
+
+For each application and kernel, we note potential features or
+enhancements that are typically machine-specific. These potential
+enhancements are encapsulated within comments in the code starting with
+the string "POSSIBLE ENHANCEMENT." The potential enhancements which we
+identify are:
+
+ (1) Data Distribution
+
+ We note where data migration routines should be called in order to
+ enhance locality of data access. We do not distribute data by
+ default as different machines implement migration routines in
+ different ways, and on some machines this is not relevant.
+
+ (2) Process-to-Processor Assignment
+
+ We note where calls can be made to "pin" processes to specific
+ processors so that process migration can be avoided. We do not
+ do this by default, since different machines implement this
+ feature in different ways.
+
+In addition, to facilitate simulation studies, we note points in the
+codes where statistics gathering routines should be turned on so that
+cold-start and initialization effects can be avoided.
+
+For two programs (Ocean and LU), we provide less-optimized versions of
+the codes. The less-optimized versions utilize data structures that
+lead to simpler implementations, but which do not allow for optimal data
+distribution (and can generate false-sharing).
+
+
+CORE PROGRAMS:
+--------------
+
+Since the number of programs has increased over SPLASH, and since not
+everyone may be able to use all the programs in a given study, we
+identify some of the programs as "core" programs that should be used
+in most studies for comparability. In the currently available set,
+these core programs include:
+
+(1) Ocean Simulation
+(2) Hierarchical Radiosity
+(3) Water Simulation with Spatial data structure
+(4) Barnes-Hut
+(5) FFT
+(6) Blocked Sparse Cholesky Factorization
+(7) Radix Sort
+
+The less optimized versions of the programs, when available, should be
+used only in addition to these.
+
+The base problem sizes that we recommend are provided in the README files
+for individual applications. Please use at least these for experiments
+with upto 64 processors. If changes are made to these base parameters
+for further experimentation, these changes should be explicitly stated
+in any results that are presented.
diff --git a/cesar/ecos/packages/kernel/current/tests_smp/c.m4.ecos b/cesar/ecos/packages/kernel/current/tests_smp/c.m4.ecos
new file mode 100644
index 0000000000..6b94263c89
--- /dev/null
+++ b/cesar/ecos/packages/kernel/current/tests_smp/c.m4.ecos
@@ -0,0 +1,290 @@
+divert(-1)
+define(NEWPROC,) dnl
+define(ENDLAB, 5283) dnl
+
+define(BARRIER, `{
+/*---------------------------------*/
+ LOCK( ( $1[0].lock ) );
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: >b (%d:%d:%d): Enter barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),$3,$1[0].count[0],$1[0].queue[0].count);
+ }
+ if ( $1[0].count[0] < ($2 -1) ) {
+ $1[0].count[0]++;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ?b (%d:%d:%d): Wait\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),$3,$1[0].count[0],$1[0].queue[0].count);
+ }
+ UNLOCK( $1[0].lock );
+ SEMWAIT( $1[0].queue[0] );
+ }
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: <b (%d:%d:%d): Exit barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),$3,$1[0].count[0],$1[0].queue[0].count);
+ }
+ if ( $1[0].count[0] == 0 ) {
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ^b (%d:%d:%d): unlock barrier \n-----------\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),$3,$1[0].count[0],$1[0].queue[0].count);
+ }
+
+ UNLOCK( $1[0].lock );
+
+ } else {
+ ( $1[0].count[0] )-- ;
+ //UNLOCK( $1[0].count_lock[0] );
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: @b (%d:%d:%d): unlock queue\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),$3,$1[0].count[0],$1[0].queue[0].count);
+ }
+ SEMPOST( $1[0].queue[0] );
+ }
+
+/*---------------------------------*/
+}')
+
+
+define(BARDEC, `DECVAR($1,1,1,)')
+define(BARINIT, `{MONINIT($1,1,1)}')
+
+define(GSDEC, `/*##########*/
+int ($1);')
+define(GSINIT, `/*##########*/
+{ ($1) = 0; }')
+define(GETSUB, `/*##########*/
+{
+ if (($1)<=($3))
+ ($2) = ($1)++;
+ else {
+ ($2) = -1;
+ ($1) = 0;
+ }
+}')
+
+define(NU_GSDEC, `int ($1);')
+define(NU_GSINIT, `{ ($1) = 0; }')
+define(NU_GETSUB, `GETSUB($1,$2,$3,$4)')
+
+define(ADEC, `int ($1);')
+define(AINIT, `{/*##########*/;}')
+define(PROBEND, `{/*##########*/;}')
+
+define(LOCKDEC, `cyg_spinlock_t $1;')
+define(LOCKINIT, `cyg_spinlock_init( &$1,0 );')
+define(LOCK, `cyg_spinlock_spin(&$1);')
+define(UNLOCK, `cyg_spinlock_clear(&$1);')
+
+define(SEMINIT, `{ cyg_semaphore_init(&$1,$2);}')
+define(SEMWAIT, `{ cyg_semaphore_wait(&$1);}')
+define(SEMPOST, `{ cyg_semaphore_post(&$1);}')
+define(SEMDEC, ` cyg_sem_t $1;')
+
+define(NLOCKDEC, `int ($1);')
+define(NLOCKINIT, `{/*##########*/;}')
+define(NLOCK, `{/*##########*/;}')
+define(NUNLOCK, `{/*##########*/;}')
+
+define(ALOCKDEC, `DECVAR($1,0,$2)')
+define(ALOCKINIT, `{ MONINIT($1,0,$2);}')
+define(ALOCK, `{ MENTER($1,$2);}')
+define(AULOCK, `{ MEXIT($1,$2);}')
+
+
+define(PAUSEDEC, ` ')
+define(PAUSEINIT, `{/*##########*/;}')
+define(CLEARPAUSE, `{/*##########*/;}')
+define(SETPAUSE, `{/*##########*/;}')
+define(EVENT, `{/*##########*/;}')
+define(WAITPAUSE, `{/*##########*/;}')
+define(PAUSE, `{/*##########*/;}')
+
+define(AUG_ON, ` ')
+define(AUG_OFF, ` ')
+define(TRACE_ON, ` ')
+define(TRACE_OFF, ` ')
+define(REF_TRACE_ON, ` ')
+define(REF_TRACE_OFF, ` ')
+define(DYN_TRACE_ON, `;')
+define(DYN_TRACE_OFF, `;')
+define(DYN_REF_TRACE_ON, `;')
+define(DYN_REF_TRACE_OFF, `;')
+define(DYN_SIM_ON, `;')
+define(DYN_SIM_OFF, `;')
+define(DYN_SCHED_ON, `;')
+define(DYN_SCHED_OFF, `;')
+define(AUG_SET_LOLIMIT, `;')
+define(AUG_SET_HILIMIT, `;')
+
+define(MENTER, `LOCK($1[$2].lock);')
+define(DELAY, `{$1[$3].count[$2]++;
+ UNLOCK($1[$3].lock)
+ SEMWAIT($1[$3].queue[$2]);}')
+define(CONTINUE, `{
+ if ($1[$3].count[$2]) {
+ ($1[$3].count[$2])--;
+ SEMPOST($1[$3].queue[$2])
+ } else
+ UNLOCK($1[$3].lock)
+ goto `L'ENDLAB;
+`LGO'ENDLAB: ;
+}')
+define(MEXIT, `{
+ UNLOCK($1[$2].lock); `L'ENDLAB: ;
+ define(`ENDLAB', eval(ENDLAB+1))
+}')
+
+define(WAIT_FOR_END, `{
+;}')
+
+/* create(<procedure>)*/
+define(CREATE,`
+{
+ /*---------------------------------------*/
+ stack[i] = calloc(STACK_SIZE, 1);
+ cyg_thread_create(10, // Priority - just a number
+ $1, // entry
+ 0, // index
+ "slave", // Name
+ stack[i], // Stack
+ STACK_SIZE, // Size
+ &threadh[i], // Handle
+ &thread[i] // Thread data structure
+ );
+ cyg_thread_resume( threadh[i] );
+ /*---------------------------------------*/
+}
+')
+
+define(MAIN_INITENV, `{;}')
+define(MAIN_END, `
+{
+ diag_printf("FP test end\n");
+ return 0;
+}
+')
+define(MAIN_ENV,`
+/*---------------------------------------*/
+#include <pkgconf/system.h>
+#include <pkgconf/infra.h>
+#include <pkgconf/kernel.h>
+#include <cyg/infra/cyg_type.h>
+#include <cyg/infra/testcase.h>
+#include <pkgconf/isoinfra.h>
+#include <cyg/hal/hal_cache.h>
+#include <cyg/hal/hal_smp.h>
+#include <cyg/kernel/kapi.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+int smp_cyg_test_main(int argc, char **argv);
+
+void smp_cyg_test_main_call(void *p) {
+ smp_cyg_test_main(smp_cyg_test_argc, smp_cyg_test_argv);
+}
+
+
+/* #define STACK_SIZE 0x1b000 */
+#define STACK_SIZE 8192
+static cyg_thread smp_cyg_test_thread;
+static char smp_cyg_test_stack[STACK_SIZE];
+static cyg_handle_t smp_cyg_test_threadh;
+
+
+
+cyg_handle_t threadh[64];
+char *stack[64];
+cyg_thread thread[64];
+
+
+externC void cyg_user_start( void )
+{
+ CYG_TEST_INIT();
+
+ diag_printf("Starting test app\n");
+
+ cyg_thread_create(10, // Priority - just a number
+ smp_cyg_test_main_call, // entry
+ 0, // index
+ "smp test", // Name
+ smp_cyg_test_stack, // Stack
+ STACK_SIZE, // Size
+ &smp_cyg_test_threadh, // Handle
+ &smp_cyg_test_thread // Thread data structure
+ );
+ cyg_thread_resume( smp_cyg_test_threadh );
+ /*cyg_scheduler_start();*/
+}
+
+
+/*---------------------------------------*/
+')
+
+define(ENV, ` ')
+define(EXTERN_ENV, `
+
+#include <pkgconf/system.h>
+#include <pkgconf/infra.h>
+#include <pkgconf/kernel.h>
+#include <cyg/infra/cyg_type.h>
+#include <cyg/infra/testcase.h>
+#include <pkgconf/isoinfra.h>
+#include <cyg/hal/hal_cache.h>
+#include <cyg/kernel/kapi.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+ ')
+
+define(G_MALLOC, `calloc($1, 1);')
+define(G_FREE, `;')
+define(G_MALLOC_F, `malloc($1);')
+define(NU_MALLOC, `malloc($1);')
+define(NU_FREE, `;')
+define(NU_MALLOC_F, `malloc($1)')
+
+define(GET_HOME, `{($1) = 0;}')
+define(GET_PID, `{($1) = 0;}')
+define(AUG_DELAY, `{sleep ($1);}')
+define(ST_LOG, `{;}')
+define(SET_HOME, `{;}')
+
+/* clock(<clock_val>)*/
+define(CLOCK, `{
+ ($1) = cyg_current_time()*10;
+}
+')
+divert(0)
+
+
+
+define(DECVAR,`
+/*---------------------------------------*/
+struct $1TYP {
+ LOCKDEC(lock);
+ ifelse(eval($2 > 0),1,int count[$2];,)
+ ifelse(eval($2 > 0),1, SEMDEC(queue[$2]);,)
+ $4
+ } $1[$3];
+/*---------------------------------------*/
+ '
+)
+
+define(MONINIT, `{
+/*---------------------------------------*/
+ int mon_dum1,mon_dum2;
+ ifelse(eval($2 > 0),1,
+ for (mon_dum1=0; mon_dum1 < $3; mon_dum1++)
+ for (mon_dum2=0; mon_dum2 < $2; mon_dum2++) {
+ $1[mon_dum1].count[mon_dum2] = 0;
+ SEMINIT($1[mon_dum1].queue[mon_dum2],0);
+ },,)
+ for (mon_dum1=0; mon_dum1 < $3; mon_dum1++) {
+ LOCKINIT($1[mon_dum1].lock);
+ }
+/*---------------------------------------*/
+}')
+
+
+
+
diff --git a/cesar/ecos/packages/kernel/current/tests_smp/fft.C b/cesar/ecos/packages/kernel/current/tests_smp/fft.C
new file mode 100644
index 0000000000..e2b0f979af
--- /dev/null
+++ b/cesar/ecos/packages/kernel/current/tests_smp/fft.C
@@ -0,0 +1,1057 @@
+/*************************************************************************/
+/* */
+/* Copyright (c) 1994 Stanford University */
+/* */
+/* All rights reserved. */
+/* */
+/* Permission is given to use, copy, and modify this software for any */
+/* non-commercial purpose as long as this copyright notice is not */
+/* removed. All other uses, including redistribution in whole or in */
+/* part, are forbidden without prior written permission. */
+/* */
+/* This software is provided with absolutely no warranty and no */
+/* support. */
+/* */
+/*************************************************************************/
+
+/*************************************************************************/
+/* */
+/* Perform 1D fast Fourier transform using six-step FFT method */
+/* */
+/* 1) Performs staggered, blocked transposes for cache-line reuse */
+/* 2) Roots of unity rearranged and distributed for only local */
+/* accesses during application of roots of unity */
+/* 3) Small set of roots of unity elements replicated locally for */
+/* 1D FFTs (less than root N elements replicated at each node) */
+/* 4) Matrix data structures are padded to reduce cache mapping */
+/* conflicts */
+/* */
+/* Command line options: */
+/* */
+/* -mM : M = even integer; 2**M total complex data points transformed. */
+/* -pP : P = number of processors; Must be a power of 2. */
+/* -nN : N = number of cache lines. */
+/* -lL : L = Log base 2 of cache line length in bytes. */
+/* -s : Print individual processor timing statistics. */
+/* -t : Perform FFT and inverse FFT. Test output by comparing the */
+/* integral of the original data to the integral of the data */
+/* that results from performing the FFT and inverse FFT. */
+/* -o : Print out complex data points. */
+/* -h : Print out command line options. */
+/* */
+/* Note: This version works under both the FORK and SPROC models */
+/* */
+/*************************************************************************/
+
+#include <stdio.h>
+#include <math.h>
+#include <tests_smp/getopt.h>
+
+#define MAXRAND 2147483647
+#define PAGE_SIZE 4096
+#define NUM_CACHE_LINES 65536
+#define LOG2_LINE_SIZE 4
+#define PI 3.1416
+#define DEFAULT_M 10
+#define DEFAULT_P 1
+
+#include <tests_smp/fft_arg.c>
+
+MAIN_ENV
+
+#define SWAP(a,b) {double tmp; tmp=a; a=b; b=tmp;}
+
+struct GlobalMemory {
+ int id;
+ LOCKDEC(idlock)
+ BARDEC(start)
+ int *transtimes;
+ int *totaltimes;
+ int starttime;
+ int finishtime;
+ int initdonetime;
+} *Global;
+
+int dbg_on = 0;
+
+int P = DEFAULT_P;
+int M = DEFAULT_M;
+int N; /* N = 2^M */
+int rootN; /* rootN = N^1/2 */
+double *x; /* x is the original time-domain data */
+double *trans; /* trans is used as scratch space */
+double *umain; /* umain is roots of unity for 1D FFTs */
+double *umain2; /* umain2 is entire roots of unity matrix */
+int test_result = 0;
+int doprint = 0;
+int dostats = 0;
+int transtime = 0;
+int transtime2 = 0;
+int avgtranstime = 0;
+int avgcomptime = 0;
+unsigned int transstart = 0;
+unsigned int transend = 0;
+int maxtotal=0;
+int mintotal=0;
+double maxfrac=0;
+double minfrac=0;
+double avgfractime=0;
+int orig_num_lines = NUM_CACHE_LINES; /* number of cache lines */
+int num_cache_lines = NUM_CACHE_LINES; /* number of cache lines */
+int log2_line_size = LOG2_LINE_SIZE;
+int line_size;
+int rowsperproc;
+double ck1;
+double ck3; /* checksums for testing answer */
+int pad_length;
+
+void SlaveStart();
+double TouchArray(double *,double *,double *,double *,int,int,int,int);
+void FFT1D(int,int,int,double *,double *,double *,double *,int,int *,int,
+ int,int,int,int,int,int,struct GlobalMemory *);
+double CheckSum();
+double drand48();
+int log_2(int);
+void printerr(char *);
+volatile int thread_array[4] = {0, 0, 0, 0};
+
+int smp_cyg_test_main(int argc, char **argv)
+{
+ int i;
+ int j;
+ int c;
+ extern char *optarg;
+ int m1;
+ int factor;
+ int pages;
+ unsigned int start;
+ //memset (thread_array,0,sizeof(thread_array));
+
+ CLOCK(start);
+
+
+// extern Cyg_Mempool_dlmalloc *cygmem_memalloc_heaps[ 2 ];
+// cygmem_memalloc_heaps[0].
+
+
+ // printf("<cpu%d> thread %d Arguments (%d): ",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),argc);
+// for (i = 0;i < argc;i++){
+// printf("%s ",argv[i]);
+// }
+// printf("\n");
+
+ while ((c = getopt(argc, argv, "p:m:n:l:stoh")) != -1) {
+ switch(c) {
+ case 'p': P = atoi(optarg);
+ P = HAL_SMP_CPU_MAX;
+ if (P < 1) {
+ printerr("P must be >= 1\n");
+ exit(-1);
+ }
+ if (log_2(P) == -1) {
+ printerr("P must be a power of 2\n");
+ exit(-1);
+ }
+ break;
+ case 'm': M = atoi(optarg);
+ m1 = M/2;
+ if (2*m1 != M) {
+ printerr("M must be even\n");
+ exit(-1);
+ }
+ break;
+ case 'n': num_cache_lines = atoi(optarg);
+ orig_num_lines = num_cache_lines;
+ if (num_cache_lines < 1) {
+ printerr("Number of cache lines must be >= 1\n");
+ exit(-1);
+ }
+ break;
+ case 'l': log2_line_size = atoi(optarg);
+ if (log2_line_size < 0) {
+ printerr("Log base 2 of cache line length in bytes must be >= 0\n");
+ exit(-1);
+ }
+ break;
+ case 's': dostats = !dostats;
+ break;
+ case 't': test_result = !test_result;
+ break;
+ case 'o': doprint = !doprint;
+ break;
+ case 'h': printf("Usage: FFT <options>\n\n");
+ printf("options:\n");
+ printf(" -mM : M = even integer; 2**M total complex data points transformed.\n");
+ printf(" -pP : P = number of processors; Must be a power of 2.\n");
+ printf(" -nN : N = number of cache lines.\n");
+ printf(" -lL : L = Log base 2 of cache line length in bytes.\n");
+ printf(" -s : Print individual processor timing statistics.\n");
+ printf(" -t : Perform FFT and inverse FFT. Test output by comparing the\n");
+ printf(" integral of the original data to the integral of the data that\n");
+ printf(" results from performing the FFT and inverse FFT.\n");
+ printf(" -o : Print out complex data points.\n");
+ printf(" -h : Print out command line options.\n\n");
+ printf("Default: FFT -m%1d -p%1d -n%1d -l%1d\n",
+ DEFAULT_M,DEFAULT_P,NUM_CACHE_LINES,LOG2_LINE_SIZE);
+ exit(0);
+ break;
+ }
+ }
+
+ if (P > 64) {
+ printf("Maximal 64 processes\n");
+ exit(0);
+ }
+
+ MAIN_INITENV(,80000000);
+
+ N = 1<<M;
+ rootN = 1<<(M/2);
+ rowsperproc = rootN/P;
+ if (rowsperproc == 0) {
+ printerr("Matrix not large enough. 2**(M/2) must be >= P\n");
+ exit(-1);
+ }
+
+ line_size = 1 << log2_line_size;
+ if (line_size < 2*sizeof(double)) {
+ printf("WARNING: Each element is a complex double (%d bytes)\n",2*sizeof(double));
+ printf(" => Less than one element per cache line\n");
+ printf(" Computing transpose blocking factor\n");
+ factor = (2*sizeof(double)) / line_size;
+ num_cache_lines = orig_num_lines / factor;
+ }
+ if (line_size <= 2*sizeof(double)) {
+ pad_length = 1;
+ } else {
+ pad_length = line_size / (2*sizeof(double));
+ }
+
+ if (rowsperproc * rootN * 2 * sizeof(double) >= PAGE_SIZE) {
+ pages = (2 * pad_length * sizeof(double) * rowsperproc) / PAGE_SIZE;
+ if (pages * PAGE_SIZE != 2 * pad_length * sizeof(double) * rowsperproc) {
+ pages ++;
+ }
+ pad_length = (pages * PAGE_SIZE) / (2 * sizeof(double) * rowsperproc);
+ } else {
+ pad_length = (PAGE_SIZE - (rowsperproc * rootN * 2 * sizeof(double))) /
+
+ (2 * sizeof(double) * rowsperproc);
+ if (pad_length * (2 * sizeof(double) * rowsperproc) !=
+ (PAGE_SIZE - (rowsperproc * rootN * 2 * sizeof(double)))) {
+ printerr("Padding algorithm unsuccessful\n");
+ exit(-1);
+ }
+ }
+
+ Global = (struct GlobalMemory *) G_MALLOC(sizeof(struct GlobalMemory));
+ if (Global == NULL) {
+ printf("Could not malloc memory for Global (%d)\n",sizeof(struct GlobalMemory));
+ exit(-1);
+ }
+
+
+ x = (double *) G_MALLOC(2*(N+rootN*pad_length)*sizeof(double)+PAGE_SIZE);
+ trans = (double *) G_MALLOC(2*(N+rootN*pad_length)*sizeof(double)+PAGE_SIZE);
+ umain = (double *) G_MALLOC(2*rootN*sizeof(double));
+ umain2 = (double *) G_MALLOC(2*(N+rootN*pad_length)*sizeof(double)+PAGE_SIZE);
+
+
+ Global->transtimes = (int *) G_MALLOC(P*sizeof(int));
+ Global->totaltimes = (int *) G_MALLOC(P*sizeof(int));
+ if (x == NULL) {
+ printerr("Could not malloc memory for x\n");
+ exit(-1);
+ } else if (trans == NULL) {
+ printerr("Could not malloc memory for trans\n");
+ exit(-1);
+ } else if (umain == NULL) {
+ printerr("Could not malloc memory for umain\n");
+ exit(-1);
+ } else if (umain2 == NULL) {
+ printerr("Could not malloc memory for umain2\n");
+ exit(-1);
+ }
+
+ x = (double *) (((unsigned) x) + PAGE_SIZE - ((unsigned) x) % PAGE_SIZE);
+ trans = (double *) (((unsigned) trans) + PAGE_SIZE - ((unsigned) trans) % PAGE_SIZE);
+ umain2 = (double *) (((unsigned) umain2) + PAGE_SIZE - ((unsigned) umain2) % PAGE_SIZE);
+
+/* In order to optimize data distribution, the data structures x, trans,
+ and umain2 have been aligned so that each begins on a page boundary.
+ This ensures that the amount of padding calculated by the program is
+ such that each processor's partition ends on a page boundary, thus
+ ensuring that all data from these structures that are needed by a
+ processor can be allocated to its local memory */
+
+/* POSSIBLE ENHANCEMENT: Here is where one might distribute the x,
+ trans, and umain2 data structures across physically distributed
+ memories as desired.
+
+ One way to place data is as follows:
+
+ double *base;
+ int i;
+
+ i = ((N/P)+(rootN/P)*pad_length)*2;
+ base = &(x[0]);
+ for (j=0;j<P;j++) {
+ Place all addresses x such that (base <= x < base+i) on node j
+ base += i;
+ }
+
+ The trans and umain2 data structures can be placed in a similar manner.
+
+ */
+
+ printf("\n");
+ printf("FFT with Blocking Transpose\n");
+ printf(" %d Complex Doubles\n",N);
+ printf(" %d Threads\n",P);
+ if (num_cache_lines != orig_num_lines) {
+ printf(" %d Cache lines\n",orig_num_lines);
+ printf(" %d Cache lines for blocking transpose\n",num_cache_lines);
+ } else {
+ printf(" %d Cache lines\n",num_cache_lines);
+ }
+ printf(" %d Byte line size\n",(1 << log2_line_size));
+ printf(" %d Bytes per page\n",PAGE_SIZE);
+ printf("\n");
+
+ BARINIT(Global->start);
+ LOCKINIT(Global->idlock);
+ Global->id = 0;
+ InitX(N, x); /* place random values in x */
+
+ if (test_result) {
+ ck1 = CheckSum(N, x);
+ }
+ if (doprint) {
+ printf("Original data values:\n");
+ PrintArray(N, x);
+ }
+
+ InitU(N,umain); /* initialize u arrays*/
+ InitU2(N,umain2,rootN);
+
+ /* fire off P processes */
+ for (i=1; i<P; i++) {
+ CREATE(SlaveStart,thread_array[i]);
+ }
+
+ SlaveStart();
+
+ for (i=1; i<P; i++) {
+ while(!thread_array[i]) {};
+ }
+
+ if (doprint) {
+ if (test_result) {
+ printf("Data values after inverse FFT:\n");
+ } else {
+ printf("Data values after FFT:\n");
+ }
+ PrintArray(N, x);
+ }
+
+ transtime = Global->transtimes[0];
+ printf("\n");
+ printf(" PROCESS STATISTICS\n");
+ printf(" Computation Transpose Transpose\n");
+ printf(" Proc Time Time Fraction\n");
+ printf(" 0 %10d %10d %8.5f\n",
+ Global->totaltimes[0],Global->transtimes[0],
+ ((double)Global->transtimes[0])/Global->totaltimes[0]);
+ if (dostats) {
+ transtime2 = Global->transtimes[0];
+ avgtranstime = Global->transtimes[0];
+ avgcomptime = Global->totaltimes[0];
+ maxtotal = Global->totaltimes[0];
+ mintotal = Global->totaltimes[0];
+ maxfrac = ((double)Global->transtimes[0])/Global->totaltimes[0];
+ minfrac = ((double)Global->transtimes[0])/Global->totaltimes[0];
+ avgfractime = ((double)Global->transtimes[0])/Global->totaltimes[0];
+ for (i=1;i<P;i++) {
+ if (Global->transtimes[i] > transtime) {
+ transtime = Global->transtimes[i];
+ }
+ if (Global->transtimes[i] < transtime2) {
+ transtime2 = Global->transtimes[i];
+ }
+ if (Global->totaltimes[i] > maxtotal) {
+ maxtotal = Global->totaltimes[i];
+ }
+ if (Global->totaltimes[i] < mintotal) {
+ mintotal = Global->totaltimes[i];
+ }
+ if (((double)Global->transtimes[i])/Global->totaltimes[i] > maxfrac) {
+ maxfrac = ((double)Global->transtimes[i])/Global->totaltimes[i];
+ }
+ if (((double)Global->transtimes[i])/Global->totaltimes[i] < minfrac) {
+ minfrac = ((double)Global->transtimes[i])/Global->totaltimes[i];
+ }
+ printf(" %3d %10d %10d %f\n",
+ i,Global->totaltimes[i],Global->transtimes[i],
+ ((double)Global->transtimes[i])/Global->totaltimes[i]);
+ avgtranstime += Global->transtimes[i];
+ avgcomptime += Global->totaltimes[i];
+ avgfractime += ((double)Global->transtimes[i])/Global->totaltimes[i];
+ }
+ printf(" Avg %10f %10f %f\n",
+ ((double) avgcomptime)/P,((double) avgtranstime)/P,avgfractime/P);
+ printf(" Max %10d %10d %f\n",
+ maxtotal,transtime,maxfrac);
+ printf(" Min %10d %10d %f\n",
+ mintotal,transtime2,minfrac);
+ }
+ Global->starttime = start;
+ printf("\n");
+ printf(" TIMING INFORMATION\n");
+ printf("Start time : %16d\n",
+ Global->starttime);
+ printf("Initialization finish time : %16d\n",
+ Global->initdonetime);
+ printf("Overall finish time : %16d\n",
+ Global->finishtime);
+ printf("Total time with initialization : %16d\n",
+ Global->finishtime-Global->starttime);
+ printf("Total time without initialization : %16d\n",
+ Global->finishtime-Global->initdonetime);
+ printf("Overall transpose time : %16d\n",
+ transtime);
+ printf("Overall transpose fraction : %f\n",
+ ((double) transtime)/(Global->finishtime-Global->initdonetime));
+ printf("\n");
+
+ if (test_result) {
+ ck3 = CheckSum(N, x);
+ printf(" INVERSE FFT TEST RESULTS\n");
+ printf("Checksum difference is %f (%f, %f)\n",
+ ck1-ck3, ck1, ck3);
+ if (fabs(ck1-ck3) < 0.001) {
+ printf("TEST PASSED\n");
+ } else {
+ printf("TEST FAILED\n");
+ }
+ }
+
+ MAIN_END;
+}
+
+
+void SlaveStart()
+{
+ int i;
+ int j;
+ int MyNum;
+ double error;
+ double *upriv;
+ int initdone;
+ int finish;
+ int l_transtime=0;
+ int MyFirst;
+ int MyLast;
+
+ LOCK(Global->idlock);
+ MyNum = Global->id;
+ Global->id++;
+#ifdef FFT_DBG
+ printf("Slave %d started\n",MyNum);
+#endif
+ UNLOCK(Global->idlock);
+
+
+/* POSSIBLE ENHANCEMENT: Here is where one might pin processes to
+ processors to avoid migration */
+
+ BARRIER(Global->start, P, 1);
+#ifdef FFT_DBG
+ LOCK(Global->idlock); printf("Slave %d left barrier 1\n",MyNum); UNLOCK(Global->idlock);
+#endif
+
+ upriv = (double *) malloc(2*(rootN-1)*sizeof(double));
+ if (upriv == NULL) {
+ printf("Proc %d could not malloc memory for upriv\n",MyNum);
+ exit(-1);
+ }
+ for (i=0;i<2*(rootN-1);i++) {
+ upriv[i] = umain[i];
+ }
+
+ MyFirst = rootN*MyNum/P;
+ MyLast = rootN*(MyNum+1)/P;
+
+ TouchArray(x, trans, umain2, upriv, N, MyNum, MyFirst, MyLast);
+
+ BARRIER(Global->start, P, 2);
+#ifdef FFT_DBG
+ LOCK(Global->idlock); printf("Slave %d left barrier 2\n",MyNum); UNLOCK(Global->idlock);
+#endif
+
+/* POSSIBLE ENHANCEMENT: Here is where one might reset the
+ statistics that one is measuring about the parallel execution */
+
+ if ((MyNum == 0) || (dostats)) {
+ CLOCK(initdone);
+ }
+
+ /* perform forward FFT */
+ FFT1D(1, M, N, x, trans, upriv, umain2, MyNum, &l_transtime, MyFirst,
+ MyLast, pad_length, P, test_result, doprint, dostats, Global);
+
+ /* perform backward FFT */
+ if (test_result) {
+ FFT1D(-1, M, N, x, trans, upriv, umain2, MyNum, &l_transtime, MyFirst,
+ MyLast, pad_length, P, test_result, doprint, dostats, Global);
+ }
+
+ if ((MyNum == 0) || (dostats)) {
+ CLOCK(finish);
+ Global->transtimes[MyNum] = l_transtime;
+ Global->totaltimes[MyNum] = finish-initdone;
+ }
+ if (MyNum == 0) {
+ Global->finishtime = finish;
+ Global->initdonetime = initdone;
+ }
+
+ thread_array[MyNum] = 1;
+}
+
+
+double TouchArray(x, scratch, u, upriv, N, MyNum, MyFirst, MyLast)
+
+double *x;
+double *scratch;
+double *u;
+double *upriv;
+int N;
+int MyNum;
+int MyFirst;
+int MyLast;
+
+{
+ int i,j,k;
+ double tot = 0.0;
+
+ /* touch my data */
+ for (j=0;j<2*(rootN-1);j++) {
+ tot += upriv[j];
+ }
+ for (j=MyFirst; j<MyLast; j++) {
+ k = j * (rootN + pad_length);
+ for (i=0;i<rootN;i++) {
+ tot += x[2*(k+i)] + x[2*(k+i)+1] +
+ scratch[2*(k+i)] + scratch[2*(k+i)+1] +
+ u[2*(k+i)] + u[2*(k+i)+1];
+ }
+ }
+ return tot;
+}
+
+
+double CheckSum(N, x)
+
+int N;
+double *x;
+
+{
+ int i,j,k;
+ double cks;
+
+ cks = 0.0;
+ for (j=0; j<rootN; j++) {
+ k = j * (rootN + pad_length);
+ for (i=0;i<rootN;i++) {
+ cks += x[2*(k+i)] + x[2*(k+i)+1];
+ }
+ }
+
+ return(cks);
+}
+
+
+InitX(N, x)
+
+int N;
+double *x;
+
+{
+ int i,j,k;
+
+ srand(0);
+ for (j=0; j<rootN; j++) {
+ k = j * (rootN + pad_length);
+ for (i=0;i<rootN;i++) {
+ x[2*(k+i)] = rand()/MAXRAND;
+ x[2*(k+i)+1] = rand()/MAXRAND;
+ }
+ }
+}
+
+
+InitU(N, u)
+
+int N;
+double *u;
+
+{
+ int q;
+ int j;
+ int base;
+ int n1;
+
+ for (q=0; 1<<q<N; q++) {
+ n1 = 1<<q;
+ base = n1-1;
+ for (j=0; j<n1; j++) {
+ if (base+j > rootN-1) {
+ return;
+ }
+ u[2*(base+j)] = cos(2.0*PI*j/(2*n1));
+ u[2*(base+j)+1] = -sin(2.0*PI*j/(2*n1));
+ }
+ }
+}
+
+
+InitU2(N, u, n1)
+
+int N;
+double *u;
+int n1;
+
+{
+ int i,j,k;
+ int base;
+
+ for (j=0; j<n1; j++) {
+ k = j*(rootN+pad_length);
+ for (i=0; i<n1; i++) {
+ u[2*(k+i)] = cos(2.0*PI*i*j/(N));
+ u[2*(k+i)+1] = -sin(2.0*PI*i*j/(N));
+ }
+ }
+}
+
+
+BitReverse(M, k)
+
+int M;
+int k;
+
+{
+ int i;
+ int j;
+ int tmp;
+
+ j = 0;
+ tmp = k;
+ for (i=0; i<M; i++) {
+ j = 2*j + (tmp&0x1);
+ tmp = tmp>>1;
+ }
+ return(j);
+}
+
+
+void FFT1D(direction, M, N, x, scratch, upriv, umain2, MyNum, l_transtime,
+ MyFirst, MyLast, pad_length, P, test_result, doprint, dostats,
+ Global)
+
+int direction;
+int M;
+int N;
+int *l_transtime;
+double *x;
+double *upriv;
+double *scratch;
+double *umain2;
+int MyFirst;
+int MyLast;
+int pad_length;
+int P;
+int test_result;
+int doprint;
+int dostats;
+struct GlobalMemory *Global;
+
+{
+ int i;
+ int j;
+ int m1;
+ int n1;
+ int flag = 0;
+ unsigned int clocktime1;
+ unsigned int clocktime2;
+
+ m1 = M/2;
+ n1 = 1<<m1;
+
+ BARRIER(Global->start, P, 3);
+#ifdef FFT_DBG
+ LOCK(Global->idlock); printf("Slave %d left barrier 3\n",MyNum); UNLOCK(Global->idlock);
+#endif
+
+ if ((MyNum == 0) || (dostats)) {
+ CLOCK(clocktime1);
+ }
+
+ /* transpose from x into scratch */
+ Transpose(n1, x, scratch, MyNum, MyFirst, MyLast, pad_length);
+
+ if ((MyNum == 0) || (dostats)) {
+ CLOCK(clocktime2);
+ *l_transtime += (clocktime2-clocktime1);
+ }
+
+ /* do n1 1D FFTs on columns */
+ for (j=MyFirst; j<MyLast; j++) {
+ FFT1DOnce(direction, m1, n1, upriv, &scratch[2*j*(n1+pad_length)]);
+ TwiddleOneCol(direction, n1, N, j, umain2, &scratch[2*j*(n1+pad_length)],
+ pad_length);
+ }
+
+ BARRIER(Global->start, P, 4);
+#ifdef FFT_DBG
+ LOCK(Global->idlock); printf("Slave %d left barrier 4\n",MyNum); UNLOCK(Global->idlock);
+#endif
+
+ if ((MyNum == 0) || (dostats)) {
+ CLOCK(clocktime1);
+ }
+ /* transpose */
+ Transpose(n1, scratch, x, MyNum, MyFirst, MyLast, pad_length);
+
+ if ((MyNum == 0) || (dostats)) {
+ CLOCK(clocktime2);
+ *l_transtime += (clocktime2-clocktime1);
+ }
+
+ /* do n1 1D FFTs on columns again */
+ for (j=MyFirst; j<MyLast; j++) {
+ FFT1DOnce(direction, m1, n1, upriv, &x[2*j*(n1+pad_length)]);
+ if (direction == -1)
+ Scale(n1, N, &x[2*j*(n1+pad_length)]);
+ }
+
+ BARRIER(Global->start, P, 5);
+#ifdef FFT_DBG
+ LOCK(Global->idlock); printf("Slave %d left barrier 5\n",MyNum); UNLOCK(Global->idlock);
+#endif
+
+ if ((MyNum == 0) || (dostats)) {
+ CLOCK(clocktime1);
+ }
+
+ /* transpose back */
+ Transpose(n1, x, scratch, MyNum, MyFirst, MyLast, pad_length);
+
+ if ((MyNum == 0) || (dostats)) {
+ CLOCK(clocktime2);
+ *l_transtime += (clocktime2-clocktime1);
+ }
+
+ BARRIER(Global->start, P, 6);
+#ifdef FFT_DBG
+ LOCK(Global->idlock); printf("Slave %d left barrier 6\n",MyNum); UNLOCK(Global->idlock);
+#endif
+
+ /* copy columns from scratch to x */
+ if ((test_result) || (doprint)) {
+ for (j=MyFirst; j<MyLast; j++) {
+ CopyColumn(n1, &scratch[2*j*(n1+pad_length)], &x[2*j*(n1+pad_length)]);
+ }
+ }
+
+ BARRIER(Global->start, P, 7);
+}
+
+
+TwiddleOneCol(direction, n1, N, j, u, x, pad_length)
+
+int direction;
+int n1;
+int N;
+int j;
+double *u;
+double *x;
+int pad_length;
+
+{
+ int i;
+ double omega_r;
+ double omega_c;
+ double x_r;
+ double x_c;
+ double r1;
+ double c1;
+ double r2;
+ double c2;
+
+ for (i=0; i<n1; i++) {
+ omega_r = u[2*(j*(n1+pad_length)+i)];
+ omega_c = direction*u[2*(j*(n1+pad_length)+i)+1];
+ x_r = x[2*i];
+ x_c = x[2*i+1];
+ x[2*i] = omega_r*x_r - omega_c*x_c;
+ x[2*i+1] = omega_r*x_c + omega_c*x_r;
+ }
+}
+
+
+Scale(n1, N, x)
+
+int n1;
+int N;
+double *x;
+
+{
+ int i;
+
+ for (i=0; i<n1; i++) {
+ x[2*i] /= N;
+ x[2*i+1] /= N;
+ }
+}
+
+
+Transpose(n1, src, dest, MyNum, MyFirst, MyLast, pad_length)
+
+int n1;
+double *src;
+double *dest;
+int MyNum;
+int MyFirst;
+int MyLast;
+int pad_length;
+
+{
+ int i;
+ int j;
+ int k;
+ int l;
+ int m;
+ int blksize;
+ int numblks;
+ int firstfirst;
+ int h_off;
+ int v_off;
+ int v;
+ int h;
+ int n1p;
+ int row_count;
+
+ blksize = MyLast-MyFirst;
+ numblks = (2*blksize)/num_cache_lines;
+ if (numblks * num_cache_lines != 2 * blksize) {
+ numblks ++;
+ }
+ blksize = blksize / numblks;
+ firstfirst = MyFirst;
+ row_count = n1/P;
+ n1p = n1+pad_length;
+ for (l=MyNum+1;l<P;l++) {
+ v_off = l*row_count;
+ for (k=0; k<numblks; k++) {
+ h_off = firstfirst;
+ for (m=0; m<numblks; m++) {
+ for (i=0; i<blksize; i++) {
+ v = v_off + i;
+ for (j=0; j<blksize; j++) {
+ h = h_off + j;
+ dest[2*(h*n1p+v)] = src[2*(v*n1p+h)];
+ dest[2*(h*n1p+v)+1] = src[2*(v*n1p+h)+1];
+ }
+ }
+ h_off += blksize;
+ }
+ v_off+=blksize;
+ }
+ }
+
+ for (l=0;l<MyNum;l++) {
+ v_off = l*row_count;
+ for (k=0; k<numblks; k++) {
+ h_off = firstfirst;
+ for (m=0; m<numblks; m++) {
+ for (i=0; i<blksize; i++) {
+ v = v_off + i;
+ for (j=0; j<blksize; j++) {
+ h = h_off + j;
+ dest[2*(h*n1p+v)] = src[2*(v*n1p+h)];
+ dest[2*(h*n1p+v)+1] = src[2*(v*n1p+h)+1];
+ }
+ }
+ h_off += blksize;
+ }
+ v_off+=blksize;
+ }
+ }
+
+ v_off = MyNum*row_count;
+ for (k=0; k<numblks; k++) {
+ h_off = firstfirst;
+ for (m=0; m<numblks; m++) {
+ for (i=0; i<blksize; i++) {
+ v = v_off + i;
+ for (j=0; j<blksize; j++) {
+ h = h_off + j;
+ dest[2*(h*n1p+v)] = src[2*(v*n1p+h)];
+ dest[2*(h*n1p+v)+1] = src[2*(v*n1p+h)+1];
+ }
+ }
+ h_off += blksize;
+ }
+ v_off+=blksize;
+ }
+}
+
+
+CopyColumn(n1, src, dest)
+
+int n1;
+double *src;
+double *dest;
+
+{
+ int i;
+
+ for (i=0; i<n1; i++) {
+ dest[2*i] = src[2*i];
+ dest[2*i+1] = src[2*i+1];
+ }
+}
+
+
+Reverse(N, M, x)
+
+int N;
+int M;
+double *x;
+
+{
+ int j, k;
+
+ for (k=0; k<N; k++) {
+ j = BitReverse(M, k);
+ if (j > k) {
+ SWAP(x[2*j], x[2*k]);
+ SWAP(x[2*j+1], x[2*k+1]);
+ }
+ }
+}
+
+
+FFT1DOnce(direction, M, N, u, x)
+
+int direction;
+int M;
+int N;
+double *u;
+double *x;
+
+{
+ int j;
+ int k;
+ int q;
+ int L;
+ int r;
+ int Lstar;
+ double *u1;
+ double *x1;
+ double *x2;
+ double omega_r;
+ double omega_c;
+ double tau_r;
+ double tau_c;
+ double x_r;
+ double x_c;
+
+ Reverse(N, M, x);
+
+ for (q=1; q<=M; q++) {
+ L = 1<<q; r = N/L; Lstar = L/2;
+ u1 = &u[2*(Lstar-1)];
+ for (k=0; k<r; k++) {
+ x1 = &x[2*(k*L)];
+ x2 = &x[2*(k*L+Lstar)];
+ for (j=0; j<Lstar; j++) {
+ omega_r = u1[2*j];
+ omega_c = direction*u1[2*j+1];
+ x_r = x2[2*j];
+ x_c = x2[2*j+1];
+ tau_r = omega_r*x_r - omega_c*x_c;
+ tau_c = omega_r*x_c + omega_c*x_r;
+ x_r = x1[2*j];
+ x_c = x1[2*j+1];
+ x2[2*j] = x_r - tau_r;
+ x2[2*j+1] = x_c - tau_c;
+ x1[2*j] = x_r + tau_r;
+ x1[2*j+1] = x_c + tau_c;
+ }
+ }
+ }
+}
+
+
+PrintArray(N, x)
+
+int N;
+double *x;
+
+{
+ int i, j, k;
+
+ for (i=0; i<rootN; i++) {
+ k = i*(rootN+pad_length);
+ for (j=0; j<rootN; j++) {
+ printf(" %4.2f %4.2f", x[2*(k+j)], x[2*(k+j)+1]);
+ if (i*rootN+j != N-1) {
+ printf(",");
+ }
+ if ((i*rootN+j+1) % 8 == 0) {
+ printf("\n");
+ }
+ }
+ }
+ printf("\n");
+ printf("\n");
+}
+
+
+void printerr(s)
+
+char *s;
+
+{
+ printf("ERROR: %s\n",s);
+}
+
+
+int log_2(number)
+
+int number;
+
+{
+ int cumulative = 1;
+ int out = 0;
+ int done = 0;
+
+ while ((cumulative < number) && (!done) && (out < 50)) {
+ if (cumulative == number) {
+ done = 1;
+ } else {
+ cumulative = cumulative * 2;
+ out ++;
+ }
+ }
+
+ if (cumulative == number) {
+ return(out);
+ } else {
+ return(-1);
+ }
+}
+
+#include <tests_smp/getopt.c>
diff --git a/cesar/ecos/packages/kernel/current/tests_smp/fft.c b/cesar/ecos/packages/kernel/current/tests_smp/fft.c
new file mode 100644
index 0000000000..0222677c33
--- /dev/null
+++ b/cesar/ecos/packages/kernel/current/tests_smp/fft.c
@@ -0,0 +1,1460 @@
+
+
+
+
+
+
+
+
+
+
+
+/*************************************************************************/
+/* */
+/* Copyright (c) 1994 Stanford University */
+/* */
+/* All rights reserved. */
+/* */
+/* Permission is given to use, copy, and modify this software for any */
+/* non-commercial purpose as long as this copyright notice is not */
+/* removed. All other uses, including redistribution in whole or in */
+/* part, are forbidden without prior written permission. */
+/* */
+/* This software is provided with absolutely no warranty and no */
+/* support. */
+/* */
+/*************************************************************************/
+
+/*************************************************************************/
+/* */
+/* Perform 1D fast Fourier transform using six-step FFT method */
+/* */
+/* 1) Performs staggered, blocked transposes for cache-line reuse */
+/* 2) Roots of unity rearranged and distributed for only local */
+/* accesses during application of roots of unity */
+/* 3) Small set of roots of unity elements replicated locally for */
+/* 1D FFTs (less than root N elements replicated at each node) */
+/* 4) Matrix data structures are padded to reduce cache mapping */
+/* conflicts */
+/* */
+/* Command line options: */
+/* */
+/* -mM : M = even integer; 2**M total complex data points transformed. */
+/* -pP : P = number of processors; Must be a power of 2. */
+/* -nN : N = number of cache lines. */
+/* -lL : L = Log base 2 of cache line length in bytes. */
+/* -s : Print individual processor timing statistics. */
+/* -t : Perform FFT and inverse FFT. Test output by comparing the */
+/* integral of the original data to the integral of the data */
+/* that results from performing the FFT and inverse FFT. */
+/* -o : Print out complex data points. */
+/* -h : Print out command line options. */
+/* */
+/* Note: This version works under both the FORK and SPROC models */
+/* */
+/*************************************************************************/
+
+#include <stdio.h>
+#include <math.h>
+#include <tests_smp/getopt.h>
+
+#define MAXRAND 2147483647
+#define PAGE_SIZE 4096
+#define NUM_CACHE_LINES 65536
+#define LOG2_LINE_SIZE 4
+#define PI 3.1416
+#define DEFAULT_M 10
+#define DEFAULT_P 1
+
+#include <tests_smp/fft_arg.c>
+
+
+/*---------------------------------------*/
+#include <pkgconf/system.h>
+#include <pkgconf/infra.h>
+#include <pkgconf/kernel.h>
+#include <cyg/infra/cyg_type.h>
+#include <cyg/infra/testcase.h>
+#include <pkgconf/isoinfra.h>
+#include <cyg/hal/hal_cache.h>
+#include <cyg/hal/hal_smp.h>
+#include <cyg/kernel/kapi.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+int smp_cyg_test_main(int argc, char **argv);
+
+void smp_cyg_test_main_call(void *p) {
+ smp_cyg_test_main(smp_cyg_test_argc, smp_cyg_test_argv);
+}
+
+
+/* #define STACK_SIZE 0x1b000 */
+#define STACK_SIZE 8192
+static cyg_thread smp_cyg_test_thread;
+static char smp_cyg_test_stack[STACK_SIZE];
+static cyg_handle_t smp_cyg_test_threadh;
+
+
+
+cyg_handle_t threadh[64];
+char *stack[64];
+cyg_thread thread[64];
+
+
+externC void cyg_user_start( void )
+{
+ CYG_TEST_INIT();
+
+ diag_printf("Starting test app\n");
+
+ cyg_thread_create(10, // Priority - just a number
+ smp_cyg_test_main_call, // entry
+ 0, // index
+ "smp test", // Name
+ smp_cyg_test_stack, // Stack
+ STACK_SIZE, // Size
+ &smp_cyg_test_threadh, // Handle
+ &smp_cyg_test_thread // Thread data structure
+ );
+ cyg_thread_resume( smp_cyg_test_threadh );
+ /*cyg_scheduler_start();*/
+}
+
+
+/*---------------------------------------*/
+
+
+#define SWAP(a,b) {double tmp; tmp=a; a=b; b=tmp;}
+
+struct GlobalMemory {
+ int id;
+ cyg_spinlock_t idlock;
+
+/*---------------------------------------*/
+struct startTYP {
+ cyg_spinlock_t lock;;
+ int count[1];
+ cyg_sem_t queue[1];;
+
+ } start[1];
+/*---------------------------------------*/
+
+
+ int *transtimes;
+ int *totaltimes;
+ int starttime;
+ int finishtime;
+ int initdonetime;
+} *Global;
+
+int dbg_on = 0;
+
+int P = DEFAULT_P;
+int M = DEFAULT_M;
+int N; /* N = 2^M */
+int rootN; /* rootN = N^1/2 */
+double *x; /* x is the original time-domain data */
+double *trans; /* trans is used as scratch space */
+double *umain; /* umain is roots of unity for 1D FFTs */
+double *umain2; /* umain2 is entire roots of unity matrix */
+int test_result = 0;
+int doprint = 0;
+int dostats = 0;
+int transtime = 0;
+int transtime2 = 0;
+int avgtranstime = 0;
+int avgcomptime = 0;
+unsigned int transstart = 0;
+unsigned int transend = 0;
+int maxtotal=0;
+int mintotal=0;
+double maxfrac=0;
+double minfrac=0;
+double avgfractime=0;
+int orig_num_lines = NUM_CACHE_LINES; /* number of cache lines */
+int num_cache_lines = NUM_CACHE_LINES; /* number of cache lines */
+int log2_line_size = LOG2_LINE_SIZE;
+int line_size;
+int rowsperproc;
+double ck1;
+double ck3; /* checksums for testing answer */
+int pad_length;
+
+void SlaveStart();
+double TouchArray(double *,double *,double *,double *,int,int,int,int);
+void FFT1D(int,int,int,double *,double *,double *,double *,int,int *,int,
+ int,int,int,int,int,int,struct GlobalMemory *);
+double CheckSum();
+double drand48();
+int log_2(int);
+void printerr(char *);
+volatile int thread_array[4] = {0, 0, 0, 0};
+
+int smp_cyg_test_main(int argc, char **argv)
+{
+ int i;
+ int j;
+ int c;
+ extern char *optarg;
+ int m1;
+ int factor;
+ int pages;
+ unsigned int start;
+ //memset (thread_array,0,sizeof(thread_array));
+
+ {
+ (start) = cyg_current_time()*10;
+}
+;
+
+
+// extern Cyg_Mempool_dlmalloc *cygmem_memalloc_heaps[ 2 ];
+// cygmem_memalloc_heaps[0].
+
+
+ // printf("<cpu%d> thread %d Arguments (%d): ",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),argc);
+// for (i = 0;i < argc;i++){
+// printf("%s ",argv[i]);
+// }
+// printf("\n");
+
+ while ((c = getopt(argc, argv, "p:m:n:l:stoh")) != -1) {
+ switch(c) {
+ case 'p': P = atoi(optarg);
+ P = HAL_SMP_CPU_MAX;
+ if (P < 1) {
+ printerr("P must be >= 1\n");
+ exit(-1);
+ }
+ if (log_2(P) == -1) {
+ printerr("P must be a power of 2\n");
+ exit(-1);
+ }
+ break;
+ case 'm': M = atoi(optarg);
+ m1 = M/2;
+ if (2*m1 != M) {
+ printerr("M must be even\n");
+ exit(-1);
+ }
+ break;
+ case 'n': num_cache_lines = atoi(optarg);
+ orig_num_lines = num_cache_lines;
+ if (num_cache_lines < 1) {
+ printerr("Number of cache lines must be >= 1\n");
+ exit(-1);
+ }
+ break;
+ case 'l': log2_line_size = atoi(optarg);
+ if (log2_line_size < 0) {
+ printerr("Log base 2 of cache line length in bytes must be >= 0\n");
+ exit(-1);
+ }
+ break;
+ case 's': dostats = !dostats;
+ break;
+ case 't': test_result = !test_result;
+ break;
+ case 'o': doprint = !doprint;
+ break;
+ case 'h': printf("Usage: FFT <options>\n\n");
+ printf("options:\n");
+ printf(" -mM : M = even integer; 2**M total complex data points transformed.\n");
+ printf(" -pP : P = number of processors; Must be a power of 2.\n");
+ printf(" -nN : N = number of cache lines.\n");
+ printf(" -lL : L = Log base 2 of cache line length in bytes.\n");
+ printf(" -s : Print individual processor timing statistics.\n");
+ printf(" -t : Perform FFT and inverse FFT. Test output by comparing the\n");
+ printf(" integral of the original data to the integral of the data that\n");
+ printf(" results from performing the FFT and inverse FFT.\n");
+ printf(" -o : Print out complex data points.\n");
+ printf(" -h : Print out command line options.\n\n");
+ printf("Default: FFT -m%1d -p%1d -n%1d -l%1d\n",
+ DEFAULT_M,DEFAULT_P,NUM_CACHE_LINES,LOG2_LINE_SIZE);
+ exit(0);
+ break;
+ }
+ }
+
+ if (P > 64) {
+ printf("Maximal 64 processes\n");
+ exit(0);
+ }
+
+ {;};
+
+ N = 1<<M;
+ rootN = 1<<(M/2);
+ rowsperproc = rootN/P;
+ if (rowsperproc == 0) {
+ printerr("Matrix not large enough. 2**(M/2) must be >= P\n");
+ exit(-1);
+ }
+
+ line_size = 1 << log2_line_size;
+ if (line_size < 2*sizeof(double)) {
+ printf("WARNING: Each element is a complex double (%d bytes)\n",2*sizeof(double));
+ printf(" => Less than one element per cache line\n");
+ printf(" Computing transpose blocking factor\n");
+ factor = (2*sizeof(double)) / line_size;
+ num_cache_lines = orig_num_lines / factor;
+ }
+ if (line_size <= 2*sizeof(double)) {
+ pad_length = 1;
+ } else {
+ pad_length = line_size / (2*sizeof(double));
+ }
+
+ if (rowsperproc * rootN * 2 * sizeof(double) >= PAGE_SIZE) {
+ pages = (2 * pad_length * sizeof(double) * rowsperproc) / PAGE_SIZE;
+ if (pages * PAGE_SIZE != 2 * pad_length * sizeof(double) * rowsperproc) {
+ pages ++;
+ }
+ pad_length = (pages * PAGE_SIZE) / (2 * sizeof(double) * rowsperproc);
+ } else {
+ pad_length = (PAGE_SIZE - (rowsperproc * rootN * 2 * sizeof(double))) /
+
+ (2 * sizeof(double) * rowsperproc);
+ if (pad_length * (2 * sizeof(double) * rowsperproc) !=
+ (PAGE_SIZE - (rowsperproc * rootN * 2 * sizeof(double)))) {
+ printerr("Padding algorithm unsuccessful\n");
+ exit(-1);
+ }
+ }
+
+ Global = (struct GlobalMemory *) calloc(sizeof(struct GlobalMemory), 1);;
+ if (Global == NULL) {
+ printf("Could not malloc memory for Global (%d)\n",sizeof(struct GlobalMemory));
+ exit(-1);
+ }
+
+
+ x = (double *) calloc(2*(N+rootN*pad_length)*sizeof(double)+PAGE_SIZE, 1);;
+ trans = (double *) calloc(2*(N+rootN*pad_length)*sizeof(double)+PAGE_SIZE, 1);;
+ umain = (double *) calloc(2*rootN*sizeof(double), 1);;
+ umain2 = (double *) calloc(2*(N+rootN*pad_length)*sizeof(double)+PAGE_SIZE, 1);;
+
+
+ Global->transtimes = (int *) calloc(P*sizeof(int), 1);;
+ Global->totaltimes = (int *) calloc(P*sizeof(int), 1);;
+ if (x == NULL) {
+ printerr("Could not malloc memory for x\n");
+ exit(-1);
+ } else if (trans == NULL) {
+ printerr("Could not malloc memory for trans\n");
+ exit(-1);
+ } else if (umain == NULL) {
+ printerr("Could not malloc memory for umain\n");
+ exit(-1);
+ } else if (umain2 == NULL) {
+ printerr("Could not malloc memory for umain2\n");
+ exit(-1);
+ }
+
+ x = (double *) (((unsigned) x) + PAGE_SIZE - ((unsigned) x) % PAGE_SIZE);
+ trans = (double *) (((unsigned) trans) + PAGE_SIZE - ((unsigned) trans) % PAGE_SIZE);
+ umain2 = (double *) (((unsigned) umain2) + PAGE_SIZE - ((unsigned) umain2) % PAGE_SIZE);
+
+/* In order to optimize data distribution, the data structures x, trans,
+ and umain2 have been aligned so that each begins on a page boundary.
+ This ensures that the amount of padding calculated by the program is
+ such that each processor's partition ends on a page boundary, thus
+ ensuring that all data from these structures that are needed by a
+ processor can be allocated to its local memory */
+
+/* POSSIBLE ENHANCEMENT: Here is where one might distribute the x,
+ trans, and umain2 data structures across physically distributed
+ memories as desired.
+
+ One way to place data is as follows:
+
+ double *base;
+ int i;
+
+ i = ((N/P)+(rootN/P)*pad_length)*2;
+ base = &(x[0]);
+ for (j=0;j<P;j++) {
+ Place all addresses x such that (base <= x < base+i) on node j
+ base += i;
+ }
+
+ The trans and umain2 data structures can be placed in a similar manner.
+
+ */
+
+ printf("\n");
+ printf("FFT with Blocking Transpose\n");
+ printf(" %d Complex Doubles\n",N);
+ printf(" %d Threads\n",P);
+ if (num_cache_lines != orig_num_lines) {
+ printf(" %d Cache lines\n",orig_num_lines);
+ printf(" %d Cache lines for blocking transpose\n",num_cache_lines);
+ } else {
+ printf(" %d Cache lines\n",num_cache_lines);
+ }
+ printf(" %d Byte line size\n",(1 << log2_line_size));
+ printf(" %d Bytes per page\n",PAGE_SIZE);
+ printf("\n");
+
+ {{
+/*---------------------------------------*/
+ int mon_dum1,mon_dum2;
+ for (mon_dum1=0; mon_dum1 < 1; mon_dum1++)
+ for (mon_dum2=0; mon_dum2 < 1; mon_dum2++) {
+ Global->start[mon_dum1].count[mon_dum2] = 0;
+ { cyg_semaphore_init(&Global->start[mon_dum1].queue[mon_dum2],0);};
+ }
+ for (mon_dum1=0; mon_dum1 < 1; mon_dum1++) {
+ cyg_spinlock_init( &Global->start[mon_dum1].lock,0 );;
+ }
+/*---------------------------------------*/
+}};
+ cyg_spinlock_init( &Global->idlock,0 );;
+ Global->id = 0;
+ InitX(N, x); /* place random values in x */
+
+ if (test_result) {
+ ck1 = CheckSum(N, x);
+ }
+ if (doprint) {
+ printf("Original data values:\n");
+ PrintArray(N, x);
+ }
+
+ InitU(N,umain); /* initialize u arrays*/
+ InitU2(N,umain2,rootN);
+
+ /* fire off P processes */
+ for (i=1; i<P; i++) {
+
+{
+ /*---------------------------------------*/
+ stack[i] = calloc(STACK_SIZE, 1);
+ cyg_thread_create(10, // Priority - just a number
+ SlaveStart, // entry
+ 0, // index
+ "slave", // Name
+ stack[i], // Stack
+ STACK_SIZE, // Size
+ &threadh[i], // Handle
+ &thread[i] // Thread data structure
+ );
+ cyg_thread_resume( threadh[i] );
+ /*---------------------------------------*/
+}
+;
+ }
+
+ SlaveStart();
+
+ for (i=1; i<P; i++) {
+ while(!thread_array[i]) {};
+ }
+
+ if (doprint) {
+ if (test_result) {
+ printf("Data values after inverse FFT:\n");
+ } else {
+ printf("Data values after FFT:\n");
+ }
+ PrintArray(N, x);
+ }
+
+ transtime = Global->transtimes[0];
+ printf("\n");
+ printf(" PROCESS STATISTICS\n");
+ printf(" Computation Transpose Transpose\n");
+ printf(" Proc Time Time Fraction\n");
+ printf(" 0 %10d %10d %8.5f\n",
+ Global->totaltimes[0],Global->transtimes[0],
+ ((double)Global->transtimes[0])/Global->totaltimes[0]);
+ if (dostats) {
+ transtime2 = Global->transtimes[0];
+ avgtranstime = Global->transtimes[0];
+ avgcomptime = Global->totaltimes[0];
+ maxtotal = Global->totaltimes[0];
+ mintotal = Global->totaltimes[0];
+ maxfrac = ((double)Global->transtimes[0])/Global->totaltimes[0];
+ minfrac = ((double)Global->transtimes[0])/Global->totaltimes[0];
+ avgfractime = ((double)Global->transtimes[0])/Global->totaltimes[0];
+ for (i=1;i<P;i++) {
+ if (Global->transtimes[i] > transtime) {
+ transtime = Global->transtimes[i];
+ }
+ if (Global->transtimes[i] < transtime2) {
+ transtime2 = Global->transtimes[i];
+ }
+ if (Global->totaltimes[i] > maxtotal) {
+ maxtotal = Global->totaltimes[i];
+ }
+ if (Global->totaltimes[i] < mintotal) {
+ mintotal = Global->totaltimes[i];
+ }
+ if (((double)Global->transtimes[i])/Global->totaltimes[i] > maxfrac) {
+ maxfrac = ((double)Global->transtimes[i])/Global->totaltimes[i];
+ }
+ if (((double)Global->transtimes[i])/Global->totaltimes[i] < minfrac) {
+ minfrac = ((double)Global->transtimes[i])/Global->totaltimes[i];
+ }
+ printf(" %3d %10d %10d %f\n",
+ i,Global->totaltimes[i],Global->transtimes[i],
+ ((double)Global->transtimes[i])/Global->totaltimes[i]);
+ avgtranstime += Global->transtimes[i];
+ avgcomptime += Global->totaltimes[i];
+ avgfractime += ((double)Global->transtimes[i])/Global->totaltimes[i];
+ }
+ printf(" Avg %10f %10f %f\n",
+ ((double) avgcomptime)/P,((double) avgtranstime)/P,avgfractime/P);
+ printf(" Max %10d %10d %f\n",
+ maxtotal,transtime,maxfrac);
+ printf(" Min %10d %10d %f\n",
+ mintotal,transtime2,minfrac);
+ }
+ Global->starttime = start;
+ printf("\n");
+ printf(" TIMING INFORMATION\n");
+ printf("Start time : %16d\n",
+ Global->starttime);
+ printf("Initialization finish time : %16d\n",
+ Global->initdonetime);
+ printf("Overall finish time : %16d\n",
+ Global->finishtime);
+ printf("Total time with initialization : %16d\n",
+ Global->finishtime-Global->starttime);
+ printf("Total time without initialization : %16d\n",
+ Global->finishtime-Global->initdonetime);
+ printf("Overall transpose time : %16d\n",
+ transtime);
+ printf("Overall transpose fraction : %f\n",
+ ((double) transtime)/(Global->finishtime-Global->initdonetime));
+ printf("\n");
+
+ if (test_result) {
+ ck3 = CheckSum(N, x);
+ printf(" INVERSE FFT TEST RESULTS\n");
+ printf("Checksum difference is %f (%f, %f)\n",
+ ck1-ck3, ck1, ck3);
+ if (fabs(ck1-ck3) < 0.001) {
+ printf("TEST PASSED\n");
+ } else {
+ printf("TEST FAILED\n");
+ }
+ }
+
+
+{
+ diag_printf("FP test end\n");
+ return 0;
+}
+;
+}
+
+
+void SlaveStart()
+{
+ int i;
+ int j;
+ int MyNum;
+ double error;
+ double *upriv;
+ int initdone;
+ int finish;
+ int l_transtime=0;
+ int MyFirst;
+ int MyLast;
+
+ cyg_spinlock_spin(&Global->idlock);;
+ MyNum = Global->id;
+ Global->id++;
+#ifdef FFT_DBG
+ printf("Slave %d started\n",MyNum);
+#endif
+ cyg_spinlock_clear(&Global->idlock);;
+
+
+/* POSSIBLE ENHANCEMENT: Here is where one might pin processes to
+ processors to avoid migration */
+
+ {
+/*---------------------------------*/
+ cyg_spinlock_spin(&( Global->start[0].lock ) );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: >b (%d:%d:%d): Enter barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),1,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ if ( Global->start[0].count[0] < (P -1) ) {
+ Global->start[0].count[0]++;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ?b (%d:%d:%d): Wait\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),1,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ cyg_spinlock_clear(&Global->start[0].lock );;
+ { cyg_semaphore_wait(&Global->start[0].queue[0] );};
+ }
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: <b (%d:%d:%d): Exit barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),1,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ if ( Global->start[0].count[0] == 0 ) {
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ^b (%d:%d:%d): unlock barrier \n-----------\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),1,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+
+ cyg_spinlock_clear(&Global->start[0].lock );;
+
+ } else {
+ ( Global->start[0].count[0] )-- ;
+ //cyg_spinlock_clear(&Global->start[0].count_lock[0] );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: @b (%d:%d:%d): unlock queue\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),1,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ { cyg_semaphore_post(&Global->start[0].queue[0] );};
+ }
+
+/*---------------------------------*/
+};
+#ifdef FFT_DBG
+ cyg_spinlock_spin(&Global->idlock);; printf("Slave %d left barrier 1\n",MyNum); cyg_spinlock_clear(&Global->idlock);;
+#endif
+
+ upriv = (double *) malloc(2*(rootN-1)*sizeof(double));
+ if (upriv == NULL) {
+ printf("Proc %d could not malloc memory for upriv\n",MyNum);
+ exit(-1);
+ }
+ for (i=0;i<2*(rootN-1);i++) {
+ upriv[i] = umain[i];
+ }
+
+ MyFirst = rootN*MyNum/P;
+ MyLast = rootN*(MyNum+1)/P;
+
+ TouchArray(x, trans, umain2, upriv, N, MyNum, MyFirst, MyLast);
+
+ {
+/*---------------------------------*/
+ cyg_spinlock_spin(&( Global->start[0].lock ) );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: >b (%d:%d:%d): Enter barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),2,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ if ( Global->start[0].count[0] < (P -1) ) {
+ Global->start[0].count[0]++;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ?b (%d:%d:%d): Wait\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),2,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ cyg_spinlock_clear(&Global->start[0].lock );;
+ { cyg_semaphore_wait(&Global->start[0].queue[0] );};
+ }
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: <b (%d:%d:%d): Exit barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),2,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ if ( Global->start[0].count[0] == 0 ) {
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ^b (%d:%d:%d): unlock barrier \n-----------\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),2,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+
+ cyg_spinlock_clear(&Global->start[0].lock );;
+
+ } else {
+ ( Global->start[0].count[0] )-- ;
+ //cyg_spinlock_clear(&Global->start[0].count_lock[0] );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: @b (%d:%d:%d): unlock queue\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),2,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ { cyg_semaphore_post(&Global->start[0].queue[0] );};
+ }
+
+/*---------------------------------*/
+};
+#ifdef FFT_DBG
+ cyg_spinlock_spin(&Global->idlock);; printf("Slave %d left barrier 2\n",MyNum); cyg_spinlock_clear(&Global->idlock);;
+#endif
+
+/* POSSIBLE ENHANCEMENT: Here is where one might reset the
+ statistics that one is measuring about the parallel execution */
+
+ if ((MyNum == 0) || (dostats)) {
+ {
+ (initdone) = cyg_current_time()*10;
+}
+;
+ }
+
+ /* perform forward FFT */
+ FFT1D(1, M, N, x, trans, upriv, umain2, MyNum, &l_transtime, MyFirst,
+ MyLast, pad_length, P, test_result, doprint, dostats, Global);
+
+ /* perform backward FFT */
+ if (test_result) {
+ FFT1D(-1, M, N, x, trans, upriv, umain2, MyNum, &l_transtime, MyFirst,
+ MyLast, pad_length, P, test_result, doprint, dostats, Global);
+ }
+
+ if ((MyNum == 0) || (dostats)) {
+ {
+ (finish) = cyg_current_time()*10;
+}
+;
+ Global->transtimes[MyNum] = l_transtime;
+ Global->totaltimes[MyNum] = finish-initdone;
+ }
+ if (MyNum == 0) {
+ Global->finishtime = finish;
+ Global->initdonetime = initdone;
+ }
+
+ thread_array[MyNum] = 1;
+}
+
+
+double TouchArray(x, scratch, u, upriv, N, MyNum, MyFirst, MyLast)
+
+double *x;
+double *scratch;
+double *u;
+double *upriv;
+int N;
+int MyNum;
+int MyFirst;
+int MyLast;
+
+{
+ int i,j,k;
+ double tot = 0.0;
+
+ /* touch my data */
+ for (j=0;j<2*(rootN-1);j++) {
+ tot += upriv[j];
+ }
+ for (j=MyFirst; j<MyLast; j++) {
+ k = j * (rootN + pad_length);
+ for (i=0;i<rootN;i++) {
+ tot += x[2*(k+i)] + x[2*(k+i)+1] +
+ scratch[2*(k+i)] + scratch[2*(k+i)+1] +
+ u[2*(k+i)] + u[2*(k+i)+1];
+ }
+ }
+ return tot;
+}
+
+
+double CheckSum(N, x)
+
+int N;
+double *x;
+
+{
+ int i,j,k;
+ double cks;
+
+ cks = 0.0;
+ for (j=0; j<rootN; j++) {
+ k = j * (rootN + pad_length);
+ for (i=0;i<rootN;i++) {
+ cks += x[2*(k+i)] + x[2*(k+i)+1];
+ }
+ }
+
+ return(cks);
+}
+
+
+InitX(N, x)
+
+int N;
+double *x;
+
+{
+ int i,j,k;
+
+ srand(0);
+ for (j=0; j<rootN; j++) {
+ k = j * (rootN + pad_length);
+ for (i=0;i<rootN;i++) {
+ x[2*(k+i)] = rand()/MAXRAND;
+ x[2*(k+i)+1] = rand()/MAXRAND;
+ }
+ }
+}
+
+
+InitU(N, u)
+
+int N;
+double *u;
+
+{
+ int q;
+ int j;
+ int base;
+ int n1;
+
+ for (q=0; 1<<q<N; q++) {
+ n1 = 1<<q;
+ base = n1-1;
+ for (j=0; j<n1; j++) {
+ if (base+j > rootN-1) {
+ return;
+ }
+ u[2*(base+j)] = cos(2.0*PI*j/(2*n1));
+ u[2*(base+j)+1] = -sin(2.0*PI*j/(2*n1));
+ }
+ }
+}
+
+
+InitU2(N, u, n1)
+
+int N;
+double *u;
+int n1;
+
+{
+ int i,j,k;
+ int base;
+
+ for (j=0; j<n1; j++) {
+ k = j*(rootN+pad_length);
+ for (i=0; i<n1; i++) {
+ u[2*(k+i)] = cos(2.0*PI*i*j/(N));
+ u[2*(k+i)+1] = -sin(2.0*PI*i*j/(N));
+ }
+ }
+}
+
+
+BitReverse(M, k)
+
+int M;
+int k;
+
+{
+ int i;
+ int j;
+ int tmp;
+
+ j = 0;
+ tmp = k;
+ for (i=0; i<M; i++) {
+ j = 2*j + (tmp&0x1);
+ tmp = tmp>>1;
+ }
+ return(j);
+}
+
+
+void FFT1D(direction, M, N, x, scratch, upriv, umain2, MyNum, l_transtime,
+ MyFirst, MyLast, pad_length, P, test_result, doprint, dostats,
+ Global)
+
+int direction;
+int M;
+int N;
+int *l_transtime;
+double *x;
+double *upriv;
+double *scratch;
+double *umain2;
+int MyFirst;
+int MyLast;
+int pad_length;
+int P;
+int test_result;
+int doprint;
+int dostats;
+struct GlobalMemory *Global;
+
+{
+ int i;
+ int j;
+ int m1;
+ int n1;
+ int flag = 0;
+ unsigned int clocktime1;
+ unsigned int clocktime2;
+
+ m1 = M/2;
+ n1 = 1<<m1;
+
+ {
+/*---------------------------------*/
+ cyg_spinlock_spin(&( Global->start[0].lock ) );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: >b (%d:%d:%d): Enter barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),3,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ if ( Global->start[0].count[0] < (P -1) ) {
+ Global->start[0].count[0]++;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ?b (%d:%d:%d): Wait\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),3,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ cyg_spinlock_clear(&Global->start[0].lock );;
+ { cyg_semaphore_wait(&Global->start[0].queue[0] );};
+ }
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: <b (%d:%d:%d): Exit barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),3,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ if ( Global->start[0].count[0] == 0 ) {
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ^b (%d:%d:%d): unlock barrier \n-----------\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),3,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+
+ cyg_spinlock_clear(&Global->start[0].lock );;
+
+ } else {
+ ( Global->start[0].count[0] )-- ;
+ //cyg_spinlock_clear(&Global->start[0].count_lock[0] );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: @b (%d:%d:%d): unlock queue\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),3,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ { cyg_semaphore_post(&Global->start[0].queue[0] );};
+ }
+
+/*---------------------------------*/
+};
+#ifdef FFT_DBG
+ cyg_spinlock_spin(&Global->idlock);; printf("Slave %d left barrier 3\n",MyNum); cyg_spinlock_clear(&Global->idlock);;
+#endif
+
+ if ((MyNum == 0) || (dostats)) {
+ {
+ (clocktime1) = cyg_current_time()*10;
+}
+;
+ }
+
+ /* transpose from x into scratch */
+ Transpose(n1, x, scratch, MyNum, MyFirst, MyLast, pad_length);
+
+ if ((MyNum == 0) || (dostats)) {
+ {
+ (clocktime2) = cyg_current_time()*10;
+}
+;
+ *l_transtime += (clocktime2-clocktime1);
+ }
+
+ /* do n1 1D FFTs on columns */
+ for (j=MyFirst; j<MyLast; j++) {
+ FFT1DOnce(direction, m1, n1, upriv, &scratch[2*j*(n1+pad_length)]);
+ TwiddleOneCol(direction, n1, N, j, umain2, &scratch[2*j*(n1+pad_length)],
+ pad_length);
+ }
+
+ {
+/*---------------------------------*/
+ cyg_spinlock_spin(&( Global->start[0].lock ) );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: >b (%d:%d:%d): Enter barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),4,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ if ( Global->start[0].count[0] < (P -1) ) {
+ Global->start[0].count[0]++;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ?b (%d:%d:%d): Wait\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),4,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ cyg_spinlock_clear(&Global->start[0].lock );;
+ { cyg_semaphore_wait(&Global->start[0].queue[0] );};
+ }
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: <b (%d:%d:%d): Exit barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),4,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ if ( Global->start[0].count[0] == 0 ) {
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ^b (%d:%d:%d): unlock barrier \n-----------\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),4,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+
+ cyg_spinlock_clear(&Global->start[0].lock );;
+
+ } else {
+ ( Global->start[0].count[0] )-- ;
+ //cyg_spinlock_clear(&Global->start[0].count_lock[0] );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: @b (%d:%d:%d): unlock queue\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),4,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ { cyg_semaphore_post(&Global->start[0].queue[0] );};
+ }
+
+/*---------------------------------*/
+};
+#ifdef FFT_DBG
+ cyg_spinlock_spin(&Global->idlock);; printf("Slave %d left barrier 4\n",MyNum); cyg_spinlock_clear(&Global->idlock);;
+#endif
+
+ if ((MyNum == 0) || (dostats)) {
+ {
+ (clocktime1) = cyg_current_time()*10;
+}
+;
+ }
+ /* transpose */
+ Transpose(n1, scratch, x, MyNum, MyFirst, MyLast, pad_length);
+
+ if ((MyNum == 0) || (dostats)) {
+ {
+ (clocktime2) = cyg_current_time()*10;
+}
+;
+ *l_transtime += (clocktime2-clocktime1);
+ }
+
+ /* do n1 1D FFTs on columns again */
+ for (j=MyFirst; j<MyLast; j++) {
+ FFT1DOnce(direction, m1, n1, upriv, &x[2*j*(n1+pad_length)]);
+ if (direction == -1)
+ Scale(n1, N, &x[2*j*(n1+pad_length)]);
+ }
+
+ {
+/*---------------------------------*/
+ cyg_spinlock_spin(&( Global->start[0].lock ) );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: >b (%d:%d:%d): Enter barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),5,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ if ( Global->start[0].count[0] < (P -1) ) {
+ Global->start[0].count[0]++;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ?b (%d:%d:%d): Wait\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),5,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ cyg_spinlock_clear(&Global->start[0].lock );;
+ { cyg_semaphore_wait(&Global->start[0].queue[0] );};
+ }
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: <b (%d:%d:%d): Exit barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),5,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ if ( Global->start[0].count[0] == 0 ) {
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ^b (%d:%d:%d): unlock barrier \n-----------\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),5,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+
+ cyg_spinlock_clear(&Global->start[0].lock );;
+
+ } else {
+ ( Global->start[0].count[0] )-- ;
+ //cyg_spinlock_clear(&Global->start[0].count_lock[0] );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: @b (%d:%d:%d): unlock queue\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),5,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ { cyg_semaphore_post(&Global->start[0].queue[0] );};
+ }
+
+/*---------------------------------*/
+};
+#ifdef FFT_DBG
+ cyg_spinlock_spin(&Global->idlock);; printf("Slave %d left barrier 5\n",MyNum); cyg_spinlock_clear(&Global->idlock);;
+#endif
+
+ if ((MyNum == 0) || (dostats)) {
+ {
+ (clocktime1) = cyg_current_time()*10;
+}
+;
+ }
+
+ /* transpose back */
+ Transpose(n1, x, scratch, MyNum, MyFirst, MyLast, pad_length);
+
+ if ((MyNum == 0) || (dostats)) {
+ {
+ (clocktime2) = cyg_current_time()*10;
+}
+;
+ *l_transtime += (clocktime2-clocktime1);
+ }
+
+ {
+/*---------------------------------*/
+ cyg_spinlock_spin(&( Global->start[0].lock ) );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: >b (%d:%d:%d): Enter barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),6,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ if ( Global->start[0].count[0] < (P -1) ) {
+ Global->start[0].count[0]++;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ?b (%d:%d:%d): Wait\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),6,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ cyg_spinlock_clear(&Global->start[0].lock );;
+ { cyg_semaphore_wait(&Global->start[0].queue[0] );};
+ }
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: <b (%d:%d:%d): Exit barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),6,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ if ( Global->start[0].count[0] == 0 ) {
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ^b (%d:%d:%d): unlock barrier \n-----------\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),6,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+
+ cyg_spinlock_clear(&Global->start[0].lock );;
+
+ } else {
+ ( Global->start[0].count[0] )-- ;
+ //cyg_spinlock_clear(&Global->start[0].count_lock[0] );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: @b (%d:%d:%d): unlock queue\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),6,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ { cyg_semaphore_post(&Global->start[0].queue[0] );};
+ }
+
+/*---------------------------------*/
+};
+#ifdef FFT_DBG
+ cyg_spinlock_spin(&Global->idlock);; printf("Slave %d left barrier 6\n",MyNum); cyg_spinlock_clear(&Global->idlock);;
+#endif
+
+ /* copy columns from scratch to x */
+ if ((test_result) || (doprint)) {
+ for (j=MyFirst; j<MyLast; j++) {
+ CopyColumn(n1, &scratch[2*j*(n1+pad_length)], &x[2*j*(n1+pad_length)]);
+ }
+ }
+
+ {
+/*---------------------------------*/
+ cyg_spinlock_spin(&( Global->start[0].lock ) );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: >b (%d:%d:%d): Enter barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),7,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ if ( Global->start[0].count[0] < (P -1) ) {
+ Global->start[0].count[0]++;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ?b (%d:%d:%d): Wait\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),7,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ cyg_spinlock_clear(&Global->start[0].lock );;
+ { cyg_semaphore_wait(&Global->start[0].queue[0] );};
+ }
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: <b (%d:%d:%d): Exit barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),7,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ if ( Global->start[0].count[0] == 0 ) {
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ^b (%d:%d:%d): unlock barrier \n-----------\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),7,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+
+ cyg_spinlock_clear(&Global->start[0].lock );;
+
+ } else {
+ ( Global->start[0].count[0] )-- ;
+ //cyg_spinlock_clear(&Global->start[0].count_lock[0] );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: @b (%d:%d:%d): unlock queue\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),7,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ { cyg_semaphore_post(&Global->start[0].queue[0] );};
+ }
+
+/*---------------------------------*/
+};
+}
+
+
+TwiddleOneCol(direction, n1, N, j, u, x, pad_length)
+
+int direction;
+int n1;
+int N;
+int j;
+double *u;
+double *x;
+int pad_length;
+
+{
+ int i;
+ double omega_r;
+ double omega_c;
+ double x_r;
+ double x_c;
+ double r1;
+ double c1;
+ double r2;
+ double c2;
+
+ for (i=0; i<n1; i++) {
+ omega_r = u[2*(j*(n1+pad_length)+i)];
+ omega_c = direction*u[2*(j*(n1+pad_length)+i)+1];
+ x_r = x[2*i];
+ x_c = x[2*i+1];
+ x[2*i] = omega_r*x_r - omega_c*x_c;
+ x[2*i+1] = omega_r*x_c + omega_c*x_r;
+ }
+}
+
+
+Scale(n1, N, x)
+
+int n1;
+int N;
+double *x;
+
+{
+ int i;
+
+ for (i=0; i<n1; i++) {
+ x[2*i] /= N;
+ x[2*i+1] /= N;
+ }
+}
+
+
+Transpose(n1, src, dest, MyNum, MyFirst, MyLast, pad_length)
+
+int n1;
+double *src;
+double *dest;
+int MyNum;
+int MyFirst;
+int MyLast;
+int pad_length;
+
+{
+ int i;
+ int j;
+ int k;
+ int l;
+ int m;
+ int blksize;
+ int numblks;
+ int firstfirst;
+ int h_off;
+ int v_off;
+ int v;
+ int h;
+ int n1p;
+ int row_count;
+
+ blksize = MyLast-MyFirst;
+ numblks = (2*blksize)/num_cache_lines;
+ if (numblks * num_cache_lines != 2 * blksize) {
+ numblks ++;
+ }
+ blksize = blksize / numblks;
+ firstfirst = MyFirst;
+ row_count = n1/P;
+ n1p = n1+pad_length;
+ for (l=MyNum+1;l<P;l++) {
+ v_off = l*row_count;
+ for (k=0; k<numblks; k++) {
+ h_off = firstfirst;
+ for (m=0; m<numblks; m++) {
+ for (i=0; i<blksize; i++) {
+ v = v_off + i;
+ for (j=0; j<blksize; j++) {
+ h = h_off + j;
+ dest[2*(h*n1p+v)] = src[2*(v*n1p+h)];
+ dest[2*(h*n1p+v)+1] = src[2*(v*n1p+h)+1];
+ }
+ }
+ h_off += blksize;
+ }
+ v_off+=blksize;
+ }
+ }
+
+ for (l=0;l<MyNum;l++) {
+ v_off = l*row_count;
+ for (k=0; k<numblks; k++) {
+ h_off = firstfirst;
+ for (m=0; m<numblks; m++) {
+ for (i=0; i<blksize; i++) {
+ v = v_off + i;
+ for (j=0; j<blksize; j++) {
+ h = h_off + j;
+ dest[2*(h*n1p+v)] = src[2*(v*n1p+h)];
+ dest[2*(h*n1p+v)+1] = src[2*(v*n1p+h)+1];
+ }
+ }
+ h_off += blksize;
+ }
+ v_off+=blksize;
+ }
+ }
+
+ v_off = MyNum*row_count;
+ for (k=0; k<numblks; k++) {
+ h_off = firstfirst;
+ for (m=0; m<numblks; m++) {
+ for (i=0; i<blksize; i++) {
+ v = v_off + i;
+ for (j=0; j<blksize; j++) {
+ h = h_off + j;
+ dest[2*(h*n1p+v)] = src[2*(v*n1p+h)];
+ dest[2*(h*n1p+v)+1] = src[2*(v*n1p+h)+1];
+ }
+ }
+ h_off += blksize;
+ }
+ v_off+=blksize;
+ }
+}
+
+
+CopyColumn(n1, src, dest)
+
+int n1;
+double *src;
+double *dest;
+
+{
+ int i;
+
+ for (i=0; i<n1; i++) {
+ dest[2*i] = src[2*i];
+ dest[2*i+1] = src[2*i+1];
+ }
+}
+
+
+Reverse(N, M, x)
+
+int N;
+int M;
+double *x;
+
+{
+ int j, k;
+
+ for (k=0; k<N; k++) {
+ j = BitReverse(M, k);
+ if (j > k) {
+ SWAP(x[2*j], x[2*k]);
+ SWAP(x[2*j+1], x[2*k+1]);
+ }
+ }
+}
+
+
+FFT1DOnce(direction, M, N, u, x)
+
+int direction;
+int M;
+int N;
+double *u;
+double *x;
+
+{
+ int j;
+ int k;
+ int q;
+ int L;
+ int r;
+ int Lstar;
+ double *u1;
+ double *x1;
+ double *x2;
+ double omega_r;
+ double omega_c;
+ double tau_r;
+ double tau_c;
+ double x_r;
+ double x_c;
+
+ Reverse(N, M, x);
+
+ for (q=1; q<=M; q++) {
+ L = 1<<q; r = N/L; Lstar = L/2;
+ u1 = &u[2*(Lstar-1)];
+ for (k=0; k<r; k++) {
+ x1 = &x[2*(k*L)];
+ x2 = &x[2*(k*L+Lstar)];
+ for (j=0; j<Lstar; j++) {
+ omega_r = u1[2*j];
+ omega_c = direction*u1[2*j+1];
+ x_r = x2[2*j];
+ x_c = x2[2*j+1];
+ tau_r = omega_r*x_r - omega_c*x_c;
+ tau_c = omega_r*x_c + omega_c*x_r;
+ x_r = x1[2*j];
+ x_c = x1[2*j+1];
+ x2[2*j] = x_r - tau_r;
+ x2[2*j+1] = x_c - tau_c;
+ x1[2*j] = x_r + tau_r;
+ x1[2*j+1] = x_c + tau_c;
+ }
+ }
+ }
+}
+
+
+PrintArray(N, x)
+
+int N;
+double *x;
+
+{
+ int i, j, k;
+
+ for (i=0; i<rootN; i++) {
+ k = i*(rootN+pad_length);
+ for (j=0; j<rootN; j++) {
+ printf(" %4.2f %4.2f", x[2*(k+j)], x[2*(k+j)+1]);
+ if (i*rootN+j != N-1) {
+ printf(",");
+ }
+ if ((i*rootN+j+1) % 8 == 0) {
+ printf("\n");
+ }
+ }
+ }
+ printf("\n");
+ printf("\n");
+}
+
+
+void printerr(s)
+
+char *s;
+
+{
+ printf("ERROR: %s\n",s);
+}
+
+
+int log_2(number)
+
+int number;
+
+{
+ int cumulative = 1;
+ int out = 0;
+ int done = 0;
+
+ while ((cumulative < number) && (!done) && (out < 50)) {
+ if (cumulative == number) {
+ done = 1;
+ } else {
+ cumulative = cumulative * 2;
+ out ++;
+ }
+ }
+
+ if (cumulative == number) {
+ return(out);
+ } else {
+ return(-1);
+ }
+}
+
+#include <tests_smp/getopt.c>
diff --git a/cesar/ecos/packages/kernel/current/tests_smp/fft_arg.c b/cesar/ecos/packages/kernel/current/tests_smp/fft_arg.c
new file mode 100644
index 0000000000..acf764a06d
--- /dev/null
+++ b/cesar/ecos/packages/kernel/current/tests_smp/fft_arg.c
@@ -0,0 +1,18 @@
+
+//___________________________________________
+#define smp_cyg_test_argc 11
+char *_cyg_argv[] = {
+ "fft.exe",
+ "-s",
+ "-t",
+ "-l",
+ "5",
+ "-n",
+ "256",
+ "-p",
+ "2", /* not used, using HAL_SMP_CPU_MAX */
+ "-m",
+ "16"
+};
+#define smp_cyg_test_argv &_cyg_argv
+//'''''''''''''''''''''''''''''''''''''''''''
diff --git a/cesar/ecos/packages/kernel/current/tests_smp/getopt.c b/cesar/ecos/packages/kernel/current/tests_smp/getopt.c
new file mode 100644
index 0000000000..7645ec44d6
--- /dev/null
+++ b/cesar/ecos/packages/kernel/current/tests_smp/getopt.c
@@ -0,0 +1,654 @@
+/* Getopt for GNU.
+ NOTE: getopt is now part of the C library, so if you don't know what
+ "Keep this file name-space clean" means, talk to roland@gnu.ai.mit.edu
+ before changing it!
+
+ Copyright (C) 1987, 88, 89, 90, 91, 92, 1993
+ Free Software Foundation, Inc.
+
+ This program is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the
+ Free Software Foundation; either version 2, or (at your option) any
+ later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifndef __STDC__
+# ifndef const
+# define const
+# endif
+#endif
+
+/* This tells Alpha OSF/1 not to define a getopt prototype in <stdio.h>. */
+#ifndef _NO_PROTO
+#define _NO_PROTO
+#endif
+
+#include <stdio.h>
+
+/* Comment out all this code if we are using the GNU C Library, and are not
+ actually compiling the library itself. This code is part of the GNU C
+ Library, but also included in many other GNU distributions. Compiling
+ and linking in this code is a waste when using the GNU C library
+ (especially if it is a shared library). Rather than having every GNU
+ program understand `configure --with-gnu-libc' and omit the object files,
+ it is simpler to just do this in the source for each such file. */
+
+
+/* This needs to come after some library #include
+ to get __GNU_LIBRARY__ defined. */
+#ifdef __GNU_LIBRARY__
+/* Don't include stdlib.h for non-GNU C libraries because some of them
+ contain conflicting prototypes for getopt. */
+#include <stdlib.h>
+#endif /* GNU C library. */
+
+/* If GETOPT_COMPAT is defined, `+' as well as `--' can introduce a
+ long-named option. Because this is not POSIX.2 compliant, it is
+ being phased out. */
+/* #define GETOPT_COMPAT */
+
+/* This version of `getopt' appears to the caller like standard Unix `getopt'
+ but it behaves differently for the user, since it allows the user
+ to intersperse the options with the other arguments.
+
+ As `getopt' works, it permutes the elements of ARGV so that,
+ when it is done, all the options precede everything else. Thus
+ all application programs are extended to handle flexible argument order.
+
+ Setting the environment variable POSIXLY_CORRECT disables permutation.
+ Then the behavior is completely standard.
+
+ GNU application programs can use a third alternative mode in which
+ they can distinguish the relative order of options and other arguments. */
+
+
+/* For communication from `getopt' to the caller.
+ When `getopt' finds an option that takes an argument,
+ the argument value is returned here.
+ Also, when `ordering' is RETURN_IN_ORDER,
+ each non-option ARGV-element is returned here. */
+
+char *optarg = 0;
+
+/* Index in ARGV of the next element to be scanned.
+ This is used for communication to and from the caller
+ and for communication between successive calls to `getopt'.
+
+ On entry to `getopt', zero means this is the first call; initialize.
+
+ When `getopt' returns EOF, this is the index of the first of the
+ non-option elements that the caller should itself scan.
+
+ Otherwise, `optind' communicates from one call to the next
+ how much of ARGV has been scanned so far. */
+
+/* XXX 1003.2 says this must be 1 before any call. */
+int optind = 0;
+
+/* The next char to be scanned in the option-element
+ in which the last option character we returned was found.
+ This allows us to pick up the scan where we left off.
+
+ If this is zero, or a null string, it means resume the scan
+ by advancing to the next ARGV-element. */
+
+static char *nextchar;
+
+/* Callers store zero here to inhibit the error message
+ for unrecognized options. */
+
+int opterr = 1;
+
+/* Set to an option character which was unrecognized.
+ This must be initialized on some systems to avoid linking in the
+ system's own getopt implementation. */
+
+#define BAD_OPTION '\0'
+int optopt = BAD_OPTION;
+
+/* Describe how to deal with options that follow non-option ARGV-elements.
+
+ If the caller did not specify anything,
+ the default is REQUIRE_ORDER if the environment variable
+ POSIXLY_CORRECT is defined, PERMUTE otherwise.
+
+ REQUIRE_ORDER means don't recognize them as options;
+ stop option processing when the first non-option is seen.
+ This is what Unix does.
+ This mode of operation is selected by either setting the environment
+ variable POSIXLY_CORRECT, or using `+' as the first character
+ of the list of option characters.
+
+ PERMUTE is the default. We permute the contents of ARGV as we scan,
+ so that eventually all the non-options are at the end. This allows options
+ to be given in any order, even with programs that were not written to
+ expect this.
+
+ RETURN_IN_ORDER is an option available to programs that were written
+ to expect options and other ARGV-elements in any order and that care about
+ the ordering of the two. We describe each non-option ARGV-element
+ as if it were the argument of an option with character code 1.
+ Using `-' as the first character of the list of option characters
+ selects this mode of operation.
+
+ The special argument `--' forces an end of option-scanning regardless
+ of the value of `ordering'. In the case of RETURN_IN_ORDER, only
+ `--' can cause `getopt' to return EOF with `optind' != ARGC. */
+
+static enum
+{
+ REQUIRE_ORDER, PERMUTE, RETURN_IN_ORDER
+} ordering;
+
+static int
+my_strlen (str)
+ const char *str;
+{
+ int n = 0;
+ while (*str++)
+ n++;
+ return n;
+}
+
+static char *
+my_index (str, chr)
+ const char *str;
+ int chr;
+{
+ while (*str)
+ {
+ if (*str == chr)
+ return (char *) str;
+ str++;
+ }
+ return 0;
+}
+
+
+/* Handle permutation of arguments. */
+
+/* Describe the part of ARGV that contains non-options that have
+ been skipped. `first_nonopt' is the index in ARGV of the first of them;
+ `last_nonopt' is the index after the last of them. */
+
+static int first_nonopt;
+static int last_nonopt;
+
+/* Exchange two adjacent subsequences of ARGV.
+ One subsequence is elements [first_nonopt,last_nonopt)
+ which contains all the non-options that have been skipped so far.
+ The other is elements [last_nonopt,optind), which contains all
+ the options processed since those non-options were skipped.
+
+ `first_nonopt' and `last_nonopt' are relocated so that they describe
+ the new indices of the non-options in ARGV after they are moved.
+
+ To perform the swap, we first reverse the order of all elements. So
+ all options now come before all non options, but they are in the
+ wrong order. So we put back the options and non options in original
+ order by reversing them again. For example:
+ original input: a b c -x -y
+ reverse all: -y -x c b a
+ reverse options: -x -y c b a
+ reverse non options: -x -y a b c
+*/
+
+#if __STDC__ || defined(PROTO)
+static void exchange (char **argv);
+#endif
+
+static void
+exchange (argv)
+ char **argv;
+{
+ char *temp, **first, **last;
+
+ /* Reverse all the elements [first_nonopt, optind) */
+ first = &argv[first_nonopt];
+ last = &argv[optind-1];
+ while (first < last) {
+ temp = *first; *first = *last; *last = temp; first++; last--;
+ }
+ /* Put back the options in order */
+ first = &argv[first_nonopt];
+ first_nonopt += (optind - last_nonopt);
+ last = &argv[first_nonopt - 1];
+ while (first < last) {
+ temp = *first; *first = *last; *last = temp; first++; last--;
+ }
+
+ /* Put back the non options in order */
+ first = &argv[first_nonopt];
+ last_nonopt = optind;
+ last = &argv[last_nonopt-1];
+ while (first < last) {
+ temp = *first; *first = *last; *last = temp; first++; last--;
+ }
+}
+
+/* Scan elements of ARGV (whose length is ARGC) for option characters
+ given in OPTSTRING.
+
+ If an element of ARGV starts with '-', and is not exactly "-" or "--",
+ then it is an option element. The characters of this element
+ (aside from the initial '-') are option characters. If `getopt'
+ is called repeatedly, it returns successively each of the option characters
+ from each of the option elements.
+
+ If `getopt' finds another option character, it returns that character,
+ updating `optind' and `nextchar' so that the next call to `getopt' can
+ resume the scan with the following option character or ARGV-element.
+
+ If there are no more option characters, `getopt' returns `EOF'.
+ Then `optind' is the index in ARGV of the first ARGV-element
+ that is not an option. (The ARGV-elements have been permuted
+ so that those that are not options now come last.)
+
+ OPTSTRING is a string containing the legitimate option characters.
+ If an option character is seen that is not listed in OPTSTRING,
+ return BAD_OPTION after printing an error message. If you set `opterr' to
+ zero, the error message is suppressed but we still return BAD_OPTION.
+
+ If a char in OPTSTRING is followed by a colon, that means it wants an arg,
+ so the following text in the same ARGV-element, or the text of the following
+ ARGV-element, is returned in `optarg'. Two colons mean an option that
+ wants an optional arg; if there is text in the current ARGV-element,
+ it is returned in `optarg', otherwise `optarg' is set to zero.
+
+ If OPTSTRING starts with `-' or `+', it requests different methods of
+ handling the non-option ARGV-elements.
+ See the comments about RETURN_IN_ORDER and REQUIRE_ORDER, above.
+
+ Long-named options begin with `--' instead of `-'.
+ Their names may be abbreviated as long as the abbreviation is unique
+ or is an exact match for some defined option. If they have an
+ argument, it follows the option name in the same ARGV-element, separated
+ from the option name by a `=', or else the in next ARGV-element.
+ When `getopt' finds a long-named option, it returns 0 if that option's
+ `flag' field is nonzero, the value of the option's `val' field
+ if the `flag' field is zero.
+
+ The elements of ARGV aren't really const, because we permute them.
+ But we pretend they're const in the prototype to be compatible
+ with other systems.
+
+ LONGOPTS is a vector of `struct option' terminated by an
+ element containing a name which is zero.
+
+ LONGIND returns the index in LONGOPT of the long-named option found.
+ It is only valid when a long-named option has been found by the most
+ recent call.
+
+ If LONG_ONLY is nonzero, '-' as well as '--' can introduce
+ long-named options. */
+
+int
+_getopt_internal (argc, argv, optstring, longopts, longind, long_only)
+ int argc;
+ char *const *argv;
+ const char *optstring;
+ const struct option *longopts;
+ int *longind;
+ int long_only;
+{
+ int option_index;
+
+ optarg = 0;
+
+ /* Initialize the internal data when the first call is made.
+ Start processing options with ARGV-element 1 (since ARGV-element 0
+ is the program name); the sequence of previously skipped
+ non-option ARGV-elements is empty. */
+
+ if (optind == 0)
+ {
+ first_nonopt = last_nonopt = optind = 1;
+
+ nextchar = NULL;
+
+ /* Determine how to handle the ordering of options and nonoptions. */
+
+ if (optstring[0] == '-')
+ {
+ ordering = RETURN_IN_ORDER;
+ ++optstring;
+ }
+ else if (optstring[0] == '+')
+ {
+ ordering = REQUIRE_ORDER;
+ ++optstring;
+ }
+ else if (getenv ("POSIXLY_CORRECT") != NULL)
+ ordering = REQUIRE_ORDER;
+ else
+ ordering = PERMUTE;
+ }
+
+ if (nextchar == NULL || *nextchar == '\0')
+ {
+ if (ordering == PERMUTE)
+ {
+ /* If we have just processed some options following some non-options,
+ exchange them so that the options come first. */
+
+ if (first_nonopt != last_nonopt && last_nonopt != optind)
+ exchange ((char **) argv);
+ else if (last_nonopt != optind)
+ first_nonopt = optind;
+
+ /* Now skip any additional non-options
+ and extend the range of non-options previously skipped. */
+
+ while (optind < argc
+ && (argv[optind][0] != '-' || argv[optind][1] == '\0')
+#ifdef GETOPT_COMPAT
+ && (longopts == NULL
+ || argv[optind][0] != '+' || argv[optind][1] == '\0')
+#endif /* GETOPT_COMPAT */
+ )
+ optind++;
+ last_nonopt = optind;
+ }
+
+ /* Special ARGV-element `--' means premature end of options.
+ Skip it like a null option,
+ then exchange with previous non-options as if it were an option,
+ then skip everything else like a non-option. */
+
+ if (optind != argc && !strcmp (argv[optind], "--"))
+ {
+ optind++;
+
+ if (first_nonopt != last_nonopt && last_nonopt != optind)
+ exchange ((char **) argv);
+ else if (first_nonopt == last_nonopt)
+ first_nonopt = optind;
+ last_nonopt = argc;
+
+ optind = argc;
+ }
+
+ /* If we have done all the ARGV-elements, stop the scan
+ and back over any non-options that we skipped and permuted. */
+
+ if (optind == argc)
+ {
+ /* Set the next-arg-index to point at the non-options
+ that we previously skipped, so the caller will digest them. */
+ if (first_nonopt != last_nonopt)
+ optind = first_nonopt;
+ return EOF;
+ }
+
+ /* If we have come to a non-option and did not permute it,
+ either stop the scan or describe it to the caller and pass it by. */
+
+ if ((argv[optind][0] != '-' || argv[optind][1] == '\0')
+#ifdef GETOPT_COMPAT
+ && (longopts == NULL
+ || argv[optind][0] != '+' || argv[optind][1] == '\0')
+#endif /* GETOPT_COMPAT */
+ )
+ {
+ if (ordering == REQUIRE_ORDER)
+ return EOF;
+ optarg = argv[optind++];
+ return 1;
+ }
+
+ /* We have found another option-ARGV-element.
+ Start decoding its characters. */
+
+ nextchar = (argv[optind] + 1
+ + (longopts != NULL && argv[optind][1] == '-'));
+ }
+
+ if (longopts != NULL
+ && ((argv[optind][0] == '-'
+ && (argv[optind][1] == '-' || long_only))
+#ifdef GETOPT_COMPAT
+ || argv[optind][0] == '+'
+#endif /* GETOPT_COMPAT */
+ ))
+ {
+ const struct option *p;
+ char *s = nextchar;
+ int exact = 0;
+ int ambig = 0;
+ const struct option *pfound = NULL;
+ int indfound = 0;
+
+ while (*s && *s != '=')
+ s++;
+
+ /* Test all options for either exact match or abbreviated matches. */
+ for (p = longopts, option_index = 0; p->name;
+ p++, option_index++)
+ if (!strncmp (p->name, nextchar, s - nextchar))
+ {
+ if (s - nextchar == my_strlen (p->name))
+ {
+ /* Exact match found. */
+ pfound = p;
+ indfound = option_index;
+ exact = 1;
+ break;
+ }
+ else if (pfound == NULL)
+ {
+ /* First nonexact match found. */
+ pfound = p;
+ indfound = option_index;
+ }
+ else
+ /* Second nonexact match found. */
+ ambig = 1;
+ }
+
+ if (ambig && !exact)
+ {
+ if (opterr)
+ fprintf (stderr, "%s: option `%s' is ambiguous\n",
+ argv[0], argv[optind]);
+ nextchar += my_strlen (nextchar);
+ optind++;
+ return BAD_OPTION;
+ }
+
+ if (pfound != NULL)
+ {
+ option_index = indfound;
+ optind++;
+ if (*s)
+ {
+ /* Don't test has_arg with >, because some C compilers don't
+ allow it to be used on enums. */
+ if (pfound->has_arg)
+ optarg = s + 1;
+ else
+ {
+ if (opterr)
+ {
+ if (argv[optind - 1][1] == '-')
+ /* --option */
+ fprintf (stderr,
+ "%s: option `--%s' doesn't allow an argument\n",
+ argv[0], pfound->name);
+ else
+ /* +option or -option */
+ fprintf (stderr,
+ "%s: option `%c%s' doesn't allow an argument\n",
+ argv[0], argv[optind - 1][0], pfound->name);
+ }
+ nextchar += my_strlen (nextchar);
+ return BAD_OPTION;
+ }
+ }
+ else if (pfound->has_arg == 1)
+ {
+ if (optind < argc)
+ optarg = argv[optind++];
+ else
+ {
+ if (opterr)
+ fprintf (stderr, "%s: option `%s' requires an argument\n",
+ argv[0], argv[optind - 1]);
+ nextchar += my_strlen (nextchar);
+ return optstring[0] == ':' ? ':' : BAD_OPTION;
+ }
+ }
+ nextchar += my_strlen (nextchar);
+ if (longind != NULL)
+ *longind = option_index;
+ if (pfound->flag)
+ {
+ *(pfound->flag) = pfound->val;
+ return 0;
+ }
+ return pfound->val;
+ }
+ /* Can't find it as a long option. If this is not getopt_long_only,
+ or the option starts with '--' or is not a valid short
+ option, then it's an error.
+ Otherwise interpret it as a short option. */
+ if (!long_only || argv[optind][1] == '-'
+#ifdef GETOPT_COMPAT
+ || argv[optind][0] == '+'
+#endif /* GETOPT_COMPAT */
+ || my_index (optstring, *nextchar) == NULL)
+ {
+ if (opterr)
+ {
+ if (argv[optind][1] == '-')
+ /* --option */
+ fprintf (stderr, "%s: unrecognized option `--%s'\n",
+ argv[0], nextchar);
+ else
+ /* +option or -option */
+ fprintf (stderr, "%s: unrecognized option `%c%s'\n",
+ argv[0], argv[optind][0], nextchar);
+ }
+ nextchar = (char *) "";
+ optind++;
+ return BAD_OPTION;
+ }
+ }
+
+ /* Look at and handle the next option-character. */
+
+ {
+ char c = *nextchar++;
+ char *temp = my_index (optstring, c);
+
+ /* Increment `optind' when we start to process its last character. */
+ if (*nextchar == '\0')
+ ++optind;
+
+ if (temp == NULL || c == ':')
+ {
+ if (opterr)
+ {
+#if 0
+ if (c < 040 || c >= 0177)
+ fprintf (stderr, "%s: unrecognized option, character code 0%o\n",
+ argv[0], c);
+ else
+ fprintf (stderr, "%s: unrecognized option `-%c'\n", argv[0], c);
+#else
+ /* 1003.2 specifies the format of this message. */
+ fprintf (stderr, "%s: illegal option -- %c\n", argv[0], c);
+#endif
+ }
+ optopt = c;
+ return BAD_OPTION;
+ }
+ if (temp[1] == ':')
+ {
+ if (temp[2] == ':')
+ {
+ /* This is an option that accepts an argument optionally. */
+ if (*nextchar != '\0')
+ {
+ optarg = nextchar;
+ optind++;
+ }
+ else
+ optarg = 0;
+ nextchar = NULL;
+ }
+ else
+ {
+ /* This is an option that requires an argument. */
+ if (*nextchar != '\0')
+ {
+ optarg = nextchar;
+ /* If we end this ARGV-element by taking the rest as an arg,
+ we must advance to the next element now. */
+ optind++;
+ }
+ else if (optind == argc)
+ {
+ if (opterr)
+ {
+#if 0
+ fprintf (stderr, "%s: option `-%c' requires an argument\n",
+ argv[0], c);
+#else
+ /* 1003.2 specifies the format of this message. */
+ fprintf (stderr, "%s: option requires an argument -- %c\n",
+ argv[0], c);
+#endif
+ }
+ optopt = c;
+ if (optstring[0] == ':')
+ c = ':';
+ else
+ c = BAD_OPTION;
+ }
+ else
+ /* We already incremented `optind' once;
+ increment it again when taking next ARGV-elt as argument. */
+ optarg = argv[optind++];
+ nextchar = NULL;
+ }
+ }
+ return c;
+ }
+}
+
+int
+getopt (argc, argv, optstring)
+ int argc;
+ char *const *argv;
+ const char *optstring;
+{
+ return _getopt_internal (argc, argv, optstring,
+ (const struct option *) 0,
+ (int *) 0,
+ 0);
+}
+
+int
+getopt_long (argc, argv, options, long_options, opt_index)
+ int argc;
+ char *const *argv;
+ const char *options;
+ const struct option *long_options;
+ int *opt_index;
+{
+ return _getopt_internal (argc, argv, options, long_options, opt_index, 0);
+}
+
+
diff --git a/cesar/ecos/packages/kernel/current/tests_smp/getopt.h b/cesar/ecos/packages/kernel/current/tests_smp/getopt.h
new file mode 100644
index 0000000000..3c0b2930a6
--- /dev/null
+++ b/cesar/ecos/packages/kernel/current/tests_smp/getopt.h
@@ -0,0 +1,129 @@
+/* Declarations for getopt.
+ Copyright (C) 1989, 1990, 1991, 1992, 1993 Free Software Foundation, Inc.
+
+ This program is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the
+ Free Software Foundation; either version 2, or (at your option) any
+ later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+//#define printf diag_printf
+
+#ifndef _GETOPT_H
+#define _GETOPT_H 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* For communication from `getopt' to the caller.
+ When `getopt' finds an option that takes an argument,
+ the argument value is returned here.
+ Also, when `ordering' is RETURN_IN_ORDER,
+ each non-option ARGV-element is returned here. */
+
+extern char *optarg;
+
+/* Index in ARGV of the next element to be scanned.
+ This is used for communication to and from the caller
+ and for communication between successive calls to `getopt'.
+
+ On entry to `getopt', zero means this is the first call; initialize.
+
+ When `getopt' returns EOF, this is the index of the first of the
+ non-option elements that the caller should itself scan.
+
+ Otherwise, `optind' communicates from one call to the next
+ how much of ARGV has been scanned so far. */
+
+extern int optind;
+
+/* Callers store zero here to inhibit the error message `getopt' prints
+ for unrecognized options. */
+
+extern int opterr;
+
+/* Set to an option character which was unrecognized. */
+
+extern int optopt;
+
+/* Describe the long-named options requested by the application.
+ The LONG_OPTIONS argument to getopt_long or getopt_long_only is a vector
+ of `struct option' terminated by an element containing a name which is
+ zero.
+
+ The field `has_arg' is:
+ no_argument (or 0) if the option does not take an argument,
+ required_argument (or 1) if the option requires an argument,
+ optional_argument (or 2) if the option takes an optional argument.
+
+ If the field `flag' is not NULL, it points to a variable that is set
+ to the value given in the field `val' when the option is found, but
+ left unchanged if the option is not found.
+
+ To have a long-named option do something other than set an `int' to
+ a compiled-in constant, such as set a value from `optarg', set the
+ option's `flag' field to zero and its `val' field to a nonzero
+ value (the equivalent single-letter option character, if there is
+ one). For long options that have a zero `flag' field, `getopt'
+ returns the contents of the `val' field. */
+
+struct option
+{
+#if __STDC__
+ const char *name;
+#else
+ char *name;
+#endif
+ /* has_arg can't be an enum because some compilers complain about
+ type mismatches in all the code that assumes it is an int. */
+ int has_arg;
+ int *flag;
+ int val;
+};
+
+/* Names for the values of the `has_arg' field of `struct option'. */
+
+#define no_argument 0
+#define required_argument 1
+#define optional_argument 2
+
+#if __STDC__ || defined(PROTO)
+#if defined(__GNU_LIBRARY__)
+/* Many other libraries have conflicting prototypes for getopt, with
+ differences in the consts, in stdlib.h. To avoid compilation
+ errors, only prototype getopt for the GNU C library. */
+extern int getopt (int argc, char *const *argv, const char *shortopts);
+#endif /* not __GNU_LIBRARY__ */
+extern int getopt_long (int argc, char *const *argv, const char *shortopts,
+ const struct option *longopts, int *longind);
+extern int getopt_long_only (int argc, char *const *argv,
+ const char *shortopts,
+ const struct option *longopts, int *longind);
+
+/* Internal only. Users should not call this directly. */
+extern int _getopt_internal (int argc, char *const *argv,
+ const char *shortopts,
+ const struct option *longopts, int *longind,
+ int long_only);
+#else /* not __STDC__ */
+extern int getopt ();
+extern int getopt_long ();
+extern int getopt_long_only ();
+
+extern int _getopt_internal ();
+#endif /* not __STDC__ */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _GETOPT_H */
diff --git a/cesar/ecos/packages/kernel/current/tests_smp/lu.C b/cesar/ecos/packages/kernel/current/tests_smp/lu.C
new file mode 100644
index 0000000000..9a46678260
--- /dev/null
+++ b/cesar/ecos/packages/kernel/current/tests_smp/lu.C
@@ -0,0 +1,788 @@
+/*************************************************************************/
+/* */
+/* Copyright (c) 1994 Stanford University */
+/* */
+/* All rights reserved. */
+/* */
+/* Permission is given to use, copy, and modify this software for any */
+/* non-commercial purpose as long as this copyright notice is not */
+/* removed. All other uses, including redistribution in whole or in */
+/* part, are forbidden without prior written permission. */
+/* */
+/* This software is provided with absolutely no warranty and no */
+/* support. */
+/* */
+/*************************************************************************/
+
+/*************************************************************************/
+/* */
+/* Parallel dense blocked LU factorization (no pivoting) */
+/* */
+/* This version contains one dimensional arrays in which the matrix */
+/* to be factored is stored. */
+/* */
+/* Command line options: */
+/* */
+/* -nN : Decompose NxN matrix. */
+/* -pP : P = number of processors. */
+/* -bB : Use a block size of B. BxB elements should fit in cache for */
+/* good performance. Small block sizes (B=8, B=16) work well. */
+/* -s : Print individual processor timing statistics. */
+/* -t : Test output. */
+/* -o : Print out matrix values. */
+/* -h : Print out command line options. */
+/* */
+/* Note: This version works under both the FORK and SPROC models */
+/* */
+/*************************************************************************/
+
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include <tests_smp/getopt.h>
+
+#include <tests_smp/lu_arg.c>
+
+MAIN_ENV
+
+#define MAXRAND 2147483647
+#define DEFAULT_N 128
+#define DEFAULT_P 1
+#define DEFAULT_B 16
+#define min(a,b) ((a) < (b) ? (a) : (b))
+
+struct GlobalMemory {
+ double *t_in_fac;
+ double *t_in_solve;
+ double *t_in_mod;
+ double *t_in_bar;
+ double *completion;
+ unsigned int starttime;
+ unsigned int rf;
+ unsigned int rs;
+ unsigned int done;
+ int id;
+ BARDEC(start)
+ LOCKDEC(idlock)
+} *Global;
+
+struct LocalCopies {
+ double t_in_fac;
+ double t_in_solve;
+ double t_in_mod;
+ double t_in_bar;
+};
+
+int dbg_on = 0;
+int n = DEFAULT_N; /* The size of the matrix */
+int P = DEFAULT_P; /* Number of processors */
+int block_size = DEFAULT_B; /* Block dimension */
+int nblocks; /* Number of blocks in each dimension */
+int num_rows; /* Number of processors per row of processor grid */
+int num_cols; /* Number of processors per col of processor grid */
+double *a; /* a = lu; l and u both placed back in a */
+double *rhs;
+int *proc_bytes; /* Bytes to malloc per processor to hold blocks
+ of A*/
+int test_result = 0; /* Test result of factorization? */
+int doprint = 0; /* Print out matrix values? */
+int dostats = 0; /* Print out individual processor statistics? */
+
+void SlaveStart();
+void OneSolve(int, int, double *, int, int);
+void lu0(double *,int, int);
+void bdiv(double *, double *, int, int, int, int);
+void bmodd(double *, double*, int, int, int, int);
+void bmod(double *, double *, double *, int, int, int, int);
+void daxpy(double *, double *, int, double);
+int BlockOwner(int, int);
+void lu(int, int, int, struct LocalCopies *, int);
+void InitA(double *);
+double TouchA(int, int);
+void PrintA();
+void CheckResult(int, double *, double *);
+void printerr(char *);
+volatile cyg_thread *thread_array[64];
+
+
+int smp_cyg_test_main(argc, argv)
+
+int argc;
+char **argv;
+
+{
+ int i, j;
+ int ch;
+ extern char *optarg;
+ int MyNum=0;
+ double mint, maxt, avgt;
+ double min_fac, min_solve, min_mod, min_bar;
+ double max_fac, max_solve, max_mod, max_bar;
+ double avg_fac, avg_solve, avg_mod, avg_bar;
+ int proc_num;
+ unsigned int start;
+
+ memset (thread_array,0,sizeof(thread_array));
+
+ CLOCK(start);
+
+ while ((ch = getopt(argc, argv, "n:p:b:cstoh")) != -1) {
+ switch(ch) {
+ case 'n': n = atoi(optarg); break;
+ case 'p': P = atoi(optarg);
+ P = HAL_SMP_CPU_MAX; break;
+ case 'b': block_size = atoi(optarg); break;
+ case 's': dostats = 1; break;
+ case 't': test_result = !test_result; break;
+ case 'o': doprint = !doprint; break;
+ case 'h': diag_printf("Usage: LU <options>\n\n");
+ diag_printf("options:\n");
+ diag_printf(" -nN : Decompose NxN matrix.\n");
+ diag_printf(" -pP : P = number of processors.\n");
+ diag_printf(" -bB : Use a block size of B. BxB elements should fit in cache for \n");
+ diag_printf(" good performance. Small block sizes (B=8, B=16) work well.\n");
+ diag_printf(" -c : Copy non-locally allocated blocks to local memory before use.\n");
+ diag_printf(" -s : Print individual processor timing statistics.\n");
+ diag_printf(" -t : Test output.\n");
+ diag_printf(" -o : Print out matrix values.\n");
+ diag_printf(" -h : Print out command line options.\n\n");
+ diag_printf("Default: LU -n%1d -p%1d -b%1d\n",
+ DEFAULT_N,DEFAULT_P,DEFAULT_B);
+ exit(0);
+ break;
+ }
+ }
+ if (P > 64) {
+ printf("Maximal 64 processes\n");
+ exit(0);
+ }
+
+ MAIN_INITENV(,150000000)
+
+ diag_printf("\n");
+ diag_printf("Blocked Dense LU Factorization\n");
+ diag_printf(" %d by %d Matrix\n",n,n);
+ diag_printf(" %d Processors\n",P);
+ diag_printf(" %d by %d Element Blocks\n",block_size,block_size);
+ diag_printf("\n");
+ diag_printf("\n");
+
+ num_rows = (int) sqrt((double) P);
+ for (;;) {
+ num_cols = P/num_rows;
+ if (num_rows*num_cols == P)
+ break;
+ num_rows--;
+ }
+ nblocks = n/block_size;
+ if (block_size * nblocks != n) {
+ nblocks++;
+ }
+
+ a = (double *) G_MALLOC(n*n*sizeof(double));
+ rhs = (double *) G_MALLOC(n*sizeof(double));
+
+ Global = (struct GlobalMemory *) G_MALLOC(sizeof(struct GlobalMemory));
+ Global->t_in_fac = (double *) G_MALLOC(P*sizeof(double));
+ Global->t_in_mod = (double *) G_MALLOC(P*sizeof(double));
+ Global->t_in_solve = (double *) G_MALLOC(P*sizeof(double));
+ Global->t_in_bar = (double *) G_MALLOC(P*sizeof(double));
+ Global->completion = (double *) G_MALLOC(P*sizeof(double));
+
+ if (Global == NULL) {
+ printerr("Could not malloc memory for Global\n");
+ exit(-1);
+ } else if (Global->t_in_fac == NULL) {
+ printerr("Could not malloc memory for Global->t_in_fac\n");
+ exit(-1);
+ } else if (Global->t_in_mod == NULL) {
+ printerr("Could not malloc memory for Global->t_in_mod\n");
+ exit(-1);
+ } else if (Global->t_in_solve == NULL) {
+ printerr("Could not malloc memory for Global->t_in_solve\n");
+ exit(-1);
+ } else if (Global->t_in_bar == NULL) {
+ printerr("Could not malloc memory for Global->t_in_bar\n");
+ exit(-1);
+ } else if (Global->completion == NULL) {
+ printerr("Could not malloc memory for Global->completion\n");
+ exit(-1);
+ }
+
+/* POSSIBLE ENHANCEMENT: Here is where one might distribute the a
+ matrix data across physically distributed memories in a
+ round-robin fashion as desired. */
+
+ BARINIT(Global->start);
+ LOCKINIT(Global->idlock);
+ Global->id = 0;
+
+ for (i=1; i<P; i++) {
+ CREATE(SlaveStart,thread_array[i])
+ }
+
+ InitA(rhs);
+ if (doprint) {
+ diag_printf("Matrix before decomposition:\n");
+ PrintA();
+ }
+
+
+ SlaveStart(MyNum);
+
+ for (i=1; i<P; i++) {
+ while(thread_array[i]) {};
+ }
+
+ if (doprint) {
+ diag_printf("\nMatrix after decomposition:\n");
+ PrintA();
+ }
+
+ if (dostats) {
+ maxt = avgt = mint = Global->completion[0];
+ for (i=1; i<P; i++) {
+ if (Global->completion[i] > maxt) {
+ maxt = Global->completion[i];
+ }
+ if (Global->completion[i] < mint) {
+ mint = Global->completion[i];
+ }
+ avgt += Global->completion[i];
+ }
+ avgt = avgt / P;
+
+ min_fac = max_fac = avg_fac = Global->t_in_fac[0];
+ min_solve = max_solve = avg_solve = Global->t_in_solve[0];
+ min_mod = max_mod = avg_mod = Global->t_in_mod[0];
+ min_bar = max_bar = avg_bar = Global->t_in_bar[0];
+
+ for (i=1; i<P; i++) {
+ if (Global->t_in_fac[i] > max_fac) {
+ max_fac = Global->t_in_fac[i];
+ }
+ if (Global->t_in_fac[i] < min_fac) {
+ min_fac = Global->t_in_fac[i];
+ }
+ if (Global->t_in_solve[i] > max_solve) {
+ max_solve = Global->t_in_solve[i];
+ }
+ if (Global->t_in_solve[i] < min_solve) {
+ min_solve = Global->t_in_solve[i];
+ }
+ if (Global->t_in_mod[i] > max_mod) {
+ max_mod = Global->t_in_mod[i];
+ }
+ if (Global->t_in_mod[i] < min_mod) {
+ min_mod = Global->t_in_mod[i];
+ }
+ if (Global->t_in_bar[i] > max_bar) {
+ max_bar = Global->t_in_bar[i];
+ }
+ if (Global->t_in_bar[i] < min_bar) {
+ min_bar = Global->t_in_bar[i];
+ }
+ avg_fac += Global->t_in_fac[i];
+ avg_solve += Global->t_in_solve[i];
+ avg_mod += Global->t_in_mod[i];
+ avg_bar += Global->t_in_bar[i];
+ }
+ avg_fac = avg_fac/P;
+ avg_solve = avg_solve/P;
+ avg_mod = avg_mod/P;
+ avg_bar = avg_bar/P;
+ }
+ diag_printf(" PROCESS STATISTICS\n");
+ diag_printf(" Total Diagonal Perimeter Interior Barrier\n");
+ diag_printf(" Proc Time Time Time Time Time\n");
+ diag_printf(" 0 %10.0f %10.0f %10.0f %10.0f %10.0f\n",
+ Global->completion[0],Global->t_in_fac[0],
+ Global->t_in_solve[0],Global->t_in_mod[0],
+ Global->t_in_bar[0]);
+ if (dostats) {
+ for (i=1; i<P; i++) {
+ diag_printf(" %3d %10.0f %10.0f %10.0f %10.0f %10.0f\n",
+ i,Global->completion[i],Global->t_in_fac[i],
+ Global->t_in_solve[i],Global->t_in_mod[i],
+ Global->t_in_bar[i]);
+ }
+ diag_printf(" Avg %10.0f %10.0f %10.0f %10.0f %10.0f\n",
+ avgt,avg_fac,avg_solve,avg_mod,avg_bar);
+ diag_printf(" Min %10.0f %10.0f %10.0f %10.0f %10.0f\n",
+ mint,min_fac,min_solve,min_mod,min_bar);
+ diag_printf(" Max %10.0f %10.0f %10.0f %10.0f %10.0f\n",
+ maxt,max_fac,max_solve,max_mod,max_bar);
+ }
+ diag_printf("\n");
+ Global->starttime = start;
+ diag_printf(" TIMING INFORMATION\n");
+ diag_printf("Start time : %16d\n",
+ Global->starttime);
+ diag_printf("Initialization finish time : %16d\n",
+ Global->rs);
+ diag_printf("Overall finish time : %16d\n",
+ Global->rf);
+ diag_printf("Total time with initialization : %16d\n",
+ Global->rf-Global->starttime);
+ diag_printf("Total time without initialization : %16d\n",
+ Global->rf-Global->rs);
+ diag_printf("\n");
+
+ if (test_result) {
+ diag_printf(" TESTING RESULTS\n");
+ CheckResult(n, a, rhs);
+ }
+
+ MAIN_END;
+}
+
+void SlaveStart()
+
+{
+ int i;
+ int j;
+ int cluster;
+ int max_block;
+ int MyNum;
+
+ LOCK(Global->idlock)
+ MyNum = Global->id;
+ Global->id ++;
+ UNLOCK(Global->idlock)
+
+/* POSSIBLE ENHANCEMENT: Here is where one might pin processes to
+ processors to avoid migration */
+
+ diag_printf("Slave start\n");
+
+ OneSolve(n, block_size, a, MyNum, dostats);
+
+ thread_array[MyNum] = 0;
+
+}
+
+
+void OneSolve(n, block_size, a, MyNum, dostats)
+
+double *a;
+int n;
+int block_size;
+int MyNum;
+int dostats;
+
+{
+ unsigned int i;
+ unsigned int myrs;
+ unsigned int myrf;
+ unsigned int mydone;
+ struct LocalCopies *lc;
+
+ lc = (struct LocalCopies *) malloc(sizeof(struct LocalCopies));
+ if (lc == NULL) {
+ diag_printf("Proc %d could not malloc memory for lc\n",MyNum);
+ exit(-1);
+ }
+ lc->t_in_fac = 0.0;
+ lc->t_in_solve = 0.0;
+ lc->t_in_mod = 0.0;
+ lc->t_in_bar = 0.0;
+
+ /* barrier to ensure all initialization is done */
+ BARRIER(Global->start, P, 1);
+
+ /* to remove cold-start misses, all processors begin by touching a[] */
+ TouchA(block_size, MyNum);
+
+ BARRIER(Global->start, P, 2);
+
+/* POSSIBLE ENHANCEMENT: Here is where one might reset the
+ statistics that one is measuring about the parallel execution */
+
+ if ((MyNum == 0) || (dostats)) {
+ CLOCK(myrs);
+ }
+
+ diag_printf("Start calc\n");
+ lu(n, block_size, MyNum, lc, dostats);
+ diag_printf("End calc\n");
+
+ if ((MyNum == 0) || (dostats)) {
+ CLOCK(mydone);
+ }
+
+ BARRIER(Global->start, P, 3);
+
+ if ((MyNum == 0) || (dostats)) {
+ CLOCK(myrf);
+ Global->t_in_fac[MyNum] = lc->t_in_fac;
+ Global->t_in_solve[MyNum] = lc->t_in_solve;
+ Global->t_in_mod[MyNum] = lc->t_in_mod;
+ Global->t_in_bar[MyNum] = lc->t_in_bar;
+ Global->completion[MyNum] = mydone-myrs;
+ }
+ if (MyNum == 0) {
+ Global->rs = myrs;
+ Global->done = mydone;
+ Global->rf = myrf;
+ }
+}
+
+
+void lu0(a, n, stride)
+
+double *a;
+int n;
+int stride;
+
+{
+ int j;
+ int k;
+ int length;
+ double alpha;
+
+ for (k=0; k<n; k++) {
+ /* modify subsequent columns */
+ for (j=k+1; j<n; j++) {
+ a[k+j*stride] /= a[k+k*stride];
+ alpha = -a[k+j*stride];
+ length = n-k-1;
+ daxpy(&a[k+1+j*stride], &a[k+1+k*stride], n-k-1, alpha);
+ }
+ }
+}
+
+
+void bdiv(a, diag, stride_a, stride_diag, dimi, dimk)
+
+double *a;
+double *diag;
+int stride_a;
+int stride_diag;
+int dimi;
+int dimk;
+
+{
+ int j;
+ int k;
+ double alpha;
+
+ for (k=0; k<dimk; k++) {
+ for (j=k+1; j<dimk; j++) {
+ alpha = -diag[k+j*stride_diag];
+ daxpy(&a[j*stride_a], &a[k*stride_a], dimi, alpha);
+ }
+ }
+}
+
+
+void bmodd(a, c, dimi, dimj, stride_a, stride_c)
+
+double *a;
+double *c;
+int dimi;
+int dimj;
+int stride_a;
+int stride_c;
+
+{
+ int i;
+ int j;
+ int k;
+ int length;
+ double alpha;
+
+ for (k=0; k<dimi; k++)
+ for (j=0; j<dimj; j++) {
+ c[k+j*stride_c] /= a[k+k*stride_a];
+ alpha = -c[k+j*stride_c];
+ length = dimi - k - 1;
+ daxpy(&c[k+1+j*stride_c], &a[k+1+k*stride_a], dimi-k-1, alpha);
+ }
+}
+
+
+void bmod(a, b, c, dimi, dimj, dimk, stride)
+
+double *a;
+double *b;
+double *c;
+int dimi;
+int dimj;
+int dimk;
+int stride;
+
+{
+ int i;
+ int j;
+ int k;
+ double alpha;
+
+ for (k=0; k<dimk; k++) {
+ for (j=0; j<dimj; j++) {
+ alpha = -b[k+j*stride];
+ daxpy(&c[j*stride], &a[k*stride], dimi, alpha);
+ }
+ }
+}
+
+
+void daxpy(a, b, n, alpha)
+
+double *a;
+double *b;
+double alpha;
+int n;
+
+{
+ int i;
+
+ for (i=0; i<n; i++) {
+ a[i] += alpha*b[i];
+ }
+}
+
+
+int BlockOwner(I, J)
+
+int I;
+int J;
+
+{
+ return((I%num_cols) + (J%num_rows)*num_cols);
+}
+
+
+void lu(n, bs, MyNum, lc, dostats)
+
+int n;
+int bs;
+int MyNum;
+struct LocalCopies *lc;
+int dostats;
+
+{
+ int i, il, j, jl, k, kl;
+ int I, J, K;
+ double *A, *B, *C, *D;
+ int dimI, dimJ, dimK;
+ int strI;
+ unsigned int t1, t2, t3, t4, t11, t22;
+ int diagowner;
+ int colowner;
+
+ strI = n;
+ for (k=0, K=0; k<n; k+=bs, K++) {
+ kl = k+bs;
+ if (kl>n) {
+ kl = n;
+ }
+
+ if ((MyNum == 0) || (dostats)) {
+ CLOCK(t1);
+ }
+
+ /* factor diagonal block */
+ diagowner = BlockOwner(K, K);
+ if (diagowner == MyNum) {
+ A = &(a[k+k*n]);
+ lu0(A, kl-k, strI);
+ }
+
+ if ((MyNum == 0) || (dostats)) {
+ CLOCK(t11);
+ }
+
+ BARRIER(Global->start, P, 4);
+
+ if ((MyNum == 0) || (dostats)) {
+ CLOCK(t2);
+ }
+
+ /* divide column k by diagonal block */
+ D = &(a[k+k*n]);
+ for (i=kl, I=K+1; i<n; i+=bs, I++) {
+ if (BlockOwner(I, K) == MyNum) { /* parcel out blocks */
+ il = i + bs;
+ if (il > n) {
+ il = n;
+ }
+ A = &(a[i+k*n]);
+ bdiv(A, D, strI, n, il-i, kl-k);
+ }
+ }
+ /* modify row k by diagonal block */
+ for (j=kl, J=K+1; j<n; j+=bs, J++) {
+ if (BlockOwner(K, J) == MyNum) { /* parcel out blocks */
+ jl = j+bs;
+ if (jl > n) {
+ jl = n;
+ }
+ A = &(a[k+j*n]);
+ bmodd(D, A, kl-k, jl-j, n, strI);
+ }
+ }
+
+ if ((MyNum == 0) || (dostats)) {
+ CLOCK(t22);
+ }
+
+ BARRIER(Global->start, P, 5);
+
+ if ((MyNum == 0) || (dostats)) {
+ CLOCK(t3);
+ }
+
+ /* modify subsequent block columns */
+ for (i=kl, I=K+1; i<n; i+=bs, I++) {
+ il = i+bs;
+ if (il > n) {
+ il = n;
+ }
+ colowner = BlockOwner(I,K);
+ A = &(a[i+k*n]);
+ for (j=kl, J=K+1; j<n; j+=bs, J++) {
+ jl = j + bs;
+ if (jl > n) {
+ jl = n;
+ }
+ if (BlockOwner(I, J) == MyNum) { /* parcel out blocks */
+ B = &(a[k+j*n]);
+ C = &(a[i+j*n]);
+ bmod(A, B, C, il-i, jl-j, kl-k, n);
+ }
+ }
+ }
+ if ((MyNum == 0) || (dostats)) {
+ CLOCK(t4);
+ lc->t_in_fac += (t11-t1);
+ lc->t_in_solve += (t22-t2);
+ lc->t_in_mod += (t4-t3);
+ lc->t_in_bar += (t2-t11) + (t3-t22);
+ }
+ }
+}
+
+
+void InitA(rhs)
+
+double *rhs;
+
+{
+ int i, j;
+
+ srand(1);
+ for (j=0; j<n; j++) {
+ for (i=0; i<n; i++) {
+ a[i+j*n] = (double) rand()/MAXRAND;
+ if (i == j) {
+ a[i+j*n] *= 10;
+ }
+ }
+ }
+
+ for (j=0; j<n; j++) {
+ rhs[j] = 0.0;
+ }
+ for (j=0; j<n; j++) {
+ for (i=0; i<n; i++) {
+ rhs[i] += a[i+j*n];
+ }
+ }
+}
+
+
+double TouchA(bs, MyNum)
+
+int bs;
+int MyNum;
+
+{
+ int i, j, I, J;
+ double tot = 0.0;
+
+ for (J=0; J*bs<n; J++) {
+ for (I=0; I*bs<n; I++) {
+ if (BlockOwner(I, J) == MyNum) {
+ for (j=J*bs; j<(J+1)*bs && j<n; j++) {
+ for (i=I*bs; i<(I+1)*bs && i<n; i++) {
+ tot += a[i+j*n];
+ }
+ }
+ }
+ }
+ }
+ return(tot);
+}
+
+
+void PrintA()
+{
+ int i, j;
+
+ for (i=0; i<n; i++) {
+ for (j=0; j<n; j++) {
+ diag_printf("%8.1f ", a[i+j*n]);
+ }
+ diag_printf("\n");
+ }
+}
+
+
+void CheckResult(n, a, rhs)
+
+int n;
+double *a;
+double *rhs;
+
+{
+ int i, j, bogus = 0;
+ double *y, diff, max_diff;
+
+ y = (double *) malloc(n*sizeof(double));
+ if (y == NULL) {
+ printerr("Could not malloc memory for y\n");
+ exit(-1);
+ }
+ for (j=0; j<n; j++) {
+ y[j] = rhs[j];
+ }
+ for (j=0; j<n; j++) {
+ y[j] = y[j]/a[j+j*n];
+ for (i=j+1; i<n; i++) {
+ y[i] -= a[i+j*n]*y[j];
+ }
+ }
+
+ for (j=n-1; j>=0; j--) {
+ for (i=0; i<j; i++) {
+ y[i] -= a[i+j*n]*y[j];
+ }
+ }
+
+ max_diff = 0.0;
+ for (j=0; j<n; j++) {
+ diff = y[j] - 1.0;
+ if (fabs(diff) > 0.00001) {
+ bogus = 1;
+ max_diff = diff;
+ }
+ }
+ if (bogus) {
+ diag_printf("TEST FAILED: (%.5f diff)\n", max_diff);
+ } else {
+ diag_printf("TEST PASSED\n");
+ }
+ free(y);
+}
+
+
+void printerr(s)
+
+char *s;
+
+{
+ diag_printf("ERROR: %s\n",s);
+}
+
+
+#include <tests_smp/getopt.c>
diff --git a/cesar/ecos/packages/kernel/current/tests_smp/lu.c b/cesar/ecos/packages/kernel/current/tests_smp/lu.c
new file mode 100644
index 0000000000..9527c41ab3
--- /dev/null
+++ b/cesar/ecos/packages/kernel/current/tests_smp/lu.c
@@ -0,0 +1,1118 @@
+
+
+
+
+
+
+
+
+
+
+
+/*************************************************************************/
+/* */
+/* Copyright (c) 1994 Stanford University */
+/* */
+/* All rights reserved. */
+/* */
+/* Permission is given to use, copy, and modify this software for any */
+/* non-commercial purpose as long as this copyright notice is not */
+/* removed. All other uses, including redistribution in whole or in */
+/* part, are forbidden without prior written permission. */
+/* */
+/* This software is provided with absolutely no warranty and no */
+/* support. */
+/* */
+/*************************************************************************/
+
+/*************************************************************************/
+/* */
+/* Parallel dense blocked LU factorization (no pivoting) */
+/* */
+/* This version contains one dimensional arrays in which the matrix */
+/* to be factored is stored. */
+/* */
+/* Command line options: */
+/* */
+/* -nN : Decompose NxN matrix. */
+/* -pP : P = number of processors. */
+/* -bB : Use a block size of B. BxB elements should fit in cache for */
+/* good performance. Small block sizes (B=8, B=16) work well. */
+/* -s : Print individual processor timing statistics. */
+/* -t : Test output. */
+/* -o : Print out matrix values. */
+/* -h : Print out command line options. */
+/* */
+/* Note: This version works under both the FORK and SPROC models */
+/* */
+/*************************************************************************/
+
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include <tests_smp/getopt.h>
+
+#include <tests_smp/lu_arg.c>
+
+
+/*---------------------------------------*/
+#include <pkgconf/system.h>
+#include <pkgconf/infra.h>
+#include <pkgconf/kernel.h>
+#include <cyg/infra/cyg_type.h>
+#include <cyg/infra/testcase.h>
+#include <pkgconf/isoinfra.h>
+#include <cyg/hal/hal_cache.h>
+#include <cyg/hal/hal_smp.h>
+#include <cyg/kernel/kapi.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+int smp_cyg_test_main(int argc, char **argv);
+
+void smp_cyg_test_main_call(void *p) {
+ smp_cyg_test_main(smp_cyg_test_argc, smp_cyg_test_argv);
+}
+
+
+/* #define STACK_SIZE 0x1b000 */
+#define STACK_SIZE 8192
+static cyg_thread smp_cyg_test_thread;
+static char smp_cyg_test_stack[STACK_SIZE];
+static cyg_handle_t smp_cyg_test_threadh;
+
+
+
+cyg_handle_t threadh[64];
+char *stack[64];
+cyg_thread thread[64];
+
+
+externC void cyg_user_start( void )
+{
+ CYG_TEST_INIT();
+
+ diag_printf("Starting test app\n");
+
+ cyg_thread_create(10, // Priority - just a number
+ smp_cyg_test_main_call, // entry
+ 0, // index
+ "smp test", // Name
+ smp_cyg_test_stack, // Stack
+ STACK_SIZE, // Size
+ &smp_cyg_test_threadh, // Handle
+ &smp_cyg_test_thread // Thread data structure
+ );
+ cyg_thread_resume( smp_cyg_test_threadh );
+ /*cyg_scheduler_start();*/
+}
+
+
+/*---------------------------------------*/
+
+
+#define MAXRAND 2147483647
+#define DEFAULT_N 128
+#define DEFAULT_P 1
+#define DEFAULT_B 16
+#define min(a,b) ((a) < (b) ? (a) : (b))
+
+struct GlobalMemory {
+ double *t_in_fac;
+ double *t_in_solve;
+ double *t_in_mod;
+ double *t_in_bar;
+ double *completion;
+ unsigned int starttime;
+ unsigned int rf;
+ unsigned int rs;
+ unsigned int done;
+ int id;
+
+/*---------------------------------------*/
+struct startTYP {
+ cyg_spinlock_t lock;;
+ int count[1];
+ cyg_sem_t queue[1];;
+
+ } start[1];
+/*---------------------------------------*/
+
+
+ cyg_spinlock_t idlock;
+} *Global;
+
+struct LocalCopies {
+ double t_in_fac;
+ double t_in_solve;
+ double t_in_mod;
+ double t_in_bar;
+};
+
+int dbg_on = 0;
+int n = DEFAULT_N; /* The size of the matrix */
+int P = DEFAULT_P; /* Number of processors */
+int block_size = DEFAULT_B; /* Block dimension */
+int nblocks; /* Number of blocks in each dimension */
+int num_rows; /* Number of processors per row of processor grid */
+int num_cols; /* Number of processors per col of processor grid */
+double *a; /* a = lu; l and u both placed back in a */
+double *rhs;
+int *proc_bytes; /* Bytes to malloc per processor to hold blocks
+ of A*/
+int test_result = 0; /* Test result of factorization? */
+int doprint = 0; /* Print out matrix values? */
+int dostats = 0; /* Print out individual processor statistics? */
+
+void SlaveStart();
+void OneSolve(int, int, double *, int, int);
+void lu0(double *,int, int);
+void bdiv(double *, double *, int, int, int, int);
+void bmodd(double *, double*, int, int, int, int);
+void bmod(double *, double *, double *, int, int, int, int);
+void daxpy(double *, double *, int, double);
+int BlockOwner(int, int);
+void lu(int, int, int, struct LocalCopies *, int);
+void InitA(double *);
+double TouchA(int, int);
+void PrintA();
+void CheckResult(int, double *, double *);
+void printerr(char *);
+volatile cyg_thread *thread_array[64];
+
+
+int smp_cyg_test_main(argc, argv)
+
+int argc;
+char **argv;
+
+{
+ int i, j;
+ int ch;
+ extern char *optarg;
+ int MyNum=0;
+ double mint, maxt, avgt;
+ double min_fac, min_solve, min_mod, min_bar;
+ double max_fac, max_solve, max_mod, max_bar;
+ double avg_fac, avg_solve, avg_mod, avg_bar;
+ int proc_num;
+ unsigned int start;
+
+ memset (thread_array,0,sizeof(thread_array));
+
+ {
+ (start) = cyg_current_time()*10;
+}
+;
+
+ while ((ch = getopt(argc, argv, "n:p:b:cstoh")) != -1) {
+ switch(ch) {
+ case 'n': n = atoi(optarg); break;
+ case 'p': P = atoi(optarg);
+ P = HAL_SMP_CPU_MAX; break;
+ case 'b': block_size = atoi(optarg); break;
+ case 's': dostats = 1; break;
+ case 't': test_result = !test_result; break;
+ case 'o': doprint = !doprint; break;
+ case 'h': diag_printf("Usage: LU <options>\n\n");
+ diag_printf("options:\n");
+ diag_printf(" -nN : Decompose NxN matrix.\n");
+ diag_printf(" -pP : P = number of processors.\n");
+ diag_printf(" -bB : Use a block size of B. BxB elements should fit in cache for \n");
+ diag_printf(" good performance. Small block sizes (B=8, B=16) work well.\n");
+ diag_printf(" -c : Copy non-locally allocated blocks to local memory before use.\n");
+ diag_printf(" -s : Print individual processor timing statistics.\n");
+ diag_printf(" -t : Test output.\n");
+ diag_printf(" -o : Print out matrix values.\n");
+ diag_printf(" -h : Print out command line options.\n\n");
+ diag_printf("Default: LU -n%1d -p%1d -b%1d\n",
+ DEFAULT_N,DEFAULT_P,DEFAULT_B);
+ exit(0);
+ break;
+ }
+ }
+ if (P > 64) {
+ printf("Maximal 64 processes\n");
+ exit(0);
+ }
+
+ {;}
+
+ diag_printf("\n");
+ diag_printf("Blocked Dense LU Factorization\n");
+ diag_printf(" %d by %d Matrix\n",n,n);
+ diag_printf(" %d Processors\n",P);
+ diag_printf(" %d by %d Element Blocks\n",block_size,block_size);
+ diag_printf("\n");
+ diag_printf("\n");
+
+ num_rows = (int) sqrt((double) P);
+ for (;;) {
+ num_cols = P/num_rows;
+ if (num_rows*num_cols == P)
+ break;
+ num_rows--;
+ }
+ nblocks = n/block_size;
+ if (block_size * nblocks != n) {
+ nblocks++;
+ }
+
+ a = (double *) calloc(n*n*sizeof(double), 1);;
+ rhs = (double *) calloc(n*sizeof(double), 1);;
+
+ Global = (struct GlobalMemory *) calloc(sizeof(struct GlobalMemory), 1);;
+ Global->t_in_fac = (double *) calloc(P*sizeof(double), 1);;
+ Global->t_in_mod = (double *) calloc(P*sizeof(double), 1);;
+ Global->t_in_solve = (double *) calloc(P*sizeof(double), 1);;
+ Global->t_in_bar = (double *) calloc(P*sizeof(double), 1);;
+ Global->completion = (double *) calloc(P*sizeof(double), 1);;
+
+ if (Global == NULL) {
+ printerr("Could not malloc memory for Global\n");
+ exit(-1);
+ } else if (Global->t_in_fac == NULL) {
+ printerr("Could not malloc memory for Global->t_in_fac\n");
+ exit(-1);
+ } else if (Global->t_in_mod == NULL) {
+ printerr("Could not malloc memory for Global->t_in_mod\n");
+ exit(-1);
+ } else if (Global->t_in_solve == NULL) {
+ printerr("Could not malloc memory for Global->t_in_solve\n");
+ exit(-1);
+ } else if (Global->t_in_bar == NULL) {
+ printerr("Could not malloc memory for Global->t_in_bar\n");
+ exit(-1);
+ } else if (Global->completion == NULL) {
+ printerr("Could not malloc memory for Global->completion\n");
+ exit(-1);
+ }
+
+/* POSSIBLE ENHANCEMENT: Here is where one might distribute the a
+ matrix data across physically distributed memories in a
+ round-robin fashion as desired. */
+
+ {{
+/*---------------------------------------*/
+ int mon_dum1,mon_dum2;
+ for (mon_dum1=0; mon_dum1 < 1; mon_dum1++)
+ for (mon_dum2=0; mon_dum2 < 1; mon_dum2++) {
+ Global->start[mon_dum1].count[mon_dum2] = 0;
+ { cyg_semaphore_init(&Global->start[mon_dum1].queue[mon_dum2],0);};
+ }
+ for (mon_dum1=0; mon_dum1 < 1; mon_dum1++) {
+ cyg_spinlock_init( &Global->start[mon_dum1].lock,0 );;
+ }
+/*---------------------------------------*/
+}};
+ cyg_spinlock_init( &Global->idlock,0 );;
+ Global->id = 0;
+
+ for (i=1; i<P; i++) {
+
+{
+ /*---------------------------------------*/
+ stack[i] = calloc(STACK_SIZE, 1);
+ cyg_thread_create(10, // Priority - just a number
+ SlaveStart, // entry
+ 0, // index
+ "slave", // Name
+ stack[i], // Stack
+ STACK_SIZE, // Size
+ &threadh[i], // Handle
+ &thread[i] // Thread data structure
+ );
+ cyg_thread_resume( threadh[i] );
+ /*---------------------------------------*/
+}
+
+ }
+
+ InitA(rhs);
+ if (doprint) {
+ diag_printf("Matrix before decomposition:\n");
+ PrintA();
+ }
+
+
+ SlaveStart(MyNum);
+
+ for (i=1; i<P; i++) {
+ while(thread_array[i]) {};
+ }
+
+ if (doprint) {
+ diag_printf("\nMatrix after decomposition:\n");
+ PrintA();
+ }
+
+ if (dostats) {
+ maxt = avgt = mint = Global->completion[0];
+ for (i=1; i<P; i++) {
+ if (Global->completion[i] > maxt) {
+ maxt = Global->completion[i];
+ }
+ if (Global->completion[i] < mint) {
+ mint = Global->completion[i];
+ }
+ avgt += Global->completion[i];
+ }
+ avgt = avgt / P;
+
+ min_fac = max_fac = avg_fac = Global->t_in_fac[0];
+ min_solve = max_solve = avg_solve = Global->t_in_solve[0];
+ min_mod = max_mod = avg_mod = Global->t_in_mod[0];
+ min_bar = max_bar = avg_bar = Global->t_in_bar[0];
+
+ for (i=1; i<P; i++) {
+ if (Global->t_in_fac[i] > max_fac) {
+ max_fac = Global->t_in_fac[i];
+ }
+ if (Global->t_in_fac[i] < min_fac) {
+ min_fac = Global->t_in_fac[i];
+ }
+ if (Global->t_in_solve[i] > max_solve) {
+ max_solve = Global->t_in_solve[i];
+ }
+ if (Global->t_in_solve[i] < min_solve) {
+ min_solve = Global->t_in_solve[i];
+ }
+ if (Global->t_in_mod[i] > max_mod) {
+ max_mod = Global->t_in_mod[i];
+ }
+ if (Global->t_in_mod[i] < min_mod) {
+ min_mod = Global->t_in_mod[i];
+ }
+ if (Global->t_in_bar[i] > max_bar) {
+ max_bar = Global->t_in_bar[i];
+ }
+ if (Global->t_in_bar[i] < min_bar) {
+ min_bar = Global->t_in_bar[i];
+ }
+ avg_fac += Global->t_in_fac[i];
+ avg_solve += Global->t_in_solve[i];
+ avg_mod += Global->t_in_mod[i];
+ avg_bar += Global->t_in_bar[i];
+ }
+ avg_fac = avg_fac/P;
+ avg_solve = avg_solve/P;
+ avg_mod = avg_mod/P;
+ avg_bar = avg_bar/P;
+ }
+ diag_printf(" PROCESS STATISTICS\n");
+ diag_printf(" Total Diagonal Perimeter Interior Barrier\n");
+ diag_printf(" Proc Time Time Time Time Time\n");
+ diag_printf(" 0 %10.0f %10.0f %10.0f %10.0f %10.0f\n",
+ Global->completion[0],Global->t_in_fac[0],
+ Global->t_in_solve[0],Global->t_in_mod[0],
+ Global->t_in_bar[0]);
+ if (dostats) {
+ for (i=1; i<P; i++) {
+ diag_printf(" %3d %10.0f %10.0f %10.0f %10.0f %10.0f\n",
+ i,Global->completion[i],Global->t_in_fac[i],
+ Global->t_in_solve[i],Global->t_in_mod[i],
+ Global->t_in_bar[i]);
+ }
+ diag_printf(" Avg %10.0f %10.0f %10.0f %10.0f %10.0f\n",
+ avgt,avg_fac,avg_solve,avg_mod,avg_bar);
+ diag_printf(" Min %10.0f %10.0f %10.0f %10.0f %10.0f\n",
+ mint,min_fac,min_solve,min_mod,min_bar);
+ diag_printf(" Max %10.0f %10.0f %10.0f %10.0f %10.0f\n",
+ maxt,max_fac,max_solve,max_mod,max_bar);
+ }
+ diag_printf("\n");
+ Global->starttime = start;
+ diag_printf(" TIMING INFORMATION\n");
+ diag_printf("Start time : %16d\n",
+ Global->starttime);
+ diag_printf("Initialization finish time : %16d\n",
+ Global->rs);
+ diag_printf("Overall finish time : %16d\n",
+ Global->rf);
+ diag_printf("Total time with initialization : %16d\n",
+ Global->rf-Global->starttime);
+ diag_printf("Total time without initialization : %16d\n",
+ Global->rf-Global->rs);
+ diag_printf("\n");
+
+ if (test_result) {
+ diag_printf(" TESTING RESULTS\n");
+ CheckResult(n, a, rhs);
+ }
+
+
+{
+ diag_printf("FP test end\n");
+ return 0;
+}
+;
+}
+
+void SlaveStart()
+
+{
+ int i;
+ int j;
+ int cluster;
+ int max_block;
+ int MyNum;
+
+ cyg_spinlock_spin(&Global->idlock);
+ MyNum = Global->id;
+ Global->id ++;
+ cyg_spinlock_clear(&Global->idlock);
+
+/* POSSIBLE ENHANCEMENT: Here is where one might pin processes to
+ processors to avoid migration */
+
+ diag_printf("Slave start\n");
+
+ OneSolve(n, block_size, a, MyNum, dostats);
+
+ thread_array[MyNum] = 0;
+
+}
+
+
+void OneSolve(n, block_size, a, MyNum, dostats)
+
+double *a;
+int n;
+int block_size;
+int MyNum;
+int dostats;
+
+{
+ unsigned int i;
+ unsigned int myrs;
+ unsigned int myrf;
+ unsigned int mydone;
+ struct LocalCopies *lc;
+
+ lc = (struct LocalCopies *) malloc(sizeof(struct LocalCopies));
+ if (lc == NULL) {
+ diag_printf("Proc %d could not malloc memory for lc\n",MyNum);
+ exit(-1);
+ }
+ lc->t_in_fac = 0.0;
+ lc->t_in_solve = 0.0;
+ lc->t_in_mod = 0.0;
+ lc->t_in_bar = 0.0;
+
+ /* barrier to ensure all initialization is done */
+ {
+/*---------------------------------*/
+ cyg_spinlock_spin(&( Global->start[0].lock ) );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: >b (%d:%d:%d): Enter barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),1,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ if ( Global->start[0].count[0] < (P -1) ) {
+ Global->start[0].count[0]++;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ?b (%d:%d:%d): Wait\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),1,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ cyg_spinlock_clear(&Global->start[0].lock );;
+ { cyg_semaphore_wait(&Global->start[0].queue[0] );};
+ }
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: <b (%d:%d:%d): Exit barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),1,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ if ( Global->start[0].count[0] == 0 ) {
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ^b (%d:%d:%d): unlock barrier \n-----------\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),1,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+
+ cyg_spinlock_clear(&Global->start[0].lock );;
+
+ } else {
+ ( Global->start[0].count[0] )-- ;
+ //cyg_spinlock_clear(&Global->start[0].count_lock[0] );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: @b (%d:%d:%d): unlock queue\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),1,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ { cyg_semaphore_post(&Global->start[0].queue[0] );};
+ }
+
+/*---------------------------------*/
+};
+
+ /* to remove cold-start misses, all processors begin by touching a[] */
+ TouchA(block_size, MyNum);
+
+ {
+/*---------------------------------*/
+ cyg_spinlock_spin(&( Global->start[0].lock ) );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: >b (%d:%d:%d): Enter barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),2,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ if ( Global->start[0].count[0] < (P -1) ) {
+ Global->start[0].count[0]++;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ?b (%d:%d:%d): Wait\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),2,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ cyg_spinlock_clear(&Global->start[0].lock );;
+ { cyg_semaphore_wait(&Global->start[0].queue[0] );};
+ }
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: <b (%d:%d:%d): Exit barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),2,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ if ( Global->start[0].count[0] == 0 ) {
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ^b (%d:%d:%d): unlock barrier \n-----------\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),2,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+
+ cyg_spinlock_clear(&Global->start[0].lock );;
+
+ } else {
+ ( Global->start[0].count[0] )-- ;
+ //cyg_spinlock_clear(&Global->start[0].count_lock[0] );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: @b (%d:%d:%d): unlock queue\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),2,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ { cyg_semaphore_post(&Global->start[0].queue[0] );};
+ }
+
+/*---------------------------------*/
+};
+
+/* POSSIBLE ENHANCEMENT: Here is where one might reset the
+ statistics that one is measuring about the parallel execution */
+
+ if ((MyNum == 0) || (dostats)) {
+ {
+ (myrs) = cyg_current_time()*10;
+}
+;
+ }
+
+ diag_printf("Start calc\n");
+ lu(n, block_size, MyNum, lc, dostats);
+ diag_printf("End calc\n");
+
+ if ((MyNum == 0) || (dostats)) {
+ {
+ (mydone) = cyg_current_time()*10;
+}
+;
+ }
+
+ {
+/*---------------------------------*/
+ cyg_spinlock_spin(&( Global->start[0].lock ) );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: >b (%d:%d:%d): Enter barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),3,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ if ( Global->start[0].count[0] < (P -1) ) {
+ Global->start[0].count[0]++;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ?b (%d:%d:%d): Wait\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),3,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ cyg_spinlock_clear(&Global->start[0].lock );;
+ { cyg_semaphore_wait(&Global->start[0].queue[0] );};
+ }
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: <b (%d:%d:%d): Exit barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),3,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ if ( Global->start[0].count[0] == 0 ) {
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ^b (%d:%d:%d): unlock barrier \n-----------\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),3,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+
+ cyg_spinlock_clear(&Global->start[0].lock );;
+
+ } else {
+ ( Global->start[0].count[0] )-- ;
+ //cyg_spinlock_clear(&Global->start[0].count_lock[0] );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: @b (%d:%d:%d): unlock queue\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),3,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ { cyg_semaphore_post(&Global->start[0].queue[0] );};
+ }
+
+/*---------------------------------*/
+};
+
+ if ((MyNum == 0) || (dostats)) {
+ {
+ (myrf) = cyg_current_time()*10;
+}
+;
+ Global->t_in_fac[MyNum] = lc->t_in_fac;
+ Global->t_in_solve[MyNum] = lc->t_in_solve;
+ Global->t_in_mod[MyNum] = lc->t_in_mod;
+ Global->t_in_bar[MyNum] = lc->t_in_bar;
+ Global->completion[MyNum] = mydone-myrs;
+ }
+ if (MyNum == 0) {
+ Global->rs = myrs;
+ Global->done = mydone;
+ Global->rf = myrf;
+ }
+}
+
+
+void lu0(a, n, stride)
+
+double *a;
+int n;
+int stride;
+
+{
+ int j;
+ int k;
+ int length;
+ double alpha;
+
+ for (k=0; k<n; k++) {
+ /* modify subsequent columns */
+ for (j=k+1; j<n; j++) {
+ a[k+j*stride] /= a[k+k*stride];
+ alpha = -a[k+j*stride];
+ length = n-k-1;
+ daxpy(&a[k+1+j*stride], &a[k+1+k*stride], n-k-1, alpha);
+ }
+ }
+}
+
+
+void bdiv(a, diag, stride_a, stride_diag, dimi, dimk)
+
+double *a;
+double *diag;
+int stride_a;
+int stride_diag;
+int dimi;
+int dimk;
+
+{
+ int j;
+ int k;
+ double alpha;
+
+ for (k=0; k<dimk; k++) {
+ for (j=k+1; j<dimk; j++) {
+ alpha = -diag[k+j*stride_diag];
+ daxpy(&a[j*stride_a], &a[k*stride_a], dimi, alpha);
+ }
+ }
+}
+
+
+void bmodd(a, c, dimi, dimj, stride_a, stride_c)
+
+double *a;
+double *c;
+int dimi;
+int dimj;
+int stride_a;
+int stride_c;
+
+{
+ int i;
+ int j;
+ int k;
+ int length;
+ double alpha;
+
+ for (k=0; k<dimi; k++)
+ for (j=0; j<dimj; j++) {
+ c[k+j*stride_c] /= a[k+k*stride_a];
+ alpha = -c[k+j*stride_c];
+ length = dimi - k - 1;
+ daxpy(&c[k+1+j*stride_c], &a[k+1+k*stride_a], dimi-k-1, alpha);
+ }
+}
+
+
+void bmod(a, b, c, dimi, dimj, dimk, stride)
+
+double *a;
+double *b;
+double *c;
+int dimi;
+int dimj;
+int dimk;
+int stride;
+
+{
+ int i;
+ int j;
+ int k;
+ double alpha;
+
+ for (k=0; k<dimk; k++) {
+ for (j=0; j<dimj; j++) {
+ alpha = -b[k+j*stride];
+ daxpy(&c[j*stride], &a[k*stride], dimi, alpha);
+ }
+ }
+}
+
+
+void daxpy(a, b, n, alpha)
+
+double *a;
+double *b;
+double alpha;
+int n;
+
+{
+ int i;
+
+ for (i=0; i<n; i++) {
+ a[i] += alpha*b[i];
+ }
+}
+
+
+int BlockOwner(I, J)
+
+int I;
+int J;
+
+{
+ return((I%num_cols) + (J%num_rows)*num_cols);
+}
+
+
+void lu(n, bs, MyNum, lc, dostats)
+
+int n;
+int bs;
+int MyNum;
+struct LocalCopies *lc;
+int dostats;
+
+{
+ int i, il, j, jl, k, kl;
+ int I, J, K;
+ double *A, *B, *C, *D;
+ int dimI, dimJ, dimK;
+ int strI;
+ unsigned int t1, t2, t3, t4, t11, t22;
+ int diagowner;
+ int colowner;
+
+ strI = n;
+ for (k=0, K=0; k<n; k+=bs, K++) {
+ kl = k+bs;
+ if (kl>n) {
+ kl = n;
+ }
+
+ if ((MyNum == 0) || (dostats)) {
+ {
+ (t1) = cyg_current_time()*10;
+}
+;
+ }
+
+ /* factor diagonal block */
+ diagowner = BlockOwner(K, K);
+ if (diagowner == MyNum) {
+ A = &(a[k+k*n]);
+ lu0(A, kl-k, strI);
+ }
+
+ if ((MyNum == 0) || (dostats)) {
+ {
+ (t11) = cyg_current_time()*10;
+}
+;
+ }
+
+ {
+/*---------------------------------*/
+ cyg_spinlock_spin(&( Global->start[0].lock ) );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: >b (%d:%d:%d): Enter barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),4,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ if ( Global->start[0].count[0] < (P -1) ) {
+ Global->start[0].count[0]++;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ?b (%d:%d:%d): Wait\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),4,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ cyg_spinlock_clear(&Global->start[0].lock );;
+ { cyg_semaphore_wait(&Global->start[0].queue[0] );};
+ }
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: <b (%d:%d:%d): Exit barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),4,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ if ( Global->start[0].count[0] == 0 ) {
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ^b (%d:%d:%d): unlock barrier \n-----------\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),4,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+
+ cyg_spinlock_clear(&Global->start[0].lock );;
+
+ } else {
+ ( Global->start[0].count[0] )-- ;
+ //cyg_spinlock_clear(&Global->start[0].count_lock[0] );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: @b (%d:%d:%d): unlock queue\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),4,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ { cyg_semaphore_post(&Global->start[0].queue[0] );};
+ }
+
+/*---------------------------------*/
+};
+
+ if ((MyNum == 0) || (dostats)) {
+ {
+ (t2) = cyg_current_time()*10;
+}
+;
+ }
+
+ /* divide column k by diagonal block */
+ D = &(a[k+k*n]);
+ for (i=kl, I=K+1; i<n; i+=bs, I++) {
+ if (BlockOwner(I, K) == MyNum) { /* parcel out blocks */
+ il = i + bs;
+ if (il > n) {
+ il = n;
+ }
+ A = &(a[i+k*n]);
+ bdiv(A, D, strI, n, il-i, kl-k);
+ }
+ }
+ /* modify row k by diagonal block */
+ for (j=kl, J=K+1; j<n; j+=bs, J++) {
+ if (BlockOwner(K, J) == MyNum) { /* parcel out blocks */
+ jl = j+bs;
+ if (jl > n) {
+ jl = n;
+ }
+ A = &(a[k+j*n]);
+ bmodd(D, A, kl-k, jl-j, n, strI);
+ }
+ }
+
+ if ((MyNum == 0) || (dostats)) {
+ {
+ (t22) = cyg_current_time()*10;
+}
+;
+ }
+
+ {
+/*---------------------------------*/
+ cyg_spinlock_spin(&( Global->start[0].lock ) );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: >b (%d:%d:%d): Enter barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),5,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ if ( Global->start[0].count[0] < (P -1) ) {
+ Global->start[0].count[0]++;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ?b (%d:%d:%d): Wait\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),5,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ cyg_spinlock_clear(&Global->start[0].lock );;
+ { cyg_semaphore_wait(&Global->start[0].queue[0] );};
+ }
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: <b (%d:%d:%d): Exit barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),5,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ if ( Global->start[0].count[0] == 0 ) {
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ^b (%d:%d:%d): unlock barrier \n-----------\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),5,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+
+ cyg_spinlock_clear(&Global->start[0].lock );;
+
+ } else {
+ ( Global->start[0].count[0] )-- ;
+ //cyg_spinlock_clear(&Global->start[0].count_lock[0] );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: @b (%d:%d:%d): unlock queue\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),5,Global->start[0].count[0],Global->start[0].queue[0].count);
+ }
+ { cyg_semaphore_post(&Global->start[0].queue[0] );};
+ }
+
+/*---------------------------------*/
+};
+
+ if ((MyNum == 0) || (dostats)) {
+ {
+ (t3) = cyg_current_time()*10;
+}
+;
+ }
+
+ /* modify subsequent block columns */
+ for (i=kl, I=K+1; i<n; i+=bs, I++) {
+ il = i+bs;
+ if (il > n) {
+ il = n;
+ }
+ colowner = BlockOwner(I,K);
+ A = &(a[i+k*n]);
+ for (j=kl, J=K+1; j<n; j+=bs, J++) {
+ jl = j + bs;
+ if (jl > n) {
+ jl = n;
+ }
+ if (BlockOwner(I, J) == MyNum) { /* parcel out blocks */
+ B = &(a[k+j*n]);
+ C = &(a[i+j*n]);
+ bmod(A, B, C, il-i, jl-j, kl-k, n);
+ }
+ }
+ }
+ if ((MyNum == 0) || (dostats)) {
+ {
+ (t4) = cyg_current_time()*10;
+}
+;
+ lc->t_in_fac += (t11-t1);
+ lc->t_in_solve += (t22-t2);
+ lc->t_in_mod += (t4-t3);
+ lc->t_in_bar += (t2-t11) + (t3-t22);
+ }
+ }
+}
+
+
+void InitA(rhs)
+
+double *rhs;
+
+{
+ int i, j;
+
+ srand(1);
+ for (j=0; j<n; j++) {
+ for (i=0; i<n; i++) {
+ a[i+j*n] = (double) rand()/MAXRAND;
+ if (i == j) {
+ a[i+j*n] *= 10;
+ }
+ }
+ }
+
+ for (j=0; j<n; j++) {
+ rhs[j] = 0.0;
+ }
+ for (j=0; j<n; j++) {
+ for (i=0; i<n; i++) {
+ rhs[i] += a[i+j*n];
+ }
+ }
+}
+
+
+double TouchA(bs, MyNum)
+
+int bs;
+int MyNum;
+
+{
+ int i, j, I, J;
+ double tot = 0.0;
+
+ for (J=0; J*bs<n; J++) {
+ for (I=0; I*bs<n; I++) {
+ if (BlockOwner(I, J) == MyNum) {
+ for (j=J*bs; j<(J+1)*bs && j<n; j++) {
+ for (i=I*bs; i<(I+1)*bs && i<n; i++) {
+ tot += a[i+j*n];
+ }
+ }
+ }
+ }
+ }
+ return(tot);
+}
+
+
+void PrintA()
+{
+ int i, j;
+
+ for (i=0; i<n; i++) {
+ for (j=0; j<n; j++) {
+ diag_printf("%8.1f ", a[i+j*n]);
+ }
+ diag_printf("\n");
+ }
+}
+
+
+void CheckResult(n, a, rhs)
+
+int n;
+double *a;
+double *rhs;
+
+{
+ int i, j, bogus = 0;
+ double *y, diff, max_diff;
+
+ y = (double *) malloc(n*sizeof(double));
+ if (y == NULL) {
+ printerr("Could not malloc memory for y\n");
+ exit(-1);
+ }
+ for (j=0; j<n; j++) {
+ y[j] = rhs[j];
+ }
+ for (j=0; j<n; j++) {
+ y[j] = y[j]/a[j+j*n];
+ for (i=j+1; i<n; i++) {
+ y[i] -= a[i+j*n]*y[j];
+ }
+ }
+
+ for (j=n-1; j>=0; j--) {
+ for (i=0; i<j; i++) {
+ y[i] -= a[i+j*n]*y[j];
+ }
+ }
+
+ max_diff = 0.0;
+ for (j=0; j<n; j++) {
+ diff = y[j] - 1.0;
+ if (fabs(diff) > 0.00001) {
+ bogus = 1;
+ max_diff = diff;
+ }
+ }
+ if (bogus) {
+ diag_printf("TEST FAILED: (%.5f diff)\n", max_diff);
+ } else {
+ diag_printf("TEST PASSED\n");
+ }
+ free(y);
+}
+
+
+void printerr(s)
+
+char *s;
+
+{
+ diag_printf("ERROR: %s\n",s);
+}
+
+
+#include <tests_smp/getopt.c>
diff --git a/cesar/ecos/packages/kernel/current/tests_smp/lu_arg.c b/cesar/ecos/packages/kernel/current/tests_smp/lu_arg.c
new file mode 100644
index 0000000000..fb02575185
--- /dev/null
+++ b/cesar/ecos/packages/kernel/current/tests_smp/lu_arg.c
@@ -0,0 +1,12 @@
+//___________________________________________
+#define _str_HAL_SMP_CPU_MAX(x) #x
+#define smp_cyg_test_argc 5
+char *_cyg_argv[] = {
+ "fft.exe",
+ "-p",
+ "2", /* not used, using HAL_SMP_CPU_MAX */
+ "-n",
+ "64"
+};
+#define smp_cyg_test_argv &_cyg_argv
+//'''''''''''''''''''''''''''''''''''''''''''
diff --git a/cesar/ecos/packages/kernel/current/tests_smp/makefile b/cesar/ecos/packages/kernel/current/tests_smp/makefile
new file mode 100644
index 0000000000..2c6df8b3d8
--- /dev/null
+++ b/cesar/ecos/packages/kernel/current/tests_smp/makefile
@@ -0,0 +1,13 @@
+MACROS = c.m4.ecos
+
+all: fft.c radix.c lu.c
+
+fft.c: fft.C $(MACROS)
+ m4 $(MACROS) fft.C > fft.c
+
+radix.c: radix.C $(MACROS)
+ m4 $(MACROS) radix.C > radix.c
+
+lu.c: lu.C $(MACROS)
+ m4 $(MACROS) lu.C > lu.c
+
diff --git a/cesar/ecos/packages/kernel/current/tests_smp/radix.C b/cesar/ecos/packages/kernel/current/tests_smp/radix.C
new file mode 100644
index 0000000000..b23dd6d356
--- /dev/null
+++ b/cesar/ecos/packages/kernel/current/tests_smp/radix.C
@@ -0,0 +1,909 @@
+/*************************************************************************/
+/* */
+/* Copyright (c) 1994 Stanford University */
+/* */
+/* All rights reserved. */
+/* */
+/* Permission is given to use, copy, and modify this software for any */
+/* non-commercial purpose as long as this copyright notice is not */
+/* removed. All other uses, including redistribution in whole or in */
+/* part, are forbidden without prior written permission. */
+/* */
+/* This software is provided with absolutely no warranty and no */
+/* support. */
+/* */
+/*************************************************************************/
+
+/*************************************************************************/
+/* */
+/* Integer radix sort of non-negative integers. */
+/* */
+/* Command line options: */
+/* */
+/* -pP : P = number of processors. */
+/* -rR : R = radix for sorting. Must be power of 2. */
+/* -nN : N = number of keys to sort. */
+/* -mM : M = maximum key value. Integer keys k will be generated such */
+/* that 0 <= k <= M. */
+/* -s : Print individual processor timing statistics. */
+/* -t : Check to make sure all keys are sorted correctly. */
+/* -o : Print out sorted keys. */
+/* -h : Print out command line options. */
+/* */
+/* Default: RADIX -p1 -n262144 -r1024 -m524288 */
+/* */
+/* Note: This version works under both the FORK and SPROC models */
+/* */
+/*************************************************************************/
+
+#include <stdio.h>
+#include <math.h>
+#include <tests_smp/getopt.h>
+
+#define DEFAULT_P 1
+#define DEFAULT_N 262144
+#define DEFAULT_R 1024
+#define DEFAULT_M 524288
+#define MAX_PROCESSORS 64
+#define RADIX_S 8388608.0e0
+#define RADIX 70368744177664.0e0
+#define SEED 314159265.0e0
+#define RATIO 1220703125.0e0
+#define PAGE_SIZE 4096
+#define PAGE_MASK (~(PAGE_SIZE-1))
+#define MAX_RADIX 4096
+
+#include <tests_smp/radix_arg.c>
+
+MAIN_ENV
+
+struct prefix_node {
+ int densities[MAX_RADIX];
+ int ranks[MAX_RADIX];
+ PAUSEDEC(done)
+ char pad[PAGE_SIZE];
+};
+
+struct global_memory {
+ int Index; /* process ID */
+ LOCKDEC(lock_Index) /* for fetch and add to get ID */
+ LOCKDEC(rank_lock) /* for fetch and add to get ID */
+ ALOCKDEC(section_lock,MAX_PROCESSORS) /* key locks */
+ BARDEC(barrier_rank) /* for ranking process */
+ BARDEC(barrier_key) /* for key sorting process */
+ double *ranktime;
+ double *sorttime;
+ double *totaltime;
+ int final;
+ unsigned int starttime;
+ unsigned int rs;
+ unsigned int rf;
+ struct prefix_node prefix_tree[2 * MAX_PROCESSORS];
+} *global;
+
+struct global_private {
+ char pad[PAGE_SIZE];
+ int *rank_ff; /* overall processor ranks */
+} gp[MAX_PROCESSORS];
+
+int *key[2]; /* sort from one index into the other */
+int **rank_me; /* individual processor ranks */
+int *key_partition; /* keys a processor works on */
+int *rank_partition; /* ranks a processor works on */
+
+int number_of_processors = DEFAULT_P;
+int max_num_digits;
+int radix = DEFAULT_R;
+int num_keys = DEFAULT_N;
+int max_key = DEFAULT_M;
+int log2_radix;
+int log2_keys;
+int dostats = 0;
+int test_result = 0;
+int doprint = 0;
+volatile cyg_thread *thread_array[64];
+
+int dbg_on = 0;
+double ran_num_init(unsigned int,double,double);
+double product_mod_46(double,double);
+int get_max_digits(int);
+int get_log2_radix(int);
+int get_log2_keys(int);
+void slave_sort();
+int log_2(int);
+void printerr(char *);
+void init(int,int,int);
+void test_sort(int);
+void printout();
+
+
+int smp_cyg_test_main(argc, argv)
+
+int argc;
+char **argv;
+
+{
+ int i;
+ int p;
+ int quotient;
+ int remainder;
+ int sum_i;
+ int sum_f;
+ int mistake=0;
+ int size;
+ int **temp;
+ int **temp2;
+ int *a;
+ int c;
+ int n1;
+ extern char *optarg;
+ double mint, maxt, avgt;
+ double minrank, maxrank, avgrank;
+ double minsort, maxsort, avgsort;
+ unsigned int start;
+ int done = 0;
+ int start_p;
+ int end_p;
+ int level;
+ int index;
+ int base;
+ int offset;
+ int toffset;
+
+
+ memset (thread_array,0,sizeof(thread_array));
+
+ CLOCK(start)
+
+ while ((c = getopt(argc, argv, "p:r:n:m:stoh")) != -1) {
+ switch(c) {
+ case 'p': number_of_processors = atoi(optarg);
+ number_of_processors = HAL_SMP_CPU_MAX;
+ if (number_of_processors < 1) {
+ printerr("P must be >= 1\n");
+ exit(-1);
+ }
+ if (number_of_processors > MAX_PROCESSORS) {
+ printerr("Maximum processors (MAX_PROCESSORS) exceeded\n");
+ exit(-1);
+ }
+ break;
+ case 'r': radix = atoi(optarg);
+ if (radix < 1) {
+ printerr("Radix must be a power of 2 greater than 0\n");
+ exit(-1);
+ }
+ log2_radix = log_2(radix);
+ if (log2_radix == -1) {
+ printerr("Radix must be a power of 2\n");
+ exit(-1);
+ }
+ break;
+ case 'n': num_keys = atoi(optarg);
+ if (num_keys < 1) {
+ printerr("Number of keys must be >= 1\n");
+ exit(-1);
+ }
+ break;
+ case 'm': max_key = atoi(optarg);
+ if (max_key < 1) {
+ printerr("Maximum key must be >= 1\n");
+ exit(-1);
+ }
+ break;
+ case 's': dostats = !dostats;
+ break;
+ case 't': test_result = !test_result;
+ break;
+ case 'o': doprint = !doprint;
+ break;
+ case 'h': diag_printf("Usage: RADIX <options>\n\n");
+ diag_printf(" -pP : P = number of processors.\n");
+ diag_printf(" -rR : R = radix for sorting. Must be power of 2.\n");
+ diag_printf(" -nN : N = number of keys to sort.\n");
+ diag_printf(" -mM : M = maximum key value. Integer keys k will be generated such\n");
+ diag_printf(" that 0 <= k <= M.\n");
+ diag_printf(" -s : Print individual processor timing statistics.\n");
+ diag_printf(" -t : Check to make sure all keys are sorted correctly.\n");
+ diag_printf(" -o : Print out sorted keys.\n");
+ diag_printf(" -h : Print out command line options.\n\n");
+ diag_printf("Default: RADIX -p%1d -n%1d -r%1d -m%1d\n",
+ DEFAULT_P,DEFAULT_N,DEFAULT_R,DEFAULT_M);
+ exit(0);
+ }
+ }
+
+ if (number_of_processors > 64) {
+ printf("Maximal 64 processes\n");
+ exit(0);
+ }
+
+ MAIN_INITENV(,80000000)
+
+ log2_radix = log_2(radix);
+ log2_keys = log_2(num_keys);
+ global = (struct global_memory *) G_MALLOC(sizeof(struct global_memory))
+ key[0] = (int *) G_MALLOC(num_keys*sizeof(int));
+ key[1] = (int *) G_MALLOC(num_keys*sizeof(int));
+ key_partition = (int *) G_MALLOC((number_of_processors+1)*sizeof(int));
+ rank_partition = (int *) G_MALLOC((number_of_processors+1)*sizeof(int));
+
+ if ((global == NULL)) {
+ diag_printf("ERROR: Cannot malloc enough memory %d\n",sizeof(struct global_memory));
+ }
+
+ global->ranktime = (double *) G_MALLOC(number_of_processors*sizeof(double));
+ global->sorttime = (double *) G_MALLOC(number_of_processors*sizeof(double));
+ global->totaltime = (double *) G_MALLOC(number_of_processors*sizeof(double));
+ size = number_of_processors*(radix*sizeof(int)+sizeof(int *));
+ rank_me = (int **) G_MALLOC(size);
+
+ if ((global == NULL) || (key[0] == NULL) || (key[1] == NULL) ||
+ (key_partition == NULL) || (rank_partition == NULL) ||
+ (rank_me == NULL)) {
+ diag_printf("ERROR: Cannot malloc enough memory\n");
+ exit(-1);
+ }
+
+ temp = rank_me;
+ temp2 = temp + number_of_processors;
+ a = (int *) temp2;
+ for (i=0;i<number_of_processors;i++) {
+ *temp = (int *) a;
+ temp++;
+ a += radix;
+ }
+ for (i=0;i<number_of_processors;i++) {
+ gp[i].rank_ff = (int *) G_MALLOC(radix*sizeof(int)+PAGE_SIZE);
+ }
+ LOCKINIT(global->lock_Index)
+ LOCKINIT(global->rank_lock)
+ ALOCKINIT(global->section_lock,MAX_PROCESSORS)
+ BARINIT(global->barrier_rank)
+ BARINIT(global->barrier_key)
+
+ for (i=0; i<2*number_of_processors; i++) {
+ PAUSEINIT(global->prefix_tree[i].done);
+ }
+
+ global->Index = 0;
+ max_num_digits = get_max_digits(max_key);
+ diag_printf("\n");
+ diag_printf("Integer Radix Sort\n");
+ diag_printf(" %d Keys\n",num_keys);
+ diag_printf(" %d Processors\n",number_of_processors);
+ diag_printf(" Radix = %d\n",radix);
+ diag_printf(" Max key = %d\n",max_key);
+ diag_printf("\n");
+
+ quotient = num_keys / number_of_processors;
+ remainder = num_keys % number_of_processors;
+ sum_i = 0;
+ sum_f = 0;
+ p = 0;
+ diag_printf("key_partition\n",max_key);
+ while (sum_i < num_keys) {
+ key_partition[p] = sum_i;
+ p++;
+ sum_i = sum_i + quotient;
+ sum_f = sum_f + remainder;
+ sum_i = sum_i + sum_f / number_of_processors;
+ sum_f = sum_f % number_of_processors;
+ }
+ key_partition[p] = num_keys;
+
+ quotient = radix / number_of_processors;
+ remainder = radix % number_of_processors;
+ sum_i = 0;
+ sum_f = 0;
+ p = 0;
+ diag_printf("rank_partition\n",max_key);
+ while (sum_i < radix) {
+ rank_partition[p] = sum_i;
+ p++;
+ sum_i = sum_i + quotient;
+ sum_f = sum_f + remainder;
+ sum_i = sum_i + sum_f / number_of_processors;
+ sum_f = sum_f % number_of_processors;
+ }
+ rank_partition[p] = radix;
+
+/* POSSIBLE ENHANCEMENT: Here is where one might distribute the key,
+ rank_me, rank, and gp data structures across physically
+ distributed memories as desired.
+
+ One way to place data is as follows:
+
+ for (i=0;i<number_of_processors;i++) {
+ Place all addresses x such that:
+ &(key[0][key_partition[i]]) <= x < &(key[0][key_partition[i+1]])
+ on node i
+ &(key[1][key_partition[i]]) <= x < &(key[1][key_partition[i+1]])
+ on node i
+ &(rank_me[i][0]) <= x < &(rank_me[i][radix-1]) on node i
+ &(gp[i]) <= x < &(gp[i+1]) on node i
+ &(gp[i].rank_ff[0]) <= x < &(gp[i].rank_ff[radix]) on node i
+ }
+ start_p = 0;
+ i = 0;
+
+ for (toffset = 0; toffset < number_of_processors; toffset ++) {
+ offset = toffset;
+ level = number_of_processors >> 1;
+ base = number_of_processors;
+ while ((offset & 0x1) != 0) {
+ offset >>= 1;
+ index = base + offset;
+ Place all addresses x such that:
+ &(global->prefix_tree[index]) <= x <
+ &(global->prefix_tree[index + 1]) on node toffset
+ base += level;
+ level >>= 1;
+ }
+ } */
+
+ /* Fill the random-number array. */
+
+ for (i = 1; i < number_of_processors; i++) {
+ CREATE(slave_sort,thread_array[i])
+ }
+
+ slave_sort();
+
+ for (i = 1; i < number_of_processors; i++) {
+ while(thread_array[i]) {};
+ }
+
+ diag_printf("\n");
+ diag_printf(" PROCESS STATISTICS\n");
+ diag_printf(" Total Rank Sort\n");
+ diag_printf(" Proc Time Time Time\n");
+ diag_printf(" 0 %10.0f %10.0f %10.0f\n",
+ global->totaltime[0],global->ranktime[0],
+ global->sorttime[0]);
+ if (dostats) {
+ maxt = avgt = mint = global->totaltime[0];
+ maxrank = avgrank = minrank = global->ranktime[0];
+ maxsort = avgsort = minsort = global->sorttime[0];
+ for (i=1; i<number_of_processors; i++) {
+ if (global->totaltime[i] > maxt) {
+ maxt = global->totaltime[i];
+ }
+ if (global->totaltime[i] < mint) {
+ mint = global->totaltime[i];
+ }
+ if (global->ranktime[i] > maxrank) {
+ maxrank = global->ranktime[i];
+ }
+ if (global->ranktime[i] < minrank) {
+ minrank = global->ranktime[i];
+ }
+ if (global->sorttime[i] > maxsort) {
+ maxsort = global->sorttime[i];
+ }
+ if (global->sorttime[i] < minsort) {
+ minsort = global->sorttime[i];
+ }
+ avgt += global->totaltime[i];
+ avgrank += global->ranktime[i];
+ avgsort += global->sorttime[i];
+ }
+ avgt = avgt / number_of_processors;
+ avgrank = avgrank / number_of_processors;
+ avgsort = avgsort / number_of_processors;
+ for (i=1; i<number_of_processors; i++) {
+ diag_printf(" %3d %10.0f %10.0f %10.0f\n",
+ i,global->totaltime[i],global->ranktime[i],
+ global->sorttime[i]);
+ }
+ diag_printf(" Avg %10.0f %10.0f %10.0f\n",avgt,avgrank,avgsort);
+ diag_printf(" Min %10.0f %10.0f %10.0f\n",mint,minrank,minsort);
+ diag_printf(" Max %10.0f %10.0f %10.0f\n",maxt,maxrank,maxsort);
+ diag_printf("\n");
+ }
+
+ diag_printf("\n");
+ global->starttime = start;
+ diag_printf(" TIMING INFORMATION\n");
+ diag_printf("Start time : %16d\n",
+ global->starttime);
+ diag_printf("Initialization finish time : %16d\n",
+ global->rs);
+ diag_printf("Overall finish time : %16d\n",
+ global->rf);
+ diag_printf("Total time with initialization : %16d\n",
+ global->rf-global->starttime);
+ diag_printf("Total time without initialization : %16d\n",
+ global->rf-global->rs);
+ diag_printf("\n");
+
+ if (doprint) {
+ printout();
+ }
+ if (test_result) {
+ test_sort(global->final);
+ }
+
+ MAIN_END;
+}
+
+void slave_sort()
+{
+ int i, j, k, kk, Ind;
+ int MyNum;
+ int this_key;
+ int tmp;
+ int last_key;
+ int loopnum;
+ double ran_num;
+ double sum;
+ int shiftnum;
+ int bb;
+ int my_key;
+ int key_start;
+ int key_stop;
+ int rank_start;
+ int rank_stop;
+ int from=0;
+ int to=1;
+ int *key_density; /* individual processor key densities */
+ unsigned int time1;
+ unsigned int time2;
+ unsigned int time3;
+ unsigned int time4;
+ unsigned int time5;
+ unsigned int time6;
+ double ranktime=0;
+ double sorttime=0;
+ int *key_from;
+ int *key_to;
+ int *rank_me_mynum;
+ int *rank_me_i;
+ int *rank_ff_mynum;
+ int stats;
+ struct prefix_node* n;
+ struct prefix_node* r;
+ struct prefix_node* l;
+ struct prefix_node* my_node;
+ struct prefix_node* their_node;
+ volatile int* prefx;
+ int index;
+ int level;
+ int base;
+ int offset;
+
+ diag_printf("SlaveSort %d\n",cyg_hal_get_current_threadid());
+
+ stats = dostats;
+
+ LOCK(global->lock_Index)
+ MyNum = global->Index;
+ global->Index++;
+ UNLOCK(global->lock_Index)
+
+/* POSSIBLE ENHANCEMENT: Here is where one might pin processes to
+ processors to avoid migration */
+
+ key_density = (int *) malloc(radix*sizeof(int));
+
+ /* Fill the random-number array. */
+
+ key_start = key_partition[MyNum];
+ key_stop = key_partition[MyNum + 1];
+ rank_start = rank_partition[MyNum];
+ rank_stop = rank_partition[MyNum + 1];
+ if (rank_stop == radix) {
+ rank_stop--;
+ }
+
+ diag_printf("init %d, %d-%d\n",cyg_hal_get_current_threadid(),key_start,key_stop);
+ init(key_start,key_stop,from);
+
+ BARRIER(global->barrier_key, number_of_processors, 1 )
+
+/* POSSIBLE ENHANCEMENT: Here is where one might reset the
+ statistics that one is measuring about the parallel execution */
+
+ BARRIER(global->barrier_key, number_of_processors, 2)
+
+ if ((MyNum == 0) || (stats)) {
+ CLOCK(time1)
+ }
+
+/* Do 1 iteration per digit. */
+
+ rank_me_mynum = rank_me[MyNum];
+ rank_ff_mynum = gp[MyNum].rank_ff;
+ for (loopnum=0;loopnum<max_num_digits;loopnum++) {
+ shiftnum = (loopnum * log2_radix);
+ bb = (radix-1) << shiftnum;
+
+/* generate histograms based on one digit */
+
+ if ((MyNum == 0) || (stats)) {
+ CLOCK(time2)
+ }
+
+ for (i = 0; i < radix; i++) {
+ rank_me_mynum[i] = 0;
+ }
+ key_from = (int *) key[from];
+ key_to = (int *) key[to];
+ for (i=key_start;i<key_stop;i++) {
+ my_key = key_from[i] & bb;
+ my_key = my_key >> shiftnum;
+ rank_me_mynum[my_key]++;
+ }
+ key_density[0] = rank_me_mynum[0];
+ for (i=1;i<radix;i++) {
+ key_density[i] = key_density[i-1] + rank_me_mynum[i];
+ }
+
+ BARRIER(global->barrier_rank, number_of_processors,3)
+
+ n = &(global->prefix_tree[MyNum]);
+ for (i = 0; i < radix; i++) {
+ n->densities[i] = key_density[i];
+ n->ranks[i] = rank_me_mynum[i];
+ }
+ offset = MyNum;
+ level = number_of_processors >> 1;
+ base = number_of_processors;
+ if ((MyNum & 0x1) == 0) {
+ SETPAUSE(global->prefix_tree[base + (offset >> 1)].done);
+ }
+ while ((offset & 0x1) != 0) {
+ offset >>= 1;
+ r = n;
+ l = n - 1;
+ index = base + offset;
+ n = &(global->prefix_tree[index]);
+ WAITPAUSE(n->done);
+ CLEARPAUSE(n->done);
+ if (offset != (level - 1)) {
+ for (i = 0; i < radix; i++) {
+ n->densities[i] = r->densities[i] + l->densities[i];
+ n->ranks[i] = r->ranks[i] + l->ranks[i];
+ }
+ } else {
+ for (i = 0; i < radix; i++) {
+ n->densities[i] = r->densities[i] + l->densities[i];
+ }
+ }
+ base += level;
+ level >>= 1;
+ if ((offset & 0x1) == 0) {
+ SETPAUSE(global->prefix_tree[base + (offset >> 1)].done);
+ }
+ }
+ BARRIER(global->barrier_rank, number_of_processors,4);
+
+ if (MyNum != (number_of_processors - 1)) {
+ offset = MyNum;
+ level = number_of_processors;
+ base = 0;
+ while ((offset & 0x1) != 0) {
+ offset >>= 1;
+ base += level;
+ level >>= 1;
+ }
+ my_node = &(global->prefix_tree[base + offset]);
+ offset >>= 1;
+ base += level;
+ level >>= 1;
+ while ((offset & 0x1) != 0) {
+ offset >>= 1;
+ base += level;
+ level >>= 1;
+ }
+ their_node = &(global->prefix_tree[base + offset]);
+ WAITPAUSE(my_node->done);
+ CLEARPAUSE(my_node->done);
+ for (i = 0; i < radix; i++) {
+ my_node->densities[i] = their_node->densities[i];
+ }
+ } else {
+ my_node = &(global->prefix_tree[(2 * number_of_processors) - 2]);
+ }
+ offset = MyNum;
+ level = number_of_processors;
+ base = 0;
+ while ((offset & 0x1) != 0) {
+ SETPAUSE(global->prefix_tree[base + offset - 1].done);
+ offset >>= 1;
+ base += level;
+ level >>= 1;
+ }
+ offset = MyNum;
+ level = number_of_processors;
+ base = 0;
+ for(i = 0; i < radix; i++) {
+ rank_ff_mynum[i] = 0;
+ }
+ while (offset != 0) {
+ if ((offset & 0x1) != 0) {
+ /* Add ranks of node to your left at this level */
+ l = &(global->prefix_tree[base + offset - 1]);
+ for (i = 0; i < radix; i++) {
+ rank_ff_mynum[i] += l->ranks[i];
+ }
+ }
+ base += level;
+ level >>= 1;
+ offset >>= 1;
+ }
+ for (i = 1; i < radix; i++) {
+ rank_ff_mynum[i] += my_node->densities[i - 1];
+ }
+
+ if ((MyNum == 0) || (stats)) {
+ CLOCK(time3);
+ }
+
+ BARRIER(global->barrier_rank, number_of_processors,5);
+
+ if ((MyNum == 0) || (stats)) {
+ CLOCK(time4);
+ }
+
+ /* put it in order according to this digit */
+
+ for (i = key_start; i < key_stop; i++) {
+ this_key = key_from[i] & bb;
+ this_key = this_key >> shiftnum;
+ tmp = rank_ff_mynum[this_key];
+ key_to[tmp] = key_from[i];
+ rank_ff_mynum[this_key]++;
+ } /* i */
+
+ if ((MyNum == 0) || (stats)) {
+ CLOCK(time5);
+ }
+
+ if (loopnum != max_num_digits-1) {
+ from = from ^ 0x1;
+ to = to ^ 0x1;
+ }
+
+ BARRIER(global->barrier_rank, number_of_processors, 6)
+
+ if ((MyNum == 0) || (stats)) {
+ ranktime += (time3 - time2);
+ sorttime += (time5 - time4);
+ }
+ } /* for */
+
+ BARRIER(global->barrier_rank, number_of_processors, 7)
+ if ((MyNum == 0) || (stats)) {
+ CLOCK(time6)
+ global->ranktime[MyNum] = ranktime;
+ global->sorttime[MyNum] = sorttime;
+ global->totaltime[MyNum] = time6-time1;
+ }
+ if (MyNum == 0) {
+ global->rs = time1;
+ global->rf = time6;
+ global->final = to;
+ }
+
+ thread_array[MyNum] = 0;
+}
+
+double product_mod_46(t1, t2) /* product_mod_46() returns the product
+ (mod 2^46) of t1 and t2. */
+double t1;
+double t2;
+
+{
+ double a1;
+ double b1;
+ double a2;
+ double b2;
+
+ a1 = (double)((int)(t1 / RADIX_S)); /* Decompose the arguments. */
+ a2 = t1 - a1 * RADIX_S;
+ b1 = (double)((int)(t2 / RADIX_S));
+ b2 = t2 - b1 * RADIX_S;
+ t1 = a1 * b2 + a2 * b1; /* Multiply the arguments. */
+ t2 = (double)((int)(t1 / RADIX_S));
+ t2 = t1 - t2 * RADIX_S;
+ t1 = t2 * RADIX_S + a2 * b2;
+ t2 = (double)((int)(t1 / RADIX));
+
+ return (t1 - t2 * RADIX); /* Return the product. */
+}
+
+double ran_num_init(k, b, t) /* finds the (k)th random number,
+ given the seed, b, and the ratio, t. */
+unsigned int k;
+double b;
+double t;
+
+{
+ unsigned int j;
+
+ while (k != 0) { /* while() is executed m times
+ such that 2^m > k. */
+ j = k >> 1;
+ if ((j << 1) != k) {
+ b = product_mod_46(b, t);
+ }
+ t = product_mod_46(t, t);
+ k = j;
+ }
+
+ return b;
+}
+
+int get_max_digits(max_key)
+
+int max_key;
+
+{
+ int done = 0;
+ int temp = 1;
+ int key_val;
+
+ key_val = max_key;
+ while (!done) {
+ key_val = key_val / radix;
+ if (key_val == 0) {
+ done = 1;
+ } else {
+ temp ++;
+ }
+ }
+ return temp;
+}
+
+int get_log2_radix(rad)
+
+int rad;
+
+{
+ int cumulative=1;
+ int out;
+
+ for (out = 0; out < 20; out++) {
+ if (cumulative == rad) {
+ return(out);
+ } else {
+ cumulative = cumulative * 2;
+ }
+ }
+ diag_printf("ERROR: Radix %d not a power of 2\n", rad);
+ exit(-1);
+}
+
+int get_log2_keys(num_keys)
+
+int num_keys;
+
+{
+ int cumulative=1;
+ int out;
+
+ for (out = 0; out < 30; out++) {
+ if (cumulative == num_keys) {
+ return(out);
+ } else {
+ cumulative = cumulative * 2;
+ }
+ }
+ diag_printf("ERROR: Number of keys %d not a power of 2\n", num_keys);
+ exit(-1);
+}
+
+int log_2(number)
+
+int number;
+
+{
+ int cumulative = 1;
+ int out = 0;
+ int done = 0;
+
+ while ((cumulative < number) && (!done) && (out < 50)) {
+ if (cumulative == number) {
+ done = 1;
+ } else {
+ cumulative = cumulative * 2;
+ out ++;
+ }
+ }
+
+ if (cumulative == number) {
+ return(out);
+ } else {
+ return(-1);
+ }
+}
+
+void printerr(s)
+
+char *s;
+
+{
+ diag_printf("ERROR: %s\n",s);
+}
+
+void init(key_start,key_stop,from)
+
+int key_start;
+int key_stop;
+int from;
+
+{
+ double ran_num;
+ double sum;
+ int tmp;
+ int i;
+ int *key_from;
+
+ ran_num = ran_num_init((key_start << 2) + 1, SEED, RATIO);
+ sum = ran_num / RADIX;
+ key_from = (int *) key[from];
+ for (i = key_start; i < key_stop; i++) {
+ ran_num = product_mod_46(ran_num, RATIO);
+ sum = sum + ran_num / RADIX;
+ ran_num = product_mod_46(ran_num, RATIO);
+ sum = sum + ran_num / RADIX;
+ ran_num = product_mod_46(ran_num, RATIO);
+ sum = sum + ran_num / RADIX;
+ key_from[i] = (int) ((sum / 4.0) * max_key);
+ tmp = (int) ((key_from[i])/100);
+ ran_num = product_mod_46(ran_num, RATIO);
+ sum = ran_num / RADIX;
+ }
+
+}
+
+void test_sort(final)
+
+int final;
+
+{
+ int i;
+ int mistake = 0;
+ int *key_final;
+
+ diag_printf("\n");
+ diag_printf(" TESTING RESULTS\n");
+ key_final = key[final];
+ for (i = 0; i < num_keys-1; i++) {
+ if (key_final[i] > key_final[i + 1]) {
+ diag_printf("error with key %d, value %d %d \n",
+ i,key_final[i],key_final[i + 1]);
+ mistake++;
+ }
+ }
+
+ if (mistake) {
+ diag_printf("FAILED: %d keys out of place.\n", mistake);
+ } else {
+ diag_printf("PASSED: All keys in place.\n");
+ }
+ diag_printf("\n");
+}
+
+void printout()
+
+{
+ int i;
+ int mistake;
+ int *key_final;
+
+ key_final = (int *) key[global->final];
+ diag_printf("\n");
+ diag_printf(" SORTED KEY VALUES\n");
+ diag_printf("%8d ",key_final[0]);
+ for (i = 0; i < num_keys-1; i++) {
+ diag_printf("%8d ",key_final[i+1]);
+ if ((i+2)%5 == 0) {
+ diag_printf("\n");
+ }
+ }
+ diag_printf("\n");
+}
+
+#include <tests_smp/getopt.c>
diff --git a/cesar/ecos/packages/kernel/current/tests_smp/radix.c b/cesar/ecos/packages/kernel/current/tests_smp/radix.c
new file mode 100644
index 0000000000..00a14fe9fb
--- /dev/null
+++ b/cesar/ecos/packages/kernel/current/tests_smp/radix.c
@@ -0,0 +1,1346 @@
+
+
+
+
+
+
+
+
+
+
+
+/*************************************************************************/
+/* */
+/* Copyright (c) 1994 Stanford University */
+/* */
+/* All rights reserved. */
+/* */
+/* Permission is given to use, copy, and modify this software for any */
+/* non-commercial purpose as long as this copyright notice is not */
+/* removed. All other uses, including redistribution in whole or in */
+/* part, are forbidden without prior written permission. */
+/* */
+/* This software is provided with absolutely no warranty and no */
+/* support. */
+/* */
+/*************************************************************************/
+
+/*************************************************************************/
+/* */
+/* Integer radix sort of non-negative integers. */
+/* */
+/* Command line options: */
+/* */
+/* -pP : P = number of processors. */
+/* -rR : R = radix for sorting. Must be power of 2. */
+/* -nN : N = number of keys to sort. */
+/* -mM : M = maximum key value. Integer keys k will be generated such */
+/* that 0 <= k <= M. */
+/* -s : Print individual processor timing statistics. */
+/* -t : Check to make sure all keys are sorted correctly. */
+/* -o : Print out sorted keys. */
+/* -h : Print out command line options. */
+/* */
+/* Default: RADIX -p1 -n262144 -r1024 -m524288 */
+/* */
+/* Note: This version works under both the FORK and SPROC models */
+/* */
+/*************************************************************************/
+
+#include <stdio.h>
+#include <math.h>
+#include <tests_smp/getopt.h>
+
+#define DEFAULT_P 1
+#define DEFAULT_N 262144
+#define DEFAULT_R 1024
+#define DEFAULT_M 524288
+#define MAX_PROCESSORS 64
+#define RADIX_S 8388608.0e0
+#define RADIX 70368744177664.0e0
+#define SEED 314159265.0e0
+#define RATIO 1220703125.0e0
+#define PAGE_SIZE 4096
+#define PAGE_MASK (~(PAGE_SIZE-1))
+#define MAX_RADIX 4096
+
+#include <tests_smp/radix_arg.c>
+
+
+/*---------------------------------------*/
+#include <pkgconf/system.h>
+#include <pkgconf/infra.h>
+#include <pkgconf/kernel.h>
+#include <cyg/infra/cyg_type.h>
+#include <cyg/infra/testcase.h>
+#include <pkgconf/isoinfra.h>
+#include <cyg/hal/hal_cache.h>
+#include <cyg/hal/hal_smp.h>
+#include <cyg/kernel/kapi.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+int smp_cyg_test_main(int argc, char **argv);
+
+void smp_cyg_test_main_call(void *p) {
+ smp_cyg_test_main(smp_cyg_test_argc, smp_cyg_test_argv);
+}
+
+
+/* #define STACK_SIZE 0x1b000 */
+#define STACK_SIZE 8192
+static cyg_thread smp_cyg_test_thread;
+static char smp_cyg_test_stack[STACK_SIZE];
+static cyg_handle_t smp_cyg_test_threadh;
+
+
+
+cyg_handle_t threadh[64];
+char *stack[64];
+cyg_thread thread[64];
+
+
+externC void cyg_user_start( void )
+{
+ CYG_TEST_INIT();
+
+ diag_printf("Starting test app\n");
+
+ cyg_thread_create(10, // Priority - just a number
+ smp_cyg_test_main_call, // entry
+ 0, // index
+ "smp test", // Name
+ smp_cyg_test_stack, // Stack
+ STACK_SIZE, // Size
+ &smp_cyg_test_threadh, // Handle
+ &smp_cyg_test_thread // Thread data structure
+ );
+ cyg_thread_resume( smp_cyg_test_threadh );
+ /*cyg_scheduler_start();*/
+}
+
+
+/*---------------------------------------*/
+
+
+struct prefix_node {
+ int densities[MAX_RADIX];
+ int ranks[MAX_RADIX];
+
+ char pad[PAGE_SIZE];
+};
+
+struct global_memory {
+ int Index; /* process ID */
+ cyg_spinlock_t lock_Index; /* for fetch and add to get ID */
+ cyg_spinlock_t rank_lock; /* for fetch and add to get ID */
+
+/*---------------------------------------*/
+struct section_lockTYP {
+ cyg_spinlock_t lock;;
+
+
+
+ } section_lock[MAX_PROCESSORS];
+/*---------------------------------------*/
+
+ /* key locks */
+
+/*---------------------------------------*/
+struct barrier_rankTYP {
+ cyg_spinlock_t lock;;
+ int count[1];
+ cyg_sem_t queue[1];;
+
+ } barrier_rank[1];
+/*---------------------------------------*/
+
+ /* for ranking process */
+
+/*---------------------------------------*/
+struct barrier_keyTYP {
+ cyg_spinlock_t lock;;
+ int count[1];
+ cyg_sem_t queue[1];;
+
+ } barrier_key[1];
+/*---------------------------------------*/
+
+ /* for key sorting process */
+ double *ranktime;
+ double *sorttime;
+ double *totaltime;
+ int final;
+ unsigned int starttime;
+ unsigned int rs;
+ unsigned int rf;
+ struct prefix_node prefix_tree[2 * MAX_PROCESSORS];
+} *global;
+
+struct global_private {
+ char pad[PAGE_SIZE];
+ int *rank_ff; /* overall processor ranks */
+} gp[MAX_PROCESSORS];
+
+int *key[2]; /* sort from one index into the other */
+int **rank_me; /* individual processor ranks */
+int *key_partition; /* keys a processor works on */
+int *rank_partition; /* ranks a processor works on */
+
+int number_of_processors = DEFAULT_P;
+int max_num_digits;
+int radix = DEFAULT_R;
+int num_keys = DEFAULT_N;
+int max_key = DEFAULT_M;
+int log2_radix;
+int log2_keys;
+int dostats = 0;
+int test_result = 0;
+int doprint = 0;
+volatile cyg_thread *thread_array[64];
+
+int dbg_on = 0;
+double ran_num_init(unsigned int,double,double);
+double product_mod_46(double,double);
+int get_max_digits(int);
+int get_log2_radix(int);
+int get_log2_keys(int);
+void slave_sort();
+int log_2(int);
+void printerr(char *);
+void init(int,int,int);
+void test_sort(int);
+void printout();
+
+
+int smp_cyg_test_main(argc, argv)
+
+int argc;
+char **argv;
+
+{
+ int i;
+ int p;
+ int quotient;
+ int remainder;
+ int sum_i;
+ int sum_f;
+ int mistake=0;
+ int size;
+ int **temp;
+ int **temp2;
+ int *a;
+ int c;
+ int n1;
+ extern char *optarg;
+ double mint, maxt, avgt;
+ double minrank, maxrank, avgrank;
+ double minsort, maxsort, avgsort;
+ unsigned int start;
+ int done = 0;
+ int start_p;
+ int end_p;
+ int level;
+ int index;
+ int base;
+ int offset;
+ int toffset;
+
+
+ memset (thread_array,0,sizeof(thread_array));
+
+ {
+ (start) = cyg_current_time()*10;
+}
+
+
+ while ((c = getopt(argc, argv, "p:r:n:m:stoh")) != -1) {
+ switch(c) {
+ case 'p': number_of_processors = atoi(optarg);
+ number_of_processors = HAL_SMP_CPU_MAX;
+ if (number_of_processors < 1) {
+ printerr("P must be >= 1\n");
+ exit(-1);
+ }
+ if (number_of_processors > MAX_PROCESSORS) {
+ printerr("Maximum processors (MAX_PROCESSORS) exceeded\n");
+ exit(-1);
+ }
+ break;
+ case 'r': radix = atoi(optarg);
+ if (radix < 1) {
+ printerr("Radix must be a power of 2 greater than 0\n");
+ exit(-1);
+ }
+ log2_radix = log_2(radix);
+ if (log2_radix == -1) {
+ printerr("Radix must be a power of 2\n");
+ exit(-1);
+ }
+ break;
+ case 'n': num_keys = atoi(optarg);
+ if (num_keys < 1) {
+ printerr("Number of keys must be >= 1\n");
+ exit(-1);
+ }
+ break;
+ case 'm': max_key = atoi(optarg);
+ if (max_key < 1) {
+ printerr("Maximum key must be >= 1\n");
+ exit(-1);
+ }
+ break;
+ case 's': dostats = !dostats;
+ break;
+ case 't': test_result = !test_result;
+ break;
+ case 'o': doprint = !doprint;
+ break;
+ case 'h': diag_printf("Usage: RADIX <options>\n\n");
+ diag_printf(" -pP : P = number of processors.\n");
+ diag_printf(" -rR : R = radix for sorting. Must be power of 2.\n");
+ diag_printf(" -nN : N = number of keys to sort.\n");
+ diag_printf(" -mM : M = maximum key value. Integer keys k will be generated such\n");
+ diag_printf(" that 0 <= k <= M.\n");
+ diag_printf(" -s : Print individual processor timing statistics.\n");
+ diag_printf(" -t : Check to make sure all keys are sorted correctly.\n");
+ diag_printf(" -o : Print out sorted keys.\n");
+ diag_printf(" -h : Print out command line options.\n\n");
+ diag_printf("Default: RADIX -p%1d -n%1d -r%1d -m%1d\n",
+ DEFAULT_P,DEFAULT_N,DEFAULT_R,DEFAULT_M);
+ exit(0);
+ }
+ }
+
+ if (number_of_processors > 64) {
+ printf("Maximal 64 processes\n");
+ exit(0);
+ }
+
+ {;}
+
+ log2_radix = log_2(radix);
+ log2_keys = log_2(num_keys);
+ global = (struct global_memory *) calloc(sizeof(struct global_memory), 1);
+ key[0] = (int *) calloc(num_keys*sizeof(int), 1);;
+ key[1] = (int *) calloc(num_keys*sizeof(int), 1);;
+ key_partition = (int *) calloc((number_of_processors+1)*sizeof(int), 1);;
+ rank_partition = (int *) calloc((number_of_processors+1)*sizeof(int), 1);;
+
+ if ((global == NULL)) {
+ diag_printf("ERROR: Cannot malloc enough memory %d\n",sizeof(struct global_memory));
+ }
+
+ global->ranktime = (double *) calloc(number_of_processors*sizeof(double), 1);;
+ global->sorttime = (double *) calloc(number_of_processors*sizeof(double), 1);;
+ global->totaltime = (double *) calloc(number_of_processors*sizeof(double), 1);;
+ size = number_of_processors*(radix*sizeof(int)+sizeof(int *));
+ rank_me = (int **) calloc(size, 1);;
+
+ if ((global == NULL) || (key[0] == NULL) || (key[1] == NULL) ||
+ (key_partition == NULL) || (rank_partition == NULL) ||
+ (rank_me == NULL)) {
+ diag_printf("ERROR: Cannot malloc enough memory\n");
+ exit(-1);
+ }
+
+ temp = rank_me;
+ temp2 = temp + number_of_processors;
+ a = (int *) temp2;
+ for (i=0;i<number_of_processors;i++) {
+ *temp = (int *) a;
+ temp++;
+ a += radix;
+ }
+ for (i=0;i<number_of_processors;i++) {
+ gp[i].rank_ff = (int *) calloc(radix*sizeof(int)+PAGE_SIZE, 1);;
+ }
+ cyg_spinlock_init( &global->lock_Index,0 );
+ cyg_spinlock_init( &global->rank_lock,0 );
+ { {
+/*---------------------------------------*/
+ int mon_dum1,mon_dum2;
+
+ for (mon_dum1=0; mon_dum1 < MAX_PROCESSORS; mon_dum1++) {
+ cyg_spinlock_init( &global->section_lock[mon_dum1].lock,0 );;
+ }
+/*---------------------------------------*/
+};}
+ {{
+/*---------------------------------------*/
+ int mon_dum1,mon_dum2;
+ for (mon_dum1=0; mon_dum1 < 1; mon_dum1++)
+ for (mon_dum2=0; mon_dum2 < 1; mon_dum2++) {
+ global->barrier_rank[mon_dum1].count[mon_dum2] = 0;
+ { cyg_semaphore_init(&global->barrier_rank[mon_dum1].queue[mon_dum2],0);};
+ }
+ for (mon_dum1=0; mon_dum1 < 1; mon_dum1++) {
+ cyg_spinlock_init( &global->barrier_rank[mon_dum1].lock,0 );;
+ }
+/*---------------------------------------*/
+}}
+ {{
+/*---------------------------------------*/
+ int mon_dum1,mon_dum2;
+ for (mon_dum1=0; mon_dum1 < 1; mon_dum1++)
+ for (mon_dum2=0; mon_dum2 < 1; mon_dum2++) {
+ global->barrier_key[mon_dum1].count[mon_dum2] = 0;
+ { cyg_semaphore_init(&global->barrier_key[mon_dum1].queue[mon_dum2],0);};
+ }
+ for (mon_dum1=0; mon_dum1 < 1; mon_dum1++) {
+ cyg_spinlock_init( &global->barrier_key[mon_dum1].lock,0 );;
+ }
+/*---------------------------------------*/
+}}
+
+ for (i=0; i<2*number_of_processors; i++) {
+ {/*##########*/;};
+ }
+
+ global->Index = 0;
+ max_num_digits = get_max_digits(max_key);
+ diag_printf("\n");
+ diag_printf("Integer Radix Sort\n");
+ diag_printf(" %d Keys\n",num_keys);
+ diag_printf(" %d Processors\n",number_of_processors);
+ diag_printf(" Radix = %d\n",radix);
+ diag_printf(" Max key = %d\n",max_key);
+ diag_printf("\n");
+
+ quotient = num_keys / number_of_processors;
+ remainder = num_keys % number_of_processors;
+ sum_i = 0;
+ sum_f = 0;
+ p = 0;
+ diag_printf("key_partition\n",max_key);
+ while (sum_i < num_keys) {
+ key_partition[p] = sum_i;
+ p++;
+ sum_i = sum_i + quotient;
+ sum_f = sum_f + remainder;
+ sum_i = sum_i + sum_f / number_of_processors;
+ sum_f = sum_f % number_of_processors;
+ }
+ key_partition[p] = num_keys;
+
+ quotient = radix / number_of_processors;
+ remainder = radix % number_of_processors;
+ sum_i = 0;
+ sum_f = 0;
+ p = 0;
+ diag_printf("rank_partition\n",max_key);
+ while (sum_i < radix) {
+ rank_partition[p] = sum_i;
+ p++;
+ sum_i = sum_i + quotient;
+ sum_f = sum_f + remainder;
+ sum_i = sum_i + sum_f / number_of_processors;
+ sum_f = sum_f % number_of_processors;
+ }
+ rank_partition[p] = radix;
+
+/* POSSIBLE ENHANCEMENT: Here is where one might distribute the key,
+ rank_me, rank, and gp data structures across physically
+ distributed memories as desired.
+
+ One way to place data is as follows:
+
+ for (i=0;i<number_of_processors;i++) {
+ Place all addresses x such that:
+ &(key[0][key_partition[i]]) <= x < &(key[0][key_partition[i+1]])
+ on node i
+ &(key[1][key_partition[i]]) <= x < &(key[1][key_partition[i+1]])
+ on node i
+ &(rank_me[i][0]) <= x < &(rank_me[i][radix-1]) on node i
+ &(gp[i]) <= x < &(gp[i+1]) on node i
+ &(gp[i].rank_ff[0]) <= x < &(gp[i].rank_ff[radix]) on node i
+ }
+ start_p = 0;
+ i = 0;
+
+ for (toffset = 0; toffset < number_of_processors; toffset ++) {
+ offset = toffset;
+ level = number_of_processors >> 1;
+ base = number_of_processors;
+ while ((offset & 0x1) != 0) {
+ offset >>= 1;
+ index = base + offset;
+ Place all addresses x such that:
+ &(global->prefix_tree[index]) <= x <
+ &(global->prefix_tree[index + 1]) on node toffset
+ base += level;
+ level >>= 1;
+ }
+ } */
+
+ /* Fill the random-number array. */
+
+ for (i = 1; i < number_of_processors; i++) {
+
+{
+ /*---------------------------------------*/
+ stack[i] = calloc(STACK_SIZE, 1);
+ cyg_thread_create(10, // Priority - just a number
+ slave_sort, // entry
+ 0, // index
+ "slave", // Name
+ stack[i], // Stack
+ STACK_SIZE, // Size
+ &threadh[i], // Handle
+ &thread[i] // Thread data structure
+ );
+ cyg_thread_resume( threadh[i] );
+ /*---------------------------------------*/
+}
+
+ }
+
+ slave_sort();
+
+ for (i = 1; i < number_of_processors; i++) {
+ while(thread_array[i]) {};
+ }
+
+ diag_printf("\n");
+ diag_printf(" PROCESS STATISTICS\n");
+ diag_printf(" Total Rank Sort\n");
+ diag_printf(" Proc Time Time Time\n");
+ diag_printf(" 0 %10.0f %10.0f %10.0f\n",
+ global->totaltime[0],global->ranktime[0],
+ global->sorttime[0]);
+ if (dostats) {
+ maxt = avgt = mint = global->totaltime[0];
+ maxrank = avgrank = minrank = global->ranktime[0];
+ maxsort = avgsort = minsort = global->sorttime[0];
+ for (i=1; i<number_of_processors; i++) {
+ if (global->totaltime[i] > maxt) {
+ maxt = global->totaltime[i];
+ }
+ if (global->totaltime[i] < mint) {
+ mint = global->totaltime[i];
+ }
+ if (global->ranktime[i] > maxrank) {
+ maxrank = global->ranktime[i];
+ }
+ if (global->ranktime[i] < minrank) {
+ minrank = global->ranktime[i];
+ }
+ if (global->sorttime[i] > maxsort) {
+ maxsort = global->sorttime[i];
+ }
+ if (global->sorttime[i] < minsort) {
+ minsort = global->sorttime[i];
+ }
+ avgt += global->totaltime[i];
+ avgrank += global->ranktime[i];
+ avgsort += global->sorttime[i];
+ }
+ avgt = avgt / number_of_processors;
+ avgrank = avgrank / number_of_processors;
+ avgsort = avgsort / number_of_processors;
+ for (i=1; i<number_of_processors; i++) {
+ diag_printf(" %3d %10.0f %10.0f %10.0f\n",
+ i,global->totaltime[i],global->ranktime[i],
+ global->sorttime[i]);
+ }
+ diag_printf(" Avg %10.0f %10.0f %10.0f\n",avgt,avgrank,avgsort);
+ diag_printf(" Min %10.0f %10.0f %10.0f\n",mint,minrank,minsort);
+ diag_printf(" Max %10.0f %10.0f %10.0f\n",maxt,maxrank,maxsort);
+ diag_printf("\n");
+ }
+
+ diag_printf("\n");
+ global->starttime = start;
+ diag_printf(" TIMING INFORMATION\n");
+ diag_printf("Start time : %16d\n",
+ global->starttime);
+ diag_printf("Initialization finish time : %16d\n",
+ global->rs);
+ diag_printf("Overall finish time : %16d\n",
+ global->rf);
+ diag_printf("Total time with initialization : %16d\n",
+ global->rf-global->starttime);
+ diag_printf("Total time without initialization : %16d\n",
+ global->rf-global->rs);
+ diag_printf("\n");
+
+ if (doprint) {
+ printout();
+ }
+ if (test_result) {
+ test_sort(global->final);
+ }
+
+
+{
+ diag_printf("FP test end\n");
+ return 0;
+}
+;
+}
+
+void slave_sort()
+{
+ int i, j, k, kk, Ind;
+ int MyNum;
+ int this_key;
+ int tmp;
+ int last_key;
+ int loopnum;
+ double ran_num;
+ double sum;
+ int shiftnum;
+ int bb;
+ int my_key;
+ int key_start;
+ int key_stop;
+ int rank_start;
+ int rank_stop;
+ int from=0;
+ int to=1;
+ int *key_density; /* individual processor key densities */
+ unsigned int time1;
+ unsigned int time2;
+ unsigned int time3;
+ unsigned int time4;
+ unsigned int time5;
+ unsigned int time6;
+ double ranktime=0;
+ double sorttime=0;
+ int *key_from;
+ int *key_to;
+ int *rank_me_mynum;
+ int *rank_me_i;
+ int *rank_ff_mynum;
+ int stats;
+ struct prefix_node* n;
+ struct prefix_node* r;
+ struct prefix_node* l;
+ struct prefix_node* my_node;
+ struct prefix_node* their_node;
+ volatile int* prefx;
+ int index;
+ int level;
+ int base;
+ int offset;
+
+ diag_printf("SlaveSort %d\n",cyg_hal_get_current_threadid());
+
+ stats = dostats;
+
+ cyg_spinlock_spin(&global->lock_Index);
+ MyNum = global->Index;
+ global->Index++;
+ cyg_spinlock_clear(&global->lock_Index);
+
+/* POSSIBLE ENHANCEMENT: Here is where one might pin processes to
+ processors to avoid migration */
+
+ key_density = (int *) malloc(radix*sizeof(int));
+
+ /* Fill the random-number array. */
+
+ key_start = key_partition[MyNum];
+ key_stop = key_partition[MyNum + 1];
+ rank_start = rank_partition[MyNum];
+ rank_stop = rank_partition[MyNum + 1];
+ if (rank_stop == radix) {
+ rank_stop--;
+ }
+
+ diag_printf("init %d, %d-%d\n",cyg_hal_get_current_threadid(),key_start,key_stop);
+ init(key_start,key_stop,from);
+
+ {
+/*---------------------------------*/
+ cyg_spinlock_spin(&( global->barrier_key[0].lock ) );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: >b (%d:%d:%d): Enter barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),1 ,global->barrier_key[0].count[0],global->barrier_key[0].queue[0].count);
+ }
+ if ( global->barrier_key[0].count[0] < (number_of_processors -1) ) {
+ global->barrier_key[0].count[0]++;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ?b (%d:%d:%d): Wait\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),1 ,global->barrier_key[0].count[0],global->barrier_key[0].queue[0].count);
+ }
+ cyg_spinlock_clear(&global->barrier_key[0].lock );;
+ { cyg_semaphore_wait(&global->barrier_key[0].queue[0] );};
+ }
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: <b (%d:%d:%d): Exit barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),1 ,global->barrier_key[0].count[0],global->barrier_key[0].queue[0].count);
+ }
+ if ( global->barrier_key[0].count[0] == 0 ) {
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ^b (%d:%d:%d): unlock barrier \n-----------\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),1 ,global->barrier_key[0].count[0],global->barrier_key[0].queue[0].count);
+ }
+
+ cyg_spinlock_clear(&global->barrier_key[0].lock );;
+
+ } else {
+ ( global->barrier_key[0].count[0] )-- ;
+ //cyg_spinlock_clear(&global->barrier_key[0].count_lock[0] );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: @b (%d:%d:%d): unlock queue\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),1 ,global->barrier_key[0].count[0],global->barrier_key[0].queue[0].count);
+ }
+ { cyg_semaphore_post(&global->barrier_key[0].queue[0] );};
+ }
+
+/*---------------------------------*/
+}
+
+/* POSSIBLE ENHANCEMENT: Here is where one might reset the
+ statistics that one is measuring about the parallel execution */
+
+ {
+/*---------------------------------*/
+ cyg_spinlock_spin(&( global->barrier_key[0].lock ) );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: >b (%d:%d:%d): Enter barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),2,global->barrier_key[0].count[0],global->barrier_key[0].queue[0].count);
+ }
+ if ( global->barrier_key[0].count[0] < (number_of_processors -1) ) {
+ global->barrier_key[0].count[0]++;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ?b (%d:%d:%d): Wait\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),2,global->barrier_key[0].count[0],global->barrier_key[0].queue[0].count);
+ }
+ cyg_spinlock_clear(&global->barrier_key[0].lock );;
+ { cyg_semaphore_wait(&global->barrier_key[0].queue[0] );};
+ }
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: <b (%d:%d:%d): Exit barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),2,global->barrier_key[0].count[0],global->barrier_key[0].queue[0].count);
+ }
+ if ( global->barrier_key[0].count[0] == 0 ) {
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ^b (%d:%d:%d): unlock barrier \n-----------\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),2,global->barrier_key[0].count[0],global->barrier_key[0].queue[0].count);
+ }
+
+ cyg_spinlock_clear(&global->barrier_key[0].lock );;
+
+ } else {
+ ( global->barrier_key[0].count[0] )-- ;
+ //cyg_spinlock_clear(&global->barrier_key[0].count_lock[0] );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: @b (%d:%d:%d): unlock queue\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),2,global->barrier_key[0].count[0],global->barrier_key[0].queue[0].count);
+ }
+ { cyg_semaphore_post(&global->barrier_key[0].queue[0] );};
+ }
+
+/*---------------------------------*/
+}
+
+ if ((MyNum == 0) || (stats)) {
+ {
+ (time1) = cyg_current_time()*10;
+}
+
+ }
+
+/* Do 1 iteration per digit. */
+
+ rank_me_mynum = rank_me[MyNum];
+ rank_ff_mynum = gp[MyNum].rank_ff;
+ for (loopnum=0;loopnum<max_num_digits;loopnum++) {
+ shiftnum = (loopnum * log2_radix);
+ bb = (radix-1) << shiftnum;
+
+/* generate histograms based on one digit */
+
+ if ((MyNum == 0) || (stats)) {
+ {
+ (time2) = cyg_current_time()*10;
+}
+
+ }
+
+ for (i = 0; i < radix; i++) {
+ rank_me_mynum[i] = 0;
+ }
+ key_from = (int *) key[from];
+ key_to = (int *) key[to];
+ for (i=key_start;i<key_stop;i++) {
+ my_key = key_from[i] & bb;
+ my_key = my_key >> shiftnum;
+ rank_me_mynum[my_key]++;
+ }
+ key_density[0] = rank_me_mynum[0];
+ for (i=1;i<radix;i++) {
+ key_density[i] = key_density[i-1] + rank_me_mynum[i];
+ }
+
+ {
+/*---------------------------------*/
+ cyg_spinlock_spin(&( global->barrier_rank[0].lock ) );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: >b (%d:%d:%d): Enter barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),3,global->barrier_rank[0].count[0],global->barrier_rank[0].queue[0].count);
+ }
+ if ( global->barrier_rank[0].count[0] < (number_of_processors -1) ) {
+ global->barrier_rank[0].count[0]++;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ?b (%d:%d:%d): Wait\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),3,global->barrier_rank[0].count[0],global->barrier_rank[0].queue[0].count);
+ }
+ cyg_spinlock_clear(&global->barrier_rank[0].lock );;
+ { cyg_semaphore_wait(&global->barrier_rank[0].queue[0] );};
+ }
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: <b (%d:%d:%d): Exit barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),3,global->barrier_rank[0].count[0],global->barrier_rank[0].queue[0].count);
+ }
+ if ( global->barrier_rank[0].count[0] == 0 ) {
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ^b (%d:%d:%d): unlock barrier \n-----------\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),3,global->barrier_rank[0].count[0],global->barrier_rank[0].queue[0].count);
+ }
+
+ cyg_spinlock_clear(&global->barrier_rank[0].lock );;
+
+ } else {
+ ( global->barrier_rank[0].count[0] )-- ;
+ //cyg_spinlock_clear(&global->barrier_rank[0].count_lock[0] );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: @b (%d:%d:%d): unlock queue\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),3,global->barrier_rank[0].count[0],global->barrier_rank[0].queue[0].count);
+ }
+ { cyg_semaphore_post(&global->barrier_rank[0].queue[0] );};
+ }
+
+/*---------------------------------*/
+}
+
+ n = &(global->prefix_tree[MyNum]);
+ for (i = 0; i < radix; i++) {
+ n->densities[i] = key_density[i];
+ n->ranks[i] = rank_me_mynum[i];
+ }
+ offset = MyNum;
+ level = number_of_processors >> 1;
+ base = number_of_processors;
+ if ((MyNum & 0x1) == 0) {
+ {/*##########*/;};
+ }
+ while ((offset & 0x1) != 0) {
+ offset >>= 1;
+ r = n;
+ l = n - 1;
+ index = base + offset;
+ n = &(global->prefix_tree[index]);
+ {/*##########*/;};
+ {/*##########*/;};
+ if (offset != (level - 1)) {
+ for (i = 0; i < radix; i++) {
+ n->densities[i] = r->densities[i] + l->densities[i];
+ n->ranks[i] = r->ranks[i] + l->ranks[i];
+ }
+ } else {
+ for (i = 0; i < radix; i++) {
+ n->densities[i] = r->densities[i] + l->densities[i];
+ }
+ }
+ base += level;
+ level >>= 1;
+ if ((offset & 0x1) == 0) {
+ {/*##########*/;};
+ }
+ }
+ {
+/*---------------------------------*/
+ cyg_spinlock_spin(&( global->barrier_rank[0].lock ) );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: >b (%d:%d:%d): Enter barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),4,global->barrier_rank[0].count[0],global->barrier_rank[0].queue[0].count);
+ }
+ if ( global->barrier_rank[0].count[0] < (number_of_processors -1) ) {
+ global->barrier_rank[0].count[0]++;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ?b (%d:%d:%d): Wait\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),4,global->barrier_rank[0].count[0],global->barrier_rank[0].queue[0].count);
+ }
+ cyg_spinlock_clear(&global->barrier_rank[0].lock );;
+ { cyg_semaphore_wait(&global->barrier_rank[0].queue[0] );};
+ }
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: <b (%d:%d:%d): Exit barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),4,global->barrier_rank[0].count[0],global->barrier_rank[0].queue[0].count);
+ }
+ if ( global->barrier_rank[0].count[0] == 0 ) {
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ^b (%d:%d:%d): unlock barrier \n-----------\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),4,global->barrier_rank[0].count[0],global->barrier_rank[0].queue[0].count);
+ }
+
+ cyg_spinlock_clear(&global->barrier_rank[0].lock );;
+
+ } else {
+ ( global->barrier_rank[0].count[0] )-- ;
+ //cyg_spinlock_clear(&global->barrier_rank[0].count_lock[0] );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: @b (%d:%d:%d): unlock queue\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),4,global->barrier_rank[0].count[0],global->barrier_rank[0].queue[0].count);
+ }
+ { cyg_semaphore_post(&global->barrier_rank[0].queue[0] );};
+ }
+
+/*---------------------------------*/
+};
+
+ if (MyNum != (number_of_processors - 1)) {
+ offset = MyNum;
+ level = number_of_processors;
+ base = 0;
+ while ((offset & 0x1) != 0) {
+ offset >>= 1;
+ base += level;
+ level >>= 1;
+ }
+ my_node = &(global->prefix_tree[base + offset]);
+ offset >>= 1;
+ base += level;
+ level >>= 1;
+ while ((offset & 0x1) != 0) {
+ offset >>= 1;
+ base += level;
+ level >>= 1;
+ }
+ their_node = &(global->prefix_tree[base + offset]);
+ {/*##########*/;};
+ {/*##########*/;};
+ for (i = 0; i < radix; i++) {
+ my_node->densities[i] = their_node->densities[i];
+ }
+ } else {
+ my_node = &(global->prefix_tree[(2 * number_of_processors) - 2]);
+ }
+ offset = MyNum;
+ level = number_of_processors;
+ base = 0;
+ while ((offset & 0x1) != 0) {
+ {/*##########*/;};
+ offset >>= 1;
+ base += level;
+ level >>= 1;
+ }
+ offset = MyNum;
+ level = number_of_processors;
+ base = 0;
+ for(i = 0; i < radix; i++) {
+ rank_ff_mynum[i] = 0;
+ }
+ while (offset != 0) {
+ if ((offset & 0x1) != 0) {
+ /* Add ranks of node to your left at this level */
+ l = &(global->prefix_tree[base + offset - 1]);
+ for (i = 0; i < radix; i++) {
+ rank_ff_mynum[i] += l->ranks[i];
+ }
+ }
+ base += level;
+ level >>= 1;
+ offset >>= 1;
+ }
+ for (i = 1; i < radix; i++) {
+ rank_ff_mynum[i] += my_node->densities[i - 1];
+ }
+
+ if ((MyNum == 0) || (stats)) {
+ {
+ (time3) = cyg_current_time()*10;
+}
+;
+ }
+
+ {
+/*---------------------------------*/
+ cyg_spinlock_spin(&( global->barrier_rank[0].lock ) );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: >b (%d:%d:%d): Enter barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),5,global->barrier_rank[0].count[0],global->barrier_rank[0].queue[0].count);
+ }
+ if ( global->barrier_rank[0].count[0] < (number_of_processors -1) ) {
+ global->barrier_rank[0].count[0]++;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ?b (%d:%d:%d): Wait\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),5,global->barrier_rank[0].count[0],global->barrier_rank[0].queue[0].count);
+ }
+ cyg_spinlock_clear(&global->barrier_rank[0].lock );;
+ { cyg_semaphore_wait(&global->barrier_rank[0].queue[0] );};
+ }
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: <b (%d:%d:%d): Exit barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),5,global->barrier_rank[0].count[0],global->barrier_rank[0].queue[0].count);
+ }
+ if ( global->barrier_rank[0].count[0] == 0 ) {
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ^b (%d:%d:%d): unlock barrier \n-----------\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),5,global->barrier_rank[0].count[0],global->barrier_rank[0].queue[0].count);
+ }
+
+ cyg_spinlock_clear(&global->barrier_rank[0].lock );;
+
+ } else {
+ ( global->barrier_rank[0].count[0] )-- ;
+ //cyg_spinlock_clear(&global->barrier_rank[0].count_lock[0] );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: @b (%d:%d:%d): unlock queue\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),5,global->barrier_rank[0].count[0],global->barrier_rank[0].queue[0].count);
+ }
+ { cyg_semaphore_post(&global->barrier_rank[0].queue[0] );};
+ }
+
+/*---------------------------------*/
+};
+
+ if ((MyNum == 0) || (stats)) {
+ {
+ (time4) = cyg_current_time()*10;
+}
+;
+ }
+
+ /* put it in order according to this digit */
+
+ for (i = key_start; i < key_stop; i++) {
+ this_key = key_from[i] & bb;
+ this_key = this_key >> shiftnum;
+ tmp = rank_ff_mynum[this_key];
+ key_to[tmp] = key_from[i];
+ rank_ff_mynum[this_key]++;
+ } /* i */
+
+ if ((MyNum == 0) || (stats)) {
+ {
+ (time5) = cyg_current_time()*10;
+}
+;
+ }
+
+ if (loopnum != max_num_digits-1) {
+ from = from ^ 0x1;
+ to = to ^ 0x1;
+ }
+
+ {
+/*---------------------------------*/
+ cyg_spinlock_spin(&( global->barrier_rank[0].lock ) );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: >b (%d:%d:%d): Enter barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),6,global->barrier_rank[0].count[0],global->barrier_rank[0].queue[0].count);
+ }
+ if ( global->barrier_rank[0].count[0] < (number_of_processors -1) ) {
+ global->barrier_rank[0].count[0]++;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ?b (%d:%d:%d): Wait\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),6,global->barrier_rank[0].count[0],global->barrier_rank[0].queue[0].count);
+ }
+ cyg_spinlock_clear(&global->barrier_rank[0].lock );;
+ { cyg_semaphore_wait(&global->barrier_rank[0].queue[0] );};
+ }
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: <b (%d:%d:%d): Exit barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),6,global->barrier_rank[0].count[0],global->barrier_rank[0].queue[0].count);
+ }
+ if ( global->barrier_rank[0].count[0] == 0 ) {
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ^b (%d:%d:%d): unlock barrier \n-----------\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),6,global->barrier_rank[0].count[0],global->barrier_rank[0].queue[0].count);
+ }
+
+ cyg_spinlock_clear(&global->barrier_rank[0].lock );;
+
+ } else {
+ ( global->barrier_rank[0].count[0] )-- ;
+ //cyg_spinlock_clear(&global->barrier_rank[0].count_lock[0] );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: @b (%d:%d:%d): unlock queue\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),6,global->barrier_rank[0].count[0],global->barrier_rank[0].queue[0].count);
+ }
+ { cyg_semaphore_post(&global->barrier_rank[0].queue[0] );};
+ }
+
+/*---------------------------------*/
+}
+
+ if ((MyNum == 0) || (stats)) {
+ ranktime += (time3 - time2);
+ sorttime += (time5 - time4);
+ }
+ } /* for */
+
+ {
+/*---------------------------------*/
+ cyg_spinlock_spin(&( global->barrier_rank[0].lock ) );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: >b (%d:%d:%d): Enter barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),7,global->barrier_rank[0].count[0],global->barrier_rank[0].queue[0].count);
+ }
+ if ( global->barrier_rank[0].count[0] < (number_of_processors -1) ) {
+ global->barrier_rank[0].count[0]++;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ?b (%d:%d:%d): Wait\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),7,global->barrier_rank[0].count[0],global->barrier_rank[0].queue[0].count);
+ }
+ cyg_spinlock_clear(&global->barrier_rank[0].lock );;
+ { cyg_semaphore_wait(&global->barrier_rank[0].queue[0] );};
+ }
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: <b (%d:%d:%d): Exit barrier\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),7,global->barrier_rank[0].count[0],global->barrier_rank[0].queue[0].count);
+ }
+ if ( global->barrier_rank[0].count[0] == 0 ) {
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: ^b (%d:%d:%d): unlock barrier \n-----------\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),7,global->barrier_rank[0].count[0],global->barrier_rank[0].queue[0].count);
+ }
+
+ cyg_spinlock_clear(&global->barrier_rank[0].lock );;
+
+ } else {
+ ( global->barrier_rank[0].count[0] )-- ;
+ //cyg_spinlock_clear(&global->barrier_rank[0].count_lock[0] );;
+
+ if (dbg_on) {
+ diag_printf("<cpu%d> thread %d: @b (%d:%d:%d): unlock queue\n",cyg_hal_get_current_cpuid(),cyg_hal_get_current_threadid(),7,global->barrier_rank[0].count[0],global->barrier_rank[0].queue[0].count);
+ }
+ { cyg_semaphore_post(&global->barrier_rank[0].queue[0] );};
+ }
+
+/*---------------------------------*/
+}
+ if ((MyNum == 0) || (stats)) {
+ {
+ (time6) = cyg_current_time()*10;
+}
+
+ global->ranktime[MyNum] = ranktime;
+ global->sorttime[MyNum] = sorttime;
+ global->totaltime[MyNum] = time6-time1;
+ }
+ if (MyNum == 0) {
+ global->rs = time1;
+ global->rf = time6;
+ global->final = to;
+ }
+
+ thread_array[MyNum] = 0;
+}
+
+double product_mod_46(t1, t2) /* product_mod_46() returns the product
+ (mod 2^46) of t1 and t2. */
+double t1;
+double t2;
+
+{
+ double a1;
+ double b1;
+ double a2;
+ double b2;
+
+ a1 = (double)((int)(t1 / RADIX_S)); /* Decompose the arguments. */
+ a2 = t1 - a1 * RADIX_S;
+ b1 = (double)((int)(t2 / RADIX_S));
+ b2 = t2 - b1 * RADIX_S;
+ t1 = a1 * b2 + a2 * b1; /* Multiply the arguments. */
+ t2 = (double)((int)(t1 / RADIX_S));
+ t2 = t1 - t2 * RADIX_S;
+ t1 = t2 * RADIX_S + a2 * b2;
+ t2 = (double)((int)(t1 / RADIX));
+
+ return (t1 - t2 * RADIX); /* Return the product. */
+}
+
+double ran_num_init(k, b, t) /* finds the (k)th random number,
+ given the seed, b, and the ratio, t. */
+unsigned int k;
+double b;
+double t;
+
+{
+ unsigned int j;
+
+ while (k != 0) { /* while() is executed m times
+ such that 2^m > k. */
+ j = k >> 1;
+ if ((j << 1) != k) {
+ b = product_mod_46(b, t);
+ }
+ t = product_mod_46(t, t);
+ k = j;
+ }
+
+ return b;
+}
+
+int get_max_digits(max_key)
+
+int max_key;
+
+{
+ int done = 0;
+ int temp = 1;
+ int key_val;
+
+ key_val = max_key;
+ while (!done) {
+ key_val = key_val / radix;
+ if (key_val == 0) {
+ done = 1;
+ } else {
+ temp ++;
+ }
+ }
+ return temp;
+}
+
+int get_log2_radix(rad)
+
+int rad;
+
+{
+ int cumulative=1;
+ int out;
+
+ for (out = 0; out < 20; out++) {
+ if (cumulative == rad) {
+ return(out);
+ } else {
+ cumulative = cumulative * 2;
+ }
+ }
+ diag_printf("ERROR: Radix %d not a power of 2\n", rad);
+ exit(-1);
+}
+
+int get_log2_keys(num_keys)
+
+int num_keys;
+
+{
+ int cumulative=1;
+ int out;
+
+ for (out = 0; out < 30; out++) {
+ if (cumulative == num_keys) {
+ return(out);
+ } else {
+ cumulative = cumulative * 2;
+ }
+ }
+ diag_printf("ERROR: Number of keys %d not a power of 2\n", num_keys);
+ exit(-1);
+}
+
+int log_2(number)
+
+int number;
+
+{
+ int cumulative = 1;
+ int out = 0;
+ int done = 0;
+
+ while ((cumulative < number) && (!done) && (out < 50)) {
+ if (cumulative == number) {
+ done = 1;
+ } else {
+ cumulative = cumulative * 2;
+ out ++;
+ }
+ }
+
+ if (cumulative == number) {
+ return(out);
+ } else {
+ return(-1);
+ }
+}
+
+void printerr(s)
+
+char *s;
+
+{
+ diag_printf("ERROR: %s\n",s);
+}
+
+void init(key_start,key_stop,from)
+
+int key_start;
+int key_stop;
+int from;
+
+{
+ double ran_num;
+ double sum;
+ int tmp;
+ int i;
+ int *key_from;
+
+ ran_num = ran_num_init((key_start << 2) + 1, SEED, RATIO);
+ sum = ran_num / RADIX;
+ key_from = (int *) key[from];
+ for (i = key_start; i < key_stop; i++) {
+ ran_num = product_mod_46(ran_num, RATIO);
+ sum = sum + ran_num / RADIX;
+ ran_num = product_mod_46(ran_num, RATIO);
+ sum = sum + ran_num / RADIX;
+ ran_num = product_mod_46(ran_num, RATIO);
+ sum = sum + ran_num / RADIX;
+ key_from[i] = (int) ((sum / 4.0) * max_key);
+ tmp = (int) ((key_from[i])/100);
+ ran_num = product_mod_46(ran_num, RATIO);
+ sum = ran_num / RADIX;
+ }
+
+}
+
+void test_sort(final)
+
+int final;
+
+{
+ int i;
+ int mistake = 0;
+ int *key_final;
+
+ diag_printf("\n");
+ diag_printf(" TESTING RESULTS\n");
+ key_final = key[final];
+ for (i = 0; i < num_keys-1; i++) {
+ if (key_final[i] > key_final[i + 1]) {
+ diag_printf("error with key %d, value %d %d \n",
+ i,key_final[i],key_final[i + 1]);
+ mistake++;
+ }
+ }
+
+ if (mistake) {
+ diag_printf("FAILED: %d keys out of place.\n", mistake);
+ } else {
+ diag_printf("PASSED: All keys in place.\n");
+ }
+ diag_printf("\n");
+}
+
+void printout()
+
+{
+ int i;
+ int mistake;
+ int *key_final;
+
+ key_final = (int *) key[global->final];
+ diag_printf("\n");
+ diag_printf(" SORTED KEY VALUES\n");
+ diag_printf("%8d ",key_final[0]);
+ for (i = 0; i < num_keys-1; i++) {
+ diag_printf("%8d ",key_final[i+1]);
+ if ((i+2)%5 == 0) {
+ diag_printf("\n");
+ }
+ }
+ diag_printf("\n");
+}
+
+#include <tests_smp/getopt.c>
diff --git a/cesar/ecos/packages/kernel/current/tests_smp/radix_arg.c b/cesar/ecos/packages/kernel/current/tests_smp/radix_arg.c
new file mode 100644
index 0000000000..6858edd616
--- /dev/null
+++ b/cesar/ecos/packages/kernel/current/tests_smp/radix_arg.c
@@ -0,0 +1,12 @@
+//___________________________________________
+#define _str_HAL_SMP_CPU_MAX(x) #x
+#define smp_cyg_test_argc 5
+char *_cyg_argv[] = {
+ "radix.exe",
+ "-p",
+ "2", /* not used, using HAL_SMP_CPU_MAX */
+ "-m",
+ "10"
+};
+#define smp_cyg_test_argv &_cyg_argv
+//'''''''''''''''''''''''''''''''''''''''''''