yet-another-apply-test.c [plain text]
#include <dispatch/dispatch.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <mach/mach.h>
#include <mach/mach_time.h>
#include <stdlib.h>
#include <stdio.h>
#include <fcntl.h>
#include <assert.h>
static uint64_t total;
#define LAPS (256 * 1024 * 1024)
#define SIZE (LAPS * sizeof(int))
int
main(int argc, char *argv[])
{
dispatch_queue_t cq = dispatch_get_concurrent_queue(0);
struct stat sb;
long double cycles;
uint64_t s, e, d;
uint64_t tmp_total;
int r, fd;
const int *nums;
size_t i, stride;
if (argc != 2) {
fprintf(stderr, "usage: %s <file>\n", argv[0]);
exit(EXIT_FAILURE);
}
fd = open(argv[1], O_RDONLY);
assert(fd != -1);
r = fstat(fd, &sb);
assert(r != -1);
assert(sb.st_size >= (off_t)SIZE);
nums = mmap(NULL, SIZE, PROT_READ, MAP_FILE|MAP_PRIVATE, fd, 0);
assert(nums != MAP_FAILED);
for (i = 0; i < LAPS; i++) {
total += nums[i];
}
for (stride = 1; stride < (LAPS + 1); stride <<= 1) {
total = 0;
s = mach_absolute_time();
dispatch_apply(LAPS / stride, cq, ^(size_t idx) {
const int *nums2 = nums + (idx * stride);
uint64_t ptotal = 0;
size_t idx2 = 0;
do {
ptotal += nums2[idx2++];
} while (idx2 < stride);
__sync_fetch_and_add(&total, ptotal);
});
e = mach_absolute_time();
d = e - s;
cycles = d;
cycles /= LAPS;
printf("da%lu:\t%Lf ns\n", stride, cycles);
}
tmp_total = 0;
total = 0;
s = mach_absolute_time();
for (i = 0; i < LAPS; i++) {
tmp_total += nums[i];
}
total = tmp_total;
e = mach_absolute_time();
d = e - s;
cycles = d;
cycles /= LAPS;
printf("naïve:\t%Lf ns\n", cycles);
tmp_total = 0;
total = 0;
s = mach_absolute_time();
#pragma omp parallel for reduction(+:tmp_total)
for (i = 0; i < LAPS; i++) {
tmp_total += nums[i];
}
total = tmp_total;
e = mach_absolute_time();
d = e - s;
cycles = d;
cycles /= LAPS;
printf("OpenMP:\t%Lf ns\n", cycles);
exit(EXIT_SUCCESS);
}