Skip to content

Commit b75aeee

Browse files
2x Speedup on bench_avx512
0 parents  commit b75aeee

8 files changed

+426
-0
lines changed

Makefile

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Toolchain and flags for the scalar and AVX-512 benchmarks.
CC=clang
CFLAGS=-O3 -Wall -Wextra
AVX_FLAGS=-mavx512f
NASM=nasm
NASMFLAGS=-f elf64

all: benchmark_scalar benchmark_avx512

# The binaries are written to bench/, so the benchmark_* target names never
# exist as files; they are listed in .PHONY below.
benchmark_scalar: bench/benchmark_scalar.c
	$(CC) $(CFLAGS) $^ -o bench/$@

# First assemble the NASM file, then compile and link with clang
bench/vec_impl.o: bench/vec_impl.nasm
	$(NASM) $(NASMFLAGS) $< -o $@

benchmark_avx512: bench/benchmark_avx512.c bench/vec_impl.o
	$(CC) $(CFLAGS) $(AVX_FLAGS) $^ -o bench/$@

# Object files are produced in bench/ (see the bench/vec_impl.o rule),
# so that is where they are removed from.
clean:
	rm -f bench/benchmark_scalar bench/benchmark_avx512 bench/*.o

run_scalar: benchmark_scalar
	./bench/benchmark_scalar

run_avx512: benchmark_avx512
	./bench/benchmark_avx512

.PHONY: all clean run_scalar run_avx512 benchmark_scalar benchmark_avx512

bench/benchmark.c

+160
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
#include <stdio.h>
2+
#include <stdlib.h>
3+
#include <stdint.h>
4+
#include <x86intrin.h>
5+
#include <time.h>
6+
7+
extern void vec_assign_ones(short* x);
8+
9+
/*
 * Scalar reference implementation: store 1 into every even-indexed element
 * of a 32-element short buffer. Odd-indexed elements are never written.
 *
 * x must point to at least 32 shorts.
 */
void scalar_assign_ones(short* x) {
    enum { ELEMENT_COUNT = 32, STRIDE = 2 };

    for (int i = 0; i < ELEMENT_COUNT; i += STRIDE) {
        x[i] = 1;
    }
}
14+
15+
/*
 * Check that a 32-element buffer holds the pattern the benchmark kernels
 * are expected to produce: 1 at every even index, 0 at every odd index.
 *
 * arr is only read. Returns 1 when all 32 elements match, 0 as soon as a
 * mismatch is found.
 */
int verify_array(const short* arr) {
    enum { ELEMENT_COUNT = 32 };

    for (int i = 0; i < ELEMENT_COUNT; i++) {
        const short expected = (i % 2 == 0) ? 1 : 0;
        if (arr[i] != expected) {
            return 0;
        }
    }
    return 1;
}
24+
25+
/*
 * Zero all 32 elements of the benchmark buffer.
 *
 * arr must point to at least 32 shorts.
 */
void reset_array(short* arr) {
    enum { ELEMENT_COUNT = 32 };

    for (int i = 0; i < ELEMENT_COUNT; i++) {
        arr[i] = 0;
    }
}
30+
31+
/*
 * Print the first n elements of arr to stdout, space separated, starting a
 * new line after every 8th element and ending with one extra newline.
 *
 * arr is only read.
 */
void print_array(const short* arr, int n) {
    enum { VALUES_PER_ROW = 8 };

    for (int i = 0; i < n; i++) {
        printf("%d ", arr[i]);
        if ((i + 1) % VALUES_PER_ROW == 0) {
            printf("\n");
        }
    }
    printf("\n");
}
38+
39+
/*
 * Execute the RDTSC instruction and return its result as one 64-bit value.
 * The asm writes two 32-bit outputs (register "a" -> lo, register "d" -> hi),
 * which are combined with hi in the upper 32 bits.
 * NOTE(review): no caller of this helper is visible in this file.
 */
static inline uint64_t rdtsc(void) {
40+
unsigned int lo, hi;
41+
__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
42+
return ((uint64_t)hi << 32) | lo;
43+
}
44+
45+
/*
 * Execute the RDTSCP instruction and return the counter as one 64-bit value
 * (register "d" output in the upper 32 bits, register "a" output in the lower).
 * The third output (register "c" -> aux) is captured so the compiler knows the
 * register is written, but its value is discarded.
 */
static inline uint64_t rdtscp(void) {
46+
unsigned int lo, hi, aux;
47+
__asm__ __volatile__ ("rdtscp" : "=a" (lo), "=d" (hi), "=c" (aux));
48+
return ((uint64_t)hi << 32) | lo;
49+
}
50+
51+
/*
 * Issue an LFENCE through the _mm_lfence() intrinsic. main() calls this
 * around the timestamp reads that bracket each timed loop.
 * NOTE(review): despite the name, only an LFENCE is issued here — confirm
 * that this provides the ordering the measurement needs.
 */
static inline void cpu_serialize(void) {
52+
_mm_lfence();
53+
}
54+
55+
/* Number of passes executed by each timed loop in main(). */
#define NUM_ITERATIONS 1000
56+
/* Number of short elements allocated for (and printed from) the test buffer. */
#define ARRAY_SIZE 32
57+
/* Alignment, in bytes, requested from aligned_alloc() for the test buffer. */
#define ALIGNMENT 64
58+
/* Warm-up pass count for the benchmark; set to 0. */
#define WARMUP_ITERATIONS 0
59+
60+
int main() {
61+
printf("Total iterations: %d\n", NUM_ITERATIONS);
62+
63+
// Allocate aligned memory
64+
short* array = (short*)aligned_alloc(ALIGNMENT, ARRAY_SIZE * sizeof(short));
65+
if (!array) {
66+
printf("Memory allocation failed!\n");
67+
return 1;
68+
}
69+
70+
uint64_t start_cycles, end_cycles;
71+
struct timespec start_time, end_time;
72+
double time_taken_scalar, time_taken_vector;
73+
74+
// Test vectorized version
75+
printf("Testing AVX-512 version:\n");
76+
reset_array(array);
77+
78+
// Warmup
79+
// for (int i = 0; i < WARMUP_ITERATIONS; i++) {
80+
// vec_assign_ones(array);
81+
// }
82+
83+
// Start timing
84+
cpu_serialize();
85+
clock_gettime(CLOCK_MONOTONIC, &start_time);
86+
start_cycles = rdtscp();
87+
88+
// Main test loop
89+
for (int i = 0; i < NUM_ITERATIONS; i++) {
90+
reset_array(array);
91+
vec_assign_ones(array);
92+
}
93+
94+
// End timing
95+
end_cycles = rdtscp();
96+
cpu_serialize();
97+
clock_gettime(CLOCK_MONOTONIC, &end_time);
98+
99+
if (!verify_array(array)) {
100+
printf("AVX-512 version produced incorrect results!\n");
101+
print_array(array, ARRAY_SIZE);
102+
}
103+
104+
uint64_t vec_total_cycles = end_cycles - start_cycles;
105+
time_taken_vector = (end_time.tv_sec - start_time.tv_sec) +
106+
(end_time.tv_nsec - start_time.tv_nsec) / 1e9;
107+
108+
printf("AVX-512 version:\n");
109+
printf(" Total cycles: %lu\n", vec_total_cycles);
110+
printf(" Cycles per iteration: %.2f\n", (double)vec_total_cycles / NUM_ITERATIONS);
111+
printf(" Total time: %.9f seconds\n", time_taken_vector);
112+
printf(" Time per iteration: %.2f nanoseconds\n", (time_taken_vector * 1e9) / NUM_ITERATIONS);
113+
114+
// Test scalar version
115+
printf("\nTesting scalar version:\n");
116+
reset_array(array);
117+
118+
// Warmup
119+
// for (int i = 0; i < WARMUP_ITERATIONS; i++) {
120+
// scalar_assign_ones(array);
121+
// }
122+
123+
// Start timing
124+
cpu_serialize();
125+
clock_gettime(CLOCK_MONOTONIC, &start_time);
126+
start_cycles = rdtscp();
127+
128+
// Main test loop
129+
for (int i = 0; i < NUM_ITERATIONS; i++) {
130+
reset_array(array);
131+
scalar_assign_ones(array);
132+
}
133+
134+
// End timing
135+
end_cycles = rdtscp();
136+
cpu_serialize();
137+
clock_gettime(CLOCK_MONOTONIC, &end_time);
138+
139+
if (!verify_array(array)) {
140+
printf("Scalar version produced incorrect results!\n");
141+
print_array(array, ARRAY_SIZE);
142+
}
143+
144+
uint64_t scalar_total_cycles = end_cycles - start_cycles;
145+
time_taken_scalar = (end_time.tv_sec - start_time.tv_sec) +
146+
(end_time.tv_nsec - start_time.tv_nsec) / 1e9;
147+
148+
printf("Scalar version:\n");
149+
printf(" Total cycles: %lu\n", scalar_total_cycles);
150+
printf(" Cycles per iteration: %.2f\n", (double)scalar_total_cycles / NUM_ITERATIONS);
151+
printf(" Total time: %.9f seconds\n", time_taken_scalar);
152+
printf(" Time per iteration: %.2f nanoseconds\n", (time_taken_scalar * 1e9) / NUM_ITERATIONS);
153+
154+
printf("\nSpeedup:\n");
155+
printf(" Cycles: %.2fx\n", (double)scalar_total_cycles / vec_total_cycles);
156+
printf(" Time: %.2fx\n", (double)time_taken_scalar / time_taken_vector);
157+
158+
free(array);
159+
return 0;
160+
}

bench/benchmark_avx512

15.6 KB
Binary file not shown.

bench/benchmark_avx512.c

+105
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
#include <stdio.h>
2+
#include <stdlib.h>
3+
#include <stdint.h>
4+
#include <x86intrin.h>
5+
#include <time.h>
6+
7+
extern void vec_assign_ones(short* x);
8+
9+
/*
 * Check that a 32-element buffer holds the pattern vec_assign_ones() is
 * expected to produce: 1 at every even index, 0 at every odd index.
 *
 * arr is only read. Returns 1 when all 32 elements match, 0 as soon as a
 * mismatch is found.
 */
int verify_array(const short* arr) {
    enum { ELEMENT_COUNT = 32 };

    for (int i = 0; i < ELEMENT_COUNT; i++) {
        const short expected = (i % 2 == 0) ? 1 : 0;
        if (arr[i] != expected) {
            return 0;
        }
    }
    return 1;
}
18+
19+
/*
 * Zero all 32 elements of the benchmark buffer.
 *
 * arr must point to at least 32 shorts.
 */
void reset_array(short* arr) {
    enum { ELEMENT_COUNT = 32 };

    for (int i = 0; i < ELEMENT_COUNT; i++) {
        arr[i] = 0;
    }
}
24+
25+
/*
 * Print the first n elements of arr to stdout, space separated, starting a
 * new line after every 8th element and ending with one extra newline.
 *
 * arr is only read.
 */
void print_array(const short* arr, int n) {
    enum { VALUES_PER_ROW = 8 };

    for (int i = 0; i < n; i++) {
        printf("%d ", arr[i]);
        if ((i + 1) % VALUES_PER_ROW == 0) {
            printf("\n");
        }
    }
    printf("\n");
}
32+
33+
// static inline uint64_t rdtsc(void) {
34+
// unsigned int lo, hi;
35+
// __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
36+
// return ((uint64_t)hi << 32) | lo;
37+
// }
38+
39+
/*
 * Execute the RDTSCP instruction and return the counter as one 64-bit value
 * (register "d" output in the upper 32 bits, register "a" output in the lower).
 * The third output (register "c" -> aux) is captured so the compiler knows the
 * register is written, but its value is discarded.
 */
static inline uint64_t rdtscp(void) {
40+
unsigned int lo, hi, aux;
41+
__asm__ __volatile__ ("rdtscp" : "=a" (lo), "=d" (hi), "=c" (aux));
42+
return ((uint64_t)hi << 32) | lo;
43+
}
44+
45+
/*
 * Issue an LFENCE through the _mm_lfence() intrinsic. main() calls this
 * before the first and after the last timestamp read of the timed loop.
 * NOTE(review): despite the name, only an LFENCE is issued here — confirm
 * that this provides the ordering the measurement needs.
 */
static inline void cpu_serialize(void) {
46+
_mm_lfence();
47+
}
48+
49+
/* Number of passes executed by the timed loop in main(). */
#define NUM_ITERATIONS 1000
50+
/* Number of short elements allocated for (and printed from) the test buffer. */
#define ARRAY_SIZE 32
51+
/* Alignment, in bytes, requested from aligned_alloc() for the test buffer. */
#define ALIGNMENT 64
52+
53+
int main() {
54+
printf("AVX-512 Benchmark\n");
55+
printf("Total iterations: %d\n", NUM_ITERATIONS);
56+
57+
// Allocate aligned memory
58+
short* array = (short*)aligned_alloc(ALIGNMENT, ARRAY_SIZE * sizeof(short));
59+
if (!array) {
60+
printf("Memory allocation failed!\n");
61+
return 1;
62+
}
63+
64+
uint64_t start_cycles, end_cycles;
65+
struct timespec start_time, end_time;
66+
double time_taken;
67+
68+
// Test vectorized version
69+
printf("\nTesting AVX-512 version:\n");
70+
reset_array(array);
71+
72+
// Start timing
73+
cpu_serialize();
74+
clock_gettime(CLOCK_MONOTONIC, &start_time);
75+
start_cycles = rdtscp();
76+
77+
// Main test loop
78+
for (int i = 0; i < NUM_ITERATIONS; i++) {
79+
reset_array(array);
80+
vec_assign_ones(array);
81+
}
82+
83+
// End timing
84+
end_cycles = rdtscp();
85+
cpu_serialize();
86+
clock_gettime(CLOCK_MONOTONIC, &end_time);
87+
88+
if (!verify_array(array)) {
89+
printf("AVX-512 version produced incorrect results!\n");
90+
print_array(array, ARRAY_SIZE);
91+
}
92+
93+
uint64_t total_cycles = end_cycles - start_cycles;
94+
time_taken = (end_time.tv_sec - start_time.tv_sec) +
95+
(end_time.tv_nsec - start_time.tv_nsec) / 1e9;
96+
97+
printf("Results:\n");
98+
printf(" Total cycles: %lu\n", total_cycles);
99+
printf(" Cycles per iteration: %.2f\n", (double)total_cycles / NUM_ITERATIONS);
100+
printf(" Total time: %.9f seconds\n", time_taken);
101+
printf(" Time per iteration: %.2f nanoseconds\n", (time_taken * 1e9) / NUM_ITERATIONS);
102+
103+
free(array);
104+
return 0;
105+
}

bench/benchmark_scalar

15.6 KB
Binary file not shown.

0 commit comments

Comments
 (0)