1
+ #include <stdio.h>
2
+ #include <stdlib.h>
3
+ #include <stdint.h>
4
+ #include <x86intrin.h>
5
+ #include <time.h>
6
+
7
+ extern void vec_assign_ones (short * x );
8
+
9
/*
 * Scalar reference implementation: writes 1 to every even-indexed element
 * of a 32-element array of shorts (indices 0, 2, ..., 30).
 * Odd-indexed elements are left untouched.
 */
void scalar_assign_ones(short *x) {
    /* 16 pairs cover the 32 elements; only the first slot of each pair is set. */
    for (int pair = 0; pair < 16; pair++) {
        x[2 * pair] = 1;
    }
}
14
+
15
/*
 * Checks the first 32 elements of arr against the expected pattern:
 * 1 at every even index, 0 at every odd index.
 *
 * Returns 1 when the whole pattern matches, 0 at the first mismatch.
 */
int verify_array(short *arr) {
    for (int idx = 0; idx < 32; idx++) {
        /* Even slots must hold 1, odd slots must hold 0. */
        const short expected = (idx % 2 == 0) ? 1 : 0;
        if (arr[idx] != expected) {
            return 0;
        }
    }
    return 1;
}
24
+
25
/*
 * Zeroes the first 32 elements of arr, in ascending index order.
 */
void reset_array(short *arr) {
    short *cursor = arr;
    short *const stop = arr + 32; /* one past the last element cleared */

    while (cursor != stop) {
        *cursor++ = 0;
    }
}
30
+
31
/*
 * Prints the first n elements of arr to stdout as decimal integers,
 * each followed by a space, breaking the line after every 8th value.
 * A final newline is always written, even when n is 0.
 */
void print_array(short *arr, int n) {
    int idx = 0;

    while (idx < n) {
        printf("%d ", arr[idx]);
        idx++;
        /* idx now counts the values printed so far; wrap after each group of 8. */
        if (idx % 8 == 0) {
            printf("\n");
        }
    }
    printf("\n");
}
38
+
39
/*
 * Reads the CPU time-stamp counter with the x86 RDTSC instruction.
 *
 * RDTSC delivers the counter in EDX:EAX; the two 32-bit halves are
 * combined into a single 64-bit value, which is returned.
 */
static inline uint64_t rdtsc(void) {
    unsigned int low_half, high_half;

    __asm__ __volatile__("rdtsc" : "=a"(low_half), "=d"(high_half));

    uint64_t stamp = (uint64_t)high_half << 32;
    stamp |= low_half;
    return stamp;
}
44
+
45
/*
 * Reads the CPU time-stamp counter with the x86 RDTSCP instruction.
 *
 * RDTSCP delivers the counter in EDX:EAX and additionally writes ECX,
 * which is captured in a dummy output so the compiler knows the register
 * is overwritten. The value read into ECX is not used.
 *
 * The "memory" clobber makes the statement a compiler-level barrier, so
 * loads and stores belonging to the code being timed cannot be moved
 * across the timestamp read by the optimizer.
 *
 * Returns the 64-bit counter value.
 */
static inline uint64_t rdtscp(void) {
    unsigned int lo, hi, aux;

    __asm__ __volatile__("rdtscp"
                         : "=a"(lo), "=d"(hi), "=c"(aux)
                         :
                         : "memory");
    (void)aux; /* only present to describe the ECX write */

    return ((uint64_t)hi << 32) | lo;
}
50
+
51
/*
 * Executes an LFENCE through the _mm_lfence() intrinsic; used around the
 * timestamp reads in the benchmark.
 *
 * NOTE(review): LFENCE is an ordering fence rather than a fully
 * serializing instruction such as CPUID, so the function name promises a
 * little more than the body delivers -- confirm this is the intent.
 */
static inline void cpu_serialize(void) {
    _mm_lfence();
}
54
+
55
+ #define NUM_ITERATIONS 1000
56
+ #define ARRAY_SIZE 32
57
+ #define ALIGNMENT 64
58
+ #define WARMUP_ITERATIONS 0
59
+
60
+ int main () {
61
+ printf ("Total iterations: %d\n" , NUM_ITERATIONS );
62
+
63
+ // Allocate aligned memory
64
+ short * array = (short * )aligned_alloc (ALIGNMENT , ARRAY_SIZE * sizeof (short ));
65
+ if (!array ) {
66
+ printf ("Memory allocation failed!\n" );
67
+ return 1 ;
68
+ }
69
+
70
+ uint64_t start_cycles , end_cycles ;
71
+ struct timespec start_time , end_time ;
72
+ double time_taken_scalar , time_taken_vector ;
73
+
74
+ // Test vectorized version
75
+ printf ("Testing AVX-512 version:\n" );
76
+ reset_array (array );
77
+
78
+ // Warmup
79
+ // for (int i = 0; i < WARMUP_ITERATIONS; i++) {
80
+ // vec_assign_ones(array);
81
+ // }
82
+
83
+ // Start timing
84
+ cpu_serialize ();
85
+ clock_gettime (CLOCK_MONOTONIC , & start_time );
86
+ start_cycles = rdtscp ();
87
+
88
+ // Main test loop
89
+ for (int i = 0 ; i < NUM_ITERATIONS ; i ++ ) {
90
+ reset_array (array );
91
+ vec_assign_ones (array );
92
+ }
93
+
94
+ // End timing
95
+ end_cycles = rdtscp ();
96
+ cpu_serialize ();
97
+ clock_gettime (CLOCK_MONOTONIC , & end_time );
98
+
99
+ if (!verify_array (array )) {
100
+ printf ("AVX-512 version produced incorrect results!\n" );
101
+ print_array (array , ARRAY_SIZE );
102
+ }
103
+
104
+ uint64_t vec_total_cycles = end_cycles - start_cycles ;
105
+ time_taken_vector = (end_time .tv_sec - start_time .tv_sec ) +
106
+ (end_time .tv_nsec - start_time .tv_nsec ) / 1e9 ;
107
+
108
+ printf ("AVX-512 version:\n" );
109
+ printf (" Total cycles: %lu\n" , vec_total_cycles );
110
+ printf (" Cycles per iteration: %.2f\n" , (double )vec_total_cycles / NUM_ITERATIONS );
111
+ printf (" Total time: %.9f seconds\n" , time_taken_vector );
112
+ printf (" Time per iteration: %.2f nanoseconds\n" , (time_taken_vector * 1e9 ) / NUM_ITERATIONS );
113
+
114
+ // Test scalar version
115
+ printf ("\nTesting scalar version:\n" );
116
+ reset_array (array );
117
+
118
+ // Warmup
119
+ // for (int i = 0; i < WARMUP_ITERATIONS; i++) {
120
+ // scalar_assign_ones(array);
121
+ // }
122
+
123
+ // Start timing
124
+ cpu_serialize ();
125
+ clock_gettime (CLOCK_MONOTONIC , & start_time );
126
+ start_cycles = rdtscp ();
127
+
128
+ // Main test loop
129
+ for (int i = 0 ; i < NUM_ITERATIONS ; i ++ ) {
130
+ reset_array (array );
131
+ scalar_assign_ones (array );
132
+ }
133
+
134
+ // End timing
135
+ end_cycles = rdtscp ();
136
+ cpu_serialize ();
137
+ clock_gettime (CLOCK_MONOTONIC , & end_time );
138
+
139
+ if (!verify_array (array )) {
140
+ printf ("Scalar version produced incorrect results!\n" );
141
+ print_array (array , ARRAY_SIZE );
142
+ }
143
+
144
+ uint64_t scalar_total_cycles = end_cycles - start_cycles ;
145
+ time_taken_scalar = (end_time .tv_sec - start_time .tv_sec ) +
146
+ (end_time .tv_nsec - start_time .tv_nsec ) / 1e9 ;
147
+
148
+ printf ("Scalar version:\n" );
149
+ printf (" Total cycles: %lu\n" , scalar_total_cycles );
150
+ printf (" Cycles per iteration: %.2f\n" , (double )scalar_total_cycles / NUM_ITERATIONS );
151
+ printf (" Total time: %.9f seconds\n" , time_taken_scalar );
152
+ printf (" Time per iteration: %.2f nanoseconds\n" , (time_taken_scalar * 1e9 ) / NUM_ITERATIONS );
153
+
154
+ printf ("\nSpeedup:\n" );
155
+ printf (" Cycles: %.2fx\n" , (double )scalar_total_cycles / vec_total_cycles );
156
+ printf (" Time: %.2fx\n" , (double )time_taken_scalar / time_taken_vector );
157
+
158
+ free (array );
159
+ return 0 ;
160
+ }
0 commit comments