Skip to content

Commit e9e4605

Browse files
committed
2 parents 0a1a0ee + dcb065b commit e9e4605

File tree

4 files changed

+238
-101
lines changed

4 files changed

+238
-101
lines changed

.vscode/settings.json

+3
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,15 @@
1717
"colsb",
1818
"consteval",
1919
"coro",
20+
"cplusplus",
2021
"cppcoro",
2122
"CTRE",
2223
"CUDA",
2324
"denormal",
2425
"DOTPROD",
2526
"Dusíková",
2627
"Eigen",
28+
"Eron",
2729
"excerise",
2830
"fconcepts",
2931
"Fedor",
@@ -53,6 +55,7 @@
5355
"Niels",
5456
"nlohmann",
5557
"NVCC",
58+
"openblas",
5659
"openmp",
5760
"Ormrod",
5861
"Peta",

CMakeLists.txt

+74-35
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Let's use CMake 3.16+ for native sanitizer support
2-
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
2+
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
33

44
# ------------------------------------------------------------------------------
55
# Project Setup
@@ -32,17 +32,25 @@ endif()
3232
# ------------------------------------------------------------------------------
3333
find_package(Threads REQUIRED)
3434
find_package(OpenMP REQUIRED)
35-
find_package(BLAS REQUIRED)
36-
if (BLAS_FOUND)
37-
message(STATUS "BLAS found: ${BLAS_LIBRARIES}")
38-
else ()
39-
message(FATAL_ERROR "BLAS not found")
40-
endif ()
41-
4235

4336
set(FETCHCONTENT_QUIET OFF)
4437
include(FetchContent)
4538

39+
# Fetch and build OpenBLAS
40+
FetchContent_Declare(
41+
OpenBLAS
42+
GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git
43+
GIT_TAG v0.3.29
44+
)
45+
46+
# Set OpenBLAS build options
47+
set(NOFORTRAN ON CACHE BOOL "Disable Fortran" FORCE)
48+
set(BUILD_WITHOUT_LAPACK OFF CACHE BOOL "Build without LAPACK" FORCE)
49+
set(USE_THREAD ON CACHE BOOL "Use threading" FORCE)
50+
51+
# Make OpenBLAS available
52+
FetchContent_MakeAvailable(OpenBLAS)
53+
4654
# GTest (required by Google Benchmark)
4755
FetchContent_Declare(
4856
GoogleTest
@@ -105,7 +113,7 @@ endif()
105113
FetchContent_Declare(
106114
VictorZverovichFMT
107115
GIT_REPOSITORY https://github.com/fmtlib/fmt.git
108-
GIT_TAG 11.1.0
116+
GIT_TAG 11.1.2
109117
)
110118
FetchContent_MakeAvailable(VictorZverovichFMT)
111119

@@ -189,19 +197,23 @@ add_executable(less_slow less_slow.cpp)
189197
set_target_properties(less_slow PROPERTIES POSITION_INDEPENDENT_CODE ON)
190198

191199
# Conditionally add the assembly file(s)
192-
if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
193-
set_source_files_properties(less_slow_amd64.S PROPERTIES LANGUAGE ASM)
194-
target_sources(less_slow PRIVATE less_slow_amd64.S)
195-
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
196-
set_source_files_properties(less_slow_aarch64.S PROPERTIES LANGUAGE ASM)
197-
target_sources(less_slow PRIVATE less_slow_aarch64.S)
200+
if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|AMD64|x64")
201+
set_source_files_properties(less_slow_amd64.S PROPERTIES LANGUAGE ASM)
202+
target_sources(less_slow PRIVATE less_slow_amd64.S)
203+
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|ARM64")
204+
set_source_files_properties(less_slow_aarch64.S PROPERTIES LANGUAGE ASM)
205+
target_sources(less_slow PRIVATE less_slow_aarch64.S)
198206
endif()
199207

200208
# ------------------------------------------------------------------------------
201209
# Compiler Flags / Options
202210
# ------------------------------------------------------------------------------
203-
if(NOT CMAKE_SYSTEM_NAME STREQUAL "Darwin")
204-
# Apple Clang doesn't support -march=native
211+
# Check for compiler support of `-march=native`
212+
if(CMAKE_CXX_COMPILER_ID MATCHES "Intel")
213+
target_compile_options(less_slow PRIVATE -xHost)
214+
elseif(CMAKE_CXX_COMPILER_ID MATCHES "AppleClang" OR CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
215+
# Apple's Clang and MSVC can't auto-detect the highest CPU features
216+
else()
205217
target_compile_options(less_slow PRIVATE -march=native)
206218
endif()
207219

@@ -213,8 +225,15 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
213225
-fconcepts-diagnostics-depth=10 # Needed to debug concepts
214226
-fopenmp # OpenMP support, also requires linking
215227
)
228+
elseif(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
229+
target_compile_options(less_slow PRIVATE
230+
/MP # Build with multiple processes; equivalent to `make -j` except it spans across all cores by default
231+
/wd4068 # Disable the "unknown pragma" warning, as StringZilla uses many GCC and Clang pragmas
232+
/Zc:__cplusplus # Make the `__cplusplus` macro actually match the standard in use
233+
/Zc:preprocessor # Use conformant preprocessor
234+
)
235+
216236
else()
217-
# For other compilers (Clang, MSVC, Intel, etc.)
218237
target_compile_options(less_slow PRIVATE
219238
-Wno-deprecated-pragma
220239
)
@@ -230,28 +249,48 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang"
230249
)
231250
set_property(TARGET less_slow PROPERTY SANITIZE_ADDRESS TRUE)
232251
set_property(TARGET less_slow PROPERTY SANITIZE_UNDEFINED TRUE)
252+
elseif(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
253+
target_compile_options(less_slow PRIVATE
254+
$<$<CONFIG:Release>:/O2>
255+
$<$<CONFIG:Release>:/Ob2>
256+
$<$<CONFIG:Release>:/Oi>
257+
$<$<CONFIG:Release>:/Ot>
258+
$<$<CONFIG:Release>:/GL>
259+
)
260+
target_link_options(less_slow PRIVATE
261+
$<$<CONFIG:Release>:/LTCG:incremental>
262+
)
233263
endif()
234264

235265
# ------------------------------------------------------------------------------
236266
# Link Libraries
237267
# ------------------------------------------------------------------------------
268+
# Add OpenBLAS include directory manually
269+
if(openblas_POPULATED)
270+
target_include_directories(less_slow PRIVATE ${openblas_SOURCE_DIR})
271+
272+
# For config.h
273+
target_include_directories(less_slow PRIVATE ${openblas_BINARY_DIR})
274+
endif()
275+
238276
target_link_libraries(less_slow
239277
PRIVATE
240-
Threads::Threads
241-
benchmark
242-
fmt::fmt
243-
range-v3
244-
cppcoro
245-
unifex
246-
stringzilla
247-
yyjson
248-
ctre
249-
# There is no `absl` shortcut:
250-
# https://github.com/abseil/abseil-cpp/blob/master/CMake/README.md#available-abseil-cmake-public-targets
251-
absl::flat_hash_map
252-
nlohmann_json::nlohmann_json
253-
Eigen3::Eigen
254-
${BLAS_LIBRARIES}
255-
$<$<STREQUAL:${CMAKE_SYSTEM_NAME},Linux>:TBB::tbb>
256-
$<$<STREQUAL:${CMAKE_SYSTEM_NAME},Linux>:OpenMP::OpenMP_CXX>
278+
Threads::Threads
279+
benchmark
280+
fmt::fmt
281+
range-v3
282+
cppcoro
283+
unifex
284+
stringzilla
285+
yyjson
286+
ctre
287+
openblas
288+
289+
# There is no `absl` shortcut:
290+
# https://github.com/abseil/abseil-cpp/blob/master/CMake/README.md#available-abseil-cmake-public-targets
291+
absl::flat_hash_map
292+
nlohmann_json::nlohmann_json
293+
Eigen3::Eigen
294+
$<$<STREQUAL:${CMAKE_SYSTEM_NAME},Linux>:TBB::tbb>
295+
OpenMP::OpenMP_CXX
257296
)

README.md

+4-1
Original file line numberDiff line numberDiff line change
@@ -30,14 +30,17 @@ Some of the highlights include:
3030
- __Is the pointer size really 64 bits__ and how to exploit [pointer-tagging](https://en.wikipedia.org/wiki/Tagged_pointer)?
3131
- __How many packets is [UDP](https://www.cloudflare.com/learning/ddos/glossary/user-datagram-protocol-udp/) dropping__ and how to serve web requests in [`io_uring`](https://en.wikipedia.org/wiki/Io_uring) from user-space?
3232
- __Scatter and Gather__ for 50% faster vectorized disjoint memory operations.
33-
- __How to choose between intrinsics, inline Assembly, and separate Assembly files__ for your performance-critical code?
33+
- __How to choose between intrinsics, inline Assembly, and separate `.S` files__ for your performance-critical code?
3434
- __What are Encrypted Enclaves__ and what's the latency of Intel SGX, AMD SEV, and ARM Realm? 🔜
3535

3636
To read, jump to the [`less_slow.cpp` source file](https://github.com/ashvardanian/less_slow.cpp/blob/main/less_slow.cpp) and read the code snippets and comments.
3737
Follow the instructions below to run the code in your environment and compare it to the comments as you read through the source.
3838

3939
## Running the Benchmarks
4040

41+
The project aims to be compatible with GCC, Clang, and MSVC compilers on Linux, macOS, and Windows.
42+
That said, to cover the broadest functionality, using GCC on Linux is recommended:
43+
4144
- If you are on Windows, it's recommended that you set up a Linux environment using [WSL](https://docs.microsoft.com/en-us/windows/wsl/install).
4245
- If you are on macOS, consider using the non-native distribution of Clang from [Homebrew](https://brew.sh) or [MacPorts](https://www.macports.org).
4346
- If you are on Linux, make sure to install CMake and a recent version of GCC or Clang compilers to support C++20 features.

0 commit comments

Comments
 (0)