Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add some speed-up tricks #27

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@ cmake_minimum_required(VERSION 3.12)
project(hellocmake LANGUAGES CXX)

# Build as C++17 and fail at configure time if the compiler cannot provide it,
# instead of silently falling back to an older standard.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Flags used only when CMAKE_BUILD_TYPE is Debug; the contents of the CXXFLAGS
# environment variable are prepended.
set(CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} -O3 -Wall -g -ggdb")

# Default to an optimized build when no build type was chosen.
if (NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE Release)
endif()

add_executable(main main.cpp)
# PRIVATE: the options apply to compiling this executable only; nothing links
# against it, so there are no consumers to propagate them to.
# -march=native ties the binary to the instruction set of the build machine.
target_compile_options(main PRIVATE -ffast-math -march=native)
244 changes: 156 additions & 88 deletions main.cpp
Original file line number Diff line number Diff line change
@@ -1,88 +1,156 @@
#include <cstdio>
#include <cstdlib>
#include <vector>
#include <chrono>
#include <cmath>

/// Draws one value from rand() and maps it linearly onto the range [-1, 1].
float frand() {
    const float unit = static_cast<float>(rand()) / RAND_MAX;  // rand() scaled to [0, 1]
    return unit * 2 - 1;
}

/// One simulated body, stored as a plain aggregate (array-of-structs layout).
/// Field order is significant: instances are brace-initialized positionally
/// as {px, py, pz, vx, vy, vz, mass}.
struct Star {
    // Position components.
    float px;
    float py;
    float pz;
    // Velocity components.
    float vx;
    float vy;
    float vz;
    // Mass of the body.
    float mass;
};

// Global simulation state: every star in the system, one Star record per body.
// Filled by init(), updated in place by step(), read by calc().
std::vector<Star> stars;

/// Appends 48 randomly generated stars to the global `stars` vector.
///
/// Every position and velocity component comes from frand(); the mass is
/// frand() + 1. The frand() calls happen in field order (px, py, pz, vx, vy,
/// vz, mass) because a braced initializer list is evaluated left to right.
void init() {
    constexpr int star_count = 48;
    // Allocate once up front instead of regrowing the vector during push_back.
    stars.reserve(stars.size() + star_count);
    for (int i = 0; i < star_count; i++) {
        stars.push_back({
            frand(), frand(), frand(),  // px, py, pz
            frand(), frand(), frand(),  // vx, vy, vz
            frand() + 1,                // mass
        });
    }
}

// Gravitational constant; scales every pairwise term in step() and calc().
float G = 0.001;
// Softening length; eps * eps is added to each squared distance so the
// denominator never becomes zero.
float eps = 0.001;
// Time step used by step() for both the velocity and the position update.
float dt = 0.01;

void step() {
for (auto &star: stars) {
for (auto &other: stars) {
float dx = other.px - star.px;
float dy = other.py - star.py;
float dz = other.pz - star.pz;
float d2 = dx * dx + dy * dy + dz * dz + eps * eps;
d2 *= sqrt(d2);
star.vx += dx * other.mass * G * dt / d2;
star.vy += dy * other.mass * G * dt / d2;
star.vz += dz * other.mass * G * dt / d2;
}
}
for (auto &star: stars) {
star.px += star.vx * dt;
star.py += star.vy * dt;
star.pz += star.vz * dt;
}
}

float calc() {
float energy = 0;
for (auto &star: stars) {
float v2 = star.vx * star.vx + star.vy * star.vy + star.vz * star.vz;
energy += star.mass * v2 / 2;
for (auto &other: stars) {
float dx = other.px - star.px;
float dy = other.py - star.py;
float dz = other.pz - star.pz;
float d2 = dx * dx + dy * dy + dz * dz + eps * eps;
energy -= other.mass * star.mass * G / sqrt(d2) / 2;
}
}
return energy;
}

/// Calls `func` exactly once and returns how long the call took, in whole
/// milliseconds, measured on std::chrono::steady_clock.
template <class Func>
long benchmark(Func const &func) {
    using clock = std::chrono::steady_clock;
    const auto started = clock::now();
    func();
    const auto elapsed = clock::now() - started;
    return std::chrono::duration_cast<std::chrono::milliseconds>(elapsed).count();
}

/// Entry point: initializes the stars, prints the initial energy, times
/// 100000 calls to step(), then prints the final energy and the elapsed time.
int main() {
    init();
    printf("Initial energy: %f\n", calc());

    // Only the stepping loop is timed; init() and calc() are outside it.
    const auto run_all_steps = [] {
        for (int iter = 0; iter < 100000; iter++)
            step();
    };
    const long elapsed_ms = benchmark(run_all_steps);

    printf("Final energy: %f\n", calc());
    printf("Time elapsed: %ld ms\n", elapsed_ms);
    return 0;
}
#include <cstdio>
#include <cstdlib>
#include <vector>
#include <chrono>
#include <cmath>
#include <array>

/// Returns a pseudo-random float in [-1, 1], derived from a single rand() call.
inline float frand() {
    const float doubled = static_cast<float>(rand()) / RAND_MAX * 2;  // in [0, 2]
    return doubled - 1;
}
// AoS:Array of struct
//struct Star {
// float px, py, pz;
// float vx, vy, vz;
// float mass;
//};
// SoA (struct of arrays): each field lives in its own contiguous array, so a
// loop that reads one field of many stars walks contiguous memory.

// Number of simulated stars.
constexpr int N = 48;

/// Storage for `Count` stars with one array per field; index i across all
/// arrays describes star i.
///
/// The template parameter is named `Count` so it no longer shadows the global
/// N. Every array is value-initialized, so instances start zero-filled even
/// when they do not have static storage duration.
template<int Count = N>
struct Star {
    std::array<float, Count> px{}, py{}, pz{};          // position components
    std::array<float, Count> vx{}, vy{}, vz{}, mass{};  // velocity components and mass
};

// Previous array-of-structs storage, kept for reference: std::vector<Star> stars;
// Global simulation state in struct-of-arrays form: N entries per field.
Star<N> stars;

void init() {
// for (size_t i = 0; i < 48; i++) {
// stars.push_back({
// frand(), frand(), frand(),
// frand(), frand(), frand(),
// frand() + 1,
// });
// }
#pragma GCC unroll 4
for(size_t i = 0;i < 48;i++){
stars.px[i] = frand();
stars.py[i] = frand();
stars.pz[i] = frand();
stars.vx[i] = frand();
stars.vy[i] = frand();
stars.vz[i] = frand();
stars.mass[i] = frand() + 1;
}

}

// Gravitational constant; scales every pairwise term in step() and calc().
const float G = 0.001;
// Softening length; its square is added to each squared distance so the
// denominator never becomes zero.
const float eps = 0.001;
// Time step used by step() for both the velocity and the position update.
const float dt = 0.01;

void step() {
// for (auto &star: stars) {
// for (auto &other: stars) {
// float dx = other.px - star.px;
// float dy = other.py - star.py;
// float dz = other.pz - star.pz;
// float d2 = dx * dx + dy * dy + dz * dz + eps * eps;
// d2 *= sqrt(d2);
// float inve_d2 = 1.0 / d2;
//// star.vx += dx * other.mass * G * dt / d2;
//// star.vy += dy * other.mass * G * dt / d2;
//// star.vz += dz * other.mass * G * dt / d2;
// star.vx += dx * other.mass * G * dt * inve_d2;
// star.vy += dy * other.mass * G * dt * inve_d2;
// star.vz += dz * other.mass * G * dt * inve_d2;
// }
// }
float gt = G * dt;
#pragma GCC unroll 4
for(size_t i = 0;i < N;i++){
for(size_t j = 0;j < N;j++){
float dx = stars.px[j] - stars.px[i];
float dy = stars.py[j] - stars.py[i];
float dz = stars.pz[j] - stars.pz[i];
float d2 = dx * dx + dy * dy + dz * dz + eps * eps;
d2 *= std::sqrt(d2);
float inve_d2 = gt / d2;
// star.vx += dx * other.mass * G * dt / d2;
// star.vy += dy * other.mass * G * dt / d2;
// star.vz += dz * other.mass * G * dt / d2;
stars.vx[i] += dx * stars.mass[j] * inve_d2;
stars.vy[i] += dy * stars.mass[j] * inve_d2;
stars.vz[i] += dz * stars.mass[j] * inve_d2;
}
}


// for (auto &star: stars) {
// star.px += star.vx * dt;
// star.py += star.vy * dt;
// star.pz += star.vz * dt;
// }
#pragma GCC unroll 4
for(size_t k = 0;k < N;k++){
stars.px[k] += stars.vx[k] * dt;
stars.py[k] += stars.vy[k] * dt;
stars.pz[k] += stars.vz[k] * dt;
}


}

/// Returns the total energy of the system held in `stars`.
///
/// For each star i this adds mass * |v|^2 * 0.5 and subtracts, for every j
/// (j == i included), G * m_j * m_i / sqrt(d2) * 0.5, where d2 is the squared
/// distance plus eps * eps.
float calc() {
    const float eps2 = eps * eps;
    float energy = 0;
    for (size_t i = 0; i < N; i++) {
        const float v2 = stars.vx[i] * stars.vx[i]
                       + stars.vy[i] * stars.vy[i]
                       + stars.vz[i] * stars.vz[i];
        // 0.5f keeps the arithmetic in float; a plain 0.5 is a double literal
        // and would promote the whole expression to double.
        energy += stars.mass[i] * v2 * 0.5f;
        for (size_t j = 0; j < N; j++) {
            const float dx = stars.px[j] - stars.px[i];
            const float dy = stars.py[j] - stars.py[i];
            const float dz = stars.pz[j] - stars.pz[i];
            const float d2 = dx * dx + dy * dy + dz * dz + eps2;
            energy -= stars.mass[j] * stars.mass[i] * G / std::sqrt(d2) * 0.5f;
        }
    }
    return energy;
}

/// Runs `func` once and reports the duration of that call in whole
/// milliseconds, using std::chrono::steady_clock for both timestamps.
template <class Func>
long benchmark(Func const &func) {
    namespace chr = std::chrono;
    const chr::steady_clock::time_point begin = chr::steady_clock::now();
    func();
    const chr::steady_clock::time_point end = chr::steady_clock::now();
    const chr::milliseconds elapsed = chr::duration_cast<chr::milliseconds>(end - begin);
    return elapsed.count();
}

/// Entry point: initializes the stars, prints the initial energy, times
/// 100000 calls to step(), then prints the final energy and the elapsed time.
int main() {
    init();
    printf("Initial energy: %f\n", calc());

    // Only the stepping loop sits inside the timed region.
    const auto simulate = [] {
        for (size_t iter = 0; iter < 100000; iter++)
            step();
    };
    const long elapsed_ms = benchmark(simulate);

    printf("Final energy: %f\n", calc());
    printf("Time elapsed: %ld ms\n", elapsed_ms);
    return 0;
}
88 changes: 75 additions & 13 deletions score.txt
Original file line number Diff line number Diff line change
@@ -1,13 +1,75 @@
目前 (01/12) 同学们作业在老师电脑上的跑分 (毫秒):

原版: 1107
#1: 113
#2: 116
#3: 65
#4: 125
#5: 243
#6: 114
#11: 114
#12: 161
#13: 83
#14: 121
原始版本
Initial energy: -13.414000
Final energy: -13.356842
Time elapsed: 6285 ms

-O1 优化
Initial energy: -13.414000
Final energy: -13.356842
Time elapsed: 2659 ms

-O2 优化
Initial energy: -13.414000
Final energy: -13.356842
Time elapsed: 2558 ms

-O3 优化
Initial energy: -13.414000
Final energy: -13.356842
Time elapsed: 2560 ms

索引int变成size_t
Initial energy: -13.414000
Final energy: -13.356842
Time elapsed: 2560 ms

inline frand函数
Initial energy: -13.414000
Final energy: -13.356842
Time elapsed: 2593 ms

将/2改成*0.5
Initial energy: -13.414000
Final energy: -13.356842
Time elapsed: 2585 ms


将公共除法变成乘法
Initial energy: -13.414000
Final energy: -13.356842
Time elapsed: 2030 ms


-ffast-math 选项会定义预处理器宏 __FAST_MATH__，并允许编译器不严格遵循 IEEE 和 ISO 的浮点运算标准（例如可以重排浮点运算的顺序）。
-march=native,GCC会自动检测你的CPU支持的指令集。
Initial energy: -13.414000
Final energy: -13.356841
Time elapsed: 1606 ms


SoA（struct of arrays，数组结构体）布局
Initial energy: -13.414000
Final energy: -13.356841
Time elapsed: 1630 ms

sqrt 前加 std::，通过函数重载匹配传入的 float 参数（避免隐式提升为 double）
Initial energy: -13.414000
Final energy: -13.356841
Time elapsed: 1220 ms

#pragma GCC unroll 4
Initial energy: -13.414000
Final energy: -13.356841
Time elapsed: 1212 ms

将 G * dt 提取到循环外面计算，G * dt / d2 在内层循环中每对星体只计算一次
Initial energy: -13.414000
Final energy: -13.356841
Time elapsed: 1096 ms


final result:
6285 ms / 1096 ms = 5.734