-
Notifications
You must be signed in to change notification settings - Fork 1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
refactor: simplify journal and restore streamer cancellation #4549
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -84,6 +84,8 @@ class OutgoingMigration::SliceSlotMigration : private ProtocolClient { | |
} | ||
|
||
void Cancel() { | ||
// We don't care about errors during cancel | ||
cntx_.SwitchErrorHandler([](auto ge) {}); | ||
// Close socket for clean disconnect. | ||
CloseSocket(); | ||
streamer_.Cancel(); | ||
|
@@ -194,13 +196,12 @@ void OutgoingMigration::SyncFb() { | |
break; | ||
} | ||
|
||
last_error_ = cntx_.GetError(); | ||
cntx_.Reset(nullptr); | ||
|
||
if (last_error_) { | ||
LOG(ERROR) << last_error_.Format(); | ||
if (cntx_.IsError()) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Change of the semantics here. Before: we sleep either if Now: we only sleep if there was an error. In other words, we no longer treat cancellation as error which is a mistake. Cancellation is error and that's why errno exists with: I would like us to keep those semantics for clarity There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we have cancel_error as well. I don't see any problems if you want create cancel error you call reportCancelError() if you want just cancel the process you cancel the process |
||
last_error_ = cntx_.GetError(); | ||
LOG(ERROR) << last_error_; | ||
ThisFiber::SleepFor(1000ms); // wait some time before next retry | ||
} | ||
cntx_.Reset(nullptr); | ||
|
||
VLOG(1) << "Connecting to target node"; | ||
auto timeout = absl::GetFlag(FLAGS_slot_migration_connection_timeout_ms) * 1ms; | ||
|
@@ -246,7 +247,7 @@ void OutgoingMigration::SyncFb() { | |
} | ||
|
||
OnAllShards([this](auto& migration) { migration->PrepareFlow(cf_->MyID()); }); | ||
if (cntx_.GetError()) { | ||
if (cntx_.IsError()) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what about cancellation ? |
||
continue; | ||
} | ||
|
||
|
@@ -257,13 +258,13 @@ void OutgoingMigration::SyncFb() { | |
OnAllShards([](auto& migration) { migration->PrepareSync(); }); | ||
} | ||
|
||
if (cntx_.GetError()) { | ||
if (cntx_.IsError()) { | ||
continue; | ||
} | ||
|
||
OnAllShards([](auto& migration) { migration->RunSync(); }); | ||
|
||
if (cntx_.GetError()) { | ||
if (cntx_.IsError()) { | ||
continue; | ||
} | ||
|
||
|
@@ -273,7 +274,7 @@ void OutgoingMigration::SyncFb() { | |
VLOG(1) << "Waiting for migration to finalize..."; | ||
ThisFiber::SleepFor(500ms); | ||
} | ||
if (cntx_.GetError()) { | ||
if (cntx_.IsError()) { | ||
continue; | ||
} | ||
break; | ||
|
@@ -288,7 +289,7 @@ bool OutgoingMigration::FinalizeMigration(long attempt) { | |
LOG(INFO) << "Finalize migration for " << cf_->MyID() << " : " << migration_info_.node_info.id | ||
<< " attempt " << attempt; | ||
if (attempt > 1) { | ||
if (cntx_.GetError()) { | ||
if (cntx_.IsError()) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. also here what about cancle? how do we exit? |
||
return true; | ||
} | ||
auto timeout = absl::GetFlag(FLAGS_slot_migration_connection_timeout_ms) * 1ms; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -349,10 +349,6 @@ GenericError ExecutionState::GetError() const { | |
return err_; | ||
} | ||
|
||
const Cancellation* ExecutionState::GetCancellation() const { | ||
return this; | ||
} | ||
|
||
void ExecutionState::ReportCancelError() { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This function is still being used (in replication and in other places). The issue here is that after this function is called You do not update the state == There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if you report an error even if it CancelError the state will be error |
||
ReportError(std::make_error_code(errc::operation_canceled), "Context cancelled"); | ||
} | ||
|
@@ -363,7 +359,7 @@ void ExecutionState::Reset(ErrHandler handler) { | |
unique_lock lk{err_mu_}; | ||
err_ = {}; | ||
err_handler_ = std::move(handler); | ||
Cancellation::flag_.store(false, std::memory_order_relaxed); | ||
state_.store(State::RUN, std::memory_order_relaxed); | ||
fb.swap(err_handler_fb_); | ||
lk.unlock(); | ||
fb.JoinIfNeeded(); | ||
|
@@ -402,7 +398,7 @@ GenericError ExecutionState::ReportErrorInternal(GenericError&& err) { | |
// We can move err_handler_ because it should run at most once. | ||
if (err_handler_) | ||
err_handler_fb_ = fb2::Fiber("report_internal_error", std::move(err_handler_), err_); | ||
Cancellation::Cancel(); | ||
state_.store(State::ERROR, std::memory_order_relaxed); | ||
return err_; | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -205,23 +205,6 @@ using AggregateStatus = AggregateValue<facade::OpStatus>; | |
static_assert(bool(facade::OpStatus::OK) == false, | ||
"Default intitialization should be a falsy OK value"); | ||
|
||
// Simple wrapper interface around atomic cancellation flag. | ||
struct Cancellation { | ||
Cancellation() : flag_{false} { | ||
} | ||
|
||
void Cancel() { | ||
flag_.store(true, std::memory_order_relaxed); | ||
} | ||
|
||
bool IsCancelled() const { | ||
return flag_.load(std::memory_order_relaxed); | ||
} | ||
|
||
protected: | ||
std::atomic_bool flag_; | ||
}; | ||
|
||
// Error wrapper, that stores error_code and optional string message. | ||
class GenericError { | ||
public: | ||
|
@@ -246,45 +229,58 @@ class GenericError { | |
// Thread safe utility to store the first non null generic error. | ||
using AggregateGenericError = AggregateValue<GenericError>; | ||
|
||
// ExecutionState is a utility for managing error reporting and cancellation for complex tasks. | ||
// | ||
// When submitting an error with `Error`, only the first is stored (as in aggregate values). | ||
// Then a special error handler is run, if present, and the ExecutionState is cancelled. The error | ||
// handler is run in a separate handler to free up the caller. | ||
// ExecutionState is a thread-safe utility for managing error reporting and cancellation for complex | ||
// tasks. There are 3 states: RUN, CANCELLED, ERROR RUN and CANCELLED are just a state without any | ||
// actions When report an error, only the first is stored, the next ones will be ignored. Then a | ||
// special error handler is run, if present, and the ExecutionState is ERROR. The error handler is | ||
// run in a separate handler to free up the caller. | ||
// | ||
// ReportCancelError() reporting an `errc::operation_canceled` error. | ||
class ExecutionState : protected Cancellation { | ||
class ExecutionState { | ||
public: | ||
using ErrHandler = std::function<void(const GenericError&)>; | ||
|
||
ExecutionState() = default; | ||
ExecutionState(ErrHandler err_handler) | ||
: Cancellation{}, err_{}, err_handler_{std::move(err_handler)} { | ||
ExecutionState(ErrHandler err_handler) : err_handler_{std::move(err_handler)} { | ||
} | ||
|
||
~ExecutionState(); | ||
|
||
// Cancels the context by submitting an `errc::operation_canceled` error. | ||
void ReportCancelError(); | ||
using Cancellation::IsCancelled; | ||
const Cancellation* GetCancellation() const; | ||
|
||
bool IsRunning() const { | ||
return state_.load(std::memory_order_relaxed) == State::RUN; | ||
} | ||
|
||
bool IsError() const { | ||
return state_.load(std::memory_order_relaxed) == State::ERROR; | ||
} | ||
|
||
bool IsCancelled() const { | ||
return state_.load(std::memory_order_relaxed) == State::CANCELLED; | ||
} | ||
|
||
void Cancel() { | ||
state_.store(State::CANCELLED, std::memory_order_relaxed); | ||
} | ||
|
||
GenericError GetError() const; | ||
|
||
// Report an error by submitting arguments for GenericError. | ||
// If this is the first error that occured, then the error handler is run | ||
// and the context is cancelled. | ||
// and the context state set to ERROR. | ||
template <typename... T> GenericError ReportError(T... ts) { | ||
return ReportErrorInternal(GenericError{std::forward<T>(ts)...}); | ||
} | ||
|
||
// Wait for error handler to stop, reset error and cancellation flag, assign new error handler. | ||
// Wait for error handler to stop, reset error and state, assign new error handler. | ||
void Reset(ErrHandler handler); | ||
|
||
// Atomically replace the error handler if no error is present, and return the | ||
// current stored error. This function can be used to transfer cleanup responsibility safely | ||
// | ||
// Beware, never do this manually in two steps. If you check for cancellation, | ||
// Beware, never do this manually in two steps. If you check the state, | ||
// set the error handler and initialize resources, then the new error handler | ||
// will never run if the context was cancelled between the first two steps. | ||
GenericError SwitchErrorHandler(ErrHandler handler); | ||
|
@@ -293,9 +289,10 @@ class ExecutionState : protected Cancellation { | |
void JoinErrorHandler(); | ||
|
||
private: | ||
// Report error. | ||
GenericError ReportErrorInternal(GenericError&& err); | ||
|
||
enum class State { RUN, CANCELLED, ERROR }; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We
Cancelled operation is an error. That's why we have
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In your state machine:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if you use ec_ you need mutex and it's slow. Let's have a call. |
||
std::atomic<State> state_{State::RUN}; | ||
GenericError err_; | ||
ErrHandler err_handler_; | ||
util::fb2::Fiber err_handler_fb_; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are there any other places in code that we do care about error during cancel?
Should this be the caller to decide if we need error handling after cancel was sent or doesnt it makes more sense that ExecutionState will handle this i.e after cancel there is not switching to error.