C++中监视线程卡死并自动崩溃退出 WatchDog

之前写过在Python中监视卡死崩溃退出并打印卡死处的调用堆栈

在此记录一下 C++ 的版本。不过没有在代码层面实现堆栈打印,可以通过 core dump 和 gdb 来查看崩溃时的堆栈。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
// WatchDog.h
#pragma once

#include <cstdint>
#include <mutex>
#include <thread>
#include <atomic>
#include <condition_variable>

/// Watchdog that aborts the process when it is not kicked often enough.
///
/// Constructing a WatchDog starts a background thread that periodically
/// compares the current time with the time of the last kick(). When the gap
/// exceeds the configured timeout the process is terminated via abort(), so
/// the hang can be inspected from the resulting core dump.
class WatchDog {
public:
    /// @param timeout  Maximum allowed gap between two kicks, in seconds.
    /// @param echo     When true, every check and the failure banner are
    ///                 written to standard output.
    WatchDog(int timeout=10, bool echo=false); // seconds

    /// Stops and joins the monitor thread.
    ~WatchDog() { stop(); }

    // The object owns a mutex and a running thread bound to `this`, so it
    // can be neither copied nor moved; spell that out for readers.
    WatchDog(const WatchDog&) = delete;
    WatchDog& operator=(const WatchDog&) = delete;

    /// Asks the monitor thread to exit and waits for it to finish.
    void stop();

    /// Records "still alive"; must be called more often than the timeout.
    void kick();

private:
    /// Body of the monitor thread.
    void dog();

    /// Reports the failure (when echo is enabled) and aborts the process.
    void bark();

private:
    const int _timeout;                   // seconds
    const bool _echo;                     // verbose logging flag
    std::atomic<int64_t> _last_kicked_ts; // microseconds, written by kick()

    std::mutex _mutex;
    bool _stopped; // protected by _mutex
    std::condition_variable _cond; // waited on together with _mutex

    // Keep this the last member: the thread starts running dog() as soon as
    // it is constructed, and dog() reads the members declared above.
    std::thread _dog;
};

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
// WatchDog.cpp
#include "WatchDog.h"

#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <system_error>

using namespace std;

namespace {

/// Returns the current timestamp in microseconds.
///
/// The value is taken from std::chrono::steady_clock, so it never goes
/// backwards and is unaffected by wall-clock adjustments (NTP steps, manual
/// changes). The watchdog only ever compares two of these timestamps, so a
/// monotonic source is what it needs; with the wall clock a time jump could
/// trigger a spurious abort or hide a real hang.
///
/// NOTE: the name is kept for the existing callers; the value is NOT a
/// Unix/GMT timestamp and is only meaningful as a difference.
int64_t get_gmtime_us() {
    const auto since_epoch = std::chrono::steady_clock::now().time_since_epoch();
    return std::chrono::duration_cast<std::chrono::microseconds>(since_epoch).count();
}

} // namespace


WatchDog::WatchDog(int timeout, bool echo)
: _timeout(timeout)
, _echo(echo)
, _last_kicked_ts(get_gmtime_us())
, _stopped(false)
, _dog(&WatchDog::dog, this) {
}


// Signals the monitor thread to exit and waits until it has finished.
//
// Safe to call more than once (the destructor always calls it, even after an
// explicit stop()): a thread that has already been joined is simply skipped
// instead of relying on join() throwing.
void WatchDog::stop() {
    {
        std::lock_guard<std::mutex> lock(_mutex);
        _stopped = true;
    }
    _cond.notify_one(); // wake up the dog

    if (!_dog.joinable()) {
        return; // already joined by an earlier stop()
    }

    try {
        _dog.join();
    }
    catch (const std::system_error&) {
        // Best effort: stop() runs from the destructor, so a join failure
        // must not escape.
    }
}

// Marks the watched code as alive by recording the current timestamp as the
// most recent kick; the monitor thread compares against this value.
void WatchDog::kick() {
    const int64_t now_us = get_gmtime_us();
    _last_kicked_ts.store(now_us);
}

void WatchDog::dog() {
std::unique_lock<std::mutex> lock(_mutex);
while (true) {
if (_stopped) return;

int64_t ts = get_gmtime_us();
if (ts - _last_kicked_ts > _timeout * 1000000) {
bark();
}

if (_echo) {
std::cout << "Successful dog check"
<< " [ts] " << ts
<< " [last_kicked_ts] " << _last_kicked_ts << std::endl;
}

// wake up when notified, or every N seconds
int n = std::max(_timeout / 3, 1);
_cond.wait_for(lock, std::chrono::seconds(n));
}
}

// Called when the timeout has been exceeded: optionally prints a banner to
// standard output (flushed so it is not lost), then terminates the process
// with abort().
void WatchDog::bark() {
    const bool verbose = _echo;
    if (verbose) {
        std::cout << "\n!!!!! WATCH DOG FAILURE TRIGGERED !!!!!" << '\n' << std::flush;
    }

    abort();
}