gdb c++ coredump 调试:查看指针和动态类型的值

一般情况下,在调试 coredump 文件时,对于基础类型或者 struct,可以直接用 print 变量名 来查看对应的值。但如果变量是指针,或者更复杂一些,指针指向的是一个动态类型(多态)对象,该如何查看呢?

背景

有一个消费 Kafka 数据的 Consumer 程序因为异常数据挂掉了,对应的 core 文件幸运地被 dump 了出来,关键栈信息如下:

1
2
3
4
5
6
7
8
9
10
// ....
#10 pybind11::detail::accessor<pybind11::detail::accessor_policies::generic_item>::cast<std::string> (this=0x7ff9ee0a7790)
at bc_out/baidu/third-party/pybind11/output/include/pybind11/pytypes.h:463
#11 lbd::PythonOperator::on_tuple (this=0x5c80d90, input=..., output=0x7ff9ee0a7ac0) at baidu/ps-aladdin/streaming-topology/src/python_operator.cc:91
#12 0x00000000005dae2f in vs::common::Operator::check_data_process_on_tuple (this=0x5c80d90, msg=..., output_vector=0x7ff9ee0a7ac0)
at baidu/ps-aladdin/vs-common/core/topology/operator.h:172
#13 0x00000000005d7984 in vs::common::Operator::process (this=0x5c80d90, tid=4) at baidu/ps-aladdin/vs-common/core/topology/operator.cc:210
#14 0x00000000005d815b in operator() (__closure=0x60ed408) at baidu/ps-aladdin/vs-common/core/topology/operator.cc:254
#15 0x00000000005d8bdc in std::__invoke_impl<void, vs::common::Operator::start_process()::<lambda()> >(std::__invoke_other, struct {...} &&) (__f=...)
// ....

我们的目标是从中获取 Kafka 数据的 partition 和 offset 信息,以便从 Kafka 中取出异常数据的内容,并用于线下复现问题。(原始数据为二进制压缩类型,需要 dump 到本地进行解码)


包含程序业务代码的栈为 11、12、13、14、15,其中并没有直接存储 partition 和 offset 信息的变量,找变量的过程这里就省略了。从 11 栈开始排查,看一下 partition 和 offset 信息是否存储在某些变量中。

函数 on_tuple 的入参 input 的类型为 std::shared_ptr<vs::common::OperatorMessage>,其中类型为 std::shared_ptr<RdKafka::Message> 的成员变量 _kafka_message 存储了 Kafka 消息的原始信息。这两个变量实际均为 std::shared_ptr 类型,可以通过打印它的 _M_ptr 来查看内层指针的值,在表达式前面加上 * 来查看指针指向的对象的值:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
(gdb) set print pretty on

(gdb) print input
$1 = {
<std::__shared_ptr<vs::common::OperatorMessage, (__gnu_cxx::_Lock_policy)2>> = {
<std::__shared_ptr_access<vs::common::OperatorMessage, (__gnu_cxx::_Lock_policy)2, false, false>> = {<No data fields>},
members of std::__shared_ptr<vs::common::OperatorMessage, (__gnu_cxx::_Lock_policy)2>:
_M_ptr = 0xe3a6490,
_M_refcount = {
_M_pi = 0xe3a6480
}
}, <No data fields>}

(gdb) print input._M_ptr
$2 = (std::__shared_ptr<vs::common::OperatorMessage, (__gnu_cxx::_Lock_policy)2>::element_type *) 0xe3a6490
(gdb) print *input._M_ptr
$3 = {
_vptr.OperatorMessage = 0x1dd08e8 <vtable for vs::common::OperatorMessage+16>,
_kafka_message = {
<std::__shared_ptr<RdKafka::Message, (__gnu_cxx::_Lock_policy)2>> = {
<std::__shared_ptr_access<RdKafka::Message, (__gnu_cxx::_Lock_policy)2, false, false>> = {<No data fields>},
members of std::__shared_ptr<RdKafka::Message, (__gnu_cxx::_Lock_policy)2>:
_M_ptr = 0x8af8c80,
_M_refcount = {
_M_pi = 0x1ae1e400
}
}, <No data fields>},
// .....
}


(gdb) print input._M_ptr->_kafka_message._M_ptr
$5 = (std::__shared_ptr<RdKafka::Message, (__gnu_cxx::_Lock_policy)2>::element_type *) 0x8af8c80
(gdb) print *input._M_ptr->_kafka_message._M_ptr
$6 = {
_vptr.Message = 0x1df0258 <vtable for RdKafka::MessageImpl+16>
}

但不幸的是,由于 RdKafka::Message 是个动态类型,实际的实例是 RdKafka::MessageImpl,所以直接打印看不到其中的成员。此时可以设置 set print object on,让 gdb 按变量的真实类型(而不是声明的类型)来打印(用 dynamic_cast 做类型强转后,最终打印出的值不太对,没细究是啥原因):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
(gdb) set print object on
(gdb) print *input._M_ptr->_kafka_message._M_ptr
$12 = (RdKafka::MessageImpl) {
<RdKafka::Message> = {
_vptr.Message = 0x1df0258 <vtable for RdKafka::MessageImpl+16>
},
members of RdKafka::MessageImpl:
topic_ = 0x58b8a60,
rkmessage_ = 0xc774e38,
free_rkmessage_ = true,
rkmessage_err_ = {
err = RD_KAFKA_RESP_ERR_OFFSET_OUT_OF_RANGE,
rkt = 0x5f6164626d616c5f,
partition = 1986290284,
payload = 0x6863735f7374726f,
len = 3472943151872042085,
key = 0x3131393238303532,
key_len = 7234014019578574644,
offset = 8390891584507309673,
_private = 0x7564656863735f73
},
key_ = 0x15240dd8,
headers_ = 0x0,
rk_type_ = RD_KAFKA_CONSUMER
}

翻了一下 librdkafka 的源码,partition 和 offset 的信息是存储在变量 rkmessage_ 指向的结构体中的,需要获取到它的值。但是由于 _kafka_message 声明的(静态)类型为 RdKafka::Message,其中不包含变量 rkmessage_,所以没法直接通过成员访问的方式用 print 来打印:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
(gdb) print *(input._M_ptr->_kafka_message._M_ptr).rkmessage_
There is no member or method named rkmessage_.

(gdb) ptype *(input._M_ptr._kafka_message._M_ptr)
type = class RdKafka::Message {
public:
virtual std::string errstr(void) const;
virtual RdKafka::ErrorCode err(void) const;
virtual RdKafka::Topic * topic(void) const;
virtual std::string topic_name(void) const;
virtual int32_t partition(void) const;
virtual void * payload(void) const;
virtual size_t len(void) const;
virtual const std::string * key(void) const;
virtual const void * key_pointer(void) const;
virtual size_t key_len(void) const;
virtual int64_t offset(void) const;
virtual RdKafka::MessageTimestamp timestamp(void) const;
virtual void * msg_opaque(void) const;
~Message();
virtual int64_t latency(void) const;
virtual rd_kafka_message_s * c_ptr(void);
virtual RdKafka::Message::Status status(void) const;
virtual RdKafka::Headers * headers(void);
virtual RdKafka::Headers * headers(RdKafka::ErrorCode *);
virtual int32_t broker_id(void) const;
}

可以基于指针地址进行打印。根据源码,变量 rkmessage_ 的类型为 rd_kafka_message_t *,它的值(即所指向的地址)为 0xc774e38(见上面 set print object on 之后 print *input._M_ptr->_kafka_message._M_ptr 的打印结果),那么可以把这个地址强转成对应的指针类型再解引用来 print。可以看到其中的 partition 和 offset 值是正常的,最后也顺利根据这个信息拿到了问题数据,并在本地复现了问题:

1
2
3
4
5
6
7
8
9
10
11
12
(gdb) print *(rd_kafka_message_t *)0xc774e38
$16 = {
err = RD_KAFKA_RESP_ERR_NO_ERROR,
rkt = 0x55821c0,
partition = 1,
payload = 0x77c5044,
len = 1851,
key = 0x77c5006,
key_len = 60,
offset = 76573191,
_private = 0xc774dc0
}

进一步探索

对于其他变量也是同理,例如根据源码, key_ 的类型为 std::string *,那么也可以通过类似的方式查看对应的值:

1
2
3
4
5
6
7
8
9
10
(gdb) print *(std::string *)0x15240dd8
$17 = {
static npos = 18446744073709551615,
_M_dataplus = {
<std::allocator<char>> = {
<std::__new_allocator<char>> = {<No data fields>}, <No data fields>},
members of std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider:
_M_p = 0x1a667f18 "http://kg.baidu.com/rawbase/012226d6ed5807455d7e402b801a31cd"
}
}

甚至可以把 _kafka_message 内层指针的地址 0x8af8c80 强转为 RdKafka::MessageImpl *,直接用 ptype 打印该类型包含的成员:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
(gdb) ptype *(RdKafka::MessageImpl *)0x8af8c80
type = class RdKafka::MessageImpl : public RdKafka::Message {
public:
RdKafka::Topic *topic_;
rd_kafka_message_t *rkmessage_;
bool free_rkmessage_;
rd_kafka_message_t rkmessage_err_;
std::string *key_;
private:
RdKafka::Headers *headers_;
const rd_kafka_type_t rk_type_;

public:
~MessageImpl();
MessageImpl(rd_kafka_type_t, RdKafka::Topic *, rd_kafka_message_t *);
MessageImpl(rd_kafka_type_t, RdKafka::Topic *, rd_kafka_message_t *, bool);
MessageImpl(rd_kafka_type_t, rd_kafka_message_t *);
MessageImpl(rd_kafka_type_t, RdKafka::Topic *, RdKafka::ErrorCode);
private:
MessageImpl(const RdKafka::MessageImpl &);
public:
virtual std::string errstr(void) const;
virtual RdKafka::ErrorCode err(void) const;
virtual RdKafka::Topic * topic(void) const;
virtual std::string topic_name(void) const;
virtual int32_t partition(void) const;
virtual void * payload(void) const;
virtual size_t len(void) const;
virtual const std::string * key(void) const;
virtual const void * key_pointer(void) const;
--Type <RET> for more, q to quit, c to continue without paging--
virtual size_t key_len(void) const;
virtual int64_t offset(void) const;
virtual RdKafka::MessageTimestamp timestamp(void) const;
virtual void * msg_opaque(void) const;
virtual int64_t latency(void) const;
virtual rd_kafka_message_s * c_ptr(void);
virtual RdKafka::Message::Status status(void) const;
virtual RdKafka::Headers * headers(void);
virtual RdKafka::Headers * headers(RdKafka::ErrorCode *);
virtual int32_t broker_id(void) const;
private:
RdKafka::MessageImpl & operator=(const RdKafka::MessageImpl &);
}

不过,同一个地址如果改用 std::string 类型来打印,虽然没有报错,但内容就不对了:

1
2
3
4
5
6
7
8
9
10
(gdb) print *(std::string *)0x8af8c80
$18 = {
static npos = 18446744073709551615,
_M_dataplus = {
<std::allocator<char>> = {
<std::__new_allocator<char>> = {<No data fields>}, <No data fields>},
members of std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider:
_M_p = 0x1df0258 <vtable for RdKafka::MessageImpl+16> "\200e`"
}
}

给人的感觉就是,给一个指针地址,gdb 就按指定的类型去解释该地址中存储的数据内容,并不会检查类型是否真的匹配,有点粗暴,hh