一般情况下,在调试 coredump 文件时,对于基础类型或者 struct ,可以直接 print 变量 来查看对应的值,但是如果为指针,或者更复杂的情况,是个动态类型,该如何查看呢?
背景 有一个消费 Kafka 数据的 Consumer 程序因为异常数据挂掉了,对应的 core 文件幸运地被 dump 了出来,关键栈信息如下:
1 2 3 4 5 6 7 8 9 10 // .... #10 pybind11::detail::accessor<pybind11::detail::accessor_policies::generic_item>::cast<std::string> (this=0x7ff9ee0a7790) at bc_out/baidu/third-party/pybind11/output/include/pybind11/pytypes.h:463 #11 lbd::PythonOperator::on_tuple (this=0x5c80d90, input=..., output=0x7ff9ee0a7ac0) at baidu/ps-aladdin/streaming-topology/src/python_operator.cc:91 #12 0x00000000005dae2f in vs::common::Operator::check_data_process_on_tuple (this=0x5c80d90, msg=..., output_vector=0x7ff9ee0a7ac0) at baidu/ps-aladdin/vs-common/core/topology/operator.h:172 #13 0x00000000005d7984 in vs::common::Operator::process (this=0x5c80d90, tid=4) at baidu/ps-aladdin/vs-common/core/topology/operator.cc:210 #14 0x00000000005d815b in operator() (__closure=0x60ed408) at baidu/ps-aladdin/vs-common/core/topology/operator.cc:254 #15 0x00000000005d8bdc in std::__invoke_impl<void, vs::common::Operator::start_process()::<lambda()> >(std::__invoke_other, struct {...} &&) (__f=...) // ....
我们的目标是需要从中获取到 Kafka 数据的 partition 和 offset 信息,来从 Kafka 中获取异常数据的内容,以及用于线下复现问题。(原始数据为二进制压缩类型,需要 dump 到本地进行解码)
print 指针 包含程序业务代码的栈帧为 11、12、13、14、15,其中并没有直接存储 partition 和 offset 信息的变量,找变量的过程这里就省略了。从 11 号栈帧开始排查,看一下 partition 和 offset 信息是否存储在某些变量中。
函数 on_tuple 的入参 input 的类型为 std::shared_ptr<vs::common::OperatorMessage>,其中类型为 std::shared_ptr<RdKafka::Message> 的变量 _kafka_message 存储了 Kafka 消息的原始信息。这两个变量实际均为 std::shared_ptr 类型,可以通过打印它们的 _M_ptr 来查看内层指针的值,在前面加上 * 来查看指针指向的变量的值:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 (gdb) set print pretty on (gdb) print input $1 = { <std::__shared_ptr<vs::common::OperatorMessage, (__gnu_cxx::_Lock_policy)2>> = { <std::__shared_ptr_access<vs::common::OperatorMessage, (__gnu_cxx::_Lock_policy)2, false, false>> = {<No data fields>}, members of std::__shared_ptr<vs::common::OperatorMessage, (__gnu_cxx::_Lock_policy)2>: _M_ptr = 0xe3a6490, _M_refcount = { _M_pi = 0xe3a6480 } }, <No data fields>} (gdb) print input._M_ptr $2 = (std::__shared_ptr<vs::common::OperatorMessage, (__gnu_cxx::_Lock_policy)2>::element_type *) 0xe3a6490 (gdb) print *input._M_ptr $3 = { _vptr.OperatorMessage = 0x1dd08e8 <vtable for vs::common::OperatorMessage+16>, _kafka_message = { <std::__shared_ptr<RdKafka::Message, (__gnu_cxx::_Lock_policy)2>> = { <std::__shared_ptr_access<RdKafka::Message, (__gnu_cxx::_Lock_policy)2, false, false>> = {<No data fields>}, members of std::__shared_ptr<RdKafka::Message, (__gnu_cxx::_Lock_policy)2>: _M_ptr = 0x8af8c80, _M_refcount = { _M_pi = 0x1ae1e400 } }, <No data fields>}, // ..... } (gdb) print input._M_ptr->_kafka_message._M_ptr $5 = (std::__shared_ptr<RdKafka::Message, (__gnu_cxx::_Lock_policy)2>::element_type *) 0x8af8c80 (gdb) print *input._M_ptr->_kafka_message._M_ptr $6 = { _vptr.Message = 0x1df0258 <vtable for RdKafka::MessageImpl+16> }
print 动态类型 但不幸的是,由于 RdKafka::Message 是个动态类型,实际的实例是 RdKafka::MessageImpl,所以直接打印看不到其中的成员,此时可以设置 set print object on 强制打印变量真实类型(dynamic_cast 类型强转后,最终打印出的值不太对,没细究是啥原因):
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 (gdb) set print object on (gdb) print *input._M_ptr->_kafka_message._M_ptr $12 = (RdKafka::MessageImpl) { <RdKafka::Message> = { _vptr.Message = 0x1df0258 <vtable for RdKafka::MessageImpl+16> }, members of RdKafka::MessageImpl: topic_ = 0x58b8a60, rkmessage_ = 0xc774e38, free_rkmessage_ = true, rkmessage_err_ = { err = RD_KAFKA_RESP_ERR_OFFSET_OUT_OF_RANGE, rkt = 0x5f6164626d616c5f, partition = 1986290284, payload = 0x6863735f7374726f, len = 3472943151872042085, key = 0x3131393238303532, key_len = 7234014019578574644, offset = 8390891584507309673, _private = 0x7564656863735f73 }, key_ = 0x15240dd8, headers_ = 0x0, rk_type_ = RD_KAFKA_CONSUMER }
翻了一下 librdkafka 的源码,partition 和 offset 的信息是存储在变量 rkmessage_ 中的,需要获取到它的值。但是由于 _kafka_message 的声明类型(静态类型)为 RdKafka::Message,其中不包含变量 rkmessage_,所以没法直接用 print 来打印:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 (gdb) print *(input._M_ptr->_kafka_message._M_ptr).rkmessage_ There is no member or method named rkmessage_. (gdb) ptype *(input._M_ptr._kafka_message._M_ptr) type = class RdKafka::Message { public: virtual std::string errstr(void) const; virtual RdKafka::ErrorCode err(void) const; virtual RdKafka::Topic * topic(void) const; virtual std::string topic_name(void) const; virtual int32_t partition(void) const; virtual void * payload(void) const; virtual size_t len(void) const; virtual const std::string * key(void) const; virtual const void * key_pointer(void) const; virtual size_t key_len(void) const; virtual int64_t offset(void) const; virtual RdKafka::MessageTimestamp timestamp(void) const; virtual void * msg_opaque(void) const; ~Message(); virtual int64_t latency(void) const; virtual rd_kafka_message_s * c_ptr(void); virtual RdKafka::Message::Status status(void) const; virtual RdKafka::Headers * headers(void); virtual RdKafka::Headers * headers(RdKafka::ErrorCode *); virtual int32_t broker_id(void) const; }
可以基于指针地址进行打印,根据源码,变量 rkmessage_ 的类型为 rd_kafka_message_t*,地址为 0xc774e38(见 print *input._M_ptr->_kafka_message._M_ptr的打印结果),那么可以这样 print,可以看到其中的 partition 和 offset 值是正常的,最后也顺利根据这个信息拿到了问题数据并进行了本地问题复现:
1 2 3 4 5 6 7 8 9 10 11 12 (gdb) print *(rd_kafka_message_t *)0xc774e38 $16 = { err = RD_KAFKA_RESP_ERR_NO_ERROR, rkt = 0x55821c0, partition = 1, payload = 0x77c5044, len = 1851, key = 0x77c5006, key_len = 60, offset = 76573191, _private = 0xc774dc0 }
进一步探索 对于其他变量也是同理,例如根据源码, key_ 的类型为 std::string *,那么也可以通过类似的方式查看对应的值:
1 2 3 4 5 6 7 8 9 10 (gdb) print *(std::string *)0x15240dd8 $17 = { static npos = 18446744073709551615, _M_dataplus = { <std::allocator<char>> = { <std::__new_allocator<char>> = {<No data fields>}, <No data fields>}, members of std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider: _M_p = 0x1a667f18 "http://kg.baidu.com/rawbase/012226d6ed5807455d7e402b801a31cd" } }
甚至可以直接用 ptype 打印其包含的成员:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 (gdb) ptype *(RdKafka::MessageImpl *)0x8af8c80 type = class RdKafka::MessageImpl : public RdKafka::Message { public: RdKafka::Topic *topic_; rd_kafka_message_t *rkmessage_; bool free_rkmessage_; rd_kafka_message_t rkmessage_err_; std::string *key_; private: RdKafka::Headers *headers_; const rd_kafka_type_t rk_type_; public: ~MessageImpl(); MessageImpl(rd_kafka_type_t, RdKafka::Topic *, rd_kafka_message_t *); MessageImpl(rd_kafka_type_t, RdKafka::Topic *, rd_kafka_message_t *, bool); MessageImpl(rd_kafka_type_t, rd_kafka_message_t *); MessageImpl(rd_kafka_type_t, RdKafka::Topic *, RdKafka::ErrorCode); private: MessageImpl(const RdKafka::MessageImpl &); public: virtual std::string errstr(void) const; virtual RdKafka::ErrorCode err(void) const; virtual RdKafka::Topic * topic(void) const; virtual std::string topic_name(void) const; virtual int32_t partition(void) const; virtual void * payload(void) const; virtual size_t len(void) const; virtual const std::string * key(void) const; virtual const void * key_pointer(void) const; --Type <RET> for more, q to quit, c to continue without paging-- virtual size_t key_len(void) const; virtual int64_t offset(void) const; virtual RdKafka::MessageTimestamp timestamp(void) const; virtual void * msg_opaque(void) const; virtual int64_t latency(void) const; virtual rd_kafka_message_s * c_ptr(void); virtual RdKafka::Message::Status status(void) const; virtual RdKafka::Headers * headers(void); virtual RdKafka::Headers * headers(RdKafka::ErrorCode *); virtual int32_t broker_id(void) const; private: RdKafka::MessageImpl & operator=(const RdKafka::MessageImpl &); }
但是对于相同的地址,如果用 std::string 类型来打印,虽然不会报错,但是内容就不对了:
1 2 3 4 5 6 7 8 9 10 (gdb) print *(std::string *)0x8af8c80 $18 = { static npos = 18446744073709551615, _M_dataplus = { <std::allocator<char>> = { <std::__new_allocator<char>> = {<No data fields>}, <No data fields>}, members of std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider: _M_p = 0x1df0258 <vtable for RdKafka::MessageImpl+16> "\200e`" } }
给人的感觉就是,给一个指针地址,然后用指定的类型来匹配地址中存储的数据内容,有点粗暴,hh