thrift 序列化字段读写的一个小坑

前段时间跟同事一块联调某系统时,client发送thrift序列化后的数据,本地打log能正确读到该字段,而server却收不到该字段的值,感到比较诡异,通过修改下读写的方法就ok了,花了一点时间踩了一个小坑,先从业务代码片段,再到源码分析,最后再与protobuf作对应点的简单比较,与大家分享下。

先看下thrift IDL在序列化读写时的用法,以某业务中简化版为例,定义及两种读写方式如下:

// Simplified example struct used throughout this article.
struct UrlItem {
	// Field id 1, required: the generated write() always emits it.
	1: required string url;
	// Field id 2, optional: the generated write() emits it only when
	// __isset.referer is true.
	2: optional string referer;
}

// Usage 1: assign the public members directly.
// This does not touch url_item.__isset.referer, so the generated write()
// leaves the optional referer field out of the serialized data.
UrlItem url_item;
url_item.url = "http://47.110.236.62/";
url_item.referer = "https://www.google.com.hk/";

// The local log reads the members directly, so it prints both values
// regardless of the __isset flag.
log(LOG_NOTICE, "url:%s\treferer:%s\n", 
	url_item.url.c_str(), 
	url_item.referer.c_str());

// Usage 2: go through the generated __set_XX methods.
// __set_referer() also sets __isset.referer = true, so write() emits the field.
url_item.__set_url("http://47.110.236.62/");
url_item.__set_referer("https://www.google.com.hk/");

log(LOG_NOTICE, "url:%s\treferer:%s\n", 
	url_item.url.c_str(), 
	url_item.referer.c_str()); 

以上省略掉网络发送的过程,从log上看,两种用法本地log均符合预期,但是第一种用法server端却没有收到url_item对应的referer数据,第二种用法却正常,UrlItem实际是一个成员均为public的类,对public成员直接点调用与thrift生成的set方法调用看来是有差异,具体在哪呢?让我们从源码来看:

// Presence flags for the optional fields of UrlItem.
// One bool per optional field; every flag starts out false.
struct _UrlItem__isset {
  // True once the optional `referer` field has been set via __set_referer()
  // or filled in by UrlItem::read().
  bool referer;

  _UrlItem__isset() : referer(false) {}
};

// Thrift-generated C++ class for the UrlItem struct.
// NOTE: this is an excerpt; the remainder of the class (including its closing
// brace) is not shown here.
class UrlItem {
 public:

  static const char* ascii_fingerprint; // = "5B708A954C550ECA9C1A49D3C5CAFAB9";
  static const uint8_t binary_fingerprint[16]; // = {0x5B,0x70,0x8A,0x95,0x4C,0x55,0x0E,0xCA,0x9C,0x1A,0x49,0xD3,0xC5,0xCA,0xFA,0xB9};

  // Both strings start out empty. __isset is default-constructed, which
  // leaves __isset.referer false.
  UrlItem() : url(""), referer("") {
  }

  virtual ~UrlItem() throw() {}

  // Field values are plain public members, so callers can assign them
  // directly -- but doing so bypasses the __isset bookkeeping below.
  std::string url;
  std::string referer;

  // Presence flags for the optional fields (only `referer` here).
  _UrlItem__isset __isset;

  // Setter for the required field: stores the value only; a required field
  // has no presence flag.
  void __set_url(const std::string& val) {
    url = val;
  }

  // Setter for the optional field: stores the value AND marks it as present.
  // write() checks this flag before emitting the field.
  void __set_referer(const std::string& val) {
    referer = val;
    __isset.referer = true;
  }

从上面thrift对IDL生成的代码可以看到,用了一个额外的结构体_UrlItem__isset用来标记optional成员的写(赋值)状态,使用__set_XX方法会对__isset对应字段置true,直接点成员赋值却没有该作用。那这样对网络发送有神马作用呢?让我们继续看序列化读写源码:

// Deserializes this UrlItem from `iprot`.
//
// Reads fields until a T_STOP marker. Field id 1 of type T_STRING fills
// `url`; field id 2 of type T_STRING fills `referer` and sets
// __isset.referer. Any other id, or a known id with an unexpected type, is
// skipped.
//
// Returns the sum of the values returned by the protocol calls (presumably
// the number of bytes consumed).
// Throws TProtocolException(INVALID_DATA) if the required `url` field was
// never read.
uint32_t UrlItem::read(::apache::thrift::protocol::TProtocol* iprot) {
  using ::apache::thrift::protocol::TProtocolException;

  std::string field_name;
  ::apache::thrift::protocol::TType field_type;
  int16_t field_id;
  bool url_seen = false;

  uint32_t consumed = 0;
  consumed += iprot->readStructBegin(field_name);

  for (;;) {
    consumed += iprot->readFieldBegin(field_name, field_type, field_id);
    if (field_type == ::apache::thrift::protocol::T_STOP) {
      break;
    }

    const bool is_string =
        (field_type == ::apache::thrift::protocol::T_STRING);

    if (field_id == 1 && is_string) {
      consumed += iprot->readString(this->url);
      url_seen = true;
    } else if (field_id == 2 && is_string) {
      consumed += iprot->readString(this->referer);
      // Record that the optional field arrived on the wire.
      this->__isset.referer = true;
    } else {
      // Unknown field id, or a known id carrying the wrong type.
      consumed += iprot->skip(field_type);
    }

    consumed += iprot->readFieldEnd();
  }

  consumed += iprot->readStructEnd();

  // `url` is required: reject input that did not contain it.
  if (!url_seen) {
    throw TProtocolException(TProtocolException::INVALID_DATA);
  }
  return consumed;
}

// Serializes this UrlItem to `oprot`.
//
// The required `url` field (id 1) is always written. The optional `referer`
// field (id 2) is written only when __isset.referer is true, so a value
// assigned directly to `referer` without setting the flag is not sent.
//
// Returns the sum of the values returned by the protocol calls (presumably
// the number of bytes written).
uint32_t UrlItem::write(::apache::thrift::protocol::TProtocol* oprot) const {
  uint32_t written = 0;
  written += oprot->writeStructBegin("UrlItem");

  // Required field: unconditional.
  written += oprot->writeFieldBegin("url", ::apache::thrift::protocol::T_STRING, 1);
  written += oprot->writeString(this->url);
  written += oprot->writeFieldEnd();

  // Optional field: gated on its presence flag.
  const bool referer_present = this->__isset.referer;
  if (referer_present) {
    written += oprot->writeFieldBegin("referer", ::apache::thrift::protocol::T_STRING, 2);
    written += oprot->writeString(this->referer);
    written += oprot->writeFieldEnd();
  }

  written += oprot->writeFieldStop();
  written += oprot->writeStructEnd();
  return written;
}

显然,序列化写操作会有对optional成员的写标记进行判断,当通过网络对外发送时会调用write方法,当__isset对应成员没有置写标记为true时该字段也就不会对外发送了,而对于required成员必选字段则不会有此问题,哼,问题就在这里,这样做有神马好处呢?这样能减少网络发送的开销,对于未使用的optional可选字段就没有必要发送了,thrift IDL的实现这里使用结构体对于每个可选成员都用bool变量来标记,这里还是相对比较浪费的,一个字段一个字节呢;而对于反序列化进行读操作会通过字段类型和字段ID去判断,无需担心。因此,当写optional字段时一定要采用__set_XX成员方法的方式!
google的protobuf又是怎么处理的呢?笔者把相关代码片断放一块了:

// Protobuf counterpart of the Thrift UrlItem struct above.
message UrlItem {
	// Field number 1, required.
	required string url = 1;
	// Field number 2, optional.
	optional string referer = 2;
}

// Protobuf-generated C++ class for the UrlItem message (excerpt).
// Unlike the Thrift version, the field storage is private: callers can only
// change a field through the set_/mutable_/release_/clear_ accessors, each of
// which also updates the field's presence bit.
class UrlItem : public ::google::protobuf::Message {
 public:
  UrlItem();
  virtual ~UrlItem();
  
  UrlItem(const UrlItem& from);

  // required string url = 1;
  inline bool has_url() const;
  inline void clear_url();
  static const int kUrlFieldNumber = 1;
  inline const ::std::string& url() const;
  inline void set_url(const ::std::string& value);
  inline void set_url(const char* value);
  inline void set_url(const char* value, size_t size);
  inline ::std::string* mutable_url();
  inline ::std::string* release_url();
  
  // optional string referer = 2;
  inline bool has_referer() const;
  inline void clear_referer();
  static const int kRefererFieldNumber = 2;
  inline const ::std::string& referer() const;
  inline void set_referer(const ::std::string& value);
  inline void set_referer(const char* value);
  inline void set_referer(const char* value, size_t size);
  inline ::std::string* mutable_referer();
  inline ::std::string* release_referer();

 private:
  // Presence-bit helpers; private, so only the accessors above can flip them.
  inline void set_has_url();
  inline void clear_has_url();
  inline void set_has_referer();
  inline void clear_has_referer();
  
  ::google::protobuf::UnknownFieldSet _unknown_fields_;
  
  // String storage. The accessors compare these pointers against the shared
  // ::google::protobuf::internal::kEmptyString and allocate a private string
  // on first write.
  ::std::string* url_;
  ::std::string* referer_;
  
  mutable int _cached_size_;
  // One presence bit per field, packed into 32-bit words: (2 + 31) / 32 == 1
  // word for the two fields. Bit 0 is url, bit 1 is referer.
  ::google::protobuf::uint32 _has_bits_[(2 + 31) / 32];
};

// required string url = 1;
inline bool UrlItem::has_url() const {
  return (_has_bits_[0] & 0x00000001u) != 0;
}
inline void UrlItem::set_has_url() {
  _has_bits_[0] |= 0x00000001u;
}
inline void UrlItem::clear_has_url() {
  _has_bits_[0] &= ~0x00000001u;
}
// Empties the url value and clears its presence bit.
// The shared kEmptyString is never modified; only a privately allocated
// string is cleared.
inline void UrlItem::clear_url() {
  const bool has_private_string =
      (url_ != &::google::protobuf::internal::kEmptyString);
  if (has_private_string) {
    url_->clear();
  }
  clear_has_url();
}
// Returns a read-only reference to the string url_ currently points at.
inline const ::std::string& UrlItem::url() const {
  const ::std::string& current = *url_;
  return current;
}
// Stores a copy of `value` as the url and marks the field as present.
// A private string is allocated first if url_ still points at the shared
// kEmptyString.
inline void UrlItem::set_url(const ::std::string& value) {
  set_has_url();
  const bool shares_default =
      (url_ == &::google::protobuf::internal::kEmptyString);
  if (shares_default) {
    url_ = new ::std::string;
  }
  url_->assign(value);
}
// Stores the C string `value` as the url and marks the field as present.
// A private string is allocated first if url_ still points at the shared
// kEmptyString.
inline void UrlItem::set_url(const char* value) {
  set_has_url();
  const bool shares_default =
      (url_ == &::google::protobuf::internal::kEmptyString);
  if (shares_default) {
    url_ = new ::std::string;
  }
  url_->assign(value);
}
// Stores the first `size` bytes starting at `value` as the url and marks the
// field as present. A private string is allocated first if url_ still points
// at the shared kEmptyString.
inline void UrlItem::set_url(const char* value, size_t size) {
  set_has_url();
  if (url_ == &::google::protobuf::internal::kEmptyString) {
    url_ = new ::std::string;
  }
  // The cast needs its target type spelled out; a bare `reinterpret_cast(value)`
  // does not compile.
  url_->assign(reinterpret_cast<const char*>(value), size);
}
// Returns a writable pointer to the url string and marks the field as
// present. A private string is allocated first if url_ still points at the
// shared kEmptyString, so the shared default is never handed out for writing.
inline ::std::string* UrlItem::mutable_url() {
  set_has_url();
  const bool shares_default =
      (url_ == &::google::protobuf::internal::kEmptyString);
  if (shares_default) {
    url_ = new ::std::string;
  }
  return url_;
}
// Clears the url presence bit and detaches the stored string.
// Returns NULL when url_ still points at the shared kEmptyString; otherwise
// returns the private string and points url_ back at kEmptyString
// (presumably the caller then owns the returned string -- confirm against
// the protobuf documentation).
inline ::std::string* UrlItem::release_url() {
  clear_has_url();
  if (url_ == &::google::protobuf::internal::kEmptyString) {
    return NULL;
  }
  ::std::string* released = url_;
  url_ = const_cast< ::std::string*>(&::google::protobuf::internal::kEmptyString);
  return released;
}

// optional string referer = 2;
inline bool UrlItem::has_referer() const {
  return (_has_bits_[0] & 0x00000002u) != 0;
}
inline void UrlItem::set_has_referer() {
  _has_bits_[0] |= 0x00000002u;
}
inline void UrlItem::clear_has_referer() {
  _has_bits_[0] &= ~0x00000002u;
}
// Empties the referer value and clears its presence bit.
// The shared kEmptyString is never modified; only a privately allocated
// string is cleared.
inline void UrlItem::clear_referer() {
  const bool has_private_string =
      (referer_ != &::google::protobuf::internal::kEmptyString);
  if (has_private_string) {
    referer_->clear();
  }
  clear_has_referer();
}
// Returns a read-only reference to the string referer_ currently points at.
inline const ::std::string& UrlItem::referer() const {
  const ::std::string& current = *referer_;
  return current;
}
// Stores a copy of `value` as the referer and marks the field as present.
// A private string is allocated first if referer_ still points at the shared
// kEmptyString.
inline void UrlItem::set_referer(const ::std::string& value) {
  set_has_referer();
  const bool shares_default =
      (referer_ == &::google::protobuf::internal::kEmptyString);
  if (shares_default) {
    referer_ = new ::std::string;
  }
  referer_->assign(value);
}
// Stores the C string `value` as the referer and marks the field as present.
// A private string is allocated first if referer_ still points at the shared
// kEmptyString.
inline void UrlItem::set_referer(const char* value) {
  set_has_referer();
  const bool shares_default =
      (referer_ == &::google::protobuf::internal::kEmptyString);
  if (shares_default) {
    referer_ = new ::std::string;
  }
  referer_->assign(value);
}
// Stores the first `size` bytes starting at `value` as the referer and marks
// the field as present. A private string is allocated first if referer_ still
// points at the shared kEmptyString.
inline void UrlItem::set_referer(const char* value, size_t size) {
  set_has_referer();
  if (referer_ == &::google::protobuf::internal::kEmptyString) {
    referer_ = new ::std::string;
  }
  // The cast needs its target type spelled out; a bare `reinterpret_cast(value)`
  // does not compile.
  referer_->assign(reinterpret_cast<const char*>(value), size);
}
// Returns a writable pointer to the referer string and marks the field as
// present. A private string is allocated first if referer_ still points at
// the shared kEmptyString, so the shared default is never handed out for
// writing.
inline ::std::string* UrlItem::mutable_referer() {
  set_has_referer();
  const bool shares_default =
      (referer_ == &::google::protobuf::internal::kEmptyString);
  if (shares_default) {
    referer_ = new ::std::string;
  }
  return referer_;
}
// Clears the referer presence bit and detaches the stored string.
// Returns NULL when referer_ still points at the shared kEmptyString;
// otherwise returns the private string and points referer_ back at
// kEmptyString (presumably the caller then owns the returned string --
// confirm against the protobuf documentation).
inline ::std::string* UrlItem::release_referer() {
  clear_has_referer();
  if (referer_ == &::google::protobuf::internal::kEmptyString) {
    return NULL;
  }
  ::std::string* released = referer_;
  referer_ = const_cast< ::std::string*>(&::google::protobuf::internal::kEmptyString);
  return released;
}

google protobuf的实现这里用到标志位比特_has_bits_,相对thrift省却不少空间哦,而且使用了varint可变字节压缩编码,当数据值相对较小时是很节省的(虽然thrift的TCompactProtocol也使用了基于可变字节编码的zigzag,对于负数值更省空间;protobuf的sint32/sint64类型同样采用zigzag编码),再者protobuf序列化反序列化等实现上也更高效(比较下与thrift的代码就可看出),如果我有选型的决定权,肯定推崇google protobuf啦~

《thrift 序列化字段读写的一个小坑》有1个想法

发表回复

您的电子邮箱地址不会被公开。 必填项已用 * 标注