I ran into the same issue in both C++ and Python.
For the C++ version, I used a mix of the code Kenton Varda posted in this thread and the code from the pull request he sent to the protobuf team (the version posted here doesn't handle EOF, while the one he sent to GitHub does).
#include <google/protobuf/message_lite.h>
#include <google/protobuf/io/zero_copy_stream.h>
#include <google/protobuf/io/coded_stream.h>
// Writes `message` to `rawOutput` as a varint32 length prefix followed by the
// serialized message bytes.
//
// Returns false only when the multi-buffer serialization path reports a
// stream error; returns true otherwise.
bool writeDelimitedTo(const google::protobuf::MessageLite& message,
                      google::protobuf::io::ZeroCopyOutputStream* rawOutput)
{
    // A coded stream is built per message; constructing one is cheap.
    google::protobuf::io::CodedOutputStream coded(rawOutput);

    // Length prefix goes first.
    const int byteCount = message.ByteSize();
    coded.WriteVarint32(byteCount);

    // Ask for one contiguous region large enough for the whole message.
    uint8_t* const direct = coded.GetDirectBufferForNBytesAndAdvance(byteCount);
    if (direct == nullptr)
    {
        // The message spans several buffers: take the somewhat slower
        // stream-based serialization path and report any stream error.
        message.SerializeWithCachedSizes(&coded);
        return !coded.HadError();
    }

    // Fast path: the message fits in a single buffer, so serialize straight
    // into the array.
    message.SerializeWithCachedSizesToArray(direct);
    return true;
}
// Reads one varint32-length-prefixed message from `rawInput` and merges it
// into `message`.
//
// Returns true when a complete message was parsed. Returns false when the
// length prefix could not be read, when parsing failed, or when the parser did
// not consume exactly the announced number of bytes.
//
// `clean_eof` is optional (may be null). When provided, it is set to true only
// if reading the length prefix failed without the stream position having
// advanced, i.e. the input ended exactly on a message boundary; in every other
// case it is set to false.
bool readDelimitedFrom(google::protobuf::io::ZeroCopyInputStream* rawInput, google::protobuf::MessageLite* message, bool* clean_eof)
{
    // One coded stream per message: cheap to create, and it means the total
    // size limit (64MB; see the CodedInputStream interface) is applied to each
    // message rather than to the stream as a whole.
    google::protobuf::io::CodedInputStream coded(rawInput);
    const int initialPos = coded.CurrentPosition();

    if (clean_eof != nullptr)
        *clean_eof = false;

    // Length prefix.
    uint32_t length = 0;
    const bool gotLength = coded.ReadVarint32(&length);
    if (!gotLength)
    {
        if (clean_eof != nullptr)
            *clean_eof = (coded.CurrentPosition() == initialPos);
        return false;
    }

    // Confine parsing to the announced number of bytes.
    const google::protobuf::io::CodedInputStream::Limit previous = coded.PushLimit(length);

    // The message must parse and must use up the whole limited region.
    const bool parsed = message->MergeFromCodedStream(&coded) && coded.ConsumedEntireMessage();

    // The limit is only lifted on success; on failure the coded stream is
    // simply discarded when this function returns.
    if (parsed)
        coded.PopLimit(previous);
    return parsed;
}
And here is my Python 2 implementation:
from google.protobuf.internal import encoder
from google.protobuf.internal import decoder
#I had to implement this because the tools in google.protobuf.internal.decoder
#read from a buffer, not from a file-like object
def readRawVarint32(stream):
    """Read the raw bytes of a single varint from a file-like object.

    Bytes are pulled one at a time with ``stream.read(1)`` until a byte whose
    high bit (0x80) is clear has been read, or until the stream is exhausted.

    Returns:
        A list of the one-byte reads, in stream order. The list is empty when
        the stream was already at EOF. If EOF interrupts the varint, the list
        is returned as-is and its last element still has the continuation bit
        set.
    """
    mask = 0x80  # (1 << 7): the varint continuation bit
    raw_varint32 = []
    while True:
        b = stream.read(1)
        # EOF: read() hands back an empty object. Testing truthiness instead of
        # comparing with "" also recognises the b"" that binary streams return
        # on Python 3 (b"" != "" there, so the old check fell through to
        # ord(b"") and raised TypeError).
        if not b:
            break
        raw_varint32.append(b)
        if not (ord(b) & mask):
            # High bit clear: this is the last byte of the varint.
            break
    return raw_varint32
def writeDelimitedTo(message, stream):
    """Write ``message`` to ``stream`` preceded by its length as a varint.

    The message is serialized with ``SerializeToString()``; the length prefix
    and the payload are emitted in a single ``stream.write`` call.
    """
    payload = message.SerializeToString()
    length_prefix = encoder._VarintBytes(len(payload))
    framed = length_prefix + payload
    stream.write(framed)
def readDelimitedFrom(MessageType, stream):
    """Read one length-prefixed message from ``stream``.

    Args:
        MessageType: callable invoked with no arguments to build the message
            object that receives the parsed data.
        stream: file-like object to read from.

    Returns:
        A new ``MessageType`` instance parsed from the stream, or ``None`` when
        no length-prefix bytes could be read (stream already at EOF).

    Raises:
        Exception: if the stream ends before the announced number of payload
            bytes has been read.
    """
    prefix = readRawVarint32(stream)
    if not prefix:
        # Nothing left to read.
        return None

    length, _ = decoder._DecodeVarint32(prefix, 0)
    payload = stream.read(length)
    if len(payload) < length:
        raise Exception("Unexpected end of file")

    parsed = MessageType()
    parsed.ParseFromString(payload)
    return parsed
#In place version that takes an already built protobuf object
#In my tests, this is around 20% faster than the other version
#of readDelimitedFrom()
def readDelimitedFrom_inplace(message, stream):
    """Read one length-prefixed message from ``stream`` into ``message``.

    Args:
        message: an existing message object; the payload is handed to its
            ``ParseFromString`` method.
        stream: file-like object to read from.

    Returns:
        ``message`` itself after parsing, or ``None`` when no length-prefix
        bytes could be read (stream already at EOF).

    Raises:
        Exception: if the stream ends before the announced number of payload
            bytes has been read.
    """
    prefix = readRawVarint32(stream)
    if not prefix:
        # Nothing left to read.
        return None

    length, _ = decoder._DecodeVarint32(prefix, 0)
    payload = stream.read(length)
    if len(payload) < length:
        raise Exception("Unexpected end of file")

    message.ParseFromString(payload)
    return message
It might not be the best looking code and I'm sure it can be refactored a fair bit, but at least that should show you one way to do it.
Now the big problem: It's SLOW.
Even when using the C++ implementation of python-protobuf, it's more than an order of magnitude slower than pure C++. I have a benchmark where I read 10M protobuf messages of ~30 bytes each from a file. It takes ~0.9s in C++, and 35s in Python (roughly 40x slower).
One way to make it a bit faster would be to re-implement the varint decoder so that it reads from a file and decodes in one go, instead of reading from the file and then decoding as this code currently does (profiling shows that a significant amount of time is spent in the varint encoder/decoder). Needless to say, that alone is not enough to close the gap between the Python version and the C++ version.
Any idea to make it faster is very welcome :)