peek() now uses the UTF-8 buffer.

This commit is contained in:
Ferdinand Majerech 2014-07-29 02:56:23 +02:00
parent ef9053d7f3
commit 56057b43ec

View file

@ -77,6 +77,18 @@ final class Reader
Endian endian_;
}
// Index into buffer8_ at which the next peek()/decodeNext() decode starts.
//
// Despite the name, this is advanced past each character as it is decoded
// (by one byte for ASCII; presumably by the full sequence length inside
// decodeValidUTF8NoGC - TODO confirm), so it points just past the last
// decoded character rather than at its start.
size_t lastDecodedBufferOffset_ = 0;
// Offset, relative to charIndex_, of the next character to be decoded,
// in code points, not chars.
//
// Incremented once per decoded character; peek() takes its fast path when the
// requested index equals this value and resets it to 0 on the slow path.
size_t lastDecodedCharOffset_ = 0;
// Number of character decodings done during the life of the Reader.
//
// Incremented on every decode in peek() and decodeNext().
// Used for performance testing.
size_t decodeCount_ = 0;
public:
import std.stream;
/// Construct a Reader.
@ -134,9 +146,9 @@ final class Reader
///
// XXX removed; search for 'risky' to find why.
// Throws: ReaderException if trying to read past the end of the buffer.
dchar peek(size_t index = 0) @safe pure nothrow const @nogc
// The new signature drops 'const': peeking now mutates the cached decode state
// (lastDecodedCharOffset_, lastDecodedBufferOffset_, decodeCount_).
dchar peek(size_t index = 0) @safe pure nothrow @nogc
{
if(buffer_.length <= bufferOffset_ + index)
// Out-of-range peeks return '\0' instead of throwing (see the XXX note below).
if(buffer_.length <= charIndex_ + index)
{
// XXX This is risky; revert this and the 'risky' change in UTF decoder
// if any bugs are introduced. We rely on the assumption that Reader
@ -145,7 +157,34 @@ final class Reader
return '\0';
}
return buffer_[bufferOffset_ + index];
// Optimized path for Scanner code that peeks chars in linear order to
// determine the length of some sequence.
//
// lastDecodedCharOffset_ is the offset of the next not-yet-decoded character,
// so a match means the requested character starts exactly at
// lastDecodedBufferOffset_ and can be decoded without rescanning.
// The body below mirrors decodeNext(), minus its bounds assert.
if(index == lastDecodedCharOffset_)
{
++decodeCount_;
++lastDecodedCharOffset_;
const char b = buffer8_[lastDecodedBufferOffset_];
// ASCII
if(b < 0x80)
{
++lastDecodedBufferOffset_;
return b;
}
// Multi-byte sequence; presumably advances lastDecodedBufferOffset_ past the
// sequence (TODO confirm against decodeValidUTF8NoGC).
return decodeValidUTF8NoGC(buffer8_, lastDecodedBufferOffset_);
}
// 'Slow' path where we decode everything up to the requested character.
// Restart decoding from the current read position (bufferOffset8_).
lastDecodedCharOffset_ = 0;
lastDecodedBufferOffset_ = bufferOffset8_;
dchar d;
// Each decodeNext() call increments lastDecodedCharOffset_, so this decodes
// index + 1 characters and leaves d holding the character at 'index'.
while(lastDecodedCharOffset_ <= index)
{
d = decodeNext();
}
return d;
}
/// Get specified number of characters starting at current position.
@ -245,6 +284,28 @@ final class Reader
/// Get encoding of the input buffer.
final Encoding encoding() @safe pure nothrow const @nogc { return encoding_; }
private:
// Decode the single character that begins at lastDecodedBufferOffset_ in
// buffer8_, then advance the cached decode state
// (lastDecodedCharOffset_/lastDecodedBufferOffset_) past it.
//
// The buffer position itself is left untouched. Used in peek() and slice().
//
// Returns: The decoded character.
dchar decodeNext() @safe pure nothrow @nogc
{
    assert(lastDecodedBufferOffset_ < buffer8_.length,
           "Attempted to decode past the end of a string");

    // Bookkeeping: one more decode performed, one more character consumed.
    decodeCount_ += 1;
    const char lead = buffer8_[lastDecodedBufferOffset_];
    lastDecodedCharOffset_ += 1;

    // A lead byte of 0x80 or above starts a multi-byte UTF-8 sequence; hand
    // it to the full decoder (presumably advances lastDecodedBufferOffset_
    // past the whole sequence - TODO confirm).
    if(lead >= 0x80)
    {
        return decodeValidUTF8NoGC(buffer8_, lastDecodedBufferOffset_);
    }

    // Plain ASCII: the byte is the character, so step over exactly one byte.
    lastDecodedBufferOffset_ += 1;
    return lead;
}
// Decode the character starting at bufferOffset8_ and move to the next
// character.
//