peek() now uses the UTF-8 buffer.

This commit is contained in:
Ferdinand Majerech 2014-07-29 02:56:23 +02:00
parent ef9053d7f3
commit 56057b43ec

View file

@ -77,6 +77,18 @@ final class Reader
Endian endian_; Endian endian_;
} }
// Look-ahead cursor into buffer8_: the index at which the next character
// decoded by peek()/decodeNext() starts, i.e. just past the most recently
// decoded one. peek()'s slow path rewinds it to bufferOffset8_.
size_t lastDecodedBufferOffset_ = 0;
// Number of code points (not chars) the look-ahead cursor has decoded,
// counted relative to charIndex_. peek(index) takes its fast path when
// index equals this value; the slow path resets it to 0.
size_t lastDecodedCharOffset_ = 0;
// Number of character decodings done during the life of the Reader;
// incremented once per decode in peek() and decodeNext().
//
// Used for performance testing.
size_t decodeCount_ = 0;
public: public:
import std.stream; import std.stream;
/// Construct a Reader. /// Construct a Reader.
@ -134,9 +146,9 @@ final class Reader
/// ///
// XXX removed; search for 'risky' to find why. // XXX removed; search for 'risky' to find why.
// Throws: ReaderException if trying to read past the end of the buffer. // Throws: ReaderException if trying to read past the end of the buffer.
dchar peek(size_t index = 0) @safe pure nothrow const @nogc dchar peek(size_t index = 0) @safe pure nothrow @nogc
{ {
if(buffer_.length <= bufferOffset_ + index) if(buffer_.length <= charIndex_ + index)
{ {
// XXX This is risky; revert this and the 'risky' change in UTF decoder // XXX This is risky; revert this and the 'risky' change in UTF decoder
// if any bugs are introduced. We rely on the assumption that Reader // if any bugs are introduced. We rely on the assumption that Reader
@ -145,7 +157,34 @@ final class Reader
return '\0'; return '\0';
} }
return buffer_[bufferOffset_ + index]; // Optimized path for Scanner code that peeks chars in linear order to
// determine the length of some sequence.
if(index == lastDecodedCharOffset_)
{
++decodeCount_;
++lastDecodedCharOffset_;
const char b = buffer8_[lastDecodedBufferOffset_];
// ASCII
if(b < 0x80)
{
++lastDecodedBufferOffset_;
return b;
}
return decodeValidUTF8NoGC(buffer8_, lastDecodedBufferOffset_);
}
// 'Slow' path where we decode everything up to the requested character.
lastDecodedCharOffset_ = 0;
lastDecodedBufferOffset_ = bufferOffset8_;
dchar d;
while(lastDecodedCharOffset_ <= index)
{
d = decodeNext();
}
return d;
} }
/// Get specified number of characters starting at current position. /// Get specified number of characters starting at current position.
@ -245,6 +284,28 @@ final class Reader
/// Get encoding of the input buffer. /// Get encoding of the input buffer.
final Encoding encoding() @safe pure nothrow const @nogc { return encoding_; } final Encoding encoding() @safe pure nothrow const @nogc { return encoding_; }
private:
// Look-ahead decoder: produce the code point that begins at
// buffer8_[lastDecodedBufferOffset_] and move the look-ahead cursor
// (lastDecodedCharOffset_/lastDecodedBufferOffset_) past it.
//
// The Reader's own buffer position is left untouched. Used in peek() and slice().
dchar decodeNext() @safe pure nothrow @nogc
{
    assert(lastDecodedBufferOffset_ < buffer8_.length,
           "Attempted to decode past the end of a string");

    // Bookkeeping: one more decode performed, one more code point consumed
    // by the look-ahead cursor.
    ++decodeCount_;
    const char lead = buffer8_[lastDecodedBufferOffset_];
    ++lastDecodedCharOffset_;

    // High bit set: not ASCII, so defer to the UTF-8 decoder.
    // NOTE(review): presumably the decoder advances lastDecodedBufferOffset_
    // through its second argument - confirm against decodeValidUTF8NoGC.
    if(lead >= 0x80)
    {
        return decodeValidUTF8NoGC(buffer8_, lastDecodedBufferOffset_);
    }

    // ASCII occupies exactly one code unit.
    ++lastDecodedBufferOffset_;
    return lead;
}
// Decode the character starting at bufferOffset8_ and move to the next // Decode the character starting at bufferOffset8_ and move to the next
// character. // character.
// //