peek() now uses the UTF-8 buffer.

2014-07-29 02:56:23 +02:00 · 2014-07-29 02:56:23 +02:00 · 56057b43ec
commit 56057b43ec
parent ef9053d7f3
1 changed files with 64 additions and 3 deletions
--- a/source/dyaml/reader.d
+++ b/source/dyaml/reader.d
@ -77,6 +77,18 @@ final class Reader
            Endian endian_;
        }

+        // Index to buffer8_ where the next character to be decoded by peek() starts.
+        size_t lastDecodedBufferOffset_ = 0;
+        // Offset, relative to charIndex_, of the next character to be decoded,
+        // in code points, not chars.
+        size_t lastDecodedCharOffset_ = 0;
+
+        // Number of character decodings done during the life of the Reader.
+        //
+        // Used for performance testing.
+        size_t decodeCount_ = 0;
+
+
    public:
        import std.stream;
        /// Construct a Reader.
@ -134,9 +146,9 @@ final class Reader
        ///
        // XXX removed; search for 'risky' to find why.
        // Throws:  ReaderException if trying to read past the end of the buffer.
-        dchar peek(size_t index = 0) @safe pure nothrow const @nogc
+        dchar peek(size_t index = 0) @safe pure nothrow @nogc
        {
-            if(buffer_.length <= bufferOffset_ + index)
+            if(buffer_.length <= charIndex_ + index)
            {
                // XXX This is risky; revert this and the 'risky' change in UTF decoder
                // if any bugs are introduced. We rely on the assumption that Reader
@ -145,7 +157,34 @@ final class Reader
                return '\0';
            }

-            return buffer_[bufferOffset_ + index];
+            // Optimized path for Scanner code that peeks chars in linear order to
+            // determine the length of some sequence.
+            if(index == lastDecodedCharOffset_)
+            {
+
+                ++decodeCount_;
+                ++lastDecodedCharOffset_;
+                const char b = buffer8_[lastDecodedBufferOffset_];
+                // ASCII
+                if(b < 0x80) 
+                {
+                    ++lastDecodedBufferOffset_;
+                    return b;
+                }
+                return decodeValidUTF8NoGC(buffer8_, lastDecodedBufferOffset_);
+            }
+
+
+            // 'Slow' path where we decode everything up to the requested character.
+            lastDecodedCharOffset_   = 0;
+            lastDecodedBufferOffset_ = bufferOffset8_;
+            dchar d;
+            while(lastDecodedCharOffset_ <= index)
+            {
+                d = decodeNext();
+            }
+
+            return d;
        }

        /// Get specified number of characters starting at current position.
@ -245,6 +284,28 @@ final class Reader
        /// Get encoding of the input buffer.
        final Encoding encoding() @safe pure nothrow const @nogc { return encoding_; }

+private:
+        // Decode the character starting at buffer8_[lastDecodedBufferOffset_],
+        // return it, and update lastDecodedCharOffset_/lastDecodedBufferOffset_.
+        // Asserts that lastDecodedBufferOffset_ is still inside buffer8_.
+        // Does not advance the buffer position. Used in peek() and slice().
+        dchar decodeNext() @safe pure nothrow @nogc 
+        {
+            assert(lastDecodedBufferOffset_ < buffer8_.length,
+                   "Attempted to decode past the end of a string");
+            ++decodeCount_;  // Counts every decoding; used for performance testing.
+            const char b = buffer8_[lastDecodedBufferOffset_];
+            ++lastDecodedCharOffset_;  // One more code point, whatever its byte length.
+            // ASCII: a byte below 0x80 is a whole character, so step over one byte.
+            if(b < 0x80)
+            {
+                ++lastDecodedBufferOffset_;
+                return b;
+            }
+
+            return decodeValidUTF8NoGC(buffer8_, lastDecodedBufferOffset_);  // NOTE(review): presumably advances the offset by ref; confirm.
+        }
+
        // Decode the character starting at bufferOffset8_ and move to the next
        // character.
        //