scanTagURI now scans to a slice, as does scanURIEscapes, at cost of complexity

2014-07-25 02:34:53 +02:00 · 2014-07-25 02:34:53 +02:00 · 817dc3b610
commit 817dc3b610
parent 457cabbb72
1 changed files with 83 additions and 54 deletions
--- a/source/dyaml/scanner.d
+++ b/source/dyaml/scanner.d
@ -992,12 +992,12 @@ final class Scanner
        }

        /// Scan prefix of a tag directive.
-        dchar[] scanTagDirectivePrefix(const Mark startMark) @safe pure
+        dstring scanTagDirectivePrefix(const Mark startMark) @safe pure
        {
            auto value = scanTagURI("directive", startMark);
            enforce(" \0\n\r\u0085\u2028\u2029"d.canFind(reader_.peek()),
                    new Error("While scanning a directive prefix", startMark,
-                              "expected ' ', but found" ~ to!string(reader_.peek()),
+                              "expected ' ', but found" ~ reader_.peek().to!string,
                              reader_.mark));

            return value;
@ -1055,12 +1055,12 @@ final class Scanner
        }

        /// Scan a tag token.
-        Token scanTag() @safe pure
+        Token scanTag() @trusted pure
        {
            const startMark = reader_.mark;
            dchar c = reader_.peek(1);
-            dchar[] handle;
-            dchar[] suffix;
+            dstring handle;
+            dstring suffix;

            if(c == '<')
            {
@ -1699,48 +1699,80 @@ final class Scanner
        }

        /// Scan URI in a tag token.
-        dchar[] scanTagURI(const string name, const Mark startMark) @trusted pure
+        dstring scanTagURI(const string name, const Mark startMark) @trusted pure
        {
            // Note: we do not check if URI is well-formed.
-            // Using appender_, so clear it when we're done.
-            scope(exit) { appender_.clear(); }
-            uint length = 0;

+            reader_.sliceBuilder.begin();
            dchar c = reader_.peek();
+            {
+                scope(failure) { reader_.sliceBuilder.finish(); }
+                uint length = 0;
                while(isAlphaNum(c) || "-;/?:@&=+$,_.!~*\'()[]%"d.canFind(c))
                {
                    if(c == '%')
                    {
-                    appender_.put(reader_.get(length));
+                        auto chars = reader_.get(length);
+                        reader_.sliceBuilder.write(chars);
                        length = 0;
-                    appender_.put(scanURIEscapes(name, startMark));
+                        scanURIEscapesToSlice(name, startMark);
                    }
                    else { ++length; }
                    c = reader_.peek(length);
                }
                if(length > 0)
                {
-                appender_.put(reader_.get(length));
+                    auto chars = reader_.get(length);
+                    reader_.sliceBuilder.write(chars);
                    length = 0;
                }
-            enforce(appender_.data.length > 0,
+            }
+            dstring result = reader_.sliceBuilder.finish();
+            enforce(!result.empty,
                    new Error("While parsing a " ~ name, startMark,
                              "expected URI, but found: " ~ c.to!string, reader_.mark));

-            return appender_.data;
+            return result;
        }

        /// Scan URI escape sequences.
-        dchar[] scanURIEscapes(const string name, const Mark startMark) @system pure
+        void scanURIEscapesToSlice(const string name, const Mark startMark) @system pure
        {
-            ubyte[] bytes;
+            // URI escapes encode a UTF-8 string. We store UTF-8 code units here for
+            // decoding into UTF-32.
+            char[4] bytes;
+            size_t bytesUsed;
            Mark mark = reader_.mark;

+            // Get one dchar by decoding data from bytes.
+            //
+            // This is probably slow, but simple and URI escapes are extremely uncommon
+            // in YAML.
+            static size_t getDchar(char[] bytes, Reader reader_)
+            {
+                import std.utf;
+                size_t nextChar;
+                const c = std.utf.decode(bytes[], nextChar); 
+                reader_.sliceBuilder.write(c);
+                if(bytes.length - nextChar > 0)
+                {
+                    core.stdc.string.memmove(bytes.ptr, bytes.ptr + nextChar, 
+                                             bytes.length - nextChar);
+                }
+                return bytes.length - nextChar;
+            }
+
+            try 
+            {
                while(reader_.peek() == '%')
                {
                    reader_.forward();
+                    if(bytesUsed == bytes.length)
+                    {
+                        bytesUsed = getDchar(bytes[], reader_); 
+                    }

-                ubyte b = 0;
+                    char b = 0;
                    uint mult = 16;
                    // Converting 2 hexadecimal digits to a byte.
                    foreach(k; 0 .. 2)
@ -1760,15 +1792,12 @@ final class Scanner
                        b += mult * digit;
                        mult /= 16;
                    }
-                bytes ~= b;
+                    bytes[bytesUsed++] = b;

                    reader_.forward(2);
                }

-            try { return to!(dchar[])(cast(string)bytes); }
-            catch(ConvException e)
-            {
-                throw new Error("While scanning a " ~ name, startMark, e.msg, mark);
+                bytesUsed = getDchar(bytes[0 .. bytesUsed], reader_);
            }
            catch(UTFException e)
            {