From 952726aa5eacba150bdb796dfcf1181fe6cc1bc1 Mon Sep 17 00:00:00 2001 From: Ferdinand Majerech Date: Tue, 29 Jul 2014 03:18:37 +0200 Subject: [PATCH] UTF-8 scanFlowScalar. **NOTE:** moved escaping to Parser; can't do it in-place --- source/dyaml/parser.d | 87 +++++++++++++++++++++++++++++++++++++++++- source/dyaml/scanner.d | 83 +++++++++++++++++++++++----------------- 2 files changed, 133 insertions(+), 37 deletions(-) diff --git a/source/dyaml/parser.d b/source/dyaml/parser.d index d778b2a..bd5f8f3 100644 --- a/source/dyaml/parser.d +++ b/source/dyaml/parser.d @@ -11,6 +11,7 @@ module dyaml.parser; +import std.algorithm; import std.array; import std.container; import std.conv; @@ -411,7 +412,8 @@ final class Parser ///Parse a node. Event parseNode(const Flag!"block" block, - const Flag!"indentlessSequence" indentlessSequence = No.indentlessSequence) @safe + const Flag!"indentlessSequence" indentlessSequence = No.indentlessSequence) + @trusted { if(scanner_.checkToken(TokenID.Alias)) { @@ -468,12 +470,15 @@ final class Parser if(scanner_.checkToken(TokenID.Scalar)) { immutable token = scanner_.getToken(); + auto value = token.style == ScalarStyle.DoubleQuoted + ? handleDoubleQuotedScalarEscapes(token.value) + : token.value; implicit = (token.style == ScalarStyle.Plain && tag is null) || tag == "!"; bool implicit_2 = (!implicit) && tag is null; state_ = popState(); return scalarEvent(startMark, token.endMark, Anchor(anchor), Tag(tag), - tuple(implicit, implicit_2), token.value, token.style); + tuple(implicit, implicit_2), value, token.style); } if(scanner_.checkToken(TokenID.FlowSequenceStart)) @@ -526,6 +531,84 @@ final class Parser ~ token.idString, token.startMark); } + /// Handle escape sequences in a double quoted scalar. + /// + /// Moved here from scanner as it can't always be done in-place with slices. 
+ string handleDoubleQuotedScalarEscapes(string tokenValue) + { + string notInPlace; + bool inEscape = false; + import dyaml.nogcutil; + auto appender = appenderNoGC(cast(char[])tokenValue); + for(string oldValue = tokenValue; !oldValue.empty();) + { + const dchar c = oldValue.front(); + oldValue.popFront(); + + if(!inEscape) + { + if(c != '\\') + { + if(notInPlace is null) { appender.putDChar(c); } + else { notInPlace ~= c; } + continue; + } + // Escape sequence starts with a '\' + inEscape = true; + continue; + } + + import dyaml.escapes; + scope(exit) { inEscape = false; } + + // 'Normal' escape sequence. + if(dyaml.escapes.escapes.canFind(c)) + { + if(notInPlace is null) + { + // \L and \P can't be handled in place as they expand into + // many-byte unicode chars + if(c != 'L' && c != 'P') + { + appender.putDChar(dyaml.escapes.fromEscape(c)); + continue; + } + // Need to duplicate as we won't fit into + // token.value - which is what appender uses + notInPlace = appender.data.dup; + notInPlace ~= dyaml.escapes.fromEscape(c); + continue; + } + notInPlace ~= dyaml.escapes.fromEscape(c); + continue; + } + + // Unicode char written in hexadecimal in an escape sequence. + if(dyaml.escapes.escapeHexCodeList.canFind(c)) + { + // Scanner has already checked that the hex string is valid. + + const hexLength = dyaml.escapes.escapeHexLength(c); + // Hex digits are all 1-byte (ASCII), so slicing hexLength bytes takes exactly hexLength digits. + string hex = oldValue[0 .. hexLength]; + oldValue = oldValue[hexLength .. $]; + assert(!hex.canFind!(d => !d.isHexDigit), + "Scanner must ensure the hex string is valid"); + + bool overflow; + const decoded = cast(dchar)parseNoGC!int(hex, 16u, overflow); + assert(!overflow, "Scanner must ensure there's no overflow"); + if(notInPlace is null) { appender.putDChar(decoded); } + else { notInPlace ~= decoded; } + continue; + } + + assert(false, "Scanner must handle unsupported escapes"); + } + + return notInPlace is null ? 
cast(string)appender.data : notInPlace; + } + /** * Process a tag string retrieved from a tag token. * diff --git a/source/dyaml/scanner.d b/source/dyaml/scanner.d index 0e58a3b..9adf875 100644 --- a/source/dyaml/scanner.d +++ b/source/dyaml/scanner.d @@ -1519,23 +1519,23 @@ final class Scanner const startMark = reader_.mark; const quote = reader_.get(); - reader_.sliceBuilder.begin(); - scope(exit) if(error_) { reader_.sliceBuilder.finish(); } + reader_.sliceBuilder8.begin(); + scope(exit) if(error_) { reader_.sliceBuilder8.finish(); } - scanFlowScalarNonSpacesToSlice(quotes, startMark); + scanFlowScalarNonSpacesToSlice8(quotes, startMark); if(error_) { return Token.init; } while(reader_.peek() != quote) { - scanFlowScalarSpacesToSlice(startMark); + scanFlowScalarSpacesToSlice8(startMark); if(error_) { return Token.init; } - scanFlowScalarNonSpacesToSlice(quotes, startMark); + scanFlowScalarNonSpacesToSlice8(quotes, startMark); if(error_) { return Token.init; } } reader_.forward(); - auto slice = reader_.sliceBuilder.finish(); - return scalarToken(startMark, reader_.mark, slice.utf32To8, quotes); + auto slice = reader_.sliceBuilder8.finish(); + return scalarToken(startMark, reader_.mark, slice, quotes); } /// Scan nonspace characters in a flow scalar. @@ -1544,7 +1544,7 @@ final class Scanner /// characters into that slice. /// /// In case of an error, error_ is set. Use throwIfError() to handle this. 
- void scanFlowScalarNonSpacesToSlice(const ScalarStyle quotes, const Mark startMark) + void scanFlowScalarNonSpacesToSlice8(const ScalarStyle quotes, const Mark startMark) @system pure nothrow @nogc { for(;;) with(ScalarStyle) @@ -1558,33 +1558,35 @@ final class Scanner // while(!search.canFind(reader_.peek(length))) { ++length; } outer: for(;;) { - const slice = reader_.slice(length, length + 32); - if(slice.empty) + const char[] slice = reader_.slice8(length + 32); + if(slice.length == length) { error("While reading a flow scalar", startMark, "reached end of file", reader_.mark); return; } - foreach(ch; slice) + for(size_t i = length; i < slice.length;) { + // slice is UTF-8 - need to decode + const ch = slice[i] < 0x80 ? slice[i++] : decodeValidUTF8NoGC(slice, i); if(search.canFind(ch)) { break outer; } ++length; } } - reader_.sliceBuilder.write(reader_.get(length)); + reader_.sliceBuilder8.write(reader_.get8(length)); c = reader_.peek(); if(quotes == SingleQuoted && c == '\'' && reader_.peek(1) == '\'') { reader_.forward(2); - reader_.sliceBuilder.write('\''); + reader_.sliceBuilder8.write('\''); } else if((quotes == DoubleQuoted && c == '\'') || (quotes == SingleQuoted && "\"\\"d.canFind(c))) { reader_.forward(); - reader_.sliceBuilder.write(c); + reader_.sliceBuilder8.write(c); } else if(quotes == DoubleQuoted && c == '\\') { @@ -1593,24 +1595,35 @@ final class Scanner if(dyaml.escapes.escapes.canFind(c)) { reader_.forward(); - reader_.sliceBuilder.write(dyaml.escapes.fromEscape(c)); + // Escaping has been moved to Parser as it can't be done in + // place (in a slice) in case of '\P' and '\L' (very uncommon, + // but we don't want to break the spec) + char[2] escapeSequence = ['\\', cast(char)c]; + reader_.sliceBuilder8.write(escapeSequence); } else if(dyaml.escapes.escapeHexCodeList.canFind(c)) { const hexLength = dyaml.escapes.escapeHexLength(c); reader_.forward(); - foreach(i; 0 .. hexLength) if(!reader_.peek(i).isHexDigit()) + foreach(i; 0 .. 
hexLength) if(!reader_.peek(i).isHexDigit) { error("While scanning a double quoted scalar", startMark, expected("escape sequence of hexadecimal numbers", reader_.peek(i)), reader_.mark); return; } - - dchar[] hex = reader_.get(hexLength); + char[] hex = reader_.get8(hexLength); + char[2] escapeStart = ['\\', cast(char) c]; + reader_.sliceBuilder8.write(escapeStart); + reader_.sliceBuilder8.write(hex); bool overflow; - const decoded = cast(dchar)parseNoGC!int(hex, 16u, overflow); + // Note: This is just error checking; Parser does the actual + // escaping (otherwise we could accidentally create an + // escape sequence here that wasn't in input, breaking the + // escaping code in parser, which is in parser because it + // can't always be done in place) + parseNoGC!int(hex, 16u, overflow); if(overflow) { error("While scanning a double quoted scalar", startMark, @@ -1618,12 +1631,11 @@ final class Scanner "hexadecimal numbers.", reader_.mark); return; } - reader_.sliceBuilder.write(decoded); } else if("\n\r\u0085\u2028\u2029"d.canFind(c)) { - scanLineBreak(); - scanFlowScalarBreaksToSlice(startMark); + scanLineBreak8(); + scanFlowScalarBreaksToSlice8(startMark); if(error_) { return; } } else @@ -1644,15 +1656,16 @@ final class Scanner /// spaces into that slice. /// /// In case of an error, error_ is set. Use throwIfError() to handle this. - void scanFlowScalarSpacesToSlice(const Mark startMark) + void scanFlowScalarSpacesToSlice8(const Mark startMark) @system pure nothrow @nogc { // Increase length as long as we see whitespace. 
size_t length = 0; while(" \t"d.canFind(reader_.peek(length))) { ++length; } - auto whitespaces = reader_.prefix(length + 1); + auto whitespaces = reader_.prefix8(length); - const c = whitespaces[$ - 1]; + // Can check the last byte without striding because '\0' is ASCII + const c = reader_.peek(length); if(c == '\0') { error("While scanning a quoted scalar", startMark, @@ -1664,23 +1677,23 @@ final class Scanner if(!"\n\r\u0085\u2028\u2029"d.canFind(c)) { reader_.forward(length); - reader_.sliceBuilder.write(whitespaces[0 .. $ - 1]); + reader_.sliceBuilder8.write(whitespaces); return; } // There's a line break after the spaces. reader_.forward(length); - const lineBreak = scanLineBreak(); + const lineBreak = scanLineBreak8(); - if(lineBreak != '\n') { reader_.sliceBuilder.write(lineBreak); } + if(lineBreak != '\n') { reader_.sliceBuilder8.write(lineBreak); } // If we have extra line breaks after the first, scan them into the // slice. - const bool extraBreaks = scanFlowScalarBreaksToSlice(startMark); + const bool extraBreaks = scanFlowScalarBreaksToSlice8(startMark); if(error_) { return; } // No extra breaks, one normal line break. Replace it with a space. - if(lineBreak == '\n' && !extraBreaks) { reader_.sliceBuilder.write(' '); } + if(lineBreak == '\n' && !extraBreaks) { reader_.sliceBuilder8.write(' '); } } /// Scan line breaks in a flow scalar. @@ -1689,7 +1702,7 @@ final class Scanner /// line breaks into that slice. /// /// In case of an error, error_ is set. Use throwIfError() to handle this. - bool scanFlowScalarBreaksToSlice(const Mark startMark) + bool scanFlowScalarBreaksToSlice8(const Mark startMark) @system pure nothrow @nogc { // True if at least one line break was found. @@ -1697,8 +1710,8 @@ final class Scanner for(;;) { // Instead of checking indentation, we check for document separators. 
- const prefix = reader_.prefix(3); - if((prefix == "---"d || prefix == "..."d) && + const prefix = reader_.prefix8(3); + if((prefix == "---" || prefix == "...") && " \t\0\n\r\u0085\u2028\u2029"d.canFind(reader_.peek(3))) { error("While scanning a quoted scalar", startMark, @@ -1712,9 +1725,9 @@ final class Scanner // Encountered a non-whitespace non-linebreak character, so we're done. if(!"\n\r\u0085\u2028\u2029"d.canFind(reader_.peek())) { break; } - const lineBreak = scanLineBreak(); + const lineBreak = scanLineBreak8(); anyBreaks = true; - reader_.sliceBuilder.write(lineBreak); + reader_.sliceBuilder8.write(lineBreak); } return anyBreaks; }