UTF-8 scanFlowScalar. **NOTE:** moved escaping to Parser; can't do it in-place

This commit is contained in:
Ferdinand Majerech 2014-07-29 03:18:37 +02:00
parent 252bf083a7
commit 952726aa5e
2 changed files with 133 additions and 37 deletions

View file

@ -11,6 +11,7 @@
module dyaml.parser; module dyaml.parser;
import std.algorithm;
import std.array; import std.array;
import std.container; import std.container;
import std.conv; import std.conv;
@ -411,7 +412,8 @@ final class Parser
///Parse a node. ///Parse a node.
Event parseNode(const Flag!"block" block, Event parseNode(const Flag!"block" block,
const Flag!"indentlessSequence" indentlessSequence = No.indentlessSequence) @safe const Flag!"indentlessSequence" indentlessSequence = No.indentlessSequence)
@trusted
{ {
if(scanner_.checkToken(TokenID.Alias)) if(scanner_.checkToken(TokenID.Alias))
{ {
@ -468,12 +470,15 @@ final class Parser
if(scanner_.checkToken(TokenID.Scalar)) if(scanner_.checkToken(TokenID.Scalar))
{ {
immutable token = scanner_.getToken(); immutable token = scanner_.getToken();
auto value = token.style == ScalarStyle.DoubleQuoted
? handleDoubleQuotedScalarEscapes(token.value)
: token.value;
implicit = (token.style == ScalarStyle.Plain && tag is null) || tag == "!"; implicit = (token.style == ScalarStyle.Plain && tag is null) || tag == "!";
bool implicit_2 = (!implicit) && tag is null; bool implicit_2 = (!implicit) && tag is null;
state_ = popState(); state_ = popState();
return scalarEvent(startMark, token.endMark, Anchor(anchor), Tag(tag), return scalarEvent(startMark, token.endMark, Anchor(anchor), Tag(tag),
tuple(implicit, implicit_2), token.value, token.style); tuple(implicit, implicit_2), value, token.style);
} }
if(scanner_.checkToken(TokenID.FlowSequenceStart)) if(scanner_.checkToken(TokenID.FlowSequenceStart))
@ -526,6 +531,84 @@ final class Parser
~ token.idString, token.startMark); ~ token.idString, token.startMark);
} }
/// Resolve the escape sequences contained in a double quoted scalar.
///
/// Every '\' in tokenValue starts an escape sequence: it is followed either by
/// a single escape code character, or by a hex escape code character and a
/// fixed number of hexadecimal digits. Each sequence is replaced by the
/// character it stands for; all other characters are copied unchanged.
///
/// This lives in Parser rather than Scanner because the replacement can't
/// always be done in place with slices. The result is written over the memory
/// of tokenValue for as long as possible; once a '\L' or '\P' escape is seen,
/// the text produced so far is copied into a newly allocated string and the
/// rest of the result is appended to that copy.
///
/// Params:  tokenValue = Value of a double quoted scalar token. Its escape
///                       sequences are expected to be already validated
///                       (this is only checked by assertions here).
///
/// Returns: The scalar value with all escape sequences replaced. This is
///          either a slice over the memory of tokenValue or a new string.
string handleDoubleQuotedScalarEscapes(string tokenValue)
{
    import dyaml.escapes;
    import dyaml.nogcutil;

    // Separately allocated result. Stays null for as long as the result is
    // still being written over the memory of tokenValue.
    string copied;
    // Writes the result over the memory of tokenValue.
    auto inPlace = appenderNoGC(cast(char[])tokenValue);
    // True if the previously read character was a '\' starting an escape.
    bool afterBackslash = false;

    string rest = tokenValue;
    while(!rest.empty())
    {
        const dchar c = rest.front();
        rest.popFront();

        if(!afterBackslash)
        {
            // A '\' only starts an escape sequence; anything else is copied.
            if(c == '\\')           { afterBackslash = true; }
            else if(copied is null) { inPlace.putDChar(c); }
            else                    { copied ~= c; }
            continue;
        }

        // c is the escape code; the sequence is finished once it's handled.
        afterBackslash = false;

        // 'Normal' escape sequence: a single escape code character.
        if(dyaml.escapes.escapes.canFind(c))
        {
            const unescaped = dyaml.escapes.fromEscape(c);
            if(copied !is null)
            {
                copied ~= unescaped;
            }
            else if(c == 'L' || c == 'P')
            {
                // \L and \P can't be handled in place as they expand into
                // many-byte unicode chars that won't fit into the memory of
                // tokenValue (which is what inPlace writes to). Duplicate
                // what has been written so far and continue in the copy.
                copied = inPlace.data.dup;
                copied ~= unescaped;
            }
            else
            {
                inPlace.putDChar(unescaped);
            }
            continue;
        }

        // Unicode char written in hexadecimal in an escape sequence.
        if(dyaml.escapes.escapeHexCodeList.canFind(c))
        {
            // Scanner has already checked that the hex string is valid.
            const digitCount = dyaml.escapes.escapeHexLength(c);
            // Any hex digits are 1-byte so plain slicing works here.
            string digits = rest[0 .. digitCount];
            rest = rest[digitCount .. $];
            assert(!digits.canFind!(d => !d.isHexDigit),
                   "Scanner must ensure the hex string is valid");

            bool overflow;
            const decoded = cast(dchar)parseNoGC!int(digits, 16u, overflow);
            assert(!overflow, "Scanner must ensure there's no overflow");

            if(copied is null) { inPlace.putDChar(decoded); }
            else               { copied ~= decoded; }
            continue;
        }

        assert(false, "Scanner must handle unsupported escapes");
    }

    return copied is null ? cast(string)inPlace.data : copied;
}
/** /**
* Process a tag string retrieved from a tag token. * Process a tag string retrieved from a tag token.
* *

View file

@ -1519,23 +1519,23 @@ final class Scanner
const startMark = reader_.mark; const startMark = reader_.mark;
const quote = reader_.get(); const quote = reader_.get();
reader_.sliceBuilder.begin(); reader_.sliceBuilder8.begin();
scope(exit) if(error_) { reader_.sliceBuilder.finish(); } scope(exit) if(error_) { reader_.sliceBuilder8.finish(); }
scanFlowScalarNonSpacesToSlice(quotes, startMark); scanFlowScalarNonSpacesToSlice8(quotes, startMark);
if(error_) { return Token.init; } if(error_) { return Token.init; }
while(reader_.peek() != quote) while(reader_.peek() != quote)
{ {
scanFlowScalarSpacesToSlice(startMark); scanFlowScalarSpacesToSlice8(startMark);
if(error_) { return Token.init; } if(error_) { return Token.init; }
scanFlowScalarNonSpacesToSlice(quotes, startMark); scanFlowScalarNonSpacesToSlice8(quotes, startMark);
if(error_) { return Token.init; } if(error_) { return Token.init; }
} }
reader_.forward(); reader_.forward();
auto slice = reader_.sliceBuilder.finish(); auto slice = reader_.sliceBuilder8.finish();
return scalarToken(startMark, reader_.mark, slice.utf32To8, quotes); return scalarToken(startMark, reader_.mark, slice, quotes);
} }
/// Scan nonspace characters in a flow scalar. /// Scan nonspace characters in a flow scalar.
@ -1544,7 +1544,7 @@ final class Scanner
/// characters into that slice. /// characters into that slice.
/// ///
/// In case of an error, error_ is set. Use throwIfError() to handle this. /// In case of an error, error_ is set. Use throwIfError() to handle this.
void scanFlowScalarNonSpacesToSlice(const ScalarStyle quotes, const Mark startMark) void scanFlowScalarNonSpacesToSlice8(const ScalarStyle quotes, const Mark startMark)
@system pure nothrow @nogc @system pure nothrow @nogc
{ {
for(;;) with(ScalarStyle) for(;;) with(ScalarStyle)
@ -1558,33 +1558,35 @@ final class Scanner
// while(!search.canFind(reader_.peek(length))) { ++length; } // while(!search.canFind(reader_.peek(length))) { ++length; }
outer: for(;;) outer: for(;;)
{ {
const slice = reader_.slice(length, length + 32); const char[] slice = reader_.slice8(length + 32);
if(slice.empty) if(slice.length == length)
{ {
error("While reading a flow scalar", startMark, error("While reading a flow scalar", startMark,
"reached end of file", reader_.mark); "reached end of file", reader_.mark);
return; return;
} }
foreach(ch; slice) for(size_t i = length; i < slice.length;)
{ {
// slice is UTF-8 - need to decode
const ch = slice[i] < 0x80 ? slice[i++] : decodeValidUTF8NoGC(slice, i);
if(search.canFind(ch)) { break outer; } if(search.canFind(ch)) { break outer; }
++length; ++length;
} }
} }
reader_.sliceBuilder.write(reader_.get(length)); reader_.sliceBuilder8.write(reader_.get8(length));
c = reader_.peek(); c = reader_.peek();
if(quotes == SingleQuoted && c == '\'' && reader_.peek(1) == '\'') if(quotes == SingleQuoted && c == '\'' && reader_.peek(1) == '\'')
{ {
reader_.forward(2); reader_.forward(2);
reader_.sliceBuilder.write('\''); reader_.sliceBuilder8.write('\'');
} }
else if((quotes == DoubleQuoted && c == '\'') || else if((quotes == DoubleQuoted && c == '\'') ||
(quotes == SingleQuoted && "\"\\"d.canFind(c))) (quotes == SingleQuoted && "\"\\"d.canFind(c)))
{ {
reader_.forward(); reader_.forward();
reader_.sliceBuilder.write(c); reader_.sliceBuilder8.write(c);
} }
else if(quotes == DoubleQuoted && c == '\\') else if(quotes == DoubleQuoted && c == '\\')
{ {
@ -1593,24 +1595,35 @@ final class Scanner
if(dyaml.escapes.escapes.canFind(c)) if(dyaml.escapes.escapes.canFind(c))
{ {
reader_.forward(); reader_.forward();
reader_.sliceBuilder.write(dyaml.escapes.fromEscape(c)); // Escaping has been moved to Parser as it can't be done in
// place (in a slice) in case of '\P' and '\L' (very uncommon,
// but we don't want to break the spec)
char[2] escapeSequence = ['\\', cast(char)c];
reader_.sliceBuilder8.write(escapeSequence);
} }
else if(dyaml.escapes.escapeHexCodeList.canFind(c)) else if(dyaml.escapes.escapeHexCodeList.canFind(c))
{ {
const hexLength = dyaml.escapes.escapeHexLength(c); const hexLength = dyaml.escapes.escapeHexLength(c);
reader_.forward(); reader_.forward();
foreach(i; 0 .. hexLength) if(!reader_.peek(i).isHexDigit()) foreach(i; 0 .. hexLength) if(!reader_.peek(i).isHexDigit)
{ {
error("While scanning a double quoted scalar", startMark, error("While scanning a double quoted scalar", startMark,
expected("escape sequence of hexadecimal numbers", expected("escape sequence of hexadecimal numbers",
reader_.peek(i)), reader_.mark); reader_.peek(i)), reader_.mark);
return; return;
} }
char[] hex = reader_.get8(hexLength);
dchar[] hex = reader_.get(hexLength); char[2] escapeStart = ['\\', cast(char) c];
reader_.sliceBuilder8.write(escapeStart);
reader_.sliceBuilder8.write(hex);
bool overflow; bool overflow;
const decoded = cast(dchar)parseNoGC!int(hex, 16u, overflow); // Note: This is just error checking; Parser does the actual
// escaping (otherwise we could accidentally create an
// escape sequence here that wasn't in input, breaking the
// escaping code in parser, which is in parser because it
// can't always be done in place)
parseNoGC!int(hex, 16u, overflow);
if(overflow) if(overflow)
{ {
error("While scanning a double quoted scalar", startMark, error("While scanning a double quoted scalar", startMark,
@ -1618,12 +1631,11 @@ final class Scanner
"hexadecimal numbers.", reader_.mark); "hexadecimal numbers.", reader_.mark);
return; return;
} }
reader_.sliceBuilder.write(decoded);
} }
else if("\n\r\u0085\u2028\u2029"d.canFind(c)) else if("\n\r\u0085\u2028\u2029"d.canFind(c))
{ {
scanLineBreak(); scanLineBreak8();
scanFlowScalarBreaksToSlice(startMark); scanFlowScalarBreaksToSlice8(startMark);
if(error_) { return; } if(error_) { return; }
} }
else else
@ -1644,15 +1656,16 @@ final class Scanner
/// spaces into that slice. /// spaces into that slice.
/// ///
/// In case of an error, error_ is set. Use throwIfError() to handle this. /// In case of an error, error_ is set. Use throwIfError() to handle this.
void scanFlowScalarSpacesToSlice(const Mark startMark) void scanFlowScalarSpacesToSlice8(const Mark startMark)
@system pure nothrow @nogc @system pure nothrow @nogc
{ {
// Increase length as long as we see whitespace. // Increase length as long as we see whitespace.
size_t length = 0; size_t length = 0;
while(" \t"d.canFind(reader_.peek(length))) { ++length; } while(" \t"d.canFind(reader_.peek(length))) { ++length; }
auto whitespaces = reader_.prefix(length + 1); auto whitespaces = reader_.prefix8(length);
const c = whitespaces[$ - 1]; // Can check the last byte without striding because '\0' is ASCII
const c = reader_.peek(length);
if(c == '\0') if(c == '\0')
{ {
error("While scanning a quoted scalar", startMark, error("While scanning a quoted scalar", startMark,
@ -1664,23 +1677,23 @@ final class Scanner
if(!"\n\r\u0085\u2028\u2029"d.canFind(c)) if(!"\n\r\u0085\u2028\u2029"d.canFind(c))
{ {
reader_.forward(length); reader_.forward(length);
reader_.sliceBuilder.write(whitespaces[0 .. $ - 1]); reader_.sliceBuilder8.write(whitespaces);
return; return;
} }
// There's a line break after the spaces. // There's a line break after the spaces.
reader_.forward(length); reader_.forward(length);
const lineBreak = scanLineBreak(); const lineBreak = scanLineBreak8();
if(lineBreak != '\n') { reader_.sliceBuilder.write(lineBreak); } if(lineBreak != '\n') { reader_.sliceBuilder8.write(lineBreak); }
// If we have extra line breaks after the first, scan them into the // If we have extra line breaks after the first, scan them into the
// slice. // slice.
const bool extraBreaks = scanFlowScalarBreaksToSlice(startMark); const bool extraBreaks = scanFlowScalarBreaksToSlice8(startMark);
if(error_) { return; } if(error_) { return; }
// No extra breaks, one normal line break. Replace it with a space. // No extra breaks, one normal line break. Replace it with a space.
if(lineBreak == '\n' && !extraBreaks) { reader_.sliceBuilder.write(' '); } if(lineBreak == '\n' && !extraBreaks) { reader_.sliceBuilder8.write(' '); }
} }
/// Scan line breaks in a flow scalar. /// Scan line breaks in a flow scalar.
@ -1689,7 +1702,7 @@ final class Scanner
/// line breaks into that slice. /// line breaks into that slice.
/// ///
/// In case of an error, error_ is set. Use throwIfError() to handle this. /// In case of an error, error_ is set. Use throwIfError() to handle this.
bool scanFlowScalarBreaksToSlice(const Mark startMark) bool scanFlowScalarBreaksToSlice8(const Mark startMark)
@system pure nothrow @nogc @system pure nothrow @nogc
{ {
// True if at least one line break was found. // True if at least one line break was found.
@ -1697,8 +1710,8 @@ final class Scanner
for(;;) for(;;)
{ {
// Instead of checking indentation, we check for document separators. // Instead of checking indentation, we check for document separators.
const prefix = reader_.prefix(3); const prefix = reader_.prefix8(3);
if((prefix == "---"d || prefix == "..."d) && if((prefix == "---" || prefix == "...") &&
" \t\0\n\r\u0085\u2028\u2029"d.canFind(reader_.peek(3))) " \t\0\n\r\u0085\u2028\u2029"d.canFind(reader_.peek(3)))
{ {
error("While scanning a quoted scalar", startMark, error("While scanning a quoted scalar", startMark,
@ -1712,9 +1725,9 @@ final class Scanner
// Encountered a non-whitespace non-linebreak character, so we're done. // Encountered a non-whitespace non-linebreak character, so we're done.
if(!"\n\r\u0085\u2028\u2029"d.canFind(reader_.peek())) { break; } if(!"\n\r\u0085\u2028\u2029"d.canFind(reader_.peek())) { break; }
const lineBreak = scanLineBreak(); const lineBreak = scanLineBreak8();
anyBreaks = true; anyBreaks = true;
reader_.sliceBuilder.write(lineBreak); reader_.sliceBuilder8.write(lineBreak);
} }
return anyBreaks; return anyBreaks;
} }