UTF-8 scanFlowScalar. **NOTE:** moved escaping to Parser; can't do it in-place
This commit is contained in:
parent
252bf083a7
commit
952726aa5e
|
@ -11,6 +11,7 @@
|
||||||
module dyaml.parser;
|
module dyaml.parser;
|
||||||
|
|
||||||
|
|
||||||
|
import std.algorithm;
|
||||||
import std.array;
|
import std.array;
|
||||||
import std.container;
|
import std.container;
|
||||||
import std.conv;
|
import std.conv;
|
||||||
|
@ -411,7 +412,8 @@ final class Parser
|
||||||
|
|
||||||
///Parse a node.
|
///Parse a node.
|
||||||
Event parseNode(const Flag!"block" block,
|
Event parseNode(const Flag!"block" block,
|
||||||
const Flag!"indentlessSequence" indentlessSequence = No.indentlessSequence) @safe
|
const Flag!"indentlessSequence" indentlessSequence = No.indentlessSequence)
|
||||||
|
@trusted
|
||||||
{
|
{
|
||||||
if(scanner_.checkToken(TokenID.Alias))
|
if(scanner_.checkToken(TokenID.Alias))
|
||||||
{
|
{
|
||||||
|
@ -468,12 +470,15 @@ final class Parser
|
||||||
if(scanner_.checkToken(TokenID.Scalar))
|
if(scanner_.checkToken(TokenID.Scalar))
|
||||||
{
|
{
|
||||||
immutable token = scanner_.getToken();
|
immutable token = scanner_.getToken();
|
||||||
|
auto value = token.style == ScalarStyle.DoubleQuoted
|
||||||
|
? handleDoubleQuotedScalarEscapes(token.value)
|
||||||
|
: token.value;
|
||||||
|
|
||||||
implicit = (token.style == ScalarStyle.Plain && tag is null) || tag == "!";
|
implicit = (token.style == ScalarStyle.Plain && tag is null) || tag == "!";
|
||||||
bool implicit_2 = (!implicit) && tag is null;
|
bool implicit_2 = (!implicit) && tag is null;
|
||||||
state_ = popState();
|
state_ = popState();
|
||||||
return scalarEvent(startMark, token.endMark, Anchor(anchor), Tag(tag),
|
return scalarEvent(startMark, token.endMark, Anchor(anchor), Tag(tag),
|
||||||
tuple(implicit, implicit_2), token.value, token.style);
|
tuple(implicit, implicit_2), value, token.style);
|
||||||
}
|
}
|
||||||
|
|
||||||
if(scanner_.checkToken(TokenID.FlowSequenceStart))
|
if(scanner_.checkToken(TokenID.FlowSequenceStart))
|
||||||
|
@ -526,6 +531,84 @@ final class Parser
|
||||||
~ token.idString, token.startMark);
|
~ token.idString, token.startMark);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Handle escape sequences in a double quoted scalar.
|
||||||
|
///
|
||||||
|
/// Moved here from scanner as it can't always be done in-place with slices.
|
||||||
|
string handleDoubleQuotedScalarEscapes(string tokenValue)
|
||||||
|
{
|
||||||
|
string notInPlace;
|
||||||
|
bool inEscape = false;
|
||||||
|
import dyaml.nogcutil;
|
||||||
|
auto appender = appenderNoGC(cast(char[])tokenValue);
|
||||||
|
for(string oldValue = tokenValue; !oldValue.empty();)
|
||||||
|
{
|
||||||
|
const dchar c = oldValue.front();
|
||||||
|
oldValue.popFront();
|
||||||
|
|
||||||
|
if(!inEscape)
|
||||||
|
{
|
||||||
|
if(c != '\\')
|
||||||
|
{
|
||||||
|
if(notInPlace is null) { appender.putDChar(c); }
|
||||||
|
else { notInPlace ~= c; }
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// Escape sequence starts with a '\'
|
||||||
|
inEscape = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
import dyaml.escapes;
|
||||||
|
scope(exit) { inEscape = false; }
|
||||||
|
|
||||||
|
// 'Normal' escape sequence.
|
||||||
|
if(dyaml.escapes.escapes.canFind(c))
|
||||||
|
{
|
||||||
|
if(notInPlace is null)
|
||||||
|
{
|
||||||
|
// \L and \P can't be handled in place as they expand into
|
||||||
|
// many-byte unicode chars
|
||||||
|
if(c != 'L' && c != 'P')
|
||||||
|
{
|
||||||
|
appender.putDChar(dyaml.escapes.fromEscape(c));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// Need to duplicate as we won't fit into
|
||||||
|
// token.value - which is what appender uses
|
||||||
|
notInPlace = appender.data.dup;
|
||||||
|
notInPlace ~= dyaml.escapes.fromEscape(c);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
notInPlace ~= dyaml.escapes.fromEscape(c);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Unicode char written in hexadecimal in an escape sequence.
|
||||||
|
if(dyaml.escapes.escapeHexCodeList.canFind(c))
|
||||||
|
{
|
||||||
|
// Scanner has already checked that the hex string is valid.
|
||||||
|
|
||||||
|
const hexLength = dyaml.escapes.escapeHexLength(c);
|
||||||
|
// Any hex digits are 1-byte so this works.
|
||||||
|
string hex = oldValue[0 .. hexLength];
|
||||||
|
oldValue = oldValue[hexLength .. $];
|
||||||
|
assert(!hex.canFind!(d => !d.isHexDigit),
|
||||||
|
"Scanner must ensure the hex string is valid");
|
||||||
|
|
||||||
|
bool overflow;
|
||||||
|
const decoded = cast(dchar)parseNoGC!int(hex, 16u, overflow);
|
||||||
|
assert(!overflow, "Scanner must ensure there's no overflow");
|
||||||
|
if(notInPlace is null) { appender.putDChar(decoded); }
|
||||||
|
else { notInPlace ~= decoded; }
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
assert(false, "Scanner must handle unsupported escapes");
|
||||||
|
}
|
||||||
|
|
||||||
|
return notInPlace is null ? cast(string)appender.data : notInPlace;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Process a tag string retrieved from a tag token.
|
* Process a tag string retrieved from a tag token.
|
||||||
*
|
*
|
||||||
|
|
|
@ -1519,23 +1519,23 @@ final class Scanner
|
||||||
const startMark = reader_.mark;
|
const startMark = reader_.mark;
|
||||||
const quote = reader_.get();
|
const quote = reader_.get();
|
||||||
|
|
||||||
reader_.sliceBuilder.begin();
|
reader_.sliceBuilder8.begin();
|
||||||
scope(exit) if(error_) { reader_.sliceBuilder.finish(); }
|
scope(exit) if(error_) { reader_.sliceBuilder8.finish(); }
|
||||||
|
|
||||||
scanFlowScalarNonSpacesToSlice(quotes, startMark);
|
scanFlowScalarNonSpacesToSlice8(quotes, startMark);
|
||||||
if(error_) { return Token.init; }
|
if(error_) { return Token.init; }
|
||||||
|
|
||||||
while(reader_.peek() != quote)
|
while(reader_.peek() != quote)
|
||||||
{
|
{
|
||||||
scanFlowScalarSpacesToSlice(startMark);
|
scanFlowScalarSpacesToSlice8(startMark);
|
||||||
if(error_) { return Token.init; }
|
if(error_) { return Token.init; }
|
||||||
scanFlowScalarNonSpacesToSlice(quotes, startMark);
|
scanFlowScalarNonSpacesToSlice8(quotes, startMark);
|
||||||
if(error_) { return Token.init; }
|
if(error_) { return Token.init; }
|
||||||
}
|
}
|
||||||
reader_.forward();
|
reader_.forward();
|
||||||
|
|
||||||
auto slice = reader_.sliceBuilder.finish();
|
auto slice = reader_.sliceBuilder8.finish();
|
||||||
return scalarToken(startMark, reader_.mark, slice.utf32To8, quotes);
|
return scalarToken(startMark, reader_.mark, slice, quotes);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Scan nonspace characters in a flow scalar.
|
/// Scan nonspace characters in a flow scalar.
|
||||||
|
@ -1544,7 +1544,7 @@ final class Scanner
|
||||||
/// characters into that slice.
|
/// characters into that slice.
|
||||||
///
|
///
|
||||||
/// In case of an error, error_ is set. Use throwIfError() to handle this.
|
/// In case of an error, error_ is set. Use throwIfError() to handle this.
|
||||||
void scanFlowScalarNonSpacesToSlice(const ScalarStyle quotes, const Mark startMark)
|
void scanFlowScalarNonSpacesToSlice8(const ScalarStyle quotes, const Mark startMark)
|
||||||
@system pure nothrow @nogc
|
@system pure nothrow @nogc
|
||||||
{
|
{
|
||||||
for(;;) with(ScalarStyle)
|
for(;;) with(ScalarStyle)
|
||||||
|
@ -1558,33 +1558,35 @@ final class Scanner
|
||||||
// while(!search.canFind(reader_.peek(length))) { ++length; }
|
// while(!search.canFind(reader_.peek(length))) { ++length; }
|
||||||
outer: for(;;)
|
outer: for(;;)
|
||||||
{
|
{
|
||||||
const slice = reader_.slice(length, length + 32);
|
const char[] slice = reader_.slice8(length + 32);
|
||||||
if(slice.empty)
|
if(slice.length == length)
|
||||||
{
|
{
|
||||||
error("While reading a flow scalar", startMark,
|
error("While reading a flow scalar", startMark,
|
||||||
"reached end of file", reader_.mark);
|
"reached end of file", reader_.mark);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
foreach(ch; slice)
|
for(size_t i = length; i < slice.length;)
|
||||||
{
|
{
|
||||||
|
// slice is UTF-8 - need to decode
|
||||||
|
const ch = slice[i] < 0x80 ? slice[i++] : decodeValidUTF8NoGC(slice, i);
|
||||||
if(search.canFind(ch)) { break outer; }
|
if(search.canFind(ch)) { break outer; }
|
||||||
++length;
|
++length;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
reader_.sliceBuilder.write(reader_.get(length));
|
reader_.sliceBuilder8.write(reader_.get8(length));
|
||||||
|
|
||||||
c = reader_.peek();
|
c = reader_.peek();
|
||||||
if(quotes == SingleQuoted && c == '\'' && reader_.peek(1) == '\'')
|
if(quotes == SingleQuoted && c == '\'' && reader_.peek(1) == '\'')
|
||||||
{
|
{
|
||||||
reader_.forward(2);
|
reader_.forward(2);
|
||||||
reader_.sliceBuilder.write('\'');
|
reader_.sliceBuilder8.write('\'');
|
||||||
}
|
}
|
||||||
else if((quotes == DoubleQuoted && c == '\'') ||
|
else if((quotes == DoubleQuoted && c == '\'') ||
|
||||||
(quotes == SingleQuoted && "\"\\"d.canFind(c)))
|
(quotes == SingleQuoted && "\"\\"d.canFind(c)))
|
||||||
{
|
{
|
||||||
reader_.forward();
|
reader_.forward();
|
||||||
reader_.sliceBuilder.write(c);
|
reader_.sliceBuilder8.write(c);
|
||||||
}
|
}
|
||||||
else if(quotes == DoubleQuoted && c == '\\')
|
else if(quotes == DoubleQuoted && c == '\\')
|
||||||
{
|
{
|
||||||
|
@ -1593,24 +1595,35 @@ final class Scanner
|
||||||
if(dyaml.escapes.escapes.canFind(c))
|
if(dyaml.escapes.escapes.canFind(c))
|
||||||
{
|
{
|
||||||
reader_.forward();
|
reader_.forward();
|
||||||
reader_.sliceBuilder.write(dyaml.escapes.fromEscape(c));
|
// Escaping has been moved to Parser as it can't be done in
|
||||||
|
// place (in a slice) in case of '\P' and '\L' (very uncommon,
|
||||||
|
// but we don't want to break the spec)
|
||||||
|
char[2] escapeSequence = ['\\', cast(char)c];
|
||||||
|
reader_.sliceBuilder8.write(escapeSequence);
|
||||||
}
|
}
|
||||||
else if(dyaml.escapes.escapeHexCodeList.canFind(c))
|
else if(dyaml.escapes.escapeHexCodeList.canFind(c))
|
||||||
{
|
{
|
||||||
const hexLength = dyaml.escapes.escapeHexLength(c);
|
const hexLength = dyaml.escapes.escapeHexLength(c);
|
||||||
reader_.forward();
|
reader_.forward();
|
||||||
|
|
||||||
foreach(i; 0 .. hexLength) if(!reader_.peek(i).isHexDigit())
|
foreach(i; 0 .. hexLength) if(!reader_.peek(i).isHexDigit)
|
||||||
{
|
{
|
||||||
error("While scanning a double quoted scalar", startMark,
|
error("While scanning a double quoted scalar", startMark,
|
||||||
expected("escape sequence of hexadecimal numbers",
|
expected("escape sequence of hexadecimal numbers",
|
||||||
reader_.peek(i)), reader_.mark);
|
reader_.peek(i)), reader_.mark);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
char[] hex = reader_.get8(hexLength);
|
||||||
dchar[] hex = reader_.get(hexLength);
|
char[2] escapeStart = ['\\', cast(char) c];
|
||||||
|
reader_.sliceBuilder8.write(escapeStart);
|
||||||
|
reader_.sliceBuilder8.write(hex);
|
||||||
bool overflow;
|
bool overflow;
|
||||||
const decoded = cast(dchar)parseNoGC!int(hex, 16u, overflow);
|
// Note: This is just error checking; Parser does the actual
|
||||||
|
// escaping (otherwise we could accidentally create an
|
||||||
|
// escape sequence here that wasn't in input, breaking the
|
||||||
|
// escaping code in parser, which is in parser because it
|
||||||
|
// can't always be done in place)
|
||||||
|
parseNoGC!int(hex, 16u, overflow);
|
||||||
if(overflow)
|
if(overflow)
|
||||||
{
|
{
|
||||||
error("While scanning a double quoted scalar", startMark,
|
error("While scanning a double quoted scalar", startMark,
|
||||||
|
@ -1618,12 +1631,11 @@ final class Scanner
|
||||||
"hexadecimal numbers.", reader_.mark);
|
"hexadecimal numbers.", reader_.mark);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
reader_.sliceBuilder.write(decoded);
|
|
||||||
}
|
}
|
||||||
else if("\n\r\u0085\u2028\u2029"d.canFind(c))
|
else if("\n\r\u0085\u2028\u2029"d.canFind(c))
|
||||||
{
|
{
|
||||||
scanLineBreak();
|
scanLineBreak8();
|
||||||
scanFlowScalarBreaksToSlice(startMark);
|
scanFlowScalarBreaksToSlice8(startMark);
|
||||||
if(error_) { return; }
|
if(error_) { return; }
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -1644,15 +1656,16 @@ final class Scanner
|
||||||
/// spaces into that slice.
|
/// spaces into that slice.
|
||||||
///
|
///
|
||||||
/// In case of an error, error_ is set. Use throwIfError() to handle this.
|
/// In case of an error, error_ is set. Use throwIfError() to handle this.
|
||||||
void scanFlowScalarSpacesToSlice(const Mark startMark)
|
void scanFlowScalarSpacesToSlice8(const Mark startMark)
|
||||||
@system pure nothrow @nogc
|
@system pure nothrow @nogc
|
||||||
{
|
{
|
||||||
// Increase length as long as we see whitespace.
|
// Increase length as long as we see whitespace.
|
||||||
size_t length = 0;
|
size_t length = 0;
|
||||||
while(" \t"d.canFind(reader_.peek(length))) { ++length; }
|
while(" \t"d.canFind(reader_.peek(length))) { ++length; }
|
||||||
auto whitespaces = reader_.prefix(length + 1);
|
auto whitespaces = reader_.prefix8(length);
|
||||||
|
|
||||||
const c = whitespaces[$ - 1];
|
// Can check the last byte without striding because '\0' is ASCII
|
||||||
|
const c = reader_.peek(length);
|
||||||
if(c == '\0')
|
if(c == '\0')
|
||||||
{
|
{
|
||||||
error("While scanning a quoted scalar", startMark,
|
error("While scanning a quoted scalar", startMark,
|
||||||
|
@ -1664,23 +1677,23 @@ final class Scanner
|
||||||
if(!"\n\r\u0085\u2028\u2029"d.canFind(c))
|
if(!"\n\r\u0085\u2028\u2029"d.canFind(c))
|
||||||
{
|
{
|
||||||
reader_.forward(length);
|
reader_.forward(length);
|
||||||
reader_.sliceBuilder.write(whitespaces[0 .. $ - 1]);
|
reader_.sliceBuilder8.write(whitespaces);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// There's a line break after the spaces.
|
// There's a line break after the spaces.
|
||||||
reader_.forward(length);
|
reader_.forward(length);
|
||||||
const lineBreak = scanLineBreak();
|
const lineBreak = scanLineBreak8();
|
||||||
|
|
||||||
if(lineBreak != '\n') { reader_.sliceBuilder.write(lineBreak); }
|
if(lineBreak != '\n') { reader_.sliceBuilder8.write(lineBreak); }
|
||||||
|
|
||||||
// If we have extra line breaks after the first, scan them into the
|
// If we have extra line breaks after the first, scan them into the
|
||||||
// slice.
|
// slice.
|
||||||
const bool extraBreaks = scanFlowScalarBreaksToSlice(startMark);
|
const bool extraBreaks = scanFlowScalarBreaksToSlice8(startMark);
|
||||||
if(error_) { return; }
|
if(error_) { return; }
|
||||||
|
|
||||||
// No extra breaks, one normal line break. Replace it with a space.
|
// No extra breaks, one normal line break. Replace it with a space.
|
||||||
if(lineBreak == '\n' && !extraBreaks) { reader_.sliceBuilder.write(' '); }
|
if(lineBreak == '\n' && !extraBreaks) { reader_.sliceBuilder8.write(' '); }
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Scan line breaks in a flow scalar.
|
/// Scan line breaks in a flow scalar.
|
||||||
|
@ -1689,7 +1702,7 @@ final class Scanner
|
||||||
/// line breaks into that slice.
|
/// line breaks into that slice.
|
||||||
///
|
///
|
||||||
/// In case of an error, error_ is set. Use throwIfError() to handle this.
|
/// In case of an error, error_ is set. Use throwIfError() to handle this.
|
||||||
bool scanFlowScalarBreaksToSlice(const Mark startMark)
|
bool scanFlowScalarBreaksToSlice8(const Mark startMark)
|
||||||
@system pure nothrow @nogc
|
@system pure nothrow @nogc
|
||||||
{
|
{
|
||||||
// True if at least one line break was found.
|
// True if at least one line break was found.
|
||||||
|
@ -1697,8 +1710,8 @@ final class Scanner
|
||||||
for(;;)
|
for(;;)
|
||||||
{
|
{
|
||||||
// Instead of checking indentation, we check for document separators.
|
// Instead of checking indentation, we check for document separators.
|
||||||
const prefix = reader_.prefix(3);
|
const prefix = reader_.prefix8(3);
|
||||||
if((prefix == "---"d || prefix == "..."d) &&
|
if((prefix == "---" || prefix == "...") &&
|
||||||
" \t\0\n\r\u0085\u2028\u2029"d.canFind(reader_.peek(3)))
|
" \t\0\n\r\u0085\u2028\u2029"d.canFind(reader_.peek(3)))
|
||||||
{
|
{
|
||||||
error("While scanning a quoted scalar", startMark,
|
error("While scanning a quoted scalar", startMark,
|
||||||
|
@ -1712,9 +1725,9 @@ final class Scanner
|
||||||
// Encountered a non-whitespace non-linebreak character, so we're done.
|
// Encountered a non-whitespace non-linebreak character, so we're done.
|
||||||
if(!"\n\r\u0085\u2028\u2029"d.canFind(reader_.peek())) { break; }
|
if(!"\n\r\u0085\u2028\u2029"d.canFind(reader_.peek())) { break; }
|
||||||
|
|
||||||
const lineBreak = scanLineBreak();
|
const lineBreak = scanLineBreak8();
|
||||||
anyBreaks = true;
|
anyBreaks = true;
|
||||||
reader_.sliceBuilder.write(lineBreak);
|
reader_.sliceBuilder8.write(lineBreak);
|
||||||
}
|
}
|
||||||
return anyBreaks;
|
return anyBreaks;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue