Block scalar scanning now works with UTF-8.

This commit is contained in:
Ferdinand Majerech 2014-07-29 20:58:00 +02:00
parent 19ed03cb3e
commit 6837156258

View file

@ -853,14 +853,14 @@ final class Scanner
/// ///
/// Assumes that the caller is building a slice in Reader, and puts the scanned /// Assumes that the caller is building a slice in Reader, and puts the scanned
/// characters into that slice. /// characters into that slice.
void scanToNextBreakToSlice() @system pure nothrow @nogc void scanToNextBreakToSlice8() @system pure nothrow @nogc
{ {
uint length = 0; uint length = 0;
while(!"\0\n\r\u0085\u2028\u2029"d.canFind(reader_.peek(length))) while(!"\0\n\r\u0085\u2028\u2029"d.canFind(reader_.peek(length)))
{ {
++length; ++length;
} }
reader_.sliceBuilder.write(reader_.get(length)); reader_.sliceBuilder8.write(reader_.get8(length));
} }
@ -1230,23 +1230,23 @@ final class Scanner
Mark endMark; Mark endMark;
uint indent = max(1, indent_ + 1); uint indent = max(1, indent_ + 1);
reader_.sliceBuilder.begin(); reader_.sliceBuilder8.begin();
alias Transaction = SliceBuilder.Transaction; alias Transaction = SliceBuilder8.Transaction;
// Used to strip the last line breaks written to the slice at the end of the // Used to strip the last line breaks written to the slice at the end of the
// scalar, which may be needed based on chomping. // scalar, which may be needed based on chomping.
Transaction breaksTransaction = Transaction(reader_.sliceBuilder); Transaction breaksTransaction = Transaction(reader_.sliceBuilder8);
// Read the first indentation/line breaks before the scalar. // Read the first indentation/line breaks before the scalar.
size_t startLen = reader_.sliceBuilder.length; size_t startLen = reader_.sliceBuilder8.length;
if(increment == int.min) if(increment == int.min)
{ {
auto indentation = scanBlockScalarIndentationToSlice(); auto indentation = scanBlockScalarIndentationToSlice8();
endMark = indentation[1]; endMark = indentation[1];
indent = max(indent, indentation[0]); indent = max(indent, indentation[0]);
} }
else else
{ {
indent += increment - 1; indent += increment - 1;
endMark = scanBlockScalarBreaksToSlice(indent); endMark = scanBlockScalarBreaksToSlice8(indent);
} }
// int.max means there's no line break (int.max is outside UTF-32). // int.max means there's no line break (int.max is outside UTF-32).
@ -1258,16 +1258,17 @@ final class Scanner
breaksTransaction.commit(); breaksTransaction.commit();
const bool leadingNonSpace = !" \t"d.canFind(reader_.peek()); const bool leadingNonSpace = !" \t"d.canFind(reader_.peek());
// This is where the 'interesting' non-whitespace data gets read. // This is where the 'interesting' non-whitespace data gets read.
scanToNextBreakToSlice(); scanToNextBreakToSlice8();
lineBreak = scanLineBreak(); lineBreak = scanLineBreak8();
// This transaction serves to rollback data read in the // This transaction serves to rollback data read in the
// scanBlockScalarBreaksToSlice() call. // scanBlockScalarBreaksToSlice() call.
breaksTransaction = Transaction(reader_.sliceBuilder); breaksTransaction = Transaction(reader_.sliceBuilder8);
startLen = reader_.sliceBuilder.length; startLen = reader_.sliceBuilder8.length;
// The line breaks should actually be written _after_ the if() block // The line breaks should actually be written _after_ the if() block
// below. We work around that by inserting // below. We work around that by inserting
endMark = scanBlockScalarBreaksToSlice(indent); endMark = scanBlockScalarBreaksToSlice8(indent);
// This will not run during the last iteration (see the if() vs the // This will not run during the last iteration (see the if() vs the
// while()), hence breaksTransaction rollback (which happens after this // while()), hence breaksTransaction rollback (which happens after this
@ -1282,16 +1283,16 @@ final class Scanner
{ {
// No breaks were scanned; no need to insert the space in the // No breaks were scanned; no need to insert the space in the
// middle of slice. // middle of slice.
if(startLen == reader_.sliceBuilder.length) if(startLen == reader_.sliceBuilder8.length)
{ {
reader_.sliceBuilder.write(' '); reader_.sliceBuilder8.write(' ');
} }
} }
else else
{ {
// We need to insert in the middle of the slice in case any line // We need to insert in the middle of the slice in case any line
// breaks were scanned. // breaks were scanned.
reader_.sliceBuilder.insert(lineBreak, startLen); reader_.sliceBuilder8.insert(lineBreak, startLen);
} }
////this is Clark Evans's interpretation (also in the spec ////this is Clark Evans's interpretation (also in the spec
@ -1303,7 +1304,7 @@ final class Scanner
// { // {
// if(!" \t"d.canFind(reader_.peek())) // if(!" \t"d.canFind(reader_.peek()))
// { // {
// reader_.sliceBuilder.write(' '); // reader_.sliceBuilder8.write(' ');
// } // }
// else // else
// { // {
@ -1313,7 +1314,7 @@ final class Scanner
//} //}
//else //else
//{ //{
// reader_.sliceBuilder.insertBack(lineBreak, endLen - startLen); // reader_.sliceBuilder8.insertBack(lineBreak, endLen - startLen);
//} //}
} }
else else
@ -1334,19 +1335,19 @@ final class Scanner
// be inserted _before_ the other line breaks. // be inserted _before_ the other line breaks.
if(chomping == Chomping.Keep) if(chomping == Chomping.Keep)
{ {
reader_.sliceBuilder.insert(lineBreak, startLen); reader_.sliceBuilder8.insert(lineBreak, startLen);
} }
// If chomping is not Keep, breaksTransaction was cancelled so we can // If chomping is not Keep, breaksTransaction was cancelled so we can
// directly write the first line break (as it isn't stripped - chomping // directly write the first line break (as it isn't stripped - chomping
// is not Strip) // is not Strip)
else else
{ {
reader_.sliceBuilder.write(lineBreak); reader_.sliceBuilder8.write(lineBreak);
} }
} }
const slice = reader_.sliceBuilder.finish(); const slice = reader_.sliceBuilder8.finish();
return scalarToken(startMark, endMark, slice.utf32To8, style); return scalarToken(startMark, endMark, slice, style);
} }
/// Scan chomping and indentation indicators of a scalar token. /// Scan chomping and indentation indicators of a scalar token.
@ -1439,7 +1440,7 @@ final class Scanner
if("\0\n\r\u0085\u2028\u2029"d.canFind(reader_.peek())) if("\0\n\r\u0085\u2028\u2029"d.canFind(reader_.peek()))
{ {
scanLineBreak(); scanLineBreak8();
return; return;
} }
error("While scanning a block scalar", startMark, error("While scanning a block scalar", startMark,
@ -1450,7 +1451,7 @@ final class Scanner
/// ///
/// Assumes that the caller is building a slice in Reader, and puts the scanned /// Assumes that the caller is building a slice in Reader, and puts the scanned
/// characters into that slice. /// characters into that slice.
Tuple!(uint, Mark) scanBlockScalarIndentationToSlice() Tuple!(uint, Mark) scanBlockScalarIndentationToSlice8()
@system pure nothrow @nogc @system pure nothrow @nogc
{ {
uint maxIndent; uint maxIndent;
@ -1460,7 +1461,7 @@ final class Scanner
{ {
if(reader_.peek() != ' ') if(reader_.peek() != ' ')
{ {
reader_.sliceBuilder.write(scanLineBreak()); reader_.sliceBuilder8.write(scanLineBreak8());
endMark = reader_.mark; endMark = reader_.mark;
continue; continue;
} }
@ -1475,7 +1476,7 @@ final class Scanner
/// ///
/// Assumes that the caller is building a slice in Reader, and puts the scanned /// Assumes that the caller is building a slice in Reader, and puts the scanned
/// characters into that slice. /// characters into that slice.
Mark scanBlockScalarBreaksToSlice(const uint indent) @trusted pure nothrow @nogc Mark scanBlockScalarBreaksToSlice8(const uint indent) @trusted pure nothrow @nogc
{ {
Mark endMark = reader_.mark; Mark endMark = reader_.mark;
@ -1483,7 +1484,7 @@ final class Scanner
{ {
while(reader_.column < indent && reader_.peek() == ' ') { reader_.forward(); } while(reader_.column < indent && reader_.peek() == ' ') { reader_.forward(); }
if(!"\n\r\u0085\u2028\u2029"d.canFind(reader_.peek())) { break; } if(!"\n\r\u0085\u2028\u2029"d.canFind(reader_.peek())) { break; }
reader_.sliceBuilder.write(scanLineBreak()); reader_.sliceBuilder8.write(scanLineBreak8());
endMark = reader_.mark; endMark = reader_.mark;
} }