UTFDecoder now works directly with a buffer, not a stream.

This commit is contained in:
Ferdinand Majerech 2014-07-22 02:37:06 +02:00
parent 7d4df103a7
commit e30f6e0e80

View file

@@ -401,10 +401,10 @@ struct UTFBlockDecoder(size_t bufferSize_) if (bufferSize_ % 2 == 0)
UTFEncoding encoding_; UTFEncoding encoding_;
// Maximum number of characters that might be in the stream. // Maximum number of characters that might be in the stream.
size_t maxChars_; size_t maxChars_;
// Bytes available in the stream. // The entire input buffer.
size_t available_; ubyte[] inputAll_;
// Input stream. // Part of the input buffer that has not yet been decoded.
MemoryStream stream_; ubyte[] input_;
// Buffer used to store raw UTF-8 or UTF-16 code points. // Buffer used to store raw UTF-8 or UTF-16 code points.
union union
@@ -415,23 +415,23 @@ struct UTFBlockDecoder(size_t bufferSize_) if (bufferSize_ % 2 == 0)
// Used space (in items) in rawBuffer8_/rawBuffer16_. // Used space (in items) in rawBuffer8_/rawBuffer16_.
size_t rawUsed_; size_t rawUsed_;
// Space used by decodedBuffer_. // Space used by decoded_.
dchar[bufferSize_] decodedBufferSpace_; dchar[bufferSize_] decodedSpace_;
// Buffer of decoded, UTF-32 characters. This is a slice into decodedBufferSpace_. // Buffer of decoded, UTF-32 characters. This is a slice into decodedSpace_.
dchar[] decodedBuffer_; dchar[] decoded_;
public: public:
/// Construct a UTFBlockDecoder decoding data from a buffer. /// Construct a UTFBlockDecoder decoding data from a buffer.
this(ubyte[] buffer, UTFEncoding encoding) @trusted this(ubyte[] buffer, UTFEncoding encoding) @trusted
{ {
stream_ = new MemoryStream(buffer); inputAll_ = buffer;
available_ = stream_.available; input_ = inputAll_[];
encoding_ = encoding; encoding_ = encoding;
final switch(encoding_) final switch(encoding_)
{ {
case UTFEncoding.UTF_8: maxChars_ = available_; break; case UTFEncoding.UTF_8: maxChars_ = input_.length; break;
case UTFEncoding.UTF_16: maxChars_ = available_ / 2; break; case UTFEncoding.UTF_16: maxChars_ = input_.length / 2; break;
case UTFEncoding.UTF_32: maxChars_ = available_ / 2; break; case UTFEncoding.UTF_32: maxChars_ = input_.length / 2; break;
} }
} }
@@ -441,27 +441,27 @@ struct UTFBlockDecoder(size_t bufferSize_) if (bufferSize_ % 2 == 0)
/// Get encoding we're decoding from. /// Get encoding we're decoding from.
UTFEncoding encoding() const pure @safe nothrow @nogc { return encoding_; } UTFEncoding encoding() const pure @safe nothrow @nogc { return encoding_; }
/// Get the current position in stream. /// Get the current position in buffer.
size_t position() @trusted { return stream_.position; } size_t position() @trusted { return inputAll_.length - input_.length; }
/// Are we done decoding? /// Are we done decoding?
bool done() const pure @safe nothrow @nogc bool done() const pure @safe nothrow @nogc
{ {
return rawUsed_ == 0 && decodedBuffer_.length == 0 && available_ == 0; return rawUsed_ == 0 && decoded_.length == 0 && input_.length == 0;
} }
/// Get next character. /// Get next character.
dchar getDChar() dchar getDChar()
@safe @safe
{ {
if(decodedBuffer_.length) if(decoded_.length)
{ {
const result = decodedBuffer_[0]; const result = decoded_[0];
decodedBuffer_ = decodedBuffer_[1 .. $]; decoded_ = decoded_[1 .. $];
return result; return result;
} }
assert(available_ > 0 || rawUsed_ > 0); assert(input_.length > 0 || rawUsed_ > 0);
updateBuffer(); updateBuffer();
return getDChar(); return getDChar();
} }
@@ -470,15 +470,15 @@ struct UTFBlockDecoder(size_t bufferSize_) if (bufferSize_ % 2 == 0)
const(dchar[]) getDChars(size_t maxChars = size_t.max) const(dchar[]) getDChars(size_t maxChars = size_t.max)
@safe @safe
{ {
if(decodedBuffer_.length) if(decoded_.length)
{ {
const slice = min(decodedBuffer_.length, maxChars); const slice = min(decoded_.length, maxChars);
const result = decodedBuffer_[0 .. slice]; const result = decoded_[0 .. slice];
decodedBuffer_ = decodedBuffer_[slice .. $]; decoded_ = decoded_[slice .. $];
return result; return result;
} }
assert(available_ > 0 || rawUsed_ > 0); assert(input_.length > 0 || rawUsed_ > 0);
updateBuffer(); updateBuffer();
return getDChars(maxChars); return getDChars(maxChars);
} }
@@ -487,37 +487,37 @@ struct UTFBlockDecoder(size_t bufferSize_) if (bufferSize_ % 2 == 0)
// Read and decode characters from file and store them in the buffer. // Read and decode characters from file and store them in the buffer.
void updateBuffer() @trusted void updateBuffer() @trusted
{ {
assert(decodedBuffer_.length == 0, assert(decoded_.length == 0,
"updateBuffer can only be called when the buffer is empty"); "updateBuffer can only be called when the buffer is empty");
final switch(encoding_) final switch(encoding_)
{ {
case UTFEncoding.UTF_8: case UTFEncoding.UTF_8:
const bytes = min(bufferSize_ - rawUsed_, available_); const bytes = min(bufferSize_ - rawUsed_, input_.length);
// Current length of valid data in rawBuffer8_. // Current length of valid data in rawBuffer8_.
const rawLength = rawUsed_ + bytes; const rawLength = rawUsed_ + bytes;
stream_.readExact(rawBuffer8_.ptr + rawUsed_, bytes); rawBuffer8_[rawUsed_ .. rawUsed_ + bytes] = cast(char[])input_[0 .. bytes];
available_ -= bytes; input_ = input_[bytes .. $];
decodeRawBuffer(rawBuffer8_, rawLength); decodeRawBuffer(rawBuffer8_, rawLength);
break; break;
case UTFEncoding.UTF_16: case UTFEncoding.UTF_16:
const words = min((bufferSize_ / 2) - rawUsed_, available_ / 2); const words = min((bufferSize_ / 2) - rawUsed_, input_.length / 2);
// Current length of valid data in rawBuffer16_. // Current length of valid data in rawBuffer16_.
const rawLength = rawUsed_ + words; const rawLength = rawUsed_ + words;
foreach(c; rawUsed_ .. rawLength) foreach(c; rawUsed_ .. rawLength)
{ {
stream_.read(rawBuffer16_[c]); rawBuffer16_[c] = *cast(wchar*)input_.ptr;
available_ -= 2; input_ = input_[2 .. $];
} }
decodeRawBuffer(rawBuffer16_, rawLength); decodeRawBuffer(rawBuffer16_, rawLength);
break; break;
case UTFEncoding.UTF_32: case UTFEncoding.UTF_32:
const chars = min(bufferSize_ / 4, available_ / 4); const chars = min(bufferSize_ / 4, input_.length / 4);
foreach(c; 0 .. chars) foreach(c; 0 .. chars)
{ {
stream_.read(decodedBufferSpace_[c]); decodedSpace_[c] = *cast(dchar*)input_.ptr;
available_ -= 4; input_ = input_[4 .. $];
} }
decodedBuffer_ = decodedBufferSpace_[0 .. chars]; decoded_ = decodedSpace_[0 .. chars];
break; break;
} }
} }
@@ -584,15 +584,15 @@ struct UTFBlockDecoder(size_t bufferSize_) if (bufferSize_ % 2 == 0)
const c = source[srcpos]; const c = source[srcpos];
if(c < 0x80) if(c < 0x80)
{ {
decodedBufferSpace_[bufpos++] = c; decodedSpace_[bufpos++] = c;
++srcpos; ++srcpos;
} }
else else
{ {
decodedBufferSpace_[bufpos++] = decode(source, srcpos); decodedSpace_[bufpos++] = decode(source, srcpos);
} }
} }
decodedBuffer_ = decodedBufferSpace_[0 .. bufpos]; decoded_ = decodedSpace_[0 .. bufpos];
} }
} }