From 2c9d464389060c2308fb143056c1cc0f3344c9ac Mon Sep 17 00:00:00 2001 From: Ferdinand Majerech Date: Wed, 16 Nov 2011 03:10:29 +0100 Subject: [PATCH] Reader was reimplemented. After experiments with loading the whole file at once, and with decoding and parsing in a separate thread, the lazy reader turned out to be the fastest/least memory intensive solution. Characters are now decoded in small batches. This improved parsing speed by ~20%. No global state anymore. Anchors are now zero terminated strings and TagDirectives are a simple array. Event structure was changed to prevent size increase. Minor fixes and improvements. --- doc/doctrees/environment.pickle | Bin 12705 -> 12705 bytes doc/html/api/dyaml.dumper.html | 22 +- doc/html/api/dyaml.loader.html | 13 +- doc/html/api/dyaml.representer.html | 6 +- doc/html/articles/spec_differences.html | 2 +- doc/html/index.html | 2 +- doc/html/search.html | 2 +- doc/html/tutorials/custom_types.html | 2 +- doc/html/tutorials/getting_started.html | 2 +- doc/html/tutorials/yaml_syntax.html | 2 +- dyaml/anchor.d | 21 +- dyaml/dumper.d | 36 +- dyaml/emitter.d | 36 +- dyaml/event.d | 103 ++-- dyaml/loader.d | 32 +- dyaml/parser.d | 33 +- dyaml/queue.d | 8 +- dyaml/reader.d | 652 +++++++++++++++--------- dyaml/representer.d | 12 +- dyaml/serializer.d | 6 +- dyaml/sharedobject.d | 131 ----- dyaml/tag.d | 65 +-- dyaml/tagdirective.d | 15 + dyaml/tagdirectives.d | 28 - dyaml/zerostring.d | 73 +++ test/src/compare.d | 6 +- test/src/emitter.d | 12 +- test/src/errors.d | 3 +- 28 files changed, 707 insertions(+), 618 deletions(-) delete mode 100644 dyaml/sharedobject.d create mode 100644 dyaml/tagdirective.d delete mode 100644 dyaml/tagdirectives.d create mode 100644 dyaml/zerostring.d diff --git a/doc/doctrees/environment.pickle b/doc/doctrees/environment.pickle index 6555f4a944e86a086570f3e21954b05102d086ef..a11ac2aa0d0e309c14b8d0fb14d66f9ac289a4c2 100644 GIT binary patch delta 63 
zcmZ3OyfAq~tG>~O&AZ;}zsXQ`cf1T_KB;+{p#x#QQb@=!g)pB#=+3Z%FrOAOY~HT_ GiV*-kHX-2v delta 63 zcmZ3OyfAq~tG?0dMC$_*J{ijHj+cSV?Q8oobRf*VYTq(UA

Set stream name. Used in debugging messages.

-
void resolver(Resolver resolver); +
@property void resolver(Resolver resolver);

Specify custom Resolver to use.

-
void representer(Representer representer); +
@property void representer(Representer representer);

Specify custom Representer to use.

-
void canonical(in bool canonical); +
@property void canonical(bool canonical);

Write scalars in canonical form?

-
void indent(in uint indent); +
@property void indent(uint indent);

Set indentation width. 2 by default. Must not be zero.

-
void textWidth(in uint width); +
@property void textWidth(uint width);

Set preferred text width.

-
void lineBreak(in LineBreak lineBreak); +
@property void lineBreak(LineBreak lineBreak);

Set line break to use. Unix by default.

-
void encoding(in Encoding encoding); +
@property void encoding(Encoding encoding);

Set character encoding to use. UTF-8 by default.

-
void explicitStart(in bool explicit); +
@property void explicitStart(bool explicit);

Always explicitly write document start?

-
void explicitEnd(in bool explicit); +
@property void explicitEnd(bool explicit);

Always explicitly write document end?

-
void YAMLVersion(in string YAMLVersion); +
@property void YAMLVersion(string YAMLVersion);

Specify YAML version string. "1.1" by default.

-
void tagDirectives(string[string] tags); +
@property void tagDirectives(string[string] tags);

Specify tag directives.

diff --git a/doc/html/api/dyaml.loader.html b/doc/html/api/dyaml.loader.html index 3f79d7d..b36dd25 100644 --- a/doc/html/api/dyaml.loader.html +++ b/doc/html/api/dyaml.loader.html @@ -98,11 +98,11 @@ -
this(in const(immutable(char)[]) filename); +
this(string filename);

Construct a Loader to load YAML from a file.

-Parameters:
+Parameters:
const(immutable(char)[]) filename
string filename Name of the file to load from.
Throws:
YAMLException if the file could not be opened or read.
@@ -138,6 +138,9 @@

Load single YAML document.

If none or more than one YAML document is found, this throws a YAMLException. +
+ + This can only be called once; this is enforced by contract.

Returns:
Root node of the document. @@ -155,6 +158,9 @@ them all at once. Calling loadAll after iterating over the node or vice versa will not return any documents, as they have all been parsed already. +
+ + This can only be called once; this is enforced by contract.

Returns:
Array of root nodes of all documents in the file/stream. @@ -168,6 +174,9 @@

Foreach over YAML documents.

Parses documents lazily, when they are needed. +
+ + Foreach over a Loader can only be used once; this is enforced by contract.

Throws:
YAMLException on a parsing error.
diff --git a/doc/html/api/dyaml.representer.html b/doc/html/api/dyaml.representer.html index 62fe8b4..9acb9da 100644 --- a/doc/html/api/dyaml.representer.html +++ b/doc/html/api/dyaml.representer.html @@ -182,7 +182,7 @@
-
Node representScalar(in string tag, string scalar, ScalarStyle style = (ScalarStyle).Invalid); +
Node representScalar(string tag, string scalar, ScalarStyle style = (ScalarStyle).Invalid);

Represent a scalar with specified tag.

@@ -217,7 +217,7 @@

-
Node representSequence(in string tag, Node[] sequence, CollectionStyle style = (CollectionStyle).Invalid); +
Node representSequence(string tag, Node[] sequence, CollectionStyle style = (CollectionStyle).Invalid);

Represent a sequence with specified tag, representing children first.

@@ -256,7 +256,7 @@

-
Node representMapping(in string tag, Pair[] pairs, CollectionStyle style = (CollectionStyle).Invalid); +
Node representMapping(string tag, Pair[] pairs, CollectionStyle style = (CollectionStyle).Invalid);

Represent a mapping with specified tag, representing children first.

diff --git a/doc/html/articles/spec_differences.html b/doc/html/articles/spec_differences.html index 82539ed..71052fa 100644 --- a/doc/html/articles/spec_differences.html +++ b/doc/html/articles/spec_differences.html @@ -138,7 +138,7 @@ struct appears in Phobos.

diff --git a/doc/html/index.html b/doc/html/index.html index f2f52f4..8bc3e12 100644 --- a/doc/html/index.html +++ b/doc/html/index.html @@ -104,7 +104,7 @@ diff --git a/doc/html/search.html b/doc/html/search.html index d840f62..f6be88a 100644 --- a/doc/html/search.html +++ b/doc/html/search.html @@ -87,7 +87,7 @@ diff --git a/doc/html/tutorials/custom_types.html b/doc/html/tutorials/custom_types.html index 15c06d0..1314f85 100644 --- a/doc/html/tutorials/custom_types.html +++ b/doc/html/tutorials/custom_types.html @@ -368,7 +368,7 @@ directory of the D:YAML package.

diff --git a/doc/html/tutorials/getting_started.html b/doc/html/tutorials/getting_started.html index ed2d39c..5586a2c 100644 --- a/doc/html/tutorials/getting_started.html +++ b/doc/html/tutorials/getting_started.html @@ -237,7 +237,7 @@ example in the example/getting_st diff --git a/doc/html/tutorials/yaml_syntax.html b/doc/html/tutorials/yaml_syntax.html index 1005f1b..81228b5 100644 --- a/doc/html/tutorials/yaml_syntax.html +++ b/doc/html/tutorials/yaml_syntax.html @@ -330,7 +330,7 @@ Some of these might change in the future (especially !!map and !!set).

diff --git a/dyaml/anchor.d b/dyaml/anchor.d index 89e4d72..967af74 100644 --- a/dyaml/anchor.d +++ b/dyaml/anchor.d @@ -7,24 +7,7 @@ ///YAML anchor. module dyaml.anchor; -import dyaml.sharedobject; - +import dyaml.zerostring; ///YAML anchor (reference) struct. Encapsulates an anchor to save memory. -struct Anchor -{ - public: - mixin SharedObject!(string, Anchor); - - ///Construct an anchor from a string representation. - this(string anchor) - { - if(anchor is null || anchor == "") - { - index_ = uint.max; - return; - } - - add(anchor); - } -} +alias ZeroString!"Anchor" Anchor; diff --git a/dyaml/dumper.d b/dyaml/dumper.d index 1cc0f5c..8ca788f 100644 --- a/dyaml/dumper.d +++ b/dyaml/dumper.d @@ -25,7 +25,7 @@ import dyaml.node; import dyaml.representer; import dyaml.resolver; import dyaml.serializer; -import dyaml.tagdirectives; +import dyaml.tagdirective; /** @@ -146,7 +146,7 @@ struct Dumper ///YAML version string. string YAMLVersion_ = "1.1"; ///Tag directives to use. - TagDirectives tags_ = TagDirectives(); + TagDirective[] tags_ = null; ///Always write document start? bool explicitStart_ = false; ///Always write document end? @@ -167,7 +167,7 @@ struct Dumper * * Throws: YAMLException if the file can not be dumped to (e.g. cannot be opened). */ - this(in string filename) + this(string filename) { name_ = filename; try{this(new File(filename, FileMode.OutNew));} @@ -184,20 +184,16 @@ struct Dumper resolver_ = defaultResolver_; representer_ = defaultRepresenter_; stream_ = stream; - Anchor.addReference(); - TagDirectives.addReference(); } ///Destroy the Dumper. ~this() { - Anchor.removeReference(); - TagDirectives.removeReference(); YAMLVersion_ = null; } ///Set stream _name. Used in debugging messages. - @property void name(in string name) + @property void name(string name) { name_ = name; } @@ -217,13 +213,13 @@ struct Dumper } ///Write scalars in _canonical form? 
- @property void canonical(in bool canonical) + @property void canonical(bool canonical) { canonical_ = canonical; } ///Set indentation width. 2 by default. Must not be zero. - @property void indent(in uint indent) + @property void indent(uint indent) in { assert(indent != 0, "Can't use zero YAML indent width"); @@ -234,37 +230,37 @@ struct Dumper } ///Set preferred text _width. - @property void textWidth(in uint width) + @property void textWidth(uint width) { textWidth_ = width; } ///Set line break to use. Unix by default. - @property void lineBreak(in LineBreak lineBreak) + @property void lineBreak(LineBreak lineBreak) { lineBreak_ = lineBreak; } ///Set character _encoding to use. UTF-8 by default. - @property void encoding(in Encoding encoding) + @property void encoding(Encoding encoding) { encoding_ = encoding; } ///Always explicitly write document start? - @property void explicitStart(in bool explicit) + @property void explicitStart(bool explicit) { explicitStart_ = explicit; } ///Always explicitly write document end? - @property void explicitEnd(in bool explicit) + @property void explicitEnd(bool explicit) { explicitEnd_ = explicit; } ///Specify YAML version string. "1.1" by default. - @property void YAMLVersion(in string YAMLVersion) + @property void YAMLVersion(string YAMLVersion) { YAMLVersion_ = YAMLVersion; } @@ -301,16 +297,16 @@ struct Dumper */ @property void tagDirectives(string[string] tags) { - tagDirective[] t; + TagDirective[] t; foreach(handle, prefix; tags) { assert(handle.length >= 1 && handle[0] == '!' && handle[$ - 1] == '!', "A tag handle is empty or does not start and end with a " "'!' character : " ~ handle); assert(prefix.length >= 1, "A tag prefix is empty"); - t ~= tagDirective(handle, prefix); + t ~= TagDirective(handle, prefix); } - tags_ = TagDirectives(t); + tags_ = t; } /** @@ -352,7 +348,7 @@ struct Dumper * * Throws: YAMLException if unable to emit. 
*/ - void emit(in Event[] events) + void emit(Event[] events) { try { diff --git a/dyaml/emitter.d b/dyaml/emitter.d index 9b19d81..5939258 100644 --- a/dyaml/emitter.d +++ b/dyaml/emitter.d @@ -71,11 +71,11 @@ private mixin FastCharSearch!"\n\u0085\u2028\u2029"d newlineSearch_; struct Emitter { private: - alias dyaml.tagdirectives.tagDirective tagDirective; + alias dyaml.tagdirective.TagDirective TagDirective; ///Default tag handle shortcuts and replacements. - static tagDirective[] defaultTagDirectives_ = - [tagDirective("!", "!"), tagDirective("!!", "tag:yaml.org,2002:")]; + static TagDirective[] defaultTagDirectives_ = + [TagDirective("!", "!"), TagDirective("!!", "tag:yaml.org,2002:")]; ///Stream to write to. Stream stream_; @@ -135,7 +135,7 @@ struct Emitter LineBreak bestLineBreak_; ///Tag directive handle - prefix pairs. - tagDirective[] tagDirectives_; + TagDirective[] tagDirectives_; ///Anchor/alias to process. string preparedAnchor_ = null; @@ -193,7 +193,7 @@ struct Emitter } ///Emit an event. Throws EmitterException on error. 
- void emit(immutable Event event) + void emit(Event event) { events_.push(event); while(!needMoreEvents()) @@ -254,7 +254,7 @@ struct Emitter { if(events_.length == 0){return true;} - immutable event = events_.peek(); + immutable event = cast(immutable Event)events_.peek(); if(event.id == EventID.DocumentStart){return needEvents(1);} if(event.id == EventID.SequenceStart){return needEvents(2);} if(event.id == EventID.MappingStart) {return needEvents(3);} @@ -274,7 +274,7 @@ struct Emitter events_.next(); while(!events_.iterationOver()) { - immutable event = events_.next(); + immutable event = cast(immutable Event)events_.next(); static starts = [EventID.DocumentStart, EventID.SequenceStart, EventID.MappingStart]; static ends = [EventID.DocumentEnd, EventID.SequenceEnd, EventID.MappingEnd]; if(starts.canFind(event.id)) {++level;} @@ -347,8 +347,8 @@ struct Emitter if(event_.id == EventID.DocumentStart) { const YAMLVersion = event_.value; - const tagDirectives = event_.tagDirectives; - if(openEnded_ && (YAMLVersion !is null || !tagDirectives.isNull())) + auto tagDirectives = event_.tagDirectives; + if(openEnded_ && (YAMLVersion !is null || tagDirectives !is null)) { writeIndicator("...", true); writeIndent(); @@ -359,10 +359,10 @@ struct Emitter writeVersionDirective(prepareVersion(YAMLVersion)); } - if(!tagDirectives.isNull()) + if(tagDirectives !is null) { - tagDirectives_ = tagDirectives.get; - sort!"icmp(a[0], b[0]) < 0"(tagDirectives_); + tagDirectives_ = tagDirectives; + sort!"icmp(a.handle, b.handle) < 0"(tagDirectives_); foreach(ref pair; tagDirectives_) { @@ -371,7 +371,7 @@ struct Emitter } } - bool eq(ref tagDirective a, ref tagDirective b){return a.handle == b.handle;} + bool eq(ref TagDirective a, ref TagDirective b){return a.handle == b.handle;} //Add any default tag directives that have not been overriden. 
foreach(ref def; defaultTagDirectives_) { @@ -382,7 +382,7 @@ struct Emitter } const implicit = first && !event_.explicitDocument && !canonical_ && - YAMLVersion is null && tagDirectives.isNull() && + YAMLVersion is null && tagDirectives is null && !checkEmptyDocument(); if(!implicit) { @@ -684,7 +684,7 @@ struct Emitter return false; } - immutable event = events_.peek(); + immutable event = cast(immutable Event)events_.peek(); const emptyScalar = event.id == EventID.Scalar && event.anchor.isNull() && event.tag.isNull() && event.implicit && event.value == ""; return emptyScalar; @@ -933,14 +933,14 @@ struct Emitter string suffix = tagString; //Sort lexicographically by prefix. - sort!"icmp(a[1], b[1]) < 0"(tagDirectives_); + sort!"icmp(a.prefix, b.prefix) < 0"(tagDirectives_); foreach(ref pair; tagDirectives_) { - auto prefix = pair[1]; + auto prefix = pair.prefix; if(tagString.startsWith(prefix) && (prefix != "!" || prefix.length < tagString.length)) { - handle = pair[0]; + handle = pair.handle; suffix = tagString[prefix.length .. $]; } } diff --git a/dyaml/event.d b/dyaml/event.d index 43232a3..bc18822 100644 --- a/dyaml/event.d +++ b/dyaml/event.d @@ -19,7 +19,7 @@ import dyaml.encoding; import dyaml.exception; import dyaml.reader; import dyaml.tag; -import dyaml.tagdirectives; +import dyaml.tagdirective; import dyaml.style; @@ -55,30 +55,40 @@ struct Event Mark startMark; ///End position of the event in file/stream. Mark endMark; - ///Anchor of the event, if any. - Anchor anchor; - ///Tag of the event, if any. - Tag tag; + union + { + struct + { + ///Anchor of the event, if any. + Anchor anchor; + ///Tag of the event, if any. + Tag tag; + } + ///Tag directives, if this is a DocumentStart. + //TagDirectives tagDirectives; + TagDirective[] tagDirectives; + } ///Event type. EventID id = EventID.Invalid; ///Style of scalar event, if this is a scalar event. - ScalarStyle scalarStyle; - ///Should the tag be implicitly resolved? 
- bool implicit; + ScalarStyle scalarStyle = ScalarStyle.Invalid; + union + { + ///Should the tag be implicitly resolved? + bool implicit; + /** + * Is this document event explicit? + * + * Used if this is a DocumentStart or DocumentEnd. + */ + bool explicitDocument; + } ///TODO figure this out - Unknown, used by PyYAML with Scalar events. bool implicit_2; - /** - * Is this document event explicit? - * - * Used if this is a DocumentStart or DocumentEnd. - */ - alias implicit explicitDocument; - ///Tag directives, if this is a DocumentStart. - TagDirectives tagDirectives; ///Encoding of the stream, if this is a StreamStart. Encoding encoding; ///Collection style, if this is a SequenceStart or MappingStart. - CollectionStyle collectionStyle; + CollectionStyle collectionStyle = CollectionStyle.Invalid; ///Is this a null (uninitialized) event? @property bool isNull() const {return id == EventID.Invalid;} @@ -96,7 +106,12 @@ struct Event */ Event event(EventID id)(in Mark start, in Mark end, in Anchor anchor = Anchor()) pure { - return Event(null, start, end, anchor, Tag(), id); + Event result; + result.startMark = start; + result.endMark = end; + result.anchor = anchor; + result.id = id; + return result; } /** @@ -114,8 +129,15 @@ Event collectionStartEvent(EventID id)(in Mark start, in Mark end, in Anchor anc { static assert(id == EventID.SequenceStart || id == EventID.SequenceEnd || id == EventID.MappingStart || id == EventID.MappingEnd); - return Event(null, start, end, anchor, tag, id, ScalarStyle.Invalid, implicit, - false, TagDirectives(), Encoding.UTF_8, style); + Event result; + result.startMark = start; + result.endMark = end; + result.anchor = anchor; + result.tag = tag; + result.id = id; + result.implicit = implicit; + result.collectionStyle = style; + return result; } /** @@ -127,8 +149,12 @@ Event collectionStartEvent(EventID id)(in Mark start, in Mark end, in Anchor anc */ Event streamStartEvent(in Mark start, in Mark end, in Encoding encoding) pure { - 
return Event(null, start, end, Anchor(), Tag(), EventID.StreamStart, - ScalarStyle.Invalid, false, false, TagDirectives(), encoding); + Event result; + result.startMark = start; + result.endMark = end; + result.id = EventID.StreamStart; + result.encoding = encoding; + return result; } ///Aliases for simple events. @@ -151,10 +177,16 @@ alias collectionStartEvent!(EventID.MappingStart) mappingStartEvent; * tagDirectives = Tag directives of the document. */ Event documentStartEvent(in Mark start, in Mark end, bool explicit, string YAMLVersion, - in TagDirectives tagDirectives) pure + TagDirective[] tagDirectives) pure { - return Event(YAMLVersion, start, end, Anchor(), Tag(), EventID.DocumentStart, - ScalarStyle.Invalid, explicit, false, tagDirectives); + Event result; + result.value = YAMLVersion; + result.startMark = start; + result.endMark = end; + result.id = EventID.DocumentStart; + result.explicitDocument = explicit; + result.tagDirectives = tagDirectives; + return result; } /** @@ -166,8 +198,12 @@ Event documentStartEvent(in Mark start, in Mark end, bool explicit, string YAMLV */ Event documentEndEvent(in Mark start, in Mark end, bool explicit) pure { - return Event(null, start, end, Anchor(), Tag(), EventID.DocumentEnd, - ScalarStyle.Invalid, explicit); + Event result; + result.startMark = start; + result.endMark = end; + result.id = EventID.DocumentEnd; + result.explicitDocument = explicit; + return result; } /** @@ -185,6 +221,15 @@ Event scalarEvent(in Mark start, in Mark end, in Anchor anchor, in Tag tag, in Tuple!(bool, bool) implicit, in string value, in ScalarStyle style = ScalarStyle.Invalid) pure { - return Event(value, start, end, anchor, tag, EventID.Scalar, style, implicit[0], - implicit[1]); + Event result; + result.value = value; + result.startMark = start; + result.endMark = end; + result.anchor = anchor; + result.tag = tag; + result.id = EventID.Scalar; + result.scalarStyle = style; + result.implicit = implicit[0]; + result.implicit_2 = 
implicit[1]; + return result; } diff --git a/dyaml/loader.d b/dyaml/loader.d index 006ecb0..d8149c1 100644 --- a/dyaml/loader.d +++ b/dyaml/loader.d @@ -13,7 +13,6 @@ module dyaml.loader; import std.exception; import std.stream; -import dyaml.anchor; import dyaml.composer; import dyaml.constructor; import dyaml.event; @@ -23,7 +22,6 @@ import dyaml.parser; import dyaml.reader; import dyaml.resolver; import dyaml.scanner; -import dyaml.tagdirectives; import dyaml.token; @@ -114,6 +112,8 @@ struct Loader Constructor constructor_; ///Name of the input file or stream, used in error messages. string name_ = ""; + ///Are we done loading? + bool done_ = false; public: @disable this(); @@ -127,7 +127,7 @@ struct Loader * * Throws: YAMLException if the file could not be opened or read. */ - this(in string filename) + this(string filename) { name_ = filename; try{this(new File(filename));} @@ -154,8 +154,6 @@ struct Loader parser_ = new Parser(scanner_); resolver_ = defaultResolver_; constructor_ = defaultConstructor_; - Anchor.addReference(); - TagDirectives.addReference(); } catch(YAMLException e) { @@ -167,8 +165,6 @@ struct Loader ///Destroy the Loader. ~this() { - Anchor.removeReference(); - TagDirectives.removeReference(); clear(reader_); clear(scanner_); clear(parser_); @@ -196,6 +192,8 @@ struct Loader * Load single YAML document. * * If none or more than one YAML document is found, this throws a YAMLException. + * + * This can only be called once; this is enforced by contract. * * Returns: Root node of the document. * @@ -203,9 +201,15 @@ struct Loader * or on a YAML parsing error. */ Node load() + in + { + assert(!done_, "Loader: Trying to load YAML twice"); + } + body { try { + scope(exit){done_ = true;} auto composer = new Composer(parser_, resolver_, constructor_); enforce(composer.checkNode(), new YAMLException("No YAML document to load")); return composer.getSingleNode(); @@ -224,6 +228,8 @@ struct Loader * them all at once. 
Calling loadAll after iterating over the node or * vice versa will not return any documents, as they have all been parsed * already. + * + * This can only be called once; this is enforced by contract. * * Returns: Array of root nodes of all documents in the file/stream. * @@ -241,10 +247,18 @@ struct Loader * * Parses documents lazily, when they are needed. * + * Foreach over a Loader can only be used once; this is enforced by contract. + * * Throws: YAMLException on a parsing error. */ int opApply(int delegate(ref Node) dg) + in { + assert(!done_, "Loader: Trying to load YAML twice"); + } + body + { + scope(exit){done_ = true;} try { auto composer = new Composer(parser_, resolver_, constructor_); @@ -284,11 +298,11 @@ struct Loader } //Parse and return all events. Used for debugging. - Event[] parse() + immutable(Event)[] parse() { try { - Event[] result; + immutable(Event)[] result; while(parser_.checkEvent()){result ~= parser_.getEvent();} return result; } diff --git a/dyaml/parser.d b/dyaml/parser.d index 7ec075d..d0d1909 100644 --- a/dyaml/parser.d +++ b/dyaml/parser.d @@ -24,7 +24,7 @@ import dyaml.scanner; import dyaml.style; import dyaml.token; import dyaml.tag; -import dyaml.tagdirectives; +import dyaml.tagdirective; package: @@ -108,10 +108,10 @@ final class Parser { private: ///Default tag handle shortcuts and replacements. - static tagDirective[] defaultTagDirectives_; + static TagDirective[] defaultTagDirectives_; static this() { - defaultTagDirectives_ = [tagDirective("!", "!"), tagDirective("!!", "tag:yaml.org,2002:")]; + defaultTagDirectives_ = [TagDirective("!", "!"), TagDirective("!!", "tag:yaml.org,2002:")]; } ///Scanner providing YAML tokens. @@ -123,7 +123,7 @@ final class Parser ///YAML version string. string YAMLVersion_ = null; ///Tag handle shortcuts and replacements. - tagDirective[] tagDirectives_; + TagDirective[] tagDirectives_; ///Stack of states. 
Array!(Event delegate()) states_; @@ -193,13 +193,13 @@ final class Parser * * Must not be called if there are no events left. */ - Event peekEvent() + immutable(Event) peekEvent() { if(currentEvent_.isNull && state_ !is null) { currentEvent_ = state_(); } - if(!currentEvent_.isNull){return currentEvent_;} + if(!currentEvent_.isNull){return cast(immutable Event)currentEvent_;} assert(false, "No event left to peek"); } @@ -208,7 +208,7 @@ final class Parser * * Must not be called if there are no events left. */ - Event getEvent() + immutable(Event) getEvent() { //Get the next event and proceed further. if(currentEvent_.isNull && state_ !is null) @@ -218,7 +218,7 @@ final class Parser if(!currentEvent_.isNull) { - immutable Event result = currentEvent_; + immutable Event result = cast(immutable Event)currentEvent_; currentEvent_.id = EventID.Invalid; return result; } @@ -273,7 +273,7 @@ final class Parser states_ ~= &parseDocumentEnd; state_ = &parseBlockNode; - return documentStartEvent(token.startMark, token.endMark, false, null, TagDirectives()); + return documentStartEvent(token.startMark, token.endMark, false, null, null); } return parseDocumentStart(); } @@ -336,7 +336,7 @@ final class Parser } ///Process directives at the beginning of a document. - TagDirectives processDirectives() + TagDirective[] processDirectives() { //Destroy version and tag handles from previous document. YAMLVersion_ = null; @@ -367,21 +367,21 @@ final class Parser foreach(ref pair; tagDirectives_) { //handle - const h = pair[0]; + const h = pair.handle; enforce(h != handle, new Error("Duplicate tag handle: " ~ handle, token.startMark)); } - tagDirectives_ ~= tagDirective(handle, parts[2]); + tagDirectives_ ~= TagDirective(handle, parts[2]); } } - TagDirectives value = tagDirectives_.length == 0 ? TagDirectives() : TagDirectives(tagDirectives_); + TagDirective[] value = tagDirectives_; //Add any default tag handles that haven't been overridden. 
foreach(ref defaultPair; defaultTagDirectives_) { bool found = false; - foreach(ref pair; tagDirectives_) if(defaultPair[0] == pair[0]) + foreach(ref pair; tagDirectives_) if(defaultPair.handle == pair.handle) { found = true; break; @@ -540,10 +540,9 @@ final class Parser string replacement = null; foreach(ref pair; tagDirectives_) { - //pair[0] is handle, pair[1] replacement. - if(pair[0] == handle) + if(pair.handle == handle) { - replacement = pair[1]; + replacement = pair.prefix; break; } } diff --git a/dyaml/queue.d b/dyaml/queue.d index 159ad8b..ca16cf8 100644 --- a/dyaml/queue.d +++ b/dyaml/queue.d @@ -37,7 +37,7 @@ struct Queue(T) ///Linked list node containing one element and pointer to the next node. struct Node { - T payload_ = T.init; + T payload_; Node* next_ = null; } @@ -90,7 +90,7 @@ struct Queue(T) } ///Push new item to the queue. - void push(in T item) + void push(T item) { Node* newLast = allocate!Node(item, cast(Node*)null); if(last_ !is null){last_.next_ = newLast;} @@ -100,7 +100,7 @@ struct Queue(T) } ///Insert a new item putting it to specified index in the linked list. - void insert(in T item, in size_t idx) + void insert(T item, in size_t idx) in { assert(idx <= length_); @@ -155,7 +155,7 @@ struct Queue(T) } ///Return the next element in the queue. - ref const(T) peek() const + ref inout(T) peek() inout in { assert(!empty, "Trying to peek at an element in an empty queue"); diff --git a/dyaml/reader.d b/dyaml/reader.d index 1a5d471..21fb407 100644 --- a/dyaml/reader.d +++ b/dyaml/reader.d @@ -7,7 +7,9 @@ module dyaml.reader; +import core.stdc.stdlib; import core.stdc.string; +import core.thread; import std.algorithm; import std.conv; @@ -34,47 +36,30 @@ class ReaderException : YAMLException } } -///Reads data from a stream and converts it to UTF-32 (dchar) data. +///Lazily reads and decodes data from stream, only storing as much as needed at any moment. final class Reader { private: - ///Input stream. + //Input stream. 
EndianStream stream_; - ///Allocated space for buffer_. - dchar[] bufferAllocated_; - ///Buffer of currently loaded characters. - dchar[] buffer_; - ///Current position within buffer. Only data after this position can be read. + //Allocated space for buffer_. + dchar[] bufferAllocated_ = null; + //Buffer of currently loaded characters. + dchar[] buffer_ = null; + //Current position within buffer. Only data after this position can be read. uint bufferOffset_ = 0; - ///Index of the current character in the stream. + //Index of the current character in the stream. size_t charIndex_ = 0; - ///Encoding of the input stream. - Encoding encoding_; - ///Current line in file. + //Current line in file. uint line_; - ///Current column in file. + //Current column in file. uint column_; - ///Number of bytes still available (not read) in the stream. - size_t available_; - - ///Capacity of raw buffers. - static immutable bufferLength8_ = 8; - ///Capacity of raw buffers. - static immutable bufferLength16_ = bufferLength8_ / 2; - - union - { - ///Buffer to hold UTF-8 data before decoding. - char[bufferLength8_ + 1] rawBuffer8_; - ///Buffer to hold UTF-16 data before decoding. - wchar[bufferLength16_ + 1] rawBuffer16_; - } - ///Number of elements held in the used raw buffer. - uint rawUsed_ = 0; + //Decoder reading data from file and decoding it to UTF-32. + UTFFastDecoder decoder_; public: - /** - * Construct a Reader. + /* + * Construct an AbstractReader. * * Params: stream = Input stream. Must be readable and seekable. 
* @@ -89,51 +74,14 @@ final class Reader body { stream_ = new EndianStream(stream); - available_ = stream_.available; - - //handle files short enough not to have a BOM - if(available_ < 2) - { - encoding_ = Encoding.UTF_8; - return; - } - - //readBOM will determine and set stream endianness - switch(stream_.readBOM(2)) - { - case -1: - //readBOM() eats two more bytes in this case so get them back - const wchar bytes = stream_.getcw(); - rawBuffer8_[0] = cast(char)(bytes % 256); - rawBuffer8_[1] = cast(char)(bytes / 256); - rawUsed_ = 2; - goto case 0; - case 0: encoding_ = Encoding.UTF_8; break; - case 1, 2: - //readBOM() eats two more bytes in this case so get them back - encoding_ = Encoding.UTF_16; - rawBuffer16_[0] = stream_.getcw(); - rawUsed_ = 1; - enforce(available_ % 2 == 0, - new ReaderException("Odd byte count in an UTF-16 stream")); - break; - case 3, 4: - enforce(available_ % 4 == 0, - new ReaderException("Byte count in an UTF-32 stream not divisible by 4")); - encoding_ = Encoding.UTF_32; - break; - default: assert(false, "Unknown UTF BOM"); - } - available_ = stream_.available; - - auto ptr = cast(dchar*)core.stdc.stdlib.malloc(dchar.sizeof * 256); - bufferAllocated_ = ptr[0 .. 256]; + decoder_ = UTFFastDecoder(stream_); } - ///Destroy the Reader. ~this() { - core.stdc.stdlib.free(bufferAllocated_.ptr); + //Delete the buffer, if allocated. + if(bufferAllocated_ is null){return;} + free(bufferAllocated_.ptr); buffer_ = bufferAllocated_ = null; } @@ -148,12 +96,13 @@ final class Reader * Throws: ReaderException if trying to read past the end of the stream * or if invalid data is read. 
*/ - dchar peek(in size_t index = 0) + dchar peek(size_t index = 0) { - if(buffer_.length <= bufferOffset_ + index + 1) + if(buffer_.length < bufferOffset_ + index + 1) { updateBuffer(index + 1); } + if(buffer_.length <= bufferOffset_ + index) { throw new ReaderException("Trying to read past the end of the stream"); @@ -172,7 +121,7 @@ final class Reader * * Returns: Characters starting at current position or an empty slice if out of bounds. */ - const(dstring) prefix(in size_t length) + const(dstring) prefix(size_t length) { return slice(0, length); } @@ -194,12 +143,12 @@ final class Reader { updateBuffer(end); } + end += bufferOffset_; start += bufferOffset_; end = min(buffer_.length, end); - if(end <= start){return "";} - return cast(dstring)buffer_[start .. end]; + return end > start ? cast(dstring)buffer_[start .. end] : ""; } /** @@ -227,7 +176,7 @@ final class Reader * Throws: ReaderException if trying to read past the end of the stream * or if invalid data is read. */ - dstring get(in size_t length) + dstring get(size_t length) { auto result = prefix(length).dup; forward(length); @@ -244,13 +193,13 @@ final class Reader */ void forward(size_t length = 1) { - mixin FastCharSearch!"\n\u0085\u2028\u2029"d search; - if(buffer_.length <= bufferOffset_ + length + 1) { updateBuffer(length + 1); } + mixin FastCharSearch!"\n\u0085\u2028\u2029"d search; + while(length > 0) { const c = buffer_[bufferOffset_]; @@ -268,19 +217,19 @@ final class Reader } ///Get a string describing current stream position, used for error messages. - @property Mark mark() const {return Mark(line_, column_);} + @property final Mark mark() const {return Mark(line_, column_);} ///Get current line number. - @property uint line() const {return line_;} + @property final uint line() const {return line_;} - ///Get current line number. - @property uint column() const {return column_;} + ///Get current column number. 
+ @property final uint column() const {return column_;} ///Get index of the current character in the stream. - @property size_t charIndex() const {return charIndex_;} + @property final size_t charIndex() const {return charIndex_;} ///Get encoding of the input stream. - @property Encoding encoding() const {return encoding_;} + @property final Encoding encoding() const {return decoder_.encoding;} private: /** @@ -296,7 +245,7 @@ final class Reader */ void updateBuffer(in size_t length) { - //get rid of unneeded data in the buffer + //Get rid of unneeded data in the buffer. if(bufferOffset_ > 0) { size_t bufferLength = buffer_.length - bufferOffset_; @@ -306,12 +255,12 @@ final class Reader bufferOffset_ = 0; } - ////Load chars in batches of at most 1024 bytes (256 chars) + //Load chars in batches of at most 1024 bytes (256 chars) while(buffer_.length <= bufferOffset_ + length) { - loadChars(256); + loadChars(512); - if(done) + if(decoder_.done) { if(buffer_.length == 0 || buffer_[$ - 1] != '\0') { @@ -325,9 +274,11 @@ final class Reader } /** - * Load at most specified number of characters. + * Load more characters to the buffer. * - * Params: chars = Maximum number of characters to load. + * Params: chars = Recommended number of characters to load. + * More characters might be loaded. + * Less will be loaded if not enough available. * * Throws: ReaderException on Unicode decoding error, * if nonprintable characters are detected, or @@ -335,96 +286,35 @@ final class Reader */ void loadChars(size_t chars) { - ///Get next character from the stream. - dchar getDChar() - { - final switch(encoding_) - { - case Encoding.UTF_8: - //Temp buffer for moving data in rawBuffer8_. - char[bufferLength8_] temp; - //Shortcut for ASCII. - if(rawUsed_ > 0 && rawBuffer8_[0] < 128) - { - //Get the first byte (one char in ASCII). - const dchar result = rawBuffer8_[0]; - --rawUsed_; - //Move the data. 
- *(cast(ulong*)temp.ptr) = *(cast(ulong*)(rawBuffer8_.ptr + 1)); - *(cast(ulong*)rawBuffer8_.ptr) = *(cast(ulong*)temp.ptr); - return result; - } - - //Bytes to read. - const readBytes = min(available_, bufferLength8_ - rawUsed_); - available_ -= readBytes; - //Length of data in rawBuffer8_ after reading. - const len = rawUsed_ + readBytes; - //Read the data. - stream_.readExact(rawBuffer8_.ptr + rawUsed_, readBytes); - - //After decoding, this will point to the first byte not decoded. - size_t idx = 0; - const dchar result = decode(rawBuffer8_, idx); - rawUsed_ = cast(uint)(len - idx); - - //Move the data. - temp[0 .. rawUsed_] = rawBuffer8_[idx .. len]; - rawBuffer8_[0 .. rawUsed_] = temp[0 .. rawUsed_]; - return result; - case Encoding.UTF_16: - //Temp buffer for moving data in rawBuffer8_. - wchar[bufferLength16_] temp; - //Words to read. - size_t readWords = min(available_ / 2, bufferLength16_ - rawUsed_); - available_ -= readWords * 2; - //Length of data in rawBuffer16_ after reading. - size_t len = rawUsed_; - //Read the data. - while(readWords > 0) - { - //Due to a bug in std.stream, we have to use getcw here. - rawBuffer16_[len] = stream_.getcw(); - --readWords; - ++len; - } - - //After decoding, this will point to the first word not decoded. - size_t idx = 0; - const dchar result = decode(rawBuffer16_, idx); - rawUsed_ = cast(uint)(len - idx); - - //Move the data. - temp[0 .. rawUsed_] = rawBuffer16_[idx .. len]; - rawBuffer16_[0 .. rawUsed_] = temp[0 .. rawUsed_]; - return result; - case Encoding.UTF_32: - dchar result; - available_ -= 4; - stream_.read(result); - return result; - } - } - const oldLength = buffer_.length; const oldPosition = stream_.position; - //Preallocating memory to limit GC reallocations. - bufferReserve(buffer_.length + chars); buffer_ = bufferAllocated_[0 .. buffer_.length + chars]; - scope(exit) + scope(success) { buffer_ = buffer_[0 .. $ - chars]; enforce(printable(buffer_[oldLength .. 
$]), new ReaderException("Special unicode characters are not allowed")); } - try for(uint c = 0; chars; --chars, ++c) + try for(size_t c = 0; chars && !decoder_.done;) { - if(done){break;} - buffer_[oldLength + c] = getDChar(); + const slice = decoder_.getDChars(chars); + buffer_[oldLength + c .. oldLength + c + slice.length] = slice; + c += slice.length; + chars -= slice.length; } + catch(Exception e) + { + handleLoadCharsException(e, oldPosition); + } + } + + //Handle an exception thrown in loadChars method of any Reader. + void handleLoadCharsException(Exception e, size_t oldPosition) + { + try{throw e;} catch(UtfException e) { const position = stream_.position; @@ -437,94 +327,376 @@ final class Reader } } - /** - * Determine if all characters in an array are printable. - * - * Params: chars = Characters to check. - * - * Returns: True if all the characters are printable, false otherwise. - */ - static bool printable(const ref dchar[] chars) pure + //Code shared by loadEntireFile methods. + void loadEntireFile_() { - foreach(c; chars) + const maxChars = decoder_.maxChars; + bufferReserve(maxChars + 1); + loadChars(maxChars); + + if(buffer_.length == 0 || buffer_[$ - 1] != '\0') { - if(!((c == 0x09 || c == 0x0A || c == 0x0D || c == 0x85) || - (c >= 0x20 && c <= 0x7E) || - (c >= 0xA0 && c <= '\uD7FF') || - (c >= '\uE000' && c <= '\uFFFD'))) - { - return false; - } + buffer_ = bufferAllocated_[0 .. buffer_.length + 1]; + buffer_[$ - 1] = '\0'; } - return true; } - ///Are we done reading? - @property bool done() const - { - return (available_ == 0 && - ((encoding_ == Encoding.UTF_8 && rawUsed_ == 0) || - (encoding_ == Encoding.UTF_16 && rawUsed_ == 0) || - encoding_ == Encoding.UTF_32)); - } - - ///Ensure there is space for at least capacity characters in bufferAllocated_. + //Ensure there is space for at least capacity characters in bufferAllocated_. 
void bufferReserve(in size_t capacity) { - if(bufferAllocated_.length >= capacity){return;} + if(bufferAllocated_ !is null && bufferAllocated_.length >= capacity){return;} - auto newPtr = core.stdc.stdlib.realloc(bufferAllocated_.ptr, - capacity * dchar.sizeof); - bufferAllocated_ = (cast(dchar*)newPtr)[0 .. capacity]; + //Handle first allocation as well as reallocation. + auto ptr = bufferAllocated_ !is null + ? realloc(bufferAllocated_.ptr, capacity * dchar.sizeof) + : malloc(capacity * dchar.sizeof); + bufferAllocated_ = (cast(dchar*)ptr)[0 .. capacity]; buffer_ = bufferAllocated_[0 .. buffer_.length]; } +} - unittest - { - writeln("D:YAML reader endian unittest"); - void endian_test(ubyte[] data, Encoding encoding_expected, Endian endian_expected) +private: + +alias UTFBlockDecoder!512 UTFFastDecoder; + +///Decodes streams to UTF-32 in blocks. +struct UTFBlockDecoder(size_t bufferSize_) if (bufferSize_ % 2 == 0) +{ + private: + //UTF-8 codepoint strides (0xFF are codepoints that can't start a sequence). + static immutable ubyte[256] utf8Stride = + [ + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, + 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, + 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, + 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, + 4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF, + ]; + + //Encoding of the input stream. + Encoding encoding_; + //Maximum number of characters that might be in the stream. 
+ size_t maxChars_; + //Bytes available in the stream. + size_t available_; + //Input stream. + EndianStream stream_; + + //Buffer used to store raw UTF-8 or UTF-16 code points. + union { - auto reader = new Reader(new MemoryStream(data)); - assert(reader.encoding_ == encoding_expected); - assert(reader.stream_.endian == endian_expected); + char[bufferSize_] rawBuffer8_; + wchar[bufferSize_ / 2] rawBuffer16_; } - ubyte[] little_endian_utf_16 = [0xFF, 0xFE, 0x7A, 0x00]; - ubyte[] big_endian_utf_16 = [0xFE, 0xFF, 0x00, 0x7A]; - endian_test(little_endian_utf_16, Encoding.UTF_16, Endian.littleEndian); - endian_test(big_endian_utf_16, Encoding.UTF_16, Endian.bigEndian); - } - unittest + //Used space (in items) in rawBuffer8_/rawBuffer16_. + size_t rawUsed_; + + //Space used by buffer_. + dchar[bufferSize_] bufferSpace_; + //Buffer of decoded, UTF-32 characters. This is a slice into bufferSpace_. + dchar[] buffer_; + + public: + ///Construct a UTFFastDecoder decoding a stream. + this(EndianStream stream) + { + stream_ = stream; + available_ = stream_.available; + + //Handle files short enough not to have a BOM. + if(available_ < 2) + { + encoding_ = Encoding.UTF_8; + maxChars_ = 0; + + if(available_ == 1) + { + bufferSpace_[0] = stream_.getc(); + buffer_ = bufferSpace_[0 .. 1]; + maxChars_ = 1; + } + return; + } + + char[] rawBuffer8; + wchar[] rawBuffer16; + //readBOM will determine and set stream endianness. + switch(stream_.readBOM(2)) + { + case -1: + //readBOM() eats two more bytes in this case so get them back. + const wchar bytes = stream_.getcw(); + rawBuffer8_[0 .. 2] = [cast(ubyte)(bytes % 256), cast(ubyte)(bytes / 256)]; + rawUsed_ = 2; + goto case 0; + case 0: + maxChars_ = available_; + encoding_ = Encoding.UTF_8; + break; + case 1, 2: + maxChars_ = available_ / 2; + //readBOM() eats two more bytes in this case so get them back. 
+ encoding_ = Encoding.UTF_16; + rawBuffer16_[0] = stream_.getcw(); + rawUsed_ = 1; + enforce(available_ % 2 == 0, + new ReaderException("Odd byte count in an UTF-16 stream")); + break; + case 3, 4: + maxChars_ = available_ / 4; + encoding_ = Encoding.UTF_32; + enforce(available_ % 4 == 0, + new ReaderException("Byte count in an UTF-32 stream not divisible by 4")); + break; + default: assert(false, "Unknown UTF BOM"); + } + available_ = stream_.available; + } + + ///Get maximum number of characters that might be in the stream. + @property size_t maxChars() const {return maxChars_;} + + ///Get encoding we're decoding from. + @property Encoding encoding() const {return encoding_;} + + ///Are we done decoding? + @property bool done() const + { + return rawUsed_ == 0 && buffer_.length == 0 && available_ == 0; + } + + ///Get next character. + dchar getDChar() + { + if(buffer_.length) + { + const result = buffer_[0]; + buffer_ = buffer_[1 .. $]; + return result; + } + + assert(available_ > 0 || rawUsed_ > 0); + updateBuffer(); + return getDChar(); + } + + ///Get as many characters as possible, but at most maxChars. Slice returned will be invalidated in further calls. + const(dchar[]) getDChars(size_t maxChars = size_t.max) + { + if(buffer_.length) + { + const slice = min(buffer_.length, maxChars); + const result = buffer_[0 .. slice]; + buffer_ = buffer_[slice .. $]; + return result; + } + + assert(available_ > 0 || rawUsed_ > 0); + updateBuffer(); + return getDChars(maxChars); + } + + private: + //Read and decode characters from file and store them in the buffer. + void updateBuffer() + { + assert(buffer_.length == 0); + final switch(encoding_) + { + case Encoding.UTF_8: + const bytes = min(bufferSize_ - rawUsed_, available_); + //Current length of valid data in rawBuffer8_. 
+ const rawLength = rawUsed_ + bytes; + stream_.readExact(rawBuffer8_.ptr + rawUsed_, bytes); + available_ -= bytes; + decodeRawBuffer(rawBuffer8_, rawLength); + break; + + case Encoding.UTF_16: + const words = min((bufferSize_ / 2) - rawUsed_, available_ / 2); + //Current length of valid data in rawBuffer16_. + const rawLength = rawUsed_ + words; + foreach(c; rawUsed_ .. rawLength) + { + stream_.read(rawBuffer16_[c]); + available_ -= 2; + } + decodeRawBuffer(rawBuffer16_, rawLength); + break; + + case Encoding.UTF_32: + const chars = min(bufferSize_ / 4, available_ / 4); + foreach(c; 0 .. chars) + { + stream_.read(bufferSpace_[c]); + available_ -= 4; + } + buffer_ = bufferSpace_[0 .. chars]; + break; + } + } + + //Decode contents of a UTF-8 or UTF-16 raw buffer. + void decodeRawBuffer(C)(C[] buffer, const size_t length) + { + //End of part of rawBuffer8_ that contains + //complete characters and can be decoded. + const end = endOfLastUTFSequence(buffer, length); + //If end is 0, there are no full UTF-8 chars. + //This can happen at the end of file if there is an incomplete UTF-8 sequence. + enforce(end > 0, + new ReaderException("Invalid UTF-8 character at the end of stream")); + + decodeUTF(buffer[0 .. end]); + + //After decoding, any code points not decoded go to the start of raw buffer. + rawUsed_ = length - end; + foreach(i; 0 .. rawUsed_){buffer[i] = buffer[i + end];} + } + + //Determine the end of last UTF-8 or UTF-16 sequence in a raw buffer. + size_t endOfLastUTFSequence(C)(const C[] buffer, const size_t max) + { + static if(is(C == char)) + { + for(long end = max - 1; end >= 0; --end) + { + const s = utf8Stride[buffer[end]]; + if(s != 0xFF) + { + //If stride goes beyond end of the buffer (max), return end. + //Otherwise the last sequence ends at max, so we can return that. + //(Unless there is an invalid code point, which is + //caught at decoding) + return (s > max - end) ? 
cast(size_t)end : max; + } + } + return 0; + } + else + { + size_t end = 0; + while(end < max) + { + const s = stride(buffer, end); + if(s + end > max){break;} + end += s; + } + return end; + } + } + + //Decode a UTF-8 or UTF-16 buffer (with no incomplete sequences at the end). + void decodeUTF(C)(const C[] source) + { + size_t bufpos = 0; + const srclength = source.length; + for(size_t srcpos = 0; srcpos < srclength;) + { + const c = source[srcpos]; + if(c < 0x80) + { + bufferSpace_[bufpos++] = c; + ++srcpos; + } + else + { + bufferSpace_[bufpos++] = decode(source, srcpos); + } + } + buffer_ = bufferSpace_[0 .. bufpos]; + } +} + +/** + * Determine if all characters in an array are printable. + * + * Params: chars = Characters to check. + * + * Returns: True if all the characters are printable, false otherwise. + */ +bool printable(const ref dchar[] chars) pure +{ + foreach(c; chars) { - writeln("D:YAML reader peek/prefix/forward unittest"); - ubyte[] data = ByteOrderMarks[BOM.UTF8] ~ cast(ubyte[])"data"; - auto reader = new Reader(new MemoryStream(data)); + if(!((c == 0x09 || c == 0x0A || c == 0x0D || c == 0x85) || + (c >= 0x20 && c <= 0x7E) || + (c >= 0xA0 && c <= '\uD7FF') || + (c >= '\uE000' && c <= '\uFFFD'))) + { + return false; + } + } + return true; +} + +//Unittests. 
+ +void testEndian(R)() +{ + writeln(typeid(R).toString() ~ ": endian unittest"); + void endian_test(ubyte[] data, Encoding encoding_expected, Endian endian_expected) + { + Reader reader = new R(new MemoryStream(data)); + assert(reader.encoding == encoding_expected); + assert(reader.stream_.endian == endian_expected); + } + ubyte[] little_endian_utf_16 = [0xFF, 0xFE, 0x7A, 0x00]; + ubyte[] big_endian_utf_16 = [0xFE, 0xFF, 0x00, 0x7A]; + endian_test(little_endian_utf_16, Encoding.UTF_16, Endian.littleEndian); + endian_test(big_endian_utf_16, Encoding.UTF_16, Endian.bigEndian); +} + +void testPeekPrefixForward(R)() +{ + writeln(typeid(R).toString() ~ ": peek/prefix/forward unittest"); + ubyte[] data = ByteOrderMarks[BOM.UTF8] ~ cast(ubyte[])"data"; + Reader reader = new R(new MemoryStream(data)); + assert(reader.peek() == 'd'); + assert(reader.peek(1) == 'a'); + assert(reader.peek(2) == 't'); + assert(reader.peek(3) == 'a'); + assert(reader.peek(4) == '\0'); + assert(reader.prefix(4) == "data"); + assert(reader.prefix(6) == "data\0"); + reader.forward(2); + assert(reader.peek(1) == 'a'); + assert(collectException(reader.peek(3))); +} + +void testUTF(R)() +{ + writeln(typeid(R).toString() ~ ": UTF formats unittest"); + dchar[] data = cast(dchar[])"data"; + void utf_test(T)(T[] data, BOM bom) + { + ubyte[] bytes = ByteOrderMarks[bom] ~ + (cast(ubyte*)data.ptr)[0 .. 
data.length * T.sizeof]; + Reader reader = new R(new MemoryStream(bytes)); assert(reader.peek() == 'd'); assert(reader.peek(1) == 'a'); assert(reader.peek(2) == 't'); assert(reader.peek(3) == 'a'); - assert(reader.peek(4) == '\0'); - assert(reader.prefix(4) == "data"); - assert(reader.prefix(6) == "data\0"); - reader.forward(2); - assert(reader.peek(1) == 'a'); - assert(collectException(reader.peek(3))); - } - unittest - { - writeln("D:YAML reader UTF formats unittest"); - dchar[] data = cast(dchar[])"data"; - void utf_test(T)(T[] data, BOM bom) - { - ubyte[] bytes = ByteOrderMarks[bom] ~ - (cast(ubyte*)data.ptr)[0 .. data.length * T.sizeof]; - auto reader = new Reader(new MemoryStream(bytes)); - assert(reader.peek() == 'd'); - assert(reader.peek(1) == 'a'); - assert(reader.peek(2) == 't'); - assert(reader.peek(3) == 'a'); - } - utf_test!char(to!(char[])(data), BOM.UTF8); - utf_test!wchar(to!(wchar[])(data), endian == Endian.bigEndian ? BOM.UTF16BE : BOM.UTF16LE); - utf_test(data, endian == Endian.bigEndian ? BOM.UTF32BE : BOM.UTF32LE); } + utf_test!char(to!(char[])(data), BOM.UTF8); + utf_test!wchar(to!(wchar[])(data), endian == Endian.bigEndian ? BOM.UTF16BE : BOM.UTF16LE); + utf_test(data, endian == Endian.bigEndian ? BOM.UTF32BE : BOM.UTF32LE); +} + +unittest +{ + testEndian!Reader(); + testPeekPrefixForward!Reader(); + testUTF!Reader(); } diff --git a/dyaml/representer.d b/dyaml/representer.d index 0293dc1..01920ab 100644 --- a/dyaml/representer.d +++ b/dyaml/representer.d @@ -65,7 +65,7 @@ final class Representer * disabled to use custom representer * functions for default types. */ - this(in bool useDefaultRepresenters = true) + this(bool useDefaultRepresenters = true) { if(!useDefaultRepresenters){return;} addRepresenter!YAMLNull(&representNull); @@ -87,13 +87,13 @@ final class Representer } ///Set default _style for scalars. Invalid means the _style is chosen automatically. 
- @property void defaultScalarStyle(in ScalarStyle style) + @property void defaultScalarStyle(ScalarStyle style) { defaultScalarStyle_ = style; } ///Set default _style for collections. Invalid means the _style is chosen automatically. - @property void defaultCollectionStyle(in CollectionStyle style) + @property void defaultCollectionStyle(CollectionStyle style) { defaultCollectionStyle_ = style; } @@ -237,7 +237,7 @@ final class Representer * } * -------------------- */ - Node representScalar(in string tag, string scalar, + Node representScalar(string tag, string scalar, ScalarStyle style = ScalarStyle.Invalid) { if(style == ScalarStyle.Invalid){style = defaultScalarStyle_;} @@ -276,7 +276,7 @@ final class Representer * } * -------------------- */ - Node representSequence(in string tag, Node[] sequence, + Node representSequence(string tag, Node[] sequence, CollectionStyle style = CollectionStyle.Invalid) { Node[] value; @@ -335,7 +335,7 @@ final class Representer * } * -------------------- */ - Node representMapping(in string tag, Node.Pair[] pairs, + Node representMapping(string tag, Node.Pair[] pairs, CollectionStyle style = CollectionStyle.Invalid) { Node.Pair[] value; diff --git a/dyaml/serializer.d b/dyaml/serializer.d index 6c84257..ff54cc3 100644 --- a/dyaml/serializer.d +++ b/dyaml/serializer.d @@ -23,7 +23,7 @@ import dyaml.exception; import dyaml.node; import dyaml.resolver; import dyaml.tag; -import dyaml.tagdirectives; +import dyaml.tagdirective; import dyaml.token; @@ -46,7 +46,7 @@ struct Serializer string YAMLVersion_; ///Tag directives to emit. - TagDirectives tagDirectives_; + TagDirective[] tagDirectives_; //TODO Use something with more deterministic memory usage. ///Nodes with assigned anchors. 
@@ -70,7 +70,7 @@ struct Serializer */ this(ref Emitter emitter, Resolver resolver, Encoding encoding, in bool explicitStart, in bool explicitEnd, string YAMLVersion, - TagDirectives tagDirectives) + TagDirective[] tagDirectives) { emitter_ = &emitter; resolver_ = resolver; diff --git a/dyaml/sharedobject.d b/dyaml/sharedobject.d deleted file mode 100644 index dd93018..0000000 --- a/dyaml/sharedobject.d +++ /dev/null @@ -1,131 +0,0 @@ - -// Copyright Ferdinand Majerech 2011. -// Distributed under the Boost Software License, Version 1.0. -// (See accompanying file LICENSE_1_0.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt) - -///Shared object. -module dyaml.sharedobject; - - -/** - * Mixin for shared objects (need a better name). - * - * This works as an index to a static array of type T. Any new object created is - * checked for presence in the array to prevent duplication. - * - * This is useful for e.g. token/event data that rarely needs to be - * stored (e.g. tag directives) to prevent inflation of these structs, - * and when there are many instances of a data type that are mostly - * duplicates (e.g. tags). - * - * This is not the most elegant way to store the extra data and might change in future. - */ -template SharedObject(T, MixedIn) -{ - private: - ///This class stores the data that is shared between the objects. - class SharedData - { - private: - /** - * Reference count. - * - * When this reaches zero, objects_ are cleared. This is not - * the number of shared objects, but rather of objects using this kind - * of shared object. - */ - int referenceCount_ = 0; - - ///All known objects of type T are in this array. - T[] objects_; - - public: - ///Increment the reference count. - void addReference() - { - assert(referenceCount_ >= 0); - ++referenceCount_; - } - - ///Decrement the reference count and clear the constructed objects if zero. 
- void removeReference() - { - --referenceCount_; - assert(referenceCount_ >= 0); - if(referenceCount_ == 0) - { - clear(objects_); - objects_ = []; - } - } - - ///Add an object and return its index. - uint add(ref T object) - { - foreach(index, ref known; objects_) if(object == known) - { - return cast(uint)index; - } - objects_ ~= object; - return cast(uint)objects_.length - 1; - } - - ///Get the object at specified object. - @property T get(in uint index) - { - return objects_[index]; - } - } - - ///Index of the object in data_. - uint index_ = uint.max; - - ///Stores the actual objects. - static __gshared SharedData data_; - - static this() - { - data_ = new SharedData; - } - - public: - ///Increment the reference count. - static void addReference() - { - synchronized(data_){data_.addReference();} - } - - ///Decrement the reference count and clear the constructed objects if zero. - static void removeReference() - { - synchronized(data_){data_.removeReference();} - } - - ///Get the object. - @property T get() const - in{assert(!isNull());} - body - { - T result; - synchronized(data_){result = data_.get(index_);} - return result; - } - - ///Test for equality with another object. - bool opEquals(const ref MixedIn object) const - { - return object.index_ == index_; - } - - ///Is this object null (invalid)? - @property bool isNull() const {return index_ == uint.max;} - - private: - ///Add a new object, checking if identical object already exists. - void add(ref T object) - { - synchronized(data_){index_ = data_.add(object);} - } -} - diff --git a/dyaml/tag.d b/dyaml/tag.d index fb06120..467f385 100644 --- a/dyaml/tag.d +++ b/dyaml/tag.d @@ -7,66 +7,7 @@ ///YAML tag. module dyaml.tag; +import dyaml.zerostring; -import core.stdc.string; - - -///YAML tag (data type) struct. Encapsulates a tag to save memory and speed-up comparison. -struct Tag -{ - private: - ///Zero terminated tag string. 
- immutable(char)* tag_ = null; - - public: - @disable int opCmp(ref Tag); - - - ///Construct a tag from a string representation. - this(in string tag) - { - if(tag is null || tag == "") - { - tag_ = null; - return; - } - - tag_ = (tag ~ '\0').ptr; - } - - ///Get the tag string. - @property string get() const - in{assert(!isNull());} - body - { - return cast(string)tag_[0 .. strlen(tag_)]; - } - - ///Test for equality with another tag. - bool opEquals(const ref Tag tag) const - { - return isNull ? tag.isNull : - tag.isNull ? false : (0 == strcmp(tag_, tag.tag_)); - } - - ///Compute a hash. - hash_t toHash() const - in{assert(!isNull);} - body - { - static type = typeid(string); - auto str = get(); - return type.getHash(&str); - } - - ///Compare with another tag. - int opCmp(const ref Tag tag) const - in{assert(!isNull && !tag.isNull);} - body - { - return strcmp(tag_, tag.tag_); - } - - ///Is this tag null (invalid)? - @property bool isNull() const {return tag_ is null;} -} +///YAML tag (data type) struct. Encapsulates a tag to save memory and speed up comparison. +alias ZeroString!"Tag" Tag; diff --git a/dyaml/tagdirective.d b/dyaml/tagdirective.d new file mode 100644 index 0000000..54687fe --- /dev/null +++ b/dyaml/tagdirective.d @@ -0,0 +1,15 @@ + +// Copyright Ferdinand Majerech 2011. +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +///Tag directives. +module dyaml.tagdirective; + +///Single tag directive. handle is the shortcut, prefix is the prefix that replaces it. +struct TagDirective +{ + string handle; + string prefix; +} diff --git a/dyaml/tagdirectives.d b/dyaml/tagdirectives.d deleted file mode 100644 index 5aa3e7c..0000000 --- a/dyaml/tagdirectives.d +++ /dev/null @@ -1,28 +0,0 @@ - -// Copyright Ferdinand Majerech 2011. -// Distributed under the Boost Software License, Version 1.0. 
-// (See accompanying file LICENSE_1_0.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt) - -///Tag directives. -module dyaml.tagdirectives; - -import std.typecons; - -import dyaml.sharedobject; - -///Single tag directive. handle is the shortcut, prefix is the prefix that replaces it. -alias Tuple!(string, "handle", string, "prefix") tagDirective; - -///Tag directives stored in Event. -struct TagDirectives -{ - public: - mixin SharedObject!(tagDirective[], TagDirectives); - - ///Construct a tags object from an array of tag directives. - this(tagDirective[] tagDirectives) - { - add(tagDirectives); - } -} diff --git a/dyaml/zerostring.d b/dyaml/zerostring.d new file mode 100644 index 0000000..c3716cf --- /dev/null +++ b/dyaml/zerostring.d @@ -0,0 +1,73 @@ + +// Copyright Ferdinand Majerech 2011. +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +///Zero terminated string. +module dyaml.zerostring; + +import core.stdc.string; + +/** + * Zero terminated string used to decrease data structure size. + * + * TypeName is used to differentiate types (better than simple alias). + */ +struct ZeroString(string TypeName) +{ + private: + ///Zero terminated string. + immutable(char)* str_ = null; + + public: + @disable int opCmp(ref ZeroString); + + ///Construct a string. + this(in string str) + { + if(str is null || str == "") + { + str_ = null; + return; + } + + str_ = (str ~ '\0').ptr; + } + + ///Get the string. + @property string get() const + in{assert(!isNull());} + body + { + return cast(string)str_[0 .. strlen(str_)]; + } + + ///Test for equality with another string. + bool opEquals(const ref ZeroString str) const + { + return isNull ? str.isNull : + str.isNull ? false : (0 == strcmp(str_, str.str_)); + } + + ///Compute a hash. 
+ hash_t toHash() const + in{assert(!isNull);} + body + { + static type = typeid(string); + auto str = get(); + return type.getHash(&str); + } + + ///Compare with another string. + int opCmp(const ref ZeroString str) const + in{assert(!isNull && !str.isNull);} + body + { + return strcmp(str_, str.str_); + } + + ///Is this string null (invalid)? + @property bool isNull() const {return str_ is null;} +} diff --git a/test/src/compare.d b/test/src/compare.d index 4e69c13..c13b1c0 100644 --- a/test/src/compare.d +++ b/test/src/compare.d @@ -42,8 +42,8 @@ void testParser(bool verbose, string dataFilename, string canonicalFilename) */ void testLoader(bool verbose, string dataFilename, string canonicalFilename) { - auto data = Loader(dataFilename).loadAll; - auto canonical = Loader(canonicalFilename).loadAll; + auto data = Loader(dataFilename).loadAll(); + auto canonical = Loader(canonicalFilename).loadAll(); assert(data.length == canonical.length, "Unequal node count"); foreach(n; 0 .. data.length) @@ -58,7 +58,7 @@ void testLoader(bool verbose, string dataFilename, string canonicalFilename) writeln("Canonical value:"); writeln(canonical[n].debugString); } - assert(false); + assert(false, "testLoader(" ~ dataFilename ~ ", " ~ canonicalFilename ~ ") failed"); } } } diff --git a/test/src/emitter.d b/test/src/emitter.d index 87425a8..0a37db5 100644 --- a/test/src/emitter.d +++ b/test/src/emitter.d @@ -83,7 +83,7 @@ void testEmitterOnData(bool verbose, string dataFilename, string canonicalFilena { //Must exist due to Anchor, Tags reference counts. 
auto loader = Loader(dataFilename); - auto events = loader.parse(); + auto events = cast(Event[])loader.parse(); auto emitStream = new MemoryStream; Dumper(emitStream).emit(events); @@ -98,7 +98,7 @@ void testEmitterOnData(bool verbose, string dataFilename, string canonicalFilena loader2.name = "TEST"; loader2.constructor = new Constructor; loader2.resolver = new Resolver; - auto newEvents = loader2.parse(); + auto newEvents = cast(Event[])loader2.parse(); assert(compareEvents(events, newEvents)); } @@ -114,7 +114,7 @@ void testEmitterOnCanonical(bool verbose, string canonicalFilename) { //Must exist due to Anchor, Tags reference counts. auto loader = Loader(canonicalFilename); - auto events = loader.parse(); + auto events = cast(Event[])loader.parse(); foreach(canonical; [false, true]) { auto emitStream = new MemoryStream; @@ -130,7 +130,7 @@ void testEmitterOnCanonical(bool verbose, string canonicalFilename) loader2.name = "TEST"; loader2.constructor = new Constructor; loader2.resolver = new Resolver; - auto newEvents = loader2.parse(); + auto newEvents = cast(Event[])loader2.parse(); assert(compareEvents(events, newEvents)); } } @@ -151,7 +151,7 @@ void testEmitterStyles(bool verbose, string dataFilename, string canonicalFilena { //must exist due to Anchor, Tags reference counts auto loader = Loader(canonicalFilename); - auto events = loader.parse(); + auto events = cast(Event[])loader.parse(); foreach(flowStyle; [CollectionStyle.Block, CollectionStyle.Flow]) { foreach(style; [ScalarStyle.Literal, ScalarStyle.Folded, @@ -191,7 +191,7 @@ void testEmitterStyles(bool verbose, string dataFilename, string canonicalFilena loader2.name = "TEST"; loader2.constructor = new Constructor; loader2.resolver = new Resolver; - auto newEvents = loader2.parse(); + auto newEvents = cast(Event[])loader2.parse(); assert(compareEvents(events, newEvents)); } } diff --git a/test/src/errors.d b/test/src/errors.d index 5356099..44b98dc 100644 --- a/test/src/errors.d +++ 
b/test/src/errors.d @@ -72,7 +72,8 @@ void testLoaderErrorFilename(bool verbose, string errorFilename) if(verbose){writeln(typeid(e).toString(), "\n", e);} return; } - assert(false, "Expected an exception"); + assert(false, "testLoaderErrorSingle(" ~ to!string(verbose) ~ + ", " ~ errorFilename ~ ") Expected an exception"); } /**