// Copyright Ferdinand Majerech 2011.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
/**
* YAML parser.
* Code based on PyYAML: http://www.pyyaml.org
*/
module dyaml.parser;
import std.array;
import std.conv;
import std.exception;
import dyaml.event;
import dyaml.scanner;
import dyaml.token;
import dyaml.exception;
package:
/**
* The following YAML grammar is LL(1) and is parsed by a recursive descent
* parser.
*
* stream            ::= STREAM-START implicit_document? explicit_document* STREAM-END
* implicit_document ::= block_node DOCUMENT-END*
* explicit_document ::= DIRECTIVE* DOCUMENT-START block_node? DOCUMENT-END*
* block_node_or_indentless_sequence ::=
*                       ALIAS
*                     | properties (block_content | indentless_block_sequence)?
*                     | block_content
*                     | indentless_block_sequence
* block_node        ::= ALIAS
*                     | properties block_content?
*                     | block_content
* flow_node         ::= ALIAS
*                     | properties flow_content?
*                     | flow_content
* properties        ::= TAG ANCHOR? | ANCHOR TAG?
* block_content     ::= block_collection | flow_collection | SCALAR
* flow_content      ::= flow_collection | SCALAR
* block_collection  ::= block_sequence | block_mapping
* flow_collection   ::= flow_sequence | flow_mapping
* block_sequence    ::= BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END
* indentless_sequence ::= (BLOCK-ENTRY block_node?)+
* block_mapping     ::= BLOCK-MAPPING-START
*                       ((KEY block_node_or_indentless_sequence?)?
*                        (VALUE block_node_or_indentless_sequence?)?)*
*                       BLOCK-END
* flow_sequence     ::= FLOW-SEQUENCE-START
*                       (flow_sequence_entry FLOW-ENTRY)*
*                       flow_sequence_entry?
*                       FLOW-SEQUENCE-END
* flow_sequence_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
* flow_mapping      ::= FLOW-MAPPING-START
*                       (flow_mapping_entry FLOW-ENTRY)*
*                       flow_mapping_entry?
*                       FLOW-MAPPING-END
* flow_mapping_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
*
* FIRST sets:
*
* stream: { STREAM-START }
* explicit_document: { DIRECTIVE DOCUMENT-START }
* implicit_document: FIRST(block_node)
* block_node: { ALIAS TAG ANCHOR SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START }
* flow_node: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START }
* block_content: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
* flow_content: { FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
* block_collection: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START }
* flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
* block_sequence: { BLOCK-SEQUENCE-START }
* block_mapping: { BLOCK-MAPPING-START }
* block_node_or_indentless_sequence: { ALIAS ANCHOR TAG SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START BLOCK-ENTRY }
* indentless_sequence: { BLOCK-ENTRY }
* flow_sequence: { FLOW-SEQUENCE-START }
* flow_mapping: { FLOW-MAPPING-START }
* flow_sequence_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }
* flow_mapping_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }
*/
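/*
 * Illustrative sketch (not part of the original grammar notes): for a small
 * input document such as
 *
 *   name: D:YAML
 *   tags: [yaml, d]
 *
 * the parser is expected to emit roughly this event sequence:
 *
 *   STREAM-START
 *   DOCUMENT-START (implicit)
 *     MAPPING-START (block)
 *       SCALAR "name", SCALAR "D:YAML"
 *       SCALAR "tags"
 *       SEQUENCE-START (flow)
 *         SCALAR "yaml", SCALAR "d"
 *       SEQUENCE-END
 *     MAPPING-END
 *   DOCUMENT-END (implicit)
 *   STREAM-END
 */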
/**
* Marked exception thrown at parser errors.
*
* See_Also: MarkedYAMLException
*/
class ParserException : MarkedYAMLException
{
this(string context, Mark contextMark, string problem, Mark problemMark)
{
super(context, contextMark, problem, problemMark);
}
this(string problem, Mark problemMark){super(problem, problemMark);}
}
///Generates events from tokens provided by a Scanner.
final class Parser
{
invariant()
{
assert(currentEvent_.length <= 1);
}
private:
///Default tag handle shortcuts and replacements.
static string[string] defaultTags_;
static this()
{
defaultTags_ = ["!" : "!", "!!" : "tag:yaml.org,2002:"];
}
///Scanner providing YAML tokens.
Scanner scanner_;
///Holds zero or one event.
Event[] currentEvent_;
///YAML version string.
string YAMLVersion_ = null;
///Tag handle shortcuts and replacements.
string[string] tagHandles_;
///Stack of states.
Event delegate()[] states_;
///Stack of marks used to keep track of extents of e.g. YAML collections.
Mark[] marks_;
///Current state.
Event delegate() state_;
public:
///Construct a Parser using specified Scanner.
this(Scanner scanner)
{
state_ = &parseStreamStart;
scanner_ = scanner;
}
///Destroy the parser.
~this()
{
clear(currentEvent_);
currentEvent_ = null;
clear(tagHandles_);
tagHandles_ = null;
clear(states_);
states_ = null;
clear(marks_);
marks_ = null;
}
/**
* Check if the next event is one of the specified types.
*
* If no types are specified, checks if there are any events left.
*
* Params: ids = Event IDs to check for.
*
* Returns: true if the next event is one of the specified types, or, if no
*          types were specified, true if any events are left; false otherwise.
*/
bool checkEvent(EventID[] ids...)
{
//Check if the next event is one of specified types.
if(currentEvent_.empty && state_ !is null)
{
currentEvent_ ~= state_();
}
if(!currentEvent_.empty)
{
if(ids.length == 0){return true;}
else
{
const nextId = currentEvent_.front.id;
foreach(id; ids)
{
if(nextId == id){return true;}
}
}
}
return false;
}
/**
* Return the next event, but keep it in the queue.
*
* Must not be called if there are no events left.
*/
Event peekEvent()
{
if(currentEvent_.empty && state_ !is null)
{
currentEvent_ ~= state_();
}
if(!currentEvent_.empty){return currentEvent_[0];}
assert(false, "No event left to peek");
}
/**
* Return the next event, removing it from the queue.
*
* Must not be called if there are no events left.
*/
Event getEvent()
{
//Get the next event and proceed further.
if(currentEvent_.empty && state_ !is null)
{
currentEvent_ ~= state_();
}
if(!currentEvent_.empty)
{
Event result = currentEvent_[0];
currentEvent_.length = 0;
return result;
}
assert(false, "No event left to get");
}
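/*
 * Typical pull-style use of the public interface (an illustrative sketch,
 * not from the original sources; assumes `scanner` was already constructed
 * from the input elsewhere):
 *
 *   auto parser = new Parser(scanner);
 *   while(parser.checkEvent())
 *   {
 *       Event event = parser.getEvent();
 *       //Dispatch on event.id here.
 *   }
 */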
private:
///Pop and return the newest state in states_.
Event delegate() popState()
{
enforce(states_.length > 0,
new YAMLException("Parser: Need to pop a state but there are no states left"));
const result = states_.back();
states_.popBack;
return result;
}
///Pop and return the newest mark in marks_.
Mark popMark()
{
enforce(marks_.length > 0,
new YAMLException("Parser: Need to pop a mark but there are no marks left"));
const result = marks_.back();
marks_.popBack;
return result;
}
/**
* stream ::= STREAM-START implicit_document? explicit_document* STREAM-END
* implicit_document ::= block_node DOCUMENT-END*
* explicit_document ::= DIRECTIVE* DOCUMENT-START block_node? DOCUMENT-END*
*/
///Parse stream start.
Event parseStreamStart()
{
Token token = scanner_.getToken();
state_ = &parseImplicitDocumentStart;
return streamStartEvent(token.startMark, token.endMark);
}
///Parse an implicit document start; if an explicit document start is detected, parse that instead.
Event parseImplicitDocumentStart()
{
//Parse an implicit document.
if(!scanner_.checkToken(TokenID.Directive, TokenID.DocumentStart,
TokenID.StreamEnd))
{
tagHandles_ = defaultTags_;
Token token = scanner_.peekToken();
states_ ~= &parseDocumentEnd;
state_ = &parseBlockNode;
return documentStartEvent(token.startMark, token.endMark, false, null);
}
return parseDocumentStart();
}
///Parse explicit document start.
Event parseDocumentStart()
{
//Parse any extra document end indicators.
while(scanner_.checkToken(TokenID.DocumentEnd)){scanner_.getToken();}
//Parse an explicit document.
if(!scanner_.checkToken(TokenID.StreamEnd))
{
const startMark = scanner_.peekToken().startMark;
processDirectives();
enforce(scanner_.checkToken(TokenID.DocumentStart),
new ParserException("Expected document start but found " ~
to!string(scanner_.peekToken().id),
scanner_.peekToken().startMark));
const endMark = scanner_.getToken().endMark;
states_ ~= &parseDocumentEnd;
state_ = &parseDocumentContent;
return documentStartEvent(startMark, endMark, true, YAMLVersion_);
}
else
{
//Parse the end of the stream.
Token token = scanner_.getToken();
assert(states_.length == 0);
assert(marks_.length == 0);
state_ = null;
return streamEndEvent(token.startMark, token.endMark);
}
}
///Parse document end (explicit or implicit).
Event parseDocumentEnd()
{
Mark startMark = scanner_.peekToken().startMark;
const bool explicit = scanner_.checkToken(TokenID.DocumentEnd);
Mark endMark = explicit ? scanner_.getToken().endMark : startMark;
state_ = &parseDocumentStart;
return documentEndEvent(startMark, endMark, explicit);
}
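/*
 * Illustrative example (not from the original comments): in a stream such as
 *
 *   --- first document
 *   ...
 *   --- second document
 *
 * the "..." line yields a DocumentEnd token, so the first document ends
 * explicitly; the second document, with no trailing "...", ends implicitly
 * and its end event reuses the position of the following token.
 */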
///Parse document content.
Event parseDocumentContent()
{
if(scanner_.checkToken(TokenID.Directive, TokenID.DocumentStart,
TokenID.DocumentEnd, TokenID.StreamEnd))
{
state_ = popState();
return processEmptyScalar(scanner_.peekToken().startMark);
}
return parseBlockNode();
}
///Process directives at the beginning of a document.
void processDirectives()
{
//Destroy version and tag handles from previous document.
YAMLVersion_ = null;
string[string] empty;
tagHandles_ = empty;
//Process directives.
while(scanner_.checkToken(TokenID.Directive))
{
Token token = scanner_.getToken();
//Name and value are separated by '\0'.
const parts = token.value.split("\0");
const name = parts[0];
if(name == "YAML")
{
enforce(YAMLVersion_ is null,
new ParserException("Found duplicate YAML directive",
token.startMark));
const major = parts[1].split(".")[0];
enforce(to!int(major) == 1,
new ParserException("Found incompatible YAML document (version "
"1.* is required)", token.startMark));
YAMLVersion_ = parts[1];
}
else if(name == "TAG")
{
assert(parts.length == 3, "Tag directive stored incorrectly in a token");
const handle = parts[1];
foreach(h, replacement; tagHandles_)
{
enforce(h != handle, new ParserException("Duplicate tag handle: " ~
handle, token.startMark));
}
tagHandles_[handle] = parts[2];
}
}
//Add any default tag handles that haven't been overridden.
foreach(key, value; defaultTags_)
{
if((key in tagHandles_) is null){tagHandles_[key] = value;}
}
}
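/*
 * Example of the directives this processes (an illustrative sketch; the
 * handle and tag prefix below are arbitrary):
 *
 *   %YAML 1.1
 *   %TAG !e! tag:example.com,2011:
 *   ---
 *   !e!widget {size: 3}
 *
 * After processDirectives(), YAMLVersion_ is "1.1" and tagHandles_ maps
 * "!e!" to "tag:example.com,2011:" in addition to the default "!" and "!!"
 * handles.
 */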
/**
* block_node_or_indentless_sequence ::= ALIAS
*                  | properties (block_content | indentless_block_sequence)?
*                  | block_content
*                  | indentless_block_sequence
* block_node       ::= ALIAS
*                    | properties block_content?
*                    | block_content
* flow_node        ::= ALIAS
*                    | properties flow_content?
*                    | flow_content
* properties       ::= TAG ANCHOR? | ANCHOR TAG?
* block_content    ::= block_collection | flow_collection | SCALAR
* flow_content     ::= flow_collection | SCALAR
* block_collection ::= block_sequence | block_mapping
* flow_collection  ::= flow_sequence | flow_mapping
*/
///Parse a node.
Event parseNode(bool block, bool indentlessSequence = false)
{
if(scanner_.checkToken(TokenID.Alias))
{
Token token = scanner_.getToken();
state_ = popState();
return aliasEvent(token.startMark, token.endMark, token.value);
}
string anchor = null;
string tag = null;
Mark startMark, endMark, tagMark;
bool invalidMarks = true;
//Get anchor/tag if detected. Return false otherwise.
bool get(TokenID id, bool start, ref string target)
{
if(!scanner_.checkToken(id)){return false;}
invalidMarks = false;
Token token = scanner_.getToken();
if(start){startMark = token.startMark;}
if(id == TokenID.Tag){tagMark = token.startMark;}
endMark = token.endMark;
target = token.value;
return true;
}
//Anchor and/or tag can be in any order.
if(get(TokenID.Anchor, true, anchor)){get(TokenID.Tag, false, tag);}
else if(get(TokenID.Tag, true, tag)) {get(TokenID.Anchor, false, anchor);}
if(tag !is null){tag = processTag(tag, startMark, tagMark);}
if(invalidMarks)
{
startMark = endMark = scanner_.peekToken().startMark;
}
bool implicit = (tag is null || tag == "!");
if(indentlessSequence && scanner_.checkToken(TokenID.BlockEntry))
{
state_ = &parseIndentlessSequenceEntry;
return sequenceStartEvent(startMark, scanner_.peekToken().endMark,
anchor, tag, implicit);
}
if(scanner_.checkToken(TokenID.Scalar))
{
Token token = scanner_.getToken();
//PyYAML uses a Tuple!(bool, bool) here, but the second bool
//is never used after that - so we don't use it.
implicit = (token.style == ScalarStyle.Plain && tag is null) || tag == "!";
state_ = popState();
return scalarEvent(startMark, token.endMark, anchor, tag,
implicit, token.value, token.style);
}
if(scanner_.checkToken(TokenID.FlowSequenceStart))
{
endMark = scanner_.peekToken().endMark;
state_ = &parseFlowSequenceEntry!true;
return sequenceStartEvent(startMark, endMark, anchor, tag, implicit);
}
if(scanner_.checkToken(TokenID.FlowMappingStart))
{
endMark = scanner_.peekToken().endMark;
state_ = &parseFlowMappingKey!true;
return mappingStartEvent(startMark, endMark, anchor, tag, implicit);
}
if(block && scanner_.checkToken(TokenID.BlockSequenceStart))
{
endMark = scanner_.peekToken().endMark;
state_ = &parseBlockSequenceEntry!true;
return sequenceStartEvent(startMark, endMark, anchor, tag, implicit);
}
if(block && scanner_.checkToken(TokenID.BlockMappingStart))
{
endMark = scanner_.peekToken().endMark;
state_ = &parseBlockMappingKey!true;
return mappingStartEvent(startMark, endMark, anchor, tag, implicit);
}
if(anchor !is null || tag !is null)
{
state_ = popState();
//PyYAML uses a tuple(implicit, false) for the second last arg here,
//but the second bool is never used after that - so we don't use it.
//Empty scalars are allowed even if a tag or an anchor is specified.
return scalarEvent(startMark, endMark, anchor, tag, implicit, "");
}
}
Token token = scanner_.peekToken();
throw new ParserException("While parsing a " ~ (block ? "block" : "flow") ~ " node",
startMark, "expected the node content, but found: "
~ to!string(token.id), token.startMark);
}
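/*
 * Node properties may appear in either order (see the properties rule
 * above); e.g. (illustrative)
 *
 *   - &id001 !!str value
 *   - !!str &id001 value
 *
 * both attach the anchor "id001" and the tag "tag:yaml.org,2002:str" to the
 * scalar, which is why parseNode() accepts the anchor and tag tokens in any
 * order.
 */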
/**
* Process a tag string retrieved from a tag token.
*
* Params: tag = Tag before processing.
* startMark = Position of the node the tag belongs to.
* tagMark = Position of the tag.
*/
string processTag(in string tag, in Mark startMark, in Mark tagMark)
{
//Tag handle and suffix are separated by '\0'.
const parts = tag.split("\0");
assert(parts.length == 2, "Tag data stored incorrectly in a token");
const handle = parts[0];
const suffix = parts[1];
if(handle.length > 0)
{
//handle must be in tagHandles_
enforce((handle in tagHandles_) !is null,
new ParserException("While parsing a node", startMark,
"found undefined tag handle: " ~ handle, tagMark));
return tagHandles_[handle] ~ suffix;
}
return suffix;
}
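/*
 * Illustrative resolutions with the default handles (a sketch, not from the
 * original comments):
 *
 *   handle "!!", suffix "str"   -> "tag:yaml.org,2002:str"
 *   handle "!",  suffix "local" -> "!local"
 *   empty handle, any suffix    -> the suffix is returned unchanged
 */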
///Wrappers to parse nodes.
Event parseBlockNode(){return parseNode(true);}
Event parseFlowNode(){return parseNode(false);}
Event parseBlockNodeOrIndentlessSequence(){return parseNode(true, true);}
///block_sequence ::= BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END
///Parse an entry of a block sequence. If first is true, this is the first entry.
Event parseBlockSequenceEntry(bool first)()
{
static if(first){marks_ ~= scanner_.getToken().startMark;}
if(scanner_.checkToken(TokenID.BlockEntry))
{
Token token = scanner_.getToken();
if(!scanner_.checkToken(TokenID.BlockEntry, TokenID.BlockEnd))
{
states_ ~= &parseBlockSequenceEntry!false;
return parseBlockNode();
}
state_ = &parseBlockSequenceEntry!false;
return processEmptyScalar(token.endMark);
}
if(!scanner_.checkToken(TokenID.BlockEnd))
{
Token token = scanner_.peekToken();
throw new ParserException("While parsing a block collection", marks_[$ - 1],
"expected block end, but found "
~ to!string(token.id), token.startMark);
}
state_ = popState();
popMark();
Token token = scanner_.getToken();
return sequenceEndEvent(token.startMark, token.endMark);
}
///indentless_sequence ::= (BLOCK-ENTRY block_node?)+
///Parse an entry of an indentless sequence.
Event parseIndentlessSequenceEntry()
{
if(scanner_.checkToken(TokenID.BlockEntry))
{
Token token = scanner_.getToken();
if(!scanner_.checkToken(TokenID.BlockEntry, TokenID.Key,
TokenID.Value, TokenID.BlockEnd))
{
states_ ~= &parseIndentlessSequenceEntry;
return parseBlockNode();
}
state_ = &parseIndentlessSequenceEntry;
return processEmptyScalar(token.endMark);
}
state_ = popState();
Token token = scanner_.peekToken();
return sequenceEndEvent(token.startMark, token.endMark);
}
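/*
 * Example of an indentless sequence (illustrative):
 *
 *   seq:
 *   - one
 *   - two
 *
 * The entries start at the same indentation as the "seq" key, so the scanner
 * emits no BLOCK-SEQUENCE-START/BLOCK-END pair; the sequence end event above
 * therefore reuses the position of the next token instead of consuming a
 * BLOCK-END.
 */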
/**
* block_mapping ::= BLOCK-MAPPING-START
*                   ((KEY block_node_or_indentless_sequence?)?
*                    (VALUE block_node_or_indentless_sequence?)?)*
*                   BLOCK-END
*/
///Parse a key in a block mapping. If first is true, this is the first key.
Event parseBlockMappingKey(bool first)()
{
static if(first){marks_ ~= scanner_.getToken().startMark;}
if(scanner_.checkToken(TokenID.Key))
{
Token token = scanner_.getToken();
if(!scanner_.checkToken(TokenID.Key, TokenID.Value, TokenID.BlockEnd))
{
states_ ~= &parseBlockMappingValue;
return parseBlockNodeOrIndentlessSequence();
}
state_ = &parseBlockMappingValue;
return processEmptyScalar(token.endMark);
}
if(!scanner_.checkToken(TokenID.BlockEnd))
{
Token token = scanner_.peekToken();
throw new ParserException("While parsing a block mapping", marks_[$ - 1],
"expected block end, but found: "
~ to!string(token.id), token.startMark);
}
state_ = popState();
popMark();
Token token = scanner_.getToken();
return mappingEndEvent(token.startMark, token.endMark);
}
///Parse a value in a block mapping.
Event parseBlockMappingValue()
{
if(scanner_.checkToken(TokenID.Value))
{
Token token = scanner_.getToken();
if(!scanner_.checkToken(TokenID.Key, TokenID.Value, TokenID.BlockEnd))
{
states_ ~= &parseBlockMappingKey!false;
return parseBlockNodeOrIndentlessSequence();
}
state_ = &parseBlockMappingKey!false;
return processEmptyScalar(token.endMark);
}
state_ = &parseBlockMappingKey!false;
return processEmptyScalar(scanner_.peekToken().startMark);
}
/**
* flow_sequence ::= FLOW-SEQUENCE-START
*                   (flow_sequence_entry FLOW-ENTRY)*
*                   flow_sequence_entry?
*                   FLOW-SEQUENCE-END
* flow_sequence_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
*
* Note that while the production rules for flow_sequence_entry and
* flow_mapping_entry are identical, they are interpreted differently.
* For flow_sequence_entry, the part `KEY flow_node? (VALUE flow_node?)?`
* generates an inline mapping (the set syntax); an example follows this comment.
*/
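/*
 * Illustrative example of that inline mapping case (a sketch, not from the
 * original comments): the flow sequence
 *
 *   [one, two: three]
 *
 * is parsed as a sequence whose second entry is a mapping containing the
 * single pair "two" -> "three".
 */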
///Parse an entry in a flow sequence. If first is true, this is the first entry.
Event parseFlowSequenceEntry(bool first)()
{
static if(first){marks_ ~= scanner_.getToken().startMark;}
if(!scanner_.checkToken(TokenID.FlowSequenceEnd))
{
static if(!first)
{
if(scanner_.checkToken(TokenID.FlowEntry))
{
scanner_.getToken();
}
else
{
Token token = scanner_.peekToken;
throw new ParserException("While parsing a flow sequence",
marks_[$ - 1],
"expected ',' or ']', but got: " ~
to!string(token.id), token.startMark);
}
}
if(scanner_.checkToken(TokenID.Key))
{
Token token = scanner_.peekToken();
state_ = &parseFlowSequenceEntryMappingKey;
return mappingStartEvent(token.startMark, token.endMark, null, null, true);
}
else if(!scanner_.checkToken(TokenID.FlowSequenceEnd))
{
states_ ~= &parseFlowSequenceEntry!false;
return parseFlowNode();
}
}
Token token = scanner_.getToken();
state_ = popState();
popMark();
return sequenceEndEvent(token.startMark, token.endMark);
}
///Parse a key in flow context.
Event parseFlowKey(in Event delegate() nextState)
{
Token token = scanner_.getToken();
if(!scanner_.checkToken(TokenID.Value, TokenID.FlowEntry,
TokenID.FlowSequenceEnd))
{
states_ ~= nextState;
return parseFlowNode();
}
state_ = nextState;
return processEmptyScalar(token.endMark);
}
///Parse a mapping key in an entry in a flow sequence.
Event parseFlowSequenceEntryMappingKey()
{
return parseFlowKey(&parseFlowSequenceEntryMappingValue);
}
///Parse a mapping value in a flow context.
Event parseFlowValue(TokenID checkId, in Event delegate() nextState)
{
if(scanner_.checkToken(TokenID.Value))
{
Token token = scanner_.getToken();
if(!scanner_.checkToken(TokenID.FlowEntry, checkId))
{
states_ ~= nextState;
return parseFlowNode();
}
state_ = nextState;
return processEmptyScalar(token.endMark);
}
state_ = nextState;
return processEmptyScalar(scanner_.peekToken().startMark);
}
///Parse a mapping value in an entry in a flow sequence.
Event parseFlowSequenceEntryMappingValue()
{
return parseFlowValue(TokenID.FlowSequenceEnd,
&parseFlowSequenceEntryMappingEnd);
}
///Parse end of a mapping in a flow sequence entry.
Event parseFlowSequenceEntryMappingEnd()
{
state_ = &parseFlowSequenceEntry!false;
Token token = scanner_.peekToken();
return mappingEndEvent(token.startMark, token.startMark);
}
/**
* flow_mapping ::= FLOW-MAPPING-START
*                  (flow_mapping_entry FLOW-ENTRY)*
*                  flow_mapping_entry?
*                  FLOW-MAPPING-END
* flow_mapping_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
*/
///Parse a key in a flow mapping.
Event parseFlowMappingKey(bool first)()
{
static if(first){marks_ ~= scanner_.getToken().startMark;}
if(!scanner_.checkToken(TokenID.FlowMappingEnd))
{
static if(!first)
{
if(scanner_.checkToken(TokenID.FlowEntry))
{
scanner_.getToken();
}
else
{
Token token = scanner_.peekToken;
throw new ParserException("While parsing a flow mapping",
marks_[$ - 1],
"expected ',' or '}', but got: " ~
to!string(token.id), token.startMark);
}
}
if(scanner_.checkToken(TokenID.Key))
{
return parseFlowKey(&parseFlowMappingValue);
}
if(!scanner_.checkToken(TokenID.FlowMappingEnd))
{
states_ ~= &parseFlowMappingEmptyValue;
return parseFlowNode();
}
}
Token token = scanner_.getToken();
state_ = popState();
popMark();
return mappingEndEvent(token.startMark, token.endMark);
}
///Parse a value in a flow mapping.
Event parseFlowMappingValue()
{
return parseFlowValue(TokenID.FlowMappingEnd, &parseFlowMappingKey!false);
}
///Parse an empty value in a flow mapping.
Event parseFlowMappingEmptyValue()
{
state_ = &parseFlowMappingKey!false;
return processEmptyScalar(scanner_.peekToken().startMark);
}
///Return an empty scalar.
Event processEmptyScalar(in Mark mark)
{
//PyYAML uses a tuple (true, false) for the second last arg here,
//but the second bool is never used after that - so we don't use it.
return scalarEvent(mark, mark, null, null, true, "");
}
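/*
 * Empty scalars like this are produced wherever the grammar allows a node to
 * be omitted, e.g. (illustrative) for a mapping key with no value ("key:")
 * or for a block entry with nothing after the "-".
 */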
}