aboutsummaryrefslogtreecommitdiff
path: root/src/syntax
diff options
context:
space:
mode:
Diffstat (limited to 'src/syntax')
-rw-r--r-- src/syntax/sexpr.cc 1014
1 files changed, 378 insertions, 636 deletions
diff --git a/src/syntax/sexpr.cc b/src/syntax/sexpr.cc
index 14113164..0a3b8071 100644
--- a/src/syntax/sexpr.cc
+++ b/src/syntax/sexpr.cc
@@ -38,73 +38,24 @@
#include <iterator>
#include <open-axiom/sexpr>
#include <open-axiom/FileMapping>
+#include <open-axiom/diagnostics>
namespace OpenAxiom {
namespace Sexpr {
- template<typename T, int N>
- static inline int
- length(const T(&)[N]) {
- return N;
- }
-
- template<typename Sequence>
- static inline typename Sequence::const_pointer
- begin_ptr(const Sequence& s) {
- return &*s.begin();
- }
-
- template<typename Sequence>
- static inline typename Sequence::const_pointer
- end_ptr(const Sequence& s) {
- return s.empty() ? 0 : &*s.begin() + s.size();
- }
-
- std::ostream&
- operator<<(std::ostream& os, const Token& t) {
- switch (t.type) {
- case Token::semicolon: os << "SEMICOLON"; break;
- case Token::dot: os << "DOT"; break;
- case Token::comma: os << "COMMA"; break;
- case Token::open_paren: os << "OPEN_PAREN"; break;
- case Token::close_paren: os << "CLOSE_PAREN"; break;
- case Token::apostrophe: os << "APOSTROPHE"; break;
- case Token::backquote: os << "BACKQUOTE"; break;
- case Token::backslash: os << "BACKSLASH"; break;
- case Token::sharp_open_paren: os << "SHARP_OPEN_PAREN"; break;
- case Token::sharp_apostrophe: os << "SHARP_APOSTROPHE"; break;
- case Token::sharp_colon: os << "SHARP_COLON"; break;
- case Token::sharp_plus: os << "SHARP_PLUS"; break;
- case Token::sharp_minus: os << "SHARP_MINUS"; break;
- case Token::sharp_dot: os << "SHARP_DOT"; break;
- case Token::comma_at: os << "COMMA_AT"; break;
- case Token::integer: os << "INTEGER"; break;
- case Token::character: os << "CHARACTER"; break;
- case Token::string: os << "STRING"; break;
- case Token::identifier: os << "IDENTIFIER"; break;
- case Token::sharp_integer_sharp:
- os << "SHARP_INTEGER_SHARP"; break;
- case Token::sharp_integer_equal:
- os << "SHARP_INTEGER_EQUAL"; break;
- default: os << "UNKNOWN"; break;
- }
- os << '(';
- if (t.lexeme != 0) {
- os << '"';
- std::copy(t.lexeme->begin(), t.lexeme->end(),
- std::ostream_iterator<char>(os));
- os << '"';
- }
- else
- os << "<missing>";
- return os << ')';
+ static void
+ invalid_character(Reader::State& s) {
+ auto line = std::to_string(s.lineno);
+ auto column = std::to_string(s.cur - s.line);
+ auto msg = "invalid character on line " + line +
+ " and column " + column;
+ if (isprint(*s.cur))
+ throw Diagnostics::BasicError(msg + ": " + std::string(1, *s.cur));
+ throw Diagnostics::BasicError(msg + " with code " + std::to_string(*s.cur));
}
-
- // -----------
- // -- Lexer --
- // -----------
+
static void
syntax_error(const std::string& s) {
- throw BasicError(s);
+ throw Diagnostics::BasicError(s);
}
// Return true if character `c' introduces a blank.
@@ -122,67 +73,67 @@ namespace OpenAxiom {
or c == '`' or c == '#';
}
- // Move `cur' past all consecutive blank characters, and
- // return the new position.
- static const Byte*
- skip_blank(const Byte*& cur, const Byte* end) {
- while (cur < end and is_blank(*cur))
- ++cur;
- return cur;
+ // Move the cursor past all consecutive blank characters, and
+ // return true if there are more input characters to consider.
+ static bool
+ skip_blank(Reader::State& s) {
+ for (bool done = false; s.cur < s.end and not done; )
+ switch (*s.cur) {
+ case '\n':
+ ++s.lineno;
+ s.line = ++s.cur;
+ break;
+ case ' ': case '\t': case '\v': case '\r': case '\f':
+ ++s.cur;
+ break;
+ default: done = true; break;
+ }
+ return s.cur < s.end;
}
// Move `cur' to end-of-line marker.
- static const Byte*
- skip_to_eol(const Byte*& cur, const Byte* end) {
+ static void
+ skip_to_eol(Reader::State& s) {
// FIXME: properly handle CR+LF.
- while (cur < end and *cur != '\n')
- ++cur;
- return cur;
- }
-
- // Move `cur' until a word boundary is reached.
- static const Byte*
- skip_to_word_boundary(const Byte*& cur, const Byte* end) {
- bool saw_escape = false;
- for (; cur < end; ++cur) {
- if (saw_escape)
- saw_escape = false;
- else if (*cur == '\\')
- saw_escape = true;
- else if (is_delimiter(*cur))
- break;
- }
- return cur;
+ while (s.cur < s.end and *s.cur != '\n')
+ ++s.cur;
}
// Move `cur' one-past a non-esacaped character `c'.
// Return true if the character was seen.
static bool
- skip_to_nonescaped_char(const Byte*& cur, const Byte* end, char c) {
- bool saw_escape = false;
- for (; cur < end; ++cur)
+ skip_to_nonescaped_char(Reader::State& s, char c) {
+ for (bool saw_escape = false; s.cur < s.end; ++s.cur)
if (saw_escape)
saw_escape = false;
- else if (*cur == '\\')
+ else if (*s.cur == '\\')
saw_escape = true;
- else if (*cur == c) {
- ++cur;
+ else if (*s.cur == c) {
+ ++s.cur;
return true;
}
return false;
}
- // Move `cur' past the closing quote of string literal.
- // Return true if the closing fence was effectively seen.
+ // Move the cursor past the closing quote of a string literal.
+ // Return true if the closing quote was actually seen.
static inline bool
- skip_to_quote(const Byte*& cur, const Byte* end) {
- return skip_to_nonescaped_char(cur, end, '"');
+ skip_to_quote(Reader::State& s) {
+ return skip_to_nonescaped_char(s, '"');
+ }
+
+ template<typename Pred>
+ static bool
+ advance_while(Reader::State& s, Pred p) {
+ while (s.cur < s.end and p(*s.cur))
+ ++s.cur;
+ return s.cur < s.end;
}
// Return true if the character `c' be part of a non-absolute
// identifier.
static bool
- identifier_part(char c) {
+ identifier_part(Byte c) {
switch (c) {
case '+': case '-': case '*': case '/': case '%': case '^':
case '~': case '@': case '$': case '&': case '=':
@@ -194,296 +145,117 @@ namespace OpenAxiom {
}
}
- // Return true if the character `c' has a special meaning after
- // the sharp character.
- static bool
- special_after_sharp(char c) {
- return c == '(' or c == '\'' or c == ':'
- or c == '+' or c == '-' or c == '.';
- }
-
- // Return true if the sequence `[cur, end)' has a prefix that is
- // an integer followrd by the equal sign or the sharp sign.
- // `cur' is moved along the way.
- static bool
- only_digits_before_equal_or_shap(const Byte*& cur, const Byte* end) {
- while (cur < end and isdigit(*cur))
- ++cur;
- return cur < end and (*cur == '#' or *cur == '=');
- }
-
- // The token `t' was thought to designate an identifier.
- // Reclassify it as an integer if, in fact, its lexeme consists
- // entirely of digits.
- static void
- maybe_reclassify(Token& t) {
- const Byte* cur = t.lexeme->begin();
- const Byte* end = t.lexeme->end();
- while (cur < end and isdigit(*cur))
- ++cur;
- if (cur == end)
- t.type = Token::integer;
- }
-
- // Returns true if the first characters in the range
- // [cur, last) start an identifier.
- static bool
- start_symbol(const Byte* cur, const Byte* last) {
- if (cur >= last)
- return false;
- return identifier_part(*cur)
- or *cur == '|' or *cur == ':';
- }
-
- // We are processing a symbol token. Accumulate all
- // legitimate characters till the end of the token.
- static void
- skip_to_end_of_symbol(const Byte*& cur, const Byte* end) {
- const char c = *cur;
- if (*cur == '|')
- skip_to_nonescaped_char(++cur, end, c);
- else
- skip_to_word_boundary(cur, end);
- if (cur < end and *cur == ':')
- skip_to_end_of_symbol(cur, end);
- }
-
- static Token
- match_maybe_symbol(Lexer* lexer, const Byte*& cur, const Byte* end) {
- Token t = { Token::identifier, 0 };
- const Byte* start = cur;
- skip_to_end_of_symbol(cur, end);
- t.lexeme = lexer->intern(start, cur - start);
- maybe_reclassify(t);
- return t;
- }
-
- const Byte*
- Lexer::tokenize(const Byte* cur, const Byte* end) {
- while (skip_blank(cur, end) < end) {
- Token t = { Token::unknown, 0 };
- switch (*cur) {
- case ';': {
- const Byte* start = cur;
- t.type = Token::semicolon;
- skip_to_eol(cur, end);
- t.lexeme = intern(start, cur - start);
- break;
- }
-
- case '.': case '(': case ')': case '\'': case '`':
- t.type = Token::Type(token::value(*cur));
- t.lexeme = intern(cur, 1);
- ++cur;
- break;
-
- case ',': {
- const Byte* start = cur;
- if (++cur < end and *cur == '@') {
- t.type = Token::comma_at;
- ++cur;
- }
- else
- t.type = Token::comma;
- t.lexeme = intern(start, cur - start);
- break;
- }
-
- case '\\':
- t = match_maybe_symbol(this, cur, end);
- break;
-
- case '#': {
- const Byte* start = cur;
- if (cur + 1 < end and special_after_sharp(cur[1])) {
- t.type = Token::Type(token::value(cur[0], cur[1]));
- t.lexeme = intern(cur, 2);
- cur += 2;
- }
- else if (cur + 1 < end and cur[1] == '\\') {
- start = cur += 2;
- if (not isalnum(*cur))
- ++cur;
- else
- skip_to_word_boundary(cur, end);
- t.type = Token::character;
- t.lexeme = intern(start, cur - start);
- }
- else if (only_digits_before_equal_or_shap(++cur, end)) {
- t.type = *cur == '#'
- ? Token::sharp_integer_sharp
- : Token::sharp_integer_equal;
- t.lexeme = intern(start, cur - start + 1);
- ++cur;
- }
- else {
- skip_to_word_boundary(cur, end);
- t.lexeme = intern(start, cur - start);
- }
- break;
- }
-
- case '"': {
- const Byte* start = cur;
- skip_to_quote(++cur, end);
- t.type = Token::string;
- t.lexeme = intern(start, cur - start);
- break;
- }
-
- default:
- if (start_symbol(cur, end))
- t = match_maybe_symbol(this, cur, end);
- else {
- const Byte* start = cur;
- skip_to_word_boundary(++cur, end);
- t.lexeme = intern(start, cur - start);
- }
- break;
- }
- tokens.push_back(t);
- }
- return cur;
- }
-
- // ----------
- // -- Atom --
- // ----------
- Atom::Atom(const Token& t) : tok(t) { }
+ // -- AtomSyntax --
+ AtomSyntax::AtomSyntax(const Lexeme& t) : lex(t) { }
void
- Atom::accept(Visitor& v) const {
+ AtomSyntax::accept(Visitor& v) const {
v.visit(*this);
}
- // -------------
- // -- Integer --
- // -------------
- Integer::Integer(const Token& t) : Atom(t) { }
+ // -- IntegerSyntax --
+ IntegerSyntax::IntegerSyntax(const Lexeme& t) : AtomSyntax(t) { }
void
- Integer::accept(Visitor& v) const {
+ IntegerSyntax::accept(Visitor& v) const {
v.visit(*this);
}
- // ---------------
- // -- Character --
- // ---------------
- Character::Character(const Token& t) : Atom(t) { }
+ // -- CharacterSyntax --
+ CharacterSyntax::CharacterSyntax(const Lexeme& t) : AtomSyntax(t) { }
void
- Character::accept(Visitor& v) const {
+ CharacterSyntax::accept(Visitor& v) const {
v.visit(*this);
}
- // ------------
- // -- String --
- // ------------
- String::String(const Token& t) : Atom(t) { }
+ // -- StringSyntax --
+ StringSyntax::StringSyntax(const Lexeme& t) : AtomSyntax(t) { }
void
- String::accept(Visitor& v) const {
+ StringSyntax::accept(Visitor& v) const {
v.visit(*this);
}
- // ------------
- // -- Symbol --
- // ------------
- Symbol::Symbol(const Token& t, Kind k) : Atom(t), sort(k) { }
+ // -- SymbolSyntax --
+ SymbolSyntax::SymbolSyntax(const Lexeme& t, Kind k)
+ : AtomSyntax(t), sort(k)
+ { }
void
- Symbol::accept(Visitor& v) const {
+ SymbolSyntax::accept(Visitor& v) const {
v.visit(*this);
}
- // ------------
- // -- Anchor --
- // ------------
- Anchor::Anchor(size_t t, const Syntax* s) : tag(t), val(s) { }
+ // -- AnchorSyntax --
+ AnchorSyntax::AnchorSyntax(size_t t, const Syntax* s) : tag(t), val(s) { }
void
- Anchor::accept(Visitor& v) const {
+ AnchorSyntax::accept(Visitor& v) const {
v.visit(*this);
}
- // ---------------
// -- Reference --
- // ---------------
- Reference::Reference(const Token& t, size_t v) : Atom(t), pos(v) { }
+ Reference::Reference(const Lexeme& t, Ordinal n)
+ : AtomSyntax(t), pos(n)
+ { }
void
Reference::accept(Visitor& v) const {
v.visit(*this);
}
- // -----------
- // -- Quote --
- // -----------
- Quote::Quote(const Syntax* s) : unary_form<Quote>(s) { }
+ // -- QuoteSyntax --
+ QuoteSyntax::QuoteSyntax(const Syntax* s)
+ : unary_form<QuoteSyntax>(s)
+ { }
- // ---------------
- // -- Antiquote --
- // ---------------
- Antiquote::Antiquote(const Syntax* s) : unary_form<Antiquote>(s) { }
+ // -- AntiquoteSyntax --
+ AntiquoteSyntax::AntiquoteSyntax(const Syntax* s)
+ : unary_form<AntiquoteSyntax>(s)
+ { }
- // ------------
// -- Expand --
- // ------------
Expand::Expand(const Syntax* s) : unary_form<Expand>(s) { }
- // ----------
// -- Eval --
- // ----------
Eval::Eval(const Syntax* s) : unary_form<Eval>(s) { }
- // ------------
// -- Splice --
- // ------------
Splice::Splice(const Syntax* s) : unary_form<Splice>(s) { }
- // --------------
// -- Function --
- // --------------
Function::Function(const Syntax* s) : unary_form<Function>(s) { }
- // -------------
// -- Include --
Include::Include(const Syntax* s) : unary_form<Include>(s) { }
- // -------------
// -- Exclude --
Exclude::Exclude(const Syntax* s) : unary_form<Exclude>(s) { }
- // -------------
- // -- DotTail --
- // -------------
- DotTail::DotTail(const Syntax* f) : unary_form<DotTail>(f) { }
+ // -- ListSyntax --
+ ListSyntax::ListSyntax() : dot(false) { }
- // ----------
- // -- List --
- // ----------
- List::List() { }
+ ListSyntax::ListSyntax(const base& elts, bool d)
+ : base(elts), dot(d)
+ { }
- List::List(const base& elts) : base(elts) { }
-
- List::~List() { }
+ ListSyntax::~ListSyntax() { }
void
- List::accept(Visitor& v) const {
+ ListSyntax::accept(Visitor& v) const {
v.visit(*this);
}
- // ------------
- // -- Vector --
- // ------------
- Vector::Vector() { }
+ // -- VectorSyntax --
+ VectorSyntax::VectorSyntax() { }
- Vector::Vector(const base& elts) : base(elts) { }
+ VectorSyntax::VectorSyntax(const base& elts) : base(elts) { }
- Vector::~Vector() { }
+ VectorSyntax::~VectorSyntax() { }
void
- Vector::accept(Visitor& v) const {
+ VectorSyntax::accept(Visitor& v) const {
v.visit(*this);
}
@@ -499,28 +271,28 @@ namespace OpenAxiom {
}
void
- Syntax::Visitor::visit(const Integer& i) {
- visit(as<Atom>(i));
+ Syntax::Visitor::visit(const IntegerSyntax& i) {
+ visit(as<AtomSyntax>(i));
}
void
- Syntax::Visitor::visit(const Character& c) {
- visit(as<Atom>(c));
+ Syntax::Visitor::visit(const CharacterSyntax& c) {
+ visit(as<AtomSyntax>(c));
}
void
- Syntax::Visitor::visit(const String& s) {
- visit(as<Atom>(s));
+ Syntax::Visitor::visit(const StringSyntax& s) {
+ visit(as<AtomSyntax>(s));
}
void
- Syntax::Visitor::visit(const Symbol& s) {
- visit(as<Atom>(s));
+ Syntax::Visitor::visit(const SymbolSyntax& s) {
+ visit(as<AtomSyntax>(s));
}
void
Syntax::Visitor::visit(const Reference& r) {
- visit(as<Atom>(r));
+ visit(as<AtomSyntax>(r));
}
// ---------------
@@ -533,42 +305,42 @@ namespace OpenAxiom {
// used templates floating around.
Allocator::~Allocator() { }
- const Character*
- Allocator::make_character(const Token& t) {
+ const CharacterSyntax*
+ Allocator::make_character(const Lexeme& t) {
return chars.make(t);
}
- const Integer*
- Allocator::make_integer(const Token& t) {
+ const IntegerSyntax*
+ Allocator::make_integer(const Lexeme& t) {
return ints.make(t);
}
- const String*
- Allocator::make_string(const Token& t) {
+ const StringSyntax*
+ Allocator::make_string(const Lexeme& t) {
return strs.make(t);
}
- const Symbol*
- Allocator::make_symbol(const Token& t, Symbol::Kind k) {
+ const SymbolSyntax*
+ Allocator::make_symbol(SymbolSyntax::Kind k, const Lexeme& t) {
return syms.make(t, k);
}
- const Anchor*
- Allocator::make_anchor(size_t t, const Syntax* s) {
- return ancs.make(t, s);
- }
-
const Reference*
- Allocator::make_reference(const Token& t, size_t i) {
+ Allocator::make_reference(size_t i, const Lexeme& t) {
return refs.make(t, i);
}
- const Quote*
+ const AnchorSyntax*
+ Allocator::make_anchor(size_t t, const Syntax* s) {
+ return ancs.make(t, s);
+ }
+
+ const QuoteSyntax*
Allocator::make_quote(const Syntax* s) {
return quotes.make(s);
}
- const Antiquote*
+ const AntiquoteSyntax*
Allocator::make_antiquote(const Syntax* s) {
return antis.make(s);
}
@@ -603,53 +375,20 @@ namespace OpenAxiom {
return excs.make(s);
}
- const DotTail*
- Allocator::make_dot_tail(const Syntax* f) {
- return tails.make(f);
- }
-
- const List*
- Allocator::make_list(const std::vector<const Syntax*>& elts) {
+ const ListSyntax*
+ Allocator::make_list(const std::vector<const Syntax*>& elts, bool dot) {
if (elts.empty())
return &empty_list;
- return lists.make(elts);
+ return lists.make(elts, dot);
}
- const Vector*
+ const VectorSyntax*
Allocator::make_vector(const std::vector<const Syntax*>& elts) {
if (elts.empty())
return &empty_vector;
return vectors.make(elts);
}
- // ------------
- // -- Parser --
- // ------------
-
- // Signal a parse error
- static void
- parse_error(const std::string& s) {
- throw BasicError(s);
- }
-
- // Signal that an expected syntax object was missing
- static void
- expected_syntax(const std::string& s) {
- parse_error("expected " + s);
- }
-
- // Signal an abrupt end of input
- static void
- unexpected_end_of_input(const std::string& s) {
- parse_error("unexpected end of input after " + s);
- }
-
- // Signal a missing closing parenthesis
- static void
- missing_closer_for(const std::string& s) {
- parse_error("missing closing parenthesis for " + s);
- }
-
// The sequence of characters in [cur, last) consists
// entirely of digits. Return the corresponding natural value.
static size_t
@@ -661,274 +400,277 @@ namespace OpenAxiom {
return n;
}
- // Parse a plain identifier or a Lisp-style keyword identifier.
- const Symbol*
- Parser::parse_symbol(const Token*& cur, const Token* last) {
- Symbol::Kind kind = *cur->lexeme->begin() == ':'
- ? Symbol::keyword
- : Symbol::ordinary;
- return alloc.make_symbol(*cur++, kind);
- }
-
- // List of lower case character names
- static const char* charname[] = {
- "newline", "space", "page", "tab",
- "backspace", "return", "linefeed"
- };
-
- static bool
- equal_character_name(BasicString lhs, const char* rhs) {
- if (lhs->size() != strlen(rhs))
- return false;
- for (const Byte* cur = lhs->begin(); cur != lhs->end(); ++cur)
- if (tolower(*cur) != *rhs++)
- return false;
- return true;
- }
-
- static bool
- valid_character_name(BasicString s) {
- for (int i = 0; i < length(charname); ++i)
- if (equal_character_name(s, charname[i]))
- return true;
- return false;
- }
-
- const Character*
- Parser::parse_character(const Token*& cur, const Token* last) {
- if (cur->lexeme->size() != 1
- and not valid_character_name(cur->lexeme))
- parse_error("invalid literal character syntax");
- return alloc.make_character(*cur++);
- }
-
- // Parse an anchor definition of the form #n=<syntax>
- const Anchor*
- Parser::parse_anchor(const Token*& cur, const Token* last) {
- const size_t n = natural_value(cur->lexeme->begin() + 1,
- cur->lexeme->end() - 1);
- if (++cur == last)
- unexpected_end_of_input("sharp-integer-equal sign");
- return alloc.make_anchor(n, parse_syntax(cur, last));
- }
-
- // Parse a reference to an anchor, #n#
- const Reference*
- Parser::parse_reference(const Token*& cur, const Token* last) {
- const size_t n = natural_value(cur->lexeme->begin() + 1,
- cur->lexeme->end() - 1);
- return alloc.make_reference(*cur++, n);
- }
-
- // Parse an uninterned symbol #:<identifier>
- const Symbol*
- Parser::parse_uninterned(const Token*& cur, const Token* last) {
- if (cur == last or cur->type != Token::identifier)
- expected_syntax("symbol after sharp-colon sign");
- // FIXME: check that the identifier is not a keyword.
- return alloc.make_symbol(*cur++, Symbol::uninterned);
- }
-
- // Parse a function syntax: #'<syntax>
- const Function*
- Parser::parse_function(const Token*& cur, const Token* last) {
- if (cur == last)
- unexpected_end_of_input("sharp-quote sign");
- return alloc.make_function(parse_syntax(cur, last));
- }
-
- // Parse a quotation
- const Quote*
- Parser::parse_quote(const Token*& cur, const Token* last) {
- if (cur == last)
- unexpected_end_of_input("quote sign");
- return alloc.make_quote(parse_syntax(cur, last));
- }
-
- // Parse an antiquotation
- const Antiquote*
- Parser::parse_antiquote(const Token*& cur, const Token* last) {
- if (cur == last)
- unexpected_end_of_input("backquote sign");
- return alloc.make_antiquote(parse_syntax(cur, last));
- }
-
- // Parse an expansion request form
- const Expand*
- Parser::parse_expand(const Token*& cur, const Token* last) {
- const Syntax* s = parse_syntax(cur, last);
- if (s == 0)
- unexpected_end_of_input("comma sign");
- return alloc.make_expand(s);
- }
-
- // Parse conditional inclusions
- const Include*
- Parser::parse_include(const Token*& cur, const Token* last) {
- const Syntax* s = parse_syntax(cur, last);
- if (s == 0)
- unexpected_end_of_input("sharp-plus sign");
- return alloc.make_include(s);
- }
-
- const Exclude*
- Parser::parse_exclude(const Token*& cur, const Token* last) {
- const Syntax* s = parse_syntax(cur, last);
- if (s == 0)
- unexpected_end_of_input("sharp-minus sign");
- return alloc.make_exclude(s);
- }
-
- const Eval*
- Parser::parse_eval(const Token*& cur, const Token* last) {
- const Syntax* s = parse_syntax(cur, last);
- if (s == 0)
- unexpected_end_of_input("sharp-dot sign");
- return alloc.make_eval(s);
- }
-
- const Splice*
- Parser::parse_splice(const Token*& cur, const Token* last) {
- const Syntax* s = parse_syntax(cur, last);
- if (s == 0)
- unexpected_end_of_input("comma-at sign");
- return alloc.make_splice(s);
- }
-
- // Skip tokens that are semantically blanks, e.g. comments.
- // Return true if not at end of tokens.
- static bool
- skip_ignorable_tokens(const Token*& cur, const Token* last) {
- while (cur < last and cur->type == Token::semicolon)
- ++cur;
- return cur != last;
- }
-
- // Parse a vector of syntax objects: #(s .. s)
- const Vector*
- Parser::parse_vector(const Token*& cur, const Token* last) {
- std::vector<const Syntax*> elts;
- while (skip_ignorable_tokens(cur, last)
- and cur->type != Token::close_paren)
- elts.push_back(parse_syntax(cur, last));
- if (cur == last)
- missing_closer_for("vector");
- ++cur;
- return alloc.make_vector(elts);
- }
-
- // Constructs a pair or a list syntax object.
- const List*
- Parser::parse_list(const Token*& cur, const Token* last) {
- std::vector<const Syntax*> elts;
- while (skip_ignorable_tokens(cur, last)
- and cur->type != Token::close_paren) {
- if (cur->type == Token::dot) {
- skip_ignorable_tokens(++cur, last);
- if (const Syntax* s = parse_syntax(cur, last)) {
- elts.push_back(alloc.make_dot_tail(s));
- break;
- }
+ // -- Reader --
+ Reader::Reader(const Byte* f, const Byte* l)
+ : st{ f, l, f, f, 1, }
+ { }
+
+ static const Syntax* read_sexpr(Reader::State&);
+
+ // Parse a string literal
+ static const Syntax*
+ read_string(Reader::State& s) {
+ auto start = s.cur++;
+ if (not skip_to_quote(s))
+ syntax_error("missing closing quote sign for string literal");
+ Lexeme t = { { start, s.cur }, s.lineno };
+ return s.alloc.make_string(t);
+ }
+
+ // Parse an absolute identifier.
+ static const Syntax*
+ read_absolute_symbol(Reader::State& s) {
+ auto start = ++s.cur;
+ if (not skip_to_nonescaped_char(s, '|'))
+ syntax_error("missing closing bar sign for an absolute symbol");
+ Lexeme t = { { start, s.cur - 1 }, s.lineno };
+ return s.alloc.make_symbol(SymbolSyntax::absolute, t);
+ }
+
+ // Read an atom starting with digits.
+ static const Syntax*
+ read_maybe_natural(Reader::State& s) {
+ auto start = s.cur;
+ advance_while (s, isdigit);
+ if (s.cur >= s.end or is_delimiter(*s.cur)) {
+ Lexeme t = { { start, s.cur }, s.lineno };
+ return s.alloc.make_integer(t);
+ }
+ advance_while(s, identifier_part);
+ Lexeme t = { { start, s.cur }, s.lineno };
+ return s.alloc.make_symbol(SymbolSyntax::ordinary, t);
+ }
+
+ // Read an identifier.
+ static const Syntax*
+ read_identifier(Reader::State& s) {
+ auto start = s.cur;
+ advance_while(s, identifier_part);
+ Lexeme t = { { start, s.cur }, s.lineno };
+ return s.alloc.make_symbol(SymbolSyntax::ordinary, t);
+ }
+
+ // Read an atom starting with a '+' or '-' sign; this
+ // should be an identifier or a signed integer.
+ static const Syntax*
+ read_maybe_signed_number(Reader::State& s) {
+ auto start = s.cur++;
+ if (s.cur < s.end and isdigit(*s.cur)) {
+ advance_while(s, isdigit);
+ if (s.cur >= s.end or is_delimiter(*s.cur)) {
+ Lexeme t = { { start, s.cur }, s.lineno };
+ return s.alloc.make_integer(t);
}
- elts.push_back(parse_syntax(cur, last));
}
- if (cur == last or cur->type != Token::close_paren)
- missing_closer_for("list");
- ++cur;
- return alloc.make_list(elts);
+ advance_while(s, identifier_part);
+ Lexeme t = { { start, s.cur }, s.lineno };
+ return s.alloc.make_symbol(SymbolSyntax::ordinary, t);
}
- Parser::Parser(Allocator& a, std::vector<const Syntax*>& v)
- : alloc(a), syns(v) { }
-
- static std::string
- to_string(BasicString s) {
- return { s->begin(), s->end() };
+ static const Syntax*
+ read_keyword(Reader::State& s) {
+ auto start = s.cur++;
+ advance_while(s, identifier_part);
+ Lexeme t = { { start, s.cur }, s.lineno };
+ return s.alloc.make_symbol(SymbolSyntax::keyword, t);
}
- const Syntax*
- Parser::parse_syntax(const Token*& cur, const Token* last) {
- if (not skip_ignorable_tokens(cur, last))
- return 0;
-
- switch (cur->type) {
- case Token::integer:
- return alloc.make_integer(*cur++);
-
- case Token::character:
- return parse_character(cur, last);
-
- case Token::string:
- return alloc.make_string(*cur++);
-
- case Token::identifier:
- return parse_symbol(cur, last);
-
- case Token::sharp_integer_equal:
- return parse_anchor(cur, last);
-
- case Token::sharp_integer_sharp:
- return parse_reference(cur, last);
-
- case Token::sharp_colon:
- return parse_uninterned(++cur, last);
+ // Read an atom.
+ static const Syntax*
+ read_atom(Reader::State& s) {
+ switch (*s.cur) {
+ case '"': return read_string(s);
+ case ':': return read_keyword(s);
+ case '-': case '+': return read_maybe_signed_number(s);
- case Token::sharp_apostrophe:
- return parse_function(++cur, last);
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ return read_maybe_natural(s);
- case Token::sharp_open_paren:
- return parse_vector(++cur, last);
-
- case Token::apostrophe:
- return parse_quote(++cur, last);
-
- case Token::open_paren:
- return parse_list(++cur, last);
-
- case Token::sharp_plus:
- return parse_include(++cur, last);
-
- case Token::sharp_minus:
- return parse_exclude(++cur, last);
-
- case Token::sharp_dot:
- return parse_eval(++cur, last);
-
- case Token::backquote:
- return parse_antiquote(++cur, last);
-
- case Token::comma:
- return parse_expand(++cur, last);
+ default:
+ if (identifier_part(*s.cur))
+ return read_identifier(s);
+ invalid_character(s);
+ ++s.cur;
+ return nullptr;
+ }
+ }
- case Token::comma_at:
- return parse_splice(++cur, last);
+ // Parse a quote expression.
+ static const Syntax*
+ read_quote(Reader::State& s) {
+ ++s.cur; // skip the quote character
+ auto x = read_sexpr(s);
+ if (x == nullptr)
+ syntax_error("end of input reached after quote sign");
+ return s.alloc.make_quote(x);
+ }
+
+ // Parse a backquote expression.
+ static const Syntax*
+ read_backquote(Reader::State& s) {
+ ++s.cur; // skip the backquote character
+ auto x = read_sexpr(s);
+ if (x == nullptr)
+ syntax_error("end of input reached after backquote sign");
+ return s.alloc.make_antiquote(x);
+ }
+
+ // We've just seen "#(" indicating the start of a literal
+ // vector. Read the elements and return the corresponding form.
+ static const Syntax*
+ finish_literal_vector(Reader::State& s) {
+ ++s.cur; // Skip the open paren.
+ std::vector<const Syntax*> elts { };
+ while (skip_blank(s) and *s.cur != ')') {
+ if (auto x = read_sexpr(s))
+ elts.push_back(x);
+ else
+ syntax_error("syntax error while reading vector elements");
+ }
+ if (s.cur >= s.end)
+ syntax_error("unfinished literal vector");
+ else
+ ++s.cur;
+ return s.alloc.make_vector(elts);
+ }
+
+ // We've just seen the sharp sign followed by a digit. We assume
+ // we are about to read an anchor or a back reference.
+ static const Syntax*
+ finish_anchor_or_reference(Reader::State& s) {
+ auto start = s.cur;
+ advance_while(s, isdigit);
+ if (s.cur >= s.end)
+ syntax_error("end-of-input after sharp-number sign");
+ const Byte c = *s.cur;
+ if (c != '#' and c != '=')
+ syntax_error("syntax error after sharp-number-equal sign");
+ Lexeme t = { { start, s.cur }, s.lineno };
+ auto n = natural_value(start, s.cur);
+ ++s.cur;
+ if (c == '#')
+ return s.alloc.make_reference(n, t);
+ auto x = read_sexpr(s);
+ if (x == nullptr)
+ syntax_error("syntax error after sharp-number-equal sign");
+ return s.alloc.make_anchor(n, x);
+ }
+
+ static const Syntax*
+ finish_function(Reader::State& s) {
+ ++s.cur; // skip quote sign.
+ auto x = read_sexpr(s);
+ if (x == nullptr)
+ syntax_error("missing function designator after sharp-quote sign");
+ return s.alloc.make_function(x);
+ }
+
+ static const Syntax*
+ finish_uninterned_symbol(Reader::State& s) {
+ ++s.cur; // skip colon sign.
+ auto start = s.cur;
+ advance_while(s, identifier_part);
+ Lexeme t = { { start, s.cur }, s.lineno };
+ return s.alloc.make_symbol(SymbolSyntax::uninterned, t);
+ }
+
+ static const Syntax*
+ finish_readtime_eval(Reader::State& s) {
+ ++s.cur; // skip dot sign.
+ auto x = read_sexpr(s);
+ if (x == nullptr)
+ syntax_error("parse error after sharp-dot sign");
+ return s.alloc.make_eval(x);
+ }
+
+ static const Syntax*
+ finish_character(Reader::State& s) {
+ ++s.cur; // skip backslash sign
+ auto start = s.cur;
+ advance_while(s, identifier_part);
+ Lexeme t = { { start, s.cur }, s.lineno };
+ return s.alloc.make_character(t);
+ }
+
+ static const Syntax*
+ read_sharp_et_al(Reader::State& s) {
+ if (++s.cur >= s.end)
+ syntax_error("end-of-input reached after sharp sign");
+ switch (*s.cur) {
+ case '(': return finish_literal_vector(s);
+ case '\'': return finish_function(s);
+ case ':': return finish_uninterned_symbol(s);
+ case '.': return finish_readtime_eval(s);
+ case '\\': return finish_character(s);
default:
- parse_error(std::string("parse error before ")
- + to_string(cur->lexeme));
- return 0; // never executed
+ if (isdigit(*s.cur))
+ return finish_anchor_or_reference(s);
+ syntax_error("syntax error after sharp-sign");
}
- }
+ return nullptr;
+ }
+
+ // We have just seen a dot; read the tail and the closing parenthesis.
+ static const Syntax*
+ finish_dotted_list(Reader::State& s, std::vector<const Syntax*>& elts) {
+ ++s.cur; // Skip dot sign.
+ auto x = read_sexpr(s);
+ if (x == nullptr)
+ syntax_error("missing expression after dot sign");
+ if (not skip_blank(s) or *s.cur != ')')
+ syntax_error("missing closing parenthesis");
+ ++s.cur;
+ elts.push_back(x);
+ return s.alloc.make_list(elts, true);
+ }
+
+ static const Syntax*
+ read_pair(Reader::State& s) {
+ ++s.cur; // skip opening parenthesis
+ std::vector<const Syntax*> elts { };
+ while (skip_blank(s))
+ switch (*s.cur) {
+ case '.':
+ if (elts.empty())
+ syntax_error("missing expression before dot sign.");
+ return finish_dotted_list(s, elts);
+
+ case ')':
+ ++s.cur;
+ return s.alloc.make_list(elts);
- const Token*
- Parser::parse(const Token* cur, const Token* last) {
- while (cur < last)
- if (const Syntax* s = parse_syntax(cur, last))
- syns.push_back(s);
- return cur;
+ default:
+ if (auto x = read_sexpr(s))
+ elts.push_back(x);
+ else
+ syntax_error("unfinished pair expression");
+ break;
+ }
+ syntax_error("end-of-input while looking for closing parenthesis");
+ return nullptr;
+ }
+
+ static const Syntax*
+ read_sexpr(Reader::State& s) {
+ while (skip_blank(s))
+ switch (*s.cur) {
+ case ';': skip_to_eol(s); break;
+ case '\'': return read_quote(s);
+ case '`': return read_backquote(s);
+ case '|': return read_absolute_symbol(s);
+ case '#': return read_sharp_et_al(s);
+ case '(': return read_pair(s);
+ default: return read_atom(s);
+ }
+ return nullptr;
}
- Module::Module(const std::string& s) : nm(s) {
- std::vector<Token> tokens;
- Memory::FileMapping input(s);
- Lexer lexer(raw_strs, tokens);
- const Byte* rest = lexer.tokenize(input.begin(), input.end());
- if (rest != input.end())
- syntax_error("syntax error");
- Parser parser(allocator, *this);
- const Token* tok = parser.parse(begin_ptr(tokens), end_ptr(tokens));
- if (tok != end_ptr(tokens))
- parse_error("parse error");
+ const Syntax*
+ Reader::read() {
+ return read_sexpr(st);
}
+
}
}