// Copyright (C) 2010-2013, Gabriel Dos Reis. // All rights reserved. // Written by Gabriel Dos Reis. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // - Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // // - Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in // the documentation and/or other materials provided with the // distribution. // // - Neither the name of The Numerical Algorithms Group Ltd. nor the // names of its contributors may be used to endorse or promote products // derived from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A // PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER // OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // --% Author: Gabriel Dos Reis. #include #include #include #include #include #include #include namespace OpenAxiom { namespace Sexpr { static void invalid_character(Reader::State& s) { auto line = std::to_string(s.bytes.lineno); auto column = std::to_string(s.cur - s.line); auto msg = "invalid character on line " + line + " and column " + column; if (isprint(*s.cur)) throw Diagnostics::BasicError(msg + ": " + std::string(1, *s.cur)); throw Diagnostics::BasicError(msg + " with code " + std::to_string(*s.cur)); } static void syntax_error(const std::string& s) { throw Diagnostics::BasicError(s); } // Return true if character `c' introduces a blank. static bool is_blank(char c) { return c == ' ' or c == '\t' or c == '\v' or c == '\n' or c == '\f' or c == '\r'; } // Return true if the character `c' introduces a delimiter. static bool is_delimiter(char c) { return is_blank(c) or c == '(' or c == ')' or c == '\'' or c == '`' or c == '#'; } // Move the cursor past all consecutive blank characters, and // return true if there are more input characters to consider. static bool skip_blank(Reader::State& s) { for (bool done = false; s.cur < s.bytes.end and not done; ) switch (*s.cur) { case '\n': ++s.bytes.lineno; s.line = ++s.cur; break; case ' ': case '\t': case '\v': case '\r': case '\f': ++s.cur; break; default: done = true; break; } return s.cur < s.bytes.end; } // Move `cur' to end-of-line marker. static void skip_to_eol(Reader::State& s) { // FIXME: properly handle CR+LF. while (s.cur < s.bytes.end and *s.cur != '\n') ++s.cur; } // Move `cur' one-past a non-esacaped character `c'. // Return true if the character was seen. static bool skip_to_nonescaped_char(Reader::State& s, char c) { for (bool saw_escape = false; s.cur < s.bytes.end; ++s.cur) if (saw_escape) saw_escape = false; else if (*s.cur == '\\') saw_escape = true; else if (*s.cur == c) { ++s.cur; return true; } return false; } // Move the cursor past the closing quote of string literal. // Return true if the closing quote was effectively seen. static inline bool skip_to_quote(Reader::State& s) { return skip_to_nonescaped_char(s, '"'); } template static bool advance_while(Reader::State& s, Pred p) { while (s.cur < s.bytes.end and p(*s.cur)) ++s.cur; return s.cur < s.bytes.end; } // Return true if the character `c' be part of a non-absolute // identifier. static bool identifier_part(Byte c) { switch (c) { case '+': case '-': case '*': case '/': case '%': case '^': case '~': case '@': case '$': case '&': case '=': case '<': case '>': case '?': case '!': case '_': case '[': case ']': case '{': case '}': return true; default: return isalnum(c); } } // -- AtomSyntax -- AtomSyntax::AtomSyntax(const Lexeme& t) : lex(t) { } // -- IntegerSyntax -- IntegerSyntax::IntegerSyntax(const Lexeme& t) : AtomSyntax(t) { } void IntegerSyntax::accept(Visitor& v) const { v.visit(*this); } // -- CharacterSyntax -- CharacterSyntax::CharacterSyntax(const Lexeme& t) : AtomSyntax(t) { } void CharacterSyntax::accept(Visitor& v) const { v.visit(*this); } // -- StringSyntax -- StringSyntax::StringSyntax(const Lexeme& t) : AtomSyntax(t) { } void StringSyntax::accept(Visitor& v) const { v.visit(*this); } // -- SymbolSyntax -- SymbolSyntax::SymbolSyntax(const Lexeme& t, Kind k) : AtomSyntax(t), sort(k) { } void SymbolSyntax::accept(Visitor& v) const { v.visit(*this); } // -- AnchorSyntax -- AnchorSyntax::AnchorSyntax(size_t t, const Syntax* s) : tag(t), val(s) { } void AnchorSyntax::accept(Visitor& v) const { v.visit(*this); } // -- ReferenceSyntax -- ReferenceSyntax::ReferenceSyntax(const Lexeme& t, Ordinal n) : AtomSyntax(t), pos(n) { } void ReferenceSyntax::accept(Visitor& v) const { v.visit(*this); } // -- QuoteSyntax -- QuoteSyntax::QuoteSyntax(const Syntax* s) : unary_form(s) { } // -- AntiquoteSyntax -- AntiquoteSyntax::AntiquoteSyntax(const Syntax* s) : unary_form(s) { } // -- Expand -- Expand::Expand(const Syntax* s) : unary_form(s) { } // -- Eval -- Eval::Eval(const Syntax* s) : unary_form(s) { } // -- Splice -- Splice::Splice(const Syntax* s) : unary_form(s) { } // -- Function -- Function::Function(const Syntax* s) : unary_form(s) { } // -- Include -- Include::Include(const Syntax* c, const Syntax* s) : binary_form(c, s) { } // -- Exclude -- Exclude::Exclude(const Syntax* c, const Syntax* s) : binary_form(c, s) { } // -- ListSyntax -- ListSyntax::ListSyntax() : dot(false) { } ListSyntax::ListSyntax(const base& elts, bool d) : base(elts), dot(d) { } ListSyntax::~ListSyntax() { } void ListSyntax::accept(Visitor& v) const { v.visit(*this); } // -- VectorSyntax -- VectorSyntax::VectorSyntax() { } VectorSyntax::VectorSyntax(const base& elts) : base(elts) { } VectorSyntax::~VectorSyntax() { } void VectorSyntax::accept(Visitor& v) const { v.visit(*this); } // --------------- // -- Allocator -- // --------------- Allocator::Allocator() { } // This destructor is defined here so that it provides // a single instantiation point for destructors of all // used templates floating around. Allocator::~Allocator() { } const CharacterSyntax* Allocator::make_character(const Lexeme& t) { return chars.make(t); } const IntegerSyntax* Allocator::make_integer(const Lexeme& t) { return ints.make(t); } const StringSyntax* Allocator::make_string(const Lexeme& t) { return strs.make(t); } const SymbolSyntax* Allocator::make_symbol(SymbolSyntax::Kind k, const Lexeme& t) { return syms.make(t, k); } const ReferenceSyntax* Allocator::make_reference(size_t i, const Lexeme& t) { return refs.make(t, i); } const AnchorSyntax* Allocator::make_anchor(size_t t, const Syntax* s) { return ancs.make(t, s); } const QuoteSyntax* Allocator::make_quote(const Syntax* s) { return quotes.make(s); } const AntiquoteSyntax* Allocator::make_antiquote(const Syntax* s) { return antis.make(s); } const Expand* Allocator::make_expand(const Syntax* s) { return exps.make(s); } const Eval* Allocator::make_eval(const Syntax* s) { return evls.make(s); } const Splice* Allocator::make_splice(const Syntax* s) { return spls.make(s); } const Function* Allocator::make_function(const Syntax* s) { return funs.make(s); } const Include* Allocator::make_include(const Syntax* c, const Syntax* s) { return incs.make(c, s); } const Exclude* Allocator::make_exclude(const Syntax* c, const Syntax* s) { return excs.make(c, s); } const ListSyntax* Allocator::make_list(const std::vector& elts, bool dot) { if (elts.empty()) return &empty_list; return lists.make(elts, dot); } const VectorSyntax* Allocator::make_vector(const std::vector& elts) { if (elts.empty()) return &empty_vector; return vectors.make(elts); } // The sequence of characters in [cur, last) consists // entirely of digits. Return the corresponding natural value. static size_t natural_value(const Byte* cur, const Byte* last) { size_t n = 0; for (; cur < last; ++cur) // FIXME: check for overflow. n = 10 * n + (*cur - '0'); return n; } // -- Reader -- Reader::Reader(const Byte* f, const Byte* l) : st{ { f, l, 1 }, f, f } { } Reader::Reader(const RawInput& ri) : st { ri, ri.start, ri.start } { } static const Syntax* read_sexpr(Reader::State&); // Parse a string literal static const Syntax* read_string(Reader::State& s) { auto start = s.cur++; if (not skip_to_quote(s)) syntax_error("missing closing quote sign for string literal"); Lexeme t = { { start, s.cur }, s.bytes.lineno }; return s.alloc.make_string(t); } // Parse an absolute identifier. static const Syntax* read_absolute_symbol(Reader::State& s) { auto start = ++s.cur; if (not skip_to_nonescaped_char(s, '|')) syntax_error("missing closing bar sign for an absolute symbol"); Lexeme t = { { start, s.cur - 1 }, s.bytes.lineno }; return s.alloc.make_symbol(SymbolSyntax::absolute, t); } // Read an atom starting with digits. static const Syntax* read_maybe_natural(Reader::State& s) { auto start = s.cur; advance_while (s, isdigit); if (s.cur >= s.bytes.end or is_delimiter(*s.cur)) { Lexeme t = { { start, s.cur }, s.bytes.lineno }; return s.alloc.make_integer(t); } advance_while(s, identifier_part); Lexeme t = { { start, s.cur }, s.bytes.lineno }; return s.alloc.make_symbol(SymbolSyntax::ordinary, t); } // Read an identifier. static const Syntax* read_identifier(Reader::State& s) { auto start = s.cur; advance_while(s, identifier_part); Lexeme t = { { start, s.cur }, s.bytes.lineno }; return s.alloc.make_symbol(SymbolSyntax::ordinary, t); } // Read an atom starting with a '+' or '-' sign; this // should be identifier, or a signed integer. static const Syntax* read_maybe_signed_number(Reader::State& s) { auto start = s.cur++; if (s.cur < s.bytes.end and isdigit(*s.cur)) { advance_while(s, isdigit); if (s.cur >= s.bytes.end or is_delimiter(*s.cur)) { Lexeme t = { { start, s.cur }, s.bytes.lineno }; return s.alloc.make_integer(t); } } advance_while(s, identifier_part); Lexeme t = { { start, s.cur }, s.bytes.lineno }; return s.alloc.make_symbol(SymbolSyntax::ordinary, t); } static const Syntax* read_keyword(Reader::State& s) { auto start = s.cur++; advance_while(s, identifier_part); Lexeme t = { { start, s.cur }, s.bytes.lineno }; return s.alloc.make_symbol(SymbolSyntax::keyword, t); } // Read an atom. static const Syntax* read_atom(Reader::State& s) { switch (*s.cur) { case '"': return read_string(s); case ':': return read_keyword(s); case '-': case '+': return read_maybe_signed_number(s); case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': return read_maybe_natural(s); default: if (identifier_part(*s.cur)) return read_identifier(s); invalid_character(s); ++s.cur; return nullptr; } } // Parse a quote expression. static const Syntax* read_quote(Reader::State& s) { ++s.cur; // skip the quote character auto x = read_sexpr(s); if (x == nullptr) syntax_error("end of input reached after quote sign"); return s.alloc.make_quote(x); } // Parse a backquote expression. static const Syntax* read_backquote(Reader::State& s) { ++s.cur; // skip the backquote character auto x = read_sexpr(s); if (x == nullptr) syntax_error("end of input reached after backquote sign"); return s.alloc.make_antiquote(x); } // We've just seen "#(" indicating the start of a literal // vector. Read the elements and return the corresponding form. static const Syntax* finish_literal_vector(Reader::State& s) { ++s.cur; // Skip the open paren. std::vector elts { }; while (skip_blank(s) and *s.cur != ')') { if (auto x = read_sexpr(s)) elts.push_back(x); else syntax_error("syntax error while reading vector elements"); } if (s.cur >= s.bytes.end) syntax_error("unfinished literal vector"); else ++s.cur; return s.alloc.make_vector(elts); } // We've just seen the sharp sign followed by a digit. We assume // we are about to read an anchor or a back reference. static const Syntax* finish_anchor_or_reference(Reader::State& s) { auto start = s.cur; advance_while(s, isdigit); if (s.cur >= s.bytes.end) syntax_error("end-of-input after sharp-number sign"); const Byte c = *s.cur; if (c != '#' and c != '=') syntax_error("syntax error after sharp-number-equal sign"); Lexeme t = { { start, s.cur }, s.bytes.lineno }; auto n = natural_value(start, s.cur); ++s.cur; if (c == '#') return s.alloc.make_reference(n, t); auto x = read_sexpr(s); if (x == nullptr) syntax_error("syntax error after sharp-number-equal sign"); return s.alloc.make_anchor(n, x); } static const Syntax* finish_function(Reader::State& s) { ++s.cur; // skip quote sign. auto x = read_sexpr(s); if (x == nullptr) syntax_error("missing function designator after sharp-quote sign"); return s.alloc.make_function(x); } static const Syntax* finish_uninterned_symbol(Reader::State& s) { ++s.cur; // skip colon sign. auto start = s.cur; advance_while(s, identifier_part); Lexeme t = { { start, s.cur }, s.bytes.lineno }; return s.alloc.make_symbol(SymbolSyntax::uninterned, t); } static const Syntax* finish_readtime_eval(Reader::State& s) { ++s.cur; // skip dot sign. auto x = read_sexpr(s); if (x == nullptr) syntax_error("parse error after sharp-dot sign"); return s.alloc.make_eval(x); } static const Syntax* finish_character(Reader::State& s) { ++s.cur; // skip backslash sign auto start = s.cur; advance_while(s, identifier_part); Lexeme t = { { start, s.cur }, s.bytes.lineno }; return s.alloc.make_character(t); } static const Syntax* finish_include(Reader::State& s) { ++s.cur; auto cond = read_sexpr(s); auto form = read_sexpr(s); return s.alloc.make_include(cond, form); } static const Syntax* finish_exclude(Reader::State& s) { ++s.cur; auto cond = read_sexpr(s); auto form = read_sexpr(s); return s.alloc.make_exclude(cond, form); } static const Syntax* read_sharp_et_al(Reader::State& s) { if (++s.cur >= s.bytes.end) syntax_error("end-of-input reached after sharp sign"); switch (*s.cur) { case '(': return finish_literal_vector(s); case '\'': return finish_function(s); case ':': return finish_uninterned_symbol(s); case '.': return finish_readtime_eval(s); case '\\': return finish_character(s); case '+': return finish_include(s); case '-': return finish_exclude(s); default: if (isdigit(*s.cur)) return finish_anchor_or_reference(s); syntax_error("syntax error after sharp-sign"); } return nullptr; } // We have just seen a dot; read the tail and the closing parenthesis. static const Syntax* finish_dotted_list(Reader::State& s, std::vector& elts) { ++s.cur; // Skip dot sign. auto x = read_sexpr(s); if (x == nullptr) syntax_error("missing expression after dot sign"); if (not skip_blank(s) or *s.cur != ')') syntax_error("missing closing parenthesis"); ++s.cur; elts.push_back(x); return s.alloc.make_list(elts, true); } static const Syntax* read_pair(Reader::State& s) { ++s.cur; // skip opening parenthesis std::vector elts { }; while (skip_blank(s)) switch (*s.cur) { case '.': if (elts.empty()) syntax_error("missing expression before dot sign."); return finish_dotted_list(s, elts); case ')': ++s.cur; return s.alloc.make_list(elts); default: if (auto x = read_sexpr(s)) elts.push_back(x); else syntax_error("unfinished pair expression"); break; } syntax_error("end-of-input while looking for closing parenthesis"); return nullptr; } static const Syntax* read_sexpr(Reader::State& s) { while (skip_blank(s)) switch (*s.cur) { case ';': skip_to_eol(s); break; case '\'': return read_quote(s); case '`': return read_backquote(s); case '|': return read_absolute_symbol(s); case '#': return read_sharp_et_al(s); case '(': return read_pair(s); default: return read_atom(s); } return nullptr; } const Syntax* Reader::read() { return read_sexpr(st); } const Byte* Reader::position(Ordinal p) { st.cur = st.bytes.start + p; st.line = st.cur; // while (st.line > st.start and st.line[-1] != '\n') // --st.line; return st.cur; } } }