diff options
author | Gabriel Dos Reis <gdr@axiomatics.org> | 2017-01-15 09:14:39 -0800 |
---|---|---|
committer | Gabriel Dos Reis <gdr@axiomatics.org> | 2017-01-15 09:14:39 -0800 |
commit | cea95f2cc443a5bee5cb69476ae73fd99d32fc0d (patch) | |
tree | ca8765a4f777fbc131423b13e17ca2be56b6a251 /src/syntax/sexpr.cxx | |
parent | 77be242e6af75148771620fcbbaad191d1c11b60 (diff) | |
download | open-axiom-cea95f2cc443a5bee5cb69476ae73fd99d32fc0d.tar.gz |
Add s-expression parsing to libopen-axiom-core.a
Diffstat (limited to 'src/syntax/sexpr.cxx')
-rw-r--r-- | src/syntax/sexpr.cxx | 670 |
1 files changed, 670 insertions, 0 deletions
diff --git a/src/syntax/sexpr.cxx b/src/syntax/sexpr.cxx new file mode 100644 index 00000000..f9d76825 --- /dev/null +++ b/src/syntax/sexpr.cxx @@ -0,0 +1,670 @@ +// Copyright (C) 2010-2013, Gabriel Dos Reis. +// All rights reserved. +// Written by Gabriel Dos Reis. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// - Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in +// the documentation and/or other materials provided with the +// distribution. +// +// - Neither the name of The Numerical Algorithms Group Ltd. nor the +// names of its contributors may be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +// OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// --% Author: Gabriel Dos Reis. + +#include <ctype.h> +#include <string.h> +#include <iostream> +#include <iterator> +#include <open-axiom/sexpr> +#include <open-axiom/FileMapping> +#include <open-axiom/diagnostics> + +namespace OpenAxiom { + namespace Sexpr { + static void + invalid_character(Reader::State& s) { + auto line = std::to_string(s.bytes.lineno); + auto column = std::to_string(s.cur - s.line); + auto msg = "invalid character on line " + line + + " and column " + column; + if (isprint(*s.cur)) + throw Diagnostics::BasicError(msg + ": " + std::string(1, *s.cur)); + throw Diagnostics::BasicError(msg + " with code " + std::to_string(*s.cur)); + } + + static void + syntax_error(const std::string& s) { + throw Diagnostics::BasicError(s); + } + + // Return true if character `c' introduces a blank. + static bool + is_blank(char c) { + return c == ' ' or c == '\t' or c == '\v' + or c == '\n' or c == '\f' or c == '\r'; + } + + // Return true if the character `c' introduces a delimiter. + static bool + is_delimiter(char c) { + return is_blank(c) + or c == '(' or c == ')' or c == '\'' + or c == '`' or c == '#'; + } + + // Move the cursor past all consecutive blank characters, and + // return true if there are more input characters to consider. + static bool + skip_blank(Reader::State& s) { + for (bool done = false; s.cur < s.bytes.end and not done; ) + switch (*s.cur) { + case '\n': + ++s.bytes.lineno; + s.line = ++s.cur; + break; + case ' ': case '\t': case '\v': case '\r': case '\f': + ++s.cur; + break; + default: done = true; break; + } + return s.cur < s.bytes.end; + } + + // Move `cur' to end-of-line marker. + static void + skip_to_eol(Reader::State& s) { + // FIXME: properly handle CR+LF. + while (s.cur < s.bytes.end and *s.cur != '\n') + ++s.cur; + } + + // Move `cur' one-past a non-esacaped character `c'. + // Return true if the character was seen. + static bool + skip_to_nonescaped_char(Reader::State& s, char c) { + for (bool saw_escape = false; s.cur < s.bytes.end; ++s.cur) + if (saw_escape) + saw_escape = false; + else if (*s.cur == '\\') + saw_escape = true; + else if (*s.cur == c) { + ++s.cur; + return true; + } + return false; + } + + // Move the cursor past the closing quote of string literal. + // Return true if the closing quote was effectively seen. + static inline bool + skip_to_quote(Reader::State& s) { + return skip_to_nonescaped_char(s, '"'); + } + + template<typename Pred> + static bool + advance_while(Reader::State& s, Pred p) { + while (s.cur < s.bytes.end and p(*s.cur)) + ++s.cur; + return s.cur < s.bytes.end; + } + + // Return true if the character `c' be part of a non-absolute + // identifier. + static bool + identifier_part(Byte c) { + switch (c) { + case '+': case '-': case '*': case '/': case '%': case '^': + case '~': case '@': case '$': case '&': case '=': + case '<': case '>': case '?': case '!': case '_': + case '[': case ']': case '{': case '}': + return true; + default: + return isalnum(c); + } + } + + // -- AtomSyntax -- + AtomSyntax::AtomSyntax(const Lexeme& t) : lex(t) { } + + // -- IntegerSyntax -- + IntegerSyntax::IntegerSyntax(const Lexeme& t) : AtomSyntax(t) { } + + void + IntegerSyntax::accept(Visitor& v) const { + v.visit(*this); + } + + // -- CharacterSyntax -- + CharacterSyntax::CharacterSyntax(const Lexeme& t) : AtomSyntax(t) { } + + void + CharacterSyntax::accept(Visitor& v) const { + v.visit(*this); + } + + // -- StringSyntax -- + StringSyntax::StringSyntax(const Lexeme& t) : AtomSyntax(t) { } + + void + StringSyntax::accept(Visitor& v) const { + v.visit(*this); + } + + // -- SymbolSyntax -- + SymbolSyntax::SymbolSyntax(const Lexeme& t, Kind k) + : AtomSyntax(t), sort(k) + { } + + void + SymbolSyntax::accept(Visitor& v) const { + v.visit(*this); + } + + // -- AnchorSyntax -- + AnchorSyntax::AnchorSyntax(size_t t, const Syntax* s) : tag(t), val(s) { } + + void + AnchorSyntax::accept(Visitor& v) const { + v.visit(*this); + } + + // -- ReferenceSyntax -- + ReferenceSyntax::ReferenceSyntax(const Lexeme& t, Ordinal n) + : AtomSyntax(t), pos(n) + { } + + void + ReferenceSyntax::accept(Visitor& v) const { + v.visit(*this); + } + + // -- QuoteSyntax -- + QuoteSyntax::QuoteSyntax(const Syntax* s) + : unary_form<QuoteSyntax>(s) + { } + + // -- AntiquoteSyntax -- + AntiquoteSyntax::AntiquoteSyntax(const Syntax* s) + : unary_form<AntiquoteSyntax>(s) + { } + + // -- Expand -- + Expand::Expand(const Syntax* s) : unary_form<Expand>(s) { } + + // -- Eval -- + Eval::Eval(const Syntax* s) : unary_form<Eval>(s) { } + + // -- Splice -- + Splice::Splice(const Syntax* s) : unary_form<Splice>(s) { } + + // -- Function -- + Function::Function(const Syntax* s) : unary_form<Function>(s) { } + + // -- Include -- + Include::Include(const Syntax* c, const Syntax* s) + : binary_form<Include>(c, s) + { } + + // -- Exclude -- + Exclude::Exclude(const Syntax* c, const Syntax* s) + : binary_form<Exclude>(c, s) + { } + + // -- ListSyntax -- + ListSyntax::ListSyntax() : dot(false) { } + + ListSyntax::ListSyntax(const base& elts, bool d) + : base(elts), dot(d) + { } + + ListSyntax::~ListSyntax() { } + + void + ListSyntax::accept(Visitor& v) const { + v.visit(*this); + } + + // -- VectorSyntax -- + VectorSyntax::VectorSyntax() { } + + VectorSyntax::VectorSyntax(const base& elts) : base(elts) { } + + VectorSyntax::~VectorSyntax() { } + + void + VectorSyntax::accept(Visitor& v) const { + v.visit(*this); + } + + // --------------- + // -- Allocator -- + // --------------- + Allocator::Allocator() { } + + // This destructor is defined here so that it provides + // a single instantiation point for destructors of all + // used templates floating around. + Allocator::~Allocator() { } + + const CharacterSyntax* + Allocator::make_character(const Lexeme& t) { + return chars.make(t); + } + + const IntegerSyntax* + Allocator::make_integer(const Lexeme& t) { + return ints.make(t); + } + + const StringSyntax* + Allocator::make_string(const Lexeme& t) { + return strs.make(t); + } + + const SymbolSyntax* + Allocator::make_symbol(SymbolSyntax::Kind k, const Lexeme& t) { + return syms.make(t, k); + } + + const ReferenceSyntax* + Allocator::make_reference(size_t i, const Lexeme& t) { + return refs.make(t, i); + } + + const AnchorSyntax* + Allocator::make_anchor(size_t t, const Syntax* s) { + return ancs.make(t, s); + } + + const QuoteSyntax* + Allocator::make_quote(const Syntax* s) { + return quotes.make(s); + } + + const AntiquoteSyntax* + Allocator::make_antiquote(const Syntax* s) { + return antis.make(s); + } + + const Expand* + Allocator::make_expand(const Syntax* s) { + return exps.make(s); + } + + const Eval* + Allocator::make_eval(const Syntax* s) { + return evls.make(s); + } + + const Splice* + Allocator::make_splice(const Syntax* s) { + return spls.make(s); + } + + const Function* + Allocator::make_function(const Syntax* s) { + return funs.make(s); + } + + const Include* + Allocator::make_include(const Syntax* c, const Syntax* s) { + return incs.make(c, s); + } + + const Exclude* + Allocator::make_exclude(const Syntax* c, const Syntax* s) { + return excs.make(c, s); + } + + const ListSyntax* + Allocator::make_list(const std::vector<const Syntax*>& elts, bool dot) { + if (elts.empty()) + return &empty_list; + return lists.make(elts, dot); + } + + const VectorSyntax* + Allocator::make_vector(const std::vector<const Syntax*>& elts) { + if (elts.empty()) + return &empty_vector; + return vectors.make(elts); + } + + // The sequence of characters in [cur, last) consists + // entirely of digits. Return the corresponding natural value. + static size_t + natural_value(const Byte* cur, const Byte* last) { + size_t n = 0; + for (; cur < last; ++cur) + // FIXME: check for overflow. + n = 10 * n + (*cur - '0'); + return n; + } + + // -- Reader -- + Reader::Reader(const Byte* f, const Byte* l) + : st{ { f, l, 1 }, f, f } + { } + + Reader::Reader(const RawInput& ri) + : st { ri, ri.start, ri.start } + { } + + static const Syntax* read_sexpr(Reader::State&); + + // Parse a string literal + static const Syntax* + read_string(Reader::State& s) { + auto start = s.cur++; + if (not skip_to_quote(s)) + syntax_error("missing closing quote sign for string literal"); + Lexeme t = { { start, s.cur }, s.bytes.lineno }; + return s.alloc.make_string(t); + } + + // Parse an absolute identifier. + static const Syntax* + read_absolute_symbol(Reader::State& s) { + auto start = ++s.cur; + if (not skip_to_nonescaped_char(s, '|')) + syntax_error("missing closing bar sign for an absolute symbol"); + Lexeme t = { { start, s.cur - 1 }, s.bytes.lineno }; + return s.alloc.make_symbol(SymbolSyntax::absolute, t); + } + + // Read an atom starting with digits. + static const Syntax* + read_maybe_natural(Reader::State& s) { + auto start = s.cur; + advance_while (s, isdigit); + if (s.cur >= s.bytes.end or is_delimiter(*s.cur)) { + Lexeme t = { { start, s.cur }, s.bytes.lineno }; + return s.alloc.make_integer(t); + } + advance_while(s, identifier_part); + Lexeme t = { { start, s.cur }, s.bytes.lineno }; + return s.alloc.make_symbol(SymbolSyntax::ordinary, t); + } + + // Read an identifier. + static const Syntax* + read_identifier(Reader::State& s) { + auto start = s.cur; + advance_while(s, identifier_part); + Lexeme t = { { start, s.cur }, s.bytes.lineno }; + return s.alloc.make_symbol(SymbolSyntax::ordinary, t); + } + + // Read an atom starting with a '+' or '-' sign; this + // should be identifier, or a signed integer. + static const Syntax* + read_maybe_signed_number(Reader::State& s) { + auto start = s.cur++; + if (s.cur < s.bytes.end and isdigit(*s.cur)) { + advance_while(s, isdigit); + if (s.cur >= s.bytes.end or is_delimiter(*s.cur)) { + Lexeme t = { { start, s.cur }, s.bytes.lineno }; + return s.alloc.make_integer(t); + } + } + advance_while(s, identifier_part); + Lexeme t = { { start, s.cur }, s.bytes.lineno }; + return s.alloc.make_symbol(SymbolSyntax::ordinary, t); + } + + static const Syntax* + read_keyword(Reader::State& s) { + auto start = s.cur++; + advance_while(s, identifier_part); + Lexeme t = { { start, s.cur }, s.bytes.lineno }; + return s.alloc.make_symbol(SymbolSyntax::keyword, t); + } + + // Read an atom. + static const Syntax* + read_atom(Reader::State& s) { + switch (*s.cur) { + case '"': return read_string(s); + case ':': return read_keyword(s); + case '-': case '+': return read_maybe_signed_number(s); + + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + return read_maybe_natural(s); + + default: + if (identifier_part(*s.cur)) + return read_identifier(s); + invalid_character(s); + ++s.cur; + return nullptr; + } + } + + // Parse a quote expression. + static const Syntax* + read_quote(Reader::State& s) { + ++s.cur; // skip the quote character + auto x = read_sexpr(s); + if (x == nullptr) + syntax_error("end of input reached after quote sign"); + return s.alloc.make_quote(x); + } + + // Parse a backquote expression. + static const Syntax* + read_backquote(Reader::State& s) { + ++s.cur; // skip the backquote character + auto x = read_sexpr(s); + if (x == nullptr) + syntax_error("end of input reached after backquote sign"); + return s.alloc.make_antiquote(x); + } + + // We've just seen "#(" indicating the start of a literal + // vector. Read the elements and return the corresponding form. + static const Syntax* + finish_literal_vector(Reader::State& s) { + ++s.cur; // Skip the open paren. + std::vector<const Syntax*> elts { }; + while (skip_blank(s) and *s.cur != ')') { + if (auto x = read_sexpr(s)) + elts.push_back(x); + else + syntax_error("syntax error while reading vector elements"); + } + if (s.cur >= s.bytes.end) + syntax_error("unfinished literal vector"); + else + ++s.cur; + return s.alloc.make_vector(elts); + } + + // We've just seen the sharp sign followed by a digit. We assume + // we are about to read an anchor or a back reference. + static const Syntax* + finish_anchor_or_reference(Reader::State& s) { + auto start = s.cur; + advance_while(s, isdigit); + if (s.cur >= s.bytes.end) + syntax_error("end-of-input after sharp-number sign"); + const Byte c = *s.cur; + if (c != '#' and c != '=') + syntax_error("syntax error after sharp-number-equal sign"); + Lexeme t = { { start, s.cur }, s.bytes.lineno }; + auto n = natural_value(start, s.cur); + ++s.cur; + if (c == '#') + return s.alloc.make_reference(n, t); + auto x = read_sexpr(s); + if (x == nullptr) + syntax_error("syntax error after sharp-number-equal sign"); + return s.alloc.make_anchor(n, x); + } + + static const Syntax* + finish_function(Reader::State& s) { + ++s.cur; // skip quote sign. + auto x = read_sexpr(s); + if (x == nullptr) + syntax_error("missing function designator after sharp-quote sign"); + return s.alloc.make_function(x); + } + + static const Syntax* + finish_uninterned_symbol(Reader::State& s) { + ++s.cur; // skip colon sign. + auto start = s.cur; + advance_while(s, identifier_part); + Lexeme t = { { start, s.cur }, s.bytes.lineno }; + return s.alloc.make_symbol(SymbolSyntax::uninterned, t); + } + + static const Syntax* + finish_readtime_eval(Reader::State& s) { + ++s.cur; // skip dot sign. + auto x = read_sexpr(s); + if (x == nullptr) + syntax_error("parse error after sharp-dot sign"); + return s.alloc.make_eval(x); + } + + static const Syntax* + finish_character(Reader::State& s) { + ++s.cur; // skip backslash sign + auto start = s.cur; + advance_while(s, identifier_part); + Lexeme t = { { start, s.cur }, s.bytes.lineno }; + return s.alloc.make_character(t); + } + + static const Syntax* + finish_include(Reader::State& s) { + ++s.cur; + auto cond = read_sexpr(s); + auto form = read_sexpr(s); + return s.alloc.make_include(cond, form); + } + + static const Syntax* + finish_exclude(Reader::State& s) { + ++s.cur; + auto cond = read_sexpr(s); + auto form = read_sexpr(s); + return s.alloc.make_exclude(cond, form); + } + + static const Syntax* + read_sharp_et_al(Reader::State& s) { + if (++s.cur >= s.bytes.end) + syntax_error("end-of-input reached after sharp sign"); + switch (*s.cur) { + case '(': return finish_literal_vector(s); + case '\'': return finish_function(s); + case ':': return finish_uninterned_symbol(s); + case '.': return finish_readtime_eval(s); + case '\\': return finish_character(s); + case '+': return finish_include(s); + case '-': return finish_exclude(s); + + default: + if (isdigit(*s.cur)) + return finish_anchor_or_reference(s); + syntax_error("syntax error after sharp-sign"); + } + return nullptr; + } + + // We have just seen a dot; read the tail and the closing parenthesis. + static const Syntax* + finish_dotted_list(Reader::State& s, std::vector<const Syntax*>& elts) { + ++s.cur; // Skip dot sign. + auto x = read_sexpr(s); + if (x == nullptr) + syntax_error("missing expression after dot sign"); + if (not skip_blank(s) or *s.cur != ')') + syntax_error("missing closing parenthesis"); + ++s.cur; + elts.push_back(x); + return s.alloc.make_list(elts, true); + } + + static const Syntax* + read_pair(Reader::State& s) { + ++s.cur; // skip opening parenthesis + std::vector<const Syntax*> elts { }; + while (skip_blank(s)) + switch (*s.cur) { + case '.': + if (elts.empty()) + syntax_error("missing expression before dot sign."); + return finish_dotted_list(s, elts); + + case ')': + ++s.cur; + return s.alloc.make_list(elts); + + default: + if (auto x = read_sexpr(s)) + elts.push_back(x); + else + syntax_error("unfinished pair expression"); + break; + } + syntax_error("end-of-input while looking for closing parenthesis"); + return nullptr; + } + + static const Syntax* + read_sexpr(Reader::State& s) { + while (skip_blank(s)) + switch (*s.cur) { + case ';': skip_to_eol(s); break; + case '\'': return read_quote(s); + case '`': return read_backquote(s); + case '|': return read_absolute_symbol(s); + case '#': return read_sharp_et_al(s); + case '(': return read_pair(s); + default: return read_atom(s); + } + return nullptr; + } + + const Syntax* + Reader::read() { + return read_sexpr(st); + } + + const Byte* + Reader::position(Ordinal p) { + st.cur = st.bytes.start + p; + st.line = st.cur; + // while (st.line > st.start and st.line[-1] != '\n') + // --st.line; + return st.cur; + } + + } +} |