1 files changed, 670 insertions, 0 deletions
diff --git a/src/syntax/sexpr.cxx b/src/syntax/sexpr.cxx
new file mode 100644
index 00000000..f9d76825
--- /dev/null
+++ b/src/syntax/sexpr.cxx
@@ -0,0 +1,670 @@
+// Copyright (C) 2010-2013, Gabriel Dos Reis.
+// All rights reserved.
+// Written by Gabriel Dos Reis.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     - Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//
+//     - Redistributions in binary form must reproduce the above copyright
+//       notice, this list of conditions and the following disclaimer in
+//       the documentation and/or other materials provided with the
+//       distribution.
+//
+//     - Neither the name of The Numerical Algorithms Group Ltd. nor the
+//       names of its contributors may be used to endorse or promote products
+//       derived from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+// OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// --% Author: Gabriel Dos Reis.
+
+#include <ctype.h>
+#include <string.h>
+#include <iostream>
+#include <iterator>
+#include <open-axiom/sexpr>
+#include <open-axiom/FileMapping>
+#include <open-axiom/diagnostics>
+
+namespace OpenAxiom {
+   namespace Sexpr {
+      static void
+      invalid_character(Reader::State& s) {
+         auto line = std::to_string(s.bytes.lineno);
+         auto column = std::to_string(s.cur - s.line);
+         auto msg = "invalid character on line " + line +
+            " and column " + column;
+         if (isprint(*s.cur))
+            throw Diagnostics::BasicError(msg + ": " + std::string(1, *s.cur));
+         throw Diagnostics::BasicError(msg + " with code " + std::to_string(*s.cur));
+      }
+      
+      static void
+      syntax_error(const std::string& s) {
+         throw Diagnostics::BasicError(s);
+      }
+
+      // Return true if character `c' introduces a blank.
+      static bool
+      is_blank(char c) {
+         return c == ' ' or c == '\t' or c == '\v'
+            or c == '\n' or c == '\f' or c == '\r';
+      }
+      
+      // Return true if the character `c' introduces a delimiter.
+      static bool
+      is_delimiter(char c) {
+         return is_blank(c)
+            or c == '(' or c == ')' or c == '\''
+            or c == '`' or c == '#';
+      }
+
+      // Move the cursor past all consecutive blank characters, and
+      // return true if there are more input characters to consider.
+      static bool
+      skip_blank(Reader::State& s) {
+         for (bool done = false; s.cur < s.bytes.end and not done; )
+            switch (*s.cur) {
+            case '\n':
+               ++s.bytes.lineno;
+               s.line = ++s.cur;
+               break;
+            case ' ': case '\t': case '\v': case '\r': case '\f':
+               ++s.cur;
+               break;
+            default: done = true; break;
+            }
+         return s.cur < s.bytes.end;
+      }
+
+      // Move `cur' to end-of-line marker.
+      static void
+      skip_to_eol(Reader::State& s) {
+         // FIXME: properly handle CR+LF.
+         while (s.cur < s.bytes.end and *s.cur != '\n')
+            ++s.cur;
+      }
+
+      // Move `cur' one-past a non-esacaped character `c'.
+      // Return true if the character was seen.
+      static bool
+      skip_to_nonescaped_char(Reader::State& s, char c) {
+         for (bool saw_escape = false; s.cur < s.bytes.end; ++s.cur)
+            if (saw_escape)
+               saw_escape = false;
+            else if (*s.cur == '\\')
+               saw_escape = true;
+            else if (*s.cur == c) {
+               ++s.cur;
+               return true;
+            }
+         return false;
+      }
+
+      // Move the cursor past the closing quote of string literal.
+      // Return true if the closing quote was effectively seen.
+      static inline bool
+      skip_to_quote(Reader::State& s) {
+         return skip_to_nonescaped_char(s, '"');
+      }
+
+      template<typename Pred>
+      static bool
+      advance_while(Reader::State& s, Pred p) {
+         while (s.cur < s.bytes.end and p(*s.cur))
+            ++s.cur;
+         return s.cur < s.bytes.end;
+      }
+
+      // Return true if the character `c' be part of a non-absolute
+      // identifier.
+      static bool
+      identifier_part(Byte c) {
+         switch (c) {
+         case '+': case '-': case '*': case '/': case '%': case '^':
+         case '~': case '@': case '$': case '&': case '=':
+         case '<': case '>': case '?': case '!': case '_':
+         case '[': case ']': case '{': case '}':
+            return true;
+         default:
+            return isalnum(c);
+         }
+      }
+
+      // -- AtomSyntax --
+      AtomSyntax::AtomSyntax(const Lexeme& t) : lex(t) { }
+
+      // -- IntegerSyntax --
+      IntegerSyntax::IntegerSyntax(const Lexeme& t) : AtomSyntax(t) { }
+
+      void
+      IntegerSyntax::accept(Visitor& v) const {
+         v.visit(*this);
+      }
+
+      // -- CharacterSyntax --
+      CharacterSyntax::CharacterSyntax(const Lexeme& t) : AtomSyntax(t) { }
+
+      void
+      CharacterSyntax::accept(Visitor& v) const {
+         v.visit(*this);
+      }
+
+      // -- StringSyntax --
+      StringSyntax::StringSyntax(const Lexeme& t) : AtomSyntax(t) { }
+
+      void
+      StringSyntax::accept(Visitor& v) const {
+         v.visit(*this);
+      }
+
+      // -- SymbolSyntax --
+      SymbolSyntax::SymbolSyntax(const Lexeme& t, Kind k)
+            : AtomSyntax(t), sort(k)
+      { }
+
+      void
+      SymbolSyntax::accept(Visitor& v) const {
+         v.visit(*this);
+      }
+
+      // -- AnchorSyntax --
+      AnchorSyntax::AnchorSyntax(size_t t, const Syntax* s) : tag(t), val(s) { }
+
+      void
+      AnchorSyntax::accept(Visitor& v) const {
+         v.visit(*this);
+      }
+
+      // -- ReferenceSyntax --
+       ReferenceSyntax::ReferenceSyntax(const Lexeme& t, Ordinal n)
+             : AtomSyntax(t), pos(n)
+      { }
+
+      void
+      ReferenceSyntax::accept(Visitor& v) const {
+         v.visit(*this);
+      }
+
+      // -- QuoteSyntax --
+      QuoteSyntax::QuoteSyntax(const Syntax* s)
+            : unary_form<QuoteSyntax>(s)
+      { }
+
+      // -- AntiquoteSyntax --
+      AntiquoteSyntax::AntiquoteSyntax(const Syntax* s)
+            : unary_form<AntiquoteSyntax>(s)
+      { }
+
+      // -- Expand --
+      Expand::Expand(const Syntax* s) : unary_form<Expand>(s) { }
+
+      // -- Eval --
+      Eval::Eval(const Syntax* s) : unary_form<Eval>(s) { }
+
+      // -- Splice --
+      Splice::Splice(const Syntax* s) : unary_form<Splice>(s) { }
+
+      // -- Function --
+      Function::Function(const Syntax* s) : unary_form<Function>(s) { }
+
+      // -- Include --
+      Include::Include(const Syntax* c, const Syntax* s)
+            : binary_form<Include>(c, s)
+      { }
+
+      // -- Exclude --
+      Exclude::Exclude(const Syntax* c, const Syntax* s)
+            : binary_form<Exclude>(c, s)
+      { }
+
+      // -- ListSyntax --
+      ListSyntax::ListSyntax() : dot(false) { }
+
+      ListSyntax::ListSyntax(const base& elts, bool d)
+            : base(elts), dot(d)
+      { }
+
+      ListSyntax::~ListSyntax() { }
+
+      void
+      ListSyntax::accept(Visitor& v) const {
+         v.visit(*this);
+      }
+
+      // -- VectorSyntax --
+      VectorSyntax::VectorSyntax() { }
+
+      VectorSyntax::VectorSyntax(const base& elts) : base(elts) { }
+
+      VectorSyntax::~VectorSyntax() { }
+      
+      void
+      VectorSyntax::accept(Visitor& v) const {
+         v.visit(*this);
+      }
+
+      // ---------------
+      // -- Allocator --
+      // ---------------
+      Allocator::Allocator() { }
+
+      // This destructor is defined here so that it provides
+      // a single instantiation point for destructors of all
+      // used templates floating around.
+      Allocator::~Allocator() { }
+
+      const CharacterSyntax*
+      Allocator::make_character(const Lexeme& t) {
+         return chars.make(t);
+      }
+
+      const IntegerSyntax*
+      Allocator::make_integer(const Lexeme& t) {
+         return ints.make(t);
+      }
+
+      const StringSyntax*
+      Allocator::make_string(const Lexeme& t) {
+         return strs.make(t);
+      }
+
+      const SymbolSyntax*
+      Allocator::make_symbol(SymbolSyntax::Kind k, const Lexeme& t) {
+         return syms.make(t, k);
+      }
+
+      const ReferenceSyntax*
+      Allocator::make_reference(size_t i, const Lexeme& t) {
+         return refs.make(t, i);
+      }
+
+      const AnchorSyntax*
+      Allocator::make_anchor(size_t t, const Syntax* s) {
+         return ancs.make(t, s);
+      }
+
+      const QuoteSyntax*
+      Allocator::make_quote(const Syntax* s) {
+         return quotes.make(s);
+      }
+
+      const AntiquoteSyntax*
+      Allocator::make_antiquote(const Syntax* s) {
+         return antis.make(s);
+      }
+
+      const Expand*
+      Allocator::make_expand(const Syntax* s) {
+         return exps.make(s);
+      }
+
+      const Eval*
+      Allocator::make_eval(const Syntax* s) {
+         return evls.make(s);
+      }
+
+      const Splice*
+      Allocator::make_splice(const Syntax* s) {
+         return spls.make(s);
+      }
+
+      const Function*
+      Allocator::make_function(const Syntax* s) {
+         return funs.make(s);
+      }
+
+      const Include*
+      Allocator::make_include(const Syntax* c, const Syntax* s) {
+         return incs.make(c, s);
+      }
+
+      const Exclude*
+      Allocator::make_exclude(const Syntax* c, const Syntax* s) {
+         return excs.make(c, s);
+      }
+
+      const ListSyntax*
+      Allocator::make_list(const std::vector<const Syntax*>& elts, bool dot) {
+         if (elts.empty())
+            return &empty_list;
+         return lists.make(elts, dot);
+      }
+
+      const VectorSyntax*
+      Allocator::make_vector(const std::vector<const Syntax*>& elts) {
+         if (elts.empty())
+            return &empty_vector;
+         return vectors.make(elts);
+      }
+
+      // The sequence of characters in [cur, last) consists
+      // entirely of digits.  Return the corresponding natural value.
+      static size_t
+      natural_value(const Byte* cur, const Byte* last) {
+         size_t n = 0;
+         for (; cur < last; ++cur)
+            // FIXME: check for overflow.
+            n = 10 * n + (*cur - '0');
+         return n;
+      }
+
+      // -- Reader --
+      Reader::Reader(const Byte* f, const Byte* l)
+            : st{ { f, l, 1 }, f, f }
+      { }
+
+      Reader::Reader(const RawInput& ri)
+            : st { ri, ri.start, ri.start }
+      { }
+
+      static const Syntax* read_sexpr(Reader::State&);
+
+      // Parse a string literal
+      static const Syntax*
+      read_string(Reader::State& s) {
+         auto start = s.cur++;
+         if (not skip_to_quote(s))
+            syntax_error("missing closing quote sign for string literal");
+         Lexeme t = { { start, s.cur }, s.bytes.lineno };
+         return s.alloc.make_string(t);
+      }
+
+      // Parse an absolute identifier.
+      static const Syntax*
+      read_absolute_symbol(Reader::State& s) {
+         auto start = ++s.cur;
+         if (not skip_to_nonescaped_char(s, '|'))
+            syntax_error("missing closing bar sign for an absolute symbol");
+         Lexeme t = { { start, s.cur - 1 }, s.bytes.lineno };
+         return s.alloc.make_symbol(SymbolSyntax::absolute, t);
+      }
+
+      // Read an atom starting with digits.
+      static const Syntax*
+      read_maybe_natural(Reader::State& s) {
+         auto start = s.cur;
+         advance_while (s, isdigit);
+         if (s.cur >= s.bytes.end or is_delimiter(*s.cur)) {
+            Lexeme t = { { start, s.cur }, s.bytes.lineno };
+            return s.alloc.make_integer(t);
+         }
+         advance_while(s, identifier_part);
+         Lexeme t = { { start, s.cur }, s.bytes.lineno };
+         return s.alloc.make_symbol(SymbolSyntax::ordinary, t);
+      }
+
+      // Read an identifier.
+      static const Syntax*
+      read_identifier(Reader::State& s) {
+         auto start = s.cur;
+         advance_while(s, identifier_part);
+         Lexeme t = { { start, s.cur }, s.bytes.lineno };
+         return s.alloc.make_symbol(SymbolSyntax::ordinary, t);
+      }
+
+      // Read an atom starting with a '+' or '-' sign; this
+      // should be identifier, or a signed integer.
+      static const Syntax*
+      read_maybe_signed_number(Reader::State& s) {
+         auto start = s.cur++;
+         if (s.cur < s.bytes.end and isdigit(*s.cur)) {
+            advance_while(s, isdigit);
+            if (s.cur >= s.bytes.end or is_delimiter(*s.cur)) {
+               Lexeme t = { { start, s.cur }, s.bytes.lineno };
+               return s.alloc.make_integer(t);
+            }
+         }
+         advance_while(s, identifier_part);
+         Lexeme t = { { start, s.cur }, s.bytes.lineno };
+         return s.alloc.make_symbol(SymbolSyntax::ordinary, t);
+      }
+
+      static const Syntax*
+      read_keyword(Reader::State& s) {
+         auto start = s.cur++;
+         advance_while(s, identifier_part);
+         Lexeme t = { { start, s.cur }, s.bytes.lineno };
+         return s.alloc.make_symbol(SymbolSyntax::keyword, t);
+      }
+
+      // Read an atom.
+      static const Syntax*
+      read_atom(Reader::State& s) {
+         switch (*s.cur) {
+         case '"': return read_string(s);
+         case ':': return read_keyword(s);
+         case '-': case '+': return read_maybe_signed_number(s);
+
+         case '0': case '1': case '2': case '3': case '4':
+         case '5': case '6': case '7': case '8': case '9':
+            return read_maybe_natural(s);
+
+         default:
+            if (identifier_part(*s.cur))
+               return read_identifier(s);
+            invalid_character(s);
+            ++s.cur;
+            return nullptr;
+         }
+      }
+
+      // Parse a quote expression.
+      static const Syntax*
+      read_quote(Reader::State& s) {
+         ++s.cur;               // skip the quote character
+         auto x = read_sexpr(s);
+         if (x == nullptr)
+            syntax_error("end of input reached after quote sign");
+         return s.alloc.make_quote(x);
+      }
+
+      // Parse a backquote expression.
+      static const Syntax*
+      read_backquote(Reader::State& s) {
+         ++s.cur;               // skip the backquote character
+         auto x = read_sexpr(s);
+         if (x == nullptr)
+            syntax_error("end of input reached after backquote sign");
+         return s.alloc.make_antiquote(x);
+      }
+
+      // We've just seen "#(" indicating the start of a literal
+      // vector.  Read the elements and return the corresponding form.
+      static const Syntax*
+      finish_literal_vector(Reader::State& s) {
+         ++s.cur;               // Skip the open paren.
+         std::vector<const Syntax*> elts { };
+         while (skip_blank(s) and *s.cur != ')') {
+            if (auto x = read_sexpr(s))
+               elts.push_back(x);
+            else
+               syntax_error("syntax error while reading vector elements");
+         }
+         if (s.cur >= s.bytes.end)
+            syntax_error("unfinished literal vector");
+         else
+            ++s.cur;
+         return s.alloc.make_vector(elts);
+      }
+
+      // We've just seen the sharp sign followed by a digit.  We assume
+      // we are about to read an anchor or a back reference.
+      static const Syntax*
+      finish_anchor_or_reference(Reader::State& s) {
+         auto start = s.cur;
+         advance_while(s, isdigit);
+         if (s.cur >= s.bytes.end)
+            syntax_error("end-of-input after sharp-number sign");
+         const Byte c = *s.cur;
+         if (c != '#' and c != '=')
+            syntax_error("syntax error after sharp-number-equal sign");
+         Lexeme t = { { start, s.cur }, s.bytes.lineno };
+         auto n = natural_value(start, s.cur);
+         ++s.cur;
+         if (c == '#')
+            return s.alloc.make_reference(n, t);
+         auto x = read_sexpr(s);
+         if (x == nullptr)
+            syntax_error("syntax error after sharp-number-equal sign");
+         return s.alloc.make_anchor(n, x);
+      }
+
+      static const Syntax*
+      finish_function(Reader::State& s) {
+         ++s.cur;               // skip quote sign.
+         auto x = read_sexpr(s);
+         if (x == nullptr)
+            syntax_error("missing function designator after sharp-quote sign");
+         return s.alloc.make_function(x);
+      }
+
+      static const Syntax*
+      finish_uninterned_symbol(Reader::State& s) {
+         ++s.cur;               // skip colon sign.
+         auto start = s.cur;
+         advance_while(s, identifier_part);
+         Lexeme t = { { start, s.cur }, s.bytes.lineno };
+         return s.alloc.make_symbol(SymbolSyntax::uninterned, t);
+      }
+
+      static const Syntax*
+      finish_readtime_eval(Reader::State& s) {
+         ++s.cur;               // skip dot sign.
+         auto x = read_sexpr(s);
+         if (x == nullptr)
+            syntax_error("parse error after sharp-dot sign");
+         return s.alloc.make_eval(x);
+      }
+
+      static const Syntax*
+      finish_character(Reader::State& s) {
+         ++s.cur;               // skip backslash sign
+         auto start = s.cur;
+         advance_while(s, identifier_part);
+         Lexeme t = { { start, s.cur }, s.bytes.lineno };
+         return s.alloc.make_character(t);
+      }
+
+      static const Syntax*
+      finish_include(Reader::State& s) {
+         ++s.cur;
+         auto cond = read_sexpr(s);
+         auto form = read_sexpr(s);
+         return s.alloc.make_include(cond, form);
+      }
+
+      static const Syntax*
+      finish_exclude(Reader::State& s) {
+         ++s.cur;
+         auto cond = read_sexpr(s);
+         auto form = read_sexpr(s);
+         return s.alloc.make_exclude(cond, form);
+      }
+
+      static const Syntax*
+      read_sharp_et_al(Reader::State& s) {
+         if (++s.cur >= s.bytes.end)
+            syntax_error("end-of-input reached after sharp sign");
+         switch (*s.cur) {
+         case '(':  return finish_literal_vector(s);
+         case '\'': return finish_function(s);
+         case ':': return finish_uninterned_symbol(s);
+         case '.': return finish_readtime_eval(s);
+         case '\\': return finish_character(s);
+         case '+': return finish_include(s);
+         case '-': return finish_exclude(s);
+
+         default:
+            if (isdigit(*s.cur))
+               return finish_anchor_or_reference(s);
+            syntax_error("syntax error after sharp-sign");
+         }
+         return nullptr;
+      }
+
+      // We have just seen a dot; read the tail and the closing parenthesis.
+      static const Syntax*
+      finish_dotted_list(Reader::State& s, std::vector<const Syntax*>& elts) {
+         ++s.cur;               // Skip dot sign.
+         auto x = read_sexpr(s);
+         if (x == nullptr)
+            syntax_error("missing expression after dot sign");
+         if (not skip_blank(s) or *s.cur != ')')
+            syntax_error("missing closing parenthesis");
+         ++s.cur;
+         elts.push_back(x);
+         return s.alloc.make_list(elts, true);
+      }
+
+      static const Syntax*
+      read_pair(Reader::State& s) {
+         ++s.cur;               // skip opening parenthesis
+         std::vector<const Syntax*> elts { };
+         while (skip_blank(s))
+            switch (*s.cur) {
+            case '.':
+               if (elts.empty())
+                  syntax_error("missing expression before dot sign.");
+               return finish_dotted_list(s, elts);
+
+            case ')':
+               ++s.cur;
+               return s.alloc.make_list(elts);
+
+            default:
+               if (auto x = read_sexpr(s))
+                  elts.push_back(x);
+               else
+                  syntax_error("unfinished pair expression");
+               break;
+            }
+         syntax_error("end-of-input while looking for closing parenthesis");
+         return nullptr;
+      }
+
+      static const Syntax*
+      read_sexpr(Reader::State& s) {
+         while (skip_blank(s))
+            switch (*s.cur) {
+            case ';': skip_to_eol(s); break;
+            case '\'': return read_quote(s);
+            case '`': return read_backquote(s);
+            case '|': return read_absolute_symbol(s);
+            case '#': return read_sharp_et_al(s);
+            case '(': return read_pair(s);
+            default: return read_atom(s);
+            }
+         return nullptr;
+      }
+
+      const Syntax*
+      Reader::read() {
+         return read_sexpr(st);
+      }
+
+      const Byte*
+      Reader::position(Ordinal p) {
+         st.cur = st.bytes.start + p;
+         st.line = st.cur;
+         // while (st.line > st.start and st.line[-1] != '\n')
+         //    --st.line;
+         return st.cur;
+      }
+
+   }
+}