diff options
Diffstat (limited to 'src/utils/sexpr.H')
-rw-r--r-- | src/utils/sexpr.H | 385 |
1 files changed, 385 insertions, 0 deletions
diff --git a/src/utils/sexpr.H b/src/utils/sexpr.H new file mode 100644 index 00000000..5139b453 --- /dev/null +++ b/src/utils/sexpr.H @@ -0,0 +1,385 @@ +// Copyright (C) 2010, Gabriel Dos Reis. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// - Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in +// the documentation and/or other materials provided with the +// distribution. +// +// - Neither the name of The Numerical Algorithms Group Ltd. nor the +// names of its contributors may be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +// OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef OPENAXIOM_SEXPR_INCLUDED +#define OPENAXIOM_SEXPR_INCLUDED + +// --% Author: Gabriel Dos Reis. +// --% Description: +// --% A simple support for s-expressions. By design, no ambition +// --% for full-fledged Common Lisp reader capability. Rather, +// --% the aim is a simple data structure for exchanging data +// --% between several components of the OpenAxiom system. +// --% Users interested in fullblown Lisp syntax should seek +// --% to acquire Lisp systems, many of which are freely available. + +#include <iosfwd> +#include <vector> +#include <set> +#include "string-pool.H" + +// Helpers for defining token type values for lexeme with more +// than characters. +#define OPENAXIOM_SEXPR_TOKEN1(C) (C) +#define OPENAXIOM_SEXPR_TOKEN2(C1,C2) (C1 * 256 + C2) + +namespace OpenAxiom { + namespace Sexpr { + // ----------- + // -- Token -- + // ----------- + struct Token { + enum Type { + unknown, // unidentified token + dot = OPENAXIOM_SEXPR_TOKEN1('.'), // "." + comma = OPENAXIOM_SEXPR_TOKEN1(','), // "," + open_paren = OPENAXIOM_SEXPR_TOKEN1('('), // "(" + close_paren = OPENAXIOM_SEXPR_TOKEN1(')'), // ")" + apostrophe = OPENAXIOM_SEXPR_TOKEN1('\''), // "'" + backquote = OPENAXIOM_SEXPR_TOKEN1('`'), // "`" + backslash = OPENAXIOM_SEXPR_TOKEN1('\\'), // "\\" + sharp_open_paren = OPENAXIOM_SEXPR_TOKEN2('#','('), // "#(" + sharp_apostrophe = OPENAXIOM_SEXPR_TOKEN2('#','\''), // "#'" + sharp_colon = OPENAXIOM_SEXPR_TOKEN2('#',':'), // "#:" + digraph_end = OPENAXIOM_SEXPR_TOKEN2(256,256), + integer, // integer literal + string, // string literal + identifier, // plain identifier + sharp_integer_equal, // anchor definition, #n=<form> + sharp_integer_sharp // back reference, #n# + }; + + Type type; // class of this token + BasicString lexeme; // characters making up this token + }; + + // Print a token object on an output stream. + // Note: this function is for debugging purpose; in particular + // it does not `prettyprint' tokens. + std::ostream& operator<<(std::ostream&, const Token&); + + // ----------- + // -- Lexer -- + // ----------- + // An object of this type transforms a sequence of characters + // into a sequence of tokens as defined above. + // A lexer does not manage memory itself. Rather, it delegates + // storage allocation for lexemes and tokens to specialized + // agents used to construct it. + struct Lexer { + Lexer(StringPool& pool, std::vector<Token>& toks) + : strings(pool), tokens(toks) { } + + const char* tokenize(const char*, const char*); + + private: + StringPool& strings; // where to allocate lexemes from + std::vector<Token>& tokens; // where to deposite tokens. + }; + + // ------------ + // -- Syntax -- + // ------------ + struct Syntax { + struct Visitor; // base class of syntax visitors + virtual void accept(Visitor&) const = 0; + }; + + // ---------- + // -- Atom -- + // ---------- + // An atom is a syntax object consisting of exatly one token. + // This should not be confused with the notion of atom + // in Lisp languages. + struct Atom : Syntax { + const Token& token() const { return tok; } + BasicString lexeme() const { return tok.lexeme; } + void accept(Visitor&) const; + protected: + const Token tok; + Atom(const Token&); + }; + + // ------------- + // -- Integer -- + // ------------- + struct Integer : Atom { + explicit Integer(const Token&); + void accept(Visitor&) const; + }; + + // ------------ + // -- String -- + // ------------ + struct String : Atom { + explicit String(const Token&); + void accept(Visitor&) const; + }; + + // ------------ + // -- Symbol -- + // ------------ + struct Symbol : Atom { + enum Kind { + uninterned, // uninterned symbol + ordinary, // an interned symbol + keyword // a keyword symbol + }; + Symbol(const Token&, Kind); + Kind kin() const { return sort; } + void accept(Visitor&) const; + private: + const Kind sort; + }; + + // --------------- + // -- Reference -- + // --------------- + struct Reference : Atom { + Reference(const Token&, size_t); + size_t tag() const { return pos; } + void accept(Visitor&) const; + private: + const size_t pos; + }; + + // ------------ + // -- Anchor -- + // ------------ + struct Anchor : Syntax { + Anchor(size_t, const Syntax*); + size_t ref() const { return tag; } + const Syntax* value() const { return val; } + void accept(Visitor&) const; + private: + const size_t tag; + const Syntax* const val; + }; + + // ----------- + // -- Quote -- + // ----------- + struct Quote : Syntax { + explicit Quote(const Syntax*); + const Syntax* body() const { return form; } + void accept(Visitor&) const; + private: + const Syntax* const form; + }; + + // -------------- + // -- Function -- + // -------------- + struct Function : Syntax { + explicit Function(const Syntax*); + const Syntax* code() const { return form; } + void accept(Visitor&) const; + private: + const Syntax* const form; + }; + + // ---------- + // -- Pair -- + // ---------- + struct Pair : Syntax { + Pair(const Syntax*, const Syntax*); + const Syntax* first() const { return elts.first; } + const Syntax* second() const { return elts.second; } + void accept(Visitor&) const; + private: + const std::pair<const Syntax*, const Syntax*> elts; + }; + + // ---------- + // -- List -- + // ---------- + struct List : Syntax, private std::vector<const Syntax*> { + typedef std::vector<const Syntax*> base; + using base::const_iterator; + using base::begin; + using base::end; + using base::size; + using base::empty; + + List(); + explicit List(const base&); + ~List(); + void accept(Visitor&) const; + }; + + // ------------ + // -- Vector -- + // ------------ + struct Vector : Syntax, private std::vector<const Syntax*> { + typedef std::vector<const Syntax*> base; + using base::const_iterator; + using base::begin; + using base::end; + using base::size; + using base::operator[]; + using base::empty; + + Vector(); + explicit Vector(const base&); + ~Vector(); + void accept(Visitor&) const; + }; + + // --------------------- + // -- Syntax::Visitor -- + // --------------------- + struct Syntax::Visitor { + virtual void visit(const Atom&) = 0; + virtual void visit(const Integer&); + virtual void visit(const String&); + virtual void visit(const Symbol&); + virtual void visit(const Reference&); + virtual void visit(const Anchor&) = 0; + virtual void visit(const Quote&) = 0; + virtual void visit(const Function&) = 0; + virtual void visit(const Pair&) = 0; + virtual void visit(const List&) = 0; + virtual void visit(const Vector&) = 0; + }; + + // --------------- + // -- Allocator -- + // --------------- + + // The next two classes are helper classes for the main + // allocation class Allocator. We use std::set as allocator + // that guarantee uuniqueness of atomic syntax object with + // respect to the constituent token. That container needs + // a relational comparator. In an ideal world, this class + // should not exist. + struct SyntaxComparator { + bool operator()(const Atom& lhs, const Atom& rhs) const { + return std::less<BasicString>()(lhs.lexeme(), rhs.lexeme()); + } + + bool operator()(const Quote& lhs, const Quote& rhs) const { + return std::less<const void*>()(lhs.body(), rhs.body()); + } + + bool operator()(const Anchor& lhs, const Anchor& rhs) const { + return std::less<size_t>()(lhs.ref(), rhs.ref()); + } + + bool operator()(const Function& lhs, const Function& rhs) const { + return std::less<const void*>()(lhs.code(), rhs.code()); + } + + bool operator()(const Pair& lhs, const Pair& rhs) const { + std::less<const void*> cmp; + if (cmp(lhs.first(), rhs.first())) + return true; + if (cmp(rhs.first(), lhs.first())) + return false; + return cmp(lhs.second(), rhs.second()); + } + }; + + template<typename T> + struct UniqueAllocator : std::set<T, SyntaxComparator> { + typedef std::set<T, SyntaxComparator> base; + typedef typename base::const_iterator const_iterator; + + template<typename U> + const T* allocate(const U& u) { + return &*this->insert(T(u)).first; + } + + template<typename U, typename V> + const T* allocate(const U& u, const V& v) { + return &*this->insert(T(u, v)).first; + } + }; + + // Allocator of syntax objects. + struct Allocator { + Allocator(); + ~Allocator(); + + const Integer* make_integer(const Token&); + const String* make_string(const Token&); + const Symbol* make_symbol(const Token&, Symbol::Kind); + const Reference* make_reference(const Token&, size_t); + const Anchor* make_anchor(size_t, const Syntax*); + const Quote* make_quote(const Syntax*); + const Function* make_function(const Syntax*); + const Pair* make_pair(const Syntax*, const Syntax*); + const List* make_list(const std::vector<const Syntax*>&); + const Vector* make_vector(const std::vector<const Syntax*>&); + + private: + UniqueAllocator<Integer> ints; + UniqueAllocator<String> strs; + UniqueAllocator<Symbol> syms; + UniqueAllocator<Anchor> ancs; + UniqueAllocator<Reference> refs; + UniqueAllocator<Quote> quotes; + UniqueAllocator<Function> funs; + UniqueAllocator<Pair> pairs; + Memory::Arena<List> lists; + Memory::Arena<Vector> vectors; + List empty_list; + Vector empty_vector; + }; + + // ------------ + // -- Parser -- + // ------------ + // An object of this type transforms a sequence of tokens + // into a sequence of syntax objects. + // A parser object does not manage memory itself. Rather, it delegates + // storage allocation for syntax objects to specialized + // agents used to construct it. + struct Parser { + Parser(Allocator&, std::vector<const Syntax*>&); + const Token* parse(const Token*, const Token*); + private: + Allocator& alloc; + std::vector<const Syntax*>& syns; + + const Symbol* parse_symbol(const Token*&, const Token*); + const Anchor* parse_anchor(const Token*&, const Token*); + const Reference* parse_reference(const Token*&, const Token*); + const Symbol* parse_uninterned(const Token*&, const Token*); + const Function* parse_function(const Token*&, const Token*); + const Quote* parse_quote(const Token*&, const Token*); + const Vector* parse_vector(const Token*&, const Token*); + const Syntax* parse_list_or_pair(const Token*&, const Token*); + const Syntax* parse_syntax(const Token*&, const Token*); + }; + } +} + +#endif // OPENAXIOM_SEXPR_INCLUDED |