Diffstat (limited to 'src/utils/sexpr.cc')
-rw-r--r-- | src/utils/sexpr.cc | 425
1 files changed, 336 insertions, 89 deletions
diff --git a/src/utils/sexpr.cc b/src/utils/sexpr.cc
index 6ed2a964..69dc7b72 100644
--- a/src/utils/sexpr.cc
+++ b/src/utils/sexpr.cc
@@ -1,4 +1,4 @@
-// Copyright (C) 2010, Gabriel Dos Reis.
+// Copyright (C) 2010-2011, Gabriel Dos Reis.
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
@@ -38,9 +38,28 @@ namespace OpenAxiom {
    namespace Sexpr {
+      template<typename T, int N>
+      static inline int
+      length(const T(&)[N]) {
+         return N;
+      }
+
+      template<typename Sequence>
+      static inline typename Sequence::const_pointer
+      begin_ptr(const Sequence& s) {
+         return &*s.begin();
+      }
+
+      template<typename Sequence>
+      static inline typename Sequence::const_pointer
+      end_ptr(const Sequence& s) {
+         return s.empty() ? 0 : &*s.begin() + s.size();
+      }
+
       std::ostream&
       operator<<(std::ostream& os, const Token& t) {
          switch (t.type) {
+         case Token::semicolon: os << "SEMICOLON"; break;
          case Token::dot: os << "DOT"; break;
          case Token::comma: os << "COMMA"; break;
          case Token::open_paren: os << "OPEN_PAREN"; break;
@@ -51,6 +70,10 @@ namespace OpenAxiom {
          case Token::sharp_open_paren: os << "SHARP_OPEN_PAREN"; break;
          case Token::sharp_apostrophe: os << "SHARP_APOSTROPHE"; break;
          case Token::sharp_colon: os << "SHARP_COLON"; break;
+         case Token::sharp_plus: os << "SHARP_PLUS"; break;
+         case Token::sharp_minus: os << "SHARP_MINUS"; break;
+         case Token::sharp_dot: os << "SHARP_DOT"; break;
+         case Token::comma_at: os << "COMMA_AT"; break;
          case Token::integer: os << "INTEGER"; break;
          case Token::character: os << "CHARACTER"; break;
          case Token::string: os << "STRING"; break;
@@ -73,6 +96,14 @@ namespace OpenAxiom {
          return os << ')';
       }

+      // -----------
+      // -- Lexer --
+      // -----------
+      static void
+      syntax_error(const std::string& s) {
+         throw BasicError(s);
+      }
+
       // Return true if character `c' introduces a blank.
       static bool
       is_blank(char c) {
@@ -85,7 +116,7 @@ namespace OpenAxiom {
       is_delimiter(char c) {
          return is_blank(c) or c == '(' or c == ')' or c == '\''
-            or c == '`' or c == '\\' or c == '#';
+            or c == '`' or c == '#';
       }

       // Move `cur' past all consecutive blank characters, and
@@ -97,11 +128,27 @@
          return cur;
       }

+      // Move `cur' to end-of-line marker.
+      static const char*
+      skip_to_eol(const char*& cur, const char* end) {
+         // FIXME: properly handle CR+LF.
+         while (cur < end and *cur != '\n')
+            ++cur;
+         return cur;
+      }
+
       // Move `cur' until a word boundary is reached.
       static const char*
       skip_to_word_boundary(const char*& cur, const char* end) {
-         while (cur < end and not is_delimiter(*cur))
-            ++cur;
+         bool saw_escape = false;
+         for (; cur < end; ++cur) {
+            if (saw_escape)
+               saw_escape = false;
+            else if (*cur == '\\')
+               saw_escape = true;
+            else if (is_delimiter(*cur))
+               break;
+         }
          return cur;
       }
@@ -109,21 +156,19 @@
       // Return true if the character was seen.
       static bool
       skip_to_nonescaped_char(const char*& cur, const char* end, char c) {
+         bool saw_escape = false;
         for (; cur < end; ++cur)
-            if (cur[0] == c and cur[-1] != '\\') {
+            if (saw_escape)
+               saw_escape = false;
+            else if (*cur == '\\')
+               saw_escape = true;
+            else if (*cur == c) {
               ++cur;
               return true;
            }
         return false;
      }
-      // Move `cur' past the closing fence of an absolute identifier.
-      // Return true if the closing fence was effectively seen.
-      static inline bool
-      skip_to_fence(const char*& cur, const char* end) {
-         return skip_to_nonescaped_char(cur, end, '|');
-      }
-
       // Move `cur' past the closing quote of string literal.
       // Return true if the closing fence was effectively seen.
       static inline bool
@@ -137,8 +182,9 @@
       identifier_part(char c) {
          switch (c) {
          case '+': case '-': case '*': case '/': case '%': case '^':
-         case '~': case '@': case '$': case '&': case ':': case '=':
+         case '~': case '@': case '$': case '&': case '=':
          case '<': case '>': case '?': case '!': case '_':
+         case '[': case ']': case '{': case '}':
            return true;
         default:
            return isalnum(c);
@@ -149,7 +195,8 @@
       // the sharp character.
       static bool
       special_after_sharp(char c) {
-         return c == '(' or c == '\'' or c == ':';
+         return c == '(' or c == '\'' or c == ':'
+            or c == '+' or c == '-' or c == '.';
       }

       // Return true if the sequence `[cur, end)' has a prefix that is
@@ -175,73 +222,119 @@
          t.type = Token::integer;
       }

+      // Returns true if the first characters in the range
+      // [cur, last) start an identifier.
+      static bool
+      start_symbol(const char* cur, const char* last) {
+         if (cur >= last)
+            return false;
+         return identifier_part(*cur)
+            or *cur == '|' or *cur == ':';
+      }
+
+      // We are processing a symbol token.  Accumulate all
+      // legitimate characters till the end of the token.
+      static void
+      skip_to_end_of_symbol(const char*& cur, const char* end) {
+         const char c = *cur;
+         if (*cur == '|')
+            skip_to_nonescaped_char(++cur, end, c);
+         else
+            skip_to_word_boundary(cur, end);
+         if (cur < end and *cur == ':')
+            skip_to_end_of_symbol(cur, end);
+      }
+
+      static Token
+      match_maybe_symbol(Lexer* lexer, const char*& cur, const char* end) {
+         Token t = { Token::identifier, 0 };
+         const char* start = cur;
+         skip_to_end_of_symbol(cur, end);
+         t.lexeme = lexer->intern(start, cur - start);
+         maybe_reclassify(t);
+         return t;
+      }
+
       const char*
       Lexer::tokenize(const char* cur, const char* end) {
          while (skip_blank(cur, end) < end) {
            Token t = { Token::unknown, 0 };
            switch (*cur) {
-            case '.': case ',': case '(': case ')':
-            case '\'': case '\\':
+            case ';': {
+               const char* start = cur;
+               t.type = Token::semicolon;
+               skip_to_eol(cur, end);
+               t.lexeme = intern(start, cur - start);
+               break;
+            }
+
+            case '.': case '(': case ')': case '\'': case '`':
               t.type = Token::Type(OPENAXIOM_SEXPR_TOKEN1(*cur));
-               t.lexeme = strings.intern(cur, 1);
+               t.lexeme = intern(cur, 1);
               ++cur;
               break;

+            case ',': {
+               const char* start = cur;
+               if (++cur < end and *cur == '@') {
+                  t.type = Token::comma_at;
+                  ++cur;
+               }
+               else
+                  t.type = Token::comma;
+               t.lexeme = intern(start, cur - start);
+               break;
+            }
+
+            case '\\':
+               t = match_maybe_symbol(this, cur, end);
+               break;
+
            case '#': {
               const char* start = cur;
               if (cur + 1 < end and special_after_sharp(cur[1])) {
                  t.type = Token::Type(OPENAXIOM_SEXPR_TOKEN2(cur[0], cur[1]));
-                  t.lexeme = strings.intern(cur, 2);
+                  t.lexeme = intern(cur, 2);
                  cur += 2;
               }
+               else if (cur + 1 < end and cur[1] == '\\') {
+                  start = cur += 2;
+                  if (not isalnum(*cur))
+                     ++cur;
+                  else
+                     skip_to_word_boundary(cur, end);
+                  t.type = Token::character;
+                  t.lexeme = intern(start, cur - start);
+               }
               else if (only_digits_before_equal_or_shap(++cur, end)) {
                  t.type = *cur == '#' ?
                     Token::sharp_integer_sharp : Token::sharp_integer_equal;
-                  t.lexeme = strings.intern(start, cur - start + 1);
+                  t.lexeme = intern(start, cur - start + 1);
                  ++cur;
               }
-               else if (cur + 1 < end and cur[1] == '\\') {
-                  start = cur += 2;
-                  skip_to_word_boundary(cur, end);
-                  t.type = Token::character;
-                  t.lexeme = strings.intern(start, cur - start);
-               }
               else {
                  skip_to_word_boundary(cur, end);
-                  t.lexeme = strings.intern(start, cur - start);
+                  t.lexeme = intern(start, cur - start);
               }
               break;
            }

-            case '|': {
-               const char* start = cur;
-               skip_to_fence(++cur, end);
-               t.type = Token::identifier;
-               t.lexeme = strings.intern(start, cur - start);
-               break;
-            }
-
            case '"': {
               const char* start = cur;
               skip_to_quote(++cur, end);
               t.type = Token::string;
-               t.lexeme = strings.intern(start, cur - start);
+               t.lexeme = intern(start, cur - start);
               break;
            }

            default:
-               if (identifier_part(*cur)) {
-                  const char* start = cur;
-                  skip_to_word_boundary(++cur, end);
-                  t.type = Token::identifier;
-                  t.lexeme = strings.intern(start, cur - start);
-                  maybe_reclassify(t);
-               }
+               if (start_symbol(cur, end))
+                  t = match_maybe_symbol(this, cur, end);
               else {
                  const char* start = cur;
                  skip_to_word_boundary(++cur, end);
-                  t.lexeme = strings.intern(start, cur - start);
+                  t.lexeme = intern(start, cur - start);
               }
               break;
            }
@@ -323,32 +416,45 @@
       // -----------
       // -- Quote --
       // -----------
-      Quote::Quote(const Syntax* s) : form(s) { }
+      Quote::Quote(const Syntax* s) : unary_form<Quote>(s) { }

-      void
-      Quote::accept(Visitor& v) const {
-         v.visit(*this);
-      }
+      // ---------------
+      // -- Antiquote --
+      // ---------------
+      Antiquote::Antiquote(const Syntax* s) : unary_form<Antiquote>(s) { }
+
+      // ------------
+      // -- Expand --
+      // ------------
+      Expand::Expand(const Syntax* s) : unary_form<Expand>(s) { }
+
+      // ----------
+      // -- Eval --
+      // ----------
+      Eval::Eval(const Syntax* s) : unary_form<Eval>(s) { }
+
+      // ------------
+      // -- Splice --
+      // ------------
+      Splice::Splice(const Syntax* s) : unary_form<Splice>(s) { }

       // --------------
       // -- Function --
       // --------------
-      Function::Function(const Syntax* s) : form(s) { }
+      Function::Function(const Syntax* s) : unary_form<Function>(s) { }

-      void
-      Function::accept(Visitor& v) const {
-         v.visit(*this);
-      }
+      // -------------
+      // -- Include --
+      Include::Include(const Syntax* s) : unary_form<Include>(s) { }

-      // ----------
-      // -- Pair --
-      // ----------
-      Pair::Pair(const Syntax* f, const Syntax* s) : elts(f, s) { }
+      // -------------
+      // -- Exclude --
+      Exclude::Exclude(const Syntax* s) : unary_form<Exclude>(s) { }

-      void
-      Pair::accept(Visitor& v) const {
-         v.visit(*this);
-      }
+      // -------------
+      // -- DotTail --
+      // -------------
+      DotTail::DotTail(const Syntax* f) : unary_form<DotTail>(f) { }

       // ----------
       // -- List --
@@ -459,14 +565,44 @@
          return quotes.allocate(s);
       }

+      const Antiquote*
+      Allocator::make_antiquote(const Syntax* s) {
+         return antis.allocate(s);
+      }
+
+      const Expand*
+      Allocator::make_expand(const Syntax* s) {
+         return exps.allocate(s);
+      }
+
+      const Eval*
+      Allocator::make_eval(const Syntax* s) {
+         return evls.allocate(s);
+      }
+
+      const Splice*
+      Allocator::make_splice(const Syntax* s) {
+         return spls.allocate(s);
+      }
+
       const Function*
       Allocator::make_function(const Syntax* s) {
          return funs.allocate(s);
       }

-      const Pair*
-      Allocator::make_pair(const Syntax* f, const Syntax* s) {
-         return pairs.allocate(f, s);
+      const Include*
+      Allocator::make_include(const Syntax* s) {
+         return incs.allocate(s);
+      }
+
+      const Exclude*
+      Allocator::make_exclude(const Syntax* s) {
+         return excs.allocate(s);
+      }
+
+      const DotTail*
+      Allocator::make_dot_tail(const Syntax* f) {
+         return tails.allocate(f);
       }

       const List*
@@ -490,7 +626,7 @@
       // Signal a parse error
       static void
       parse_error(const std::string& s) {
-         throw SystemError(s);
+         throw BasicError(s);
       }

       // Signal that an expected syntax object was missing
@@ -531,10 +667,34 @@
          return alloc.make_symbol(*cur++, kind);
       }

+      // List of lower case character names
+      static const char* charname[] = {
+         "newline", "space", "page", "tab",
+         "backspace", "return", "linefeed"
+      };
+
+      static bool
+      equal_character_name(BasicString lhs, const char* rhs) {
+         if (lhs->size() != strlen(rhs))
+            return false;
+         for (const char* cur = lhs->begin(); cur != lhs->end(); ++cur)
+            if (tolower(*cur) != *rhs++)
+               return false;
+         return true;
+      }
+
+      static bool
+      valid_character_name(BasicString s) {
+         for (int i = 0; i < length(charname); ++i)
+            if (equal_character_name(s, charname[i]))
+               return true;
+         return false;
+      }
+
       const Character*
       Parser::parse_character(const Token*& cur, const Token* last) {
-         // NOTE: For the time being, accept only simple characters.
-         if (cur->lexeme->size() != 1)
+         if (cur->lexeme->size() != 1
+             and not valid_character_name(cur->lexeme))
            parse_error("invalid literal character syntax");
         return alloc.make_character(*cur++);
      }
@@ -582,11 +742,71 @@
          return alloc.make_quote(parse_syntax(cur, last));
       }

+      // Parse an antiquotation
+      const Antiquote*
+      Parser::parse_antiquote(const Token*& cur, const Token* last) {
+         if (cur == last)
+            unexpected_end_of_input("backquote sign");
+         return alloc.make_antiquote(parse_syntax(cur, last));
+      }
+
+      // Parse an expansion request form
+      const Expand*
+      Parser::parse_expand(const Token*& cur, const Token* last) {
+         const Syntax* s = parse_syntax(cur, last);
+         if (s == 0)
+            unexpected_end_of_input("comma sign");
+         return alloc.make_expand(s);
+      }
+
+      // Parse conditional inclusions
+      const Include*
+      Parser::parse_include(const Token*& cur, const Token* last) {
+         const Syntax* s = parse_syntax(cur, last);
+         if (s == 0)
+            unexpected_end_of_input("sharp-plus sign");
+         return alloc.make_include(s);
+      }
+
+      const Exclude*
+      Parser::parse_exclude(const Token*& cur, const Token* last) {
+         const Syntax* s = parse_syntax(cur, last);
+         if (s == 0)
+            unexpected_end_of_input("sharp-minus sign");
+         return alloc.make_exclude(s);
+      }
+
+      const Eval*
+      Parser::parse_eval(const Token*& cur, const Token* last) {
+         const Syntax* s = parse_syntax(cur, last);
+         if (s == 0)
+            unexpected_end_of_input("sharp-dot sign");
+         return alloc.make_eval(s);
+      }
+
+      const Splice*
+      Parser::parse_splice(const Token*& cur, const Token* last) {
+         const Syntax* s = parse_syntax(cur, last);
+         if (s == 0)
+            unexpected_end_of_input("comma-at sign");
+         return alloc.make_splice(s);
+      }
+
+      // Skip tokens that are semantically blanks, e.g. comments.
+      // Return true if not at end of tokens.
+      static bool
+      skip_ignorable_tokens(const Token*& cur, const Token* last) {
+         while (cur < last and cur->type == Token::semicolon)
+            ++cur;
+         return cur != last;
+      }
+
       // Parse a vector of syntax objects: #(s .. s)
       const Vector*
       Parser::parse_vector(const Token*& cur, const Token* last) {
          std::vector<const Syntax*> elts;
-         while (cur < last and cur->type != Token::close_paren)
+         while (skip_ignorable_tokens(cur, last)
+                and cur->type != Token::close_paren)
            elts.push_back(parse_syntax(cur, last));
         if (cur == last)
            missing_closer_for("vector");
@@ -595,31 +815,23 @@
       }

       // Constructs a pair or a list syntax object.
-      // This function is hairy for three reasons: (a) it is not known
-      // whether we list or a pair until after we have seen the
-      // enclosed tokens; (b) a dot is allowed at most once; (c) Lisp-style
-      // improper lists are not allowed.
-      const Syntax*
-      Parser::parse_list_or_pair(const Token*& cur, const Token* last) {
+      const List*
+      Parser::parse_list(const Token*& cur, const Token* last) {
          std::vector<const Syntax*> elts;
-         bool saw_dot = false;
-         while (cur < last and cur->type != Token::close_paren) {
+         while (skip_ignorable_tokens(cur, last)
+                and cur->type != Token::close_paren) {
            if (cur->type == Token::dot) {
-               if (elts.size() != 1)
-                  parse_error("unexpected dot sign");
-               saw_dot = true;
-               ++cur;
-               continue;
+               skip_ignorable_tokens(++cur, last);
+               if (const Syntax* s = parse_syntax(cur, last)) {
+                  elts.push_back(alloc.make_dot_tail(s));
+                  break;
+               }
            }
            elts.push_back(parse_syntax(cur, last));
-            if (saw_dot && elts.size() == 2)
-               break;
         }
         if (cur == last or cur->type != Token::close_paren)
-            missing_closer_for(saw_dot ? "pair" : "list");
+            missing_closer_for("list");
         ++cur;
-         if (saw_dot)
-            return alloc.make_pair(elts.front(), elts.back());
         return alloc.make_list(elts);
      }
@@ -628,6 +840,9 @@
       const Syntax*
       Parser::parse_syntax(const Token*& cur, const Token* last) {
+         if (not skip_ignorable_tokens(cur, last))
+            return 0;
+
         switch (cur->type) {
         case Token::integer:
            return alloc.make_integer(*cur++);
@@ -660,7 +875,25 @@
            return parse_quote(++cur, last);

         case Token::open_paren:
-            return parse_list_or_pair(++cur, last);
+            return parse_list(++cur, last);
+
+         case Token::sharp_plus:
+            return parse_include(++cur, last);
+
+         case Token::sharp_minus:
+            return parse_exclude(++cur, last);
+
+         case Token::sharp_dot:
+            return parse_eval(++cur, last);
+
+         case Token::backquote:
+            return parse_antiquote(++cur, last);
+
+         case Token::comma:
+            return parse_expand(++cur, last);
+
+         case Token::comma_at:
+            return parse_splice(++cur, last);

         default:
            parse_error(std::string("parse error before ")
@@ -671,9 +904,23 @@
       const Token*
       Parser::parse(const Token* cur, const Token* last) {
-         while (cur < last)
-            syns.push_back(parse_syntax(cur, last));
+         while (cur < last)
+            if (const Syntax* s = parse_syntax(cur, last))
+               syns.push_back(s);
         return cur;
      }
+
+      Module::Module(const std::string& s) : nm(s) {
+         std::vector<Token> tokens;
+         Memory::FileMapping input(s);
+         Lexer lexer(raw_strs, tokens);
+         const char* rest = lexer.tokenize(input.begin(), input.end());
+         if (rest != input.end())
+            syntax_error("syntax error");
+         Parser parser(allocator, *this);
+         const Token* tok = parser.parse(begin_ptr(tokens), end_ptr(tokens));
+         if (tok != end_ptr(tokens))
+            parse_error("parse error");
+      }
   }
}
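The new Module constructor at the end of the patch ties the pieces together: it maps the named file into memory, runs the Lexer over the raw characters, and hands the interned token stream to the Parser, reporting failures through the BasicError thrown by syntax_error() and parse_error(). A minimal driver might look like the sketch below; the header name and the input file are illustrative assumptions, not part of this patch.

    // Hypothetical driver for the one-shot Module constructor added above.
    // Assumed for illustration: the public header is <open-axiom/sexpr> and
    // "input.lisp" names a readable Lisp source file.
    #include <iostream>
    #include <open-axiom/sexpr>

    int main() {
       try {
          // The constructor tokenizes and parses the whole file.
          OpenAxiom::Sexpr::Module module("input.lisp");
          std::cout << "parsed OK\n";
       }
       catch (...) {
          // syntax_error() and parse_error() both throw BasicError.
          std::cerr << "lexing or parsing failed\n";
          return 1;
       }
    }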