// -*- C++ -*- // Copyright (C) 2013-2014, Gabriel Dos Reis. // All rights reserved. // Written by Gabriel Dos Reis. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // - Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // // - Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in // the documentation and/or other materials provided with the // distribution. // // - Neither the name of OpenAxiom. nor the names of its contributors // may be used to endorse or promote products derived from this // software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A // PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER // OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef OPENAXIOM_TOKEN_included #define OPENAXIOM_TOKEN_included #include #include #include #include #include namespace OpenAxiom { // Categorization of Boot and Spad tokens. enum class TokenCategory : uint8_t { Unclassified, // token of unknown class Whitespace, // sequence of white-space characters Comment, // a description of an ignorable comment Punctuator, // a punctuator character Operator, // an operator both symbolic and alphabetic Integer, // an integer literal FloatingPoint, // a floating-point literal String, // a string literal Keyword, // a reserved word both symbolic and alphabetic Identifier, // an identifier Formatting, // a layout formatting token Junk, // invalid/malformed token EOS // end-of-token-stream indicator }; std::ostream& operator<<(std::ostream&, TokenCategory); // The abstract value associated with a token. enum class TokenValue : uint8_t { #undef OPENAXIOM_DEFINE_TOKEN #define OPENAXIOM_DEFINE_TOKEN(T, ...) T, #include #undef OPENAXIOM_DEFINE_TOKEN EndOfStream // end of token stream }; std::ostream& operator<<(std::ostream&, TokenValue); // Datatypes for locating lines and columns. using LineNumber = std::size_t; using ColumnIndex = std::size_t; struct Locus { LineNumber line; ColumnIndex column; }; std::ostream& operator<<(std::ostream&, const Locus&); // Program text region struct Region { Locus start; Locus end; }; // Given a symbolic or alphabetic token, retrieve its category // and associated abstract value. struct TokenClassification { TokenCategory category; TokenValue value; explicit operator bool() const { return category != TokenCategory::EOS; } }; TokenClassification classify(const std::string&); // Token data structure: a region of text with a classification. struct Token : TokenClassification, Region { using Location = Locus; }; // Cursor into a fragment. struct FragmentCursor { std::size_t line; // index of a line in a fragment std::size_t column; // column number at line. inline FragmentCursor& operator++() { ++column; return *this; } inline FragmentCursor operator++(int) { auto tmp = *this; ++*this; return tmp; } inline FragmentCursor& operator--() { --column; return *this; } inline FragmentCursor operator--(int) { auto tmp = *this; --*this; return tmp; } }; // -- Exception types struct EndOfStringUnseen { LineNumber line; ColumnIndex column; }; struct MissingExponent { LineNumber line; ColumnIndex column; }; // Object of this datatype decompose a program fragment into a // token stream. The tokens are of type indicated by Tok. template struct Tokenizer { Tokenizer(Frag& f) : frag(f), pos{ 0, frag.front().indent } { indents.push(pos.column); } bool eos() const { return pos.line >= frag.size() or (pos.line + 1 == frag.size() and pos.column >= frag.back().size()); } Tok get(Language = Language::Spad); private: Frag& frag; FragmentCursor pos; std::stack indents; std::size_t line_length() const { return frag(pos).size(); } LineNumber next_line_number() const { return pos.line + 1 < frag.size() ? frag[pos.line + 1].number : frag.back().number + 1; } ColumnIndex next_indentation() const { return pos.line + 1 < frag.size() ? frag[pos.line + 1].indent : 0; } LineNumber line_number() const { return pos.line < frag.size() ? frag(pos).number : frag.back().number + 1; } ColumnIndex column_number() const { return pos.line < frag.size() ? pos.column : 0; } using Locus = typename Tok::Location; Locus current_locus() { return { line_number(), column_number() }; } }; bool separator_or_punctuator(uint8_t); template inline void comment_token(T& t, TokenValue v) { t.category = TokenCategory::Comment; t.value = v; } template inline void operator_token(T& t, TokenValue v) { t.category = TokenCategory::Operator; t.value = v; } template inline void punctuator_token(T& t, TokenValue v) { t.category = TokenCategory::Punctuator; t.value = v; } template static void junk(L& line, ColumnIndex& idx, T& t) { while (idx < line.size() and not separator_or_punctuator(line[idx])) ++idx; t.category = TokenCategory::Junk; } template inline void skip_whitespace(L& line, ColumnIndex& idx) { while (idx < line.size() and isblank(line[idx])) ++idx; } template void string_literal(Frag& frag, FragmentCursor& pos, Tok& t) { bool done = false; bool escape = false; while (frag.covering(pos) && not done) { switch (frag(pos)[pos.column++]) { case '"': done = !escape; // fallthrough default: escape = false; break; case '_': if (pos.column == frag(pos).size() and pos.line < frag.size() - 1) { ++pos.line; pos.column = 0; } else escape = !escape; break; } } if (not done) throw EndOfStringUnseen{ frag(pos).number, pos.column }; t.category = TokenCategory::String; } template void skip_to_end_of_integer(L& line, ColumnIndex& idx) { while (idx < line.size() and isdigit(line[idx])) ++idx; } template void integer(L& line, ColumnIndex& idx, T& t) { skip_to_end_of_integer(line, idx); t.category = TokenCategory::Integer; } template T& number(L& line, ColumnIndex& idx, T& t) { integer(line, idx, t); if (idx >= line.size() or line[idx] != '.') return t; if (++idx >= line.size() or not isdigit(line[idx])) { --idx; return t; } t.category = TokenCategory::FloatingPoint; skip_to_end_of_integer(line, idx); if (idx >= line.size() or (line[idx] != 'e' and line[idx] != 'E')) return t; if (++idx < line.size() and (line[idx] == '+' or line[idx] == '-')) ++idx; if (idx >= line.size() or not isdigit(line[idx])) throw MissingExponent{ line.number, idx }; skip_to_end_of_integer(line, idx); return t; } inline bool identifier_head(uint8_t c) { return isalpha(c) or c == '%' or c == '_'; } inline bool identifier_part(uint8_t c) { return identifier_head(c) or isdigit(c); } inline bool identifier_suffix(uint8_t c) { return c == '!' or c == '?'; } inline bool internal_prefix(uint8_t c) { return c == '%' or c == '$'; } template inline void skip_prefix(L& line, ColumnIndex& idx, uint8_t c) { while (idx < line.size() and line[idx] == c) ++idx; } template T& identifier(L& line, ColumnIndex& idx, T& t, Language dialect) { t.category = TokenCategory::Identifier; ColumnIndex start = --idx; // idx was ahead by 1. if (dialect == Language::Boot and internal_prefix(line[idx])) skip_prefix(line, idx, line[idx]); bool saw_escape = false; while (idx < line.size()) { if (not identifier_part(line[idx]) and line[idx - 1] != '_') break; else if (line[idx] == '_') saw_escape = true; ++idx; } while (idx < line.size() and identifier_suffix(line[idx])) ++idx; if (!saw_escape) { auto info = classify(line.sub_string(start, idx)); if (info.category != TokenCategory::Identifier) { t.category = info.category; t.value = info.value; } } return t; } template static void left_paren_et_al(Frag& frag, FragmentCursor& pos, Tok& t) { punctuator_token(t, TokenValue::OpenParen); if (frag.covering(pos) and frag[pos] == '|') { ++pos; t.value = TokenValue::OpenMetaParen; } } template static void left_brace_et_al(Frag& frag, FragmentCursor& pos, Tok& t) { punctuator_token(t, TokenValue::OpenBrace); if (frag.covering(pos) and frag[pos] == '|') { ++pos; t.value = TokenValue::OpenMetaBrace; } } template static void left_bracket_et_al(Frag& frag, FragmentCursor& pos, Tok& t) { punctuator_token(t, TokenValue::OpenBracket); if (frag.covering(pos) and frag[pos] == '|') { ++pos; t.value = TokenValue::OpenMetaBracket; } } template static void colon_et_al(Frag& frag, FragmentCursor& pos, Tok& t) { operator_token(t, TokenValue::Colon); if (frag.covering(pos)) switch (frag[pos]) { case ':': t.value = TokenValue::ColonColon; ++pos; break; case '=': t.value = TokenValue::ColonEq; ++pos; break; case '-': t.value = TokenValue::ColonDash; ++pos; break; default: break; } } template static void star_et_al(Frag& frag, FragmentCursor& pos, Tok& t) { operator_token(t, TokenValue::Star); if (frag.covering(pos) and frag[pos] == '*') { t.value = TokenValue::StarStar; ++pos; } } template static void slash_et_al(Frag& frag, FragmentCursor& pos, Tok& t) { operator_token(t, TokenValue::Slash); if (frag.covering(pos)) switch (frag[pos]) { case '/': t.value = TokenValue::SlashSlash; ++pos; break; case '\\': t.value = TokenValue::SlashBackslash; ++pos; break; default: break; } } template static void backslash_et_al(Frag& frag, FragmentCursor& pos, Tok& t) { operator_token(t, TokenValue::Backslash); if (frag.covering(pos)) switch (frag[pos]) { case '\\': t.value = TokenValue::BackslashBackslash; ++pos; break; case '/': t.value = TokenValue::BackslashSlash; ++pos; break; default: break; } } template static void less_et_al(Frag& frag, FragmentCursor& pos, Tok& t) { operator_token(t, TokenValue::Less); if (frag.covering(pos)) switch (frag[pos]) { case '-': t.value = TokenValue::LeftArrow; ++pos; break; case '<': t.value = TokenValue::OpenChevron; ++pos; break; case '=': t.value = TokenValue::LessEq; if (frag.covering(++pos) and frag[pos] == '>') { t.value = TokenValue::Equiv; ++pos; } break; default: break; } } template static void equal_et_al(Frag& frag, FragmentCursor& pos, Tok& t) { operator_token(t, TokenValue::Eq); if (frag.covering(pos)) switch (frag[pos]) { case '>': t.value = TokenValue::Implies; ++pos; break; case '=': t.value = TokenValue::EqEq; if (frag.covering(++pos) and frag[pos] == '>') { t.value = TokenValue::FatArrow; ++pos; } break; default: break; } } template static void tilde_et_al(Frag& frag, FragmentCursor& pos, Tok& t) { operator_token(t, TokenValue::Tilde); if (frag.covering(pos) and frag[pos] == '=') { t.value = TokenValue::TildeEq; ++pos; } } template static void greater_et_al(Frag& frag, FragmentCursor& pos, Tok& t) { operator_token(t, TokenValue::Greater); if (frag.covering(pos)) switch (frag[pos]) { case '=': t.value = TokenValue::GreaterEq; ++pos; break; case '>': t.value = TokenValue::CloseChevron; ++pos; break; } } template static void bar_et_al(Frag& frag, FragmentCursor& pos, Tok& t) { punctuator_token(t, TokenValue::Bar); if (frag.covering(pos)) switch (frag[pos]) { case ']': t.value = TokenValue::CloseMetaBracket; ++pos; break; case '}': t.value = TokenValue::CloseMetaBrace; ++pos; break; case ')': t.value = TokenValue::CloseMetaParen; ++pos; break; default: break; } } template static void minus_et_al(Frag& frag, FragmentCursor& pos, Tok& t) { operator_token(t, TokenValue::Minus); if (frag.covering(pos)) switch (frag[pos]) { case '>': t.value = TokenValue::RightArrow; ++pos; break; case '-': comment_token(t, TokenValue::Wisecrack); pos.column = frag(pos).size(); break; } } template static void plus_et_al(Frag& frag, FragmentCursor& pos, Tok& t) { operator_token(t, TokenValue::Plus); if (frag.covering(pos)) switch (frag[pos]) { case '+': comment_token(t, TokenValue::Commentary); pos.column = frag(pos).size(); break; case '-': if (pos.column + 1 < frag(pos).size() and frag(pos)[pos.column + 1] == '>') { t.value = TokenValue::MapsTo; pos.column += 2; } break; default: break; } } template static void dot_et_al(Frag& frag, FragmentCursor& pos, Tok& t) { operator_token(t, TokenValue::Dot); if (frag.covering(pos) and frag[pos] == '.') { t.value = TokenValue::DotDot; ++pos; } } template static void dollar_et_al(Frag& frag, FragmentCursor& pos, Tok& t, Language dialect) { if (dialect != Language::Boot or not frag.covering(pos) or separator_or_punctuator(frag[pos])) operator_token(t, TokenValue::Dollar); else identifier(frag(pos), pos.column, t, dialect); } template static void sharp_et_al(Frag& frag, FragmentCursor& pos, Tok& t, Language dialect) { if (dialect != Language::Lisp) operator_token(t, TokenValue::Sharp); else if (frag.covering(pos)) switch (frag[pos++]) { case '(': punctuator_token(t, TokenValue::SharpLeftParen); break; case '\'': operator_token(t, TokenValue::SharpApostrophe); break; case ':': operator_token(t, TokenValue::SharpColon); break; case '+': punctuator_token(t, TokenValue::SharpPlus); break; case '-': punctuator_token(t, TokenValue::SharpMinus); break; case '.': operator_token(t, TokenValue::SharpDot); break; default: --pos; break; } } template Tok Tokenizer::get(Language dialect) { Tok t { }; t.start = current_locus(); if (eos()) { t.category = TokenCategory::EOS; t.end = current_locus(); return t; } else if (isblank(frag[pos])) { skip_whitespace(frag(pos), pos.column); t.category = TokenCategory::Whitespace; t.end = current_locus(); return t; } else if (pos.column == line_length() - 1 and frag(pos).back() == '_') { ++pos.line; pos.column = frag(pos).indent; } else if (pos.column == line_length()) { auto indent = indents.top(); auto next_indent = next_indentation(); t.start = t.end = { next_line_number(), next_indent }; if (indent < next_indent) { indents.push(next_indent); ++pos.line; pos.column = next_indent; t.category = TokenCategory::Formatting; t.value = TokenValue::Indent; } else if (indent > next_indent) { indents.pop(); t.category = TokenCategory::Formatting; t.value = TokenValue::Unindent; } else { ++pos.line; pos.column = next_indent; t.category = TokenCategory::Formatting; t.value = TokenValue::Justify; } return t; } switch (auto c = frag.advance(pos)) { case '#': sharp_et_al(frag, pos, t, dialect); break; case '@': operator_token(t, TokenValue::At); break; case '^': operator_token(t, TokenValue::Caret); break; case '&': punctuator_token(t, TokenValue::Ampersand); break; case '!': punctuator_token(t, TokenValue::Exclamation); break; case '\'': punctuator_token(t, TokenValue::Apostrophe); break; case ',': punctuator_token(t, TokenValue::Comma); break; case ';': punctuator_token(t, TokenValue::Semicolon); break; case '`': punctuator_token(t, TokenValue::Backquote); break; case '(': left_paren_et_al(frag, pos, t); break; case ')': punctuator_token(t, TokenValue::CloseParen); break; case '{': left_brace_et_al(frag, pos, t); break; case '}': punctuator_token(t, TokenValue::CloseBrace); break; case '[': left_bracket_et_al(frag, pos, t); break; case ']': punctuator_token(t, TokenValue::CloseBracket); break; case ':': colon_et_al(frag, pos, t); break; case '*': star_et_al(frag, pos, t); break; case '/': slash_et_al(frag, pos, t); break; case '\\': backslash_et_al(frag, pos, t); break; case '<': less_et_al(frag, pos, t); break; case '=': equal_et_al(frag, pos, t); break; case '~': tilde_et_al(frag, pos, t); break; case '>': greater_et_al(frag, pos, t); break; case '|': bar_et_al(frag, pos, t); break; case '-': minus_et_al(frag, pos, t); break; case '+': plus_et_al(frag, pos, t); break; case '.': dot_et_al(frag, pos, t); break; case '"': string_literal(frag, pos, t); break; case '$': dollar_et_al(frag, pos, t, dialect); break; default: if (isdigit(c)) number(frag(pos), pos.column, t); else if (identifier_head(c)) identifier(frag(pos), pos.column, t, dialect); else junk(frag(pos), pos.column, t); break; } t.end = { frag(pos).number, pos.column }; return t; } // -- Token streams. template struct TokenStream : std::vector { template explicit TokenStream(Frag& f, Language dialect = Language::Spad) { Tokenizer lex { f }; while (auto t = lex.get(dialect)) this->push_back(t); } }; } #endif // OPENAXIOM_TOKEN_included