diff options
Diffstat (limited to 'src/include/token.H')
-rw-r--r-- | src/include/token.H | 691 |
1 files changed, 548 insertions, 143 deletions
diff --git a/src/include/token.H b/src/include/token.H index ef203b12..3b3b2950 100644 --- a/src/include/token.H +++ b/src/include/token.H @@ -1,4 +1,4 @@ -// Copyright (C) 2013, Gabriel Dos Reis. +// Copyright (C) 2013-2014, Gabriel Dos Reis. // All rights reserved. // Written by Gabriel Dos Reis. // @@ -34,151 +34,556 @@ #define OPENAXIOM_TOKEN_included #include <stdint.h> +#include <stack> +#include <iosfwd> #include <open-axiom/Input> +#include <open-axiom/dialect> namespace OpenAxiom { - namespace token { - // -- Underlying representation of a token class. - using base_type = uint32_t; - - // -- 8-bit byte data type - using u8 = uint8_t; - - constexpr base_type value(u8 c) { return c; } - constexpr base_type value(u8 hi, u8 lo) { return (hi << 8) | lo; } - constexpr base_type value(u8 hi, u8 mi, u8 lo) { - return (value(hi, mi) << 8) | lo; - } - - // -- Type of literal strings of given number of characters. - template<int N> - using text_chunk = const char(&)[N+1]; - - // -- Return the token value of certain literal strings. - constexpr base_type value(text_chunk<0>) { return u8(); } - constexpr base_type value(text_chunk<1> s) { - return value(s[0]); - } - constexpr base_type value(text_chunk<2> s) { - return value(s[0], s[1]); - } - constexpr base_type value(text_chunk<3> s) { - return value(s[0], s[1], s[2]); - } - - // -- Abstract values of tokens. - enum Value : base_type { - Unknown = value(""), - Bar = value("|"), - Dot = value("."), - DotDot = value(".."), - Colon = value(":"), - ColonColon = value("::"), - ColonDash = value(":-"), - ColonEq = value(":="), - At = value("@"), - Comma = value(","), - Semicolon = value(";"), - Star = value("*"), - Plus = value("+"), - Minus = value("-"), - Slash = value("/"), - Backslash = value("\\"), - SlashSlash = value("//"), - BackslashBackslash = value("\\\\"), - BackslashSlash = value("\\/"), - SlashBackslash = value("/\\"), - Less = value("<"), - LessEq = value("<="), - Greater = value(">"), - GreaterEq = value(">="), - Eq = value("="), - EqEq = value("=="), - Tilde = value("~"), - TildeEq = value("~="), - Caret = value("^"), - Pound = value("#"), - Dollar = value("$"), - Ampersand = value("&"), - OpenParen = value("("), - CloseParen = value(")"), - OpenBracket = value("["), - CloseBracket = value("]"), - OpenBrace = value("{"), - CloseBrace = value("}"), - OpenMetParen = value("(|"), - CloseMetaParen = value("|)"), - OpenMetaBracket = value("[|"), - CloseMetaBracket = value("|]"), - OpenMetaBrace = value("{|"), - CloseMetaBrace = value("|}"), - Apostrophe = value("'"), - Backquote = value("`"), - StarStar = value("**"), - Implies = value("=>"), - RightArrow = value("->"), - LeftArrow = value("<-"), - OpenChevron = value("<<"), - CloseChevron = value(">>"), - FatArrow = value("==>"), - Equiv = value("<=>"), - MapsTo = value("+->"), - - Add = value("add"), - And = value("and"), - By = value("by"), - Do = value("do"), - For = value("for"), - Has = value("has"), - If = value("if"), - In = value("in"), - Is = value("is"), - Mod = value("mod"), - Of = value("of"), // -- Boot only - Or = value("or"), - Quo = value("quo"), - Rem = value("rem"), - Try = value("try"), - LastTrigraph = 0xffffff, - - Assume, // "assume" - Break, // "break" - Case, // "case" - Catch, // "catch" - Cross, // "cross" - Else, // "else" - Exists, // "exists" - Finally, // "finally" - From, // "from" - Forall, // "forall" - Function, // "function" -- Boot only - Import, // "import" - Inline, // "inline" - Isnt, // "isnt" - Iterate, // "iterate" - Leave, // "leave" - Macro, // "macro" - Module, // "module" -- Boot only - Namespace, // "namespace" -- Boot only - Pretend, // "pretend" - Repeat, // "repeat" - Return, // "return" - Rule, // "rule" - Structure, // "structure" -- Boot only - Then, // "then" - Throw, // "throw" - Until, // "until" - With, // "with" - Where, // "where" - While, // "while" - - IntegerLiteral, // integer literal - StringLiteral, // string literal - FPLiteral, // floating point literal - Indent, // new line indentation, greater than previous - Unindent, // new line indentation, less than previous - Justify, // align indentation with preceding line. - }; + // Categorization of Boot and Spad tokens. + enum class TokenCategory : uint8_t { + Unclassified, // token of unknown class + Whitespace, // sequence of white-space characters + Comment, // a description of an ignorable comment + Punctuator, // a punctuator character + Operator, // an operator both symbolic and alphabetic + Integer, // an integer literal + FloatingPoint, // a floating-point literal + String, // a string literal + Keyword, // a reserved word both symbolic and alphabetic + Identifier, // an identifier + Formatting, // a layout formatting token + Junk, // invalid/malformed token + EOS // end-of-token-stream indicator + }; + + std::ostream& operator<<(std::ostream&, TokenCategory); + + // The abstract value associated with a token. + enum class TokenValue : uint8_t { +#undef OPENAXIOM_DEFINE_TOKEN +#define OPENAXIOM_DEFINE_TOKEN(T, ...) T, +#include <open-axiom/token-value> +#undef OPENAXIOM_DEFINE_TOKEN + Artificial, // Tokens after this are artificial + Indent, // new line indentation, greater than previous + Unindent, // new line indentation, less than previous + Justify, // align indentation with preceding line. + + EndOfStream // end of token stream + }; + + std::ostream& operator<<(std::ostream&, TokenValue); + + // Given a symbolic or alphabetic token, retrieve its category + // and associated abstract value. + struct TokenClassification { + TokenCategory category; + TokenValue value; + + explicit operator bool() const { + return category != TokenCategory::Unclassified; + } + }; + + TokenClassification classify(const std::string&); + + // Datatypes for locating lines and columns. + using LineNumber = std::size_t; + using ColumnIndex = std::size_t; + + // -- Exception types + struct EndOfStringUnseen { + LineNumber line; + ColumnIndex column; + }; + + struct MissingExponent { + LineNumber line; + ColumnIndex column; + }; + + // Object of this datatype decompose a program fragment into a + // token stream. The tokens are of type indicated by Tok. + template<typename Frag, typename Tok> + struct TokenStream { + TokenStream(Frag& f) + : frag(f), + line(), + idx(frag.front().indent) + { + indents.push(idx); + } + + bool eos() const { + return line >= frag.size() + or (line + 1 == frag.size() and idx >= frag.back().size()); + } + + Tok get(Language = Language::Spad); + private: + Frag& frag; + std::size_t line; + std::size_t idx; + std::stack<ColumnIndex> indents; + + std::size_t line_length() const { return frag[line].size(); } + LineNumber next_line_number() const { + return line + 1 < frag.size() + ? frag[line + 1].number + : frag.back().number + 1; + } + ColumnIndex next_indentation() const { + return line + 1 < frag.size() ? frag[line + 1].indent : 0; + } + + LineNumber line_number() const { + return line < frag.size() + ? frag[line].number + : frag.back().number + 1; + } + + ColumnIndex column_number() const { + return line < frag.size() ? idx : 0; + } + + using Locus = typename Tok::Location; + Locus current_locus() { + return { line_number(), column_number() }; + } + }; + + bool separator_or_punctuator(uint8_t); + + template<typename L, typename T> + static void junk(L& line, ColumnIndex& idx, T& t) { + while (idx < line.size() and not separator_or_punctuator(line[idx])) + ++idx; + t.category = TokenCategory::Junk; + } + + template<typename L> + inline void + skip_whitespace(L& line, ColumnIndex& idx) { + while (idx < line.size() and isspace(line[idx])) + ++idx; + } + + template<typename L, typename T> + void string(L& line, ColumnIndex& idx, T& t) { + bool done = false; + bool escape = false; + while (idx < line.size() && not done) { + switch (line[idx++]) { + case '_': escape = !escape; break; + case '"': done = !escape; + // fallthrough + default: escape = false; break; + } + } + if (not done) + throw EndOfStringUnseen{ line.number, idx }; + t.category = TokenCategory::String; + } + + template<typename L> + void skip_to_end_of_integer(L& line, ColumnIndex& idx) { + while (idx < line.size() and isdigit(line[idx])) + ++idx; + } + + template<typename L, typename T> + void integer(L& line, ColumnIndex& idx, T& t) { + skip_to_end_of_integer(line, idx); + t.category = TokenCategory::Integer; + } + + template<typename L, typename T> + T& number(L& line, ColumnIndex& idx, T& t) { + integer(line, idx, t); + if (idx >= line.size() or line[idx] != '.') + return t; + if (++idx >= line.size() or not isdigit(line[idx])) { + --idx; + return t; + } + + t.category = TokenCategory::FloatingPoint; + skip_to_end_of_integer(line, idx); + if (idx >= line.size() or (line[idx] != 'e' and line[idx] != 'E')) + return t; + if (++idx < line.size() and (line[idx] == '+' or line[idx] == '-')) + ++idx; + if (idx >= line.size() or not isdigit(line[idx])) + throw MissingExponent{ line.number, idx }; + skip_to_end_of_integer(line, idx); + return t; + } + + inline bool + identifier_head(uint8_t c) { + return isalpha(c) or c == '%' or c == '_'; + } + + inline bool + identifier_part(uint8_t c) { + return identifier_head(c) or isdigit(c); + } + + inline bool + identifier_suffix(uint8_t c) { + return c == '!' or c == '?'; + } + + inline bool internal_prefix(uint8_t c) { + return c == '%' or c == '$'; + } + + template<typename L> + inline void + skip_prefix(L& line, ColumnIndex& idx, uint8_t c) { + while (idx < line.size() and line[idx] == c) + ++idx; + } + + template<typename L, typename T> + T& identifier(L& line, ColumnIndex& idx, T& t, Language dialect) { + t.category = TokenCategory::Identifier; + + ColumnIndex start = --idx; // idx was ahead by 1. + if (dialect == Language::Boot and internal_prefix(line[idx])) + skip_prefix(line, idx, line[idx]); + bool saw_escape = false; + while (idx < line.size()) { + if (not identifier_part(line[idx]) and line[idx - 1] != '_') + break; + else if (line[idx] == '_') + saw_escape = true; + ++idx; + } + while (idx < line.size() and identifier_suffix(line[idx])) + ++idx; + + if (saw_escape) + t.category = TokenCategory::Identifier; + else if (auto info = classify(line.sub_string(start, idx))) { + t.category = info.category; + t.value = info.value; + } + return t; + } + + template<typename Frag, typename Tok> + Tok TokenStream<Frag, Tok>::get(Language dialect) { + Tok t { }; + t.start = current_locus(); + + if (eos()) { + t.category = TokenCategory::EOS; + t.end = current_locus(); + return t; + } + else if (isspace(frag[line][idx])) { + skip_whitespace(frag[line], idx); + t.category = TokenCategory::Whitespace; + t.end = current_locus(); + return t; + } + else if (idx == line_length() - 1 and frag[line].back() == '_') { + ++line; + idx = frag[line].indent; + } + else if (idx == line_length()) { + auto indent = indents.top(); + auto next_indent = next_indentation(); + t.start = t.end = { next_line_number(), next_indent }; + if (indent < next_indent) { + indents.push(next_indent); + ++line; + idx = next_indent; + t.category = TokenCategory::Formatting; + t.value = TokenValue::Indent; + } + else if (indent > next_indent) { + indents.pop(); + t.category = TokenCategory::Formatting; + t.value = TokenValue::Unindent; + } + else { + ++line; + idx = next_indent; + t.category = TokenCategory::Formatting; + t.value = TokenValue::Justify; + } + return t; + } + + switch (auto c = frag[line][idx++]) { + case '#': + t.category = TokenCategory::Operator; + t.value = TokenValue::Pound; + break; + + case '@': + t.category = TokenCategory::Operator; + t.value = TokenValue::At; + break; + + case '^': + t.category = TokenCategory::Operator; + t.value = TokenValue::Caret; + break; + + case '&': + t.category = TokenCategory::Punctuator; + t.value = TokenValue::Ampersand; + break; + + case '!': + t.category = TokenCategory::Punctuator; + t.value = TokenValue::Exclamation; + break; + + case '\'': + t.category = TokenCategory::Punctuator; + t.value = TokenValue::Apostrophe; + break; + case ',': + t.category = TokenCategory::Punctuator; + t.value = TokenValue::Comma; + break; + + case ';': + t.category = TokenCategory::Punctuator; + t.value = TokenValue::Semicolon; + break; + + case '`': + t.category = TokenCategory::Punctuator; + t.value = TokenValue::Backquote; + break; + + case '(': + t.category = TokenCategory::Punctuator; + t.value = TokenValue::OpenParen; + if (idx < line_length() and frag[line][idx] == '|') { + ++idx; + t.value = TokenValue::OpenMetaParen; + } + break; + + case ')': + t.category = TokenCategory::Punctuator; + t.value = TokenValue::CloseParen; + break; + + case '{': + t.category = TokenCategory::Punctuator; + t.value = TokenValue::OpenBrace; + if (idx < line_length() and frag[line][idx] == '|') { + ++idx; + t.value = TokenValue::OpenMetaBrace; + } + break; + + case '}': + t.category = TokenCategory::Punctuator; + t.value = TokenValue::CloseBrace; + break; + + case '[': + t.category = TokenCategory::Punctuator; + t.value = TokenValue::OpenBracket; + if (idx < line_length() and frag[line][idx] == '|') { + ++idx; + t.value = TokenValue::OpenMetaBracket; + } + break; + + case ']': + t.category = TokenCategory::Punctuator; + t.value = TokenValue::CloseBracket; + break; + + case ':': + t.category = TokenCategory::Operator; + t.value = TokenValue::Colon; + if (idx < line_length()) + switch (frag[line][idx]) { + case ':': t.value = TokenValue::ColonColon; ++idx; break; + case '=': t.value = TokenValue::ColonEq; ++idx; break; + case '-': t.value = TokenValue::ColonDash; ++idx; break; + default: break; + } + break; + + case '*': + t.category = TokenCategory::Operator; + t.value = TokenValue::Star; + if (idx < line_length() and frag[line][idx] == '*') { + t.value = TokenValue::StarStar; + ++idx; + } + break; + + case '/': + t.category = TokenCategory::Operator; + t.value = TokenValue::Slash; + if (idx < line_length()) + switch (frag[line][idx]) { + case '/': t.value = TokenValue::SlashSlash; ++idx; break; + case '\\': t.value = TokenValue::SlashBackslash; ++idx; break; + default: break; + } + break; + + case '\\': + t.category = TokenCategory::Operator; + t.value = TokenValue::Backslash; + if (idx < line_length()) + switch (frag[line][idx]) { + case '\\': t.value = TokenValue::BackslashBackslash; ++idx; break; + case '/': t.value = TokenValue::BackslashSlash; ++idx; break; + default: break; + } + break; + + case '<': + t.category = TokenCategory::Operator; + t.value = TokenValue::Less; + if (idx < line_length()) + switch (frag[line][idx]) { + case '-': t.value = TokenValue::LeftArrow; ++idx; break; + case '<': t.value = TokenValue::OpenChevron; ++idx; break; + case '=': + t.value = TokenValue::LessEq; + if (++idx < line_length() and frag[line][idx] == '>') { + t.value = TokenValue::Equiv; + ++idx; + } + break; + default: break; + } + break; + + case '=': + t.category = TokenCategory::Operator; + t.value = TokenValue::Eq; + if (idx < line_length()) + switch (frag[line][idx]) { + case '>': t.value = TokenValue::Implies; ++idx; break; + case '=': + t.value = TokenValue::EqEq; + if (++idx < line_length() and frag[line][idx] == '>') { + t.value = TokenValue::FatArrow; + ++idx; + } + break; + default: break; + } + break; + + case '~': + t.category = TokenCategory::Operator; + t.value = TokenValue::Tilde; + if (idx < line_length() and frag[line][idx] == '=') { + t.value = TokenValue::TildeEq; + ++idx; + } + break; + + case '>': + t.category = TokenCategory::Operator; + t.value = TokenValue::Greater; + if (idx < line_length()) + switch (frag[line][idx]) { + case '=': t.value = TokenValue::GreaterEq; ++idx; break; + case '>': t.value = TokenValue::CloseChevron; ++idx; break; + } + break; + + case '|': + t.category = TokenCategory::Operator; + t.value = TokenValue::Bar; + if (idx < line_length()) + switch (frag[line][idx]) { + case ']': t.value = TokenValue::CloseMetaBracket; ++idx; break; + case '}': t.value = TokenValue::CloseMetaBrace; ++idx; break; + case ')': t.value = TokenValue::CloseMetaParen; ++idx; break; + default: break; + } + break; + + case '-': + t.category = TokenCategory::Operator; + t.value = TokenValue::Minus; + if (idx < line_length()) + switch (frag[line][idx]) { + case '>': t.value = TokenValue::RightArrow; ++idx; break; + case '-': + t.category = TokenCategory::Comment; + t.value = TokenValue::Wisecrack; + idx = frag[line].size(); + break; + } + break; + + case '+': + t.category = TokenCategory::Operator; + t.value = TokenValue::Plus; + if (idx < line_length()) + switch (frag[line][idx]) { + case '+': + t.category = TokenCategory::Comment; + t.value = TokenValue::Commentary; + idx = frag[line].size(); + break; + case '-': + if (idx + 1 < line_length() and frag[line][idx+1] == '>') { + t.value = TokenValue::MapsTo; + idx += 2; + } + break; + default: break; + } + break; + + case '.': + t.category = TokenCategory::Punctuator; + t.value = TokenValue::Dot; + if (idx < line_length() and frag[line][idx] == '.') { + t.category = TokenCategory::Operator; + t.value = TokenValue::DotDot; + ++idx; + } + break; + + case '"': + string(frag[line], idx, t); + break; + + case '$': + if (dialect != Language::Boot or idx >= line_length() + or separator_or_punctuator(frag[line][idx])) { + t.category = TokenCategory::Operator; + t.value = TokenValue::Dollar; + } + else + identifier(frag[line], idx, t, dialect); + break; + + default: + if (isdigit(c)) + number(frag[line], idx, t); + else if (identifier_head(c)) + identifier(frag[line], idx, t, dialect); + else + junk(frag[line], idx, t); + break; + } + + t.end = { frag[line].number, idx }; + return t; } } |