From ef059f3f675f384c68c15076dbcf220be1e01eee Mon Sep 17 00:00:00 2001 From: dos-reis Date: Tue, 26 Aug 2014 10:07:17 +0000 Subject: Add generic Boot and Spad tokenizer. --- src/include/dialect.H | 8 +- src/include/sexpr.H | 45 ++- src/include/token-value.def | 138 +++++++++ src/include/token.H | 691 +++++++++++++++++++++++++++++++++++--------- 4 files changed, 714 insertions(+), 168 deletions(-) create mode 100644 src/include/token-value.def (limited to 'src/include') diff --git a/src/include/dialect.H b/src/include/dialect.H index f63eac04..bcfddd04 100644 --- a/src/include/dialect.H +++ b/src/include/dialect.H @@ -1,4 +1,4 @@ -// Copyright (C) 2013, Gabriel Dos Reis. +// Copyright (C) 2013-2014, Gabriel Dos Reis. // All rights reserved. // Written by Gabriel Dos Reis. // @@ -36,7 +36,11 @@ namespace OpenAxiom { // Languages for which we have parsers. enum class Language { - Spad, Boot, Lisp + Spad = 0x1, + Boot = 0x2, + Lisp = 0x4, + BootSpad = Spad | Boot, + All = Spad | Boot | Lisp, }; } diff --git a/src/include/sexpr.H b/src/include/sexpr.H index d425b6d8..84513a8b 100644 --- a/src/include/sexpr.H +++ b/src/include/sexpr.H @@ -1,4 +1,4 @@ -// Copyright (C) 2010-2013, Gabriel Dos Reis. +// Copyright (C) 2010-2014, Gabriel Dos Reis. // All rights reserved. // Written by Gabriel Dos Reis. // @@ -55,28 +55,27 @@ namespace OpenAxiom { struct Lexeme { enum Type { unknown, // unidentified token - semicolon = token::value(";"), // comment - dot = token::value("."), - comma = token::value(","), - open_paren = token::value("("), - close_paren = token::value(")"), - apostrophe = token::value("'"), - backquote = token::value("`"), - backslash = token::value("\\"), - sharp_open_paren = token::value("#("), - sharp_apostrophe = token::value("#'"), - sharp_colon = token::value("#:"), - sharp_plus = token::value("#+"), - sharp_minus = token::value("#-"), - sharp_dot = token::value("#."), - comma_at = token::value(",@"), - digraph_end = token::value(0xff,0xff), - integer, // integer literal - character, // character literal - string, // string literal - identifier, // plain identifier - sharp_integer_equal, // anchor definition, #n=
- sharp_integer_sharp // back reference, #n# + semicolon, // ";" for comment + dot, // "." + comma, // "," + open_paren, // "(" + close_paren, // ")" + apostrophe, // "'" + backquote, // "`" + backslash, // "\\" + sharp_open_paren , // "#(" + sharp_apostrophe, // "#'" + sharp_colon, // "#:" + sharp_plus, // "#+" + sharp_minus, // "#-" + sharp_dot, // "#." + comma_at, // ",@" + integer, // integer literal + character, // character literal + string, // string literal + identifier, // plain identifier + sharp_integer_equal, // anchor definition, #n= + sharp_integer_sharp // back reference, #n# }; std::pair boundary; diff --git a/src/include/token-value.def b/src/include/token-value.def new file mode 100644 index 00000000..ea79c9a5 --- /dev/null +++ b/src/include/token-value.def @@ -0,0 +1,138 @@ +// Copyright (C) 2014, Gabriel Dos Reis. +// All rights reserved. +// Written by Gabriel Dos Reis. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// - Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in +// the documentation and/or other materials provided with the +// distribution. +// +// - Neither the name of OpenAxiom. nor the names of its contributors +// may be used to endorse or promote products derived from this +// software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +// OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + +OPENAXIOM_DEFINE_TOKEN(Unknown, "", Unclassified, Language::All) +OPENAXIOM_DEFINE_TOKEN(Apostrophe, "'", Punctuator, Language::All) +OPENAXIOM_DEFINE_TOKEN(Backquote, "`", Punctuator, Language::All) +OPENAXIOM_DEFINE_TOKEN(Bar, "|", Operator, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Dot, ".", Punctuator, Language::All) +OPENAXIOM_DEFINE_TOKEN(DotDot, "..", Operator, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Colon, ":", Punctuator, Language::All) +OPENAXIOM_DEFINE_TOKEN(ColonColon, "::", Operator, Language::All) +OPENAXIOM_DEFINE_TOKEN(ColonDash, ":-", Operator, Language::Spad) +OPENAXIOM_DEFINE_TOKEN(ColonEq, ":=", Keyword, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(At, "@", Operator, Language::All) +OPENAXIOM_DEFINE_TOKEN(Exclamation, "!", Punctuator, Language::Boot) +OPENAXIOM_DEFINE_TOKEN(Comma, ",", Punctuator, Language::All) +OPENAXIOM_DEFINE_TOKEN(Semicolon, ";", Punctuator, Language::All) +OPENAXIOM_DEFINE_TOKEN(Star, "*", Operator, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(StarStar, "**", Operator, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Plus, "+", Operator, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Minus, "-", Operator, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Slash, "/", Operator, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(SlashSlash, "//", Operator, Language::Spad) +OPENAXIOM_DEFINE_TOKEN(SlashBackslash, "/\\", Operator, Language::Spad) +OPENAXIOM_DEFINE_TOKEN(Backslash, "\\", Operator, Language::Spad) +OPENAXIOM_DEFINE_TOKEN(BackslashSlash, "\\/", Operator, Language::Spad) +OPENAXIOM_DEFINE_TOKEN(BackslashBackslash, "\\\\", Operator, Language::Spad) +OPENAXIOM_DEFINE_TOKEN(Less, "<", Operator, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(LessEq, "<=", Operator, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Greater, ">", Operator, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(GreaterEq, ">=", Operator, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Eq, "=", Operator, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(EqEq, "==", Keyword, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Tilde, "~", Operator, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(TildeEq, "~=", Operator, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Caret, "^", Operator, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Pound, "#", Operator, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Dollar, "$", Operator, Language::Spad) +OPENAXIOM_DEFINE_TOKEN(Ampersand, "&", Operator, Language::Spad) +OPENAXIOM_DEFINE_TOKEN(RightArrow, "->", Operator, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(LeftArrow, "<-", Operator, Language::Boot) +OPENAXIOM_DEFINE_TOKEN(Implies, "=>", Keyword, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Equiv, "<=>", Keyword, Language::Boot) +OPENAXIOM_DEFINE_TOKEN(MapsTo, "+->", Keyword, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(FatArrow, "==>", Keyword, Language::Spad) +OPENAXIOM_DEFINE_TOKEN(OpenParen, "(", Punctuator, Language::All) +OPENAXIOM_DEFINE_TOKEN(CloseParen, ")", Punctuator, Language::All) +OPENAXIOM_DEFINE_TOKEN(OpenMetaParen, "(|", Punctuator, Language::Spad) +OPENAXIOM_DEFINE_TOKEN(CloseMetaParen, "|)", Punctuator, Language::Spad) +OPENAXIOM_DEFINE_TOKEN(OpenBracket, "[", Punctuator, Language::All) +OPENAXIOM_DEFINE_TOKEN(CloseBracket, "]", Punctuator, Language::All) +OPENAXIOM_DEFINE_TOKEN(OpenMetaBracket, "[|", Punctuator, Language::Spad) +OPENAXIOM_DEFINE_TOKEN(CloseMetaBracket, "|]", Punctuator, Language::Spad) +OPENAXIOM_DEFINE_TOKEN(OpenBrace, "{", Punctuator, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(CloseBrace, "}", Punctuator, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(OpenMetaBrace, "{|", Punctuator, Language::Spad) +OPENAXIOM_DEFINE_TOKEN(CloseMetaBrace, "|}", Punctuator, Language::Spad) +OPENAXIOM_DEFINE_TOKEN(OpenChevron, "<<", Operator, Language::Spad) +OPENAXIOM_DEFINE_TOKEN(CloseChevron, ">>", Operator, Language::Spad) + +OPENAXIOM_DEFINE_TOKEN(Wisecrack, "--", Comment, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Commentary, "++", Comment, Language::BootSpad) + +OPENAXIOM_DEFINE_TOKEN(Add, "add", Keyword, Language::Spad) +OPENAXIOM_DEFINE_TOKEN(And, "and", Operator, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Assume, "assume", Keyword, Language::Spad) +OPENAXIOM_DEFINE_TOKEN(Break, "break", Keyword, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(By, "by", Operator, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Case, "case", Operator, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Catch, "catch", Keyword, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Cross, "cross", Operator, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Do, "do", Keyword, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Else, "else", Keyword, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Exists, "exists", Keyword, Language::Spad) +OPENAXIOM_DEFINE_TOKEN(Finally, "finally", Keyword, Language::Spad) +OPENAXIOM_DEFINE_TOKEN(For, "for", Keyword, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Forall, "forall", Keyword, Language::Spad) +OPENAXIOM_DEFINE_TOKEN(From, "from", Keyword, Language::Spad) +OPENAXIOM_DEFINE_TOKEN(Function, "function", Keyword, Language::Boot) +OPENAXIOM_DEFINE_TOKEN(Has, "has", Operator, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(If, "if", Keyword, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Import, "import", Keyword, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(In, "in", Operator, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Inline, "inline", Keyword, Language::Spad) +OPENAXIOM_DEFINE_TOKEN(Is, "is", Keyword, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Isnt, "isnt", Keyword, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Iterate, "iterate", Keyword, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Leave, "leave", Keyword, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Macro, "macro", Keyword, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Mod, "mod", Operator, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Namespace, "namespace", Keyword, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Of, "of", Keyword, Language::Boot) +OPENAXIOM_DEFINE_TOKEN(Or, "or", Operator, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Pretend, "pretend", Operator, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Quo, "quo", Operator, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Rem, "rem", Operator, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Repeat, "repeat", Keyword, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Return, "return", Keyword, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Rule, "rule", Keyword, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Structure, "structure", Keyword, Language::Boot) +OPENAXIOM_DEFINE_TOKEN(Then, "then", Keyword, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(Throw, "throw", Keyword, Language::Spad) +OPENAXIOM_DEFINE_TOKEN(Try, "try", Keyword, Language::Spad) +OPENAXIOM_DEFINE_TOKEN(Until, "until", Keyword, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(With, "with", Keyword, Language::Spad) +OPENAXIOM_DEFINE_TOKEN(Where, "where", Keyword, Language::BootSpad) +OPENAXIOM_DEFINE_TOKEN(While, "while", Keyword, Language::BootSpad) diff --git a/src/include/token.H b/src/include/token.H index ef203b12..3b3b2950 100644 --- a/src/include/token.H +++ b/src/include/token.H @@ -1,4 +1,4 @@ -// Copyright (C) 2013, Gabriel Dos Reis. +// Copyright (C) 2013-2014, Gabriel Dos Reis. // All rights reserved. // Written by Gabriel Dos Reis. // @@ -34,151 +34,556 @@ #define OPENAXIOM_TOKEN_included #include +#include +#include #include +#include namespace OpenAxiom { - namespace token { - // -- Underlying representation of a token class. - using base_type = uint32_t; - - // -- 8-bit byte data type - using u8 = uint8_t; - - constexpr base_type value(u8 c) { return c; } - constexpr base_type value(u8 hi, u8 lo) { return (hi << 8) | lo; } - constexpr base_type value(u8 hi, u8 mi, u8 lo) { - return (value(hi, mi) << 8) | lo; - } - - // -- Type of literal strings of given number of characters. - template - using text_chunk = const char(&)[N+1]; - - // -- Return the token value of certain literal strings. - constexpr base_type value(text_chunk<0>) { return u8(); } - constexpr base_type value(text_chunk<1> s) { - return value(s[0]); - } - constexpr base_type value(text_chunk<2> s) { - return value(s[0], s[1]); - } - constexpr base_type value(text_chunk<3> s) { - return value(s[0], s[1], s[2]); - } - - // -- Abstract values of tokens. - enum Value : base_type { - Unknown = value(""), - Bar = value("|"), - Dot = value("."), - DotDot = value(".."), - Colon = value(":"), - ColonColon = value("::"), - ColonDash = value(":-"), - ColonEq = value(":="), - At = value("@"), - Comma = value(","), - Semicolon = value(";"), - Star = value("*"), - Plus = value("+"), - Minus = value("-"), - Slash = value("/"), - Backslash = value("\\"), - SlashSlash = value("//"), - BackslashBackslash = value("\\\\"), - BackslashSlash = value("\\/"), - SlashBackslash = value("/\\"), - Less = value("<"), - LessEq = value("<="), - Greater = value(">"), - GreaterEq = value(">="), - Eq = value("="), - EqEq = value("=="), - Tilde = value("~"), - TildeEq = value("~="), - Caret = value("^"), - Pound = value("#"), - Dollar = value("$"), - Ampersand = value("&"), - OpenParen = value("("), - CloseParen = value(")"), - OpenBracket = value("["), - CloseBracket = value("]"), - OpenBrace = value("{"), - CloseBrace = value("}"), - OpenMetParen = value("(|"), - CloseMetaParen = value("|)"), - OpenMetaBracket = value("[|"), - CloseMetaBracket = value("|]"), - OpenMetaBrace = value("{|"), - CloseMetaBrace = value("|}"), - Apostrophe = value("'"), - Backquote = value("`"), - StarStar = value("**"), - Implies = value("=>"), - RightArrow = value("->"), - LeftArrow = value("<-"), - OpenChevron = value("<<"), - CloseChevron = value(">>"), - FatArrow = value("==>"), - Equiv = value("<=>"), - MapsTo = value("+->"), - - Add = value("add"), - And = value("and"), - By = value("by"), - Do = value("do"), - For = value("for"), - Has = value("has"), - If = value("if"), - In = value("in"), - Is = value("is"), - Mod = value("mod"), - Of = value("of"), // -- Boot only - Or = value("or"), - Quo = value("quo"), - Rem = value("rem"), - Try = value("try"), - LastTrigraph = 0xffffff, - - Assume, // "assume" - Break, // "break" - Case, // "case" - Catch, // "catch" - Cross, // "cross" - Else, // "else" - Exists, // "exists" - Finally, // "finally" - From, // "from" - Forall, // "forall" - Function, // "function" -- Boot only - Import, // "import" - Inline, // "inline" - Isnt, // "isnt" - Iterate, // "iterate" - Leave, // "leave" - Macro, // "macro" - Module, // "module" -- Boot only - Namespace, // "namespace" -- Boot only - Pretend, // "pretend" - Repeat, // "repeat" - Return, // "return" - Rule, // "rule" - Structure, // "structure" -- Boot only - Then, // "then" - Throw, // "throw" - Until, // "until" - With, // "with" - Where, // "where" - While, // "while" - - IntegerLiteral, // integer literal - StringLiteral, // string literal - FPLiteral, // floating point literal - Indent, // new line indentation, greater than previous - Unindent, // new line indentation, less than previous - Justify, // align indentation with preceding line. - }; + // Categorization of Boot and Spad tokens. + enum class TokenCategory : uint8_t { + Unclassified, // token of unknown class + Whitespace, // sequence of white-space characters + Comment, // a description of an ignorable comment + Punctuator, // a punctuator character + Operator, // an operator both symbolic and alphabetic + Integer, // an integer literal + FloatingPoint, // a floating-point literal + String, // a string literal + Keyword, // a reserved word both symbolic and alphabetic + Identifier, // an identifier + Formatting, // a layout formatting token + Junk, // invalid/malformed token + EOS // end-of-token-stream indicator + }; + + std::ostream& operator<<(std::ostream&, TokenCategory); + + // The abstract value associated with a token. + enum class TokenValue : uint8_t { +#undef OPENAXIOM_DEFINE_TOKEN +#define OPENAXIOM_DEFINE_TOKEN(T, ...) T, +#include +#undef OPENAXIOM_DEFINE_TOKEN + Artificial, // Tokens after this are artificial + Indent, // new line indentation, greater than previous + Unindent, // new line indentation, less than previous + Justify, // align indentation with preceding line. + + EndOfStream // end of token stream + }; + + std::ostream& operator<<(std::ostream&, TokenValue); + + // Given a symbolic or alphabetic token, retrieve its category + // and associated abstract value. + struct TokenClassification { + TokenCategory category; + TokenValue value; + + explicit operator bool() const { + return category != TokenCategory::Unclassified; + } + }; + + TokenClassification classify(const std::string&); + + // Datatypes for locating lines and columns. + using LineNumber = std::size_t; + using ColumnIndex = std::size_t; + + // -- Exception types + struct EndOfStringUnseen { + LineNumber line; + ColumnIndex column; + }; + + struct MissingExponent { + LineNumber line; + ColumnIndex column; + }; + + // Object of this datatype decompose a program fragment into a + // token stream. The tokens are of type indicated by Tok. + template + struct TokenStream { + TokenStream(Frag& f) + : frag(f), + line(), + idx(frag.front().indent) + { + indents.push(idx); + } + + bool eos() const { + return line >= frag.size() + or (line + 1 == frag.size() and idx >= frag.back().size()); + } + + Tok get(Language = Language::Spad); + private: + Frag& frag; + std::size_t line; + std::size_t idx; + std::stack indents; + + std::size_t line_length() const { return frag[line].size(); } + LineNumber next_line_number() const { + return line + 1 < frag.size() + ? frag[line + 1].number + : frag.back().number + 1; + } + ColumnIndex next_indentation() const { + return line + 1 < frag.size() ? frag[line + 1].indent : 0; + } + + LineNumber line_number() const { + return line < frag.size() + ? frag[line].number + : frag.back().number + 1; + } + + ColumnIndex column_number() const { + return line < frag.size() ? idx : 0; + } + + using Locus = typename Tok::Location; + Locus current_locus() { + return { line_number(), column_number() }; + } + }; + + bool separator_or_punctuator(uint8_t); + + template + static void junk(L& line, ColumnIndex& idx, T& t) { + while (idx < line.size() and not separator_or_punctuator(line[idx])) + ++idx; + t.category = TokenCategory::Junk; + } + + template + inline void + skip_whitespace(L& line, ColumnIndex& idx) { + while (idx < line.size() and isspace(line[idx])) + ++idx; + } + + template + void string(L& line, ColumnIndex& idx, T& t) { + bool done = false; + bool escape = false; + while (idx < line.size() && not done) { + switch (line[idx++]) { + case '_': escape = !escape; break; + case '"': done = !escape; + // fallthrough + default: escape = false; break; + } + } + if (not done) + throw EndOfStringUnseen{ line.number, idx }; + t.category = TokenCategory::String; + } + + template + void skip_to_end_of_integer(L& line, ColumnIndex& idx) { + while (idx < line.size() and isdigit(line[idx])) + ++idx; + } + + template + void integer(L& line, ColumnIndex& idx, T& t) { + skip_to_end_of_integer(line, idx); + t.category = TokenCategory::Integer; + } + + template + T& number(L& line, ColumnIndex& idx, T& t) { + integer(line, idx, t); + if (idx >= line.size() or line[idx] != '.') + return t; + if (++idx >= line.size() or not isdigit(line[idx])) { + --idx; + return t; + } + + t.category = TokenCategory::FloatingPoint; + skip_to_end_of_integer(line, idx); + if (idx >= line.size() or (line[idx] != 'e' and line[idx] != 'E')) + return t; + if (++idx < line.size() and (line[idx] == '+' or line[idx] == '-')) + ++idx; + if (idx >= line.size() or not isdigit(line[idx])) + throw MissingExponent{ line.number, idx }; + skip_to_end_of_integer(line, idx); + return t; + } + + inline bool + identifier_head(uint8_t c) { + return isalpha(c) or c == '%' or c == '_'; + } + + inline bool + identifier_part(uint8_t c) { + return identifier_head(c) or isdigit(c); + } + + inline bool + identifier_suffix(uint8_t c) { + return c == '!' or c == '?'; + } + + inline bool internal_prefix(uint8_t c) { + return c == '%' or c == '$'; + } + + template + inline void + skip_prefix(L& line, ColumnIndex& idx, uint8_t c) { + while (idx < line.size() and line[idx] == c) + ++idx; + } + + template + T& identifier(L& line, ColumnIndex& idx, T& t, Language dialect) { + t.category = TokenCategory::Identifier; + + ColumnIndex start = --idx; // idx was ahead by 1. + if (dialect == Language::Boot and internal_prefix(line[idx])) + skip_prefix(line, idx, line[idx]); + bool saw_escape = false; + while (idx < line.size()) { + if (not identifier_part(line[idx]) and line[idx - 1] != '_') + break; + else if (line[idx] == '_') + saw_escape = true; + ++idx; + } + while (idx < line.size() and identifier_suffix(line[idx])) + ++idx; + + if (saw_escape) + t.category = TokenCategory::Identifier; + else if (auto info = classify(line.sub_string(start, idx))) { + t.category = info.category; + t.value = info.value; + } + return t; + } + + template + Tok TokenStream::get(Language dialect) { + Tok t { }; + t.start = current_locus(); + + if (eos()) { + t.category = TokenCategory::EOS; + t.end = current_locus(); + return t; + } + else if (isspace(frag[line][idx])) { + skip_whitespace(frag[line], idx); + t.category = TokenCategory::Whitespace; + t.end = current_locus(); + return t; + } + else if (idx == line_length() - 1 and frag[line].back() == '_') { + ++line; + idx = frag[line].indent; + } + else if (idx == line_length()) { + auto indent = indents.top(); + auto next_indent = next_indentation(); + t.start = t.end = { next_line_number(), next_indent }; + if (indent < next_indent) { + indents.push(next_indent); + ++line; + idx = next_indent; + t.category = TokenCategory::Formatting; + t.value = TokenValue::Indent; + } + else if (indent > next_indent) { + indents.pop(); + t.category = TokenCategory::Formatting; + t.value = TokenValue::Unindent; + } + else { + ++line; + idx = next_indent; + t.category = TokenCategory::Formatting; + t.value = TokenValue::Justify; + } + return t; + } + + switch (auto c = frag[line][idx++]) { + case '#': + t.category = TokenCategory::Operator; + t.value = TokenValue::Pound; + break; + + case '@': + t.category = TokenCategory::Operator; + t.value = TokenValue::At; + break; + + case '^': + t.category = TokenCategory::Operator; + t.value = TokenValue::Caret; + break; + + case '&': + t.category = TokenCategory::Punctuator; + t.value = TokenValue::Ampersand; + break; + + case '!': + t.category = TokenCategory::Punctuator; + t.value = TokenValue::Exclamation; + break; + + case '\'': + t.category = TokenCategory::Punctuator; + t.value = TokenValue::Apostrophe; + break; + case ',': + t.category = TokenCategory::Punctuator; + t.value = TokenValue::Comma; + break; + + case ';': + t.category = TokenCategory::Punctuator; + t.value = TokenValue::Semicolon; + break; + + case '`': + t.category = TokenCategory::Punctuator; + t.value = TokenValue::Backquote; + break; + + case '(': + t.category = TokenCategory::Punctuator; + t.value = TokenValue::OpenParen; + if (idx < line_length() and frag[line][idx] == '|') { + ++idx; + t.value = TokenValue::OpenMetaParen; + } + break; + + case ')': + t.category = TokenCategory::Punctuator; + t.value = TokenValue::CloseParen; + break; + + case '{': + t.category = TokenCategory::Punctuator; + t.value = TokenValue::OpenBrace; + if (idx < line_length() and frag[line][idx] == '|') { + ++idx; + t.value = TokenValue::OpenMetaBrace; + } + break; + + case '}': + t.category = TokenCategory::Punctuator; + t.value = TokenValue::CloseBrace; + break; + + case '[': + t.category = TokenCategory::Punctuator; + t.value = TokenValue::OpenBracket; + if (idx < line_length() and frag[line][idx] == '|') { + ++idx; + t.value = TokenValue::OpenMetaBracket; + } + break; + + case ']': + t.category = TokenCategory::Punctuator; + t.value = TokenValue::CloseBracket; + break; + + case ':': + t.category = TokenCategory::Operator; + t.value = TokenValue::Colon; + if (idx < line_length()) + switch (frag[line][idx]) { + case ':': t.value = TokenValue::ColonColon; ++idx; break; + case '=': t.value = TokenValue::ColonEq; ++idx; break; + case '-': t.value = TokenValue::ColonDash; ++idx; break; + default: break; + } + break; + + case '*': + t.category = TokenCategory::Operator; + t.value = TokenValue::Star; + if (idx < line_length() and frag[line][idx] == '*') { + t.value = TokenValue::StarStar; + ++idx; + } + break; + + case '/': + t.category = TokenCategory::Operator; + t.value = TokenValue::Slash; + if (idx < line_length()) + switch (frag[line][idx]) { + case '/': t.value = TokenValue::SlashSlash; ++idx; break; + case '\\': t.value = TokenValue::SlashBackslash; ++idx; break; + default: break; + } + break; + + case '\\': + t.category = TokenCategory::Operator; + t.value = TokenValue::Backslash; + if (idx < line_length()) + switch (frag[line][idx]) { + case '\\': t.value = TokenValue::BackslashBackslash; ++idx; break; + case '/': t.value = TokenValue::BackslashSlash; ++idx; break; + default: break; + } + break; + + case '<': + t.category = TokenCategory::Operator; + t.value = TokenValue::Less; + if (idx < line_length()) + switch (frag[line][idx]) { + case '-': t.value = TokenValue::LeftArrow; ++idx; break; + case '<': t.value = TokenValue::OpenChevron; ++idx; break; + case '=': + t.value = TokenValue::LessEq; + if (++idx < line_length() and frag[line][idx] == '>') { + t.value = TokenValue::Equiv; + ++idx; + } + break; + default: break; + } + break; + + case '=': + t.category = TokenCategory::Operator; + t.value = TokenValue::Eq; + if (idx < line_length()) + switch (frag[line][idx]) { + case '>': t.value = TokenValue::Implies; ++idx; break; + case '=': + t.value = TokenValue::EqEq; + if (++idx < line_length() and frag[line][idx] == '>') { + t.value = TokenValue::FatArrow; + ++idx; + } + break; + default: break; + } + break; + + case '~': + t.category = TokenCategory::Operator; + t.value = TokenValue::Tilde; + if (idx < line_length() and frag[line][idx] == '=') { + t.value = TokenValue::TildeEq; + ++idx; + } + break; + + case '>': + t.category = TokenCategory::Operator; + t.value = TokenValue::Greater; + if (idx < line_length()) + switch (frag[line][idx]) { + case '=': t.value = TokenValue::GreaterEq; ++idx; break; + case '>': t.value = TokenValue::CloseChevron; ++idx; break; + } + break; + + case '|': + t.category = TokenCategory::Operator; + t.value = TokenValue::Bar; + if (idx < line_length()) + switch (frag[line][idx]) { + case ']': t.value = TokenValue::CloseMetaBracket; ++idx; break; + case '}': t.value = TokenValue::CloseMetaBrace; ++idx; break; + case ')': t.value = TokenValue::CloseMetaParen; ++idx; break; + default: break; + } + break; + + case '-': + t.category = TokenCategory::Operator; + t.value = TokenValue::Minus; + if (idx < line_length()) + switch (frag[line][idx]) { + case '>': t.value = TokenValue::RightArrow; ++idx; break; + case '-': + t.category = TokenCategory::Comment; + t.value = TokenValue::Wisecrack; + idx = frag[line].size(); + break; + } + break; + + case '+': + t.category = TokenCategory::Operator; + t.value = TokenValue::Plus; + if (idx < line_length()) + switch (frag[line][idx]) { + case '+': + t.category = TokenCategory::Comment; + t.value = TokenValue::Commentary; + idx = frag[line].size(); + break; + case '-': + if (idx + 1 < line_length() and frag[line][idx+1] == '>') { + t.value = TokenValue::MapsTo; + idx += 2; + } + break; + default: break; + } + break; + + case '.': + t.category = TokenCategory::Punctuator; + t.value = TokenValue::Dot; + if (idx < line_length() and frag[line][idx] == '.') { + t.category = TokenCategory::Operator; + t.value = TokenValue::DotDot; + ++idx; + } + break; + + case '"': + string(frag[line], idx, t); + break; + + case '$': + if (dialect != Language::Boot or idx >= line_length() + or separator_or_punctuator(frag[line][idx])) { + t.category = TokenCategory::Operator; + t.value = TokenValue::Dollar; + } + else + identifier(frag[line], idx, t, dialect); + break; + + default: + if (isdigit(c)) + number(frag[line], idx, t); + else if (identifier_head(c)) + identifier(frag[line], idx, t, dialect); + else + junk(frag[line], idx, t); + break; + } + + t.end = { frag[line].number, idx }; + return t; } } -- cgit v1.2.3