Add generic Boot and Spad tokenizer.

author: dos-reis <gdr@axiomatics.org> 2014-08-26 10:07:17 +0000
committer: dos-reis <gdr@axiomatics.org> 2014-08-26 10:07:17 +0000
commit: ef059f3f675f384c68c15076dbcf220be1e01eee (patch)
tree: 08124f18e4f7a3044b719ae860e3b492ed704287 /src/include
parent: cfffc75b762f4364623f85a887b7e564421e3127 (diff)
download: open-axiom-ef059f3f675f384c68c15076dbcf220be1e01eee.tar.gz
4 files changed, 714 insertions, 168 deletions
diff --git a/src/include/dialect.H b/src/include/dialect.H
index f63eac04..bcfddd04 100644
--- a/src/include/dialect.H
+++ b/src/include/dialect.H
@@ -1,4 +1,4 @@
-// Copyright (C) 2013, Gabriel Dos Reis.
+// Copyright (C) 2013-2014, Gabriel Dos Reis.
 // All rights reserved.
 // Written by Gabriel Dos Reis.
 //
@@ -36,7 +36,11 @@
 namespace OpenAxiom {
    // Languages for which we have parsers.
    enum class Language {
-      Spad, Boot, Lisp
+      // Each language occupies its own bit, so a set of languages can be
+      // spelled as the bitwise-or of the individual enumerators.
+      Spad = 0x1,
+      Boot = 0x2,
+      Lisp = 0x4,
+      // Named combinations of the single-language bits above.
+      BootSpad = Spad | Boot,
+      All = Spad | Boot | Lisp,
    };
 }
 
diff --git a/src/include/sexpr.H b/src/include/sexpr.H
index d425b6d8..84513a8b 100644
--- a/src/include/sexpr.H
+++ b/src/include/sexpr.H
@@ -1,4 +1,4 @@
-// Copyright (C) 2010-2013, Gabriel Dos Reis.
+// Copyright (C) 2010-2014, Gabriel Dos Reis.
 // All rights reserved.
 // Written by Gabriel Dos Reis.
 //
@@ -55,28 +55,27 @@ namespace OpenAxiom {
       struct Lexeme {
          enum Type {
             unknown,                 // unidentified token
-            semicolon        = token::value(";"), // comment
-            dot              = token::value("."),
-            comma            = token::value(","),
-            open_paren       = token::value("("),
-            close_paren      = token::value(")"),
-            apostrophe       = token::value("'"),
-            backquote        = token::value("`"),
-            backslash        = token::value("\\"),
-            sharp_open_paren = token::value("#("),
-            sharp_apostrophe = token::value("#'"),
-            sharp_colon      = token::value("#:"),
-            sharp_plus       = token::value("#+"),
-            sharp_minus      = token::value("#-"),
-            sharp_dot        = token::value("#."),
-            comma_at         = token::value(",@"),
-            digraph_end      = token::value(0xff,0xff),
-            integer,                // integer literal
-            character,              // character literal
-            string,                 // string literal
-            identifier,             // plain identifier
-            sharp_integer_equal,    // anchor definition, #n=<form>
-            sharp_integer_sharp     // back reference, #n#
+            semicolon,               // ";" for comment
+            dot,                     // "."
+            comma,                   // ","
+            open_paren,              // "("
+            close_paren,             // ")"
+            apostrophe,              // "'"
+            backquote,               // "`"
+            backslash,               // "\\"
+            sharp_open_paren ,       // "#("
+            sharp_apostrophe,        // "#'"
+            sharp_colon,             // "#:"
+            sharp_plus,              // "#+"
+            sharp_minus,             // "#-"
+            sharp_dot,               // "#."
+            comma_at,                // ",@"
+            integer,                 // integer literal
+            character,               // character literal
+            string,                  // string literal
+            identifier,              // plain identifier
+            sharp_integer_equal,     // anchor definition, #n=<form>
+            sharp_integer_sharp      // back reference, #n#
          };
 
          std::pair<const Byte*, const Byte*> boundary;
diff --git a/src/include/token-value.def b/src/include/token-value.def
new file mode 100644
index 00000000..ea79c9a5
--- /dev/null
+++ b/src/include/token-value.def
@@ -0,0 +1,138 @@
+// Copyright (C) 2014, Gabriel Dos Reis.
+// All rights reserved.
+// Written by Gabriel Dos Reis.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     - Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//
+//     - Redistributions in binary form must reproduce the above copyright
+//       notice, this list of conditions and the following disclaimer in
+//       the documentation and/or other materials provided with the
+//       distribution.
+//
+//     - Neither the name of OpenAxiom. nor the names of its contributors
+//       may be used to endorse or promote products derived from this
+//       software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+// OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+
+// Token table.  Each entry has the form
+//    OPENAXIOM_DEFINE_TOKEN(Name, "spelling", Category, Languages)
+// This file does not define OPENAXIOM_DEFINE_TOKEN; the including file
+// must define it before the inclusion to select what it needs from each
+// entry.  Category is presumably a TokenCategory enumerator name and
+// Languages a Language value -- confirm against the users of the table.
+OPENAXIOM_DEFINE_TOKEN(Unknown, "<unknown>", Unclassified, Language::All)
+OPENAXIOM_DEFINE_TOKEN(Apostrophe, "'", Punctuator, Language::All)
+OPENAXIOM_DEFINE_TOKEN(Backquote, "`", Punctuator, Language::All)
+OPENAXIOM_DEFINE_TOKEN(Bar, "|", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Dot, ".", Punctuator, Language::All)
+OPENAXIOM_DEFINE_TOKEN(DotDot, "..", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Colon, ":", Punctuator, Language::All)
+OPENAXIOM_DEFINE_TOKEN(ColonColon, "::", Operator, Language::All)
+OPENAXIOM_DEFINE_TOKEN(ColonDash, ":-", Operator, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(ColonEq, ":=", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(At, "@", Operator, Language::All)
+OPENAXIOM_DEFINE_TOKEN(Exclamation, "!", Punctuator, Language::Boot)
+OPENAXIOM_DEFINE_TOKEN(Comma, ",", Punctuator, Language::All)
+OPENAXIOM_DEFINE_TOKEN(Semicolon, ";", Punctuator, Language::All)
+OPENAXIOM_DEFINE_TOKEN(Star, "*", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(StarStar, "**", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Plus, "+", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Minus, "-", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Slash, "/", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(SlashSlash, "//", Operator, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(SlashBackslash, "/\\", Operator, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(Backslash, "\\", Operator, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(BackslashSlash, "\\/", Operator, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(BackslashBackslash, "\\\\", Operator, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(Less, "<", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(LessEq, "<=", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Greater, ">", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(GreaterEq, ">=", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Eq, "=", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(EqEq, "==", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Tilde, "~", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(TildeEq, "~=", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Caret, "^", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Pound, "#", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Dollar, "$", Operator, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(Ampersand, "&", Operator, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(RightArrow, "->", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(LeftArrow, "<-", Operator, Language::Boot)
+OPENAXIOM_DEFINE_TOKEN(Implies, "=>", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Equiv, "<=>", Keyword, Language::Boot)
+OPENAXIOM_DEFINE_TOKEN(MapsTo, "+->", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(FatArrow, "==>", Keyword, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(OpenParen, "(", Punctuator, Language::All)
+OPENAXIOM_DEFINE_TOKEN(CloseParen, ")", Punctuator, Language::All)
+OPENAXIOM_DEFINE_TOKEN(OpenMetaParen, "(|", Punctuator, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(CloseMetaParen, "|)", Punctuator, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(OpenBracket, "[", Punctuator, Language::All)
+OPENAXIOM_DEFINE_TOKEN(CloseBracket, "]", Punctuator, Language::All)
+OPENAXIOM_DEFINE_TOKEN(OpenMetaBracket, "[|", Punctuator, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(CloseMetaBracket, "|]", Punctuator, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(OpenBrace, "{", Punctuator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(CloseBrace, "}", Punctuator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(OpenMetaBrace, "{|", Punctuator, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(CloseMetaBrace, "|}", Punctuator, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(OpenChevron, "<<", Operator, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(CloseChevron, ">>", Operator, Language::Spad)
+
+OPENAXIOM_DEFINE_TOKEN(Wisecrack, "--", Comment, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Commentary, "++", Comment, Language::BootSpad)
+
+OPENAXIOM_DEFINE_TOKEN(Add, "add", Keyword, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(And, "and", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Assume, "assume", Keyword, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(Break, "break", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(By, "by", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Case, "case", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Catch, "catch", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Cross, "cross", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Do, "do", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Else, "else", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Exists, "exists", Keyword, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(Finally, "finally", Keyword, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(For, "for", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Forall, "forall", Keyword, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(From, "from", Keyword, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(Function, "function", Keyword, Language::Boot)
+OPENAXIOM_DEFINE_TOKEN(Has, "has", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(If, "if", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Import, "import", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(In, "in", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Inline, "inline", Keyword, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(Is, "is", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Isnt, "isnt", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Iterate, "iterate", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Leave, "leave", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Macro, "macro", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Mod, "mod", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Namespace, "namespace", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Of, "of", Keyword, Language::Boot)
+OPENAXIOM_DEFINE_TOKEN(Or, "or", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Pretend, "pretend", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Quo, "quo", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Rem, "rem", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Repeat, "repeat", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Return, "return", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Rule, "rule", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Structure, "structure", Keyword, Language::Boot)
+OPENAXIOM_DEFINE_TOKEN(Then, "then", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Throw, "throw", Keyword, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(Try, "try", Keyword, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(Until, "until", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(With, "with", Keyword, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(Where, "where", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(While, "while", Keyword, Language::BootSpad)
diff --git a/src/include/token.H b/src/include/token.H
index ef203b12..3b3b2950 100644
--- a/src/include/token.H
+++ b/src/include/token.H
@@ -1,4 +1,4 @@
-// Copyright (C) 2013, Gabriel Dos Reis.
+// Copyright (C) 2013-2014, Gabriel Dos Reis.
 // All rights reserved.
 // Written by Gabriel Dos Reis.
 //
@@ -34,151 +34,556 @@
 #define OPENAXIOM_TOKEN_included
 
 #include <stdint.h>
+#include <stack>
+#include <iosfwd>
 #include <open-axiom/Input>
+#include <open-axiom/dialect>
 
 namespace OpenAxiom {
-   namespace token {
-      // -- Underlying representation of a token class.
-      using base_type = uint32_t;
-
-      // -- 8-bit byte data type
-      using u8 = uint8_t;
-
-      constexpr base_type value(u8 c) { return c; }
-      constexpr base_type value(u8 hi, u8 lo) { return (hi << 8) | lo; }
-      constexpr base_type value(u8 hi, u8 mi, u8 lo) {
-         return (value(hi, mi) << 8) | lo;
-      }
-
-      // -- Type of literal strings of given number of characters.
-      template<int N>
-      using text_chunk = const char(&)[N+1];
-
-      // -- Return the token value of certain literal strings.
-      constexpr base_type value(text_chunk<0>) { return u8(); }
-      constexpr base_type value(text_chunk<1> s) {
-         return value(s[0]);
-      }
-      constexpr base_type value(text_chunk<2> s) {
-         return value(s[0], s[1]);
-      }
-      constexpr base_type value(text_chunk<3> s) {
-         return value(s[0], s[1], s[2]);
-      }
-
-      // -- Abstract values of tokens.
-      enum Value : base_type {
-         Unknown                = value(""),
-         Bar                    = value("|"),
-         Dot                    = value("."),
-         DotDot                 = value(".."),
-         Colon                  = value(":"),
-         ColonColon             = value("::"),
-         ColonDash              = value(":-"),
-         ColonEq                = value(":="),
-         At                     = value("@"),
-         Comma                  = value(","),
-         Semicolon              = value(";"),
-         Star                   = value("*"),
-         Plus                   = value("+"),
-         Minus                  = value("-"),
-         Slash                  = value("/"),
-         Backslash              = value("\\"),
-         SlashSlash             = value("//"),
-         BackslashBackslash     = value("\\\\"),
-         BackslashSlash         = value("\\/"),
-         SlashBackslash         = value("/\\"),
-         Less                   = value("<"),
-         LessEq                 = value("<="),
-         Greater                = value(">"),
-         GreaterEq              = value(">="),
-         Eq                     = value("="),
-         EqEq                   = value("=="),
-         Tilde                  = value("~"),
-         TildeEq                = value("~="),
-         Caret                  = value("^"),
-         Pound                  = value("#"),
-         Dollar                 = value("$"),
-         Ampersand              = value("&"),
-         OpenParen              = value("("),
-         CloseParen             = value(")"),
-         OpenBracket            = value("["),
-         CloseBracket           = value("]"),
-         OpenBrace              = value("{"),
-         CloseBrace             = value("}"),
-         OpenMetParen           = value("(|"),
-         CloseMetaParen         = value("|)"),
-         OpenMetaBracket        = value("[|"),
-         CloseMetaBracket       = value("|]"),
-         OpenMetaBrace          = value("{|"),
-         CloseMetaBrace         = value("|}"),
-         Apostrophe             = value("'"),
-         Backquote              = value("`"),
-         StarStar               = value("**"),
-         Implies                = value("=>"),
-         RightArrow             = value("->"),
-         LeftArrow              = value("<-"),
-         OpenChevron            = value("<<"),
-         CloseChevron           = value(">>"),
-         FatArrow               = value("==>"),
-         Equiv                  = value("<=>"),
-         MapsTo                 = value("+->"),
-
-         Add                    = value("add"),
-         And                    = value("and"),
-         By                     = value("by"),
-         Do                     = value("do"),
-         For                    = value("for"),
-         Has                    = value("has"),
-         If                     = value("if"),
-         In                     = value("in"),
-         Is                     = value("is"),
-         Mod                    = value("mod"),
-         Of                     = value("of"),    // -- Boot only
-         Or                     = value("or"),
-         Quo                    = value("quo"),
-         Rem                    = value("rem"),
-         Try                    = value("try"),
-         LastTrigraph           = 0xffffff,
-
-         Assume,             // "assume"
-         Break,              // "break"
-         Case,               // "case"
-         Catch,              // "catch"
-         Cross,              // "cross"
-         Else,               // "else"
-         Exists,             // "exists"
-         Finally,            // "finally"
-         From,               // "from"
-         Forall,             // "forall"
-         Function,           // "function" -- Boot only
-         Import,             // "import"
-         Inline,             // "inline"
-         Isnt,               // "isnt"
-         Iterate,            // "iterate"
-         Leave,              // "leave"
-         Macro,              // "macro"
-         Module,             // "module"   -- Boot only
-         Namespace,          // "namespace" -- Boot only
-         Pretend,            // "pretend"
-         Repeat,             // "repeat"
-         Return,             // "return"
-         Rule,               // "rule"
-         Structure,          // "structure" -- Boot only
-         Then,               // "then"
-         Throw,              // "throw"
-         Until,              // "until"
-         With,               // "with"
-         Where,              // "where"
-         While,              // "while"
-
-         IntegerLiteral,     // integer literal
-         StringLiteral,      // string literal
-         FPLiteral,          // floating point literal
-         Indent,             // new line indentation, greater than previous
-         Unindent,           // new line indentation, less than previous
-         Justify,            // align indentation with preceding line.
-      };
+   // Categorization of Boot and Spad tokens.
+   // The tokenizer records one of these values in the `category` field
+   // of the tokens it produces.  The enumeration is stored as a uint8_t.
+   enum class TokenCategory : uint8_t {
+      Unclassified,             // token of unknown class
+      Whitespace,               // sequence of white-space characters
+      Comment,                  // a description of an ignorable comment
+      Punctuator,               // a punctuator character
+      Operator,                 // an operator both symbolic and alphabetic
+      Integer,                  // an integer literal
+      FloatingPoint,            // a floating-point literal
+      String,                   // a string literal
+      Keyword,                  // a reserved word both symbolic and alphabetic
+      Identifier,               // an identifier
+      Formatting,               // a layout formatting token
+      Junk,                     // invalid/malformed token
+      EOS                       // end-of-token-stream indicator
+   };
+
+   std::ostream& operator<<(std::ostream&, TokenCategory);
+
+   // The abstract value associated with a token.
+   // The leading enumerators are generated: the macro defined below keeps
+   // only the first argument of every OPENAXIOM_DEFINE_TOKEN entry of the
+   // included file (presumably token-value.def), so each entry contributes
+   // one enumerator, in the order the entries appear.
+   // NOTE(review): the underlying type is uint8_t, so the generated
+   // enumerators plus the ones listed here must stay within 256 values.
+   enum class TokenValue : uint8_t {
+#undef OPENAXIOM_DEFINE_TOKEN
+#define OPENAXIOM_DEFINE_TOKEN(T, ...)  T,
+#include <open-axiom/token-value>
+#undef OPENAXIOM_DEFINE_TOKEN
+      Artificial,               // Tokens after this are artificial
+      Indent,                   // new line indentation, greater than previous
+      Unindent,                 // new line indentation, less than previous
+      Justify,                  // align indentation with preceding line.
+
+      EndOfStream               // end of token stream
+   };
+
+   std::ostream& operator<<(std::ostream&, TokenValue);
+
+   // Outcome of looking up the spelling of a symbolic or alphabetic
+   // token: the category it belongs to, paired with its abstract value.
+   struct TokenClassification {
+      TokenCategory category;
+      TokenValue value;
+
+      // A classification tests true exactly when its category is
+      // something other than TokenCategory::Unclassified.
+      explicit operator bool() const {
+         return not (category == TokenCategory::Unclassified);
+      }
+   };
+
+   TokenClassification classify(const std::string&);
+
+   // Datatypes for locating lines and columns.
+   using LineNumber = std::size_t;
+   using ColumnIndex = std::size_t;
+
+   // -- Exception types
+   // Thrown by string() when the line is exhausted before the closing
+   // quote of a string literal is found.  Carries the number of the line
+   // being scanned and the column the scan had reached.
+   struct EndOfStringUnseen {
+      LineNumber line;
+      ColumnIndex column;
+   };
+
+   // Thrown by number() when an exponent marker ('e' or 'E', optionally
+   // followed by a sign) is not followed by a digit.  Carries the number
+   // of the line being scanned and the column where a digit was expected.
+   struct MissingExponent {
+      LineNumber line;
+      ColumnIndex column;
+   };
+
+   // Objects of this datatype decompose a program fragment into a
+   // token stream.  The tokens are of type indicated by Tok.
+   //
+   // Requirements visible from the uses below: Frag provides size(),
+   // front(), back() and operator[]; each line it holds provides size(),
+   // `number` and `indent`.  Tok provides a nested `Location` type that
+   // can be built from a { line number, column } pair.
+   template<typename Frag, typename Tok>
+   struct TokenStream {
+      // Begin tokenizing `f` at the indentation of its first line, and
+      // seed the indentation stack with that column.  Only a reference
+      // to `f` is kept.  An empty fragment starts at column 0 instead of
+      // reading a first line that does not exist.
+      TokenStream(Frag& f)
+            : frag(f),
+              line(),
+              idx(frag.size() == 0 ? ColumnIndex() : frag.front().indent)
+      {
+         indents.push(idx);
+      }
+
+      // True when the current line index is past the last line, or when
+      // the last line is current and its characters are all consumed.
+      // An empty fragment is therefore at end of stream from the start.
+      bool eos() const {
+         return line >= frag.size()
+            or (line + 1 == frag.size() and idx >= frag.back().size());
+      }
+
+      // Return the next token; the argument selects the dialect and
+      // defaults to Spad.
+      Tok get(Language = Language::Spad);
+   private:
+      Frag& frag;                       // fragment being tokenized
+      std::size_t line;                 // index of the current line in frag
+      std::size_t idx;                  // column of the next character
+      std::stack<ColumnIndex> indents;  // indentation columns pushed so far
+
+      std::size_t line_length() const { return frag[line].size(); }
+
+      // Line number reported for positions beyond the last line: one
+      // more than the number of the last line.
+      // NOTE(review): an empty fragment has no last line to count from;
+      // a value-initialized LineNumber (0) is reported in that case --
+      // confirm that this suits the consumers of token locations.
+      LineNumber past_the_end_line_number() const {
+         return frag.size() == 0 ? LineNumber() : frag.back().number + 1;
+      }
+
+      // Number of the line following the current one.
+      LineNumber next_line_number() const {
+         return line + 1 < frag.size()
+            ? frag[line + 1].number
+            : past_the_end_line_number();
+      }
+
+      // Indentation of the line following the current one, or 0 when
+      // there is no such line.
+      ColumnIndex next_indentation() const {
+         return line + 1 < frag.size() ? frag[line + 1].indent : 0;
+      }
+
+      // Number of the current line.
+      LineNumber line_number() const {
+         return line < frag.size()
+            ? frag[line].number
+            : past_the_end_line_number();
+      }
+
+      // Current column, or 0 once the stream has moved past the last line.
+      ColumnIndex column_number() const {
+         return line < frag.size() ? idx : 0;
+      }
+
+      using Locus = typename Tok::Location;
+
+      // Current position as a token location.  Does not modify the
+      // stream, hence const.
+      Locus current_locus() const {
+         return { line_number(), column_number() };
+      }
+   };
+
+   bool separator_or_punctuator(uint8_t);
+
+   // Swallow a malformed token: move `idx` forward until it reaches the
+   // end of `line` or a character accepted by separator_or_punctuator(),
+   // then mark `t` as TokenCategory::Junk.
+   template<typename L, typename T>
+   static void junk(L& line, ColumnIndex& idx, T& t) {
+      for (; idx < line.size(); ++idx)
+         if (separator_or_punctuator(line[idx]))
+            break;
+      t.category = TokenCategory::Junk;
+   }
+
+   // Move `idx` forward past any white-space characters of `line`,
+   // stopping at the first non-space character or at the end of the line.
+   template<typename L>
+   inline void
+   skip_whitespace(L& line, ColumnIndex& idx) {
+      // isspace() is only defined for values representable as unsigned
+      // char (and EOF); convert first so that bytes with the high bit
+      // set are not passed as negative values when char is signed.
+      while (idx < line.size()
+             and isspace(static_cast<unsigned char>(line[idx])))
+         ++idx;
+   }
+
+   // Scan the remainder of a string literal on `line`, starting at `idx`,
+   // and leave `idx` just past the closing double quote.  An underscore
+   // toggles the escape state; a double quote closes the literal only
+   // when it is not escaped; any other character clears the escape state.
+   // On success `t` is marked as TokenCategory::String.
+   // Throws EndOfStringUnseen{ line.number, idx } if the line ends before
+   // a closing quote is seen.
+   template<typename L, typename T>
+   void string(L& line, ColumnIndex& idx, T& t) {
+      bool closed = false;
+      bool escaped = false;
+      while (not closed and idx < line.size()) {
+         const auto c = line[idx++];
+         if (c == '_')
+            escaped = not escaped;
+         else {
+            if (c == '"')
+               closed = not escaped;
+            escaped = false;
+         }
+      }
+      if (not closed)
+         throw EndOfStringUnseen{ line.number, idx };
+      t.category = TokenCategory::String;
+   }
+
+   // Move `idx` forward past a run of decimal digits in `line`, stopping
+   // at the first non-digit character or at the end of the line.
+   template<typename L>
+   void skip_to_end_of_integer(L& line, ColumnIndex& idx) {
+      // isdigit() is only defined for values representable as unsigned
+      // char (and EOF); convert first so that bytes with the high bit
+      // set are not passed as negative values when char is signed.
+      while (idx < line.size()
+             and isdigit(static_cast<unsigned char>(line[idx])))
+         ++idx;
+   }
+   
+   // Lex an integer literal: mark `t` as TokenCategory::Integer and move
+   // `idx` past the run of digits found at the current position.
+   template<typename L, typename T>
+   void integer(L& line, ColumnIndex& idx, T& t) {
+      t.category = TokenCategory::Integer;
+      skip_to_end_of_integer(line, idx);
+   }
+
+   // Lex a numeric literal at the current position and return `t`.
+   // The digits up to an optional '.' form an integer; `t` is marked as
+   // TokenCategory::Integer.  If the '.' is immediately followed by a
+   // digit, the fraction is consumed and `t` becomes
+   // TokenCategory::FloatingPoint; otherwise `idx` is left on the '.'.
+   // A fraction may be followed by an exponent: 'e' or 'E', an optional
+   // sign, and at least one digit.
+   // Throws MissingExponent{ line.number, idx } if the exponent marker
+   // (and optional sign) is not followed by a digit.
+   template<typename L, typename T>
+   T& number(L& line, ColumnIndex& idx, T& t) {
+      // isdigit() is only defined for values representable as unsigned
+      // char (and EOF); convert first so that bytes with the high bit
+      // set are not passed as negative values when char is signed.
+      auto digit_at = [&line](ColumnIndex i) {
+         return i < line.size()
+            and isdigit(static_cast<unsigned char>(line[i]));
+      };
+
+      integer(line, idx, t);
+      if (idx >= line.size() or line[idx] != '.')
+         return t;
+      // A '.' that is not followed by a digit does not belong to the
+      // number: stay on it and report the integer.
+      if (not digit_at(idx + 1))
+         return t;
+      ++idx;
+
+      t.category = TokenCategory::FloatingPoint;
+      skip_to_end_of_integer(line, idx);
+      if (idx >= line.size() or (line[idx] != 'e' and line[idx] != 'E'))
+         return t;
+      if (++idx < line.size() and (line[idx] == '+' or line[idx] == '-'))
+         ++idx;
+      if (not digit_at(idx))
+         throw MissingExponent{ line.number, idx };
+      skip_to_end_of_integer(line, idx);
+      return t;
+   }
+
+   // True if `c` may start an identifier: '%', '_', or any character
+   // for which isalpha() holds.
+   inline bool
+   identifier_head(uint8_t c) {
+      if (c == '%' or c == '_')
+         return true;
+      return isalpha(c) != 0;
+   }
+
+   // True if `c` may appear inside an identifier: a decimal digit or
+   // anything accepted by identifier_head().
+   inline bool
+   identifier_part(uint8_t c) {
+      return isdigit(c) or identifier_head(c);
+   }
+
+   // True if `c` is one of the characters allowed to trail an
+   // identifier: '!' or '?'.
+   inline bool
+   identifier_suffix(uint8_t c) {
+      switch (c) {
+      case '!':
+      case '?':
+         return true;
+      default:
+         return false;
+      }
+   }
+
+   // True if `c` is '%' or '$'.  identifier() skips a leading run of
+   // such a character when lexing the Boot dialect.
+   inline bool internal_prefix(uint8_t c) {
+      switch (c) {
+      case '$':
+      case '%':
+         return true;
+      default:
+         return false;
+      }
+   }
+
+   // Move `idx` forward past every consecutive occurrence of the
+   // character `c` in `line`, starting at the current position.
+   template<typename L>
+   inline void
+   skip_prefix(L& line, ColumnIndex& idx, uint8_t c) {
+      for (; idx < line.size(); ++idx)
+         if (line[idx] != c)
+            return;
+   }
+
+   // Lex an identifier, keyword or alphabetic operator and return `t`.
+   // On entry `idx` is one past the first character of the token.  On
+   // exit it is just past the token, including any trailing '!' or '?'.
+   // In the Boot dialect a leading run of '%' or '$' is skipped first.
+   // A character that is not an identifier part is still accepted when
+   // the character before it is '_'.  `t` is marked as an Identifier;
+   // if no '_' was seen and classify() recognizes the spelling, the
+   // category and value reported by classify() are used instead.
+   template<typename L, typename T>
+   T& identifier(L& line, ColumnIndex& idx, T& t, Language dialect) {
+      t.category = TokenCategory::Identifier;
+
+      ColumnIndex start = --idx; // idx was ahead by 1.
+      if (dialect == Language::Boot and internal_prefix(line[idx]))
+         skip_prefix(line, idx, line[idx]);
+      bool saw_escape = false;
+      while (idx < line.size()) {
+         // Only look at the preceding character when there is one;
+         // at column 0 nothing can have escaped the current character.
+         if (not identifier_part(line[idx])
+             and (idx == 0 or line[idx - 1] != '_'))
+            break;
+         if (line[idx] == '_')
+            saw_escape = true;
+         ++idx;
+      }
+      while (idx < line.size() and identifier_suffix(line[idx]))
+         ++idx;
+
+      // A spelling containing '_' always stays a plain identifier, so
+      // the keyword/operator lookup is only attempted without one.
+      if (not saw_escape) {
+         if (auto info = classify(line.sub_string(start, idx))) {
+            t.category = info.category;
+            t.value = info.value;
+         }
+      }
+      return t;
+   }
+
+   template<typename Frag, typename Tok>
+   Tok TokenStream<Frag, Tok>::get(Language dialect) {
+      Tok t { };
+      t.start = current_locus();
+      
+      if (eos()) {
+         t.category = TokenCategory::EOS;
+         t.end = current_locus();
+         return t;
+      }
+      else if (isspace(frag[line][idx])) {
+         skip_whitespace(frag[line], idx);
+         t.category = TokenCategory::Whitespace;
+         t.end = current_locus();
+         return t;
+      }
+      else if (idx == line_length() - 1 and frag[line].back() == '_') {
+         ++line;
+         idx = frag[line].indent;
+      }
+      else if (idx == line_length()) {
+         auto indent = indents.top();
+         auto next_indent = next_indentation();
+         t.start = t.end = { next_line_number(), next_indent };
+         if (indent < next_indent) {
+            indents.push(next_indent);
+            ++line;
+            idx = next_indent;
+            t.category = TokenCategory::Formatting;
+            t.value = TokenValue::Indent;
+         }
+         else if (indent > next_indent) {
+            indents.pop();
+            t.category = TokenCategory::Formatting;
+            t.value = TokenValue::Unindent;
+         }
+         else {
+            ++line;
+            idx = next_indent;
+            t.category = TokenCategory::Formatting;
+            t.value = TokenValue::Justify;
+         }
+         return t;
+      }
+
+      switch (auto c = frag[line][idx++]) {
+      case '#':
+         t.category = TokenCategory::Operator;
+         t.value = TokenValue::Pound;
+         break;
+
+      case '@':
+         t.category = TokenCategory::Operator;
+         t.value = TokenValue::At;
+         break;
+
+      case '^':
+         t.category = TokenCategory::Operator;
+         t.value = TokenValue::Caret;
+         break;
+
+      case '&':
+         t.category = TokenCategory::Punctuator;
+         t.value = TokenValue::Ampersand;
+         break;
+         
+      case '!':
+         t.category = TokenCategory::Punctuator;
+         t.value = TokenValue::Exclamation;
+         break;
+         
+      case '\'':
+         t.category = TokenCategory::Punctuator;
+         t.value = TokenValue::Apostrophe;
+         break;
+      case ',':
+         t.category = TokenCategory::Punctuator;
+         t.value = TokenValue::Comma;
+         break;
+         
+      case ';':
+         t.category = TokenCategory::Punctuator;
+         t.value = TokenValue::Semicolon;
+         break;
+         
+      case '`':
+         t.category = TokenCategory::Punctuator;
+         t.value = TokenValue::Backquote;
+         break;
+         
+      case '(':
+         t.category = TokenCategory::Punctuator;
+         t.value = TokenValue::OpenParen;
+         if (idx < line_length() and frag[line][idx] == '|') {
+            ++idx;
+            t.value = TokenValue::OpenMetaParen;
+         }
+         break;
+         
+      case ')':
+         t.category = TokenCategory::Punctuator;
+         t.value = TokenValue::CloseParen;
+         break;
+         
+      case '{':
+         t.category = TokenCategory::Punctuator;
+         t.value = TokenValue::OpenBrace;
+         if (idx < line_length() and frag[line][idx] == '|') {
+            ++idx;
+            t.value = TokenValue::OpenMetaBrace;
+         }
+         break;
+
+      case '}':
+         t.category = TokenCategory::Punctuator;
+         t.value = TokenValue::CloseBrace;
+         break;
+
+      case '[':
+         t.category = TokenCategory::Punctuator;
+         t.value = TokenValue::OpenBracket;
+         if (idx < line_length() and frag[line][idx] == '|') {
+            ++idx;
+            t.value = TokenValue::OpenMetaBracket;
+         }
+         break;
+         
+      case ']':
+         t.category = TokenCategory::Punctuator;
+         t.value = TokenValue::CloseBracket;
+         break;
+         
+      case ':':
+         t.category = TokenCategory::Operator;
+         t.value = TokenValue::Colon; 
+         if (idx < line_length())
+            switch (frag[line][idx]) {
+            case ':': t.value = TokenValue::ColonColon; ++idx; break;
+            case '=': t.value = TokenValue::ColonEq; ++idx; break;
+            case '-': t.value = TokenValue::ColonDash; ++idx; break;
+            default: break;
+            }
+         break;
+         
+      case '*':
+         t.category = TokenCategory::Operator;
+         t.value = TokenValue::Star;
+         if (idx < line_length() and frag[line][idx] == '*') {
+            t.value = TokenValue::StarStar;
+            ++idx;
+         }
+         break;
+         
+      case '/':
+         t.category = TokenCategory::Operator;
+         t.value = TokenValue::Slash;
+         if (idx < line_length())
+            switch (frag[line][idx]) {
+            case '/': t.value = TokenValue::SlashSlash; ++idx; break;
+            case '\\': t.value = TokenValue::SlashBackslash; ++idx; break;
+            default: break;
+            }
+         break;
+
+      case '\\':
+         t.category = TokenCategory::Operator;
+         t.value = TokenValue::Backslash;
+         if (idx < line_length())
+            switch (frag[line][idx]) {
+            case '\\': t.value = TokenValue::BackslashBackslash; ++idx; break;
+            case '/': t.value = TokenValue::BackslashSlash; ++idx; break;
+            default: break;
+            }
+         break;
+         
+      case '<':
+         t.category = TokenCategory::Operator;
+         t.value = TokenValue::Less;
+         if (idx < line_length())
+            switch (frag[line][idx]) {
+            case '-': t.value = TokenValue::LeftArrow; ++idx; break;
+            case '<': t.value = TokenValue::OpenChevron; ++idx; break;
+            case '=':
+               t.value = TokenValue::LessEq;
+               if (++idx < line_length() and frag[line][idx] == '>') {
+                  t.value = TokenValue::Equiv;
+                  ++idx;
+               }
+               break;
+            default: break;
+            }
+         break;
+         
+      case '=':
+         t.category = TokenCategory::Operator;
+         t.value = TokenValue::Eq;
+         if (idx < line_length())
+            switch (frag[line][idx]) {
+            case '>': t.value = TokenValue::Implies; ++idx; break;
+            case '=':
+               t.value = TokenValue::EqEq;
+               if (++idx < line_length() and frag[line][idx] == '>') {
+                  t.value = TokenValue::FatArrow;
+                  ++idx;
+               }
+               break;
+            default: break;
+            }
+         break;
+         
+      case '~':
+         t.category = TokenCategory::Operator;
+         t.value = TokenValue::Tilde;
+         if (idx < line_length() and frag[line][idx] == '=') {
+            t.value = TokenValue::TildeEq;
+            ++idx;
+         }
+         break;
+
+      case '>':
+         t.category = TokenCategory::Operator;
+         t.value = TokenValue::Greater;
+         if (idx < line_length())
+            switch (frag[line][idx]) {
+            case '=': t.value = TokenValue::GreaterEq; ++idx; break;
+            case '>': t.value = TokenValue::CloseChevron; ++idx; break;
+            }
+         break;
+
+      case '|':
+         t.category = TokenCategory::Operator;
+         t.value = TokenValue::Bar;
+         if (idx < line_length())
+            switch (frag[line][idx]) {
+            case ']': t.value = TokenValue::CloseMetaBracket; ++idx; break;
+            case '}': t.value = TokenValue::CloseMetaBrace; ++idx; break;
+            case ')': t.value = TokenValue::CloseMetaParen; ++idx; break;
+            default: break;
+            }
+         break;
+
+      case '-':
+         t.category = TokenCategory::Operator;
+         t.value = TokenValue::Minus; 
+         if (idx < line_length())
+            switch (frag[line][idx]) {
+            case '>': t.value = TokenValue::RightArrow; ++idx; break;
+            case '-':
+               t.category = TokenCategory::Comment;
+               t.value = TokenValue::Wisecrack;
+               idx = frag[line].size();
+               break;
+            }
+         break;
+
+      case '+':
+         t.category = TokenCategory::Operator;
+         t.value = TokenValue::Plus;
+         if (idx < line_length())
+            switch (frag[line][idx]) {
+            case '+':
+               t.category = TokenCategory::Comment;
+               t.value = TokenValue::Commentary;
+               idx = frag[line].size();
+               break;
+            case '-':
+               if (idx + 1 < line_length() and frag[line][idx+1] == '>') {
+                  t.value = TokenValue::MapsTo;
+                  idx += 2;
+               }
+               break;
+            default: break;
+            }
+         break;
+
+      case '.':
+         t.category = TokenCategory::Punctuator;
+         t.value = TokenValue::Dot;
+         if (idx < line_length() and frag[line][idx] == '.') {
+            t.category = TokenCategory::Operator;
+            t.value = TokenValue::DotDot;
+            ++idx;
+         }
+         break;
+
+      case '"':
+         string(frag[line], idx, t);
+         break;
+
+      case '$':
+         if (dialect != Language::Boot or idx >= line_length()
+             or separator_or_punctuator(frag[line][idx])) {
+            t.category = TokenCategory::Operator;
+            t.value = TokenValue::Dollar;
+         }
+         else
+            identifier(frag[line], idx, t, dialect);
+         break;
+
+      default:
+         if (isdigit(c))
+            number(frag[line], idx, t);
+         else if (identifier_head(c))
+            identifier(frag[line], idx, t, dialect);
+         else
+            junk(frag[line], idx, t);
+         break;
+      }
+
+      t.end = { frag[line].number, idx };
+      return t;
    }
 }
author	dos-reis <gdr@axiomatics.org>	2014-08-26 10:07:17 +0000
committer	dos-reis <gdr@axiomatics.org>	2014-08-26 10:07:17 +0000
commit	ef059f3f675f384c68c15076dbcf220be1e01eee (patch)
tree	08124f18e4f7a3044b719ae860e3b492ed704287 /src/include
parent	cfffc75b762f4364623f85a887b7e564421e3127 (diff)
download	open-axiom-ef059f3f675f384c68c15076dbcf220be1e01eee.tar.gz