aboutsummaryrefslogtreecommitdiff
path: root/src/include
diff options
context:
space:
mode:
authordos-reis <gdr@axiomatics.org>2014-08-26 10:07:17 +0000
committerdos-reis <gdr@axiomatics.org>2014-08-26 10:07:17 +0000
commitef059f3f675f384c68c15076dbcf220be1e01eee (patch)
tree08124f18e4f7a3044b719ae860e3b492ed704287 /src/include
parentcfffc75b762f4364623f85a887b7e564421e3127 (diff)
downloadopen-axiom-ef059f3f675f384c68c15076dbcf220be1e01eee.tar.gz
Add generic Boot and Spad tokenizer.
Diffstat (limited to 'src/include')
-rw-r--r--src/include/dialect.H8
-rw-r--r--src/include/sexpr.H45
-rw-r--r--src/include/token-value.def138
-rw-r--r--src/include/token.H691
4 files changed, 714 insertions, 168 deletions
diff --git a/src/include/dialect.H b/src/include/dialect.H
index f63eac04..bcfddd04 100644
--- a/src/include/dialect.H
+++ b/src/include/dialect.H
@@ -1,4 +1,4 @@
-// Copyright (C) 2013, Gabriel Dos Reis.
+// Copyright (C) 2013-2014, Gabriel Dos Reis.
// All rights reserved.
// Written by Gabriel Dos Reis.
//
@@ -36,7 +36,11 @@
namespace OpenAxiom {
// Languages for which we have parsers.
enum class Language {
- Spad, Boot, Lisp
+ Spad = 0x1,
+ Boot = 0x2,
+ Lisp = 0x4,
+ BootSpad = Spad | Boot,
+ All = Spad | Boot | Lisp,
};
}
diff --git a/src/include/sexpr.H b/src/include/sexpr.H
index d425b6d8..84513a8b 100644
--- a/src/include/sexpr.H
+++ b/src/include/sexpr.H
@@ -1,4 +1,4 @@
-// Copyright (C) 2010-2013, Gabriel Dos Reis.
+// Copyright (C) 2010-2014, Gabriel Dos Reis.
// All rights reserved.
// Written by Gabriel Dos Reis.
//
@@ -55,28 +55,27 @@ namespace OpenAxiom {
struct Lexeme {
enum Type {
unknown, // unidentified token
- semicolon = token::value(";"), // comment
- dot = token::value("."),
- comma = token::value(","),
- open_paren = token::value("("),
- close_paren = token::value(")"),
- apostrophe = token::value("'"),
- backquote = token::value("`"),
- backslash = token::value("\\"),
- sharp_open_paren = token::value("#("),
- sharp_apostrophe = token::value("#'"),
- sharp_colon = token::value("#:"),
- sharp_plus = token::value("#+"),
- sharp_minus = token::value("#-"),
- sharp_dot = token::value("#."),
- comma_at = token::value(",@"),
- digraph_end = token::value(0xff,0xff),
- integer, // integer literal
- character, // character literal
- string, // string literal
- identifier, // plain identifier
- sharp_integer_equal, // anchor definition, #n=<form>
- sharp_integer_sharp // back reference, #n#
+ semicolon, // ";" for comment
+ dot, // "."
+ comma, // ","
+ open_paren, // "("
+ close_paren, // ")"
+ apostrophe, // "'"
+ backquote, // "`"
+ backslash, // "\\"
+ sharp_open_paren , // "#("
+ sharp_apostrophe, // "#'"
+ sharp_colon, // "#:"
+ sharp_plus, // "#+"
+ sharp_minus, // "#-"
+ sharp_dot, // "#."
+ comma_at, // ",@"
+ integer, // integer literal
+ character, // character literal
+ string, // string literal
+ identifier, // plain identifier
+ sharp_integer_equal, // anchor definition, #n=<form>
+ sharp_integer_sharp // back reference, #n#
};
std::pair<const Byte*, const Byte*> boundary;
diff --git a/src/include/token-value.def b/src/include/token-value.def
new file mode 100644
index 00000000..ea79c9a5
--- /dev/null
+++ b/src/include/token-value.def
@@ -0,0 +1,138 @@
+// Copyright (C) 2014, Gabriel Dos Reis.
+// All rights reserved.
+// Written by Gabriel Dos Reis.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// - Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in
+// the documentation and/or other materials provided with the
+// distribution.
+//
+// - Neither the name of OpenAxiom. nor the names of its contributors
+// may be used to endorse or promote products derived from this
+// software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+// OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+
+OPENAXIOM_DEFINE_TOKEN(Unknown, "<unknown>", Unclassified, Language::All)
+OPENAXIOM_DEFINE_TOKEN(Apostrophe, "'", Punctuator, Language::All)
+OPENAXIOM_DEFINE_TOKEN(Backquote, "`", Punctuator, Language::All)
+OPENAXIOM_DEFINE_TOKEN(Bar, "|", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Dot, ".", Punctuator, Language::All)
+OPENAXIOM_DEFINE_TOKEN(DotDot, "..", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Colon, ":", Punctuator, Language::All)
+OPENAXIOM_DEFINE_TOKEN(ColonColon, "::", Operator, Language::All)
+OPENAXIOM_DEFINE_TOKEN(ColonDash, ":-", Operator, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(ColonEq, ":=", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(At, "@", Operator, Language::All)
+OPENAXIOM_DEFINE_TOKEN(Exclamation, "!", Punctuator, Language::Boot)
+OPENAXIOM_DEFINE_TOKEN(Comma, ",", Punctuator, Language::All)
+OPENAXIOM_DEFINE_TOKEN(Semicolon, ";", Punctuator, Language::All)
+OPENAXIOM_DEFINE_TOKEN(Star, "*", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(StarStar, "**", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Plus, "+", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Minus, "-", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Slash, "/", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(SlashSlash, "//", Operator, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(SlashBackslash, "/\\", Operator, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(Backslash, "\\", Operator, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(BackslashSlash, "\\/", Operator, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(BackslashBackslash, "\\\\", Operator, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(Less, "<", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(LessEq, "<=", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Greater, ">", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(GreaterEq, ">=", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Eq, "=", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(EqEq, "==", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Tilde, "~", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(TildeEq, "~=", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Caret, "^", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Pound, "#", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Dollar, "$", Operator, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(Ampersand, "&", Operator, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(RightArrow, "->", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(LeftArrow, "<-", Operator, Language::Boot)
+OPENAXIOM_DEFINE_TOKEN(Implies, "=>", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Equiv, "<=>", Keyword, Language::Boot)
+OPENAXIOM_DEFINE_TOKEN(MapsTo, "+->", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(FatArrow, "==>", Keyword, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(OpenParen, "(", Punctuator, Language::All)
+OPENAXIOM_DEFINE_TOKEN(CloseParen, ")", Punctuator, Language::All)
+OPENAXIOM_DEFINE_TOKEN(OpenMetaParen, "(|", Punctuator, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(CloseMetaParen, "|)", Punctuator, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(OpenBracket, "[", Punctuator, Language::All)
+OPENAXIOM_DEFINE_TOKEN(CloseBracket, "]", Punctuator, Language::All)
+OPENAXIOM_DEFINE_TOKEN(OpenMetaBracket, "[|", Punctuator, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(CloseMetaBracket, "|]", Punctuator, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(OpenBrace, "{", Punctuator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(CloseBrace, "}", Punctuator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(OpenMetaBrace, "{|", Punctuator, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(CloseMetaBrace, "|}", Punctuator, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(OpenChevron, "<<", Operator, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(CloseChevron, ">>", Operator, Language::Spad)
+
+OPENAXIOM_DEFINE_TOKEN(Wisecrack, "--", Comment, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Commentary, "++", Comment, Language::BootSpad)
+
+OPENAXIOM_DEFINE_TOKEN(Add, "add", Keyword, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(And, "and", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Assume, "assume", Keyword, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(Break, "break", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(By, "by", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Case, "case", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Catch, "catch", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Cross, "cross", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Do, "do", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Else, "else", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Exists, "exists", Keyword, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(Finally, "finally", Keyword, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(For, "for", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Forall, "forall", Keyword, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(From, "from", Keyword, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(Function, "function", Keyword, Language::Boot)
+OPENAXIOM_DEFINE_TOKEN(Has, "has", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(If, "if", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Import, "import", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(In, "in", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Inline, "inline", Keyword, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(Is, "is", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Isnt, "isnt", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Iterate, "iterate", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Leave, "leave", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Macro, "macro", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Mod, "mod", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Namespace, "namespace", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Of, "of", Keyword, Language::Boot)
+OPENAXIOM_DEFINE_TOKEN(Or, "or", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Pretend, "pretend", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Quo, "quo", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Rem, "rem", Operator, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Repeat, "repeat", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Return, "return", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Rule, "rule", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Structure, "structure", Keyword, Language::Boot)
+OPENAXIOM_DEFINE_TOKEN(Then, "then", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(Throw, "throw", Keyword, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(Try, "try", Keyword, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(Until, "until", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(With, "with", Keyword, Language::Spad)
+OPENAXIOM_DEFINE_TOKEN(Where, "where", Keyword, Language::BootSpad)
+OPENAXIOM_DEFINE_TOKEN(While, "while", Keyword, Language::BootSpad)
diff --git a/src/include/token.H b/src/include/token.H
index ef203b12..3b3b2950 100644
--- a/src/include/token.H
+++ b/src/include/token.H
@@ -1,4 +1,4 @@
-// Copyright (C) 2013, Gabriel Dos Reis.
+// Copyright (C) 2013-2014, Gabriel Dos Reis.
// All rights reserved.
// Written by Gabriel Dos Reis.
//
@@ -34,151 +34,556 @@
#define OPENAXIOM_TOKEN_included
#include <stdint.h>
+#include <stack>
+#include <iosfwd>
#include <open-axiom/Input>
+#include <open-axiom/dialect>
namespace OpenAxiom {
- namespace token {
- // -- Underlying representation of a token class.
- using base_type = uint32_t;
-
- // -- 8-bit byte data type
- using u8 = uint8_t;
-
- constexpr base_type value(u8 c) { return c; }
- constexpr base_type value(u8 hi, u8 lo) { return (hi << 8) | lo; }
- constexpr base_type value(u8 hi, u8 mi, u8 lo) {
- return (value(hi, mi) << 8) | lo;
- }
-
- // -- Type of literal strings of given number of characters.
- template<int N>
- using text_chunk = const char(&)[N+1];
-
- // -- Return the token value of certain literal strings.
- constexpr base_type value(text_chunk<0>) { return u8(); }
- constexpr base_type value(text_chunk<1> s) {
- return value(s[0]);
- }
- constexpr base_type value(text_chunk<2> s) {
- return value(s[0], s[1]);
- }
- constexpr base_type value(text_chunk<3> s) {
- return value(s[0], s[1], s[2]);
- }
-
- // -- Abstract values of tokens.
- enum Value : base_type {
- Unknown = value(""),
- Bar = value("|"),
- Dot = value("."),
- DotDot = value(".."),
- Colon = value(":"),
- ColonColon = value("::"),
- ColonDash = value(":-"),
- ColonEq = value(":="),
- At = value("@"),
- Comma = value(","),
- Semicolon = value(";"),
- Star = value("*"),
- Plus = value("+"),
- Minus = value("-"),
- Slash = value("/"),
- Backslash = value("\\"),
- SlashSlash = value("//"),
- BackslashBackslash = value("\\\\"),
- BackslashSlash = value("\\/"),
- SlashBackslash = value("/\\"),
- Less = value("<"),
- LessEq = value("<="),
- Greater = value(">"),
- GreaterEq = value(">="),
- Eq = value("="),
- EqEq = value("=="),
- Tilde = value("~"),
- TildeEq = value("~="),
- Caret = value("^"),
- Pound = value("#"),
- Dollar = value("$"),
- Ampersand = value("&"),
- OpenParen = value("("),
- CloseParen = value(")"),
- OpenBracket = value("["),
- CloseBracket = value("]"),
- OpenBrace = value("{"),
- CloseBrace = value("}"),
- OpenMetParen = value("(|"),
- CloseMetaParen = value("|)"),
- OpenMetaBracket = value("[|"),
- CloseMetaBracket = value("|]"),
- OpenMetaBrace = value("{|"),
- CloseMetaBrace = value("|}"),
- Apostrophe = value("'"),
- Backquote = value("`"),
- StarStar = value("**"),
- Implies = value("=>"),
- RightArrow = value("->"),
- LeftArrow = value("<-"),
- OpenChevron = value("<<"),
- CloseChevron = value(">>"),
- FatArrow = value("==>"),
- Equiv = value("<=>"),
- MapsTo = value("+->"),
-
- Add = value("add"),
- And = value("and"),
- By = value("by"),
- Do = value("do"),
- For = value("for"),
- Has = value("has"),
- If = value("if"),
- In = value("in"),
- Is = value("is"),
- Mod = value("mod"),
- Of = value("of"), // -- Boot only
- Or = value("or"),
- Quo = value("quo"),
- Rem = value("rem"),
- Try = value("try"),
- LastTrigraph = 0xffffff,
-
- Assume, // "assume"
- Break, // "break"
- Case, // "case"
- Catch, // "catch"
- Cross, // "cross"
- Else, // "else"
- Exists, // "exists"
- Finally, // "finally"
- From, // "from"
- Forall, // "forall"
- Function, // "function" -- Boot only
- Import, // "import"
- Inline, // "inline"
- Isnt, // "isnt"
- Iterate, // "iterate"
- Leave, // "leave"
- Macro, // "macro"
- Module, // "module" -- Boot only
- Namespace, // "namespace" -- Boot only
- Pretend, // "pretend"
- Repeat, // "repeat"
- Return, // "return"
- Rule, // "rule"
- Structure, // "structure" -- Boot only
- Then, // "then"
- Throw, // "throw"
- Until, // "until"
- With, // "with"
- Where, // "where"
- While, // "while"
-
- IntegerLiteral, // integer literal
- StringLiteral, // string literal
- FPLiteral, // floating point literal
- Indent, // new line indentation, greater than previous
- Unindent, // new line indentation, less than previous
- Justify, // align indentation with preceding line.
- };
+ // Categorization of Boot and Spad tokens.
+ enum class TokenCategory : uint8_t {
+ Unclassified, // token of unknown class
+ Whitespace, // sequence of white-space characters
+ Comment, // a description of an ignorable comment
+ Punctuator, // a punctuator character
+ Operator, // an operator both symbolic and alphabetic
+ Integer, // an integer literal
+ FloatingPoint, // a floating-point literal
+ String, // a string literal
+ Keyword, // a reserved word both symbolic and alphabetic
+ Identifier, // an identifier
+ Formatting, // a layout formatting token
+ Junk, // invalid/malformed token
+ EOS // end-of-token-stream indicator
+ };
+
+ std::ostream& operator<<(std::ostream&, TokenCategory);
+
+ // The abstract value associated with a token.
+ enum class TokenValue : uint8_t {
+#undef OPENAXIOM_DEFINE_TOKEN
+#define OPENAXIOM_DEFINE_TOKEN(T, ...) T,
+#include <open-axiom/token-value>
+#undef OPENAXIOM_DEFINE_TOKEN
+ Artificial, // Tokens after this are artificial
+ Indent, // new line indentation, greater than previous
+ Unindent, // new line indentation, less than previous
+ Justify, // align indentation with preceding line.
+
+ EndOfStream // end of token stream
+ };
+
+ std::ostream& operator<<(std::ostream&, TokenValue);
+
+ // Given a symbolic or alphabetic token, retrieve its category
+ // and associated abstract value.
+ struct TokenClassification {
+ TokenCategory category;
+ TokenValue value;
+
+ explicit operator bool() const {
+ return category != TokenCategory::Unclassified;
+ }
+ };
+
+ TokenClassification classify(const std::string&);
+
+ // Datatypes for locating lines and columns.
+ using LineNumber = std::size_t;
+ using ColumnIndex = std::size_t;
+
+ // -- Exception types
+ struct EndOfStringUnseen {
+ LineNumber line;
+ ColumnIndex column;
+ };
+
+ struct MissingExponent {
+ LineNumber line;
+ ColumnIndex column;
+ };
+
+ // Object of this datatype decompose a program fragment into a
+ // token stream. The tokens are of type indicated by Tok.
+ template<typename Frag, typename Tok>
+ struct TokenStream {
+ TokenStream(Frag& f)
+ : frag(f),
+ line(),
+ idx(frag.front().indent)
+ {
+ indents.push(idx);
+ }
+
+ bool eos() const {
+ return line >= frag.size()
+ or (line + 1 == frag.size() and idx >= frag.back().size());
+ }
+
+ Tok get(Language = Language::Spad);
+ private:
+ Frag& frag;
+ std::size_t line;
+ std::size_t idx;
+ std::stack<ColumnIndex> indents;
+
+ std::size_t line_length() const { return frag[line].size(); }
+ LineNumber next_line_number() const {
+ return line + 1 < frag.size()
+ ? frag[line + 1].number
+ : frag.back().number + 1;
+ }
+ ColumnIndex next_indentation() const {
+ return line + 1 < frag.size() ? frag[line + 1].indent : 0;
+ }
+
+ LineNumber line_number() const {
+ return line < frag.size()
+ ? frag[line].number
+ : frag.back().number + 1;
+ }
+
+ ColumnIndex column_number() const {
+ return line < frag.size() ? idx : 0;
+ }
+
+ using Locus = typename Tok::Location;
+ Locus current_locus() {
+ return { line_number(), column_number() };
+ }
+ };
+
+ bool separator_or_punctuator(uint8_t);
+
+ template<typename L, typename T>
+ static void junk(L& line, ColumnIndex& idx, T& t) {
+ while (idx < line.size() and not separator_or_punctuator(line[idx]))
+ ++idx;
+ t.category = TokenCategory::Junk;
+ }
+
+ template<typename L>
+ inline void
+ skip_whitespace(L& line, ColumnIndex& idx) {
+ while (idx < line.size() and isspace(line[idx]))
+ ++idx;
+ }
+
+ template<typename L, typename T>
+ void string(L& line, ColumnIndex& idx, T& t) {
+ bool done = false;
+ bool escape = false;
+ while (idx < line.size() && not done) {
+ switch (line[idx++]) {
+ case '_': escape = !escape; break;
+ case '"': done = !escape;
+ // fallthrough
+ default: escape = false; break;
+ }
+ }
+ if (not done)
+ throw EndOfStringUnseen{ line.number, idx };
+ t.category = TokenCategory::String;
+ }
+
+ template<typename L>
+ void skip_to_end_of_integer(L& line, ColumnIndex& idx) {
+ while (idx < line.size() and isdigit(line[idx]))
+ ++idx;
+ }
+
+ template<typename L, typename T>
+ void integer(L& line, ColumnIndex& idx, T& t) {
+ skip_to_end_of_integer(line, idx);
+ t.category = TokenCategory::Integer;
+ }
+
+ template<typename L, typename T>
+ T& number(L& line, ColumnIndex& idx, T& t) {
+ integer(line, idx, t);
+ if (idx >= line.size() or line[idx] != '.')
+ return t;
+ if (++idx >= line.size() or not isdigit(line[idx])) {
+ --idx;
+ return t;
+ }
+
+ t.category = TokenCategory::FloatingPoint;
+ skip_to_end_of_integer(line, idx);
+ if (idx >= line.size() or (line[idx] != 'e' and line[idx] != 'E'))
+ return t;
+ if (++idx < line.size() and (line[idx] == '+' or line[idx] == '-'))
+ ++idx;
+ if (idx >= line.size() or not isdigit(line[idx]))
+ throw MissingExponent{ line.number, idx };
+ skip_to_end_of_integer(line, idx);
+ return t;
+ }
+
+ inline bool
+ identifier_head(uint8_t c) {
+ return isalpha(c) or c == '%' or c == '_';
+ }
+
+ inline bool
+ identifier_part(uint8_t c) {
+ return identifier_head(c) or isdigit(c);
+ }
+
+ inline bool
+ identifier_suffix(uint8_t c) {
+ return c == '!' or c == '?';
+ }
+
+ inline bool internal_prefix(uint8_t c) {
+ return c == '%' or c == '$';
+ }
+
+ template<typename L>
+ inline void
+ skip_prefix(L& line, ColumnIndex& idx, uint8_t c) {
+ while (idx < line.size() and line[idx] == c)
+ ++idx;
+ }
+
+ template<typename L, typename T>
+ T& identifier(L& line, ColumnIndex& idx, T& t, Language dialect) {
+ t.category = TokenCategory::Identifier;
+
+ ColumnIndex start = --idx; // idx was ahead by 1.
+ if (dialect == Language::Boot and internal_prefix(line[idx]))
+ skip_prefix(line, idx, line[idx]);
+ bool saw_escape = false;
+ while (idx < line.size()) {
+ if (not identifier_part(line[idx]) and line[idx - 1] != '_')
+ break;
+ else if (line[idx] == '_')
+ saw_escape = true;
+ ++idx;
+ }
+ while (idx < line.size() and identifier_suffix(line[idx]))
+ ++idx;
+
+ if (saw_escape)
+ t.category = TokenCategory::Identifier;
+ else if (auto info = classify(line.sub_string(start, idx))) {
+ t.category = info.category;
+ t.value = info.value;
+ }
+ return t;
+ }
+
+ template<typename Frag, typename Tok>
+ Tok TokenStream<Frag, Tok>::get(Language dialect) {
+ Tok t { };
+ t.start = current_locus();
+
+ if (eos()) {
+ t.category = TokenCategory::EOS;
+ t.end = current_locus();
+ return t;
+ }
+ else if (isspace(frag[line][idx])) {
+ skip_whitespace(frag[line], idx);
+ t.category = TokenCategory::Whitespace;
+ t.end = current_locus();
+ return t;
+ }
+ else if (idx == line_length() - 1 and frag[line].back() == '_') {
+ ++line;
+ idx = frag[line].indent;
+ }
+ else if (idx == line_length()) {
+ auto indent = indents.top();
+ auto next_indent = next_indentation();
+ t.start = t.end = { next_line_number(), next_indent };
+ if (indent < next_indent) {
+ indents.push(next_indent);
+ ++line;
+ idx = next_indent;
+ t.category = TokenCategory::Formatting;
+ t.value = TokenValue::Indent;
+ }
+ else if (indent > next_indent) {
+ indents.pop();
+ t.category = TokenCategory::Formatting;
+ t.value = TokenValue::Unindent;
+ }
+ else {
+ ++line;
+ idx = next_indent;
+ t.category = TokenCategory::Formatting;
+ t.value = TokenValue::Justify;
+ }
+ return t;
+ }
+
+ switch (auto c = frag[line][idx++]) {
+ case '#':
+ t.category = TokenCategory::Operator;
+ t.value = TokenValue::Pound;
+ break;
+
+ case '@':
+ t.category = TokenCategory::Operator;
+ t.value = TokenValue::At;
+ break;
+
+ case '^':
+ t.category = TokenCategory::Operator;
+ t.value = TokenValue::Caret;
+ break;
+
+ case '&':
+ t.category = TokenCategory::Punctuator;
+ t.value = TokenValue::Ampersand;
+ break;
+
+ case '!':
+ t.category = TokenCategory::Punctuator;
+ t.value = TokenValue::Exclamation;
+ break;
+
+ case '\'':
+ t.category = TokenCategory::Punctuator;
+ t.value = TokenValue::Apostrophe;
+ break;
+ case ',':
+ t.category = TokenCategory::Punctuator;
+ t.value = TokenValue::Comma;
+ break;
+
+ case ';':
+ t.category = TokenCategory::Punctuator;
+ t.value = TokenValue::Semicolon;
+ break;
+
+ case '`':
+ t.category = TokenCategory::Punctuator;
+ t.value = TokenValue::Backquote;
+ break;
+
+ case '(':
+ t.category = TokenCategory::Punctuator;
+ t.value = TokenValue::OpenParen;
+ if (idx < line_length() and frag[line][idx] == '|') {
+ ++idx;
+ t.value = TokenValue::OpenMetaParen;
+ }
+ break;
+
+ case ')':
+ t.category = TokenCategory::Punctuator;
+ t.value = TokenValue::CloseParen;
+ break;
+
+ case '{':
+ t.category = TokenCategory::Punctuator;
+ t.value = TokenValue::OpenBrace;
+ if (idx < line_length() and frag[line][idx] == '|') {
+ ++idx;
+ t.value = TokenValue::OpenMetaBrace;
+ }
+ break;
+
+ case '}':
+ t.category = TokenCategory::Punctuator;
+ t.value = TokenValue::CloseBrace;
+ break;
+
+ case '[':
+ t.category = TokenCategory::Punctuator;
+ t.value = TokenValue::OpenBracket;
+ if (idx < line_length() and frag[line][idx] == '|') {
+ ++idx;
+ t.value = TokenValue::OpenMetaBracket;
+ }
+ break;
+
+ case ']':
+ t.category = TokenCategory::Punctuator;
+ t.value = TokenValue::CloseBracket;
+ break;
+
+ case ':':
+ t.category = TokenCategory::Operator;
+ t.value = TokenValue::Colon;
+ if (idx < line_length())
+ switch (frag[line][idx]) {
+ case ':': t.value = TokenValue::ColonColon; ++idx; break;
+ case '=': t.value = TokenValue::ColonEq; ++idx; break;
+ case '-': t.value = TokenValue::ColonDash; ++idx; break;
+ default: break;
+ }
+ break;
+
+ case '*':
+ t.category = TokenCategory::Operator;
+ t.value = TokenValue::Star;
+ if (idx < line_length() and frag[line][idx] == '*') {
+ t.value = TokenValue::StarStar;
+ ++idx;
+ }
+ break;
+
+ case '/':
+ t.category = TokenCategory::Operator;
+ t.value = TokenValue::Slash;
+ if (idx < line_length())
+ switch (frag[line][idx]) {
+ case '/': t.value = TokenValue::SlashSlash; ++idx; break;
+ case '\\': t.value = TokenValue::SlashBackslash; ++idx; break;
+ default: break;
+ }
+ break;
+
+ case '\\':
+ t.category = TokenCategory::Operator;
+ t.value = TokenValue::Backslash;
+ if (idx < line_length())
+ switch (frag[line][idx]) {
+ case '\\': t.value = TokenValue::BackslashBackslash; ++idx; break;
+ case '/': t.value = TokenValue::BackslashSlash; ++idx; break;
+ default: break;
+ }
+ break;
+
+ case '<':
+ t.category = TokenCategory::Operator;
+ t.value = TokenValue::Less;
+ if (idx < line_length())
+ switch (frag[line][idx]) {
+ case '-': t.value = TokenValue::LeftArrow; ++idx; break;
+ case '<': t.value = TokenValue::OpenChevron; ++idx; break;
+ case '=':
+ t.value = TokenValue::LessEq;
+ if (++idx < line_length() and frag[line][idx] == '>') {
+ t.value = TokenValue::Equiv;
+ ++idx;
+ }
+ break;
+ default: break;
+ }
+ break;
+
+ case '=':
+ t.category = TokenCategory::Operator;
+ t.value = TokenValue::Eq;
+ if (idx < line_length())
+ switch (frag[line][idx]) {
+ case '>': t.value = TokenValue::Implies; ++idx; break;
+ case '=':
+ t.value = TokenValue::EqEq;
+ if (++idx < line_length() and frag[line][idx] == '>') {
+ t.value = TokenValue::FatArrow;
+ ++idx;
+ }
+ break;
+ default: break;
+ }
+ break;
+
+ case '~':
+ t.category = TokenCategory::Operator;
+ t.value = TokenValue::Tilde;
+ if (idx < line_length() and frag[line][idx] == '=') {
+ t.value = TokenValue::TildeEq;
+ ++idx;
+ }
+ break;
+
+ case '>':
+ t.category = TokenCategory::Operator;
+ t.value = TokenValue::Greater;
+ if (idx < line_length())
+ switch (frag[line][idx]) {
+ case '=': t.value = TokenValue::GreaterEq; ++idx; break;
+ case '>': t.value = TokenValue::CloseChevron; ++idx; break;
+ }
+ break;
+
+ case '|':
+ t.category = TokenCategory::Operator;
+ t.value = TokenValue::Bar;
+ if (idx < line_length())
+ switch (frag[line][idx]) {
+ case ']': t.value = TokenValue::CloseMetaBracket; ++idx; break;
+ case '}': t.value = TokenValue::CloseMetaBrace; ++idx; break;
+ case ')': t.value = TokenValue::CloseMetaParen; ++idx; break;
+ default: break;
+ }
+ break;
+
+ case '-':
+ t.category = TokenCategory::Operator;
+ t.value = TokenValue::Minus;
+ if (idx < line_length())
+ switch (frag[line][idx]) {
+ case '>': t.value = TokenValue::RightArrow; ++idx; break;
+ case '-':
+ t.category = TokenCategory::Comment;
+ t.value = TokenValue::Wisecrack;
+ idx = frag[line].size();
+ break;
+ }
+ break;
+
+ case '+':
+ t.category = TokenCategory::Operator;
+ t.value = TokenValue::Plus;
+ if (idx < line_length())
+ switch (frag[line][idx]) {
+ case '+':
+ t.category = TokenCategory::Comment;
+ t.value = TokenValue::Commentary;
+ idx = frag[line].size();
+ break;
+ case '-':
+ if (idx + 1 < line_length() and frag[line][idx+1] == '>') {
+ t.value = TokenValue::MapsTo;
+ idx += 2;
+ }
+ break;
+ default: break;
+ }
+ break;
+
+ case '.':
+ t.category = TokenCategory::Punctuator;
+ t.value = TokenValue::Dot;
+ if (idx < line_length() and frag[line][idx] == '.') {
+ t.category = TokenCategory::Operator;
+ t.value = TokenValue::DotDot;
+ ++idx;
+ }
+ break;
+
+ case '"':
+ string(frag[line], idx, t);
+ break;
+
+ case '$':
+ if (dialect != Language::Boot or idx >= line_length()
+ or separator_or_punctuator(frag[line][idx])) {
+ t.category = TokenCategory::Operator;
+ t.value = TokenValue::Dollar;
+ }
+ else
+ identifier(frag[line], idx, t, dialect);
+ break;
+
+ default:
+ if (isdigit(c))
+ number(frag[line], idx, t);
+ else if (identifier_head(c))
+ identifier(frag[line], idx, t, dialect);
+ else
+ junk(frag[line], idx, t);
+ break;
+ }
+
+ t.end = { frag[line].number, idx };
+ return t;
}
}