aboutsummaryrefslogtreecommitdiff
path: root/src/include/token.H
diff options
context:
space:
mode:
Diffstat (limited to 'src/include/token.H')
-rw-r--r-- src/include/token.H 670
1 files changed, 0 insertions, 670 deletions
diff --git a/src/include/token.H b/src/include/token.H
deleted file mode 100644
index b57b69d6..00000000
--- a/src/include/token.H
+++ /dev/null
@@ -1,670 +0,0 @@
-// Copyright (C) 2013-2014, Gabriel Dos Reis.
-// All rights reserved.
-// Written by Gabriel Dos Reis.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// - Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// - Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in
-// the documentation and/or other materials provided with the
-// distribution.
-//
-// - Neither the name of OpenAxiom. nor the names of its contributors
-// may be used to endorse or promote products derived from this
-// software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
-// OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#ifndef OPENAXIOM_TOKEN_included
-#define OPENAXIOM_TOKEN_included
-
-#include <stdint.h>
-#include <stack>
-#include <iosfwd>
-#include <open-axiom/Input>
-#include <open-axiom/dialect>
-
-namespace OpenAxiom {
- // Categorization of Boot and Spad tokens.
- // The underlying type is uint8_t.  `Unclassified' is the first
- // enumerator, hence the category of a value-initialized token; it is
- // also the category that TokenClassification::operator bool treats
- // as "false".
- enum class TokenCategory : uint8_t {
- Unclassified, // token of unknown class
- Whitespace, // sequence of white-space characters
- Comment, // a description of an ignorable comment
- Punctuator, // a punctuator character
- Operator, // an operator both symbolic and alphabetic
- Integer, // an integer literal
- FloatingPoint, // a floating-point literal
- String, // a string literal
- Keyword, // a reserved word both symbolic and alphabetic
- Identifier, // an identifier
- Formatting, // a layout formatting token
- Junk, // invalid/malformed token
- EOS // end-of-token-stream indicator
- };
-
- // Stream insertion for a token category; only declared here.
- std::ostream& operator<<(std::ostream&, TokenCategory);
-
- // The abstract value associated with a token.
- // The leading enumerators are generated from <open-axiom/token-value>:
- // each OPENAXIOM_DEFINE_TOKEN(T, ...) entry in that file expands to
- // the enumerator `T'.  The enumerators written out after the include
- // are appended by this header.
- // NOTE(review): the underlying type is uint8_t, so the generated list
- // plus the five enumerators below must fit in 256 values -- confirm
- // against the token-value file.
- enum class TokenValue : uint8_t {
-#undef OPENAXIOM_DEFINE_TOKEN
-#define OPENAXIOM_DEFINE_TOKEN(T, ...) T,
-#include <open-axiom/token-value>
-#undef OPENAXIOM_DEFINE_TOKEN
- Artificial, // Tokens after this are artificial
- Indent, // new line indentation, greater than previous
- Unindent, // new line indentation, less than previous
- Justify, // align indentation with preceding line.
-
- EndOfStream // end of token stream
- };
-
- // Stream insertion for a token value; only declared here.
- std::ostream& operator<<(std::ostream&, TokenValue);
-
- // Datatypes for locating lines and columns.
- // Both are plain std::size_t aliases, so the compiler will not catch
- // a line number passed where a column index is expected.
- using LineNumber = std::size_t;
- using ColumnIndex = std::size_t;
-
- // A position in program text: a line number paired with a column.
- struct Locus {
- LineNumber line;
- ColumnIndex column;
- };
-
- // Stream insertion for a locus; only declared here.
- std::ostream& operator<<(std::ostream&, const Locus&);
-
- // Program text region, delimited by two loci.
- // As filled in by Tokenizer::get, `start' is the cursor position
- // before the token is read and `end' the cursor position after it
- // has been consumed.
- struct Region {
- Locus start;
- Locus end;
- };
-
- // Classification of a token: its category paired with its
- // abstract value.
- struct TokenClassification {
- TokenCategory category;
- TokenValue value;
-
- // A classification tests true unless its category is
- // `Unclassified'.  Note that this makes an `EOS' classification
- // test true as well.
- explicit operator bool() const {
- return category != TokenCategory::Unclassified;
- }
- };
-
- // Given a symbolic or alphabetic token, retrieve its category
- // and associated abstract value.  Only declared here; identifier()
- // relies on the result testing false for text that is not a known
- // token.
- TokenClassification classify(const std::string&);
-
- // Token data structure: a region of text with a classification.
- // `Location' names the locus type; Tokenizer reads it as
- // Tok::Location to build the positions it stores in a token.
- struct Token : TokenClassification, Region {
- using Location = Locus;
- };
-
- // Cursor into a fragment: a (line index, column) pair.
- // The increment and decrement operators move along the current line
- // only; none of them ever changes `line'.
- struct FragmentCursor {
-    std::size_t line;     // index of a line in a fragment
-    std::size_t column;   // column number at line.
-
-    // Step one column to the right; yield the updated cursor.
-    FragmentCursor& operator++() {
-       column += 1;
-       return *this;
-    }
-
-    // Step one column to the right; yield the position held before.
-    FragmentCursor operator++(int) {
-       const FragmentCursor before = *this;
-       column += 1;
-       return before;
-    }
-
-    // Step one column to the left; yield the updated cursor.
-    FragmentCursor& operator--() {
-       column -= 1;
-       return *this;
-    }
-
-    // Step one column to the left; yield the position held before.
-    FragmentCursor operator--(int) {
-       const FragmentCursor before = *this;
-       column -= 1;
-       return before;
-    }
- };
-
- // -- Exception types
- // Thrown by string_literal() when the closing quote of a string
- // literal is not found; carries the line number and column at which
- // scanning stopped.
- struct EndOfStringUnseen {
- LineNumber line;
- ColumnIndex column;
- };
-
- // Thrown by number() when the exponent marker of a floating-point
- // literal (and its optional sign) is not followed by a digit;
- // carries the line number and the column where a digit was expected.
- struct MissingExponent {
- LineNumber line;
- ColumnIndex column;
- };
-
- // Objects of this datatype decompose a program fragment into a
- // token stream.  The tokens are of type indicated by Tok.
- // The tokenizer keeps a reference to the fragment, a cursor into it,
- // and a stack of the indentation columns currently open.
- template<typename Frag, typename Tok>
- struct Tokenizer {
-    // Start at the first line of `f', at that line's indentation
-    // column, and record that column as the outermost indentation.
-    // The fragment must have at least one line: frag.front() is read.
-    Tokenizer(Frag& f)
-          : frag(f),
-            pos{ 0, frag.front().indent }
-    {
-       indents.push(pos.column);
-    }
-
-    // True when the cursor is beyond the last line, or on the last
-    // line at or beyond its end.
-    bool eos() const {
-       if (pos.line >= frag.size())
-          return true;
-       return pos.line + 1 == frag.size()
-          and pos.column >= frag.back().size();
-    }
-
-    // Extract the next token, lexing according to the given dialect.
-    Tok get(Language = Language::Spad);
- private:
-    Frag& frag;
-    FragmentCursor pos;
-    std::stack<ColumnIndex> indents;
-
-    // Length of the line the cursor is on.
-    std::size_t line_length() const { return frag(pos).size(); }
-
-    // Number of the line after the current one; when there is none,
-    // one more than the number of the last line.
-    LineNumber next_line_number() const {
-       if (pos.line + 1 < frag.size())
-          return frag[pos.line + 1].number;
-       return frag.back().number + 1;
-    }
-
-    // Indentation of the line after the current one; 0 when there
-    // is none.
-    ColumnIndex next_indentation() const {
-       if (pos.line + 1 < frag.size())
-          return frag[pos.line + 1].indent;
-       return 0;
-    }
-
-    // Number of the current line; when the cursor is beyond the last
-    // line, one more than the number of the last line.
-    LineNumber line_number() const {
-       if (pos.line < frag.size())
-          return frag(pos).number;
-       return frag.back().number + 1;
-    }
-
-    // Current column; 0 when the cursor is beyond the last line.
-    ColumnIndex column_number() const {
-       if (pos.line < frag.size())
-          return pos.column;
-       return 0;
-    }
-
-    // Locus type of the token being produced, and the current
-    // cursor position expressed in it.
-    using Locus = typename Tok::Location;
-    Locus current_locus() {
-       return { line_number(), column_number() };
-    }
- };
-
- // Predicate on a single character; only declared here.  Within this
- // header it decides where a junk token stops (junk) and whether a
- // `$' in Boot stands alone as an operator (dollar_et_al).
- bool separator_or_punctuator(uint8_t);
-
- // Classify `t' as a comment carrying the abstract value `v'.
- template<typename T>
- inline void comment_token(T& t, TokenValue v) {
-    t.value = v;
-    t.category = TokenCategory::Comment;
- }
-
- // Classify `t' as an operator carrying the abstract value `v'.
- template<typename T>
- inline void operator_token(T& t, TokenValue v) {
-    t.value = v;
-    t.category = TokenCategory::Operator;
- }
-
- // Classify `t' as a punctuator carrying the abstract value `v'.
- template<typename T>
- inline void punctuator_token(T& t, TokenValue v) {
-    t.value = v;
-    t.category = TokenCategory::Punctuator;
- }
-
- // Classify `t' as junk and move `idx' forward to the next separator
- // or punctuator on `line', or to the end of the line if there is
- // none.  The token's value is left untouched.
- template<typename L, typename T>
- static void junk(L& line, ColumnIndex& idx, T& t) {
-    t.category = TokenCategory::Junk;
-    for (; idx < line.size(); ++idx) {
-       if (separator_or_punctuator(line[idx]))
-          break;
-    }
- }
-
- // Move `idx' past any run of white-space characters on `line',
- // stopping at the first non-space character or at the end of line.
- template<typename L>
- inline void
- skip_whitespace(L& line, ColumnIndex& idx) {
-    // isspace() is only defined for values representable as unsigned
-    // char (or EOF); the cast keeps a negative plain `char' from
-    // invoking undefined behaviour.
-    while (idx < line.size()
-           and isspace(static_cast<unsigned char>(line[idx])))
-       ++idx;
- }
-
- // Scan the body of a string literal and classify `t' as a string.
- // get() calls this after consuming the opening quote.  Scanning
- // stops after the first `"' that is not escaped.  An `_' escapes
- // the next character, except that an `_' ending a line that is not
- // the last line continues the literal at column 0 of the next line.
- // Throws EndOfStringUnseen if the fragment no longer covers the
- // cursor before a closing quote is seen.
- template<typename Frag, typename Tok>
- void string_literal(Frag& frag, FragmentCursor& pos, Tok& t) {
-    bool closed = false;
-    bool escaped = false;
-    while (frag.covering(pos) && not closed) {
-       const auto c = frag(pos)[pos.column++];
-       if (c == '_') {
-          const bool continues = pos.column == frag(pos).size()
-             and pos.line < frag.size() - 1;
-          if (continues) {
-             // Line continuation: resume on the following line.
-             ++pos.line;
-             pos.column = 0;
-          }
-          else
-             escaped = !escaped;
-       }
-       else {
-          // A quote closes the literal unless it was escaped; any
-          // other character merely uses up a pending escape.
-          if (c == '"')
-             closed = !escaped;
-          escaped = false;
-       }
-    }
-    if (not closed)
-       throw EndOfStringUnseen{ frag(pos).number, pos.column };
-    t.category = TokenCategory::String;
- }
-
- // Move `idx' past any run of decimal digits on `line', stopping at
- // the first non-digit or at the end of line.
- template<typename L>
- void skip_to_end_of_integer(L& line, ColumnIndex& idx) {
-    // isdigit() is only defined for values representable as unsigned
-    // char (or EOF); the cast keeps a negative plain `char' from
-    // invoking undefined behaviour.
-    while (idx < line.size()
-           and isdigit(static_cast<unsigned char>(line[idx])))
-       ++idx;
- }
-
- // Classify `t' as an integer literal and move `idx' past the
- // remaining digits of the literal on `line'.
- template<typename L, typename T>
- void integer(L& line, ColumnIndex& idx, T& t) {
-    t.category = TokenCategory::Integer;
-    skip_to_end_of_integer(line, idx);
- }
-
- // Scan a numeric literal on `line' starting at `idx' and classify
- // `t' accordingly; returns `t'.
- //   - digits only: Integer.
- //   - digits, `.', at least one digit: FloatingPoint.  A `.' not
- //     followed by a digit is left unconsumed and the token stays
- //     an Integer.
- //   - an `e' or `E' after the fraction starts an exponent, with an
- //     optional `+' or `-' sign; it must be followed by a digit,
- //     otherwise MissingExponent is thrown with the line number and
- //     the column where the digit was expected.
- template<typename L, typename T>
- T& number(L& line, ColumnIndex& idx, T& t) {
-    integer(line, idx, t);
-    if (idx >= line.size() or line[idx] != '.')
-       return t;
-    // The casts below keep a negative plain `char' from reaching
-    // isdigit(), which is undefined for such values.
-    if (++idx >= line.size()
-        or not isdigit(static_cast<unsigned char>(line[idx]))) {
-       --idx;                    // give the `.' back
-       return t;
-    }
-
-    t.category = TokenCategory::FloatingPoint;
-    skip_to_end_of_integer(line, idx);
-    if (idx >= line.size() or (line[idx] != 'e' and line[idx] != 'E'))
-       return t;
-    if (++idx < line.size() and (line[idx] == '+' or line[idx] == '-'))
-       ++idx;
-    if (idx >= line.size()
-        or not isdigit(static_cast<unsigned char>(line[idx])))
-       throw MissingExponent{ line.number, idx };
-    skip_to_end_of_integer(line, idx);
-    return t;
- }
-
- // True if `c' may begin an identifier: a letter, `%' or `_'.
- inline bool
- identifier_head(uint8_t c) {
-    if (c == '%' or c == '_')
-       return true;
-    return isalpha(c);
- }
-
- // True if `c' may appear inside an identifier: anything that may
- // begin one, or a decimal digit.
- inline bool
- identifier_part(uint8_t c) {
-    if (identifier_head(c))
-       return true;
-    return isdigit(c);
- }
-
- // True if `c' may trail an identifier: `!' or `?'.
- inline bool
- identifier_suffix(uint8_t c) {
-    switch (c) {
-    case '!':
-    case '?':
-       return true;
-    default:
-       return false;
-    }
- }
-
- // True if `c' is one of the prefix characters `%' or `$' that
- // identifier() skips in runs when lexing Boot.
- inline bool internal_prefix(uint8_t c) {
-    switch (c) {
-    case '%':
-    case '$':
-       return true;
-    default:
-       return false;
-    }
- }
-
- // Move `idx' past any run of the character `c' on `line'.
- template<typename L>
- inline void
- skip_prefix(L& line, ColumnIndex& idx, uint8_t c) {
-    for (; idx < line.size(); ++idx) {
-       if (not (line[idx] == c))
-          break;
-    }
- }
-
- // Scan an identifier on `line' and classify `t'; returns `t'.
- // On entry `idx' is one past the identifier's first character.
- // In Boot, a leading run of one internal prefix character (`%' or
- // `$') is skipped first.  The identifier then extends over
- // identifier characters, plus any character directly preceded by
- // `_', followed by any run of `!' / `?' suffix characters.
- // Text containing no `_' is looked up with classify(); when the
- // result tests true its category and value replace the default
- // Identifier category.  Text containing `_' is never looked up.
- template<typename L, typename T>
- T& identifier(L& line, ColumnIndex& idx, T& t, Language dialect) {
-    t.category = TokenCategory::Identifier;
-
-    // The caller already consumed the first character; back up to it.
-    idx -= 1;
-    const ColumnIndex first = idx;
-    if (dialect == Language::Boot and internal_prefix(line[idx]))
-       skip_prefix(line, idx, line[idx]);
-
-    bool has_underscore = false;
-    for (; idx < line.size(); ++idx) {
-       if (not identifier_part(line[idx]) and line[idx - 1] != '_')
-          break;
-       if (line[idx] == '_')
-          has_underscore = true;
-    }
-    while (idx < line.size() and identifier_suffix(line[idx]))
-       ++idx;
-
-    if (not has_underscore) {
-       if (auto info = classify(line.sub_string(first, idx))) {
-          t.category = info.category;
-          t.value = info.value;
-       }
-    }
-    return t;
- }
-
- // Token starting with `(': OpenParen, or OpenMetaParen when the
- // next character is `|' (which is then consumed).
- template<typename Frag, typename Tok>
- static void
- left_paren_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
-    punctuator_token(t, TokenValue::OpenParen);
-    if (not frag.covering(pos) or not (frag[pos] == '|'))
-       return;
-    t.value = TokenValue::OpenMetaParen;
-    ++pos;
- }
-
- // Token starting with `{': OpenBrace, or OpenMetaBrace when the
- // next character is `|' (which is then consumed).
- template<typename Frag, typename Tok>
- static void
- left_brace_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
-    punctuator_token(t, TokenValue::OpenBrace);
-    if (not frag.covering(pos) or not (frag[pos] == '|'))
-       return;
-    t.value = TokenValue::OpenMetaBrace;
-    ++pos;
- }
-
- // Token starting with `[': OpenBracket, or OpenMetaBracket when the
- // next character is `|' (which is then consumed).
- template<typename Frag, typename Tok>
- static void
- left_bracket_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
-    punctuator_token(t, TokenValue::OpenBracket);
-    if (not frag.covering(pos) or not (frag[pos] == '|'))
-       return;
-    t.value = TokenValue::OpenMetaBracket;
-    ++pos;
- }
-
- // Operator starting with `:': Colon, or one of `::', `:=', `:-'
- // when the next character completes it (and is then consumed).
- template<typename Frag, typename Tok>
- static void
- colon_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
-    operator_token(t, TokenValue::Colon);
-    if (not frag.covering(pos))
-       return;
-    switch (frag[pos]) {
-    case ':': t.value = TokenValue::ColonColon; break;
-    case '=': t.value = TokenValue::ColonEq; break;
-    case '-': t.value = TokenValue::ColonDash; break;
-    default: return;           // plain colon; nothing more to consume
-    }
-    ++pos;
- }
-
- // Operator starting with `*': Star, or StarStar when the next
- // character is another `*' (which is then consumed).
- template<typename Frag, typename Tok>
- static void
- star_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
-    operator_token(t, TokenValue::Star);
-    if (not frag.covering(pos) or not (frag[pos] == '*'))
-       return;
-    ++pos;
-    t.value = TokenValue::StarStar;
- }
-
- // Operator starting with `/': Slash, or SlashSlash / SlashBackslash
- // when the next character is `/' or `\' (which is then consumed).
- template<typename Frag, typename Tok>
- static void
- slash_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
-    operator_token(t, TokenValue::Slash);
-    if (not frag.covering(pos))
-       return;
-    switch (frag[pos]) {
-    case '/': t.value = TokenValue::SlashSlash; break;
-    case '\\': t.value = TokenValue::SlashBackslash; break;
-    default: return;           // plain slash; nothing more to consume
-    }
-    ++pos;
- }
-
- // Operator starting with `\': Backslash, or BackslashBackslash /
- // BackslashSlash when the next character is `\' or `/' (which is
- // then consumed).
- template<typename Frag, typename Tok>
- static void
- backslash_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
-    operator_token(t, TokenValue::Backslash);
-    if (not frag.covering(pos))
-       return;
-    switch (frag[pos]) {
-    case '\\': t.value = TokenValue::BackslashBackslash; break;
-    case '/': t.value = TokenValue::BackslashSlash; break;
-    default: return;           // plain backslash
-    }
-    ++pos;
- }
-
- // Operator starting with `<': Less, LeftArrow (`<-'), OpenChevron
- // (`<<'), LessEq (`<='), or Equiv (`<=>').  Every character that
- // takes part in the operator is consumed.
- template<typename Frag, typename Tok>
- static void
- less_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
-    operator_token(t, TokenValue::Less);
-    if (not frag.covering(pos))
-       return;
-    switch (frag[pos]) {
-    case '-': t.value = TokenValue::LeftArrow; ++pos; return;
-    case '<': t.value = TokenValue::OpenChevron; ++pos; return;
-    case '=': break;
-    default: return;
-    }
-
-    // Seen `<='; one more `>' turns it into `<=>'.
-    t.value = TokenValue::LessEq;
-    ++pos;
-    if (frag.covering(pos) and frag[pos] == '>') {
-       t.value = TokenValue::Equiv;
-       ++pos;
-    }
- }
-
- // Operator starting with `=': Eq, Implies (`=>'), EqEq (`=='), or
- // FatArrow (`==>').  Every character that takes part in the
- // operator is consumed.
- template<typename Frag, typename Tok>
- static void
- equal_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
-    operator_token(t, TokenValue::Eq);
-    if (not frag.covering(pos))
-       return;
-    switch (frag[pos]) {
-    case '>': t.value = TokenValue::Implies; ++pos; return;
-    case '=': break;
-    default: return;
-    }
-
-    // Seen `=='; one more `>' turns it into `==>'.
-    t.value = TokenValue::EqEq;
-    ++pos;
-    if (frag.covering(pos) and frag[pos] == '>') {
-       t.value = TokenValue::FatArrow;
-       ++pos;
-    }
- }
-
- // Operator starting with `~': Tilde, or TildeEq when the next
- // character is `=' (which is then consumed).
- template<typename Frag, typename Tok>
- static void
- tilde_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
-    operator_token(t, TokenValue::Tilde);
-    if (not frag.covering(pos) or not (frag[pos] == '='))
-       return;
-    ++pos;
-    t.value = TokenValue::TildeEq;
- }
-
- // Operator starting with `>': Greater, or GreaterEq / CloseChevron
- // when the next character is `=' or `>' (which is then consumed).
- template<typename Frag, typename Tok>
- static void
- greater_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
-    operator_token(t, TokenValue::Greater);
-    if (not frag.covering(pos))
-       return;
-    switch (frag[pos]) {
-    case '=': t.value = TokenValue::GreaterEq; break;
-    case '>': t.value = TokenValue::CloseChevron; break;
-    default: return;           // plain greater-than
-    }
-    ++pos;
- }
-
- // Punctuator starting with `|': Bar, or one of the closing meta
- // delimiters `|]', `|}', `|)' when the next character completes it
- // (and is then consumed).
- template<typename Frag, typename Tok>
- static void
- bar_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
-    punctuator_token(t, TokenValue::Bar);
-    if (not frag.covering(pos))
-       return;
-    switch (frag[pos]) {
-    case ']': t.value = TokenValue::CloseMetaBracket; break;
-    case '}': t.value = TokenValue::CloseMetaBrace; break;
-    case ')': t.value = TokenValue::CloseMetaParen; break;
-    default: return;           // plain bar
-    }
-    ++pos;
- }
-
- // Token starting with `-': the Minus operator, RightArrow (`->'),
- // or, for `--', a Wisecrack comment that swallows the rest of the
- // current line.
- template<typename Frag, typename Tok>
- static void
- minus_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
-    operator_token(t, TokenValue::Minus);
-    if (not frag.covering(pos))
-       return;
-    const auto next = frag[pos];
-    if (next == '>') {
-       t.value = TokenValue::RightArrow;
-       ++pos;
-    }
-    else if (next == '-') {
-       comment_token(t, TokenValue::Wisecrack);
-       pos.column = frag(pos).size();     // comment runs to end of line
-    }
- }
-
-
- // Token starting with `+': the Plus operator, MapsTo (`+->'), or,
- // for `++', a Commentary comment that swallows the rest of the
- // current line.  A `+-' not followed by `>' stays a plain Plus and
- // nothing more is consumed.
- template<typename Frag, typename Tok>
- static void
- plus_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
-    operator_token(t, TokenValue::Plus);
-    if (not frag.covering(pos))
-       return;
-    const auto next = frag[pos];
-    if (next == '+') {
-       comment_token(t, TokenValue::Commentary);
-       pos.column = frag(pos).size();     // comment runs to end of line
-    }
-    else if (next == '-') {
-       const bool arrow = pos.column + 1 < frag(pos).size()
-          and frag(pos)[pos.column + 1] == '>';
-       if (arrow) {
-          t.value = TokenValue::MapsTo;
-          pos.column += 2;
-       }
-    }
- }
-
- // Operator starting with `.': Dot, or DotDot when the next
- // character is another `.' (which is then consumed).
- template<typename Frag, typename Tok>
- static void
- dot_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
-    operator_token(t, TokenValue::Dot);
-    if (not frag.covering(pos) or not (frag[pos] == '.'))
-       return;
-    ++pos;
-    t.value = TokenValue::DotDot;
- }
-
- // Token starting with `$'.  In Boot, when the next character exists
- // and is neither a separator nor a punctuator, the `$' begins an
- // identifier; in every other case it is the Dollar operator.
- template<typename Frag, typename Tok>
- static void
- dollar_et_al(Frag& frag, FragmentCursor& pos, Tok& t, Language dialect) {
-    const bool starts_identifier = dialect == Language::Boot
-       and frag.covering(pos)
-       and not separator_or_punctuator(frag[pos]);
-    if (starts_identifier)
-       identifier(frag(pos), pos.column, t, dialect);
-    else
-       operator_token(t, TokenValue::Dollar);
- }
-
- // Token starting with `#'.  Outside Lisp it is always the Sharp
- // operator.  In Lisp the next character selects one of the
- // two-character tokens `#(', `#'', `#:', `#+', `#-', `#.' and is
- // consumed; when the next character is anything else, or there is
- // none, nothing is consumed and `t' is left unclassified.
- template<typename Frag, typename Tok>
- static void
- sharp_et_al(Frag& frag, FragmentCursor& pos, Tok& t, Language dialect) {
-    if (dialect != Language::Lisp) {
-       operator_token(t, TokenValue::Sharp);
-       return;
-    }
-    if (not frag.covering(pos))
-       return;
-
-    const auto next = frag[pos];
-    ++pos;
-    switch (next) {
-    case '(': punctuator_token(t, TokenValue::SharpLeftParen); break;
-    case '\'': operator_token(t, TokenValue::SharpApostrophe); break;
-    case ':': operator_token(t, TokenValue::SharpColon); break;
-    case '+': punctuator_token(t, TokenValue::SharpPlus); break;
-    case '-': punctuator_token(t, TokenValue::SharpMinus); break;
-    case '.': operator_token(t, TokenValue::SharpDot); break;
-    default: --pos; break;     // not ours: give the character back
-    }
- }
-
-
- // Extract the next token from the fragment, lexing per `dialect'.
- //   - At end of stream: an EOS token.  The cursor does not move, so
- //     every later call returns EOS again.
- //   - On white space: a Whitespace token covering the run of spaces
- //     on the current line.
- //   - On a `_' that is the last character of a line other than the
- //     last one: the cursor moves to the indentation column of the
- //     next line and lexing continues there.
- //   - At the end of a line: a Formatting token (Indent, Unindent or
- //     Justify) obtained by comparing the innermost open indentation
- //     with that of the next line.  Unindent only closes one level
- //     and leaves the cursor in place, so several may be produced in
- //     a row.
- //   - Otherwise: the token selected by the character under the
- //     cursor.
- // May throw EndOfStringUnseen or MissingExponent (from the literal
- // scanners).
- template<typename Frag, typename Tok>
- Tok Tokenizer<Frag, Tok>::get(Language dialect) {
-    Tok t { };
-    t.start = current_locus();
-
-    if (eos()) {
-       t.category = TokenCategory::EOS;
-       t.end = current_locus();
-       return t;
-    }
-    // Only look at the character under the cursor when there is one,
-    // and hand isspace() an unsigned char value: it is undefined for
-    // a negative plain `char'.
-    else if (pos.column < line_length()
-             and isspace(static_cast<unsigned char>(frag[pos]))) {
-       skip_whitespace(frag(pos), pos.column);
-       t.category = TokenCategory::Whitespace;
-       t.end = current_locus();
-       return t;
-    }
-    // Line continuation.  It needs a following line, the same guard
-    // string_literal() applies; a trailing `_' on the very last line
-    // falls through and is lexed as an ordinary character instead of
-    // moving the cursor past the fragment.
-    else if (pos.column == line_length() - 1 and frag(pos).back() == '_'
-             and pos.line + 1 < frag.size()) {
-       ++pos.line;
-       pos.column = frag(pos).indent;
-    }
-    else if (pos.column == line_length()) {
-       auto indent = indents.top();
-       auto next_indent = next_indentation();
-       t.start = t.end = { next_line_number(), next_indent };
-       if (indent < next_indent) {
-          indents.push(next_indent);
-          ++pos.line;
-          pos.column = next_indent;
-          t.category = TokenCategory::Formatting;
-          t.value = TokenValue::Indent;
-       }
-       else if (indent > next_indent) {
-          indents.pop();
-          // The next line may be indented less than the first line of
-          // the fragment.  Keep the stack non-empty so that the next
-          // call can still consult indents.top().
-          if (indents.empty())
-             indents.push(next_indent);
-          t.category = TokenCategory::Formatting;
-          t.value = TokenValue::Unindent;
-       }
-       else {
-          ++pos.line;
-          pos.column = next_indent;
-          t.category = TokenCategory::Formatting;
-          t.value = TokenValue::Justify;
-       }
-       return t;
-    }
-
-    switch (auto c = frag.advance(pos)) {
-    case '#': sharp_et_al(frag, pos, t, dialect); break;
-    case '@': operator_token(t, TokenValue::At); break;
-    case '^': operator_token(t, TokenValue::Caret); break;
-    case '&': punctuator_token(t, TokenValue::Ampersand); break;
-    case '!': punctuator_token(t, TokenValue::Exclamation); break;
-    case '\'': punctuator_token(t, TokenValue::Apostrophe); break;
-    case ',': punctuator_token(t, TokenValue::Comma); break;
-    case ';': punctuator_token(t, TokenValue::Semicolon); break;
-    case '`': punctuator_token(t, TokenValue::Backquote); break;
-    case '(': left_paren_et_al(frag, pos, t); break;
-    case ')': punctuator_token(t, TokenValue::CloseParen); break;
-    case '{': left_brace_et_al(frag, pos, t); break;
-    case '}': punctuator_token(t, TokenValue::CloseBrace); break;
-    case '[': left_bracket_et_al(frag, pos, t); break;
-    case ']': punctuator_token(t, TokenValue::CloseBracket); break;
-    case ':': colon_et_al(frag, pos, t); break;
-    case '*': star_et_al(frag, pos, t); break;
-    case '/': slash_et_al(frag, pos, t); break;
-    case '\\': backslash_et_al(frag, pos, t); break;
-    case '<': less_et_al(frag, pos, t); break;
-    case '=': equal_et_al(frag, pos, t); break;
-    case '~': tilde_et_al(frag, pos, t); break;
-    case '>': greater_et_al(frag, pos, t); break;
-    case '|': bar_et_al(frag, pos, t); break;
-    case '-': minus_et_al(frag, pos, t); break;
-    case '+': plus_et_al(frag, pos, t); break;
-    case '.': dot_et_al(frag, pos, t); break;
-    case '"': string_literal(frag, pos, t); break;
-    case '$': dollar_et_al(frag, pos, t, dialect); break;
-
-    default:
-       if (isdigit(static_cast<unsigned char>(c)))
-          number(frag(pos), pos.column, t);
-       else if (identifier_head(c))
-          identifier(frag(pos), pos.column, t, dialect);
-       else
-          junk(frag(pos), pos.column, t);
-       break;
-    }
-
-    // Same locus as { frag(pos).number, pos.column } while the cursor
-    // is on a line of the fragment, but bounds-checked otherwise.
-    t.end = current_locus();
-    return t;
- }
-
- // -- Token streams.
- // The sequence of tokens of a fragment, lexed eagerly at
- // construction.  Lexing stops at the first token that tests false
- // or that marks the end of the stream; that token is not stored.
- template<typename T>
- struct TokenStream : std::vector<T> {
-    template<typename Frag>
-    explicit TokenStream(Frag& f, Language dialect = Language::Spad) {
-       Tokenizer<Frag, T> lex { f };
-       while (auto t = lex.get(dialect)) {
-          // Once the fragment is exhausted get() returns an EOS token
-          // on every call, and a classification whose category is EOS
-          // tests true; without this check the loop would never end.
-          if (t.category == TokenCategory::EOS)
-             break;
-          this->push_back(t);
-       }
-    }
- };
-}
-
-#endif // OPENAXIOM_TOKEN_included