aboutsummaryrefslogtreecommitdiff
path: root/src/include/open-axiom/token
diff options
context:
space:
mode:
authorGabriel Dos Reis <gdr@axiomatics.org>2015-12-23 12:50:54 -0800
committerGabriel Dos Reis <gdr@axiomatics.org>2015-12-23 12:50:54 -0800
commit72bbd833136a8a36eab1597c2f32f0890caeade8 (patch)
tree135f7a7cf53266aa8addd3aa47e07f94bf93c112 /src/include/open-axiom/token
parent33a5d910f263ad94cec761e0d93c11acfa6b7ccd (diff)
downloadopen-axiom-72bbd833136a8a36eab1597c2f32f0890caeade8.tar.gz
Arrange the source include directory to mirror the expected build structure instead of creating links.
Diffstat (limited to 'src/include/open-axiom/token')
-rw-r--r--src/include/open-axiom/token670
1 files changed, 670 insertions, 0 deletions
diff --git a/src/include/open-axiom/token b/src/include/open-axiom/token
new file mode 100644
index 00000000..b57b69d6
--- /dev/null
+++ b/src/include/open-axiom/token
@@ -0,0 +1,670 @@
+// Copyright (C) 2013-2014, Gabriel Dos Reis.
+// All rights reserved.
+// Written by Gabriel Dos Reis.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// - Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in
+// the documentation and/or other materials provided with the
+// distribution.
+//
+// - Neither the name of OpenAxiom. nor the names of its contributors
+// may be used to endorse or promote products derived from this
+// software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+// OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef OPENAXIOM_TOKEN_included
+#define OPENAXIOM_TOKEN_included
+
#include <stdint.h>
#include <cstddef>
#include <stack>
#include <vector>
#include <iosfwd>
#include <open-axiom/Input>
#include <open-axiom/dialect>
+
+namespace OpenAxiom {
   // Categorization of Boot and Spad tokens.
   // Note: the relative order of enumerators matters to users that
   // compare categories; do not reorder.
   enum class TokenCategory : uint8_t {
      Unclassified, // token of unknown class
      Whitespace, // sequence of white-space characters
      Comment, // a description of an ignorable comment
      Punctuator, // a punctuator character
      Operator, // an operator both symbolic and alphabetic
      Integer, // an integer literal
      FloatingPoint, // a floating-point literal
      String, // a string literal
      Keyword, // a reserved word both symbolic and alphabetic
      Identifier, // an identifier
      Formatting, // a layout formatting token
      Junk, // invalid/malformed token
      EOS // end-of-token-stream indicator
   };

   // Print a human-readable spelling of a token category.
   std::ostream& operator<<(std::ostream&, TokenCategory);
+
   // The abstract value associated with a token.
   // The concrete enumerators are expanded from the X-macro list in
   // <open-axiom/token-value>; the enumerators after `Artificial' do
   // not correspond to source characters but to layout events
   // synthesized by the tokenizer.
   enum class TokenValue : uint8_t {
#undef OPENAXIOM_DEFINE_TOKEN
#define OPENAXIOM_DEFINE_TOKEN(T, ...) T,
#include <open-axiom/token-value>
#undef OPENAXIOM_DEFINE_TOKEN
      Artificial, // Tokens after this are artificial
      Indent, // new line indentation, greater than previous
      Unindent, // new line indentation, less than previous
      Justify, // align indentation with preceding line.

      EndOfStream // end of token stream
   };

   // Print a human-readable spelling of a token value.
   std::ostream& operator<<(std::ostream&, TokenValue);
+
   // Datatypes for locating lines and columns.
   using LineNumber = std::size_t;
   using ColumnIndex = std::size_t;

   // A point in the program text: a line number and a column index.
   struct Locus {
      LineNumber line;
      ColumnIndex column;
   };

   // Print a human-readable form of a locus.
   std::ostream& operator<<(std::ostream&, const Locus&);
+
   // Program text region
   // NOTE(review): `end` appears to hold the position one past the
   // last consumed column (see how Tokenizer::get assigns it) --
   // confirm before relying on inclusiveness.
   struct Region {
      Locus start;
      Locus end;
   };
+
   // Given a symbolic or alphabetic token, retrieve its category
   // and associated abstract value.
   struct TokenClassification {
      TokenCategory category;
      TokenValue value;

      // True when the token has been assigned a definite category.
      // NOTE(review): every category other than Unclassified -- including
      // Junk, Whitespace, and EOS -- converts to true here; confirm that
      // loops driven by this conversion handle the EOS token explicitly.
      explicit operator bool() const {
         return category != TokenCategory::Unclassified;
      }
   };

   // Classify a spelling (keyword, operator, ...).  Presumably yields an
   // Unclassified result for unknown spellings -- see the implementation.
   TokenClassification classify(const std::string&);
+
   // Token data structure: a region of text with a classification.
   struct Token : TokenClassification, Region {
      using Location = Locus;   // locus type consumed by Tokenizer
   };
+
+ // Cursor into a fragment.
+ struct FragmentCursor {
+ std::size_t line; // index of a line in a fragment
+ std::size_t column; // column number at line.
+
+ inline FragmentCursor& operator++() {
+ ++column;
+ return *this;
+ }
+
+ inline FragmentCursor operator++(int) {
+ auto tmp = *this;
+ ++*this;
+ return tmp;
+ }
+
+ inline FragmentCursor& operator--() {
+ --column;
+ return *this;
+ }
+
+ inline FragmentCursor operator--(int) {
+ auto tmp = *this;
+ --*this;
+ return tmp;
+ }
+ };
+
   // -- Exception types
   // Thrown when a string literal is not terminated before the end of
   // the fragment; carries the position reached when scanning stopped.
   struct EndOfStringUnseen {
      LineNumber line;
      ColumnIndex column;
   };

   // Thrown when a floating-point exponent marker ('e'/'E') is not
   // followed by at least one digit.
   struct MissingExponent {
      LineNumber line;
      ColumnIndex column;
   };
+
   // Object of this datatype decompose a program fragment into a
   // token stream. The tokens are of type indicated by Tok.
   // Frag is expected to provide (judging from the uses below):
   // front()/back()/size() over lines, operator[] by line index,
   // operator() taking a FragmentCursor, covering(), and advance().
   template<typename Frag, typename Tok>
   struct Tokenizer {
      // Start at the first line, past its leading indentation, and
      // record that indentation as the baseline layout level.
      Tokenizer(Frag& f)
         : frag(f),
           pos{ 0, frag.front().indent }
      {
         indents.push(pos.column);
      }

      // True when no input remains: the cursor ran past the last line,
      // or sits at (or beyond) the end of the last line.
      bool eos() const {
         return pos.line >= frag.size()
            or (pos.line + 1 == frag.size() and pos.column >= frag.back().size());
      }

      // Extract the next token; see the definition further below.
      Tok get(Language = Language::Spad);
   private:
      Frag& frag;                        // fragment being decomposed
      FragmentCursor pos;                // current read position
      std::stack<ColumnIndex> indents;   // active indentation levels

      // Length of the line under the cursor.
      std::size_t line_length() const { return frag(pos).size(); }
      // Source line number of the line after the cursor's; one past the
      // last line's number when already on the final line.
      LineNumber next_line_number() const {
         return pos.line + 1 < frag.size()
            ? frag[pos.line + 1].number
            : frag.back().number + 1;
      }
      // Indentation of the following line, or 0 when there is none.
      ColumnIndex next_indentation() const {
         return pos.line + 1 < frag.size() ? frag[pos.line + 1].indent : 0;
      }

      // Source line number at the cursor; one past the last line's
      // number once the cursor has run off the fragment.
      LineNumber line_number() const {
         return pos.line < frag.size()
            ? frag(pos).number
            : frag.back().number + 1;
      }

      // Column at the cursor, or 0 once past the fragment.
      ColumnIndex column_number() const {
         return pos.line < frag.size() ? pos.column : 0;
      }

      // Current position expressed in the token's own locus type.
      using Locus = typename Tok::Location;
      Locus current_locus() {
         return { line_number(), column_number() };
      }
   };
+
   // True when the character ends the token being scanned -- white
   // space or a punctuator character.  Declaration only.
   bool separator_or_punctuator(uint8_t);

   // Mark `t` as a comment token carrying the abstract value `v`.
   template<typename T>
   inline void comment_token(T& t, TokenValue v) {
      t.category = TokenCategory::Comment;
      t.value = v;
   }

   // Mark `t` as an operator token carrying the abstract value `v`.
   template<typename T>
   inline void operator_token(T& t, TokenValue v) {
      t.category = TokenCategory::Operator;
      t.value = v;
   }

   // Mark `t` as a punctuator token carrying the abstract value `v`.
   template<typename T>
   inline void punctuator_token(T& t, TokenValue v) {
      t.category = TokenCategory::Punctuator;
      t.value = v;
   }
+
+ template<typename L, typename T>
+ static void junk(L& line, ColumnIndex& idx, T& t) {
+ while (idx < line.size() and not separator_or_punctuator(line[idx]))
+ ++idx;
+ t.category = TokenCategory::Junk;
+ }
+
+ template<typename L>
+ inline void
+ skip_whitespace(L& line, ColumnIndex& idx) {
+ while (idx < line.size() and isspace(line[idx]))
+ ++idx;
+ }
+
   // Scan the remainder of a string literal; the opening '"' has
   // already been consumed by the caller.  Underscore is the escape
   // character, and an underscore at end-of-line continues the literal
   // onto the following line.
   template<typename Frag, typename Tok>
   void string_literal(Frag& frag, FragmentCursor& pos, Tok& t) {
      bool done = false;     // saw an unescaped closing quote
      bool escape = false;   // previous character was an escaping '_'
      while (frag.covering(pos) && not done) {
         switch (frag(pos)[pos.column++]) {
         case '"': done = !escape;
            // fallthrough: the default case also clears the escape flag
         default: escape = false; break;
         case '_':
            // At end of line (and not on the last line), '_' continues
            // the literal on the next line; otherwise it toggles the
            // escape state, so an escaped '_' stands for itself.
            if (pos.column == frag(pos).size()
                and pos.line < frag.size() - 1) {
               ++pos.line;
               pos.column = 0;
            }
            else
               escape = !escape;
            break;
         }
      }
      // Unterminated literal: report where scanning stopped.
      // NOTE(review): frag(pos) is evaluated here with a cursor that
      // covering() just rejected -- confirm Frag tolerates that.
      if (not done)
         throw EndOfStringUnseen{ frag(pos).number, pos.column };
      t.category = TokenCategory::String;
   }
+
+ template<typename L>
+ void skip_to_end_of_integer(L& line, ColumnIndex& idx) {
+ while (idx < line.size() and isdigit(line[idx]))
+ ++idx;
+ }
+
+ template<typename L, typename T>
+ void integer(L& line, ColumnIndex& idx, T& t) {
+ skip_to_end_of_integer(line, idx);
+ t.category = TokenCategory::Integer;
+ }
+
   // Scan an integer or floating-point literal at `idx`.  A '.' not
   // followed by a digit is backed out of (so e.g. the dots in a range
   // "1..2" can be re-scanned as their own token).  Throws
   // MissingExponent when an 'e'/'E' marker has no following digit.
   template<typename L, typename T>
   T& number(L& line, ColumnIndex& idx, T& t) {
      integer(line, idx, t);
      // No fractional part: the token stays a plain integer.
      if (idx >= line.size() or line[idx] != '.')
         return t;
      // '.' not followed by a digit: undo the step over '.'.
      if (++idx >= line.size() or not isdigit(line[idx])) {
         --idx;
         return t;
      }

      t.category = TokenCategory::FloatingPoint;
      skip_to_end_of_integer(line, idx);
      // Optional exponent part: [eE][+-]? digits.
      if (idx >= line.size() or (line[idx] != 'e' and line[idx] != 'E'))
         return t;
      if (++idx < line.size() and (line[idx] == '+' or line[idx] == '-'))
         ++idx;
      if (idx >= line.size() or not isdigit(line[idx]))
         throw MissingExponent{ line.number, idx };
      skip_to_end_of_integer(line, idx);
      return t;
   }
+
+ inline bool
+ identifier_head(uint8_t c) {
+ return isalpha(c) or c == '%' or c == '_';
+ }
+
+ inline bool
+ identifier_part(uint8_t c) {
+ return identifier_head(c) or isdigit(c);
+ }
+
+ inline bool
+ identifier_suffix(uint8_t c) {
+ return c == '!' or c == '?';
+ }
+
+ inline bool internal_prefix(uint8_t c) {
+ return c == '%' or c == '$';
+ }
+
+ template<typename L>
+ inline void
+ skip_prefix(L& line, ColumnIndex& idx, uint8_t c) {
+ while (idx < line.size() and line[idx] == c)
+ ++idx;
+ }
+
   // Scan an identifier or keyword whose first character the caller has
   // already stepped over: on entry `idx` is one past it.  In Boot, a
   // leading run of '%' or '$' marks an internal name.  '_' escapes the
   // following character into the identifier; an escaped spelling is
   // never looked up as a keyword.
   template<typename L, typename T>
   T& identifier(L& line, ColumnIndex& idx, T& t, Language dialect) {
      t.category = TokenCategory::Identifier;

      ColumnIndex start = --idx; // idx was ahead by 1.
      if (dialect == Language::Boot and internal_prefix(line[idx]))
         skip_prefix(line, idx, line[idx]);
      bool saw_escape = false;
      while (idx < line.size()) {
         // Stop at a non-identifier character unless it is escaped by a
         // preceding '_'.  (line[idx - 1] is safe: idx > start >= 0 on
         // the first iteration that can reach this test.)
         if (not identifier_part(line[idx]) and line[idx - 1] != '_')
            break;
         else if (line[idx] == '_')
            saw_escape = true;
         ++idx;
      }
      // Absorb trailing '!' / '?' marks.
      while (idx < line.size() and identifier_suffix(line[idx]))
         ++idx;

      // Escaped spellings stay plain identifiers (assignment is a
      // no-op restating the default above); otherwise consult the
      // keyword/operator table.
      if (saw_escape)
         t.category = TokenCategory::Identifier;
      else if (auto info = classify(line.sub_string(start, idx))) {
         t.category = info.category;
         t.value = info.value;
      }
      return t;
   }
+
+ template<typename Frag, typename Tok>
+ static void
+ left_paren_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
+ punctuator_token(t, TokenValue::OpenParen);
+ if (frag.covering(pos) and frag[pos] == '|') {
+ ++pos;
+ t.value = TokenValue::OpenMetaParen;
+ }
+ }
+
+ template<typename Frag, typename Tok>
+ static void
+ left_brace_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
+ punctuator_token(t, TokenValue::OpenBrace);
+ if (frag.covering(pos) and frag[pos] == '|') {
+ ++pos;
+ t.value = TokenValue::OpenMetaBrace;
+ }
+ }
+
+ template<typename Frag, typename Tok>
+ static void
+ left_bracket_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
+ punctuator_token(t, TokenValue::OpenBracket);
+ if (frag.covering(pos) and frag[pos] == '|') {
+ ++pos;
+ t.value = TokenValue::OpenMetaBracket;
+ }
+ }
+
+ template<typename Frag, typename Tok>
+ static void
+ colon_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
+ operator_token(t, TokenValue::Colon);
+ if (frag.covering(pos))
+ switch (frag[pos]) {
+ case ':': t.value = TokenValue::ColonColon; ++pos; break;
+ case '=': t.value = TokenValue::ColonEq; ++pos; break;
+ case '-': t.value = TokenValue::ColonDash; ++pos; break;
+ default: break;
+ }
+ }
+
+ template<typename Frag, typename Tok>
+ static void
+ star_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
+ operator_token(t, TokenValue::Star);
+ if (frag.covering(pos) and frag[pos] == '*') {
+ t.value = TokenValue::StarStar;
+ ++pos;
+ }
+ }
+
+ template<typename Frag, typename Tok>
+ static void
+ slash_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
+ operator_token(t, TokenValue::Slash);
+ if (frag.covering(pos))
+ switch (frag[pos]) {
+ case '/': t.value = TokenValue::SlashSlash; ++pos; break;
+ case '\\': t.value = TokenValue::SlashBackslash; ++pos; break;
+ default: break;
+ }
+ }
+
+ template<typename Frag, typename Tok>
+ static void
+ backslash_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
+ operator_token(t, TokenValue::Backslash);
+ if (frag.covering(pos))
+ switch (frag[pos]) {
+ case '\\': t.value = TokenValue::BackslashBackslash; ++pos; break;
+ case '/': t.value = TokenValue::BackslashSlash; ++pos; break;
+ default: break;
+ }
+ }
+
+ template<typename Frag, typename Tok>
+ static void
+ less_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
+ operator_token(t, TokenValue::Less);
+ if (frag.covering(pos))
+ switch (frag[pos]) {
+ case '-': t.value = TokenValue::LeftArrow; ++pos; break;
+ case '<': t.value = TokenValue::OpenChevron; ++pos; break;
+ case '=':
+ t.value = TokenValue::LessEq;
+ if (frag.covering(++pos) and frag[pos] == '>') {
+ t.value = TokenValue::Equiv;
+ ++pos;
+ }
+ break;
+ default: break;
+ }
+ }
+
+ template<typename Frag, typename Tok>
+ static void
+ equal_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
+ operator_token(t, TokenValue::Eq);
+ if (frag.covering(pos))
+ switch (frag[pos]) {
+ case '>': t.value = TokenValue::Implies; ++pos; break;
+ case '=':
+ t.value = TokenValue::EqEq;
+ if (frag.covering(++pos) and frag[pos] == '>') {
+ t.value = TokenValue::FatArrow;
+ ++pos;
+ }
+ break;
+ default: break;
+ }
+ }
+
+ template<typename Frag, typename Tok>
+ static void
+ tilde_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
+ operator_token(t, TokenValue::Tilde);
+ if (frag.covering(pos) and frag[pos] == '=') {
+ t.value = TokenValue::TildeEq;
+ ++pos;
+ }
+ }
+
+ template<typename Frag, typename Tok>
+ static void
+ greater_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
+ operator_token(t, TokenValue::Greater);
+ if (frag.covering(pos))
+ switch (frag[pos]) {
+ case '=': t.value = TokenValue::GreaterEq; ++pos; break;
+ case '>': t.value = TokenValue::CloseChevron; ++pos; break;
+ }
+ }
+
+ template<typename Frag, typename Tok>
+ static void
+ bar_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
+ punctuator_token(t, TokenValue::Bar);
+ if (frag.covering(pos))
+ switch (frag[pos]) {
+ case ']': t.value = TokenValue::CloseMetaBracket; ++pos; break;
+ case '}': t.value = TokenValue::CloseMetaBrace; ++pos; break;
+ case ')': t.value = TokenValue::CloseMetaParen; ++pos; break;
+ default: break;
+ }
+ }
+
+ template<typename Frag, typename Tok>
+ static void
+ minus_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
+ operator_token(t, TokenValue::Minus);
+ if (frag.covering(pos))
+ switch (frag[pos]) {
+ case '>': t.value = TokenValue::RightArrow; ++pos; break;
+ case '-':
+ comment_token(t, TokenValue::Wisecrack);
+ pos.column = frag(pos).size();
+ break;
+ }
+ }
+
+
+ template<typename Frag, typename Tok>
+ static void
+ plus_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
+ operator_token(t, TokenValue::Plus);
+ if (frag.covering(pos))
+ switch (frag[pos]) {
+ case '+':
+ comment_token(t, TokenValue::Commentary);
+ pos.column = frag(pos).size();
+ break;
+ case '-':
+ if (pos.column + 1 < frag(pos).size()
+ and frag(pos)[pos.column + 1] == '>') {
+ t.value = TokenValue::MapsTo;
+ pos.column += 2;
+ }
+ break;
+ default: break;
+ }
+ }
+
+ template<typename Frag, typename Tok>
+ static void
+ dot_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
+ operator_token(t, TokenValue::Dot);
+ if (frag.covering(pos) and frag[pos] == '.') {
+ t.value = TokenValue::DotDot;
+ ++pos;
+ }
+ }
+
+ template<typename Frag, typename Tok>
+ static void
+ dollar_et_al(Frag& frag, FragmentCursor& pos, Tok& t, Language dialect) {
+ if (dialect != Language::Boot or not frag.covering(pos)
+ or separator_or_punctuator(frag[pos]))
+ operator_token(t, TokenValue::Dollar);
+ else
+ identifier(frag(pos), pos.column, t, dialect);
+ }
+
+ template<typename Frag, typename Tok>
+ static void
+ sharp_et_al(Frag& frag, FragmentCursor& pos, Tok& t, Language dialect) {
+ if (dialect != Language::Lisp)
+ operator_token(t, TokenValue::Sharp);
+ else if (frag.covering(pos))
+ switch (frag[pos++]) {
+ case '(': punctuator_token(t, TokenValue::SharpLeftParen); break;
+ case '\'': operator_token(t, TokenValue::SharpApostrophe); break;
+ case ':': operator_token(t, TokenValue::SharpColon); break;
+ case '+': punctuator_token(t, TokenValue::SharpPlus); break;
+ case '-': punctuator_token(t, TokenValue::SharpMinus); break;
+ case '.': operator_token(t, TokenValue::SharpDot); break;
+ default: --pos; break;
+ }
+ }
+
+
   // Extract the next token from the fragment.  White space, comments,
   // and layout (indent/unindent/justify) are themselves delivered as
   // tokens; callers filter what they do not want.
   template<typename Frag, typename Tok>
   Tok Tokenizer<Frag, Tok>::get(Language dialect) {
      Tok t { };
      t.start = current_locus();

      // End of input: synthesize the EOS token.
      if (eos()) {
         t.category = TokenCategory::EOS;
         t.end = current_locus();
         return t;
      }
      // A run of white space is a token of its own.
      // NOTE(review): frag[pos] is passed to isspace() unwidened --
      // if it is a plain char, negative values are UB; confirm the
      // element type.
      else if (isspace(frag[pos])) {
         skip_whitespace(frag(pos), pos.column);
         t.category = TokenCategory::Whitespace;
         t.end = current_locus();
         return t;
      }
      // Line continuation: a trailing '_' splices the next line in.
      // Control then falls through to the scanner below, which reads
      // from the continued line.  NOTE(review): t.start still holds the
      // pre-continuation locus, and line_length() - 1 would wrap on an
      // empty line -- confirm fragment lines are non-empty here.
      else if (pos.column == line_length() - 1 and frag(pos).back() == '_') {
         ++pos.line;
         pos.column = frag(pos).indent;
      }
      // End of line: emit a layout token by comparing the next line's
      // indentation against the innermost recorded level.
      else if (pos.column == line_length()) {
         auto indent = indents.top();
         auto next_indent = next_indentation();
         t.start = t.end = { next_line_number(), next_indent };
         if (indent < next_indent) {
            indents.push(next_indent);
            ++pos.line;
            pos.column = next_indent;
            t.category = TokenCategory::Formatting;
            t.value = TokenValue::Indent;
         }
         else if (indent > next_indent) {
            // The cursor deliberately stays put: repeated calls keep
            // popping levels (one Unindent each) until they match.
            indents.pop();
            t.category = TokenCategory::Formatting;
            t.value = TokenValue::Unindent;
         }
         else {
            ++pos.line;
            pos.column = next_indent;
            t.category = TokenCategory::Formatting;
            t.value = TokenValue::Justify;
         }
         return t;
      }

      // Ordinary token: consume one character and dispatch on it.
      switch (auto c = frag.advance(pos)) {
      case '#': sharp_et_al(frag, pos, t, dialect); break;
      case '@': operator_token(t, TokenValue::At); break;
      case '^': operator_token(t, TokenValue::Caret); break;
      case '&': punctuator_token(t, TokenValue::Ampersand); break;
      case '!': punctuator_token(t, TokenValue::Exclamation); break;
      case '\'': punctuator_token(t, TokenValue::Apostrophe); break;
      case ',': punctuator_token(t, TokenValue::Comma); break;
      case ';': punctuator_token(t, TokenValue::Semicolon); break;
      case '`': punctuator_token(t, TokenValue::Backquote); break;
      case '(': left_paren_et_al(frag, pos, t); break;
      case ')': punctuator_token(t, TokenValue::CloseParen); break;
      case '{': left_brace_et_al(frag, pos, t); break;
      case '}': punctuator_token(t, TokenValue::CloseBrace); break;
      case '[': left_bracket_et_al(frag, pos, t); break;
      case ']': punctuator_token(t, TokenValue::CloseBracket); break;
      case ':': colon_et_al(frag, pos, t); break;
      case '*': star_et_al(frag, pos, t); break;
      case '/': slash_et_al(frag, pos, t); break;
      case '\\': backslash_et_al(frag, pos, t); break;
      case '<': less_et_al(frag, pos, t); break;
      case '=': equal_et_al(frag, pos, t); break;
      case '~': tilde_et_al(frag, pos, t); break;
      case '>': greater_et_al(frag, pos, t); break;
      case '|': bar_et_al(frag, pos, t); break;
      case '-': minus_et_al(frag, pos, t); break;
      case '+': plus_et_al(frag, pos, t); break;
      case '.': dot_et_al(frag, pos, t); break;
      case '"': string_literal(frag, pos, t); break;
      case '$': dollar_et_al(frag, pos, t, dialect); break;

      default:
         // Multi-character tokens: numbers, identifiers, or junk.
         if (isdigit(c))
            number(frag(pos), pos.column, t);
         else if (identifier_head(c))
            identifier(frag(pos), pos.column, t, dialect);
         else
            junk(frag(pos), pos.column, t);
         break;
      }

      // End locus: one past the last consumed column.
      t.end = { frag(pos).number, pos.column };
      return t;
   }
+
+ // -- Token streams.
+ template<typename T>
+ struct TokenStream : std::vector<T> {
+ template<typename Frag>
+ explicit TokenStream(Frag& f, Language dialect = Language::Spad) {
+ Tokenizer<Frag, T> lex { f };
+ while (auto t = lex.get(dialect))
+ this->push_back(t);
+ }
+ };
+}
+
+#endif // OPENAXIOM_TOKEN_included