From cc66f97cf1c45ad3af2456e3fe6d4d27ab80d643 Mon Sep 17 00:00:00 2001 From: Gabriel Dos Reis Date: Sat, 13 Aug 2022 15:46:19 -0700 Subject: Use `char8_t` in the tool hammer --- src/utils/hammer.cc | 543 ++++++++++++++++++++++++++-------------------------- 1 file changed, 276 insertions(+), 267 deletions(-) diff --git a/src/utils/hammer.cc b/src/utils/hammer.cc index f4241aaf..003cd1f8 100644 --- a/src/utils/hammer.cc +++ b/src/utils/hammer.cc @@ -51,300 +51,308 @@ #include #include -namespace OpenAxiom { - namespace Hammer { - // ------------- - // -- Element -- - // ------------- - // Base class of document elements. - struct Element { - virtual ~Element() { } - }; - - // --------------- - // -- BasicText -- - // --------------- - // Plain text, with no reference to any chunk. - struct BasicText : Element { - BasicText(const Byte* f, const Byte* l) : span(f, l) { } - // Pointer to the start of this basic text element - const Byte* begin() const { return span.first; } - // One-past-the-end of the this basic text element. - const Byte* end() const { return span.second; } - private: - std::pair span; - }; - - // --------------- - // -- Reference -- - // --------------- - // Reference to a a chunk by name. - struct Reference : Element { - explicit Reference(const std::string& s) : label(s) { } - // Naame of the chunk referenced. - const std::string& name() const { return label; } - private: - const std::string label; - }; - - // ------------------- - // -- CompositeText -- - // ------------------- - // Sequence of basic elements and reference to chunks. - struct CompositeText: private std::vector { - typedef std::vector base; - using base::iterator; - using base::begin; - using base::end; - using base::size; - using base::operator[]; - - // Augment this chunk with a basic text in the open interval - // [f,l). - CompositeText& add_text(const Byte* f, const Byte* l) { - texts.push_back(BasicText(f, l)); - push_back(&texts.back()); - return *this; - } +// Workaround lack of standard streaming operation for std::u8string. +static std::ostream& print(std::ostream& os, const std::u8string& s) +{ + constexpr auto cast = [](auto p) { return reinterpret_cast(&*p); }; + std::copy(cast(s.begin()), cast(s.end()), std::ostream_iterator(os)); + return os; +} - // Augment this chunk with a reference to another chunk - // named `n'. Note that we don't attempt to check for - // possible circularities. - CompositeText reference_chunk(const Byte* f, const Byte* l) { - refs.push_back(Reference(std::string(f, l))); - push_back(&refs.back()); - return *this; - } +namespace OpenAxiom::Hammer { + // ------------- + // -- Element -- + // ------------- + // Base class of document elements. + struct Element { + virtual ~Element() = default; + }; + + // --------------- + // -- BasicText -- + // --------------- + // Plain text, with no reference to any chunk. + struct BasicText : Element { + BasicText(const char8_t* f, const char8_t* l) : span(f, l) { } + // Pointer to the start of this basic text element + const char8_t* begin() const { return span.first; } + // One-past-the-end of the this basic text element. + const char8_t* end() const { return span.second; } + private: + std::pair span; + }; + + // --------------- + // -- Reference -- + // --------------- + // Reference to a a chunk by name. + struct Reference : Element { + explicit Reference(const std::u8string& s) : label(s) { } + // Naame of the chunk referenced. + const std::u8string& name() const { return label; } + private: + const std::u8string label; + }; + + // ------------------- + // -- CompositeText -- + // ------------------- + // Sequence of basic elements and reference to chunks. + struct CompositeText: private std::vector { + typedef std::vector base; + using base::iterator; + using base::begin; + using base::end; + using base::size; + using base::operator[]; + + // Augment this chunk with a basic text in the open interval + // [f,l). + CompositeText& add_text(const char8_t* f, const char8_t* l) { + texts.push_back(BasicText(f, l)); + push_back(&texts.back()); + return *this; + } - private: - std::list texts; - std::list refs; - }; - - // -------------- - // -- Document -- - // -------------- - // A whole document; a sequence of chunks. - struct Document : std::list { - Document(const Memory::FileMapping& file) - : active_chunk(&prose), text_start(file.begin()) { - parse(file); - } + // Augment this chunk with a reference to another chunk + // named `n'. Note that we don't attempt to check for + // possible circularities. + CompositeText reference_chunk(const char8_t* f, const char8_t* l) { + refs.push_back(Reference(std::u8string(f, l))); + push_back(&refs.back()); + return *this; + } - // Return a pointer to a document chunk name `n'. - // Otherwise, return null. - CompositeText* lookup_chunk(const std::string& n) const { - ChunkTable::const_iterator i = defs.find(n); - return i == defs.end() ? 0 : i->second; - } + private: + std::list texts; + std::list refs; + }; + + // -------------- + // -- Document -- + // -------------- + // A whole document; a sequence of chunks. + struct Document : std::list { + Document(const Memory::FileMapping& file) + : active_chunk{&prose}, + text_start{(reinterpret_cast(file.begin()))} + { + parse(file); + } - private: - typedef std::map ChunkTable; - CompositeText prose; // the prose around the chunks. - ChunkTable defs; // chunk definition table. - CompositeText* active_chunk; // chunk under construction. - const Byte* text_start; // begining of current basic text. - - // Append basic text in the range `[text_start,last)' - // to the current chunk. - void finish_chunk(const Byte* last) { - if (text_start != last) - active_chunk->add_text(text_start, last); - active_chunk = &prose; - text_start = last; - } + // Return a pointer to a document chunk name `n'. + // Otherwise, return null. + CompositeText* lookup_chunk(const std::u8string& n) const { + ChunkTable::const_iterator i = defs.find(n); + return i == defs.end() ? 0 : i->second; + } - // Start a new chunk or extend an existing chunk. - void begin_chunk(const std::string& name, const Byte* start) { - if (CompositeText* chunk = lookup_chunk(name)) - active_chunk = chunk; - else { - push_back(CompositeText()); - defs[name] = active_chunk = &back(); - } - text_start = start; + private: + typedef std::map ChunkTable; + CompositeText prose; // the prose around the chunks. + ChunkTable defs; // chunk definition table. + CompositeText* active_chunk; // chunk under construction. + const char8_t* text_start; // begining of current basic text. + + // Append basic text in the range `[text_start,last)' + // to the current chunk. + void finish_chunk(const char8_t* last) { + if (text_start != last) + active_chunk->add_text(text_start, last); + active_chunk = &prose; + text_start = last; + } + + // Start a new chunk or extend an existing chunk. + void begin_chunk(const std::u8string& name, const char8_t* start) { + if (CompositeText* chunk = lookup_chunk(name)) + active_chunk = chunk; + else { + push_back(CompositeText()); + defs[name] = active_chunk = &back(); } + text_start = start; + } - // Parse a file mapping into this document. - void parse(const Memory::FileMapping&); - }; + // Parse a file mapping into this document. + void parse(const Memory::FileMapping&); + }; - // Return true if the character `c' introduces a newline. - static inline bool - looking_at_newline(char c) { - return c == '\n' or c == '\r'; - } + // Return true if the character `c' introduces a newline. + static inline bool + looking_at_newline(char8_t c) { + return c == u8'\n' or c == u8'\r'; + } - // Attempt to advance the cursor past newline marker. - // Return true on sucess. - static bool - saw_newline(const Byte*& cur, const Byte* end) { - if (*cur == '\n') { + // Attempt to advance the cursor past newline marker. + // Return true on sucess. + static bool + saw_newline(const char8_t*& cur, const char8_t* end) { + if (*cur == u8'\n') { + ++cur; + return true; + } + else if (*cur == u8'\r') { + if (++cur < end and *cur == u8'\n') ++cur; - return true; - } - else if (*cur == '\r') { - if (++cur < end and *cur == '\n') - ++cur; - return true; - } - return false; + return true; } + return false; + } - // Move `cur' to end of line or `end', whichever comes first. - // Return true if the area swept consisted only of blank characters. - static inline bool - trailing_blank(const Byte*& cur, const Byte* end) { - bool result = true; - for (; cur < end and not saw_newline(cur, end); ++cur) - result = isspace(*cur); - return result; - } + // Move `cur' to end of line or `end', whichever comes first. + // Return true if the area swept consisted only of blank characters. + static inline bool + trailing_blank(const char8_t*& cur, const char8_t* end) { + bool result = true; + for (; cur < end and not saw_newline(cur, end); ++cur) + result = isspace(*cur); + return result; + } - // Attempt to advance `cur' past the double left angle brackets - // starting a chunk name. Returm true on success. - static bool - chunk_name_began(const Byte*& cur, const Byte* end) { - if (cur[0] == '<' and cur + 1 < end and cur[1] == '<') { - cur += 2; - return true; - } - return false; + // Attempt to advance `cur' past the double left angle brackets + // starting a chunk name. Returm true on success. + static bool + chunk_name_began(const char8_t*& cur, const char8_t* end) { + if (cur[0] == u8'<' and cur + 1 < end and cur[1] == u8'<') { + cur += 2; + return true; } + return false; + } - // Attempt to move `cur' past the double right angle brackets - // terminating a chunk name. Returm true on success. - static bool - chunk_name_ended(const Byte*& cur, const Byte* end) { - if (cur[0] == '>' and cur + 1 < end and cur[1] == '>') { - cur += 2; - return true; - } - return false; + // Attempt to move `cur' past the double right angle brackets + // terminating a chunk name. Returm true on success. + static bool + chunk_name_ended(const char8_t*& cur, const char8_t* end) { + if (cur[0] == u8'>' and cur + 1 < end and cur[1] == u8'>') { + cur += 2; + return true; } + return false; + } - // We've just seen the start of a chunk reference; skip - // characters till we seen of the chunk's name. - static void - skip_to_end_of_chunk_name(const Byte*& cur, const Byte* end) { - while (cur < end) { - if (looking_at_newline(*cur) - or (cur + 1 < end and cur[0] == '>' and cur[1] == '>')) - return; - ++cur; - } + // We've just seen the start of a chunk reference; skip + // characters till we seen of the chunk's name. + static void + skip_to_end_of_chunk_name(const char8_t*& cur, const char8_t* end) { + while (cur < end) { + if (looking_at_newline(*cur) + or (cur + 1 < end and cur[0] == u8'>' and cur[1] == u8'>')) + return; + ++cur; } + } - // Move the cursor until end of line. - static void - skip_to_end_of_line(const Byte*& cur, const Byte* end) { - while (cur < end) { - if (saw_newline(cur, end)) - break; - ++cur; - } + // Move the cursor until end of line. + static void + skip_to_end_of_line(const char8_t*& cur, const char8_t* end) { + while (cur < end) { + if (saw_newline(cur, end)) + break; + ++cur; } - - void - Document::parse(const Memory::FileMapping& file) { - auto cur = text_start; - auto last = file.end(); - // Process one line at a time. - while (cur < last) { - // 1. `@' ends previous chunk - if (*cur == '@') { - auto p = cur; - if (trailing_blank(++cur, last)) - finish_chunk(p); - } - // 2. `<<' introduces a chunk reference or a chunk definition. - else if (chunk_name_began(cur, last)) { - auto label_start = cur; - skip_to_end_of_chunk_name(cur, last); - if (chunk_name_ended(cur, last)) { - auto label_end = cur - 2; - if (cur < last and *cur == '=') { - if (trailing_blank(++cur, last)) { - // chunk definition or extension - finish_chunk(label_start - 2); - begin_chunk(std::string(label_start, label_end), cur); - } - } - else if (trailing_blank(cur, last)) { - // This is just a reference to a chunk. - active_chunk->add_text(text_start, label_start - 2); - active_chunk->reference_chunk(label_start, label_end); - text_start = cur; + } + + void + Document::parse(const Memory::FileMapping& file) { + auto cur = text_start; + auto last = reinterpret_cast(file.end()); + // Process one line at a time. + while (cur < last) { + // 1. `@' ends previous chunk + if (*cur == u8'@') { + auto p = cur; + if (trailing_blank(++cur, last)) + finish_chunk(p); + } + // 2. `<<' introduces a chunk reference or a chunk definition. + else if (chunk_name_began(cur, last)) { + auto label_start = cur; + skip_to_end_of_chunk_name(cur, last); + if (chunk_name_ended(cur, last)) { + auto label_end = cur - 2; + if (cur < last and *cur == u8'=') { + if (trailing_blank(++cur, last)) { + // chunk definition or extension + finish_chunk(label_start - 2); + begin_chunk(std::u8string(label_start, label_end), cur); } - else - skip_to_end_of_line(cur, last); } + else if (trailing_blank(cur, last)) { + // This is just a reference to a chunk. + active_chunk->add_text(text_start, label_start - 2); + active_chunk->reference_chunk(label_start, label_end); + text_start = cur; + } + else + skip_to_end_of_line(cur, last); } - else - skip_to_end_of_line(cur, last); } - finish_chunk(cur); + else + skip_to_end_of_line(cur, last); } + finish_chunk(cur); + } - // Capture chunk resolution in a document. - struct resolve_chunk { - resolve_chunk(const std::string& s, const Document& f) - : name(s), doc(f) { } - const std::string name; // name of the chunk - const Document& doc; // document containing the chunk. - }; - - // Print the resolution of a chunk name onto an output stream. - std::ostream& - operator<<(std::ostream& os, const resolve_chunk& rc) { - // FIXME: no attempt at detecting circularities. - const CompositeText* doc = rc.doc.lookup_chunk(rc.name); - if (doc == 0) { - std::cerr << "chunk " << rc.name << " is undefined" << std::endl; + // Capture chunk resolution in a document. + struct resolve_chunk { + resolve_chunk(const std::u8string& s, const Document& f) + : name(s), doc(f) { } + const std::u8string name; // name of the chunk + const Document& doc; // document containing the chunk. + }; + + // Print the resolution of a chunk name onto an output stream. + std::ostream& + operator<<(std::ostream& os, const resolve_chunk& rc) { + // FIXME: no attempt at detecting circularities. + const CompositeText* doc = rc.doc.lookup_chunk(rc.name); + if (doc == 0) { + print(std::cerr << "chunk ", rc.name) << " is undefined" << std::endl; + exit(1); + } + for (std::size_t i = 0; i < doc->size(); ++i) { + const Element* elt = (*doc)[i]; + if (const BasicText* t = dynamic_cast(elt)) + std::copy(t->begin(), t->end(), + std::ostream_iterator(os)); + else if (const Reference* r = dynamic_cast(elt)) + os << resolve_chunk(r->name(), rc.doc); + else { + std::cerr << "unknown document element" << std::endl; exit(1); } - for (std::size_t i = 0; i < doc->size(); ++i) { - const Element* elt = (*doc)[i]; - if (const BasicText* t = dynamic_cast(elt)) - std::copy(t->begin(), t->end(), - std::ostream_iterator(os)); - else if (const Reference* r = dynamic_cast(elt)) - os << resolve_chunk(r->name(), rc.doc); - else { - std::cerr << "unknown document element" << std::endl; - exit(1); - } - } - - return os; } - // Return true if the `arg' is the option named`opt'. - static inline bool - is_option(const char* arg, const char* opt) { - return strcmp(arg, opt) == 0; - } + return os; + } - // `arg' is a argument on the command line. If `arg' - // does not match option name `opt', return null. Otherwise, - // return a pointer to the terminating NUL character if there - // is no specified value for that option, or a pointer to the - // start of the value. - static const char* - is_named_arg(const char* arg, const char* opt) { - const int n = strlen(opt); - int i = 0; - // Get out if argion name does not match. - // Note: Ideally, we could use strncmp(). However, that - // function is not available in C++98, so we cannot depend on it. - for (; i < n ; ++i) - if (arg[i] != opt[i]) - return 0; - - if (arg[i] == '\0') - return arg + i; // no value for the option. - return arg + n + 1; // being of the value. - } + // Return true if the `arg' is the option named`opt'. + static inline bool + is_option(const char* arg, const char* opt) { + return strcmp(arg, opt) == 0; + } + + // `arg' is a argument on the command line. If `arg' + // does not match option name `opt', return null. Otherwise, + // return a pointer to the terminating NUL character if there + // is no specified value for that option, or a pointer to the + // start of the value. + static const char* + is_named_arg(const char* arg, const char* opt) { + const int n = strlen(opt); + int i = 0; + // Get out if argion name does not match. + // Note: Ideally, we could use strncmp(). However, that + // function is not available in C++98, so we cannot depend on it. + for (; i < n ; ++i) + if (arg[i] != opt[i]) + return 0; + + if (arg[i] == '\0') + return arg + i; // no value for the option. + return arg + n + 1; // being of the value. } } @@ -353,9 +361,9 @@ int main(int argc, char* argv[]) { using namespace OpenAxiom::Hammer; int error_count = 0; - const char* chunk = 0; // chunck to tangle - const char* output_path = 0; // path to the output file - const char* input_path = 0; // path to the input file. + const char* chunk = nullptr; // chunck to tangle + const char* output_path = nullptr; // path to the output file + const char* input_path = nullptr; // path to the input file. // 1. Process command line arguments. for (int pos = 1; error_count == 0 and pos < argc; ++pos) { if (const char* val = is_named_arg(argv[pos], "--tangle")) { @@ -407,7 +415,8 @@ main(int argc, char* argv[]) { try { OpenAxiom::Memory::FileMapping file(input_path); std::ofstream os(output_path); - os << resolve_chunk(chunk, Document(file)); + auto what = reinterpret_cast(chunk); + os << resolve_chunk(what, Document(file)); } catch(const OpenAxiom::SystemError& e) { std::cerr << e.message() << std::endl; -- cgit v1.2.3