From 3fb5c44ffacecfc10b6d4caaf5fb4f7a22f763df Mon Sep 17 00:00:00 2001
From: Johannes Janssen
Date: Sun, 2 Mar 2025 23:58:29 +0100
Subject: [PATCH] Tokenizer now supports line numbers

---
 src/faust-lib/parser/faustLexer.cpp | 48 ++++++++++++++++++++++++++++
 src/faust-lib/parser/faustLexer.h   | 48 ++++++++++++++++++++++++++++
 2 files changed, 96 insertions(+)
 create mode 100644 src/faust-lib/parser/faustLexer.cpp
 create mode 100644 src/faust-lib/parser/faustLexer.h

diff --git a/src/faust-lib/parser/faustLexer.cpp b/src/faust-lib/parser/faustLexer.cpp
new file mode 100644
index 0000000..be2df76
--- /dev/null
+++ b/src/faust-lib/parser/faustLexer.cpp
@@ -0,0 +1,48 @@
+#include "faustLexer.h"
+
+namespace faust {
+
+/// Registers every token pattern on the underlying Lexer. TK_EOF is the value
+/// handed to the Lexer constructor.
+///
+/// NOTE(review): how Lexer resolves overlapping patterns is not visible from
+/// this file. Confirm that "1.5", "0x1F" and "0b01" are not cut short by the
+/// earlier \d+ rule, and that the keyword rules win over TK_IDENTIFIER.
+FaustLexer::FaustLexer() : lexer(TK_EOF) {
+  lexer.addToken(u"\\d+", TK_DECIMAL); // Order of operations important
+  lexer.addToken(u"\\d+\\.\\d+", TK_FLOATING_DECIMAL);
+  lexer.addToken(u"0x[0-9A-Fa-f]+", TK_HEXADECIMAL);
+  lexer.addToken(u"0b[01]+", TK_BINARY);
+  lexer.addToken(u"([\"'])(?:\\\\\\1|.)*?\\1", TK_STRING);
+  lexer.addToken(u"\\(", TK_LEFT_ROUND_BRACKET);
+  lexer.addToken(u"\\)", TK_RIGHT_ROUND_BRACKET);
+  lexer.addToken(u"\\[", TK_LEFT_SQUARE_BRACKET);
+  lexer.addToken(u"\\]", TK_RIGHT_SQUARE_BRACKET);
+  lexer.addToken(u"\\{", TK_LEFT_CURLY_BRACKET);
+  lexer.addToken(u"\\}", TK_RIGHT_CURLY_BRACKET);
+  lexer.addToken(u"\\=", TK_ASSIGN);
+  lexer.addToken(u"->", TK_ARROW);
+  lexer.addToken(u":", TK_COLON);
+  lexer.addToken(u";", TK_SEMICOLON);
+  lexer.addToken(u"\\/\\/.*$", TK_COMMENT);
+  lexer.addToken(u"\\bfn\\b", TK_FN);
+  lexer.addToken(u"\\bconst\\b", TK_CONST);
+  lexer.addToken(u"\\bmut\\b", TK_MUT);
+  lexer.addToken(u"\\bimport\\b", TK_IMPORT);
+  lexer.addToken(u"[a-zA-Z][a-zA-Z0-9_]*", TK_IDENTIFIER);
+}
+
+/// Tokenizes `text` and returns the lexemes in the order Lexer yields them.
+///
+/// @param text Input handed to Lexer::setText.
+/// @return Every lexeme read while Lexer::hasNext() is true.
+Vector<Lexer::Lexeme> FaustLexer::lex(const icu::UnicodeString &text) {
+  lexer.setText(text);
+  Vector<Lexer::Lexeme> lexemes;
+  while (lexer.hasNext()) {
+    lexemes.push_back(lexer.next());
+  }
+  return lexemes;
+}
+
+} // namespace faust
diff --git a/src/faust-lib/parser/faustLexer.h b/src/faust-lib/parser/faustLexer.h
new file mode 100644
index 0000000..2887103
--- /dev/null
+++ b/src/faust-lib/parser/faustLexer.h
@@ -0,0 +1,48 @@
+#pragma once
+
+#include "faust-lib/common/vector.h"
+
+#include "faust-lib/parser/lexer.h"
+
+namespace faust {
+
+/// Token kinds that FaustLexer registers with the underlying Lexer.
+enum FaustTokens {
+  TK_EOF = 0,
+  TK_DECIMAL,
+  TK_FLOATING_DECIMAL,
+  TK_HEXADECIMAL,
+  TK_BINARY,
+  TK_STRING,
+  TK_LEFT_ROUND_BRACKET,
+  TK_RIGHT_ROUND_BRACKET,
+  TK_LEFT_SQUARE_BRACKET,
+  TK_RIGHT_SQUARE_BRACKET,
+  TK_LEFT_CURLY_BRACKET,
+  TK_RIGHT_CURLY_BRACKET,
+  TK_ASSIGN,
+  TK_ARROW,
+  TK_COLON,
+  TK_SEMICOLON,
+  TK_COMMENT,
+  TK_FN,
+  TK_CONST,
+  TK_MUT,
+  TK_IMPORT,
+  TK_IDENTIFIER,
+};
+
+/// Lexer preconfigured with the Faust token patterns.
+class FaustLexer {
+public:
+  /// Sets up the token patterns on the wrapped Lexer.
+  FaustLexer();
+
+  /// Tokenizes `text` and returns the resulting lexemes.
+  Vector<Lexer::Lexeme> lex(const icu::UnicodeString &text);
+
+private:
+  Lexer lexer;
+};
+
+} // namespace faust