The middle of the parser

2025-06-29 11:53:50 -07:00
parent 1cfb6f4185
commit 17d7a8bfbe
11 changed files with 652 additions and 58 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 cmake-build-*
 tfc
-.idea
+.idea
+reference
--- a/grammar/t.bnf
+++ b/grammar/t.bnf
@@ -0,0 +1,40 @@
+
+{
+    tokens = [
+        OP_MUL='*'
+        OP_DIV='/'
+        OP_ADD='+'
+        OP_SUB='-'
+        STAT_TERM=';'
+        EXIT="exit"
+        INT="[0-9]+"
+        DEC="[0-9]+\.[0-9]+"
+    ]
+}
+
+letter ::= "A" | "B" | "C" | "D" | "E" | "F" | "G"
+                  | "H" | "I" | "J" | "K" | "L" | "M" | "N"
+                  | "O" | "P" | "Q" | "R" | "S" | "T" | "U"
+                  | "V" | "W" | "X" | "Y" | "Z" | "a" | "b"
+                  | "c" | "d" | "e" | "f" | "g" | "h" | "i"
+                  | "j" | "k" | "l" | "m" | "n" | "o" | "p"
+                  | "q" | "r" | "s" | "t" | "u" | "v" | "w"
+                  | "x" | "y" | "z" ;
+
+num_lit ::= INT | DEC;
+
+l1_operator ::= OP_MUL | OP_DIV;
+l2_operator ::= OP_ADD | OP_SUB;
+math_operator ::= l1_operator | l2_operator;
+
+val ::= expr | num_lit;
+expr_val ::= num_lit;
+
+term ::= (expr_val) l1_operator (expr_val | term);
+expr ::= (expr_val | term) [l2_operator (expr_val | expr | term)];
+
+exit_stat ::= EXIT [val] STAT_TERM;
+
+statement ::= (val STAT_TERM) | exit_stat | STAT_TERM;
+
+grammar ::= {statement};
--- a/install.sh
+++ b/install.sh
@@ -0,0 +1,2 @@
+#!/usr/bin/env sh
+# Copies the TFC binary from the cmake release build directory to ~/local/bin under the name `tcs`.
+# NOTE(review): the destination is ~/local/bin (no leading dot) - confirm this is intended rather than ~/.local/bin.
+cp ./cmake-build-release/TFC ~/local/bin/tcs
--- a/reference/look_at_later.txt
+++ b/reference/look_at_later.txt
@@ -0,0 +1 @@
+LALR(1) parsers, recursive descent parser, EBNF (Extended Backus-Naur Form).
--- a/src/alib/optional_ptr.hpp
+++ b/src/alib/optional_ptr.hpp
@@ -0,0 +1,40 @@
+//
+// Created by skythedragon on 6/25/25.
+//
+
+#pragma once
+#include <memory>
+#include <utility>
+
+namespace skythedragon
+{
+    // Tag type used to construct or reset an optional_ptr to the empty state.
+    struct null_optional_ptr
+    {
+        void* ptr = nullptr;
+    };
+
+    // Tag value: `return null_opt;` or `p = null_opt;` produces an empty optional_ptr.
+    inline constexpr null_optional_ptr null_opt;
+
+    // Move-only, nullable owner of a single heap-allocated T.
+    // Thin wrapper around std::unique_ptr<T> with optional-like helpers.
+    template <typename T>
+    class optional_ptr
+    {
+        std::unique_ptr<T> ptr_;
+
+    public:
+        // Empty (null) pointer.
+        optional_ptr() : ptr_(nullptr) {}
+
+        // Takes ownership of an existing allocation (may be null).
+        explicit optional_ptr(std::unique_ptr<T> ptr) : ptr_(std::move(ptr)) {}
+
+        // Constructs a new T in place from `args`.
+        // Arguments are perfectly forwarded so move-only and large arguments are not copied.
+        template <typename... Args>
+        explicit optional_ptr(Args&&... args) : ptr_(std::make_unique<T>(std::forward<Args>(args)...)) {}
+
+        // Allocates a copy of `instance`.
+        optional_ptr(T& instance) : ptr_(std::make_unique<T>(instance)) {}
+
+        // Empty (null) pointer, spelled via the null_opt tag.
+        optional_ptr(null_optional_ptr) : ptr_(nullptr) {}
+
+        // True when no object is held.
+        bool is_null() const { return ptr_ == nullptr; }
+
+        // Access to the held object. Precondition: !is_null().
+        T& operator*() const { return *ptr_; }
+        T* operator->() const { return ptr_.get(); }
+
+        // Replaces the held object with `ptr` (the previous object, if any, is destroyed).
+        optional_ptr& operator=(std::unique_ptr<T> ptr) { ptr_ = std::move(ptr); return *this; }
+
+        // Stores a copy of `t`. Allocates when currently empty instead of
+        // dereferencing a null pointer; otherwise assigns into the held object.
+        optional_ptr& operator=(T& t)
+        {
+            if (ptr_)
+            {
+                *ptr_ = t;
+            }
+            else
+            {
+                ptr_ = std::make_unique<T>(t);
+            }
+            return *this;
+        }
+
+        // Resets to the empty state, destroying the held object if there is one.
+        optional_ptr& operator=(null_optional_ptr) { ptr_.reset(); return *this; }
+
+        // True when an object is held, so `if (p)` / `if (!p)` read the same way
+        // as they do for std::unique_ptr and std::optional.
+        operator bool() const { return ptr_ != nullptr; }
+
+        // Gives up ownership; this optional_ptr is empty afterwards.
+        std::unique_ptr<T> release() { return std::move(ptr_); }
+    };
+}
--- a/src/lexer/Lexer.cpp
+++ b/src/lexer/Lexer.cpp
@@ -6,7 +6,16 @@

 #include <iostream>

-inline std::string process_octal_escape(std::ifstream& file, char c)
+// Reports whether `c` is one of the bracket characters ( ) { } [ ] < >.
+static bool is_bracket(const int c)
+{
+    switch (c)
+    {
+    case '(':
+    case ')':
+    case '{':
+    case '}':
+    case '[':
+    case ']':
+    case '<':
+    case '>':
+        return true;
+    default:
+        return false;
+    }
+}
+
+static std::string process_octal_escape(std::ifstream& file, char c)
 {
    std::string oct;
    oct += c;
@@ -35,7 +44,7 @@ inline std::string process_octal_escape(std::ifstream& file, char c)
    }
 }

-inline char process_hex_escape(std::ifstream& file)
+static char process_hex_escape(std::ifstream& file)
 {
    std::string hex;

@@ -68,13 +77,13 @@ void Lexer::lex()
    {
        char c = file_.get();

-        if (isalpha(c))
+        if (std::isalpha(c))
        {
            std::string lexeme;

            lexeme += c;

-            while (!std::isspace(file_.peek())  && file_.peek() != EOF)
+            while (std::isalnum(file_.peek()) || file_.peek() == '_')
            {
                lexeme += file_.get();
            }
@@ -87,7 +96,7 @@ void Lexer::lex()
            }
        }

-        if (isdigit(c))
+        if (std::isdigit(c))
        {
            std::string lexeme;
            lexeme += c;
@@ -113,7 +122,7 @@ void Lexer::lex()

            c = file_.get();

-            if (isalpha(c))
+            if (std::isalpha(c))
            {
                std::cerr << "ERR: Literal operators are not yet supported.\n";
                exit(1);
@@ -126,7 +135,7 @@ void Lexer::lex()
            }

            if (isnum) {
-                tokens_.emplace_back(TokenType::number, lexeme);
+                tokens_.emplace_back(TokenType::decimal, lexeme);
            } else {
                tokens_.emplace_back(TokenType::integer, lexeme);
            }
@@ -152,6 +161,11 @@ void Lexer::lex()

                if (c == '\\')
                {
+                    if (file_.peek() == EOF)
+                    {
+                        std::cerr << "ERR: Unterminated string\n";
+                    }
+
                    c = file_.get();

                    switch (c)
@@ -218,5 +232,117 @@ void Lexer::lex()
        {
            tokens_.emplace_back(TokenType::statement_term);
        }
+
+        if (is_bracket(c))
+        {
+            std::cerr << "ERR: None of these things are yet supported.\n";
+            exit(1);
+        }
+
+        if (c == '+')
+            tokens_.emplace_back(TokenType::op_plus);
+
+        if (c == '-')
+            tokens_.emplace_back(TokenType::op_minus);
+
+        if (c == '*')
+            tokens_.emplace_back(TokenType::op_multiply);
+
+        if (c == '/')
+            tokens_.emplace_back(TokenType::op_divide);
    }
 }
+
+// Returns true when the position `in` tokens ahead of the cursor is at or past
+// the end of the token list, i.e. when peek(in) would have nothing to return.
+bool Lexer::done(const size_t in)
+{
+    // `>=` rather than `==`: a look-ahead that overshoots the end by more than
+    // one position must still count as done, otherwise peek() indexes past
+    // the end of tokens_.
+    return current_token_ + in >= tokens_.size();
+}
+
+
+// Returns a copy of the token under the cursor and advances the cursor by one.
+// No bounds check is performed here.
+Token Lexer::consume()
+{
+    return tokens_[current_token_++];
+}
+
+// Returns a copy of the token `ahead` positions past the cursor without moving it.
+// No bounds check is performed here.
+Token Lexer::peek(const size_t ahead)
+{
+    const auto index = current_token_ + ahead;
+    return tokens_[index];
+}
+
+
+// Rewinds the token cursor to the first token.
+void Lexer::reset()
+{
+    current_token_ = 0;
+}
+
+// Moves the token cursor back by `by` tokens.
+// NOTE(review): there is no underflow check; the cursor is an unsigned index,
+// so `by` must not exceed the number of tokens already consumed.
+void Lexer::backtrack(const size_t by)
+{
+    current_token_ -= by;
+}
+
+// Renders a token as its type name, followed by ": <lexeme>" when the token carries a lexeme.
+std::string to_string(const Token& token)
+{
+    std::string text = to_string(token.type);
+
+    if (!token.lexeme)
+    {
+        return text;
+    }
+
+    text.append(": ").append(*token.lexeme);
+    return text;
+}
+
+// Returns the printable name of a token type; the name matches the enumerator.
+// Types without a dedicated name are rendered as "error".
+std::string to_string(const TokenType& type)
+{
+    switch (type)
+    {
+    case TokenType::exit:
+        return "exit";
+    case TokenType::decimal:
+        // Was still "number", the enumerator's name before it was renamed to `decimal`.
+        return "decimal";
+    case TokenType::integer:
+        return "integer";
+    case TokenType::op_divide:
+        return "op_divide";
+    case TokenType::string:
+        return "string";
+    case TokenType::op_minus:
+        return "op_minus";
+    case TokenType::op_multiply:
+        return "op_multiply";
+    case TokenType::op_plus:
+        return "op_plus";
+    case TokenType::statement_term:
+        return "statement_term";
+    default:
+        return "error";
+    }
+}
+
+// Streams the to_string() rendering of a token.
+std::ostream& operator<<(std::ostream& os, const Token& token)
+{
+    return os << to_string(token);
+}
+
+// Streams the to_string() rendering of a token type.
+std::ostream& operator<<(std::ostream& os, const TokenType& type)
+{
+    return os << to_string(type);
+}
--- a/src/lexer/Lexer.hpp
+++ b/src/lexer/Lexer.hpp
@@ -13,7 +13,7 @@ enum class TokenType
    statement_term,
    exit,
    integer,
-    number,
+    decimal,
    string,
    op_plus,
    op_minus,
@@ -32,9 +32,21 @@ public:
 #endif
    std::ifstream file_;
    std::vector<Token> tokens_;
+    std::vector<Token>::size_type current_token_;

 public:
    Lexer(std::ifstream& file);

    void lex();
+
+    [[nodiscard]] Token consume();
+    Token peek(size_t ahead = 0);
+    void backtrack(size_t by);
+    bool done(size_t in = 0);
+    void reset();
 };
+
+std::string to_string(const Token& token);
+std::string to_string(const TokenType& type);
+std::ostream& operator<<(std::ostream& os, const Token& token);
+std::ostream& operator<<(std::ostream& os, const TokenType& type);
--- a/src/parser/Parser.cpp
+++ b/src/parser/Parser.cpp
@@ -3,3 +3,292 @@
 //

 #include "Parser.hpp"
+
+#include <iostream>
+#include <cassert>
+
+static bool typeIsMathOperator(TokenType type);
+static bool typeIsLiteral(TokenType type);
+static bool typeIsArithmeticOperator(TokenType type);
+static bool typeIsGeometricOperator(TokenType type);
+
+// Binds the parser to `lexer`; only a reference is stored, so the lexer must outlive the parser.
+Parser::Parser(Lexer& lexer) : lexer_(lexer) {}
+
+// Abandons the innermost parse frame: rewinds the lexer by the number of
+// tokens that frame consumed and discards the frame.
+void Parser::backtrack()
+{
+    const size_t consumed = start_indices_.back();
+    start_indices_.pop_back();
+    lexer_.backtrack(consumed);
+}
+
+// Opens a new parse frame whose consumed-token count starts at zero.
+void Parser::start_parse()
+{
+    start_indices_.emplace_back(0);
+}
+
+// Closes the innermost parse frame successfully: its consumed-token count is
+// added to the enclosing frame so an outer backtrack() also undoes it.
+void Parser::end_parse()
+{
+    const size_t consumed = start_indices_.back();
+    start_indices_.pop_back();
+
+    // The enclosing frame now accounts for the tokens of the closed one.
+    start_indices_.back() += consumed;
+}
+
+// Takes the next token from the lexer and counts it in the innermost parse frame.
+// Prints an error and exits the process when there is no token left.
+Token Parser::consume()
+{
+    if (at_end())
+    {
+        std::cerr << "ERR: expected token, not EOF\n";
+        exit(1);
+    }
+
+    ++start_indices_.back();
+    return lexer_.consume();
+}
+
+// Looks `count` tokens ahead without consuming anything.
+// Returns std::nullopt when the lexer reports that position as done.
+std::optional<Token> Parser::peek(const size_t count) const
+{
+    if (!lexer_.done(count))
+    {
+        return lexer_.peek(count);
+    }
+
+    return std::nullopt;
+}
+
+// Verifies that the token `ahead` positions past the cursor has type `expected`.
+// Nothing is consumed. On EOF or on a type mismatch an error is printed and the
+// process exits with status 1.
+void Parser::expect(TokenType expected, size_t ahead)
+{
+    const std::optional<Token> token = peek(ahead);
+
+    // The EOF case is the *absence* of a token; the previous `!= std::nullopt`
+    // test reported EOF for every real token and dereferenced an empty
+    // optional at actual EOF.
+    if (!token)
+    {
+        std::cerr << "ERR: expected " << expected << ", not EOF\n";
+        exit(1);
+    }
+
+    if (token->type != expected)
+    {
+        std::cerr << "ERR: expected " << expected << ", not " << token->type << '\n';
+        exit(1);
+    }
+}
+
+
+// Forwards to Lexer::done: asks whether the position `in` tokens ahead of the
+// cursor is the end of the token list.
+bool Parser::at_end(const size_t in) const
+{
+    return lexer_.done(in);
+}
+
+// Parses statements until parseStatement() reports that nothing is left,
+// appending each non-empty statement to root_. Afterwards the frame stack is
+// cleared and the lexer is rewound to the first token.
+void Parser::parse()
+{
+    start_parse();
+    while (true) {
+        optional_ptr<Statement> statement = parseStatement();
+
+        // is_null() states the intent directly. The previous `!statement`
+        // depended on optional_ptr::operator bool, which as defined returned
+        // true for a *null* pointer, so the loop stopped on the first valid
+        // statement and dereferenced null at end of input.
+        if (statement.is_null())
+        {
+            break;
+        }
+
+        if (statement->type == StatementType::empty)
+        {
+            std::cout << "WARN: empty statement\n";
+        } else {
+            // NOTE(review): statements is a vector<Statement>, so only the
+            // Statement base part is copied here - confirm that is intended.
+            root_.statements.emplace_back(*statement);
+        }
+    }
+    start_indices_.clear();
+    lexer_.reset();
+}
+
+// Parses one statement.
+// Returns null_opt at end of input, an `empty` statement for a lone ';', or the
+// parsed exit statement. Any other input is reported as an error and the
+// process exits with status 1.
+optional_ptr<Statement> Parser::parseStatement()
+{
+    // Check for end of input before opening a frame so the frame stack stays balanced.
+    if (at_end()) return skythedragon::null_opt;
+
+    start_parse();
+
+    optional_ptr<Statement> result;
+
+    if (peek()->type == TokenType::exit)
+    {
+        result = parseExitStatement().release();
+
+        if (result.is_null())
+        {
+            std::cerr << "ERR: invalid exit statement\n";
+            exit(1);
+        }
+
+        end_parse();
+        return result;
+    }
+
+    if (peek()->type == TokenType::statement_term)
+    {
+        // A lone ';' is the empty statement; it has to be consumed or the
+        // caller would be handed the same token forever.
+        consume();
+
+        std::unique_ptr<Statement> empty = std::make_unique<Statement>();
+        empty->type = StatementType::empty;
+        result = std::move(empty);
+
+        end_parse();
+        return result;
+    }
+
+    // TODO: value statements (`val ;`) are not dispatched from here yet.
+    // Exit like the other parse errors do: returning without consuming a token
+    // would make Parser::parse() loop forever on the same token.
+    std::cerr << "ERR: invalid statement\n";
+    exit(1);
+}
+
+// Parses `exit [value] ;`.
+// A bare `exit;` gets the integer literal "0" as its exit code. Returns
+// null_opt (after rewinding everything this call consumed) when the value
+// cannot be parsed or the terminating ';' is missing. EOF directly after
+// `exit` is a fatal error.
+optional_ptr<ExitStatement> Parser::parseExitStatement()
+{
+    start_parse();
+
+    if (consume().type != TokenType::exit)
+    {
+        assert(false && "parseExitStatement called erroneously, tell the devs, and give them your code, or don't and the problem won't get fixed\n");
+    }
+
+    if (at_end())
+    {
+        std::cerr << "ERR: invalid exit statement, expected semicolon, value or expression not EOF\n";
+        exit(1);
+    }
+
+    // Allocate the node up front: a default-constructed optional_ptr is null
+    // and must not be dereferenced.
+    optional_ptr<ExitStatement> result(std::make_unique<ExitStatement>());
+    result->type = StatementType::exit;
+
+    if (peek()->type == TokenType::statement_term)
+    {
+        consume(); // the ';' belongs to this statement
+        result->exitcode = {ValueType::literal, "0", LiteralType::integer};
+        end_parse();
+        return result;
+    }
+
+    optional_ptr<ValueNode> exitcode = parseValue();
+
+    if (exitcode.is_null())
+    {
+        backtrack();
+        return skythedragon::null_opt;
+    }
+
+    // The statement is only complete once its terminator has been seen.
+    const std::optional<Token> terminator = peek();
+
+    if (!terminator || terminator->type != TokenType::statement_term)
+    {
+        backtrack();
+        return skythedragon::null_opt;
+    }
+
+    consume();
+
+    result->exitcode = std::move(*exitcode);
+
+    end_parse();
+    return result;
+}
+
+// Parses `value ;`.
+// Returns null_opt (after rewinding everything this call consumed) when no
+// value can be parsed or the terminating ';' is missing. A warning is printed
+// on success because the value of such a statement is never used.
+optional_ptr<ValueStatement> Parser::parseValueStatement()
+{
+    start_parse();
+    optional_ptr<ValueNode> value = parseValue();
+
+    if (value.is_null())
+    {
+        backtrack();
+        return skythedragon::null_opt;
+    }
+
+    // peek() is empty at end of input, so test it before reading the type.
+    const std::optional<Token> terminator = peek();
+
+    if (!terminator || terminator->type != TokenType::statement_term)
+    {
+        backtrack();
+        return skythedragon::null_opt;
+    }
+
+    consume(); // the ';' belongs to this statement
+
+    // Allocate the node: a default-constructed optional_ptr is null and must
+    // not be dereferenced.
+    optional_ptr<ValueStatement> result(std::make_unique<ValueStatement>());
+    result->type = StatementType::value;
+    result->value = std::move(*value);
+
+    std::cout << "WARN: expression result unused.\n";
+    end_parse();
+    return result;
+}
+
+
+// Parses a value: an expression when one can be parsed, otherwise a single
+// integer, decimal or string literal.
+// Returns null_opt, with nothing consumed, when the next token starts neither.
+// Calling this at end of input is a fatal error.
+optional_ptr<ValueNode> Parser::parseValue()
+{
+    start_parse();
+
+    if (!peek())
+    {
+        std::cerr << "ERR: expected value-returning expression, but got EOF.\n";
+        exit(1);
+    }
+
+    // Allocate the node up front: a default-constructed optional_ptr is null
+    // and must not be dereferenced.
+    optional_ptr<ValueNode> result(std::make_unique<ValueNode>());
+
+    {
+        optional_ptr<ExpressionNode> expression = parseExpression();
+
+        if (!expression.is_null())
+        {
+            result->type = ValueType::expression;
+            result->expression = expression.release();
+            end_parse();
+            return result;
+        }
+    }
+
+    // Not an expression and not a literal: report "no value" instead of
+    // returning a node whose type was never set.
+    if (!typeIsLiteral(peek()->type))
+    {
+        backtrack();
+        return skythedragon::null_opt;
+    }
+
+    result->type = ValueType::literal;
+
+    switch (peek()->type)
+    {
+        case TokenType::decimal:
+            result->literal_type = LiteralType::decimal;
+            break;
+        case TokenType::integer:
+            result->literal_type = LiteralType::integer;
+            break;
+        case TokenType::string:
+            result->literal_type = LiteralType::string;
+            break;
+        default:
+            backtrack();
+            return skythedragon::null_opt;
+    }
+
+    result->value = *consume().lexeme;
+
+    end_parse();
+    return result;
+}
+
+// Tries to parse an expression starting at the cursor.
+// Only the case "operand followed by '*' or '/'" is handled so far, by
+// delegating to parseTerm(). Every other input, including '+' / '-'
+// expressions that are not implemented yet, yields null_opt with nothing consumed.
+optional_ptr<ExpressionNode> Parser::parseExpression()
+{
+    start_parse();
+
+    // The token after the first operand decides what kind of expression this
+    // is. It may not exist, so it is looked up once and checked before use.
+    std::optional<Token> op;
+
+    if (!at_end())
+    {
+        op = peek(1);
+    }
+
+    if (!op)
+    {
+        backtrack();
+        return skythedragon::null_opt;
+    }
+
+    if (typeIsArithmeticOperator(op->type))
+    {
+        // TODO: additive expressions are not implemented yet.
+        backtrack();
+        return skythedragon::null_opt;
+    }
+
+    if (!typeIsGeometricOperator(op->type))
+    {
+        backtrack();
+        return skythedragon::null_opt;
+    }
+
+    optional_ptr<TermNode> term = parseTerm();
+
+    if (term.is_null())
+    {
+        backtrack();
+        return skythedragon::null_opt;
+    }
+
+    // Allocate the node: a default-constructed optional_ptr is null and must
+    // not be dereferenced.
+    optional_ptr<ExpressionNode> result(std::make_unique<ExpressionNode>());
+    result->sub1_type = SubExpressionType::term;
+    result->sub1_term = term.release();
+
+    end_parse();
+    return result;
+}
+
+// Parses a term (operands joined by '*' or '/').
+// TODO: not implemented yet. Until it is, report "no term" explicitly: flowing
+// off the end of a value-returning function is undefined behaviour.
+optional_ptr<TermNode> Parser::parseTerm()
+{
+    return skythedragon::null_opt;
+}
+
+
+// True for any of the four math operator tokens: + - * /.
+static bool typeIsMathOperator(TokenType type)
+{
+    switch (type)
+    {
+    case TokenType::op_plus:
+    case TokenType::op_minus:
+    case TokenType::op_multiply:
+    case TokenType::op_divide:
+        return true;
+    default:
+        return false;
+    }
+}
+
+// True for the literal token types: integer, decimal and string.
+static bool typeIsLiteral(TokenType type)
+{
+    switch (type)
+    {
+    case TokenType::integer:
+    case TokenType::decimal:
+    case TokenType::string:
+        return true;
+    default:
+        return false;
+    }
+}
+
+// True for the additive operator tokens: + and -.
+bool typeIsArithmeticOperator(TokenType type)
+{
+    switch (type)
+    {
+    case TokenType::op_plus:
+    case TokenType::op_minus:
+        return true;
+    default:
+        return false;
+    }
+}
+
+// True for the multiplicative operator tokens: * and /.
+bool typeIsGeometricOperator(TokenType type)
+{
+    switch (type)
+    {
+    case TokenType::op_multiply:
+    case TokenType::op_divide:
+        return true;
+    default:
+        return false;
+    }
+}
--- a/src/parser/Parser.hpp
+++ b/src/parser/Parser.hpp
@@ -3,16 +3,131 @@
 //

 #pragma once
+#include <memory>
 #include <vector>

 #include "../lexer/Lexer.hpp"
+#include "../alib/optional_ptr.hpp"

-struct ParseNode {
+using skythedragon::optional_ptr;
+
+enum class StatementType
+{
+    value,
+    exit,
+    empty
 };

-struct StatementNode : ParseNode {};
+enum class LiteralType
+{
+    integer,
+    decimal,
+    string
+};

-struct RootNode : ParseNode {};
+// Common base of all statement nodes; `type` records which kind of statement this is.
+// NOTE(review): the parser stores an ExitStatement in an optional_ptr<Statement>,
+// so derived objects are destroyed through this base, which has no virtual
+// destructor - confirm and revisit.
+struct Statement
+{
+    StatementType type;
+};
+
+// Root of the parse result: the statements collected by Parser::parse().
+// NOTE(review): elements are stored by value as Statement, so data that lives
+// only in a derived statement is not kept here - confirm that is intended.
+struct RootNode
+{
+    std::vector<Statement> statements;
+};
+
+enum class SubExpressionType
+{
+    expression,
+    term,
+    value
+};
+
+enum class ValueType
+{
+    literal,
+    expression
+};
+
+// A value used as an operand inside a term or expression.
+struct ExprValueNode
+{
+    ValueType type;                           // literal or expression
+    std::string value;                        // presumably the literal's text, as in ValueNode - TODO confirm
+    std::optional<LiteralType> literal_type;  // literal kind; may be unset
+};
+
+// Node for a term: two operands joined by an operator token.
+// NOTE(review): sub1_* / sub2_* presumably describe the left and right operand,
+// with the matching *_type field saying which member is in use - confirm once
+// parseTerm() is implemented.
+struct TermNode
+{
+    TokenType operator_;
+
+    SubExpressionType sub1_type;
+    SubExpressionType sub2_type;
+
+    // Nested term, owned by this node; may be empty.
+    optional_ptr<TermNode> subterm;
+
+    std::optional<ExprValueNode> sub1_value;
+    std::optional<ExprValueNode> sub2_value;
+};
+
+// Node for an expression: up to two operands joined by an operator token.
+// Each operand can be a term or a value, and a nested expression is also possible.
+// parseExpression() currently fills only sub1_type and sub1_term.
+// NOTE(review): sub1_* / sub2_* presumably describe the left and right operand - confirm.
+struct ExpressionNode
+{
+    TokenType operator_;
+
+    SubExpressionType sub1_type;
+    SubExpressionType sub2_type;
+
+    // Nested expression, owned by this node; may be empty.
+    optional_ptr<ExpressionNode> subexpr;
+
+    optional_ptr<TermNode> sub1_term;
+    optional_ptr<TermNode> sub2_term;
+
+    optional_ptr<ExprValueNode> sub1_value;
+    optional_ptr<ExprValueNode> sub2_value;
+};
+
+// A parsed value: either a literal or an expression, as selected by `type`.
+struct ValueNode
+{
+    ValueType type;
+    std::string value;                        // lexeme of the literal token (set for literals)
+    std::optional<LiteralType> literal_type;  // kind of literal (set for literals)
+    optional_ptr<ExpressionNode> expression;  // owned expression tree (set for expressions)
+};
+
+struct EmptyStatement : Statement {};
+
+struct ExitStatement : Statement
+{
+    ValueNode exitcode;
+};
+
+struct ValueStatement : Statement
+{
+    ValueNode value;
+};

 class Parser {
+#ifdef TEST
+public:
+#endif
+    RootNode root_;                      // statements collected by parse()
+    Lexer& lexer_;                       // token source; referenced, not owned
+    std::vector<size_t> start_indices_;  // tokens consumed per open parse frame
+
+    // Frame and token helpers used by the parse* functions.
+    void backtrack();
+    Token consume();
+    std::optional<Token> peek(size_t count = 0) const;
+    void expect(TokenType expected, size_t ahead = 0);
+    [[nodiscard]] bool at_end(size_t in = 0) const;
+    void start_parse();
+    void end_parse();
+
+    // One function per grammar construct; an empty result means "did not match".
+    [[nodiscard]] optional_ptr<Statement> parseStatement();
+    [[nodiscard]] optional_ptr<ExitStatement> parseExitStatement();
+    [[nodiscard]] optional_ptr<ValueStatement> parseValueStatement();
+    [[nodiscard]] optional_ptr<ValueNode> parseValue();
+    [[nodiscard]] optional_ptr<ExprValueNode> parseExprValue();
+    [[nodiscard]] optional_ptr<ExpressionNode> parseExpression();
+    [[nodiscard]] optional_ptr<TermNode> parseTerm();
+    void flatten();
+
+public:
+    // The constructor and the parse() entry point must be reachable from
+    // outside the class: with every member private, `Parser parser(lexer);`
+    // only compiled when TEST was defined.
+    Parser(Lexer& lexer);
+
+    // Parses the whole token stream of the lexer into root_.
+    void parse();
 };
--- a/src/tfc.cpp
+++ b/src/tfc.cpp
@@ -6,6 +6,7 @@

 #include "alib/colors.h"
 #include "lexer/Lexer.hpp"
+#include "parser/Parser.hpp"

 using str = std::string;

@@ -13,6 +14,8 @@ int main(int argc, char** argv)
 {
    std::vector<str> args;

+    int a = 1 - - 2;
+
    for (int i = 1; i < argc; i++)
    {
        args.emplace_back(argv[i]);
@@ -26,58 +29,17 @@ int main(int argc, char** argv)

    std::ifstream input(args[0]);

-    Lexer lexer(input);
+    Lexer lexer = input;

    lexer.lex();

    for (Token& token : lexer.tokens_)
    {
-        str o;
-
-        switch (token.type)
-        {
-        case TokenType::exit:
-            o = "exit";
-            break;
-        case TokenType::number:
-            o = "number";
-            break;
-        case TokenType::integer:
-            o = "integer";
-            break;
-        case TokenType::op_divide:
-            o = "op_divide";
-            break;
-        case TokenType::string:
-            o = "string";
-            break;
-        case TokenType::op_minus:
-            o = "op_minus";
-            break;
-        case TokenType::op_multiply:
-            o = "op_multiply";
-            break;
-        case TokenType::op_plus:
-            o = "op_plus";
-            break;
-        case TokenType::statement_term:
-            o = "statement_term";
-            break;
-        default:
-            o = "error";
-            break;
-        }
-
-        std::cout << o;
-
-        if (token.lexeme)
-        {
-            std::cout << ": " << *token.lexeme;
-        }
-
-        std::cout << '\n';
+        std::cout << token << '\n';
    }

+    Parser parser(lexer);
+
    std::cout << RESET;

    return 0;
--- a/test.t
+++ b/test.t
@@ -1,2 +1,8 @@

-exit 0;
+exit 2;
+exit;
+2 * 3 + 4 / 3 * 2 - 2;
+exit 2 * 3;
+exit 2 *;
+;
+*
				`@@ -0,0 +1 @@`
				`LALR(1) parsers, recursive descent parser, EBNF (Extended Backus-Naur Form).`