scl/src/lexer.c

#include <ctype.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "include/dstr.h"
#include "include/lexer.h"
#include "include/token.h"
#include "include/util.h"

#include "../build/grammars/grammar.tab.h"

extern YYSTYPE yylval;

Lexer* thelexer = NULL;

// Creates the global lexer (`thelexer`) over `src`.
//
// `src` must be a non-NULL, NUL-terminated string: strlen() is called on it.
// Only the pointer is stored (the text is not copied), so the string has to
// outlive the lexer.
//
// The lexer starts in LEXER_STATE_CONFUSED with an empty, zero-initialised
// token array of TOKENS_MAX slots. If either allocation fails, an error is
// written to stderr and the process exits, rather than dereferencing NULL.
void lexer_init(char* src) {
    thelexer = malloc(sizeof *thelexer);
    if (thelexer == NULL) {
        fprintf(stderr, "lexer_init: out of memory\n");
        exit(EXIT_FAILURE);
    }

    thelexer->src = src;
    thelexer->srcln = strlen(src);
    thelexer->cchar = thelexer->src;

    thelexer->tokens = calloc(TOKENS_MAX, sizeof(Token*));
    if (thelexer->tokens == NULL) {
        free(thelexer);
        thelexer = NULL;
        fprintf(stderr, "lexer_init: out of memory\n");
        exit(EXIT_FAILURE);
    }
    thelexer->ntokens = 0;
    thelexer->state = LEXER_STATE_CONFUSED;

    // %p expects a void pointer.
    log_dbgf("created thelexer @ %p", (void*)thelexer);
}

// Tears down the global lexer: destroys every stored token, releases the token
// array and the Lexer itself, and resets `thelexer` to NULL.
//
// Does not free lexer->src; that string was handed in by the caller of
// lexer_init(). Calling this when no lexer exists is a no-op.
void lexer_destroy() {
    if (thelexer == NULL) return;

    for (int i = 0; i < thelexer->ntokens; i++)
        token_destroy(thelexer->tokens[i]);

    // Both were allocated in lexer_init().
    free(thelexer->tokens);
    free(thelexer);
    thelexer = NULL;
}

// Runs the state machine until the cursor reaches the terminating NUL of the
// source, dispatching to the handler for the current state on every pass.
void lexer_lex() {
    while (*thelexer->cchar) {
        switch (thelexer->state) {
        case LEXER_STATE_CONFUSED: lexer_do_confused(); break;
        case LEXER_STATE_NUM:      lexer_do_number(); break;
        case LEXER_STATE_CALL:     lexer_do_call(); break;
        default:
            // Nothing advances the cursor for an unhandled state, so going
            // around the loop again would spin forever. Stop instead.
            log_dbgf("lexer in unhandled state %d; stopping",
                     (int)thelexer->state);
            return;
        }
    }
}

// Decides what kind of token starts at the cursor.
//
// Skips any run of whitespace first. If that reaches the end of the source,
// returns without producing a token. Otherwise switches to number mode when
// the cursor is on a digit, or to call mode for anything else, and runs that
// handler immediately.
void lexer_do_confused() {
    log_dbgf("lexer @ %p entered confused mode @ char '%c' (%d)",
             (void*)thelexer, *thelexer->cchar, (int)*thelexer->cchar);

    // <ctype.h> functions need an unsigned char value; a plain char may be
    // negative. isspace('\0') is false, so this stops at the terminator.
    while (isspace((unsigned char)*thelexer->cchar)) lexer_inc();

    // Only whitespace was left: do not emit an empty call token.
    if (*thelexer->cchar == '\0') return;

    if (isdigit((unsigned char)*thelexer->cchar)) {
        thelexer->state = LEXER_STATE_NUM;
        lexer_do_number();
    } else {
        thelexer->state = LEXER_STATE_CALL;
        lexer_do_call();
    }
}

// Consumes the run of decimal digits at the cursor, copies it into a freshly
// allocated NUL-terminated string and adds it as a TOKEN_TYPE_NUMBER token.
// Puts the lexer back into LEXER_STATE_CONFUSED afterwards.
//
// Exits the process with a message on stderr if the copy cannot be allocated.
void lexer_do_number() {
    log_dbgf("lexer @ %p entered number mode @ char '%c' (%d)",
             (void*)thelexer, *thelexer->cchar, (int)*thelexer->cchar);

    // Where the number string starts.
    char* start = thelexer->cchar;

    // Length of the number string.
    size_t numln = 0;

    // isdigit() needs an unsigned char value; it is false for '\0', so the
    // loop also stops at the end of the source.
    while (isdigit((unsigned char)*thelexer->cchar)) {
        lexer_inc();
        numln++;
    }

    char* num = malloc(numln + 1);
    if (num == NULL) {
        fprintf(stderr, "lexer_do_number: out of memory\n");
        exit(EXIT_FAILURE);
    }
    memcpy(num, start, numln);
    num[numln] = '\0';

    lexer_add_token(token_init(TOKEN_TYPE_NUMBER, num, numln));
    thelexer->state = LEXER_STATE_CONFUSED;
}

// Consumes characters from the cursor up to (not including) the next digit,
// whitespace character or end of source, copies them into a freshly allocated
// NUL-terminated string and adds it as a TOKEN_TYPE_CALL token. Puts the lexer
// back into LEXER_STATE_CONFUSED afterwards.
//
// Exits the process with a message on stderr if the copy cannot be allocated.
void lexer_do_call() {
    log_dbgf("lexer @ %p entered call mode @ char '%c' (%d)",
             (void*)thelexer, *thelexer->cchar, (int)*thelexer->cchar);

    // Where the call string starts.
    char* start = thelexer->cchar;

    // Size of the call string.
    size_t callln = 0;

    // <ctype.h> functions need an unsigned char value; a plain char may be
    // negative.
    while (*thelexer->cchar != '\0' &&
           !isdigit((unsigned char)*thelexer->cchar) &&
           !isspace((unsigned char)*thelexer->cchar)) {
        lexer_inc();
        callln++;
    }

    char* call = malloc(callln + 1);
    if (call == NULL) {
        fprintf(stderr, "lexer_do_call: out of memory\n");
        exit(EXIT_FAILURE);
    }
    memcpy(call, start, callln);
    call[callln] = '\0';

    lexer_add_token(token_init(TOKEN_TYPE_CALL, call, callln));

    thelexer->state = LEXER_STATE_CONFUSED;
}

// Moves the global lexer's cursor forward by one character.
void lexer_inc() {
    thelexer->cchar++;
}

// Appends `token` to the global lexer's token array.
//
// The array accepts tokens only while ntokens < TOKENS_MAX - 1, so the final
// slot is never written.
// NOTE(review): presumably the last slot is kept as a NULL terminator (the
// array comes from calloc) — confirm before widening the bound.
//
// A token that does not fit is destroyed here instead of being silently
// dropped: nothing else would hold a reference to it, so it would leak.
void lexer_add_token(Token* token) {
    assert(thelexer->ntokens < TOKENS_MAX);

    if (thelexer->ntokens >= TOKENS_MAX - 1) {
        log_dbgf("token array full (max: %d); dropping token",
                 (int)TOKENS_MAX);
        if (token != NULL) token_destroy(token);
        return;
    }

    thelexer->tokens[thelexer->ntokens] = token;
    thelexer->ntokens++;

    log_dbgf("added token (total: %ld)", thelexer->ntokens);
}

// Prints the global lexer by calling lexer_print_i() with an indent level of 0.
void lexer_print() {
    const int top_level = 0;
    lexer_print_i(top_level);
}

// Prints the global lexer's fields (state, srcln, src, cchar, ntokens and the
// token list) through the project's INDENT_* helper macros.
//
// `ilvl` is handed to INDENT_BEGIN — presumably the indentation level to print
// at; confirm against the macro definitions.
void lexer_print_i(int ilvl) {
    INDENT_BEGIN(ilvl);
    INDENT_TITLE("Lexer", thelexer);
    // The state is written by lexerstate_print_raw() between the NONL
    // start/end macros instead of through a printf-style format argument.
    INDENT_FIELD_NONL_START("state")
    lexerstate_print_raw();
    INDENT_FIELD_NONL_END
    INDENT_FIELD("srcln", "%ld", thelexer->srcln);
    INDENT_FIELD_NL("src", "\"%s\"", thelexer->src);
    // cchar is dereferenced: this shows the character under the cursor.
    INDENT_FIELD("cchar", "'%c'", *thelexer->cchar);
    INDENT_FIELD("ntokens", "%ld", thelexer->ntokens);
    // Each of the ntokens entries is printed with token_print_i.
    INDENT_FIELD_LIST("tokens", thelexer->tokens, thelexer->ntokens,
                      token_print_i);
}

// Prints the name of the global lexer's current state to stdout, with no
// trailing newline. A value outside 0..LEXER_STATE_MAX is printed as
// "Unknown (<value>)" and reported through log_dbgf instead of being used to
// index lexerstate_names.
void lexerstate_print_raw() {
    LexerState s = thelexer->state;
    // The int cast keeps the lower-bound check meaningful whether the enum's
    // underlying type is signed or unsigned.
    if ((int)s < 0 || s > LEXER_STATE_MAX) {
        printf("Unknown (%d)", (int)s);
        // Report the LexerState bound, not the token-type one.
        log_dbgf("%d is not a valid LexerState (max: %d)", (int)s,
                 (int)LEXER_STATE_MAX);
    } else printf("%s", lexerstate_names[s]);
}

// Token source for the generated parser: returns the next token found at the
// global lexer's cursor.
//
// Returns YYEOF at the end of the source, NUM for a run of decimal digits
// (with the value stored in yylval.intval), PLUS for '+', and CALL for any
// other character. Only spaces and tabs are skipped as whitespace.
int yylex() {
    // Skip all whitespace.
    while (*thelexer->cchar == ' ' || *thelexer->cchar == '\t')
        thelexer->cchar++;

    // Test for end of input only after skipping blanks. Otherwise trailing
    // whitespace would make the code below consume the terminating NUL and
    // leave the cursor past the end of the buffer.
    if (*thelexer->cchar == '\0') return YYEOF;

    // Assign & consume current character. Going through unsigned char keeps
    // the value valid for <ctype.h> even when plain char is signed.
    int c = (unsigned char)*thelexer->cchar++;

    // Check for NUM.
    if (isdigit(c)) {
        int value = c - '0';
        while (isdigit((unsigned char)*thelexer->cchar)) {
            int digit = *thelexer->cchar - '0';

            // Accumulate value, saturating at INT_MAX: signed overflow is
            // undefined behaviour. The remaining digits are still consumed.
            if (value > (INT_MAX - digit) / 10) value = INT_MAX;
            else value = value * 10 + digit;

            thelexer->cchar++;
        }
        yylval.intval = value; // Set the token value.
        return NUM;
    }

    // Every path through the switch returns, so nothing may follow it.
    switch (c) {
    case '+': return PLUS;
    default:  return CALL;
    }
}
// Writes the message `s`, followed by a newline, to stderr.
void yyerror(char const* s) {
    FILE* out = stderr;
    fprintf(out, "%s\n", s);
}
Fixed some things, broke others. 2024-10-13 23:46:03 -04:00			`#include <ctype.h>`
Changes were made. 2024-11-09 04:37:56 -05:00			`#include <limits.h>`
Something. 2024-10-19 09:09:37 -04:00			`#include <stdio.h>`
Fixed some things, broke others. 2024-10-13 23:46:03 -04:00			`#include <string.h>`

Something. 2024-10-19 09:09:37 -04:00			`#include "include/dstr.h"`
Changes were made. 2024-11-09 04:37:56 -05:00			`#include "include/lexer.h"`
Fixed print formatting. 2024-10-31 12:52:39 -04:00			`#include "include/token.h"`
Added dynamic strings. They do not yet work. 2024-10-10 16:09:25 -04:00			`#include "include/util.h"`
Beginnings of the lexer. 2024-10-02 17:57:04 -04:00
Grammars are broken. 2024-11-09 10:27:03 -05:00			`#include "../build/grammars/grammar.tab.h"`

			`extern YYSTYPE yylval;`

Worked on yylex(). 2024-11-02 11:02:18 -04:00			`Lexer* thelexer = NULL;`
Added .clang-format. 2024-10-02 21:04:54 -04:00
Worked on yylex(). 2024-11-02 11:02:18 -04:00			`void lexer_init(char* src) {`
			`thelexer = malloc(sizeof(Lexer));`

			`thelexer->src = src;`
			`thelexer->srcln = strlen(src);`
			`thelexer->cchar = thelexer->src;`
Beginnings of the lexer. 2024-10-02 17:57:04 -04:00
Worked on yylex(). 2024-11-02 11:02:18 -04:00			`thelexer->tokens = calloc(TOKENS_MAX, sizeof(Token*));`
			`thelexer->ntokens = 0;`
			`thelexer->state = LEXER_STATE_CONFUSED;`
Fixed some things, broke others. 2024-10-13 23:46:03 -04:00
Worked on yylex(). 2024-11-02 11:02:18 -04:00			`log_dbgf("created thelexer @ %p", thelexer);`
Beginnings of the lexer. 2024-10-02 17:57:04 -04:00			`}`

Global lexer. 2024-11-07 19:41:14 -05:00			`void lexer_destroy() {`
Something. 2024-10-31 16:44:17 -04:00			`// Does not free lexer->src.`
Changes were made. 2024-11-09 04:37:56 -05:00			`for (int i = 0; i < thelexer->ntokens; i++)`
			`token_destroy(thelexer->tokens[i]);`
Beginnings of the lexer. 2024-10-02 17:57:04 -04:00			`}`

Global lexer. 2024-11-07 19:41:14 -05:00			`void lexer_lex() {`
			`while (*thelexer->cchar) {`
			`switch (thelexer->state) {`
			`case LEXER_STATE_CONFUSED: lexer_do_confused(); break;`
			`case LEXER_STATE_NUM: lexer_do_number(); break;`
			`case LEXER_STATE_CALL: lexer_do_call(); break;`
Added .clang-format. 2024-10-02 21:04:54 -04:00			`default: break;`
Beginnings of the lexer. 2024-10-02 17:57:04 -04:00			`}`
			`}`
			`}`
Tests are slightly not. 2024-10-05 09:24:12 -04:00
Global lexer. 2024-11-07 19:41:14 -05:00			`void lexer_do_confused() {`
			`log_dbgf("lexer @ %p entered confused mode @ char '%c' (%d)", thelexer,`
			`thelexer->cchar, (int)thelexer->cchar);`
Finished initial AST structures. Call and ints (nums). 2024-10-26 10:07:33 -04:00
Global lexer. 2024-11-07 19:41:14 -05:00			`if (isspace(*thelexer->cchar)) lexer_inc();`
Fixed some things, broke others. 2024-10-13 23:46:03 -04:00
Global lexer. 2024-11-07 19:41:14 -05:00			`if (isdigit(*thelexer->cchar)) {`
			`thelexer->state = LEXER_STATE_NUM;`
			`lexer_do_number();`
Something. 2024-10-19 09:09:37 -04:00			`} else {`
Global lexer. 2024-11-07 19:41:14 -05:00			`thelexer->state = LEXER_STATE_CALL;`
			`lexer_do_call();`
Something. 2024-10-19 09:09:37 -04:00			`}`
Changed things. 2024-10-07 11:48:53 -04:00			`}`

Global lexer. 2024-11-07 19:41:14 -05:00			`void lexer_do_number() {`
			`log_dbgf("lexer @ %p entered number mode @ char '%c' (%d)", thelexer,`
			`thelexer->cchar, (int)thelexer->cchar);`
Fixed some things, broke others. 2024-10-13 23:46:03 -04:00
Fixed some printing errors, introduced many more. 2024-10-19 10:59:05 -04:00			`// Length of the number string.`
			`size_t numln;`
Changed things. 2024-10-07 11:48:53 -04:00
			`// Where the number string starts.`
Global lexer. 2024-11-07 19:41:14 -05:00			`char* start = thelexer->cchar;`
Changed things. 2024-10-07 11:48:53 -04:00
Global lexer. 2024-11-07 19:41:14 -05:00			`for (numln = 0; thelexer->cchar && isdigit(thelexer->cchar); numln++)`
			`lexer_inc();`
Changed things. 2024-10-07 11:48:53 -04:00
Fixed some printing errors, introduced many more. 2024-10-19 10:59:05 -04:00			`char* num = malloc(numln + 1);`
			`memcpy(num, start, numln);`
			`num[numln] = '\0';`
Changed things. 2024-10-07 11:48:53 -04:00
Global lexer. 2024-11-07 19:41:14 -05:00			`lexer_add_token(token_init(TOKEN_TYPE_NUMBER, num, numln));`
			`thelexer->state = LEXER_STATE_CONFUSED;`
Changed things. 2024-10-07 11:48:53 -04:00			`}`

Global lexer. 2024-11-07 19:41:14 -05:00			`void lexer_do_call() {`
			`log_dbgf("lexer @ %p entered call mode @ char '%c' (%d)", thelexer,`
			`thelexer->cchar, (int)thelexer->cchar);`
Fixed some things, broke others. 2024-10-13 23:46:03 -04:00
Changed things. 2024-10-07 11:48:53 -04:00			`// Size of the call string.`
Fixed some printing errors, introduced many more. 2024-10-19 10:59:05 -04:00			`size_t callln;`
Changed things. 2024-10-07 11:48:53 -04:00
			`// Where the call string starts.`
Global lexer. 2024-11-07 19:41:14 -05:00			`char* start = thelexer->cchar;`
Changed things. 2024-10-07 11:48:53 -04:00
Changes were made. 2024-11-09 04:37:56 -05:00			`for (callln = 0; *thelexer->cchar &&`
			`(!isdigit(thelexer->cchar) && !isspace(thelexer->cchar));`
Finished initial AST structures. Call and ints (nums). 2024-10-26 10:07:33 -04:00			`callln++)`
Global lexer. 2024-11-07 19:41:14 -05:00			`lexer_inc();`
Changed things. 2024-10-07 11:48:53 -04:00
Fixed some printing errors, introduced many more. 2024-10-19 10:59:05 -04:00			`char* call = malloc(callln + 1);`
			`memcpy(call, start, callln);`
			`call[callln] = '\0';`
Added dynamic strings. They do not yet work. 2024-10-10 16:09:25 -04:00
Global lexer. 2024-11-07 19:41:14 -05:00			`lexer_add_token(token_init(TOKEN_TYPE_CALL, call, callln));`
Something. 2024-10-19 09:09:37 -04:00
Global lexer. 2024-11-07 19:41:14 -05:00			`thelexer->state = LEXER_STATE_CONFUSED;`
Changed things. 2024-10-07 11:48:53 -04:00			`}`

Changes were made. 2024-11-09 04:37:56 -05:00			`void lexer_inc() { thelexer->cchar += sizeof(char); }`
Changed things. 2024-10-07 11:48:53 -04:00
Global lexer. 2024-11-07 19:41:14 -05:00			`void lexer_add_token(Token* token) {`
			`assert(thelexer->ntokens < TOKENS_MAX);`
Changed things. 2024-10-07 11:48:53 -04:00
Global lexer. 2024-11-07 19:41:14 -05:00			`if (thelexer->ntokens < TOKENS_MAX - 1) {`
			`thelexer->tokens[thelexer->ntokens] = token;`
			`thelexer->ntokens++;`
Fixed some printing errors, introduced many more. 2024-10-19 10:59:05 -04:00
Global lexer. 2024-11-07 19:41:14 -05:00			`log_dbgf("added token (total: %ld)", thelexer->ntokens);`
Changed things. 2024-10-07 11:48:53 -04:00			`}`
			`}`

Global lexer. 2024-11-07 19:41:14 -05:00			`void lexer_print() { lexer_print_i(0); }`
Fixed some printing errors, introduced many more. 2024-10-19 10:59:05 -04:00
Global lexer. 2024-11-07 19:41:14 -05:00			`void lexer_print_i(int ilvl) {`
Fixed more printing, added basic ast. 2024-10-25 11:20:07 -04:00			`INDENT_BEGIN(ilvl);`
Global lexer. 2024-11-07 19:41:14 -05:00			`INDENT_TITLE("Lexer", thelexer);`
Fixed more print formatting. 2024-10-31 16:05:04 -04:00			`INDENT_FIELD_NONL_START("state")`
Changes were made. 2024-11-09 04:37:56 -05:00			`lexerstate_print_raw();`
Fixed more print formatting. 2024-10-31 16:05:04 -04:00			`INDENT_FIELD_NONL_END`
Global lexer. 2024-11-07 19:41:14 -05:00			`INDENT_FIELD("srcln", "%ld", thelexer->srcln);`
			`INDENT_FIELD_NL("src", "\"%s\"", thelexer->src);`
			`INDENT_FIELD("cchar", "'%c'", *thelexer->cchar);`
			`INDENT_FIELD("ntokens", "%ld", thelexer->ntokens);`
Changes were made. 2024-11-09 04:37:56 -05:00			`INDENT_FIELD_LIST("tokens", thelexer->tokens, thelexer->ntokens,`
			`token_print_i);`
Fixed some printing errors, introduced many more. 2024-10-19 10:59:05 -04:00			`}`
Something. 2024-10-19 09:09:37 -04:00
Global lexer. 2024-11-07 19:41:14 -05:00			`void lexerstate_print_raw() {`
			`LexerState s = thelexer->state;`
Fixed more print formatting. 2024-10-31 16:05:04 -04:00			`if (s > LEXER_STATE_MAX) {`
			`printf("Unknown (%d)", s);`
			`log_dbgf("%d is not a valid LexerState (max: %d)", s, TOKEN_TYPE_MAX);`
			`} else printf("%s", lexerstate_names[s]);`
Something. 2024-10-19 09:09:37 -04:00			`}`
Begin bison-based parsing. 2024-11-02 10:31:55 -04:00
Grammars are broken. 2024-11-09 10:27:03 -05:00			`int yylex() {`
Worked on yylex(). 2024-11-02 11:02:18 -04:00			`if (*thelexer->cchar == '\0') return YYEOF;`

Grammars are broken. 2024-11-09 10:27:03 -05:00			`// Skip all whitespace.`
			`while (thelexer->cchar == ' ' \|\| thelexer->cchar == '\t')`
			`thelexer->cchar++;`
Worked on yylex(). 2024-11-02 11:02:18 -04:00
			`// Assign & consume current character.`
			`int c = *thelexer->cchar++;`

Grammars are broken. 2024-11-09 10:27:03 -05:00			`// Check for NUM.`
Worked on yylex(). 2024-11-02 11:02:18 -04:00			`if (isdigit(c)) {`
Grammars are broken. 2024-11-09 10:27:03 -05:00			`int value = c - '0';`
Worked on yylex(). 2024-11-02 11:02:18 -04:00			`while (isdigit(*thelexer->cchar)) {`
Grammars are broken. 2024-11-09 10:27:03 -05:00			`value = value * 10 + (*thelexer->cchar - '0'); // Accumulate value.`
			`thelexer->cchar++;`
Worked on yylex(). 2024-11-02 11:02:18 -04:00			`}`
Grammars are broken. 2024-11-09 10:27:03 -05:00			`yylval.intval = value; // Set the token value.`
			`return NUM;`
			`}`

			`switch (c) {`
			`case '+': return PLUS;`
			`default: return CALL;`
Worked on yylex(). 2024-11-02 11:02:18 -04:00			`}`

			`fprintf(stderr, "Unexpected character: %c\n", c);`

			`return 0;`
Begin bison-based parsing. 2024-11-02 10:31:55 -04:00			`}`
Things. 2024-11-09 11:09:57 -05:00			`void yyerror(char const* s) { fprintf(stderr, "%s\n", s); }`