scl/src/lexer.c

181 lines
4.7 KiB
C
Raw Normal View History

2024-10-13 23:46:03 -04:00
#include <ctype.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "include/lexer.h"
#include "include/dstr.h"
#include "include/token.h"
#include "include/util.h"
2024-10-02 17:57:04 -04:00
2024-11-02 11:02:18 -04:00
// The single global lexer instance. NULL until lexer_init() assigns it;
// yylex() reads its input through this pointer.
Lexer* thelexer = NULL;
2024-10-02 21:04:54 -04:00
2024-11-02 11:02:18 -04:00
void lexer_init(char* src) {
thelexer = malloc(sizeof(Lexer));
thelexer->src = src;
thelexer->srcln = strlen(src);
thelexer->cchar = thelexer->src;
2024-10-02 17:57:04 -04:00
2024-11-02 11:02:18 -04:00
thelexer->tokens = calloc(TOKENS_MAX, sizeof(Token*));
thelexer->ntokens = 0;
thelexer->state = LEXER_STATE_CONFUSED;
2024-10-13 23:46:03 -04:00
2024-11-02 11:02:18 -04:00
log_dbgf("created thelexer @ %p", thelexer);
2024-10-02 17:57:04 -04:00
}
2024-10-02 21:04:54 -04:00
// Release the tokens held by `lexer`.
//
// Every stored token is passed to token_destroy(), then the token array
// itself is freed. The array pointer is reset to NULL and the count to 0 so
// no dangling pointers remain and a repeated call is harmless.
//
// Does not free lexer->src, and does not free the Lexer struct itself.
// NOTE(review): the struct is malloc'd in lexer_init(); confirm who is meant
// to release it.
void lexer_destroy(Lexer* lexer) {
    if (lexer == NULL) return;

    if (lexer->tokens != NULL) {
        for (int i = 0; i < lexer->ntokens; i++) {
            token_destroy(lexer->tokens[i]);
        }
        free(lexer->tokens);
        lexer->tokens = NULL;
    }

    lexer->ntokens = 0;
}
// Run the lexer state machine until the cursor reaches the terminating NUL.
//
// Each iteration dispatches on the current state to the matching handler.
// An unrecognised state is reported and ends the run: none of the handlers
// would be called, so the cursor could never advance and the loop would
// otherwise spin forever.
void lexer_lex(Lexer* lexer) {
    while (*lexer->cchar != '\0') {
        switch (lexer->state) {
            case LEXER_STATE_CONFUSED:
                lexer_do_confused(lexer);
                break;
            case LEXER_STATE_NUM:
                lexer_do_number(lexer);
                break;
            case LEXER_STATE_CALL:
                lexer_do_call(lexer);
                break;
            default:
                fprintf(stderr, "lexer_lex: unknown lexer state %d, stopping\n",
                        (int)lexer->state);
                return;
        }
    }
}
2024-10-05 09:24:12 -04:00
2024-10-07 11:48:53 -04:00
// Decide what kind of token starts at the cursor and hand off to its handler.
//
// All leading whitespace is skipped first. If that reaches the end of the
// input, nothing is emitted and the state is left unchanged. Otherwise a
// digit switches to number mode and anything else switches to call mode.
void lexer_do_confused(Lexer* lexer) {
    log_dbgf("lexer @ %p entered confused mode @ char '%c' (%d)", lexer,
             *lexer->cchar, (int)*lexer->cchar);

    // Skip the whole run of whitespace, not just one character; otherwise
    // the next whitespace character (or the NUL) would be lexed as a call.
    // The cast keeps negative char values out of <ctype.h> (undefined there).
    while (isspace((unsigned char)*lexer->cchar)) lexer_inc(lexer);

    // Trailing whitespace: there is no token left to read.
    if (*lexer->cchar == '\0') return;

    if (isdigit((unsigned char)*lexer->cchar)) {
        lexer->state = LEXER_STATE_NUM;
        lexer_do_number(lexer);
    } else {
        lexer->state = LEXER_STATE_CALL;
        lexer_do_call(lexer);
    }
}
// Read a run of decimal digits starting at the cursor and add it as a
// TOKEN_TYPE_NUMBER token.
//
// The digits are copied into a freshly allocated NUL-terminated string that
// is handed to token_init() together with its length. If that allocation
// fails, an error is printed and no token is added; the digits have still
// been consumed. The state always returns to LEXER_STATE_CONFUSED.
void lexer_do_number(Lexer* lexer) {
    log_dbgf("lexer @ %p entered number mode @ char '%c' (%d)", lexer,
             *lexer->cchar, (int)*lexer->cchar);

    // Where the number string starts.
    char* start = lexer->cchar;
    // Length of the number string.
    size_t numln = 0;

    // isdigit('\0') is false, so this also stops at the end of the input.
    // The cast keeps negative char values out of <ctype.h> (undefined there).
    while (isdigit((unsigned char)*lexer->cchar)) {
        lexer_inc(lexer);
        numln++;
    }

    char* num = malloc(numln + 1);
    if (num == NULL) {
        fprintf(stderr, "lexer_do_number: out of memory, number dropped\n");
    } else {
        memcpy(num, start, numln);
        num[numln] = '\0';
        lexer_add_token(lexer, token_init(TOKEN_TYPE_NUMBER, num, numln));
    }

    lexer->state = LEXER_STATE_CONFUSED;
}
// Read a run of characters that are neither digits nor whitespace, starting
// at the cursor, and add it as a TOKEN_TYPE_CALL token.
//
// The characters are copied into a freshly allocated NUL-terminated string
// that is handed to token_init() together with its length. An empty run
// (cursor on whitespace, a digit, or the end of input) adds no token. If the
// allocation fails, an error is printed and no token is added. The state
// always returns to LEXER_STATE_CONFUSED.
void lexer_do_call(Lexer* lexer) {
    log_dbgf("lexer @ %p entered call mode @ char '%c' (%d)", lexer,
             *lexer->cchar, (int)*lexer->cchar);

    // Where the call string starts.
    char* start = lexer->cchar;
    // Size of the call string.
    size_t callln = 0;

    // The casts keep negative char values out of <ctype.h> (undefined there).
    while (*lexer->cchar != '\0' && !isdigit((unsigned char)*lexer->cchar) &&
           !isspace((unsigned char)*lexer->cchar)) {
        lexer_inc(lexer);
        callln++;
    }

    // Nothing was consumed: do not emit an empty call token.
    if (callln > 0) {
        char* call = malloc(callln + 1);
        if (call == NULL) {
            fprintf(stderr, "lexer_do_call: out of memory, call dropped\n");
        } else {
            memcpy(call, start, callln);
            call[callln] = '\0';
            lexer_add_token(lexer, token_init(TOKEN_TYPE_CALL, call, callln));
        }
    }

    lexer->state = LEXER_STATE_CONFUSED;
}
// Move the lexer's cursor forward by one character.
// No bounds check is done here; callers test for the terminating NUL.
void lexer_inc(Lexer* lexer) {
    lexer->cchar++;
}
2024-10-05 09:24:12 -04:00
// Append `token` to the lexer's token array.
//
// At most TOKENS_MAX - 1 tokens are stored, so the last slot of the array is
// never written (same capacity as before this change). When the limit is
// reached, the token is reported on stderr and destroyed instead of being
// silently dropped and leaked.
void lexer_add_token(Lexer* lexer, Token* token) {
    assert(lexer->ntokens < TOKENS_MAX);

    if (lexer->ntokens >= TOKENS_MAX - 1) {
        fprintf(stderr, "lexer_add_token: token limit (%d) reached, token dropped\n",
                (int)(TOKENS_MAX - 1));
        // The token cannot be stored, so lexer_destroy() would never see it;
        // release it here.
        if (token != NULL) token_destroy(token);
        return;
    }

    lexer->tokens[lexer->ntokens] = token;
    lexer->ntokens++;
    log_dbgf("added token (total: %ld)", lexer->ntokens);
}
// Print `lexer` starting at indentation level 0.
void lexer_print(Lexer* lexer) {
    lexer_print_i(lexer, 0);
}
// Print the fields of `lexer` at indentation level `ilvl`.
//
// Output is produced by the INDENT_* macros (defined outside this file), in
// this order: state, srcln, src, the character under the cursor, ntokens,
// and the token list, whose entries are printed with token_print_i.
void lexer_print_i(Lexer* lexer, int ilvl) {
    INDENT_BEGIN(ilvl);
    INDENT_TITLE("Lexer", lexer);

    // The state is printed by a helper between the START/END macros.
    INDENT_FIELD_NONL_START("state")
    lexerstate_print_raw(lexer->state);
    INDENT_FIELD_NONL_END

    INDENT_FIELD("srcln", "%ld", lexer->srcln);
    INDENT_FIELD_NL("src", "\"%s\"", lexer->src);
    INDENT_FIELD("cchar", "'%c'", *lexer->cchar);
    INDENT_FIELD("ntokens", "%ld", lexer->ntokens);
    INDENT_FIELD_LIST("tokens", lexer->tokens, lexer->ntokens, token_print_i);
}
2024-10-19 09:09:37 -04:00
2024-10-25 11:20:07 -04:00
// Print the name of lexer state `s` to stdout, without a trailing newline.
//
// Values outside 0..LEXER_STATE_MAX are printed as "Unknown (<value>)" and
// logged, instead of being used to index lexerstate_names.
void lexerstate_print_raw(LexerState s) {
    // The negative check matters when the enum's underlying type is signed.
    if ((int)s < 0 || s > LEXER_STATE_MAX) {
        printf("Unknown (%d)", (int)s);
        // Report the lexer-state bound here, not the token-type bound.
        log_dbgf("%d is not a valid LexerState (max: %d)", (int)s,
                 (int)LEXER_STATE_MAX);
    } else {
        printf("%s", lexerstate_names[s]);
    }
}
2024-11-02 10:31:55 -04:00
2024-11-02 11:02:18 -04:00
#include "../build/grammars/grammar.tab.h"
// Token source for the generated parser; reads from the global `thelexer`.
//
// Skips spaces and tabs, then returns:
//   - YYEOF at the end of the input (or if no lexer has been created),
//   - PLUS for '+',
//   - NUM for a run of decimal digits, with the value in yylval.intval
//     (clamped to INT_MAX, with a message on stderr, if it does not fit),
//   - 0 after printing a message for any other character.
int yylex(void) {
    if (thelexer == NULL) return YYEOF;

    // Skip every blank, not just the first one.
    while (*thelexer->cchar == ' ' || *thelexer->cchar == '\t') {
        thelexer->cchar++;
    }

    // Check for the end only after skipping blanks, so the cursor is never
    // advanced past the terminating NUL.
    if (*thelexer->cchar == '\0') return YYEOF;

    // Assign & consume current character. Going through unsigned char keeps
    // the value valid for <ctype.h>.
    int c = (unsigned char)*thelexer->cchar++;

    if (c == '+') return PLUS;

    if (isdigit(c)) {
        int value = c - '0';  // Start with the first digit.
        int overflowed = 0;

        while (isdigit((unsigned char)*thelexer->cchar)) {
            int digit = *thelexer->cchar - '0';

            // Test before multiplying: signed overflow is undefined.
            if (overflowed || value > (INT_MAX - digit) / 10) {
                overflowed = 1;
                value = INT_MAX;
            } else {
                value = value * 10 + digit;
            }

            // Advance the cursor, not the lexer pointer itself.
            thelexer->cchar++;
        }

        if (overflowed) {
            fprintf(stderr, "Integer literal too large, clamped to %d\n", INT_MAX);
        }

        yylval.intval = value;
        return NUM;
    }

    fprintf(stderr, "Unexpected character: %c\n", c);
    return 0;
}