2024-10-13 23:46:03 -04:00
|
|
|
#include <ctype.h>
|
2024-11-09 04:37:56 -05:00
|
|
|
#include <limits.h>
|
2024-10-19 09:09:37 -04:00
|
|
|
#include <stdio.h>
|
2024-10-13 23:46:03 -04:00
|
|
|
#include <stdlib.h>
#include <string.h>
|
|
|
|
|
2024-10-19 09:09:37 -04:00
|
|
|
#include "include/dstr.h"
|
2024-11-09 04:37:56 -05:00
|
|
|
#include "include/lexer.h"
|
2024-10-31 12:52:39 -04:00
|
|
|
#include "include/token.h"
|
2024-10-10 16:09:25 -04:00
|
|
|
#include "include/util.h"
|
2024-10-02 17:57:04 -04:00
|
|
|
|
2024-11-09 10:27:03 -05:00
|
|
|
#include "../build/grammars/grammar.tab.h"
|
|
|
|
|
|
|
|
// Semantic-value slot shared with the parser; defined outside this file.
// yylex() stores the integer value of each NUM token in yylval.intval.
extern YYSTYPE yylval;
|
|
|
|
|
2024-11-02 11:02:18 -04:00
|
|
|
// The single global lexer instance. NULL until lexer_init() allocates it;
// the lexer_* functions in this file and yylex() all operate on this object.
Lexer* thelexer = NULL;
|
2024-10-02 21:04:54 -04:00
|
|
|
|
2024-11-02 11:02:18 -04:00
|
|
|
void lexer_init(char* src) {
|
|
|
|
thelexer = malloc(sizeof(Lexer));
|
|
|
|
|
|
|
|
thelexer->src = src;
|
|
|
|
thelexer->srcln = strlen(src);
|
|
|
|
thelexer->cchar = thelexer->src;
|
2024-10-02 17:57:04 -04:00
|
|
|
|
2024-11-02 11:02:18 -04:00
|
|
|
thelexer->tokens = calloc(TOKENS_MAX, sizeof(Token*));
|
|
|
|
thelexer->ntokens = 0;
|
|
|
|
thelexer->state = LEXER_STATE_CONFUSED;
|
2024-10-13 23:46:03 -04:00
|
|
|
|
2024-11-02 11:02:18 -04:00
|
|
|
log_dbgf("created thelexer @ %p", thelexer);
|
2024-10-02 17:57:04 -04:00
|
|
|
}
|
|
|
|
|
2024-11-07 19:41:14 -05:00
|
|
|
void lexer_destroy() {
|
2024-10-31 16:44:17 -04:00
|
|
|
// Does not free lexer->src.
|
2024-11-09 04:37:56 -05:00
|
|
|
for (int i = 0; i < thelexer->ntokens; i++)
|
|
|
|
token_destroy(thelexer->tokens[i]);
|
2024-10-02 17:57:04 -04:00
|
|
|
}
|
|
|
|
|
2024-11-07 19:41:14 -05:00
|
|
|
void lexer_lex() {
|
|
|
|
while (*thelexer->cchar) {
|
|
|
|
switch (thelexer->state) {
|
|
|
|
case LEXER_STATE_CONFUSED: lexer_do_confused(); break;
|
|
|
|
case LEXER_STATE_NUM: lexer_do_number(); break;
|
|
|
|
case LEXER_STATE_CALL: lexer_do_call(); break;
|
2024-10-02 21:04:54 -04:00
|
|
|
default: break;
|
2024-10-02 17:57:04 -04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2024-10-05 09:24:12 -04:00
|
|
|
|
2024-11-07 19:41:14 -05:00
|
|
|
void lexer_do_confused() {
|
|
|
|
log_dbgf("lexer @ %p entered confused mode @ char '%c' (%d)", thelexer,
|
|
|
|
*thelexer->cchar, (int)*thelexer->cchar);
|
2024-10-26 10:07:33 -04:00
|
|
|
|
2024-11-07 19:41:14 -05:00
|
|
|
if (isspace(*thelexer->cchar)) lexer_inc();
|
2024-10-13 23:46:03 -04:00
|
|
|
|
2024-11-07 19:41:14 -05:00
|
|
|
if (isdigit(*thelexer->cchar)) {
|
|
|
|
thelexer->state = LEXER_STATE_NUM;
|
|
|
|
lexer_do_number();
|
2024-10-19 09:09:37 -04:00
|
|
|
} else {
|
2024-11-07 19:41:14 -05:00
|
|
|
thelexer->state = LEXER_STATE_CALL;
|
|
|
|
lexer_do_call();
|
2024-10-19 09:09:37 -04:00
|
|
|
}
|
2024-10-07 11:48:53 -04:00
|
|
|
}
|
|
|
|
|
2024-11-07 19:41:14 -05:00
|
|
|
void lexer_do_number() {
|
|
|
|
log_dbgf("lexer @ %p entered number mode @ char '%c' (%d)", thelexer,
|
|
|
|
*thelexer->cchar, (int)*thelexer->cchar);
|
2024-10-13 23:46:03 -04:00
|
|
|
|
2024-10-19 10:59:05 -04:00
|
|
|
// Length of the number string.
|
|
|
|
size_t numln;
|
2024-10-07 11:48:53 -04:00
|
|
|
|
|
|
|
// Where the number string starts.
|
2024-11-07 19:41:14 -05:00
|
|
|
char* start = thelexer->cchar;
|
2024-10-07 11:48:53 -04:00
|
|
|
|
2024-11-07 19:41:14 -05:00
|
|
|
for (numln = 0; *thelexer->cchar && isdigit(*thelexer->cchar); numln++)
|
|
|
|
lexer_inc();
|
2024-10-07 11:48:53 -04:00
|
|
|
|
2024-10-19 10:59:05 -04:00
|
|
|
char* num = malloc(numln + 1);
|
|
|
|
memcpy(num, start, numln);
|
|
|
|
num[numln] = '\0';
|
2024-10-07 11:48:53 -04:00
|
|
|
|
2024-11-07 19:41:14 -05:00
|
|
|
lexer_add_token(token_init(TOKEN_TYPE_NUMBER, num, numln));
|
|
|
|
thelexer->state = LEXER_STATE_CONFUSED;
|
2024-10-07 11:48:53 -04:00
|
|
|
}
|
|
|
|
|
2024-11-07 19:41:14 -05:00
|
|
|
void lexer_do_call() {
|
|
|
|
log_dbgf("lexer @ %p entered call mode @ char '%c' (%d)", thelexer,
|
|
|
|
*thelexer->cchar, (int)*thelexer->cchar);
|
2024-10-13 23:46:03 -04:00
|
|
|
|
2024-10-07 11:48:53 -04:00
|
|
|
// Size of the call string.
|
2024-10-19 10:59:05 -04:00
|
|
|
size_t callln;
|
2024-10-07 11:48:53 -04:00
|
|
|
|
|
|
|
// Where the call string starts.
|
2024-11-07 19:41:14 -05:00
|
|
|
char* start = thelexer->cchar;
|
2024-10-07 11:48:53 -04:00
|
|
|
|
2024-11-09 04:37:56 -05:00
|
|
|
for (callln = 0; *thelexer->cchar &&
|
|
|
|
(!isdigit(*thelexer->cchar) && !isspace(*thelexer->cchar));
|
2024-10-26 10:07:33 -04:00
|
|
|
callln++)
|
2024-11-07 19:41:14 -05:00
|
|
|
lexer_inc();
|
2024-10-07 11:48:53 -04:00
|
|
|
|
2024-10-19 10:59:05 -04:00
|
|
|
char* call = malloc(callln + 1);
|
|
|
|
memcpy(call, start, callln);
|
|
|
|
call[callln] = '\0';
|
2024-10-10 16:09:25 -04:00
|
|
|
|
2024-11-07 19:41:14 -05:00
|
|
|
lexer_add_token(token_init(TOKEN_TYPE_CALL, call, callln));
|
2024-10-19 09:09:37 -04:00
|
|
|
|
2024-11-07 19:41:14 -05:00
|
|
|
thelexer->state = LEXER_STATE_CONFUSED;
|
2024-10-07 11:48:53 -04:00
|
|
|
}
|
|
|
|
|
2024-11-09 04:37:56 -05:00
|
|
|
// Move the cursor forward by one character of the source.
void lexer_inc() {
    // sizeof(char) is 1 by definition, so a plain increment is the same step.
    thelexer->cchar++;
}
|
2024-10-07 11:48:53 -04:00
|
|
|
|
2024-11-07 19:41:14 -05:00
|
|
|
// Append `token` to the global lexer's token array.
//
// On success the lexer keeps the token and lexer_destroy() releases it
// later. At most TOKENS_MAX - 1 tokens are stored: the final slot of the
// array is never filled (presumably reserved, e.g. as a NULL terminator;
// TODO confirm). A token that does not fit is logged and destroyed instead
// of being silently dropped, which used to leak it.
void lexer_add_token(Token* token) {
    assert(thelexer->ntokens < TOKENS_MAX);

    if (thelexer->ntokens >= TOKENS_MAX - 1) {
        log_dbgf("token list is full; dropping token @ %p", (void*)token);
        if (token != NULL) token_destroy(token);
        return;
    }

    thelexer->tokens[thelexer->ntokens] = token;
    thelexer->ntokens++;

    log_dbgf("added token (total: %ld)", thelexer->ntokens);
}
|
|
|
|
|
2024-11-07 19:41:14 -05:00
|
|
|
// Print the global lexer, starting at indentation level 0.
void lexer_print() {
    lexer_print_i(0);
}
|
2024-10-19 10:59:05 -04:00
|
|
|
|
2024-11-07 19:41:14 -05:00
|
|
|
void lexer_print_i(int ilvl) {
|
2024-10-25 11:20:07 -04:00
|
|
|
INDENT_BEGIN(ilvl);
|
2024-11-07 19:41:14 -05:00
|
|
|
INDENT_TITLE("Lexer", thelexer);
|
2024-10-31 16:05:04 -04:00
|
|
|
INDENT_FIELD_NONL_START("state")
|
2024-11-09 04:37:56 -05:00
|
|
|
lexerstate_print_raw();
|
2024-10-31 16:05:04 -04:00
|
|
|
INDENT_FIELD_NONL_END
|
2024-11-07 19:41:14 -05:00
|
|
|
INDENT_FIELD("srcln", "%ld", thelexer->srcln);
|
|
|
|
INDENT_FIELD_NL("src", "\"%s\"", thelexer->src);
|
|
|
|
INDENT_FIELD("cchar", "'%c'", *thelexer->cchar);
|
|
|
|
INDENT_FIELD("ntokens", "%ld", thelexer->ntokens);
|
2024-11-09 04:37:56 -05:00
|
|
|
INDENT_FIELD_LIST("tokens", thelexer->tokens, thelexer->ntokens,
|
|
|
|
token_print_i);
|
2024-10-19 10:59:05 -04:00
|
|
|
}
|
2024-10-19 09:09:37 -04:00
|
|
|
|
2024-11-07 19:41:14 -05:00
|
|
|
void lexerstate_print_raw() {
|
|
|
|
LexerState s = thelexer->state;
|
2024-10-31 16:05:04 -04:00
|
|
|
if (s > LEXER_STATE_MAX) {
|
|
|
|
printf("Unknown (%d)", s);
|
|
|
|
log_dbgf("%d is not a valid LexerState (max: %d)", s, TOKEN_TYPE_MAX);
|
|
|
|
} else printf("%s", lexerstate_names[s]);
|
2024-10-19 09:09:37 -04:00
|
|
|
}
|
2024-11-02 10:31:55 -04:00
|
|
|
|
2024-11-09 10:27:03 -05:00
|
|
|
int yylex() {
|
2024-11-02 11:02:18 -04:00
|
|
|
if (*thelexer->cchar == '\0') return YYEOF;
|
|
|
|
|
2024-11-09 10:27:03 -05:00
|
|
|
// Skip all whitespace.
|
|
|
|
while (*thelexer->cchar == ' ' || *thelexer->cchar == '\t')
|
|
|
|
thelexer->cchar++;
|
2024-11-02 11:02:18 -04:00
|
|
|
|
|
|
|
// Assign & consume current character.
|
|
|
|
int c = *thelexer->cchar++;
|
|
|
|
|
2024-11-09 10:27:03 -05:00
|
|
|
// Check for NUM.
|
2024-11-02 11:02:18 -04:00
|
|
|
if (isdigit(c)) {
|
2024-11-09 10:27:03 -05:00
|
|
|
int value = c - '0';
|
2024-11-02 11:02:18 -04:00
|
|
|
while (isdigit(*thelexer->cchar)) {
|
2024-11-09 10:27:03 -05:00
|
|
|
value = value * 10 + (*thelexer->cchar - '0'); // Accumulate value.
|
|
|
|
thelexer->cchar++;
|
2024-11-02 11:02:18 -04:00
|
|
|
}
|
2024-11-09 10:27:03 -05:00
|
|
|
yylval.intval = value; // Set the token value.
|
|
|
|
return NUM;
|
|
|
|
}
|
|
|
|
|
|
|
|
switch (c) {
|
|
|
|
case '+': return PLUS;
|
|
|
|
default: return CALL;
|
2024-11-02 11:02:18 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
fprintf(stderr, "Unexpected character: %c\n", c);
|
|
|
|
|
|
|
|
return 0;
|
2024-11-02 10:31:55 -04:00
|
|
|
}
|
2024-11-09 11:09:57 -05:00
|
|
|
// Error-reporting hook (presumably invoked by the generated parser): writes
// the message `s` to stderr, followed by a newline.
void yyerror(char const* s) {
    fprintf(stderr, "%s\n", s);
}
|