commit 67caaf3025f80fef4af72cba3dac2dfb0dcf13af
parent 039616ce0bc89c6ee3bf968e32d2a707ed942c86
Author: Mario Rosell R. Martinez <mario@mariorosell.es>
Date: Sat, 4 Apr 2026 11:27:16 +0200
lex: add simple lexer
Works using a simple DFA and tables. I still have to implement the pipeline,
though.
This lexer is still just a prototype; don't treat it as final.
Diffstat:
| M | Makefile | | | 2 | +- |
| A | lex.c | | | 241 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | lex.h | | | 71 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
3 files changed, 313 insertions(+), 1 deletion(-)
diff --git a/Makefile b/Makefile
@@ -2,7 +2,7 @@
# Makefile -- build system
#
-SRCS := slash.c cli.c stat.c
+SRCS := slash.c cli.c stat.c lex.c
OBJS := ${SRCS:.c=.o}
CC ?= cc
diff --git a/lex.c b/lex.c
@@ -0,0 +1,241 @@
+/*
+ * lex.c -- simple DFA-based lexer
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+/*
+ * Kinds of token the lexer can produce.  Enumerators are numbered
+ * implicitly from 0 in declaration order, so TOK_EOF is 0.
+ *
+ * NOTE(review): lex.h carries an identical definition; the two have to
+ * be kept in step by hand because this file does not include lex.h.
+ */
+typedef enum toktype {
+	/* end of input */
+	TOK_EOF,
+
+	/* words listed in the keyword table */
+	TOK_CLASS, TOK_FC, TOK_USE, TOK_AS,
+	TOK_F32, TOK_F64,
+	TOK_I8, TOK_I16, TOK_I32, TOK_I64,
+	TOK_U8, TOK_U16, TOK_U32, TOK_U64,
+
+	/* any other word */
+	TOK_IDENT,
+
+	/* single-character punctuation */
+	TOK_DOT,
+	TOK_LBRACE, TOK_RBRACE,
+	TOK_LPAREN, TOK_RPAREN,
+	TOK_COMMA, TOK_SEMI,
+
+	/* input the lexer does not recognise */
+	TOK_UNKNOWN
+} toktype_t;
+
+/*
+ * A single token: its kind, plus its text as a NUL-terminated string.
+ * id() stores at most 63 characters of a word in lexeme; nexttok()
+ * leaves lexeme empty for every token that is not a word.
+ */
+struct tok {
+	toktype_t type;
+	char lexeme[64];
+};
+
+typedef struct tok tok_t;
+
+typedef struct { /* one row of the keyword table; kws[] initialises it positionally */
+ const char *name; /* exact spelling, matched with strcmp() by lu_kw() */
+ toktype_t type; /* token kind lu_kw() returns for that spelling */
+} keyword_t;
+
+/*
+ * Reserved words and the token kind each one maps to.  lu_kw() scans
+ * this table from the first row to the last.
+ */
+static const keyword_t kws[] = {
+	{ "use",   TOK_USE   },
+	{ "class", TOK_CLASS },
+	{ "fc",    TOK_FC    },
+	{ "as",    TOK_AS    },
+
+	{ "f32",   TOK_F32   },
+	{ "f64",   TOK_F64   },
+
+	{ "i8",    TOK_I8    },
+	{ "i16",   TOK_I16   },
+	{ "i32",   TOK_I32   },
+	{ "i64",   TOK_I64   },
+
+	{ "u8",    TOK_U8    },
+	{ "u16",   TOK_U16   },
+	{ "u32",   TOK_U32   },
+	{ "u64",   TOK_U64   }
+};
+
+/* number of rows in kws */
+static const int nkeyws = (int)(sizeof kws / sizeof kws[0]);
+
+/*
+ * Classify a word as a keyword or a plain identifier.
+ *
+ * str: NUL-terminated word to look up; it is handed straight to
+ *      strcmp(), so it must not be NULL.
+ *
+ * Returns the token kind of the first kws row whose name equals str
+ * exactly (the comparison is case-sensitive), or TOK_IDENT when no
+ * row matches.
+ */
+toktype_t
+lu_kw(const char *str)
+{
+	const keyword_t *kw;
+
+	for (kw = kws; kw < kws + nkeyws; kw++) {
+		if (!strcmp(str, kw->name))
+			return kw->type;
+	}
+
+	return TOK_IDENT;
+}
+
+/*
+ * Lexer state, shared by every function in this file.
+ *
+ * src: NUL-terminated text being scanned.  It starts out NULL and has
+ *      to be pointed at a buffer before any lexer function is called,
+ *      since peek() and consume() dereference it unconditionally.
+ * i:   index into src of the next unread character; starts at 0.
+ */
+const char *src = NULL;
+size_t i;
+
+/* Return the character at the cursor without moving the cursor. */
+char
+peek(void)
+{
+	return *(src + i);
+}
+
+/*
+ * Return the character at the cursor, then move the cursor forward by
+ * one.  There is no end-of-input check here: the caller is responsible
+ * for not consuming past the terminating NUL.
+ */
+char
+consume(void)
+{
+	char c;
+
+	c = src[i];
+	i++;
+	return c;
+}
+
+/*
+ * Skip whitespace: advance the cursor for as long as isspace() accepts
+ * the current character.  The terminating NUL is not whitespace, so
+ * the scan always stops at end of input.
+ */
+void
+s_ws(void)
+{
+	/*
+	 * The <ctype.h> functions require a value representable as
+	 * unsigned char (or EOF); a negative plain char is undefined
+	 * behaviour, hence the cast.
+	 */
+	while (isspace((unsigned char)peek()))
+		consume();
+}
+
+/*
+ * Skip the rest of the current line.  The cursor ends up on the '\n',
+ * or on the terminating NUL if the input ends first; the newline
+ * itself is not consumed.
+ */
+void
+s_lc(void)
+{
+	for (;;) {
+		char c = peek();
+
+		if (c == '\0' || c == '\n')
+			return;
+		consume();
+	}
+}
+
+/*
+ * Skip a block comment.
+ *
+ * The first two characters at the cursor are stepped over as the
+ * comment opener without being inspected, so the caller must leave the
+ * cursor on the '/' of the slash-star opener.  Scanning then continues
+ * up to and including the first star-slash; an unterminated comment
+ * runs to the end of the input.
+ */
+void
+s_bc(void)
+{
+	int n;
+
+	/* step over the opener, but never past the terminating NUL */
+	for (n = 0; n < 2 && peek(); n++)
+		consume();
+
+	while (peek()) {
+		/* peek() is not NUL here, so src[i + 1] is still in bounds */
+		if (peek() == '*' && src[i + 1] == '/') {
+			consume();
+			consume();
+			break;
+		}
+		consume();
+	}
+}
+
+/*
+ * Read a word (letters, digits and '_') starting at the cursor.
+ *
+ * The whole word is consumed, but only as many characters as fit in
+ * lexeme (its size minus one for the NUL) are stored; the remainder is
+ * dropped silently.  If the cursor is not on a word character, nothing
+ * is consumed and lexeme is empty.
+ *
+ * Returns a token whose lexeme is the (possibly truncated) word and
+ * whose type is whatever lu_kw() reports for it: a keyword kind or
+ * TOK_IDENT.
+ */
+tok_t
+id(void)
+{
+	tok_t t;
+	size_t len;
+	unsigned char c;
+
+	len = 0;
+	for (;;) {
+		/* <ctype.h> needs an unsigned char value; negative char is UB */
+		c = (unsigned char)peek();
+		if (!isalnum(c) && c != '_')
+			break;
+		consume();
+		if (len < sizeof t.lexeme - 1)
+			t.lexeme[len++] = (char)c;
+	}
+
+	t.lexeme[len] = '\0';
+	t.type = lu_kw(t.lexeme);
+	return t;
+}
+
+/*
+ * Read the next token from src.
+ *
+ * Whitespace, line comments (two slashes, up to the newline) and block
+ * comments (slash-star up to star-slash) in front of the token are
+ * skipped first.
+ *
+ * Returns a token whose type is:
+ *   TOK_EOF      at the terminating NUL (the cursor does not move);
+ *   a keyword kind or TOK_IDENT for a word starting with a letter or
+ *                '_', as read by id(), with lexeme holding the word;
+ *   a punctuation kind for one of  . { } ( ) , ;
+ *   TOK_UNKNOWN  for any other character, including a '/' that does
+ *                not start a comment.
+ * lexeme is the empty string for everything except words.
+ */
+tok_t
+nexttok(void)
+{
+	tok_t t;
+	char c;
+
+	t.lexeme[0] = '\0';
+
+	/*
+	 * Skip every run of whitespace and comments.  This is a loop
+	 * rather than a recursive call so that a long series of comments
+	 * does not grow the stack.
+	 */
+	for (;;) {
+		s_ws();
+		c = peek();
+		if (c != '/')
+			break;
+
+		/* c is not NUL, so src[i + 1] is at worst the terminator */
+		if (src[i + 1] == '/') {
+			consume();
+			consume();
+			s_lc();
+		} else if (src[i + 1] == '*') {
+			/*
+			 * s_bc() steps over the two-character opener itself,
+			 * so the cursor is left on it.  Consuming it here as
+			 * well made s_bc() swallow the first two characters of
+			 * the comment body unseen.
+			 */
+			s_bc();
+		} else {
+			/* a lone '/' is not a token */
+			consume();
+			t.type = TOK_UNKNOWN;
+			return t;
+		}
+	}
+
+	if (c == '\0') {
+		t.type = TOK_EOF;
+		return t;
+	}
+
+	/* <ctype.h> needs an unsigned char value; negative char is UB */
+	if (isalpha((unsigned char)c) || c == '_')
+		return id();
+
+	consume();
+
+	switch (c) {
+	case '{': t.type = TOK_LBRACE;  break;
+	case '}': t.type = TOK_RBRACE;  break;
+	case '(': t.type = TOK_LPAREN;  break;
+	case ')': t.type = TOK_RPAREN;  break;
+	case ',': t.type = TOK_COMMA;   break;
+	case ';': t.type = TOK_SEMI;    break;
+	case '.': t.type = TOK_DOT;     break;
+	default:  t.type = TOK_UNKNOWN; break;
+	}
+
+	return t;
+}
+
+/*
+ * Printable name of a token kind.
+ *
+ * t: the token kind to name.
+ *
+ * Returns a pointer to a string literal (never NULL, not to be freed):
+ * the enumerator's name without the TOK_ prefix, or "UNKNOWN" for
+ * TOK_UNKNOWN and for any value that is not a toktype_t enumerator.
+ */
+const char *
+tokname(toktype_t t)
+{
+	switch (t) {
+	case TOK_EOF:    return "EOF";
+
+	case TOK_CLASS:  return "CLASS";
+	case TOK_FC:     return "FC";
+	case TOK_USE:    return "USE";
+	case TOK_AS:     return "AS";
+
+	case TOK_F32:    return "F32";
+	case TOK_F64:    return "F64";
+
+	case TOK_I8:     return "I8";
+	case TOK_I16:    return "I16";
+	case TOK_I32:    return "I32";
+	case TOK_I64:    return "I64";
+
+	case TOK_U8:     return "U8";
+	case TOK_U16:    return "U16";
+	case TOK_U32:    return "U32";
+	case TOK_U64:    return "U64";
+
+	case TOK_IDENT:  return "IDENT";
+
+	case TOK_DOT:    return "DOT";
+	case TOK_LBRACE: return "LBRACE";
+	case TOK_RBRACE: return "RBRACE";
+	case TOK_LPAREN: return "LPAREN";
+	case TOK_RPAREN: return "RPAREN";
+	case TOK_COMMA:  return "COMMA";
+	case TOK_SEMI:   return "SEMI";
+
+	default:         return "UNKNOWN";
+	}
+}
+
diff --git a/lex.h b/lex.h
@@ -0,0 +1,71 @@
+/*
+ * lex.h -- lexer interface
+ */
+
+#ifndef LEX_H
+#define LEX_H
+
+#include <stddef.h>
+
+typedef enum toktype { /* token kinds; numbered implicitly from 0 in this order */
+ TOK_EOF, /* end of input */
+
+ TOK_CLASS, /* keyword "class" */
+ TOK_FC, /* keyword "fc" */
+ TOK_USE, /* keyword "use" */
+ TOK_AS, /* keyword "as" */
+
+ TOK_F32, /* keyword "f32" */
+ TOK_F64, /* keyword "f64" */
+ TOK_I8, /* keyword "i8" */
+ TOK_I16, /* keyword "i16" */
+ TOK_I32, /* keyword "i32" */
+ TOK_I64, /* keyword "i64" */
+ TOK_U8, /* keyword "u8" */
+ TOK_U16, /* keyword "u16" */
+ TOK_U32, /* keyword "u32" */
+ TOK_U64, /* keyword "u64" */
+
+ TOK_IDENT, /* word that is not in the keyword table */
+
+ TOK_DOT, /* '.' */
+ TOK_LBRACE, /* '{' */
+ TOK_RBRACE, /* '}' */
+ TOK_LPAREN, /* '(' */
+ TOK_RPAREN, /* ')' */
+ TOK_COMMA, /* ',' */
+ TOK_SEMI, /* ';' */
+
+ TOK_UNKNOWN /* input the lexer does not recognise */
+} toktype_t;
+
+typedef struct tok { /* one token, as returned by id() and nexttok() */
+ toktype_t type; /* what kind of token this is */
+ char lexeme[64]; /* token text as a C string; nexttok() leaves it empty for non-words */
+} tok_t;
+
+typedef struct { /* one row of the keyword table in lex.c */
+ const char *name; /* exact spelling, matched with strcmp() by lu_kw() */
+ toktype_t type; /* token kind lu_kw() returns for that spelling */
+} keyword_t;
+
+/*
+ * Lexer state, defined in lex.c.
+ * src is the NUL-terminated text being scanned and i is the index of
+ * the next unread character.  src has to be set before any function
+ * declared in this header is called, because peek() and consume()
+ * dereference it unconditionally.
+ */
+extern const char *src;
+extern size_t i;
+
+toktype_t lu_kw(const char *str); /* keyword kind for str, or TOK_IDENT if it is not a keyword */
+
+char peek(void); /* character at the cursor; the cursor does not move */
+char consume(void); /* character at the cursor; the cursor advances by one */
+
+void s_ws(void); /* skip whitespace */
+void s_lc(void); /* skip to the end of the line; the '\n' is left unread */
+void s_bc(void); /* skip a block comment, starting from its two-character opener */
+
+tok_t id(void); /* read the word at the cursor as a keyword or identifier */
+tok_t nexttok(void); /* skip whitespace and comments, then read one token */
+
+const char *tokname(toktype_t t); /* printable name of a token kind, as a string literal */
+
+#endif /* LEX_H */
+