lex: add simple lexer - slash - slash is a simple type-oriented programming language

commit 67caaf3025f80fef4af72cba3dac2dfb0dcf13af
parent 039616ce0bc89c6ee3bf968e32d2a707ed942c86
Author: Mario Rosell R. Martinez <mario@mariorosell.es>
Date:   Sat,  4 Apr 2026 11:27:16 +0200

lex: add simple lexer

Works using a simple DFA and tables. I still have to implement the pipeline,
though.

This lexer is still just a prototype ok? Don't take it as "final" or something.

Diffstat:
M Makefile  | 2 +-
A lex.c  | 241 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A lex.h  | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

3 files changed, 313 insertions(+), 1 deletion(-)
diff --git a/Makefile b/Makefile
@@ -2,7 +2,7 @@
 # Makefile -- build system
 #
 
-SRCS     := slash.c cli.c stat.c
+SRCS     := slash.c cli.c stat.c lex.c
 OBJS     := ${SRCS:.c=.o}
 
 CC       ?= cc
diff --git a/lex.c b/lex.c
@@ -0,0 +1,241 @@
+/*
+ * lex.c -- simple enough dfa-based lexer
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+/*
+ * Kinds of token the lexer can produce.  lex.h declares an enumeration
+ * with the same enumerators in the same order.
+ */
+enum toktype {
+	TOK_EOF,	/* returned by nexttok() at the terminating NUL */
+
+	/* keywords "class", "fc", "use" and "as" (see kws) */
+	TOK_CLASS,
+	TOK_FC,
+	TOK_USE,
+	TOK_AS,
+
+	/* type-name keywords "f32" ... "u64" (see kws) */
+	TOK_F32,
+	TOK_F64,
+	TOK_I8,
+	TOK_I16,
+	TOK_I32,
+	TOK_I64,
+	TOK_U8,
+	TOK_U16,
+	TOK_U32,
+	TOK_U64,
+
+	TOK_IDENT,	/* any word that is not in the keyword table */
+
+	/* single-character punctuation: . { } ( ) , ; */
+	TOK_DOT,
+	TOK_LBRACE,
+	TOK_RBRACE,
+	TOK_LPAREN,
+	TOK_RPAREN,
+	TOK_COMMA,
+	TOK_SEMI,
+
+	TOK_UNKNOWN	/* any character nexttok() does not recognise */
+};
+
+/* shorthand used throughout the lexer */
+typedef enum toktype toktype_t;
+
+/*
+ * One token: its kind and the text it was read from.
+ * id() stores at most 63 characters plus a terminating NUL in lexeme;
+ * nexttok() leaves lexeme as the empty string for tokens it builds itself.
+ */
+typedef struct tok {
+	toktype_t	type;
+	char		lexeme[64];
+} tok_t;
+
+/* One keyword table entry: the keyword's spelling and its token type. */
+typedef struct {
+	const char	*name;
+	toktype_t	type;
+} keyword_t;
+
+/*
+ * Keyword table: every reserved word paired with the token type it
+ * lexes to.  lu_kw() compares a word against each entry in turn.
+ */
+static const keyword_t kws[] = {
+	{"use",		TOK_USE},
+	{"class",	TOK_CLASS},
+	{"fc",		TOK_FC},
+	{"as",		TOK_AS},
+
+	{"f32",		TOK_F32},
+	{"f64",		TOK_F64},
+
+	{"i8",		TOK_I8},
+	{"i16",		TOK_I16},
+	{"i32",		TOK_I32},
+	{"i64",		TOK_I64},
+
+	{"u8",		TOK_U8},
+	{"u16",		TOK_U16},
+	{"u32",		TOK_U32},
+	{"u64",		TOK_U64},
+};
+
+/* number of entries in kws, computed from the array itself */
+static const int	nkeyws = sizeof(kws) / sizeof(kws[0]);
+
+/*
+ * lu_kw -- classify a word as a keyword or a plain identifier.
+ *
+ * str is compared with strcmp() against each entry of kws in table
+ * order.  Returns the token type of the first entry that matches, or
+ * TOK_IDENT when none does.
+ */
+toktype_t
+lu_kw(const char *str)
+{
+	const keyword_t	*kw;
+
+	for (kw = kws; kw < kws + nkeyws; kw++) {
+		if (strcmp(str, kw->name) == 0)
+			return kw->type;
+	}
+	return TOK_IDENT;
+}
+
+/*
+ * Lexer state: the input text being scanned and the offset of the next
+ * unread character in it.  nexttok() treats a NUL character as the end
+ * of the input.  Both objects have external linkage.
+ */
+const char	*src;
+size_t		i = 0;
+
+/*
+ * peek -- return the character at the current offset of src without
+ * changing the offset.
+ */
+char
+peek(void)
+{
+	const char	*cur;
+
+	cur = src + i;
+	return *cur;
+}
+
+/*
+ * consume -- return the character at the current offset and advance.
+ *
+ * The offset is never moved beyond the terminating NUL: once the end of
+ * the input is reached every further call returns '\0' and leaves the
+ * offset alone.  Without this, a skipper that consumes unconditionally
+ * could push the offset outside the buffer and make the next peek()
+ * read out of bounds.
+ */
+char
+consume(void)
+{
+	char	c;
+
+	c = src[i];
+	if (c != '\0')
+		i++;
+	return c;
+}
+
+/*
+ * s_ws -- skip whitespace.
+ *
+ * Consumes characters for as long as isspace() accepts the current one.
+ * The character is converted to unsigned char first: the <ctype.h>
+ * functions are undefined for negative arguments other than EOF, and a
+ * plain char holding a non-ASCII byte may be negative.
+ */
+void
+s_ws(void)
+{
+	while (isspace((unsigned char)peek()))
+		consume();
+}
+
+/*
+ * s_lc -- skip the rest of the current line.
+ *
+ * Consumes characters up to, but not including, the next newline.  Also
+ * stops at the terminating NUL.
+ */
+void
+s_lc(void)
+{
+	char	c;
+
+	for (;;) {
+		c = peek();
+		if (c == '\0' || c == '\n')
+			return;
+		consume();
+	}
+}
+
+/*
+ * s_bc -- skip a block comment.
+ *
+ * The first two characters are discarded without being examined
+ * (presumably the slash-star opener; the caller has to arrange that).
+ * After that, input is discarded until just past the next star-slash
+ * pair, or until the terminating NUL if the comment is never closed.
+ */
+void
+s_bc(void)
+{
+	consume();
+	consume();
+
+	for (; peek() != '\0'; consume()) {
+		if (peek() != '*' || src[i + 1] != '/')
+			continue;
+		/* found the closer: step over both characters and stop */
+		consume();
+		consume();
+		return;
+	}
+}
+
+/*
+ * id -- lex a word (identifier or keyword) at the current offset.
+ *
+ * Consumes the longest run of alphanumeric characters and underscores.
+ * As many of them as fit are copied into the token's lexeme, which is
+ * always NUL-terminated; any excess characters are consumed but dropped.
+ * The token type comes from lu_kw(): a keyword type, or TOK_IDENT.
+ *
+ * Characters are converted to unsigned char before being handed to
+ * isalnum(), because the <ctype.h> functions are undefined for negative
+ * arguments other than EOF.
+ */
+tok_t
+id(void)
+{
+	tok_t	t;
+	size_t	len;
+	char	c;
+
+	len = 0;
+
+	for (;;) {
+		c = peek();
+		if (!isalnum((unsigned char)c) && c != '_')
+			break;
+		/* keep one byte free for the terminating NUL */
+		if (len < sizeof(t.lexeme) - 1)
+			t.lexeme[len++] = c;
+		consume();
+	}
+
+	t.lexeme[len] = '\0';
+	t.type = lu_kw(t.lexeme);
+	return t;
+}
+
+/*
+ * nexttok -- return the next token of the input.
+ *
+ * Whitespace, line comments (two slashes up to the end of the line) and
+ * block comments are skipped first.  Then:
+ *   - the terminating NUL yields TOK_EOF;
+ *   - a letter or underscore starts a word, which is handed to id();
+ *   - one of . { } ( ) , ; yields the matching punctuation token;
+ *   - anything else, including a slash that does not start a comment,
+ *     is consumed and yields TOK_UNKNOWN.
+ * Except for words, the returned lexeme is the empty string.
+ *
+ * A comment opener is recognised by looking one character ahead without
+ * consuming anything, so s_bc() is entered while still positioned on the
+ * opener, which it discards itself.  Consuming the opener here as well
+ * would make s_bc() eat the first two characters of the comment body and
+ * miss the closer of an empty comment.  Comments are skipped in a loop
+ * rather than by recursion so that a long run of comments does not grow
+ * the call stack.
+ */
+tok_t
+nexttok(void)
+{
+	tok_t	t;
+	char	c;
+
+	t.lexeme[0] = '\0';
+
+	for (;;) {
+		s_ws();
+		c = peek();
+		if (c != '/')
+			break;
+
+		/* src[i] is '/', so src[i + 1] is at worst the NUL */
+		if (src[i + 1] == '/') {
+			s_lc();
+		} else if (src[i + 1] == '*') {
+			s_bc();
+		} else {
+			consume();
+			t.type = TOK_UNKNOWN;
+			return t;
+		}
+	}
+
+	if (c == '\0') {
+		t.type = TOK_EOF;
+		return t;
+	}
+
+	/* <ctype.h> functions need an unsigned char value */
+	if (isalpha((unsigned char)c) || c == '_')
+		return id();
+
+	consume();
+
+	switch (c) {
+	case '{':	t.type = TOK_LBRACE;	break;
+	case '}':	t.type = TOK_RBRACE;	break;
+	case '(':	t.type = TOK_LPAREN;	break;
+	case ')':	t.type = TOK_RPAREN;	break;
+	case ',':	t.type = TOK_COMMA;	break;
+	case ';':	t.type = TOK_SEMI;	break;
+	case '.':	t.type = TOK_DOT;	break;
+	default:	t.type = TOK_UNKNOWN;	break;
+	}
+
+	return t;
+}
+
+/*
+ * tokname -- printable name of a token type.
+ *
+ * Returns a string literal holding the enumerator's name without its
+ * TOK_ prefix.  Every enumerator has its own case; TOK_UNKNOWN and any
+ * value outside the enumeration yield "UNKNOWN".
+ */
+const char *
+tokname(toktype_t t)
+{
+	switch (t) {
+	case TOK_EOF:		return "EOF";
+
+	case TOK_CLASS:		return "CLASS";
+	case TOK_FC:		return "FC";
+	case TOK_USE:		return "USE";
+	case TOK_AS:		return "AS";
+
+	case TOK_F32:		return "F32";
+	case TOK_F64:		return "F64";
+
+	case TOK_I8:		return "I8";
+	case TOK_I16:		return "I16";
+	case TOK_I32:		return "I32";
+	case TOK_I64:		return "I64";
+
+	case TOK_U8:		return "U8";
+	case TOK_U16:		return "U16";
+	case TOK_U32:		return "U32";
+	case TOK_U64:		return "U64";
+
+	case TOK_IDENT:		return "IDENT";
+
+	case TOK_DOT:		return "DOT";
+	case TOK_LBRACE:	return "LBRACE";
+	case TOK_RBRACE:	return "RBRACE";
+	case TOK_LPAREN:	return "LPAREN";
+	case TOK_RPAREN:	return "RPAREN";
+	case TOK_COMMA:		return "COMMA";
+	case TOK_SEMI:		return "SEMI";
+
+	default:		return "UNKNOWN";
+	}
+}
+
diff --git a/lex.h b/lex.h
@@ -0,0 +1,71 @@
+/*
+ * lex.h -- lexer interface
+ */
+
+#ifndef LEX_H
+#define LEX_H
+
+#include <stddef.h>
+
+/* Kinds of token the lexer can produce. */
+typedef enum toktype {
+	TOK_EOF,	/* end of input */
+
+	/* keywords */
+	TOK_CLASS,
+	TOK_FC,
+	TOK_USE,
+	TOK_AS,
+
+	/* type-name keywords */
+	TOK_F32,
+	TOK_F64,
+	TOK_I8,
+	TOK_I16,
+	TOK_I32,
+	TOK_I64,
+	TOK_U8,
+	TOK_U16,
+	TOK_U32,
+	TOK_U64,
+
+	TOK_IDENT,	/* a word that is not a keyword */
+
+	/* single-character punctuation */
+	TOK_DOT,
+	TOK_LBRACE,
+	TOK_RBRACE,
+	TOK_LPAREN,
+	TOK_RPAREN,
+	TOK_COMMA,
+	TOK_SEMI,
+
+	TOK_UNKNOWN	/* a character the lexer does not recognise */
+} toktype_t;
+
+/*
+ * One token: its kind and, for words, the NUL-terminated text it was
+ * read from (the buffer holds at most 63 characters plus the NUL).
+ */
+typedef struct tok {
+	toktype_t	type;
+	char		lexeme[64];
+} tok_t;
+
+/* One keyword table entry: the keyword's spelling and its token type. */
+typedef struct {
+	const char	*name;
+	toktype_t	type;
+} keyword_t;
+
+/*
+ * Lexer state, defined in lex.c: the input text being scanned and the
+ * offset of the next unread character in it.  lex.c initialises i to 0
+ * and does not assign src, so src has to be set before the lexer is used.
+ */
+extern const char	*src;
+extern size_t		i;
+
+/* Keyword lookup: token type of a keyword, or TOK_IDENT for other words. */
+toktype_t	lu_kw(const char *str);
+
+/*
+ * Character access: peek() looks at the current character without
+ * advancing; consume() returns it and steps to the next one.
+ */
+char		peek(void);
+char		consume(void);
+
+/* Skippers for whitespace, the rest of a line, and a block comment. */
+void		s_ws(void);
+void		s_lc(void);
+void		s_bc(void);
+
+/*
+ * Token producers: id() lexes a word at the current offset; nexttok()
+ * skips whitespace and comments and returns the next token of any kind.
+ */
+tok_t		id(void);
+tok_t		nexttok(void);
+
+/* Printable name of a token type, for diagnostics. */
+const char	*tokname(toktype_t t);
+
+#endif /* LEX_H */
+

	slash slash is a simple type-oriented programming language
	Log \| Files \| Refs \| README \| LICENSE

M	Makefile	\|	2	+-
A	lex.c	\|	241	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	lex.h	\|	71	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++