s502 assembler
A very simple assembler for the 6502 line of processors written in C
tokenFunc.c
Go to the documentation of this file.
1 #include "debugmalloc.h"
2 
3 #include <stdio.h>
4 #include <string.h>
5 #include <stdlib.h>
6 
7 #include "tokenFunc.h"
8 #include "util.h"
9 #include "number.h"
10 #include "state.h"
11 #include "logging.h"
12 #include "directive.h"
13 
20 void token_print(Token* token) {
21  printf("\t%s:%d:%d\t\t'%.*s'\n", token->source.fname, token->source.lineno, token->len, token->len, token->stripped);
22 }
23 
31  token->instr.inst = instruction_find(s->instr, token->stripped);
32  if (token->instr.inst == NULL) return -1;
33  return 0;
34 }
35 
36 /*
37 Addressing modes
38 
39 The 6502 supports 13 addressing modes:
40 
41 Accumulator OPC A
42 
43 Absolute OPC $AABB
44 Absolute, X OPC $AABB, X
45 Absolute, Y OPC $AABB, Y
46 
47 Immediate OPC #$BB
48 Implied OPC
49 
50 Indirect OPC ($AABB)
51 Indirect, X OPC ($BB, X)
52 Indirect, Y OPC ($BB), Y
53 
54 Relative OPC $BB
55 
56 Zeropage OPC *$BB
57 Zeropage, X OPC *$BB, X
58 Zeropage, Y OPC *$BB, Y
59 
60 There are a few caveats though
61 
62 First, relative addressing (which is used with branches) should actually take a 16-bit target address; the assembler should compute the difference or return an error, and it should NOT support setting the offset operand by hand
63 In either case, relative and absolute operands are not easily distinguishable by their form alone
64 
65 BUT!
66 There is no instruction with a relative A-mode and also any other mode, so if the instruction CAN be rel, it MUST be it
67 
68 Also, in the place of any number, there can also be a label or a define location
69 
70 For these reasons, addr mode identification works the following way:
71 
72 1) Token ends after OPC (len=3) => Implied
73 
74 2) Token len is 5,
75 ends with 'A' or 'a' => Accumulator
76 
77 3) 5th char (index 4, the first operand char) is a # => Immediate
78 
79 4) Can it be relative?
80 (based on the mnemonic) => Relative
81 
82 (we now got rid of most simple cases)
83 
84 5) 5th char (index 4, the first operand char) is a *
85 - count X, Y and , chars
86 - validate combinations => Zeropage and variants
87 
88 6) 5th char (index 4, the first operand char) is a (
89 - count X, Y, comma and )
90 - validate => Indirect and variants
91 
92 7) It's some kind of absolute
93 - count X, Y and comma
94 - validate => Absolute and variants
95 
96 
97 This _should_ work, but is not perfect. It should be possible to refactor this into a state machine and clean it up this way.
98 
99 */
100 
110 
111  // step 1 - implied
112  if (t->len == 3) {
113  t->instr.addressmode = ADRM_IMP;
114  return 0;
115  }
116 
117 
118  // step 2 - acc
119  if (t->len == 5 && util_match_char(t->stripped[4], 'a')) {
120  t->instr.addressmode = ADRM_ACC;
121  return 0;
122  }
123 
124 
125  // step 3 - imm
126  if (t->stripped[4] == '#') {
127  t->instr.addressmode = ADRM_IMM;
128  return 0;
129  }
130 
131 
132  // step 4 - relative
133  if (t->instr.inst->opcs[ADRM_REL] != OPC_INVALID) {
134  // this can ONLY be a relative
135  t->instr.addressmode = ADRM_REL;
136  return 0;
137  }
138 
139 
140 
141  // step 5,6,7 have a few checks in common
142  // so we can avoid code duplication by fatoring that out
143 
144  int s_x = 0, s_y = 0, s_close = 0, s_sep = 0; // seen x, y, close or coma
145 
146  for (int i = 0; i < t->len; i++) {
147  if (util_match_char(t->stripped[i], 'x') && s_sep)
148  s_x = 1;
149  if (util_match_char(t->stripped[i], 'y') && s_sep)
150  s_y = 1;
151  if (t->stripped[i] == ',')
152  s_sep = 1;
153  if (t->stripped[i] == ')')
154  s_close = 1;
155  }
156 
157  // must have a separator if it has an index
158  if ((s_x || s_y) && !s_sep)
159  return -1;
160 
161  // can not have both indexes
162  if (s_x && s_y)
163  return -1;
164 
165  // end of common chekcs
166 
167 
168  // step 5 - zeropage
169  if (t->stripped[4] == '*') {
170  // must NOT have a close
171  if (s_close)
172  return -1;
173 
174  // no index - normal zpg
175  if (!(s_x || s_y)) {
176  t->instr.addressmode = ADRM_ZPG;
177  return 0;
178  }
179 
180  // Set according to index
181  t->instr.addressmode = s_x ? ADRM_ZPG_X : ADRM_ZPG_Y;
182  return 0;
183  }
184 
185  // step 6 - indirect
186  if (t->stripped[4] == '(') {
187  // must have a close )
188  if (!s_close)
189  return -1;
190 
191  // no index - normal inderect
192  if (!(s_x || s_y)) {
193  t->instr.addressmode = ADRM_IND;
194  return 0;
195  }
196 
197  // Set according to index
198  t->instr.addressmode = s_x ? ADRM_IND_X : ADRM_IND_Y;
199  return 0;
200  }
201 
202 
203  // step 7 - absolute
204  {
205  // must not have a close )
206  if (s_close)
207  return -1;
208 
209  // no index - normal absolute
210  if (!(s_x || s_y)) {
211  t->instr.addressmode = ADRM_ABS;
212  return 0;
213  }
214 
215  // Set according to index
216  t->instr.addressmode = s_x ? ADRM_ABS_X : ADRM_ABS_Y;
217  return 0;
218  }
219 
220  return -1;
221 }
222 
223 
225  if (token_link_instruction(s, t) < 0) {
226  ERROR("Unknown instruction!\n");
227  goto ERR;
228  }
229  if (token_get_addressmode(t) < 0) {
230  ERROR("Can not determine instruction address mode!\n");
231  goto ERR;
232  }
233  if (t->instr.inst->opcs[t->instr.addressmode] == OPC_INVALID) {
234  ERROR("Invalid instruction-addressmode combination!\n");
235  ERROR("A-mode: %s\n", ADRM_NAMES[t->instr.addressmode]);
236  goto ERR;
237  }
238 
239  t->binSize = 1 + ADRM_SIZES[t->instr.addressmode];
240 
241  return 0;
242 
243 
244 ERR:
245  token_print(t);
246  FAIL("Error while analyzing token: \n");
247  return -1;
248 }
249 
250 
252  // how many token types does it fit
253  int found = 0;
254 
255  // directive - starts with a dot
256  if (t->stripped[0] == '.') {
257  t->type = TT_DIRECTIVE;
258  found++;
259  }
260 
261  // label - ends with a ':'
262  if (t->stripped[t->len - 1] == ':') {
263  t->type = TT_LABEL;
264  found++;
265  }
266 
267  // instruction - 3rd char is a space or len is 3
268  if (t->stripped[3] == ' ' || t->stripped[3] == '\0') {
269  t->type = TT_INSTR;
270  t->instr.number = -1;
271  found++;
272  }
273 
274  // 0 or more than one match is a problem
275  if (found != 1) {
276  ERROR("Can not recognize token:\n");
277  token_print(t);
278  return -1;
279  }
280 
281  LOG(4, "Recognized token as %d:\n", t->type);
282  LOGDO(4, token_print(t));
283 
284  return 0;
285 }
286 
288  if (t->type != TT_INSTR) return 0;
289  if (t->binSize == 1) {
290  t->instr.number = 0;
291  return 0;
292  }
293 
294  char* begin = &t->stripped[4];
295  char* end;
296  for (; *begin != 0 && (*begin == ' ' || *begin == '*' || *begin == '(' || *begin == '#'); begin++);
297  for (end = begin; *end != 0 && *end != ')' && *end != ',' && *end != ' '; end++);
298 
299  char* buff = malloc(end - begin + 1);
300  strncpy(buff, begin, end - begin + 1);
301  buff[end - begin] = 0;
302 
303  int n = number_get_number(s, buff);
304  free(buff);
305 
306  if (n == NUMBER_ERROR) {
307  token_print(t);
308  FAIL("Opcode operand parsing failed!\n");
309  return -1;
310  }
311  if (n == NUMBER_LABEL_NODEF) {
312  return 0;
313  }
314  t->instr.number = n;
315  return 0;
316 }
317 
318 int token_compile(State* s, Token* t, char** dataptr) {
319  if (t->type != TT_INSTR) {
320  if (t->type == TT_DIRECTIVE) {
321  return directive_compile(s, t, dataptr);
322  }
323  // should NEVER reach this:
324  ERROR("Something went terribly wrong!\n");
325  ERROR("A LABEL in last pass!\n");
326  return -1;
327  }
328  int size = t->binSize;
329  char* data = malloc(size);
330  *dataptr = data;
331  if (t->instr.addressmode == ADRM_REL) {
332  int n = t->instr.number - t->instr.address - 2;
333 
334  if (-128 > n || 127 < n) {
335  ERROR("Relative addressing jump too far!\n");
336  printf("Target: $%x, from: $%x (diff: $%x)\n", t->instr.number, t->instr.address, n);
337  token_print(t);
338  free(data);
339  *dataptr = NULL;
340  return -1;
341  }
342  t->instr.number = n;
343  }
344  data[0] = t->instr.inst->opcs[t->instr.addressmode];
345  if (size > 1) {
346  data[1] = t->instr.number & 0xff;
347  }
348  if (size > 2) {
349  data[2] = (t->instr.number >> 8) & 0xff;
350  }
351  return 0;
352 }
ADRM_ZPG_X
@ ADRM_ZPG_X
Zeropage, X indexed.
Definition: addressmode.h:45
TT_INSTR
@ TT_INSTR
instruction token
Definition: token_t.h:26
Token::instr
struct Token::@1 instr
instruction data. Not used if token type is not TT_INSTR
ADRM_NAMES
const char * ADRM_NAMES[]
Human-readable names of address modes.
Definition: addressmode.c:16
Token::stripped
char stripped[TOKEN_BUFFER_SIZE]
stripped text from source file
Definition: token_t.h:55
ADRM_REL
@ ADRM_REL
Relative (8bit, signed 2's complement)
Definition: addressmode.h:39
Token::token_link_instruction
int token_link_instruction(State *s, Token *token)
find and link the instruction entry for a token
Definition: tokenFunc.c:30
Token::token_get_addressmode
int token_get_addressmode(Token *t)
Determine the address mode of a token.
Definition: tokenFunc.c:109
ADRM_ZPG
@ ADRM_ZPG
Zeropage.
Definition: addressmode.h:42
ADRM_IND_X
@ ADRM_IND_X
Indirect (8 bit), X indexed.
Definition: addressmode.h:33
ADRM_IND
@ ADRM_IND
Indirect (16 bit)
Definition: addressmode.h:30
State::instr
Instruction * instr
instruction data
Definition: state.h:40
LOG
#define LOG(LVL,...)
logging macro - works like printf
Definition: logging.h:28
State
Compiler pseudo-global state.
Definition: state.h:32
LOGDO
#define LOGDO(LVL, x)
Conditional macro. Wraps contents into a conditional based on log level.
Definition: logging.h:35
ADRM_SIZES
int ADRM_SIZES[ADRM_COUNT+1]
operand sizes of addressmodes
Definition: addressmode.c:37
Token::token_analyze_instruction
int token_analyze_instruction(State *s, Token *t)
analyze instruction token (instruction, addressmode & operand)
Definition: tokenFunc.c:224
ADRM_ZPG_Y
@ ADRM_ZPG_Y
Zeropage, Y indexed.
Definition: addressmode.h:48
directive.h
step 1 and 3 processing for directive tokens
util_match_char
int util_match_char(char a, char b)
Case-insensitive character compare.
Definition: util.c:21
Token
Token type to store token information.
Definition: token_t.h:37
ADRM_ACC
@ ADRM_ACC
Accumulator.
Definition: addressmode.h:12
Token::binSize
int binSize
number of bytes this token will generate
Definition: token_t.h:39
TT_DIRECTIVE
@ TT_DIRECTIVE
directive token
Definition: token_t.h:28
tokenFunc.h
Token type member methods.
Token::type
enum tokenType type
type of this token
Definition: token_t.h:41
Token::token_print
void token_print(Token *token)
Pretty-print one token, with its source and length.
Definition: tokenFunc.c:20
directive_compile
int directive_compile(State *s, Token *t, char **dataptr)
Compile a directive into binary data.
Definition: directive.c:498
ADRM_IMM
@ ADRM_IMM
Immidiate.
Definition: addressmode.h:24
TT_LABEL
@ TT_LABEL
label token
Definition: token_t.h:30
Token::token_recognize
int token_recognize(Token *t)
Parse token - test if it's an opcode, a label or a directive.
Definition: tokenFunc.c:251
ADRM_IND_Y
@ ADRM_IND_Y
Indirect (8 bit), Y indexed.
Definition: addressmode.h:36
Token::token_compile
int token_compile(State *s, Token *t, char **dataptr)
compile token into binary data
Definition: tokenFunc.c:318
logging.h
logging and fancy-printing
Token::source
struct Token::@2 source
source of this token
Token::len
int len
length of stripped text
Definition: token_t.h:57
Token::token_get_operand
int token_get_operand(State *s, Token *t)
parse the operand of the instruction as a number
Definition: tokenFunc.c:287
ADRM_ABS_Y
@ ADRM_ABS_Y
Absolute, Y indexed.
Definition: addressmode.h:21
state.h
implement State class
Instruction::instruction_find
Instruction * instruction_find(Instruction *list, char *mnem)
find the Instruction entry for a given mnemonic
Definition: instructions.c:111
ADRM_ABS_X
@ ADRM_ABS_X
Absolute, X indexed.
Definition: addressmode.h:18
NUMBER_ERROR
@ NUMBER_ERROR
Could not parse a number or constant is undefined.
Definition: number.h:15
number.h
Number module to parse numbers.
util.h
various utility functions
ADRM_IMP
@ ADRM_IMP
Implied (no operand)
Definition: addressmode.h:27
number_get_number
int number_get_number(State *s, char *str)
interpret a string as a constant, label or number
Definition: number.c:75
ADRM_ABS
@ ADRM_ABS
Absolute addressing.
Definition: addressmode.h:15
OPC_INVALID
@ OPC_INVALID
An invalid opcode to signal invalid / non-existent variations.
Definition: instructions.h:15
FAIL
#define FAIL(...)
Fancy-print a fail (failed step). Works like printf.
Definition: logging.h:45
ERROR
#define ERROR(...)
Fancy-print an error (cause of faliure). Works like printf.
Definition: logging.h:40
NUMBER_LABEL_NODEF
@ NUMBER_LABEL_NODEF
Undefined label.
Definition: number.h:17