Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
- #include <ctype.h>
- #include <stdarg.h>
- // ----- FLOAT CHECKER -----
- enum {
- FStart, // starting state
- FSign, // + or - at the state
- FInteger, // the integer amount
- FPoint, // the decimal point
- FFraction, // the fractional amount
- FExpMark, // the E before an exponent amount
- FExpSign, // + or - before the exponent amount
- FExpInteger, // exponent amount
- FError, // error
- };
- struct float_state {
- int plus_minus, is_digit, decimal_point, e, valid;
- } float_state_machine[] = {
- // +/- 0-9 . E valid
- {FSign, FInteger, FError, FError, 0}, // Start
- {FError, FInteger, FError, FError, 0}, // Sign
- {FError, FInteger, FPoint, FExpMark, 2}, // Integer
- {FError, FFraction, FError, FError, 0}, // Point
- {FError, FFraction, FError, FExpMark, 1}, // Fraction
- {FExpSign, FExpInteger, FError, FError, 0}, // ExpMark
- {FError, FExpInteger, FError, FError, 0}, // ExpSign
- {FError, FExpInteger, FError, FError, 1}, // ExpInteger
- {FError, FError, FError, FError, 0}, // Dead
- };
- // Returns 1 if float, 2 if integer, or 0 if neither
- int is_float(const char *string) {
- int state = FStart; // state
- while(*string) {
- char c = *(string++); // get a character
- if(c == '+' || c == '-')
- state = float_state_machine[state].plus_minus;
- else if(isdigit(c))
- state = float_state_machine[state].is_digit;
- else if(c == '.')
- state = float_state_machine[state].decimal_point;
- else if(c == 'E')
- state = float_state_machine[state].e;
- else
- state = FError;
- }
- return float_state_machine[state].valid;
- }
- // Data structure for symbol table entries
- struct symbol_data {
- const char *lexeme;
- int token_category;
- struct symbol_data *next;
- };
- struct symbol_data *symbol_table = NULL;
- // ----- TOKEN INFORMATION -----
- enum token_category {
- t_identifier, // an identifier
- t_integer, // any integer
- t_real, // any real number
- t_string, // a string literal
- t_addsub, // +, -
- t_muldiv, // *, /
- t_logical, // < > >= <= = <>
- t_lparen, // (
- t_rparen, // )
- t_lcurly, // {
- t_rcurly, // }
- t_lsquare, // [
- t_rsquare, // ]
- t_assignment, // :=
- t_comma, // ,
- t_if,
- t_then,
- t_else,
- t_do,
- t_while,
- t_return,
- t_bool,
- t_const,
- t_int,
- t_float,
- t_long,
- t_char,
- t_void,
- t_read,
- t_end_statement, // ;
- t_max_tokens,
- t_first_keyword = t_if,
- t_last_keyword = t_read,
- };
- // The token_strings list lays out the token category name (first element)
- // as well as the strings for each of the tokens in the category (all other elements).
- const char *token_strings[t_max_tokens][20] = {
- {"identifier", NULL}, {"integer", NULL},
- {"real", NULL}, {"string", NULL},
- {"add/sub", "+", "-", NULL},{"mul/div", "*", "/", NULL},
- {"logical", "<", ">", "<=", ">=", "=", "<>", NULL},
- {"lparen", "(", NULL}, {"rparen", ")", NULL},
- {"lcurly", "{", NULL}, {"rcurly", "}", NULL},
- {"lsquare", "[", NULL}, {"rsquare", "]", NULL},
- {"assignment", ":=", NULL}, {"comma", ",", NULL},
- {"if", "if", NULL}, {"then", "then", NULL}, {"else", "else", NULL},
- {"do", "do", NULL}, {"while", "while", NULL},
- {"return", "return", NULL}, {"bool", "bool", NULL},
- {"const", "const", NULL}, {"int", "int", NULL},
- {"float", "float", NULL}, {"long", "long", NULL},
- {"char", "char", NULL}, {"void", "void", NULL},
- {"read", "read", NULL}, {"end_statement", ";", NULL},
- };
- // Data structure for a token
- struct lexeme_token {
- int token_category; // which token category
- int token_value; // which token in the token category
- struct symbol_data *symbol; // symbol table entry
- struct lexeme_token *next;
- };
- // Makes a string representation of a token
- const char *token_print(struct lexeme_token *token) {
- static char buffer[50];
- if(token->symbol)
- sprintf(buffer, "(%s, %s)", token->symbol->lexeme, token_strings[token->token_category][0]);
- else if(!strcmp(token_strings[token->token_category][token->token_value], token_strings[token->token_category][0]))
- sprintf(buffer, "(%s)", token_strings[token->token_category][0]);
- else
- sprintf(buffer, "(%s, %s)", token_strings[token->token_category][token->token_value], token_strings[token->token_category][0]);
- return buffer;
- }
- // ----- STATE VARIABLES -----
- char lexeme[100] = ""; // buffer to accumulate the lexeme in
- int lexeme_index = 0; // index in the buffer
- struct lexeme_token *token_list = NULL;
- struct lexeme_token *token_current = NULL;
- FILE *program_file;
- int c; // current character
- // Error, so exit the whole program
- void error(const char *format, ...) {
- va_list argptr;
- va_start(argptr, format);
- printf("Error: ");
- vprintf(format, argptr);
- putchar('\n');
- va_end(argptr);
- exit(-1);
- }
- // ----- SYMBOL TABLE -----
- // Allocate a new symbol for the symbol table
- struct symbol_data *alloc_symbol(const char *lexeme, int token_category) {
- struct symbol_data *new_symbol;
- if(new_symbol == NULL)
- error("Couldn't allocate symbol");
- new_symbol = (struct symbol_data*)calloc(1, sizeof(struct symbol_data));
- new_symbol->lexeme = strdup(lexeme);
- new_symbol->token_category = token_category;
- new_symbol->next = NULL;
- return new_symbol;
- }
- // Find a symbol in the symbol table and optionally auto-create it if it's not found
- struct symbol_data *find_symbol(const char *lexeme, int token_category, int auto_create) {
- struct symbol_data *symbol;
- struct symbol_data *new_symbol;
- // If the symbol table is empty, handle that
- if(!symbol_table) {
- if(auto_create) {
- new_symbol = alloc_symbol(lexeme, token_category);
- symbol_table = new_symbol;
- return new_symbol;
- } else
- return NULL;
- }
- // Find the symbol if it exists
- struct symbol_data *previous = NULL;
- for(symbol = symbol_table; symbol; symbol = symbol->next) {
- previous = symbol;
- if(!strcmp(symbol->lexeme, lexeme) && symbol->token_category == token_category)
- return symbol;
- }
- // Symbol does not exist, can I create it?
- if(!auto_create)
- return NULL;
- // Create it, add it to the table
- new_symbol = alloc_symbol(lexeme, token_category);
- previous->next = new_symbol;
- return new_symbol;
- }
- // ----- LEXICAL ANALYZER -----
- // Reads the next character from the file
- void get_char() {
- c = fgetc(program_file);
- }
- // Adds a character, and ends the string there
- void add_char() {
- lexeme[lexeme_index++] = c;
- if(lexeme_index >= sizeof(lexeme)) {
- error("Lexeme is too long (%s)", lexeme);
- }
- lexeme[lexeme_index] = 0;
- c = 0;
- }
- // Clear the lexeme buffer
- void clear_lexeme() {
- lexeme[0] = 0;
- lexeme_index = 0;
- }
- // Is the lexeme a keyword?
- int is_keyword() {
- int i;
- for(i = t_first_keyword; i <= t_last_keyword; i++) {
- if(!strcmp(token_strings[i][0], lexeme)) {
- return i;
- }
- }
- return -1;
- }
- // Adds a token to the list of tokens
- void add_token(int token_category, int symbol) {
- // Allocate and set the token
- struct lexeme_token *token = (struct lexeme_token*)calloc(1, sizeof(struct lexeme_token));
- if(!token) {
- error("Can't allocate token");
- }
- if(symbol) {
- // If it's a symbol, maintain the symbol table
- token->symbol = find_symbol(lexeme, token_category, 1);
- } else {
- // If it's not a symbol, loop to find the correct token category subtype to use
- for(int i=0; token_strings[token_category][i]; i++)
- if(!strcmp(token_strings[token_category][i], lexeme)) {
- token->token_value = i;
- break;
- }
- }
- token->token_category = token_category;
- token->next = NULL;
- // Add it to the list
- if(token_current) {
- token_current->next = token;
- }
- token_current = token;
- // Start the list if it's not already started
- if(!token_list) {
- token_list = token;
- }
- // Get ready for the next lexeme
- clear_lexeme();
- }
- // Adds a token consisting of one character
- void add_char_token(int token_category) {
- add_char();
- add_token(token_category, 0);
- }
- // Run the lexical analyzer
- struct lexeme_token *lexical_analyzer(FILE *File) {
- program_file = File;
- token_list = NULL;
- token_current = NULL;
- clear_lexeme();
- // Lexical analyzer main loop
- while(1) {
- // Read a character if we don't have one
- if(c == 0) {
- get_char();
- }
- if(c == EOF) { // Stop if the end was reached
- break;
- }
- // Recognize all the different types of tokens
- if(isspace(c)) { // Ignore space characters
- c = 0;
- } else if(c == '-' || c =='+') {
- add_char_token(t_addsub);
- } else if(c == '*') {
- add_char_token(t_muldiv);
- } else if(c == '(') {
- add_char_token(t_lparen);
- } else if(c == ')') {
- add_char_token(t_rparen);
- } else if(c == '{') {
- add_char_token(t_lcurly);
- } else if(c == '}') {
- add_char_token(t_rcurly);
- } else if(c == '[') {
- add_char_token(t_lsquare);
- } else if(c == ']') {
- add_char_token(t_rsquare);
- } else if(c == ',') {
- add_char_token(t_comma);
- // Comparison operators
- } else if(c == '=') {
- add_char();
- add_token(t_logical, 0);
- } else if(c == '>') {
- add_char();
- get_char();
- if(c == '=') { // >=
- add_char();
- }
- add_token(t_logical, 0);
- } else if(c == '<') {
- add_char();
- get_char();
- if(c == '=' || c == '>') { // <= <>
- add_char();
- }
- add_token(t_logical, 0);
- // Statement separators
- } else if(c == ';') {
- add_char();
- add_token(t_end_statement, 0);
- // "strings"
- } else if(c == '\"') {
- add_char();
- int old_char; // add_char clears c, so keep a copy here
- do {
- get_char();
- old_char = c;
- add_char();
- } while(old_char != '\"' && old_char != EOF);
- add_token(t_string, 1);
- // := assignment statements
- } else if(c == ':') {
- add_char();
- get_char();
- if(c == '=') {
- add_char();
- add_token(t_assignment, 0);
- } else {
- error(": found, expected :=");
- }
- // identifiers or keywords
- } else if(isalpha(c) || c == '_') {
- while(isalnum(c) || c == '_') {
- add_char();
- get_char();
- }
- // Look up what keyword it is, if it's a keyword at all
- int keyword_num = is_keyword();
- if(keyword_num != -1) {
- add_token(keyword_num, 0);
- } else {
- add_token(t_identifier, 1);
- }
- // Integers and floats
- } else if(isdigit(c)) {
- while(isdigit(c) || c == 'E' || c == '.') {
- add_char();
- get_char();
- }
- int is_a_float = is_float(lexeme);
- if(is_a_float == 0)
- error("Invalid number %s", lexeme);
- if(is_a_float == 1) // 1 = float
- add_token(t_real, 1);
- if(is_a_float == 2) // 2 = integer
- add_token(t_integer, 1);
- // If a comment is found, skip to the end of the line
- } else if(c == '/') {
- get_char();
- if(c == '/') {
- do {
- get_char();
- } while(c != '\n' && c != EOF);
- } else { // if only one / then it's division
- clear_lexeme();
- c = '/';
- add_char_token(t_muldiv);
- }
- } else {
- error("Unexpected character %c", c);
- }
- }
- return token_list;
- }
- // ----- PARSE TREE FUNCTIONS -----
- struct syntax_node {
- struct lexeme_token *token;
- struct syntax_node *child, *next;
- };
- struct syntax_node *tree_head = NULL;
- struct syntax_node *tree_current = NULL;
- // Make a new tree node
- struct syntax_node *tree_new() {
- struct syntax_node *node = (struct syntax_node*)calloc(1, sizeof(struct syntax_node));
- node->token = token_current;
- return node;
- }
- // Directly set one node's child
- void tree_add_child(struct syntax_node *parent, struct syntax_node *child) {
- if(!parent->child) {
- // Put it directly in the child pointer if possible
- parent->child = child;
- } else {
- // If there's already a child, find the next free spot for it
- struct syntax_node *temp = tree_current->child;
- while(temp->next)
- temp = temp->next;
- temp->next = child;
- }
- }
- // Add a child to the current node and make that the new current node
- void tree_child() {
- struct syntax_node *node = tree_new();
- if(!tree_head) {
- // no head? set this as the new head
- tree_head = node;
- } else if(!tree_current) {
- // add another thing to global scope
- tree_current = tree_head;
- while(tree_current->next)
- tree_current = tree_current->next;
- tree_current->next = node;
- } else if(!tree_current->child) {
- // there's no child yet, add one
- tree_current->child = node;
- } else {
- // there's already a child, add a sibling
- tree_current = tree_current->child;
- while(tree_current->next)
- tree_current = tree_current->next;
- tree_current->next = node;
- }
- tree_current = node;
- }
- // ----- SYNTACTICAL ANALYZER -----
- // Moves onto the next token in teh program
- void next_token() {
- if(!token_current) {
- puts("Already at the end of the program");
- exit(0);
- }
- token_current = token_current->next;
- }
- enum {
- NEEDED = 1, // mandatory token
- OMIT = 2, // don't include in syntax tree
- TEST = 4, // just test, don't actually accept the token
- };
- // Accept a token from the program and move onto the next one
- int accept(int flags, ...) {
- int passing = 0;
- va_list argptr;
- va_start(argptr, flags);
- // Loop through the list of acceptable token types
- while(1) {
- int value = va_arg(argptr,int);
- if(value == -1) // -1 means end of list
- break;
- if(token_current->token_category == value)
- passing = 1;
- }
- va_end(argptr);
- // If only testing, only return the passing variable
- if(flags & TEST)
- return passing;
- // If it's needed, it's an error if it's not found
- if(!passing && (flags & NEEDED))
- error("Unexpected token, %s", token_strings[token_current->token_category][0]);
- // If it's passing, accept it and get the next token
- if(passing) {
- if(!(flags & OMIT))
- tree_child();
- next_token();
- }
- return passing;
- }
- void expression();
- // Allow there to be an array index after the identifier
- void array_index() {
- struct syntax_node *save = tree_current;
- if(accept(0, t_lsquare, -1)) {
- expression();
- accept(NEEDED|OMIT, t_rsquare, -1);
- }
- tree_current = save;
- }
- // Allow additional variables after this one, for as many commas there are
- void additional_variables() {
- struct syntax_node *save = tree_current;
- while(1) {
- tree_current = save;
- if(accept(OMIT, t_comma, -1)) {
- accept(NEEDED, t_identifier, -1);
- array_index();
- continue;
- }
- accept(NEEDED|OMIT, t_end_statement, -1);
- break;
- }
- tree_current = save;
- }
- // Factor, can be any identifier, number, or string
- void factor() {
- struct syntax_node *save = tree_current;
- if(accept(0, t_identifier, -1)) {
- array_index();
- if(accept(0, t_lparen, -1)) { // function call
- if(!accept(OMIT, t_rparen, -1)) {
- do {
- expression();
- } while(accept(OMIT, t_comma, -1));
- accept(NEEDED|OMIT, t_rparen, -1);
- }
- }
- } else if(accept(0, t_integer, t_real, t_string, -1)) {
- } else if(accept(0, t_lparen, -1)) {
- expression();
- accept(NEEDED|OMIT, t_rparen, -1);
- }
- tree_current = save;
- }
- // Allow multiplication
- void term() {
- struct syntax_node *save = tree_current;
- // make a temporary node to attach the left hand side onto
- struct syntax_node temp = {NULL, NULL, NULL};
- tree_current = &temp;
- factor();
- tree_current = save;
- struct syntax_node *operator = NULL;
- if(accept(0, t_muldiv, -1)) {
- tree_add_child(tree_current, temp.child);
- term();
- } else {
- tree_add_child(save, temp.child);
- }
- tree_current = save;
- }
- // The most outer level, with addition
- void expression() {
- struct syntax_node *save = tree_current;
- accept(0, t_addsub, -1); // optional sign
- // make a temporary node to attach the left hand side onto
- struct syntax_node temp = {NULL, NULL, NULL};
- tree_current = &temp;
- term();
- tree_current = save;
- struct syntax_node *operator = NULL;
- if(accept(0, t_addsub, -1)) {
- tree_add_child(tree_current, temp.child);
- expression();
- } else {
- tree_add_child(save, temp.child);
- }
- tree_current = save;
- }
- // A comparison of some sort
- void condition() {
- struct syntax_node *save = tree_current;
- // make a temporary node to attach the left hand side onto
- struct syntax_node temp = {NULL, NULL, NULL};
- tree_current = &temp;
- expression();
- tree_current = save;
- struct syntax_node *operator = NULL;
- accept(NEEDED, t_logical, -1);
- tree_add_child(tree_current, temp.child);
- expression();
- tree_current = save;
- }
- // One statement of any kind
- void statement() {
- struct syntax_node *save = tree_current;
- if(accept(0, t_const, -1)) {
- // Constant definition
- struct syntax_node *save_const = tree_current;
- accept(NEEDED, t_bool, t_int, t_float, t_long, t_char, -1);
- accept(NEEDED, t_identifier, -1);
- accept(NEEDED|OMIT, t_assignment, -1);
- expression();
- accept(NEEDED|OMIT, t_end_statement, -1);
- } else if(accept(0, t_bool, t_int, t_float, t_long, t_char, -1)) {
- // Variable declarations inside the function
- accept(NEEDED, t_identifier, -1);
- array_index();
- additional_variables();
- } else if(accept(0, t_read, -1)) {
- // Read one or more variables
- accept(NEEDED, t_identifier, -1);
- additional_variables();
- } else if(accept(0, t_identifier, -1)) {
- // Assignment or function call
- struct syntax_node *identifier = tree_current;
- // accept an array index if found
- int left_is_array = 0;
- struct syntax_node temp = {NULL, NULL, NULL};
- if(accept(TEST, t_lsquare, -1)) {
- tree_current = &temp;
- left_is_array = 1;
- array_index();
- tree_current = identifier;
- }
- if(accept(0, t_assignment, -1)) { // assignment
- struct syntax_node *assignment = tree_current;
- if(left_is_array)
- assignment->child = temp.child;
- struct lexeme_token *identifier_token = identifier->token;
- struct lexeme_token *assignment_token = assignment->token;
- // Swap the tokens
- identifier->token = assignment_token;
- assignment->token = identifier_token;
- tree_current = identifier;
- expression();
- } else if(accept(0, t_lparen, -1)) { // function call
- if(!accept(OMIT, t_rparen, -1)) {
- // If there's arguments,
- // read expressions until there's no more commas
- do {
- expression();
- } while(accept(OMIT, t_comma, -1));
- accept(NEEDED|OMIT, t_rparen, -1);
- }
- accept(NEEDED|OMIT, t_end_statement, -1);
- }
- } else if(accept(0, t_lcurly, -1)) {
- // Multiple statements
- while(!accept(OMIT, t_rcurly, -1))
- statement();
- } else if(accept(OMIT, t_end_statement, -1)) {
- // Empty statement
- } else if(accept(0, t_if, -1)) {
- condition();
- accept(NEEDED, t_then, -1);
- statement();
- } else if(accept(0, t_while, -1)) {
- condition();
- accept(NEEDED, t_do, -1);
- statement();
- } else if(accept(0, t_else, -1)) {
- statement();
- } else if(accept(0, t_return, -1)) {
- expression();
- accept(NEEDED|OMIT, t_end_statement, -1);
- } else {
- error("Bad token %s", token_print(token_current));
- }
- tree_current = save;
- }
- // Constant definition
- void constant() {
- struct syntax_node *save = tree_current;
- accept(NEEDED, t_bool, t_int, t_float, t_long, t_char, -1);
- accept(NEEDED, t_identifier, -1);
- accept(NEEDED|OMIT, t_assignment, -1);
- expression();
- accept(NEEDED|OMIT, t_end_statement, -1);
- tree_current = save;
- }
- // Definition of an identifier (variable or function)
- void identifier_definition() {
- struct syntax_node *save = tree_current;
- // variable or function declaration
- accept(NEEDED, t_identifier, -1); // name
- struct syntax_node *function_save = tree_current;
- if(accept(0, t_lparen, -1)) {
- // function definition, start of variable list
- if(accept(0, t_void, -1)) {
- // no parameters
- } else {
- struct syntax_node *parameter_save = tree_current;
- do {
- tree_current = parameter_save;
- accept(NEEDED, t_bool, t_int, t_float, t_long, t_char, t_void, -1);
- accept(NEEDED, t_identifier, -1);
- tree_current = parameter_save;
- } while(accept(OMIT, t_comma, -1)); // keep going if there's a comma
- }
- accept(NEEDED|OMIT, t_rparen, -1);
- tree_current = function_save;
- statement();
- } else {
- // variable declarations
- array_index();
- additional_variables();
- }
- tree_current = save;
- }
- // Run the syntactical analyzer
- void syntactical_analyzer(struct lexeme_token *list) {
- token_current = list;
- while(token_current) {
- tree_current = NULL; // NULL because it's in the global space
- // Get a type name
- if(accept(0, t_const, -1)) {
- constant();
- } else if(accept(NEEDED, t_bool, t_int, t_float, t_long, t_char, t_void, -1)) {
- identifier_definition();
- }
- }
- }
- // ----- MAIN PROGRAM -----
- // Prints out a parse tree graphically
- void print_parse_tree(struct syntax_node *node, int level) {
- while(node) {
- for(int i=0; i<level; i++)
- printf(" ");
- printf(token_print(node->token));
- putchar('\n');
- if(node->child)
- print_parse_tree(node->child, level+1);
- node = node->next;
- }
- }
- int main(int argc, char *argv[]) {
- FILE *File = fopen("sample.txt", "rb");
- struct lexeme_token *list = lexical_analyzer(File);
- puts("Token list:");
- for(token_current = token_list; token_current; token_current = token_current->next) {
- puts(token_print(token_current));
- }
- puts("\n\n\nSymbol table:");
- for(struct symbol_data *symbol = symbol_table; symbol; symbol = symbol->next) {
- printf("(%s, %s)\n", symbol->lexeme, token_strings[symbol->token_category][0]);
- }
- syntactical_analyzer(list);
- puts("\n\n\nSyntax tree:");
- print_parse_tree(tree_head, 0);
- fclose(File);
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement