lalr parser

from collections import defaultdict
from copy import deepcopy

def convert_grammar_to_productions(file_path):
    with open(file_path, 'r') as file:
        grammar_content = file.read().strip()

    lines = [line.strip() for line in grammar_content.split('\n') if line.strip()]

    productions = []

    nonterminals = []
    for line in lines:
        left_side = line.split('->')[0].strip()
        if left_side not in nonterminals:
            nonterminals.append(left_side)

    all_symbols = set()
    for line in lines:
        left_side, right_side = [part.strip() for part in line.split('->')]

        alternatives = [alt.strip() for alt in right_side.split('|')]

        for alt in alternatives:
            symbols = alt.split()

            for symbol in symbols:
                if symbol != "":
                    all_symbols.add(symbol)

            productions.append((left_side, symbols))

    terminals = [symbol for symbol in all_symbols if symbol not in nonterminals]

    terminals.append('$')

    return productions, nonterminals, terminals

file_path = "text.txt"
productions, nonterminals, terminals = convert_grammar_to_productions(file_path)
print(productions)
print(nonterminals)
print(terminals)

def print_grammar(prods, title="Original Grammar"):
    print(f"\n{title}:")
    for lhs, rhs in prods:
        print(f"  {lhs} -> {' '.join(rhs)}")

def format_item(item):
    lhs, rhs, dot, lookahead = item
    rhs_list = list(rhs)
    rhs_with_dot = rhs_list[:]
    rhs_with_dot.insert(dot, "•")
    lookahead_str = "/".join(sorted(lookahead))
    return f"{lhs} -> {' '.join(rhs_with_dot)}, {{{lookahead_str}}}"

def print_state(state, state_id):
    print(f"State I{state_id}:")
    for item in state:
        print("  " + format_item(item))
    print()

def print_states(states, title="States"):
    print(f"\n{title}:")
    for i, state in enumerate(states):
        print_state(state, i)

def print_first_sets(first):
    print("\nFIRST Sets:")
    for sym in sorted(first.keys()):
        print(f"  FIRST({sym}) = {{ {', '.join(sorted(first[sym]))} }}")
    print()

def print_goto_transitions(merged_states, prods, first, nonterms, terms, core_to_index):
    print("\nGOTO Transitions (Merged LALR States):")
    def compute_merged_goto(state, symbol):
        g = set()
        for item in state:
            lhs, rhs, dot, lookahead = item
            if dot < len(rhs) and rhs[dot] == symbol:
                g.add((lhs, rhs, dot+1, lookahead))
        if not g:
            return None
        g = closure(g, prods, first, nonterms)
        core = frozenset((it[0], it[1], it[2]) for it in g)
        return core_to_index.get(core, None)

    all_symbols = nonterms + terms
    for i, state in enumerate(merged_states):
        for sym in all_symbols:
            tgt = compute_merged_goto(state, sym)
            if tgt is not None:
                print(f"  GOTO(I{i}, {sym}) = I{tgt}")
    print()

def print_parsing_table(table, terms, nonterms):
    headers = terms + ['$'] + nonterms
    header_line = "{:>8}" * (len(headers) + 1)
    print("\nLALR Parsing Table:")
    print(header_line.format("", *headers))
    for state in sorted(table.keys()):
        row = []
        for sym in headers:
            row.append(table[state].get(sym, ""))
        print(header_line.format(f"I{state}", *row))

def augment_grammar(prods, start_sym):
    new_start = start_sym + "'"
    while any(prod[0] == new_start for prod in prods):
        new_start += "'"
    augmented = [(new_start, [start_sym])] + prods
    return augmented, new_start

def compute_first(prods, nonterms, terms):
    first = {sym: set() for sym in nonterms}
    for t in terms:
        first[t] = {t}
    changed = True
    while changed:
        changed = False
        for lhs, rhs in prods:
            before = len(first[lhs])
            if rhs == ["^"]:
                first[lhs].add("^")
            else:
                for symbol in rhs:
                    first[lhs].update(first[symbol] - {"^"})
                    if "^" not in first[symbol]:
                        break
                else:
                    first[lhs].add("^")
            if len(first[lhs]) > before:
                changed = True
    return first

def first_of_string(symbols, first):
    result = set()
    for symbol in symbols:
        if symbol not in first:
            first[symbol] = {symbol}
        result.update(first[symbol] - {"^"})
        if "^" not in first[symbol]:
            break
    else:
        result.add("^")
    return result

def closure(items, prods, first, nonterms):
    closure_set = set(items)
    added = True
    while added:
        added = False
        new_items = set()
        for item in closure_set:
            lhs, rhs, dot, lookahead = item
            if dot < len(rhs):
                symbol = rhs[dot]
                if symbol in nonterms:
                    beta = list(rhs[dot+1:])
                    for prod in prods:
                        if prod[0] == symbol:
                            seq = beta + list(lookahead)
                            la = first_of_string(seq, first)
                            new_item = (symbol, tuple(prod[1]), 0, frozenset(la))
                            if new_item not in closure_set:
                                new_items.add(new_item)
        if new_items:
            closure_set |= new_items
            added = True
    return frozenset(closure_set)

def goto(items, symbol, prods, first, nonterms):
    goto_set = set()
    for item in items:
        lhs, rhs, dot, lookahead = item
        if dot < len(rhs) and rhs[dot] == symbol:
            goto_set.add((lhs, rhs, dot+1, lookahead))
    if not goto_set:
        return frozenset()
    return closure(goto_set, prods, first, nonterms)

def canonical_collection(prods, first, nonterms, terms, start_sym):
    C = []
    initial = closure({(start_sym, tuple(prods[0][1]), 0, frozenset({'$'}))}, prods, first, nonterms)
    C.append(initial)
    added = True
    while added:
        added = False
        for I in C.copy():
            for X in (nonterms + terms):
                goto_I = goto(I, X, prods, first, nonterms)
                if goto_I and goto_I not in C:
                    C.append(goto_I)
                    added = True
    return C

def merge_states(collection):
    merged = {}
    state_mapping = {}
    for i, I in enumerate(collection):
        core = frozenset((item[0], item[1], item[2]) for item in I)
        if core in merged:
            merged[core] = merged[core] | I
        else:
            merged[core] = I
        state_mapping[i] = core
    merged_states = list(merged.values())
    return merged_states, state_mapping

def construct_parsing_table(merged_states, prods, first, nonterms, terms, start_sym):

    core_to_index = {}
    for i, state in enumerate(merged_states):
        core = frozenset((item[0], item[1], item[2]) for item in state)
        core_to_index[core] = i

    table = {i: {} for i in range(len(merged_states))}

    def merged_goto(state, symbol):
        g = set()
        for item in state:
            lhs, rhs, dot, lookahead = item
            if dot < len(rhs) and rhs[dot] == symbol:
                g.add((lhs, rhs, dot+1, lookahead))
        if not g:
            return None
        g = closure(g, prods, first, nonterms)
        core = frozenset((it[0], it[1], it[2]) for it in g)
        return core_to_index.get(core, None)

    for i, state in enumerate(merged_states):
        for item in state:
            lhs, rhs, dot, lookahead = item
            if dot < len(rhs):
                a = rhs[dot]
                if a in terms:
                    j = merged_goto(state, a)
                    if j is not None:
                        table[i][a] = f"S{j}"
                elif a in nonterms:
                    j = merged_goto(state, a)
                    if j is not None:
                        table[i][a] = str(j)
            else:
                if lhs == start_sym and rhs == tuple(prods[0][1]):
                    for la in lookahead:
                        table[i][la] = "Acc"
                else:
                    prod_index = None
                    for index, prod in enumerate(prods):
                        if index == 0:
                            continue
                        if lhs == prod[0] and list(rhs) == prod[1]:
                            prod_index = index
                            break
                    if prod_index is None:
                        continue
                    for la in lookahead:
                        if la in table[i]:
                            table[i][la] += f"/R{prod_index}"
                        else:
                            table[i][la] = f"R{prod_index}"
    return table, core_to_index

class Node:
    def __init__(self, symbol, children=None):
        self.symbol = symbol
        self.children = children if children is not None else []
    def _str(self, level=0):
        ret = "  " * level + self.symbol + "\n"
        for child in self.children:
            ret += child._str(level + 1)
        return ret

def parse_input_string(input_string, table, augmented_productions, new_start):
    tokens = input_string.split() + ["$"]
    state_stack = [0]
    node_stack = []
    i = 0
    step = 1
    header = f"{'Step':<5}{'Stack':<20}{'Input':<30}{'Action':<15}"
    print("\n" + header)
    print("-" * len(header))
    while True:
        current_state = state_stack[-1]
        current_token = tokens[i]
        action = table[current_state].get(current_token, "")
        print(f"{step:<5}{str(state_stack):<20}{' '.join(tokens[i:]):<30}{action:<15}")
        if action == "":
            print(f"\nError: No action defined for state {current_state} and token {current_token}.")
            return
        if action.startswith("S"):
            next_state = int(action[1:])
            state_stack.append(next_state)
            node_stack.append(Node(current_token))
            i += 1
        elif action.startswith("R"):
            prod_index = int(action.split("/")[0][1:])
            lhs, rhs = augmented_productions[prod_index]
            num_to_pop = len(rhs)
            children = []
            for _ in range(num_to_pop):
                state_stack.pop()
                children.append(node_stack.pop())
            children.reverse()
            new_node = Node(lhs, children)
            node_stack.append(new_node)
            goto_action = table[state_stack[-1]].get(lhs, "")
            if goto_action == "":
                print(f"\nError: No goto action for state {state_stack[-1]} and nonterminal {lhs}.")
                return
            state_stack.append(int(goto_action))
        elif action == "Acc":
            print(f"\nParsing Successful!")
            break
        step += 1

    # After parsing is complete, print the parse tree
    print("\nParse Tree:")
    print(node_stack[-1]._str())


def main():
    global productions, nonterminals, terminals

    print_grammar(productions, "Original Grammar")

    augmented_productions, new_start = augment_grammar(productions, nonterminals[0])
    print_grammar(augmented_productions, "Augmented Grammar")

    if new_start not in nonterminals:
        nonterminals.insert(0, new_start)

    first_sets = compute_first(augmented_productions, nonterminals, terminals)
    if '$' not in first_sets:
        first_sets['$'] = {'$'}
    print_first_sets(first_sets)

    C = canonical_collection(augmented_productions, first_sets, nonterminals, terminals, new_start)
    print_states(C, "Canonical LR(1) Item Sets")

    merged_states, state_mapping = merge_states(C)
    print_states(merged_states, "Merged LALR States")

    table, core_to_index = construct_parsing_table(merged_states, augmented_productions, first_sets, nonterminals, terminals, new_start)

    print_goto_transitions(merged_states, augmented_productions, first_sets, nonterminals, terminals, core_to_index)

    print_parsing_table(table, terminals, nonterminals)

    input_str = input("Input string for parsing: ")
    print("\nParsing Input String:", input_str)
    parse_input_string(input_str, table, augmented_productions, new_start)

if __name__ == "__main__":
    main()