Advertisement
fooker

lalr parser

Feb 25th, 2025
233
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 12.46 KB | None | 0 0
  1. from collections import defaultdict
  2. from copy import deepcopy
  3.  
  4. def convert_grammar_to_productions(file_path):
  5.     with open(file_path, 'r') as file:
  6.         grammar_content = file.read().strip()
  7.    
  8.     lines = [line.strip() for line in grammar_content.split('\n') if line.strip()]
  9.    
  10.     productions = []
  11.    
  12.     nonterminals = []
  13.     for line in lines:
  14.         left_side = line.split('->')[0].strip()
  15.         if left_side not in nonterminals:
  16.             nonterminals.append(left_side)
  17.    
  18.     all_symbols = set()
  19.     for line in lines:
  20.         left_side, right_side = [part.strip() for part in line.split('->')]
  21.        
  22.         alternatives = [alt.strip() for alt in right_side.split('|')]
  23.        
  24.         for alt in alternatives:
  25.             symbols = alt.split()
  26.            
  27.             for symbol in symbols:
  28.                 if symbol != "":
  29.                     all_symbols.add(symbol)
  30.            
  31.             productions.append((left_side, symbols))
  32.    
  33.     terminals = [symbol for symbol in all_symbols if symbol not in nonterminals]
  34.    
  35.     terminals.append('$')
  36.    
  37.     return productions, nonterminals, terminals
  38.  
  39. file_path = "text.txt"
  40. productions, nonterminals, terminals = convert_grammar_to_productions(file_path)
  41. print(productions)
  42. print(nonterminals)
  43. print(terminals)
  44.  
  45. def print_grammar(prods, title="Original Grammar"):
  46.     print(f"\n{title}:")
  47.     for lhs, rhs in prods:
  48.         print(f"  {lhs} -> {' '.join(rhs)}")
  49.  
  50. def format_item(item):
  51.     lhs, rhs, dot, lookahead = item
  52.     rhs_list = list(rhs)
  53.     rhs_with_dot = rhs_list[:]
  54.     rhs_with_dot.insert(dot, "•")
  55.     lookahead_str = "/".join(sorted(lookahead))
  56.     return f"{lhs} -> {' '.join(rhs_with_dot)}, {{{lookahead_str}}}"
  57.  
  58. def print_state(state, state_id):
  59.     print(f"State I{state_id}:")
  60.     for item in state:
  61.         print("  " + format_item(item))
  62.     print()
  63.  
  64. def print_states(states, title="States"):
  65.     print(f"\n{title}:")
  66.     for i, state in enumerate(states):
  67.         print_state(state, i)
  68.  
  69. def print_first_sets(first):
  70.     print("\nFIRST Sets:")
  71.     for sym in sorted(first.keys()):
  72.         print(f"  FIRST({sym}) = {{ {', '.join(sorted(first[sym]))} }}")
  73.     print()
  74.  
  75. def print_goto_transitions(merged_states, prods, first, nonterms, terms, core_to_index):
  76.     print("\nGOTO Transitions (Merged LALR States):")
  77.     def compute_merged_goto(state, symbol):
  78.         g = set()
  79.         for item in state:
  80.             lhs, rhs, dot, lookahead = item
  81.             if dot < len(rhs) and rhs[dot] == symbol:
  82.                 g.add((lhs, rhs, dot+1, lookahead))
  83.         if not g:
  84.             return None
  85.         g = closure(g, prods, first, nonterms)
  86.         core = frozenset((it[0], it[1], it[2]) for it in g)
  87.         return core_to_index.get(core, None)
  88.    
  89.     all_symbols = nonterms + terms
  90.     for i, state in enumerate(merged_states):
  91.         for sym in all_symbols:
  92.             tgt = compute_merged_goto(state, sym)
  93.             if tgt is not None:
  94.                 print(f"  GOTO(I{i}, {sym}) = I{tgt}")
  95.     print()
  96.  
  97. def print_parsing_table(table, terms, nonterms):
  98.     headers = terms + ['$'] + nonterms
  99.     header_line = "{:>8}" * (len(headers) + 1)
  100.     print("\nLALR Parsing Table:")
  101.     print(header_line.format("", *headers))
  102.     for state in sorted(table.keys()):
  103.         row = []
  104.         for sym in headers:
  105.             row.append(table[state].get(sym, ""))
  106.         print(header_line.format(f"I{state}", *row))
  107.  
  108. def augment_grammar(prods, start_sym):
  109.     new_start = start_sym + "'"
  110.     while any(prod[0] == new_start for prod in prods):
  111.         new_start += "'"
  112.     augmented = [(new_start, [start_sym])] + prods
  113.     return augmented, new_start
  114.  
  115. def compute_first(prods, nonterms, terms):
  116.     first = {sym: set() for sym in nonterms}
  117.     for t in terms:
  118.         first[t] = {t}
  119.     changed = True
  120.     while changed:
  121.         changed = False
  122.         for lhs, rhs in prods:
  123.             before = len(first[lhs])
  124.             if rhs == ["^"]:
  125.                 first[lhs].add("^")
  126.             else:
  127.                 for symbol in rhs:
  128.                     first[lhs].update(first[symbol] - {"^"})
  129.                     if "^" not in first[symbol]:
  130.                         break
  131.                 else:
  132.                     first[lhs].add("^")
  133.             if len(first[lhs]) > before:
  134.                 changed = True
  135.     return first
  136.  
  137. def first_of_string(symbols, first):
  138.     result = set()
  139.     for symbol in symbols:
  140.         if symbol not in first:
  141.             first[symbol] = {symbol}
  142.         result.update(first[symbol] - {"^"})
  143.         if "^" not in first[symbol]:
  144.             break
  145.     else:
  146.         result.add("^")
  147.     return result
  148.  
  149. def closure(items, prods, first, nonterms):
  150.     closure_set = set(items)
  151.     added = True
  152.     while added:
  153.         added = False
  154.         new_items = set()
  155.         for item in closure_set:
  156.             lhs, rhs, dot, lookahead = item
  157.             if dot < len(rhs):
  158.                 symbol = rhs[dot]
  159.                 if symbol in nonterms:
  160.                     beta = list(rhs[dot+1:])
  161.                     for prod in prods:
  162.                         if prod[0] == symbol:
  163.                             seq = beta + list(lookahead)
  164.                             la = first_of_string(seq, first)
  165.                             new_item = (symbol, tuple(prod[1]), 0, frozenset(la))
  166.                             if new_item not in closure_set:
  167.                                 new_items.add(new_item)
  168.         if new_items:
  169.             closure_set |= new_items
  170.             added = True
  171.     return frozenset(closure_set)
  172.  
  173. def goto(items, symbol, prods, first, nonterms):
  174.     goto_set = set()
  175.     for item in items:
  176.         lhs, rhs, dot, lookahead = item
  177.         if dot < len(rhs) and rhs[dot] == symbol:
  178.             goto_set.add((lhs, rhs, dot+1, lookahead))
  179.     if not goto_set:
  180.         return frozenset()
  181.     return closure(goto_set, prods, first, nonterms)
  182.  
  183. def canonical_collection(prods, first, nonterms, terms, start_sym):
  184.     C = []
  185.     initial = closure({(start_sym, tuple(prods[0][1]), 0, frozenset({'$'}))}, prods, first, nonterms)
  186.     C.append(initial)
  187.     added = True
  188.     while added:
  189.         added = False
  190.         for I in C.copy():
  191.             for X in (nonterms + terms):
  192.                 goto_I = goto(I, X, prods, first, nonterms)
  193.                 if goto_I and goto_I not in C:
  194.                     C.append(goto_I)
  195.                     added = True
  196.     return C
  197.  
  198. def merge_states(collection):
  199.     merged = {}
  200.     state_mapping = {}
  201.     for i, I in enumerate(collection):
  202.         core = frozenset((item[0], item[1], item[2]) for item in I)
  203.         if core in merged:
  204.             merged[core] = merged[core] | I
  205.         else:
  206.             merged[core] = I
  207.         state_mapping[i] = core
  208.     merged_states = list(merged.values())
  209.     return merged_states, state_mapping
  210.  
  211. def construct_parsing_table(merged_states, prods, first, nonterms, terms, start_sym):
  212.  
  213.     core_to_index = {}
  214.     for i, state in enumerate(merged_states):
  215.         core = frozenset((item[0], item[1], item[2]) for item in state)
  216.         core_to_index[core] = i
  217.  
  218.     table = {i: {} for i in range(len(merged_states))}
  219.  
  220.     def merged_goto(state, symbol):
  221.         g = set()
  222.         for item in state:
  223.             lhs, rhs, dot, lookahead = item
  224.             if dot < len(rhs) and rhs[dot] == symbol:
  225.                 g.add((lhs, rhs, dot+1, lookahead))
  226.         if not g:
  227.             return None
  228.         g = closure(g, prods, first, nonterms)
  229.         core = frozenset((it[0], it[1], it[2]) for it in g)
  230.         return core_to_index.get(core, None)
  231.  
  232.     for i, state in enumerate(merged_states):
  233.         for item in state:
  234.             lhs, rhs, dot, lookahead = item
  235.             if dot < len(rhs):
  236.                 a = rhs[dot]
  237.                 if a in terms:
  238.                     j = merged_goto(state, a)
  239.                     if j is not None:
  240.                         table[i][a] = f"S{j}"
  241.                 elif a in nonterms:
  242.                     j = merged_goto(state, a)
  243.                     if j is not None:
  244.                         table[i][a] = str(j)
  245.             else:
  246.                 if lhs == start_sym and rhs == tuple(prods[0][1]):
  247.                     for la in lookahead:
  248.                         table[i][la] = "Acc"
  249.                 else:
  250.                     prod_index = None
  251.                     for index, prod in enumerate(prods):
  252.                         if index == 0:
  253.                             continue
  254.                         if lhs == prod[0] and list(rhs) == prod[1]:
  255.                             prod_index = index
  256.                             break
  257.                     if prod_index is None:
  258.                         continue
  259.                     for la in lookahead:
  260.                         if la in table[i]:
  261.                             table[i][la] += f"/R{prod_index}"
  262.                         else:
  263.                             table[i][la] = f"R{prod_index}"
  264.     return table, core_to_index
  265.  
  266. class Node:
  267.     def __init__(self, symbol, children=None):
  268.         self.symbol = symbol
  269.         self.children = children if children is not None else []
  270.     def _str(self, level=0):
  271.         ret = "  " * level + self.symbol + "\n"
  272.         for child in self.children:
  273.             ret += child._str(level + 1)
  274.         return ret
  275.  
  276. def parse_input_string(input_string, table, augmented_productions, new_start):
  277.     tokens = input_string.split() + ["$"]
  278.     state_stack = [0]    
  279.     node_stack = []        
  280.     i = 0
  281.     step = 1
  282.     header = f"{'Step':<5}{'Stack':<20}{'Input':<30}{'Action':<15}"
  283.     print("\n" + header)
  284.     print("-" * len(header))
  285.     while True:
  286.         current_state = state_stack[-1]
  287.         current_token = tokens[i]
  288.         action = table[current_state].get(current_token, "")
  289.         print(f"{step:<5}{str(state_stack):<20}{' '.join(tokens[i:]):<30}{action:<15}")
  290.         if action == "":
  291.             print(f"\nError: No action defined for state {current_state} and token {current_token}.")
  292.             return
  293.         if action.startswith("S"):
  294.             next_state = int(action[1:])
  295.             state_stack.append(next_state)
  296.             node_stack.append(Node(current_token))
  297.             i += 1
  298.         elif action.startswith("R"):
  299.             prod_index = int(action.split("/")[0][1:])
  300.             lhs, rhs = augmented_productions[prod_index]
  301.             num_to_pop = len(rhs)
  302.             children = []
  303.             for _ in range(num_to_pop):
  304.                 state_stack.pop()
  305.                 children.append(node_stack.pop())
  306.             children.reverse()
  307.             new_node = Node(lhs, children)
  308.             node_stack.append(new_node)
  309.             goto_action = table[state_stack[-1]].get(lhs, "")
  310.             if goto_action == "":
  311.                 print(f"\nError: No goto action for state {state_stack[-1]} and nonterminal {lhs}.")
  312.                 return
  313.             state_stack.append(int(goto_action))
  314.         elif action == "Acc":
  315.             print(f"\nParsing Successful!")
  316.             break
  317.         step += 1
  318.  
  319.     # After parsing is complete, print the parse tree
  320.     print("\nParse Tree:")
  321.     print(node_stack[-1]._str())
  322.  
  323.  
  324. def main():
  325.     global productions, nonterminals, terminals
  326.    
  327.     print_grammar(productions, "Original Grammar")
  328.    
  329.     augmented_productions, new_start = augment_grammar(productions, nonterminals[0])
  330.     print_grammar(augmented_productions, "Augmented Grammar")
  331.    
  332.     if new_start not in nonterminals:
  333.         nonterminals.insert(0, new_start)
  334.    
  335.     first_sets = compute_first(augmented_productions, nonterminals, terminals)
  336.     if '$' not in first_sets:
  337.         first_sets['$'] = {'$'}
  338.     print_first_sets(first_sets)
  339.    
  340.     C = canonical_collection(augmented_productions, first_sets, nonterminals, terminals, new_start)
  341.     print_states(C, "Canonical LR(1) Item Sets")
  342.    
  343.     merged_states, state_mapping = merge_states(C)
  344.     print_states(merged_states, "Merged LALR States")
  345.    
  346.     table, core_to_index = construct_parsing_table(merged_states, augmented_productions, first_sets, nonterminals, terminals, new_start)
  347.    
  348.     print_goto_transitions(merged_states, augmented_productions, first_sets, nonterminals, terminals, core_to_index)
  349.    
  350.     print_parsing_table(table, terminals, nonterminals)
  351.  
  352.     input_str = input("Input string for parsing: ")
  353.     print("\nParsing Input String:", input_str)
  354.     parse_input_string(input_str, table, augmented_productions, new_start)
  355.  
  356. if __name__ == "__main__":
  357.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement