#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""CBM BASIC V2 (de)tokenization.

:todo: Repeat syntax in escape codes like ``{017*3}``.
:todo: Unicode translation of escape codes and characters to get sources
    looking more like the original. ``£`` instead of a backslash,
    ``←`` instead of ``_`` and so on.
:todo: Choose between lower/upper case character sets.
:todo: Allow keyword escape sequences like ``{print}``.
:todo: Option to list keywords as escape sequences.
:todo: Documentation.
:todo: Let user set start address.
:todo: Factor out escape sequence delimiters as constants.
:todo: Options to list keywords and escape sequence names.
:todo: Warn if line numbers are not in ascending order.
"""
import logging
import os
import re
import sys
from functools import partial
from itertools import imap
from optparse import OptionParser
__all__ = ['detokenize', 'tokenize']
__author__ = "Marc 'BlackJack' Rintsch (marc[at]rintsch[dot]de)"
__version__ = '0.0.0'
logging.basicConfig(level=logging.INFO)
LOG = logging.getLogger(os.path.basename(__file__))
#:
#: Token value of a keyword in the list is the keyword's index + 128.
#:
BASIC_V2_TOKENS = [
    'end', 'for', 'next', 'data', 'input#', 'input', 'dim', 'read', 'let',
    'goto', 'run', 'if', 'restore', 'gosub', 'return', 'rem', 'stop', 'on',
    'wait', 'load', 'save', 'verify', 'def', 'poke', 'print#', 'print', 'cont',
    'list', 'clr', 'cmd', 'sys', 'open', 'close', 'get', 'new', 'tab(', 'to',
    'fn', 'spc(', 'then', 'not', 'step', '+', '-', '*', '/', '^', 'and', 'or',
    '>', '=', '<', 'sgn', 'int', 'abs', 'usr', 'fre', 'pos', 'sqr', 'rnd',
    'log', 'exp', 'cos', 'sin', 'tan', 'atn', 'peek', 'len', 'str$', 'val',
    'asc', 'chr$', 'left$', 'right$', 'mid$', 'go'
]
#:
#: Mapping of keyword to token value.
#:
KEYWORD2VALUE = dict(
    (t, chr(v)) for v, t in enumerate(BASIC_V2_TOKENS, 128)
)
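#
# For example, 'end' is the first keyword and therefore tokenizes to
# chr(128) (0x80), while 'print' sits at index 25 and tokenizes to
# chr(153) (0x99).
#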
#:
#: Regular expression for tokenizing the part after the line number.
#:
#: Matches all keywords, strings, and escape sequences outside strings.
#:
TOKENIZE_RE = re.compile(
    '|'.join(
        re.escape(k) for k in sorted(BASIC_V2_TOKENS, key=len, reverse=True)
    )
    + r'|"[^"]*"|\{[^\}]*\}'
)
#:
#: Regular expression to match line numbers at the beginning of a line.
#:
LINE_NUMBER_RE = re.compile(r'\s*\d+')
#:
#: Escape sequence names.  Each entry pairs a start value with either a
#: single name or a list of names for consecutive character values.
#:
ESCAPE_NAMES = [
    (
        0,
        [
            'null', 'ctrl a', 'ctrl b', 'ctrl c', 'ctrl d', 'white', 'ctrl f',
            'ctrl g', 'up/lo lock on', 'up/lo lock off', 'ctrl j', 'ctrl k',
            'ctrl l', 'return', 'lower case', 'ctrl o', 'ctrl p', 'down',
            'reverse on', 'home', 'delete', 'ctrl u', 'ctrl v', 'ctrl w',
            'ctrl x', 'ctrl y', 'ctrl z', 'esc', 'red', 'right', 'green', 'blue'
        ]
    ),
    (92, 'pound'),
    (95, 'arrow left'),
    (129, 'orange'),
    (
        133,
        [
            'f1', 'f3', 'f5', 'f7', 'f2', 'f4', 'f6', 'f8', 'shift return',
            'upper case'
        ]
    ),
    (
        144,
        [
            'black', 'up', 'reverse off', 'clear', 'insert', 'brown',
            'light red', 'dark gray', 'gray', 'light green', 'light blue',
            'light gray', 'purple', 'left', 'yellow', 'cyan', 'shift space',
            'cbm k', 'cbm i', 'cbm t', 'cbm @', 'cbm g', 'cbm +', 'cbm m',
            'cbm pound', 'shift pound', 'cbm n', 'cbm q', 'cbm d', 'cbm z',
            'cbm s', 'cbm p', 'cbm a', 'cbm e', 'cbm r', 'cbm w', 'cbm h',
            'cbm j', 'cbm l', 'cbm y', 'cbm u', 'cbm o', 'shift @', 'cbm f',
            'cbm c', 'cbm x', 'cbm v', 'cbm b', 'shift asterisk'
        ]
    ),
    (219, ['shift +', 'cbm -', 'shift -']),
    (223, 'cbm asterisk'),
    (255, 'pi'),
]
def _prepare_escape_name_mapping():
    result = dict()
    for value, names in ESCAPE_NAMES:
        if isinstance(names, basestring):
            names = [names]
        result.update((n, chr(v)) for v, n in enumerate(names, value))
    return result
ESCAPE_NAME_TO_VALUE = _prepare_escape_name_mapping()
VALUE_TO_UNICODE = {
    '\x5c': u'£',
    '\x5e': u'↑',
    '\x5f': u'←',
    '\xff': u'π',
}
def _prepare_unicode_escape_mapping():
    value2escape = dict((v, e) for e, v in ESCAPE_NAME_TO_VALUE.iteritems())
    result = dict()
    for value, unichar in VALUE_TO_UNICODE.iteritems():
        escape = value2escape.get(value)
        result[ord(unichar)] = u'{%s}' % escape if escape else unicode(value)
    return result
UNICODE_TO_ESCAPE = _prepare_unicode_escape_mapping()
class Error(Exception):
    pass
class TokenizeError(Error):
    pass
class Detokenizer(object):
    def __init__(self):
        self.is_in_string_mode = False
        #
        # Base mapping: Byte value to numeric escape sequence.
        #
        self.string_mapping = dict((chr(i), '{%03d}' % i) for i in xrange(256))
        #
        # Range 32--64 is a 1:1 mapping.
        #
        self.add_range(32, 64, 32)
        #
        # Unshifted characters are mapped to lower case ASCII characters.
        #
        self.add_range(65, 90, 97)
        #
        # Shifted characters are mapped to upper case ASCII characters.
        #
        self.add_range(97, 127, 65)
        #
        # Values above 191 are copies from lower ranges.
        #
        self.copy_range(192, 223, 96)
        self.copy_range(224, 254, 160)
        self.copy_range(255, 255, 126)
        #
        # Add symbolic names.
        #
        for name, value in ESCAPE_NAME_TO_VALUE.iteritems():
            self.string_mapping[value] = '{%s}' % name
        self.code_mapping = dict(
            (chr(i), kw) for i, kw in enumerate(BASIC_V2_TOKENS, 128)
        )
    def add_range(self, start, end, value_start):
        offset = value_start - start
        self.string_mapping.update(
            (chr(i), chr(i + offset)) for i in xrange(start, end + 1)
        )
    def copy_range(self, to_start, to_end, from_start):
        offset = from_start - to_start
        self.string_mapping.update(
            (chr(i), self.string_mapping[chr(i + offset)])
            for i in xrange(to_start, to_end + 1)
        )
    def decode(self, value):
        if value == '"':
            self.is_in_string_mode = not self.is_in_string_mode
        if not self.is_in_string_mode:
            try:
                return self.code_mapping[value]
            except KeyError:
                pass  # Intentionally ignored.
        return self.string_mapping[value]
    def detokenize_line(self, line):
        self.is_in_string_mode = False
        return ''.join(imap(self.decode, line))
    def detokenize(self, basic_prg, offset=2):
        i = offset
        while get_uint16(basic_prg, i) != 0:
            i += 2  # Skip pointer to next BASIC line.
            line_number = get_uint16(basic_prg, i)
            i += 2  # Skip line number.
            j = basic_prg.index('\0', i)
            tokenized_line = basic_prg[i:j]
            yield (line_number, self.detokenize_line(tokenized_line))
            i = j + 1  # Skip to next line.
def get_uint16(data, offset):
    return ord(data[offset]) | ord(data[offset + 1]) << 8
def int2uint16(value):
    if not (0 <= value < 2**16):
        raise ValueError('integer not in range 0..65535')
    high, low = divmod(value, 256)
    return chr(low) + chr(high)
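#
# Both helpers use the 6502's little endian byte order, e.g.
# int2uint16(0x0801) == '\x01\x08' and get_uint16('\x01\x08', 0) == 0x0801.
#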
def detokenize(basic_prg, offset=2):
    return Detokenizer().detokenize(basic_prg, offset)
def tokenize_escape(name):
    try:
        return ESCAPE_NAME_TO_VALUE[name]
    except KeyError:
        return chr(int(name))
def tokenize_raw_string(string):
    result = list()
    for value in string:
        tmp = ord(value)
        if tmp == 126:
            value = '\xff'
        elif 'a' <= value <= 'z':
            value = value.upper()
        elif 'A' <= value <= 'Z':
            value = chr(ord(value.lower()) + 96)
        result.append(value)
    return ''.join(result)
def _tokenize(regex, process_match, process_rest, string):
    result = list()
    i = j = 0
    for match in regex.finditer(string):
        j = match.start()
        if i != j:
            result.append(process_rest(string[i:j]))
        i = match.end()
        data, finished = process_match(match)
        result.append(data)
        if finished:
            i = len(string)
            break
    if i != len(string):
        result.append(process_rest(string[i:]))
    return ''.join(result)
tokenize_string = partial(
    _tokenize,
    re.compile(r'\{[^\}]*\}'),
    lambda m: (tokenize_escape(m.group()[1:-1]), False),
    tokenize_raw_string
)
def find_command_separator(line, offset):
    for match in re.compile(r'"[^"]*?"|:').finditer(line, offset):
        if match.group() == ':':
            return match.end()
    return len(line)
def tokenize_line_content(line):
    def process_match(match):
        result = list()
        data = match.group()
        try:
            result.append(KEYWORD2VALUE[data])
        except KeyError:
            if data.startswith('"'):
                result.append(tokenize_string(data))
            elif data.startswith('{'):
                result.append(tokenize_escape(data[1:-1]))
            else:
                assert False
        else:
            if data == 'rem':
                result.append(tokenize_string(line[match.end():]))
            elif data == 'data':
                next_command_offset = find_command_separator(line, match.end())
                result.append(
                    tokenize_string(line[match.end():next_command_offset])
                )
                result.append(tokenize_line_content(line[next_command_offset:]))
        return (''.join(result), data in ['rem', 'data'])
    return _tokenize(TOKENIZE_RE, process_match, lambda s: s.upper(), line)
def tokenize_line(line):
    line_number_match = LINE_NUMBER_RE.match(line)
    if not line_number_match:
        raise TokenizeError('no line number found: %r' % line)
    line_number = int(line_number_match.group())
    if line_number >= 2**16:
        raise TokenizeError('line number too high (%d)' % line_number)
    if line_number >= 64000:
        LOG.warn('line number %d >= 64000', line_number)
    line = line[line_number_match.end():].lstrip().rstrip('\r\n')
    if not line.strip():
        LOG.warn('empty line (%d)', line_number)
    return int2uint16(line_number) + tokenize_line_content(line)
def tokenize(lines, address=0x0801):
    result = [int2uint16(address)]
    for line in lines:
        if line.strip():
            tokenized_line = tokenize_line(line)
            address += len(tokenized_line) + 3
            result.append(int2uint16(address) + tokenized_line + '\0')
    result.append(int2uint16(0))
    return ''.join(result)
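#
# For example, tokenize(['10 print "hi"']) should yield the familiar
# 15 byte PRG image (little endian words, values in hex):
#
#   01 08               load address $0801
#   0c 08               pointer to the next BASIC line
#   0a 00               line number 10
#   99 20 22 48 49 22   PRINT token, space, "HI"
#   00                  end of line
#   00 00               end of program
#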
def ascii_to_utf8(string):
    def replace_func(match):
        escape = match.group()
        return VALUE_TO_UNICODE.get(tokenize_escape(escape[1:-1]), escape)
    return re.sub(r'\{[^}]+\}', replace_func, string).encode('utf-8')
def utf8_to_ascii(string):
    return string.decode('utf-8').translate(UNICODE_TO_ESCAPE).encode('ascii')
def main():
    parser = OptionParser(
        usage='%prog [options] detokenize|tokenize',
        version=__version__,
        description='CBM BASIC V2 (de)tokenizer.',
        epilog='Written by %s.' % __author__
    )
    parser.add_option('-i', '--input', metavar='FILE', action='store',
                      default=None, help='input filename (default: <stdin>)')
    parser.add_option('-o', '--output', metavar='FILE', action='store',
                      default=None, help='output filename (default: <stdout>)')
    parser.add_option('--unicode', action='store_true', default=False,
                      help='convert certain characters and escape sequences'
                           ' to/from unicode. The encoding is UTF-8.')
    parser.add_option('--debug', action='store_true', default=False,
                      help='activate debugging output like complete stack traces')
    options, arguments = parser.parse_args()
    if options.debug:
        LOG.setLevel(logging.DEBUG)
    if not arguments or arguments[0] not in ['detokenize', 'tokenize']:
        parser.error('please choose operation mode')
    try:
        in_file = open(options.input, 'rb') if options.input else sys.stdin
        out_file = open(options.output, 'wb') if options.output else sys.stdout
        command = arguments[0]
        if command == 'detokenize':
            for line_number, line in detokenize(in_file.read()):
                if options.unicode:
                    line = ascii_to_utf8(line)
                out_file.write('%5d %s\n' % (line_number, line))
        elif command == 'tokenize':
            if options.unicode:
                in_file = imap(utf8_to_ascii, in_file)
            out_file.write(tokenize(in_file))
        else:
            assert False, 'command %r should not be possible' % command
    except Exception, error:
        if options.debug:
            raise
        else:
            parser.error(error)
if __name__ == '__main__':
    main()
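#
# Typical invocations (assuming the script is saved as cbmbasic.py; that
# filename is only a guess):
#
#   python cbmbasic.py tokenize -i program.bas -o program.prg
#   python cbmbasic.py detokenize -i program.prg
#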