Sunday, August 8

A Python Syntax Highlighter for HTML, Written in Python

Lately, I wanted to post some Python code on my blog, and the formatting options I found kind of sucked (actually, I was getting bored and was looking for an excuse to roll my own).

I had a look at some options, but the best-known ones either had to be installed locally (and I didn't want to do that), or they worked online but the generated code was incomplete or the formatting sucked (in my self-absorbed opinion).

Long story short, I wrote my own, in Python.

The formatter generates HTML code, separating operators, keywords, tokens (identifiers, variables and numbers), comments and text (the contents of strings).

The colors for these can be set in the COLORS global map.

Here's the code (the formatter err ... formatted itself for this post, as a test):

import sys
import re

# characters allowed in a word token, written
# as the body of a regular expression
# character class (goes between [ and ])
ALPHANUMS = 'a-zA-Z0-9_'

# python operators and punctuation, escaped
# so they can go between [ and ] in a
# regular expression; includes | ~ @ ; and
# the line-continuation backslash
OPERATORS = r'\.\(\)\[\]\{\}\:\'\"\!=,%\+\-\*\/\^\&<>\|\~\\@;'

# python keywords ('do' is not one, so it is
# left out and highlighted as a plain word)
KEYWORDS = [
    'False', 'None', 'True',
    'and', 'as', 'assert', 'async', 'await',
    'break', 'class', 'continue',
    'def', 'del', 'elif', 'else', 'except',
    'finally', 'for', 'from', 'global',
    'if', 'import', 'in', 'is', 'lambda',
    'nonlocal', 'not', 'or', 'pass',
    'raise', 'return', 'try',
    'while', 'with', 'yield']

# CSS colors, by token key; 'bg' is the
# background of the whole block and '??'
# the color of anything unidentified
COLORS = {
    'bg':'#030303',
    'kw':'blue',
    'tk':'silver',
    'op':'teal',
    'cm':'green',
    'txt':'red',
    'qu':'darkred',
    '??':'white'}

# how many spaces should a tab character be:
TAB_LENGTH = 4

def escape(source, replacements):
    '''Return a copy of source in which every
    character that is a key of replacements is
    swapped for the value mapped to it.

    Characters that are not keys are copied
    through unchanged. replacements is a mapping
    from single characters to strings.

    '''
    return ''.join(
        replacements.get(c, c) for c in source)

# put a backslash in front of the brackets
# and braces of a string, so the re module
# reads them as literal characters
def rxEscaped(source):
    '''Return source with every [, ], { and }
    preceded by a backslash.

    '''
    table = dict(
        (char, '\\' + char) for char in '[]{}')
    return escape(source, table)

# spacing characters: blank and tab
SPACES = ' \t'
# operators plus spacing, as the body of a
# regular expression character class.
# OPERATORS is already escaped, so it is used
# as is: running it through rxEscaped would
# put a second backslash in front of its
# brackets and braces
SPLITTERS = OPERATORS + SPACES

class TokensList(list):
    '''A list of (key, value) pairs whose values
    are escaped for HTML as they are added.

    '''

    # character -> text written to the HTML in
    # its place; a tab becomes TAB_LENGTH
    # non-breaking spaces
    replacements = {
        '<':'&lt;',
        '>':'&gt;',
        '&':'&amp;',
        ' ':'&nbsp;',
        '\t':'&nbsp;'*TAB_LENGTH}

    def __init__(self):
        super(TokensList, self).__init__()

    def add(self, key, value):
        '''Append the pair (key, value), with value
        escaped through the replacements table.

        '''
        escaped = escape(
            value, TokensList.replacements)
        pair = (key, escaped)
        self.append(pair)


class Tokenizer(object):
    '''Transforms a line in a list of token groups,
    where each group is represented by a pair:
        line -> [ (key, [token, ...]), (key, [token, ...]), ... ]

    Neighbouring tokens of the same type share
    one group. The key represents the type of
    the tokens:
        kw = keyword
        tk = word
        sp = spacing
        op = operator
        cm = comment
        qu = quotes
        txt = string contents
        ?? = unidentified (for any errors)

    The lines of a file must be passed to parse()
    in order: a triple-quoted string left open by
    one line is continued on the next one.

    '''

    # The start anchor covers every alternative,
    # so a match always begins at the first
    # unparsed character and nothing in front of
    # it can be skipped.
    rxToken = re.compile(r'''
        ^(?:
         (?P<tk>[%s]+)        # string token
        |(?P<sp>[%s]{1})    # or space
        |(?P<op>[%s]{1})    # or operators
        |(?P<cm>\#.*$)        # or comment
        )
        ''' % (ALPHANUMS, SPACES, OPERATORS),
        re.VERBOSE)

    # quotes opening a string, triple ones first
    rxStringStart = re.compile(
        r'^(?P<qu>\'{3}|"{3}|\'|")')

    # quotes of the string being parsed, or None
    # outside of a string; this is only the
    # default, the live value is kept on each
    # instance so tokenizers do not share it
    quotes = None

    def __init__(self):
        self.quotes = None
        self._init()

    def _init(self, line=''):
        '''Get ready for a new line. self.quotes is
        left alone, so that an open multiline
        string carries over.

        '''
        self.line, self.parsed = line, line
        self.tokens = TokensList()

    def _parseRx(self, rx):
        '''Search for rx against the unparsed text
        and return the found match and the groups
        dict, or (None, None) when nothing matches.

        '''
        found = re.search(rx, self.parsed)
        if found:
            return (found, found.groupdict())
        return (None, None)

    def _parseStringStart(self):
        '''If the unparsed text starts with quotes,
        consume them, remember them in self.quotes
        and return True; return False otherwise.

        '''
        found, gd = self._parseRx(
            Tokenizer.rxStringStart)
        if not found:
            return False
        self.quotes = gd['qu']
        self.parsed = self.parsed[len(self.quotes):]
        return True

    def _isEscaped(self, position):
        '''Tell if the character at position in the
        unparsed text is escaped, i.e. comes right
        after an odd number of backslashes.

        '''
        count = 0
        while count < position and \
            self.parsed[position - 1 - count] == '\\':
            count += 1
        return count % 2 == 1

    def _parseStringBody(self):
        '''Consume the contents of the open string.

        Return True when there may be more to
        parse on the line, False when the rest of
        the line has been used up.

        '''
        rx = '(?P<qu>%s)' % re.escape(self.quotes)
        token, gd = self._parseRx(rx)
        if gd:
            start, end = token.span('qu')
            if self._isEscaped(start):
                # an escaped quote is text; take only
                # that one character, the quotes
                # after it may still close the string
                self.tokens.add('txt',
                    self.parsed[:start + 1])
                self.parsed = self.parsed[start + 1:]
            else:
                self.tokens.add('txt',
                    self.parsed[:start])
                self.tokens.add('qu', gd['qu'])
                self.quotes = None
                self.parsed = self.parsed[end:]
            return True
        if len(self.quotes) == 3:
            # a multiline string is legal
            # add everything as text
            self.tokens.add('txt', self.parsed)
        else:
            # singleline string not closed
            # pass everything as '??'
            self.tokens.add('??', self.parsed)
            # process next line correctly
            self.quotes = None
        return False

    def _getTokens(self):
        '''Fill self.tokens with the (key, value)
        pairs found in the unparsed text.

        '''
        while self.parsed:
            # are we in a string?
            if self.quotes:
                if self._parseStringBody():
                    continue
                break

            # are we opening a string now?
            if self._parseStringStart():
                self.tokens.add('qu', self.quotes)
                continue

            # we're not parsing a string
            token, gd = self._parseRx(Tokenizer.rxToken)
            if not gd:
                # unknown character: flag it alone
                # and go on with the rest of the line
                self.tokens.add('??', self.parsed[0])
                self.parsed = self.parsed[1:]
                continue

            # only the group that matched has a value
            for key, value in gd.items():
                if not value:
                    continue
                if value in KEYWORDS:
                    key = 'kw'
                self.tokens.add(key, value)
            self.parsed = self.parsed[token.end():]

    def parse(self, line):
        '''Tokenize line and return a list of
        (key, values) pairs, values being the list
        of consecutive tokens of type key. An empty
        line gives an empty list.

        '''
        # reset the object
        self._init(line)
        # get list of tokens
        self._getTokens()
        # merge like tokens
        retval = []
        for key, value in self.tokens:
            if retval and retval[-1][0] == key:
                retval[-1][1].append(value)
            else:
                retval.append( (key, [value]) )
        self.tokens = retval
        return self.tokens

def generateFormattedFile(source, destination):
    '''Generate a formatted HTML block
    from the source code.

    source is the path of the file to read and
    destination the path of the HTML file to
    write. Each token group whose key is in
    COLORS is wrapped in a colored span; the
    other groups are written as they are.

    '''

    HEADER = '<div style="background-color:%s;">\n' \
        '        <pre>'
    FOOTER = '\n        </pre>\n    </div>'
    NEWLINE = '\n'

    tokenizer = Tokenizer()

    # the source is opened first, so a missing
    # source does not leave a destination file
    # behind; both files get closed even if
    # something fails half way
    with open(source) as src:
        with open(destination, 'wt') as dest:
            dest.write(HEADER % COLORS['bg'])
            for line in src:
                line = line.rstrip()
                if not line:
                    dest.write(NEWLINE)
                    continue
                pieces = []
                for name, values in tokenizer.parse(line):
                    text = ''.join(values)
                    if name in COLORS:
                        text = \
                            '<span style="color:%s;">%s</span>' % \
                                (COLORS[name], text)
                    pieces.append(text)
                dest.write(''.join(pieces) + NEWLINE)
            dest.write(FOOTER)

if __name__ == '__main__':
    # the first argument is the source file; the
    # HTML is written to the same path with
    # '.html' appended
    arguments = sys.argv[1:]
    if arguments:
        generateFormattedFile(arguments[0], arguments[0] + '.html')
    else:
        print('Syntax: %s <sourcefile>' % sys.argv[0])