Wednesday, August 25
Sunday, August 8
A Python Syntax Highlighter for HTML, Written in Python
Lately I wanted to post some Python code on my blog, and the formatting options I found kind of sucked (actually, I was getting bored and was looking for an excuse to roll my own).
I had a look at some options, but the best-known ones either had to be installed locally (and I didn't want to do that), or they worked online, but the generated code was incomplete or the formatting sucked (in my self-absorbed opinion).
Long story short, I wrote my own, in Python.
The formatter generates HTML code, separating operators, keywords, tokens (identifiers, variables and numbers), comments and text.
The colors for these can be set in the COLORS global map.
Here's the code (the formatter err ... formatted itself for this post, as a test):
I had a look at some options, but the best-known ones either had to be installed locally (and I didn't want to do that), or they worked online, but the generated code was incomplete or the formatting sucked (in my self-absorbed opinion).
Long story short, I wrote my own, in Python.
The formatter generates HTML code, separating operators, keywords, tokens (identifiers, variables and numbers), comments and text.
The colors for these can be set in the COLORS global map.
Here's the code (the formatter err ... formatted itself for this post, as a test):
import itertools
import keyword
import re
import sys

# alphanumeric characters, written so they can be
# dropped into a regular expression character class
ALPHANUMS = 'a-zA-Z0-9_'

# python operators, escaped
# for regular expressions
OPERATORS = r'\.\(\)\[\]\{\}\:\'\"\!=,%\+\-\*\/\^\&<>'

# python keywords
KEYWORDS = ['if', 'for', 'while', 'do', 'def', 'class',
            'None', 'True', 'False', 'and', 'or', 'not',
            'import', 'else', 'elif', 'raise', 'except',
            'break', 'continue', 'lambda', 'return',
            'yield', 'global']
# The hand-written list above misses several real keywords
# ('try', 'in', 'is', 'from', 'with', 'pass', ...); top it up
# from the interpreter's own list, keeping the original entries.
KEYWORDS += [kw for kw in keyword.kwlist if kw not in KEYWORDS]

# colors used for each token type ('bg' is the block background)
COLORS = {
    'bg': '#030303',
    'kw': 'blue',
    'tk': 'silver',
    'op': 'teal',
    'cm': 'green',
    'txt': 'red',
    'qu': 'darkred',
    '??': 'white'}

# how many spaces should a tab character be:
TAB_LENGTH = 4


def escape(source, replacements):
    '''Return source with every character that is a key of
    replacements substituted by the mapped string.
    Characters without a mapping are copied unchanged.
    '''
    return ''.join([replacements.get(c, c) for c in source])


# escape a string so it will be parsable
# by the re module
def rxEscaped(source):
    '''Backslash-escape square and curly brackets in source.'''
    replacements = {
        r'[': r'\[', r']': r'\]',
        r'{': r'\{', r'}': r'\}'}
    return escape(source, replacements)


SPACES = ' \t'
# NOTE: not referenced anywhere else in this script; kept so the
# module still exposes the name. OPERATORS is already escaped, so
# its brackets get escaped a second time here.
SPLITTERS = rxEscaped(OPERATORS + SPACES)


class TokensList(list):
    '''List of (key, value) pairs whose values are HTML-escaped
    when they are added through add().
    '''
    # Characters that must not reach the HTML output verbatim.
    # Spaces are passed through unchanged (the output lives inside
    # a <pre> block); tabs are expanded to TAB_LENGTH spaces.
    replacements = {
        '<': '&lt;',
        '>': '&gt;',
        '&': '&amp;',
        ' ': ' ',
        '\t': ' ' * TAB_LENGTH}

    def __init__(self):
        list.__init__(self)

    def add(self, key, value):
        '''Append (key, value) with value escaped for HTML.'''
        self.append((key, escape(value, TokensList.replacements)))


class Tokenizer(object):
    '''Transforms a line in a group of tokens, where each
    token is represented by a pair:
        line -> [ (key, token), (key, token), ... ]

    The key represents the type of the token:
        kw = keyword
        tk = word
        sp = spacing
        op = operator
        cm = comment
        qu = quotes
        txt = string contents
        ?? = unidentified (for any errors)

    A tokenizer remembers, between calls to parse(), whether a
    triple-quoted string is still open, so the lines of one file
    must be fed in order to a single instance.
    '''
    # Applied with match(), so every alternative is tried at the
    # start of the remaining text only.
    rxToken = re.compile(r'''
        (?P<tk>[%s]+)      # string token
        |(?P<sp>[%s])      # or space
        |(?P<op>[%s])      # or operators
        |(?P<cm>\#.*$)     # or comment
        ''' % (ALPHANUMS, SPACES, OPERATORS),
        re.VERBOSE)

    # Delimiter of the string currently being parsed, or None.
    # This is only the default; each instance tracks its own value
    # so that independent tokenizers do not disturb each other.
    quotes = None

    def __init__(self):
        self.quotes = None
        self._init()

    def _init(self, line=''):
        '''Reset the per-line state (open-string state is kept).'''
        self.line, self.parsed = line, line
        self.tokens = TokensList()

    def _parseRx(self, rx):
        '''Search for rx against the unparsed text and return the
        found match and its groups dict, or (None, None).
        '''
        found = re.search(rx, self.parsed)
        if found:
            return (found, found.groupdict())
        return (None, None)

    def _parseStringStart(self):
        '''Consume a string delimiter at the start of the unparsed
        text, if there is one, and remember it in self.quotes.
        Returns True when a delimiter was consumed.
        '''
        # triple quotes are listed first so they win over single ones
        groups = self._parseRx(r'^(?P<qu>\'{3}|"{3}|\'|")')[1]
        if not groups:
            return False
        self.quotes = groups['qu']
        self.parsed = self.parsed[len(self.quotes):]
        return True

    def _findClosingQuote(self):
        '''Return the index in the unparsed text of the delimiter
        closing the current string, or -1 if it is not on this line.
        '''
        pos = self.parsed.find(self.quotes)
        while pos != -1:
            # A quote is escaped only when it is preceded by an odd
            # number of backslashes ("\\" is an escaped backslash).
            backslashes = 0
            while (backslashes < pos and
                   self.parsed[pos - backslashes - 1] == '\\'):
                backslashes += 1
            if backslashes % 2 == 0:
                return pos
            pos = self.parsed.find(self.quotes, pos + 1)
        return -1

    def _getTokens(self):
        '''Split the unparsed text into (key, value) tokens.'''
        while self.parsed:
            # are we in a string?
            if self.quotes:
                end = self._findClosingQuote()
                if end != -1:
                    if end:
                        self.tokens.add('txt', self.parsed[:end])
                    self.tokens.add('qu', self.quotes)
                    self.parsed = self.parsed[end + len(self.quotes):]
                    self.quotes = None
                    continue
                if len(self.quotes) == 3:
                    # a multiline string is legal
                    # add everything as text
                    self.tokens.add('txt', self.parsed)
                else:
                    # singleline string not closed
                    # pass everything as '??'
                    self.tokens.add('??', self.parsed)
                    # process next line correctly
                    self.quotes = None
                break

            # are we opening a string now?
            if self._parseStringStart():
                self.tokens.add('qu', self.quotes)
                continue

            # we're not parsing a string
            found = Tokenizer.rxToken.match(self.parsed)
            if not found:
                # Unknown character: flag just that one and carry
                # on, so nothing is dropped from the output.
                self.tokens.add('??', self.parsed[0])
                self.parsed = self.parsed[1:]
                continue
            key, value = found.lastgroup, found.group()
            if key == 'tk' and value in KEYWORDS:
                key = 'kw'
            self.tokens.add(key, value)
            self.parsed = self.parsed[found.end():]

        # A single-line quote opened as the very last character of
        # the line cannot continue on the next line.
        if self.quotes and len(self.quotes) != 3:
            self.quotes = None

    def parse(self, line):
        '''Tokenize line and return the tokens with adjacent tokens
        of the same key merged:
            [ (key, [value, value, ...]), ... ]
        '''
        # reset the object
        self._init(line)
        # get list of tokens
        self._getTokens()
        # merge like tokens
        self.tokens = [
            (key, [value for _, value in group])
            for key, group in itertools.groupby(
                self.tokens, key=lambda pair: pair[0])]
        return self.tokens


def generateFormattedFile(source, destination):
    '''Generate a formatted HTML block from the source code.

    source is the path of the python file to read and
    destination the path of the HTML file to write.
    '''
    tokenizer = Tokenizer()
    NEWLINE = '\n'
    with open(source) as src:
        with open(destination, 'wt') as dest:
            dest.write('<div style="background-color:%s;">\n<pre>'
                       % COLORS['bg'])
            for line in src:
                line = line.rstrip()
                if not line:
                    dest.write(NEWLINE)
                    continue
                pieces = []
                for name, values in tokenizer.parse(line):
                    text = ''.join(values)
                    # token types without a color (e.g. spacing)
                    # are written without a <span> wrapper
                    if name in COLORS:
                        text = '<span style="color:%s;">%s</span>' % (
                            COLORS[name], text)
                    pieces.append(text)
                dest.write(''.join(pieces) + NEWLINE)
            dest.write('\n</pre>\n</div>')


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('Syntax: %s <sourcefile>' % sys.argv[0])
    else:
        generateFormattedFile(sys.argv[1], sys.argv[1] + '.html')
Subscribe to:
Posts (Atom)