# -*- coding: utf-8 -*-
import re
import sys


PY3 = sys.version_info[0] == 3
if PY3:
    from html.entities import name2codepoint
    unicode = str
    unichr = chr
else:
    from htmlentitydefs import name2codepoint


def safe_decode(s, encoding='utf-8', errors='strict'):
    """Return ``s`` as a text (unicode) string.

    Text input is returned unchanged; anything else is decoded with
    ``encoding`` using the ``errors`` policy (``'strict'``, ``'replace'``,
    ``'ignore'``, ...).
    """
    if isinstance(s, unicode):
        return s
    # Honour the caller's error policy instead of always decoding strictly.
    return s.decode(encoding, errors)


class WebIntelligentToHtmlConverter(object):
    """Callable that renders text/x-web-intelligent as an HTML fragment.

    Calling an instance escapes HTML-special characters as named entities,
    turns URLs and e-mail addresses into links, preserves leading
    indentation with ``&nbsp;`` and converts newlines into ``<br />``.
    """

    # The three dotted groups of an IPv4 address use an escaped dot so that
    # host names such as ``1a2b3c4.com`` fall through to the domain branch.
    urlRegexp = re.compile(
        r'((?:ftp|https?)://'
        r'(localhost'
        r'|([12]?[0-9]{1,2}\.){3}([12]?[0-9]{1,2})'
        r'|(?:[a-z0-9](?:[-a-z0-9]*[a-z0-9])?\.)+'
        r'(?:com|edu|biz|org|gov|int|info|mil|net|name|museum|coop|aero'
        r'|[a-z][a-z]))'
        r'\b(?::\d+)?'
        r'(?:\/[^"\'<>()\[\]{}\s\x7f-\xff]*'
        r'(?:[.,?]+[^"\'<>()\[\]{}\s\x7f-\xff]+)*)?)',
        re.I | re.S | re.U
    )
    emailRegexp = re.compile(
        r'["=]?(\b[A-Z0-9._%-]+@[A-Z0-9._%-]+\.[A-Z]{2,4}\b)',
        re.I | re.S | re.U
    )
    indentRegexp = re.compile(r'^(\s+)', re.M | re.U)

    # Maps every code point that has a named HTML entity (``&`` included)
    # to its ``&name;`` form, for a single-pass ``translate`` call.
    entityTable = {
        codepoint: unicode('&' + entity + ';')
        for entity, codepoint in name2codepoint.items()
    }

    def __init__(self, orig, tab_width=4):
        """Remember the source text and the number of spaces per tab."""
        self.orig = orig
        self.tab_width = tab_width

    def __call__(self):
        """Convert ``self.orig`` and return the HTML as UTF-8 bytes.

        ``None`` is treated as the empty string; byte input is decoded as
        UTF-8 with undecodable bytes replaced.
        """
        text = self.orig
        if text is None:
            text = ''
        text = safe_decode(text, errors='replace')

        # Escape every character that has a named entity.  translate()
        # looks at each input character exactly once, so the '&' of an
        # inserted entity is never escaped again (no '&lt;' -> '&amp;lt;').
        text = text.translate(self.entityTable)

        text = self.urlRegexp.subn(self.replaceURL, text)[0]
        text = self.emailRegexp.subn(self.replaceEmail, text)[0]
        text = self.indentRegexp.subn(self.indentWhitespace, text)[0]

        # convert windows line endings
        text = text.replace('\r\n', '\n')
        # Finally, make \n's into br's
        text = text.replace('\n', '<br />')
        text = text.encode('utf-8')
        return text

    @staticmethod
    def abbreviateUrl(url, max=60, ellipsis="[&hellip;]"):
        """Shorten very long urls to allow nicer layout.

        URLs shorter than ``max`` characters are returned unchanged.
        Otherwise the result keeps the protocol, the host and the last
        path segment around ``ellipsis``; when there are fewer than three
        ``/``-separated parts, or host plus last segment are themselves
        longer than ``max``, the middle of the whole URL is cut out
        instead.
        """
        if len(url) < max:
            return url
        protocol = ""
        protocolend = url.find("//")
        if protocolend != -1:
            protocol = url[0:protocolend + 2]
            url = url[protocolend + 2:]
        parts = url.split("/")
        if len(parts) < 3 or len(parts[0]) + len(parts[-1]) > max:
            url = protocol + url
            center = (max - 5) // 2
            return url[:center] + ellipsis + url[-center:]
        return protocol + parts[0] + "/" + ellipsis + "/" + parts[-1]

    @classmethod
    def replaceURL(cls, match):
        """Replace hyperlinks with clickable <a> tags.

        ``match`` is a match of ``urlRegexp``; the link text is the
        abbreviated form of the URL.
        """
        url = match.groups()[0]
        linktext = cls.abbreviateUrl(url)
        # Also with <some link> we should only link to some link, not
        # including the brackets.  The text is already entity-escaped at
        # this point, so the closing bracket shows up as '&gt;'.
        end = ''
        # XXX Probably better to fix the regex above.  Maurits
        if url.endswith('&gt;'):
            url = url[:-len('&gt;')]
            linktext = linktext[:-len('&gt;')]
            end = '&gt;'

        # rel="nofollow" shall avoid spamming
        return '<a href="%s" rel="nofollow">%s</a>%s' % (url, linktext, end)

    @staticmethod
    def replaceEmail(match):
        """Replace email strings with mailto: links.

        ``match`` is a match of ``emailRegexp``.
        """
        url = match.groups()[0]
        # following unicode substitutions shall avoid email spam
        # crawlers to pickup email addresses
        url = url.replace('@', '&#0064;')
        return '<a href="&#0109;ailto&#0058;%s">%s</a>' % (url, url)

    def indentWhitespace(self, match):
        """Make leading whitespace on a line into &nbsp; to preserve indents.

        Each space becomes one ``&nbsp;`` and each tab becomes
        ``self.tab_width`` of them.
        """
        indent = match.groups()[0]
        indent = indent.replace(' ', '&nbsp;')
        return indent.replace('\t', '&nbsp;' * self.tab_width)


def convertWebIntelligentPlainTextToHtml(orig, tab_width=4):
    """Converts text/x-web-intelligent to text/html.

    ``tab_width`` may be an int or a numeric string such as ``'4'``;
    any value that cannot be converted falls back to 4.  Returns the HTML
    as UTF-8 encoded bytes.
    """
    try:
        # tab_width could be a string like '4'
        tab_width = int(tab_width)
    except (TypeError, ValueError):
        tab_width = 4
    return WebIntelligentToHtmlConverter(orig, tab_width)()


def convertHtmlToWebIntelligentPlainText(orig):
    """Converts text/html to text/x-web-intelligent.
    """
    preRegex = re.compile(r'<\s*pre[^>]*>(.*?)<\s*/pre\s*>', re.I | re.S)

    tagWhitespaceRegex = re.compile(r'\s+((<[^>]+>)\s+)+')
    whitespaceRegex = re.compile(r'\s+')

    tdRegex = re.compile(r'<\s*(td)([^>])*>', re.I)
    breakRegex = re.compile(r'<\s*(br)\s*/?>', re.I)
    startBlockRegex = re.compile(r'<\s*(dt)[^>]*>', re.I)
    endBlockRegex = re.compile(r'<\s*/\s*(p|div|tr|ul|ol|dl)[^>]*>', re.I)
    indentBlockRegex = re.compile(r'<\s*(blockquote|dd)[^>]*>', re.I)
    listBlockRegex = re.compile(r'<\s*(li)[^>]*>', re.I)

    tagRegex = re.compile(r'<[^>]+>', re.I | re.M)

    # Save all <pre> sections and restore after other transforms
    preSections = {}

    def savePres(match):
        """Stash the first captured group and return its placeholder.

        The placeholder is numbered by how many sections are already
        stored in ``preSections`` and is used as the key for the stored
        text.
        """
        placeholder = '__pre_marker__%d__' % len(preSections)
        preSections[placeholder] = match.group(1)
        return placeholder
    if orig is None:
        orig = ''
    text = preRegex.sub(savePres, orig)

    def fixTagWhitespace(match):
        """Turn whitespace-tag-whitespace into whitespace-tag.

        Works on the whole matched run, so directly nested tags are
        handled in one go.
        """
        # Strip every whitespace character from the matched text ...
        squeezed = whitespaceRegex.sub('', match.group(0))
        # ... and put back a single leading space.
        return ' ' + squeezed
    text = tagWhitespaceRegex.sub(fixTagWhitespace, text)

    # Make all whitespace into a single space
    text = whitespaceRegex.sub(' ', text)

    # Fix entities
    text = text.replace(' ', ' ')
    for entity, codepoint in name2codepoint.items():
        # Do < and > later, else we may be creating what looks like
        # tags
        if entity != 'lt' and entity != 'gt' and entity != 'amp':
            text = text.replace(
                '&' + entity + ';',
                '&#' + str(codepoint) + ';'
            )

    # XXX: Remove ,