[svn r37594] Fixed unicode issues in apigen/htmlgen.py and apigen/source

Moved some methods out of HTMLDocument and added support for finding out the encoding of a Python file in order to accomplish that (both in source/html.py). Also fixed some minor issues in htmlgen.py: items with a name starting with _ are now hidden from the nav, and an indentation issue in the nav was fixed.

--HG--
branch : trunk
2007-01-30 14:24:27 +01:00
parent c2e1510f86
commit 45868fe506
5 changed files with 149 additions and 81 deletions
--- a/py/apigen/source/html.py
+++ b/py/apigen/source/html.py
@@ -2,14 +2,13 @@
 """ html - generating ad-hoc html out of source browser
 """

+import py
 from py.xml import html, raw
 from compiler import ast
 import time
 from py.__.apigen.source.color import Tokenizer, PythonSchema

 class HtmlEnchanter(object):
-    reserved_words = ['if', 'for', 'return', 'yield']
-
    def __init__(self, mod):
        self.mod = mod
        self.create_caches()
@@ -37,8 +36,30 @@ class HtmlEnchanter(object):
        except KeyError:
            return [row] # no more info

+def prepare_line(text, tokenizer, encoding):
+    """ adds html formatting to text items (list)
+
+        only processes items if they're of a string type (or unicode)
+    """
+    ret = []
+    for item in text:
+        if type(item) in [str, unicode]:
+            tokens = tokenizer.tokenize(item)
+            for t in tokens:
+                data = unicode(t.data, encoding)
+                if t.type in ['keyword', 'alt_keyword', 'number',
+                              'string', 'comment']:
+                    ret.append(html.span(data, class_=t.type))
+                else:
+                    ret.append(data)
+        else:
+            ret.append(item)
+    return ret
+
 class HTMLDocument(object):
-    def __init__(self, tokenizer=None):
+    def __init__(self, encoding, tokenizer=None):
+        self.encoding = encoding
+
        self.html = root = html.html()
        self.head = head = self.create_head()
        root.append(head)
@@ -119,30 +140,11 @@ class HTMLDocument(object):
        table.append(tbody)
        return table, tbody

-    def prepare_line(self, text):
-        """ adds html formatting to text items (list)
-
-            only processes items if they're of a string type (or unicode)
-        """
-        ret = []
-        for item in text:
-            if type(item) in [str, unicode]:
-                tokens = self.tokenizer.tokenize(item)
-                for t in tokens:
-                    if t.type in ['keyword', 'alt_keyword', 'number',
-                                  'string', 'comment']:
-                        ret.append(html.span(t.data, class_=t.type))
-                    else:
-                        ret.append(t.data)
-            else:
-                ret.append(item)
-        return ret
-
    def add_row(self, lineno, text):
        if text == ['']:
            text = [raw('&#xa0;')]
        else:
-            text = self.prepare_line(text)
+            text = prepare_line(text, self.tokenizer, self.encoding)
        self.tbody.append(html.tr(html.td(str(lineno), class_='lineno'),
                                  html.td(class_='code', *text)))

@@ -157,7 +159,8 @@ def create_html(mod):
    lines = mod.path.open().readlines()
    
    enchanter = HtmlEnchanter(mod)
-    doc = HTMLDocument()
+    enc = get_module_encoding(mod.path)
+    doc = HTMLDocument(enc)
    for i, row in enumerate(lines):
        row = enchanter.enchant_row(i + 1, row)
        doc.add_row(i + 1, row)
@@ -248,3 +251,16 @@ def create_unknown_html(path):
    )
    return h.unicode()

+_reg_enc = py.std.re.compile(r'coding[:=]\s*([-\w.]+)')
+def get_module_encoding(path):
+    if hasattr(path, 'strpath'):
+        path = path.strpath
+    if path[-1] in ['c', 'o']:
+        path = path[:-1]
+    fpath = py.path.local(path)
+    code = fpath.read()
+    match = _reg_enc.search(code)
+    if match:
+        return match.group(1)
+    return 'ISO-8859-1'
+