[svn r38386] Fixed some more nasty problems that showed up when running py.test --apigen on PyPy: """foo "bar" baz""" was not recognized as a single-line string (worked around with a hack, unfortunately), and there were unicode problems. Also added some code for debugging the tokenizer: the color.py script can now be run with a Python file as argument to tokenize that file.

--HG--
branch : trunk
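The core of the single-line detection hack, as a standalone sketch: the two regexps are rebuilt from the diff below, while names such as start, end, token_text and inside_multiline are illustrative rather than the module's actual attributes. The single-line string regexps cannot match a triple-quoted string that contains a lone quote, so the multi-line branch now checks whether the closing delimiter already appears on the same line:

import re

d = '"""'                                  # triple-quote delimiter
start = re.compile('(%s).*' % (d,), re.S)  # opening delimiter captured as group 1
end = re.compile('.*?%s' % (d,))           # shortest match up to a closing delimiter

line = '"""foo "bar" baz"""'
m = start.match(line)
# See whether the string closes on this very line by matching the "end"
# regexp against everything after the opening delimiter.
endm = end.match(line[len(m.group(1)):])
if endm:
    token_text = m.group(1) + endm.group(0)  # a complete single-line string
    inside_multiline = False
else:
    token_text = m.group(0)                  # string continues on later lines
    inside_multiline = True

assert token_text == '"""foo "bar" baz"""'
assert not inside_multiline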
parent e11e7472bc
commit ada3a4cfcf
@@ -44,10 +44,11 @@ class Tokenizer(object):
         very naive tokenizer, state is recorded for multi-line strings, etc.
     """
 
-    _re_word = re.compile('[\w_]+')
-    _re_space = re.compile('\s+')
-    _re_number = re.compile('[\d\.]*\d[\d\.]*l?', re.I)
-    _re_rest = re.compile('[^\w\s\d\'"]+') # XXX cheating a bit with the quotes
+    _re_word = re.compile('[\w_]+', re.U)
+    _re_space = re.compile('\s+', re.U)
+    _re_number = re.compile('[\d\.]*\d[\d\.]*l?', re.I | re.U)
+    # XXX cheating a bit with the quotes
+    _re_rest = re.compile('[^\w\s\d\'"]+', re.U)
 
     # these will be filled using the schema
     _re_strings_full = None
@@ -68,13 +69,14 @@ class Tokenizer(object):
                 re.compile(r'%s[^\\%s]+(\\.[^\\%s]*)*%s' % (d, d, d, d)))
             self._re_strings_empty.append(re.compile('%s%s' % (d, d)))
         for d in schema.multiline_string:
-            self._re_strings_multiline.append((re.compile('%s.*' % (d,), re.S),
+            self._re_strings_multiline.append((re.compile('(%s).*' % (d,),
+                                                          re.S),
                                                re.compile('.*?%s' % (d,))))
         if schema.linejoin:
             j = schema.linejoin
             for d in schema.string:
                 self._re_strings_multiline.append(
-                    (re.compile('%s.*%s$' % (d, j)),
+                    (re.compile('(%s).*%s$' % (d, j)),
                      re.compile('.*?%s' % (d,))))
         # no multi-line comments in Python... phew :)
         self._re_comments = []
@@ -123,6 +125,14 @@ class Tokenizer(object):
             if m:
                 s = m.group(0)
                 data = ''
-                self._inside_multiline = end
+                # XXX take care of a problem which is hard to fix with regexps:
+                # '''foo 'bar' baz''' will not match single-line strings
+                # (because [^"""] matches just a single " already), so let's
+                # try to catch it here... (quite Python specific issue!)
+                endm = end.match(s[len(m.group(1)):])
+                if endm: # see if it ends here already
+                    s = m.group(1) + endm.group(0)
+                else:
+                    self._inside_multiline = end
                 token = Token(s, 'string')
                 break
@@ -184,4 +194,18 @@ class Tokenizer(object):
             return data[len(s):], Token(s, 'unknown')
         return data, None
 
+if __name__ == '__main__':
+    import py, sys
+    if len(sys.argv) != 2:
+        print 'usage: %s <filename>'
+        print '  tokenizes the file and prints the tokens per line'
+        sys.exit(1)
+    t = Tokenizer(PythonSchema)
+    p = py.path.local(sys.argv[1])
+    assert p.ext == '.py'
+    for line in p.read().split('\n'):
+        print repr(line)
+        print 't in multiline mode:', not not t._inside_multiline
+        tokens = t.tokenize(line)
+        print list(tokens)
 
@@ -90,3 +90,8 @@ class TestTokenizer(object):
         assert self.tokens('."foo"') == [Token('.', type='unknown'),
                                          Token('"foo"', type='string')]
 
+    def test_something_strange(self):
+        t = Tokenizer(PythonSchema)
+        tokens = list(t.tokenize('"""foo "bar" baz"""'))
+        assert not t._inside_multiline
+
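For the re.U flags added in the first hunk of the tokenizer above, a minimal sketch of the unicode problem they address (this assumes Python 2 regex semantics, where \w and \s without re.U only cover ASCII even when matching a unicode string):

import re

word = u'caf\xe9'  # u'café': the last character is a non-ASCII letter
# Without re.U the word regexp stops before the accented character...
assert re.match('[\w_]+', word).group(0) == u'caf'
# ...with re.U it matches the whole word.
assert re.match('[\w_]+', word, re.U).group(0) == word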