# -*- coding: utf-8 -*-
__doc__ = """\
Module for minification functions.
"""
# Import built-in modules
import re
import tokenize
try:
    import cStringIO as io
except ImportError: # We're using Python 3
    import io
# Import our own modules
from . import analyze, token_utils
# Compile our regular expressions for speed
multiline_quoted_string = re.compile(r'(\'\'\'|\"\"\")')
not_quoted_string = re.compile(r'(\".*\'\'\'.*\"|\'.*\"\"\".*\')')
trailing_newlines = re.compile(r'\n\n')
multiline_indicator = re.compile(r'\\(\s*#.*)?\n')
# The above also removes trailing comments: "test = 'blah \ # comment here"
left_of_equals = re.compile(r'^.*?=')
# These aren't used but they're a pretty good reference:
double_quoted_string = re.compile(r'((?<!\\)".*?(?<!\\)")')
single_quoted_string = re.compile(r"((?<!\\)'.*?(?<!\\)')")
single_line_single_quoted_string = re.compile(r"((?<!\\)'''.*?(?<!\\)''')")
single_line_double_quoted_string = re.compile(r'((?<!\\)""".*?(?<!\\)""")')
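
# For reference, a quick sketch of what two of the patterns above match (the
# expected results below follow from the patterns themselves, not from the
# original docs):
#
#     >>> double_quoted_string.search('x = "foo" + "bar"').group()
#     '"foo"'
#     >>> multiline_quoted_string.search("'''docstring'''").group()
#     "'''"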

def remove_docstrings(tokens):
    """
    Removes docstrings from *tokens*, which is expected to be a list
    equivalent of `tokenize.generate_tokens()` (so we can update in-place).
    """
    prev_tok_type = None
    for index, tok in enumerate(tokens):
        token_type = tok[0]
        if token_type == tokenize.STRING:
            if prev_tok_type == tokenize.INDENT:
                # Definitely a docstring
                tokens[index][1] = '' # Remove it
                # Remove the leftover indentation and newline:
                tokens[index-1][1] = ''
                tokens[index-2][1] = ''
            elif prev_tok_type == tokenize.NL:
                # This captures whole-module docstrings:
                if tokens[index+1][0] == tokenize.NEWLINE:
                    tokens[index][1] = ''
                    # Remove the trailing newline:
                    tokens[index+1][1] = ''
        prev_tok_type = token_type
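
# A minimal usage sketch (not part of the original module): build the mutable
# token list with the standard tokenize module, then reassemble the result
# with token_utils.untokenize():
#
#     >>> import io, tokenize
#     >>> src = 'def f():\n    """A docstring."""\n    return 1\n'
#     >>> tokens = [list(tok) for tok in
#     ...           tokenize.generate_tokens(io.StringIO(src).readline)]
#     >>> remove_docstrings(tokens)
#     >>> print(token_utils.untokenize(tokens))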

def reduce_operators(source):
    """
    Removes spaces between operators in *source* and returns the result.

    Example::

        def foo(foo, bar, blah):
            test = "This is a %s" % foo

    Will become::

        def foo(foo,bar,blah):
            test="This is a %s"%foo
    """
    io_obj = io.StringIO(source)
    remove_columns = []
    out = ""
    out_line = ""
    prev_toktype = tokenize.INDENT
    prev_tok = None
    last_lineno = -1
    last_col = 0
    lshift = 1
    for tok in tokenize.generate_tokens(io_obj.readline):
        token_type = tok[0]
        token_string = tok[1]
        start_line, start_col = tok[2]
        end_line, end_col = tok[3]
        ltext = tok[4]
        if start_line > last_lineno:
            last_col = 0
        if start_col > last_col:
            out_line += (" " * (start_col - last_col))
        if token_type == tokenize.OP:
            # Operators that begin a line such as @ or open parens should be
            # left alone:
            start_of_line_types = [ # These indicate we're starting a new line
                tokenize.NEWLINE, tokenize.DEDENT, tokenize.INDENT]
            prev_tok_string = prev_tok[1] if prev_tok else ''
            if prev_toktype not in start_of_line_types:
                if token_string == '.' and prev_tok_string == 'from':
                    pass # Leave relative-import dots ("from . import x") alone
                else:
                    # This is just a regular operator; remove spaces
                    remove_columns.append(start_col) # Before OP
                    remove_columns.append(end_col+1) # After OP
        if token_string.endswith('\n'):
            out_line += token_string
            if remove_columns:
                for col in remove_columns:
                    col = col - lshift
                    try:
                        # This was really handy for debugging (looks nice,
                        # worth saving):
                        #print(out_line + (" " * col) + "^")
                        # The above points to the character we're looking at
                        if out_line[col] == " ": # Only if it is a space
                            out_line = out_line[:col] + out_line[col+1:]
                            lshift += 1 # Re-align future changes on this line
                    except IndexError: # Reached the end of the line; no biggie
                        pass
            out += out_line
            remove_columns = []
            out_line = ""
            lshift = 1
        else:
            out_line += token_string
        prev_toktype = token_type
        prev_tok = tok
        last_col = end_col
        last_lineno = end_line
    # This makes sure to capture the last line if it doesn't end in a newline:
    out += out_line
    # Note: the tokenize module doesn't recognize the @ sign before a decorator
    return out
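
# Usage sketch (hedged; mirrors the docstring example above). The exact output
# depends on tokenize, but should look like: test="This is a %s"%foo
#
#     >>> print(reduce_operators('test = "This is a %s" % foo\n'))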

def join_multiline_pairs(text, pair="()"):
    """
    Finds and removes newlines in multiline matching pairs of characters in
    *text*.

    By default it joins parens () but it will join any two characters given
    via the *pair* variable.

    .. note::

        Doesn't remove extraneous whitespace that ends up between the pair.
        Use `reduce_operators()` for that.

    Example::

        test = (
            "This is inside a multi-line pair of parentheses"
        )

    Will become::

        test = ( "This is inside a multi-line pair of parentheses" )
    """
    # Readability variables
    opener = pair[0]
    closer = pair[1]
    # Tracking variables
    inside_pair = False
    inside_quotes = False
    inside_double_quotes = False
    inside_single_quotes = False
    quoted_string = False
    openers = 0
    closers = 0
    linecount = 0
    # Regular expressions
    opener_regex = re.compile(re.escape(opener))
    closer_regex = re.compile(re.escape(closer))
    output = ""
    for line in text.split('\n'):
        escaped = False
        # First we rule out multi-line strings
        multiline_match = multiline_quoted_string.search(line)
        not_quoted_string_match = not_quoted_string.search(line)
        if multiline_match and not not_quoted_string_match and not quoted_string:
            if len(line.split('"""')) > 2 or len(line.split("'''")) > 2:
                # This is a single line that uses the triple quotes twice
                # Treat it as if it were just a regular line:
                output += line + '\n'
                quoted_string = False
            else:
                output += line + '\n'
                quoted_string = True
        elif quoted_string and multiline_quoted_string.search(line):
            output += line + '\n'
            quoted_string = False
        # Now let's focus on the lines containing our opener and/or closer:
        elif not quoted_string:
            if opener_regex.search(line) or closer_regex.search(line) or inside_pair:
                for character in line:
                    if character == opener:
                        if not escaped and not inside_quotes:
                            openers += 1
                            inside_pair = True
                            output += character
                        else:
                            escaped = False
                            output += character
                    elif character == closer:
                        if not escaped and not inside_quotes:
                            if openers and openers == (closers + 1):
                                closers = 0
                                openers = 0
                                inside_pair = False
                                output += character
                            else:
                                closers += 1
                                output += character
                        else:
                            escaped = False
                            output += character
                    elif character == '\\':
                        if escaped:
                            escaped = False
                            output += character
                        else:
                            escaped = True
                            output += character
                    elif character == '"' and escaped:
                        output += character
                        escaped = False
                    elif character == "'" and escaped:
                        output += character
                        escaped = False
                    elif character == '"' and inside_quotes:
                        if inside_single_quotes:
                            output += character
                        else:
                            inside_quotes = False
                            inside_double_quotes = False
                            output += character
                    elif character == "'" and inside_quotes:
                        if inside_double_quotes:
                            output += character
                        else:
                            inside_quotes = False
                            inside_single_quotes = False
                            output += character
                    elif character == '"' and not inside_quotes:
                        inside_quotes = True
                        inside_double_quotes = True
                        output += character
                    elif character == "'" and not inside_quotes:
                        inside_quotes = True
                        inside_single_quotes = True
                        output += character
                    elif character == ' ' and inside_pair and not inside_quotes:
                        if output[-1] not in (' ', opener):
                            output += ' '
                    else:
                        if escaped:
                            escaped = False
                        output += character
                if not inside_pair:
                    output += '\n'
            else:
                output += line + '\n'
        else:
            output += line + '\n'
    # Clean up
    output = trailing_newlines.sub('\n', output)
    return output
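
# Usage sketch (hedged): join parens first, then brackets and braces, in the
# same order minify() applies below:
#
#     >>> src = 'test = (\n    "inside a multi-line pair"\n)\n'
#     >>> print(join_multiline_pairs(src))
#     # expected: the three lines joined into a single line
#     >>> join_multiline_pairs(src, "[]") # no-op here; src has no brackets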

def dedent(source, use_tabs=False):
    """
    Minimizes indentation to save precious bytes. Optionally, *use_tabs*
    may be specified if you want to use tabulators (\\t) instead of spaces.

    Example::

        def foo(bar):
            test = "This is a test"

    Will become::

        def foo(bar):
         test = "This is a test"
    """
    if use_tabs:
        indent_char = '\t'
    else:
        indent_char = ' '
    io_obj = io.StringIO(source)
    out = ""
    last_lineno = -1
    last_col = 0
    prev_start_line = 0
    indentation = ""
    indentation_level = 0
    for i, tok in enumerate(tokenize.generate_tokens(io_obj.readline)):
        token_type = tok[0]
        token_string = tok[1]
        start_line, start_col = tok[2]
        end_line, end_col = tok[3]
        if start_line > last_lineno:
            last_col = 0
        if token_type == tokenize.INDENT:
            indentation_level += 1
            continue
        if token_type == tokenize.DEDENT:
            indentation_level -= 1
            continue
        indentation = indent_char * indentation_level
        if start_line > prev_start_line:
            out += indentation + str(token_string)
        elif start_col > last_col:
            out += indent_char + str(token_string)
        else:
            out += token_string
        prev_start_line = start_line
        last_col = end_col
        last_lineno = end_line
    return out
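
# Usage sketch (hedged):
#
#     >>> print(dedent('def foo(bar):\n    test = "This is a test"\n'))
#     # expected: the body indented with a single space instead of four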

# TODO: Rewrite this to use tokens
def fix_empty_methods(source):
    """
    Appends 'pass' to empty methods/functions (i.e. where there was nothing
    but a docstring before we removed it =).

    Example::

        # Note: This triple-single-quote inside a triple-double-quote is also a
        # pyminifier self-test
        def myfunc():
            '''This is just a placeholder function.'''

    Will become::

        def myfunc(): pass
    """
    def_indentation_level = 0
    output = ""
    just_matched = False
    previous_line = None
    method = re.compile(r'^\s*def\s*.*\(.*\):.*$')
    for line in source.split('\n'):
        if len(line.strip()) > 0: # Don't look at blank lines
            if just_matched:
                this_indentation_level = len(line.rstrip()) - len(line.strip())
                if def_indentation_level == this_indentation_level:
                    # This method is empty; insert a 'pass' statement
                    indent = " " * (def_indentation_level + 1)
                    output += "%s\n%spass\n%s\n" % (previous_line, indent, line)
                else:
                    output += "%s\n%s\n" % (previous_line, line)
                just_matched = False
            elif method.match(line):
                def_indentation_level = len(line) - len(line.strip())
                just_matched = True
                previous_line = line
            else:
                output += "%s\n" % line # Another self-test
        else:
            output += "\n"
    return output
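
# Usage sketch (hedged): meant for source whose docstrings have already been
# blanked out, leaving an empty def body behind:
#
#     >>> print(fix_empty_methods('def myfunc():\nprint("done")\n'))
#     def myfunc():
#      pass
#     print("done")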

def remove_blank_lines(source):
    """
    Removes blank lines from *source* and returns the result.

    Example:

    .. code-block:: python

        test = "foo"

        test2 = "bar"

    Will become:

    .. code-block:: python

        test = "foo"
        test2 = "bar"
    """
    io_obj = io.StringIO(source)
    source = [a for a in io_obj.readlines() if a.strip()]
    return "".join(source)

def minify(tokens, options):
    """
    Performs minification on *tokens* according to the values in *options*.
    """
    # Remove comments
    remove_comments(tokens)
    # Remove docstrings
    remove_docstrings(tokens)
    result = token_utils.untokenize(tokens)
    # Minify our input script
    result = multiline_indicator.sub('', result)
    result = fix_empty_methods(result)
    result = join_multiline_pairs(result)
    result = join_multiline_pairs(result, '[]')
    result = join_multiline_pairs(result, '{}')
    result = remove_blank_lines(result)
    result = reduce_operators(result)
    result = dedent(result, use_tabs=options.tabs)
    return result
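
# Sketch of driving minify() end-to-end (hedged: assumes an options object
# exposing a `tabs` attribute, as used above, and that remove_comments() is
# available in this module; 'some_script.py' is a hypothetical input file):
#
#     >>> import io, tokenize
#     >>> class Options(object):
#     ...     tabs = False
#     >>> src = open('some_script.py').read()
#     >>> tokens = [list(tok) for tok in
#     ...           tokenize.generate_tokens(io.StringIO(src).readline)]
#     >>> print(minify(tokens, Options()))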