# -*- coding: utf-8 -*-
__doc__ = """\
A module of useful functions for analyzing Python code.
"""
# Import builtins
import os, sys, re, tokenize, keyword
try:
    import cStringIO as io
except ImportError: # Ahh, Python 3
    import io
# Globals
py3 = False
if sys.version_info.major == 3:
    py3 = True
shebang = re.compile(r'^#\!.*$')
encoding = re.compile(r".*coding[:=]\s*([-\w.]+)")
# __builtins__ is different for every module so we need a hard-coded list:
builtins = [
'ArithmeticError',
'AssertionError',
'AttributeError',
'BaseException',
'BufferError',
'BytesWarning',
'DeprecationWarning',
'EOFError',
'Ellipsis',
'EnvironmentError',
'Exception',
'False',
'FloatingPointError',
'FutureWarning',
'GeneratorExit',
'IOError',
'ImportError',
'ImportWarning',
'IndentationError',
'IndexError',
'KeyError',
'KeyboardInterrupt',
'LookupError',
'MemoryError',
'NameError',
'None',
'NotImplemented',
'NotImplementedError',
'OSError',
'OverflowError',
'PendingDeprecationWarning',
'ReferenceError',
'RuntimeError',
'RuntimeWarning',
'StandardError',
'StopIteration',
'SyntaxError',
'SyntaxWarning',
'SystemError',
'SystemExit',
'TabError',
'True',
'TypeError',
'UnboundLocalError',
'UnicodeDecodeError',
'UnicodeEncodeError',
'UnicodeError',
'UnicodeTranslateError',
'UnicodeWarning',
'UserWarning',
'ValueError',
'Warning',
'ZeroDivisionError',
'__IPYTHON__',
'__IPYTHON__active',
'__debug__',
'__doc__',
'__import__',
'__name__',
'__package__',
'abs',
'all',
'any',
'apply',
'basestring',
'bin',
'bool',
'buffer',
'bytearray',
'bytes',
'callable',
'chr',
'classmethod',
'cmp',
'coerce',
'compile',
'complex',
'copyright',
'credits',
'delattr',
'dict',
'dir',
'divmod',
'dreload',
'enumerate',
'eval',
'execfile',
'exit',
'file',
'filter',
'float',
'format',
'frozenset',
'getattr',
'globals',
'hasattr',
'hash',
'help',
'hex',
'id',
'input',
'int',
'intern',
'ip_set_hook',
'ipalias',
'ipmagic',
'ipsystem',
'isinstance',
'issubclass',
'iter',
'jobs',
'len',
'license',
'list',
'locals',
'long',
'map',
'max',
'min',
'next',
'object',
'oct',
'open',
'ord',
'pow',
'print',
'property',
'quit',
'range',
'raw_input',
'reduce',
'reload',
'repr',
'reversed',
'round',
'set',
'setattr',
'slice',
'sorted',
'staticmethod',
'str',
'sum',
'super',
'tuple',
'type',
'unichr',
'unicode',
'vars',
'xrange',
'zip'
]
reserved_words = keyword.kwlist + builtins
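
# All of the enumerate_* functions below operate on *tokens*: a list of token
# 5-tuples as produced by the standard library's tokenize module.  The helper
# below is a minimal sketch of how such a list might be built from a source
# string (the helper name is illustrative and not part of the original module):
def _tokenize_string(source):
    """Return a list of token 5-tuples for the given *source* string."""
    return list(tokenize.generate_tokens(io.StringIO(source).readline))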

def enumerate_keyword_args(tokens):
    """
    Iterates over *tokens* and returns a dictionary with function names as the
    keys and lists of keyword arguments as the values.
    """
    keyword_args = {}
    inside_function = False
    for index, tok in enumerate(tokens):
        token_type = tok[0]
        token_string = tok[1]
        if token_type == tokenize.NEWLINE:
            inside_function = False
        if token_type == tokenize.NAME:
            if token_string == "def":
                function_name = tokens[index+1][1]
                inside_function = function_name
                keyword_args.update({function_name: []})
            elif inside_function:
                if tokens[index+1][1] == '=': # keyword argument
                    keyword_args[function_name].append(token_string)
    return keyword_args
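
# A hedged usage sketch: the illustrative helper below (not part of the
# original module) shows the shape of enumerate_keyword_args() output using
# _tokenize_string() defined above.
def _example_keyword_args():
    """Illustrative only.

    >>> _example_keyword_args()
    {'f': ['b', 'c']}
    """
    source = "def f(a, b=1, c=2): pass\n"
    return enumerate_keyword_args(_tokenize_string(source))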

def enumerate_imports(tokens):
    """
    Iterates over *tokens* and returns a list of all imported modules.

    **Note:** This is intelligent about the use of the 'as' keyword.
    """
    imported_modules = []
    import_line = False
    for index, tok in enumerate(tokens):
        token_type = tok[0]
        token_string = tok[1]
        if token_type == tokenize.NEWLINE:
            import_line = False
        elif token_string == "import":
            import_line = True
        elif import_line:
            if token_type == tokenize.NAME and tokens[index+1][1] != 'as':
                if token_string not in reserved_words:
                    if token_string not in imported_modules:
                        imported_modules.append(token_string)
    return imported_modules

def enumerate_global_imports(tokens):
    """
    Returns a list of all globally imported modules (skips modules imported
    inside of classes, methods, or functions).

    Example:

        >>> enumerate_global_imports(tokens)
        ['sys', 'os', 'tokenize', 're']
    """
    imported_modules = []
    import_line = False
    parent_module = ""
    function_count = 0
    indentation = 0
    for index, tok in enumerate(tokens):
        token_type = tok[0]
        token_string = tok[1]
        if token_type == tokenize.INDENT:
            indentation += 1
        elif token_type == tokenize.DEDENT:
            indentation -= 1
        elif token_type == tokenize.NEWLINE:
            import_line = False
        elif token_type == tokenize.NAME:
            if token_string in ["def", "class"]:
                function_count += 1
            if indentation == function_count - 1:
                function_count -= 1
            elif function_count >= indentation:
                if token_string == "import":
                    import_line = True
                elif import_line:
                    if token_type == tokenize.NAME and tokens[index+1][1] != 'as':
                        if token_string not in reserved_words:
                            if token_string not in imported_modules:
                                if tokens[index+1][1] == '.': # module.module
                                    parent_module = token_string + '.'
                                else:
                                    if parent_module:
                                        module_string = parent_module + token_string
                                        imported_modules.append(module_string)
                                        parent_module = ''
                                    else:
                                        imported_modules.append(token_string)
    return imported_modules

# TODO: Finish this (even though it isn't used):
def enumerate_dynamic_imports(tokens):
    """
    Returns a dictionary of all dynamically imported modules (those inside of
    classes or functions) in the form of {<func or class name>: [<modules>]}

    Example:

        >>> enumerate_dynamic_imports(tokens)
        {'myfunc': ['zlib', 'base64']}
    """
    imported_modules = []
    import_line = False
    for index, tok in enumerate(tokens):
        token_type = tok[0]
        token_string = tok[1]
        if token_type == tokenize.NEWLINE:
            import_line = False
        elif token_string == "import":
            try:
                if tokens[index-1][0] == tokenize.NEWLINE:
                    import_line = True
            except IndexError:
                import_line = True # Just means this is the first line
        elif import_line:
            if token_type == tokenize.NAME and tokens[index+1][1] != 'as':
                if token_string not in reserved_words:
                    if token_string not in imported_modules:
                        imported_modules.append(token_string)
    return imported_modules

def enumerate_method_calls(tokens, modules):
    """
    Returns a list of all object (not module) method calls in the given tokens.

    *modules* is expected to be a list of all global modules imported into the
    source code we're working on.

    For example:

        >>> enumerate_method_calls(tokens, modules)
        ['re.compile', 'sys.argv', 'f.write']
    """
    out = []
    for index, tok in enumerate(tokens):
        token_type = tok[0]
        token_string = tok[1]
        if token_type == tokenize.NAME:
            next_tok_string = tokens[index+1][1]
            if next_tok_string == '(': # Method call
                prev_tok_string = tokens[index-1][1]
                # Check if we're attached to an object or module
                if prev_tok_string == '.': # We're attached
                    prev_prev_tok_string = tokens[index-2][1]
                    if prev_prev_tok_string not in ['""', "''", ']', ')', '}']:
                        if prev_prev_tok_string not in modules:
                            to_replace = "%s.%s" % (
                                prev_prev_tok_string, token_string)
                            if to_replace not in out:
                                out.append(to_replace)
    return out
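
# enumerate_method_calls() needs the list of globally imported modules so it
# can skip module-level calls such as re.compile().  A hedged sketch of the
# intended pairing (the wrapper name is illustrative, not part of the module):
def _object_method_calls(tokens):
    """Illustrative helper: object method calls, excluding module methods."""
    modules = enumerate_global_imports(tokens)
    return enumerate_method_calls(tokens, modules)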

def enumerate_builtins(tokens):
    """
    Returns a list of all the builtins being used in *tokens*.
    """
    out = []
    for index, tok in enumerate(tokens):
        token_type = tok[0]
        token_string = tok[1]
        #if token_type == tokenize.NAME:
        if token_string in builtins:
            # Note: I need to test if print can be replaced in Python 3
            special_special = ['print'] # Print is special in Python 2
            if py3:
                special_special = []
            if token_string not in special_special:
                if not token_string.startswith('__'): # Don't count magic funcs
                    if tokens[index-1][1] != '.' and tokens[index+1][1] != '=':
                        if token_string not in out:
                            out.append(token_string)
    return out

def enumerate_import_methods(tokens):
    """
    Returns a list of imported module methods (such as re.compile) inside
    *tokens*.
    """
    global_imports = enumerate_global_imports(tokens)
    out = []
    for item in global_imports:
        for index, tok in enumerate(tokens):
            try:
                next_tok = tokens[index+1]
                try:
                    next_next_tok = tokens[index+2]
                except IndexError:
                    # Pretend it is a newline
                    next_next_tok = (54, '\n', (1, 1), (1, 2), '#\n')
            except IndexError: # Last token, no biggie
                # Pretend it is a newline here too
                next_tok = (54, '\n', (1, 1), (1, 2), '#\n')
            token_type = tok[0]
            token_string = tok[1]
            if token_string == item:
                if next_tok[1] == '.': # We're calling a method
                    module_method = "%s.%s" % (token_string, next_next_tok[1])
                    if module_method not in out:
                        out.append(module_method)
    return out

def enumerate_local_modules(tokens, path):
    """
    Returns a list of modules inside *tokens* that are local to *path*.

    **Note:** Will recursively look inside *path* for said modules.
    """
    # Have to get a list of all modules before we can do anything else
    modules = enumerate_imports(tokens)
    local_modules = []
    parent = ""
    # Now check the local dir for matching modules
    for root, dirs, files in os.walk(path):
        if not parent:
            parent = os.path.split(root)[1]
        for f in files:
            if f.endswith('.py'):
                f = f[:-3] # Strip .py
                # Use os.sep so sub-package paths also resolve on Windows
                module_tree = root.split(parent)[1].replace(os.sep, '.')
                module_tree = module_tree.lstrip('.')
                if module_tree:
                    module = "%s.%s" % (module_tree, f)
                else:
                    module = f
                if module in modules:
                    local_modules.append(module)
    return local_modules

def get_shebang(tokens):
    """
    Returns the shebang string in *tokens* if it exists. None if not.
    """
    # This (short) loop preserves shebangs and encoding strings:
    for tok in tokens[0:4]: # Will always be in the first four tokens
        line = tok[4]
        # Save the first comment line if it starts with a shebang
        # (e.g. '#!/usr/bin/env python')
        if shebang.match(line): # Must be first line
            return line
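
# A minimal, self-contained demonstration (a usage sketch, not part of the
# original API): tokenize the file named on the command line and print a few
# of the analyses defined above.
if __name__ == '__main__':
    if len(sys.argv) != 2:
        sys.stderr.write("Usage: %s <file.py>\n" % sys.argv[0])
        sys.exit(1)
    with open(sys.argv[1]) as f:
        _tokens = list(tokenize.generate_tokens(f.readline))
    print("Imports:        %r" % enumerate_imports(_tokens))
    print("Global imports: %r" % enumerate_global_imports(_tokens))
    print("Builtins used:  %r" % enumerate_builtins(_tokens))
    print("Keyword args:   %r" % enumerate_keyword_args(_tokens))
    print("Shebang:        %r" % get_shebang(_tokens))
    print("Local modules:  %r" % enumerate_local_modules(
        _tokens, os.path.dirname(os.path.abspath(sys.argv[1]))))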