Source code for pyminifier.obfuscate

#!/usr/bin/env python
# -*- coding: utf-8 -*-

__doc__ = """\
A collection of functions for obfuscating code.
"""

import os, sys, tokenize, keyword, unicodedata
from random import shuffle, choice
from itertools import permutations

# Import our own modules
from . import analyze
from . import token_utils

if sys.version_info.major == 3:
    unichr = chr # So we can support both 2 and 3

try:
    unichr(0x10000) # Will raise ValueError on narrow Python builds
    HIGHEST_UNICODE = 0x10FFFF # 1114111
except ValueError:
    HIGHEST_UNICODE = 0xFFFF # 65535

# Reserved words can be overridden by the script that imports this module
RESERVED_WORDS = keyword.kwlist + analyze.builtins
VAR_REPLACEMENTS = {} # So we can reference what's already been replaced
FUNC_REPLACEMENTS = {}
CLASS_REPLACEMENTS = {}
UNIQUE_REPLACEMENTS = {}


def obfuscation_machine(use_unicode=False, identifier_length=1):
    """
    A generator that returns short sequential combinations of lower and
    upper-case letters that will never repeat.

    If *use_unicode* is ``True``, use nonlatin Cyrillic, Arabic, and Syriac
    letters instead of the usual ABCs.

    The *identifier_length* represents the length of the string to return
    using the aforementioned characters.
    """
    # This generates a list of the letters a-z:
    lowercase = list(map(chr, range(97, 123)))
    # Same thing but ALL CAPS (stop at 91 so 'Z' is included):
    uppercase = list(map(chr, range(65, 91)))
    if use_unicode:
        # Python 3 lets us have some *real* fun:
        allowed_categories = ('LC', 'Ll', 'Lu', 'Lo')
        # All the fun characters start at 1580 (hehe):
        big_list = list(map(chr, range(1580, HIGHEST_UNICODE)))
        max_chars = 1000 # Ought to be enough for anybody :)
        combined = []
        rtl_categories = ('AL', 'R') # AL == Arabic, R == Any right-to-left
        last_orientation = 'L'       # L == Any left-to-right
        # Find a good mix of left-to-right and right-to-left characters
        while len(combined) < max_chars:
            char = choice(big_list)
            if unicodedata.category(char) in allowed_categories:
                orientation = unicodedata.bidirectional(char)
                if last_orientation in rtl_categories:
                    if orientation not in rtl_categories:
                        combined.append(char)
                else:
                    if orientation in rtl_categories:
                        combined.append(char)
                last_orientation = orientation
    else:
        combined = lowercase + uppercase
    shuffle(combined) # Randomize it all to keep things interesting
    while True:
        for perm in permutations(combined, identifier_length):
            perm = "".join(perm)
            if perm not in RESERVED_WORDS: # Can't replace reserved words
                yield perm
        identifier_length += 1
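
# Illustrative sketch (not part of the original pyminifier source): a quick
# look at what obfuscation_machine() yields.  The exact names vary because
# the character pool is shuffled.
def _demo_obfuscation_machine():
    gen = obfuscation_machine(identifier_length=1)
    names = [next(gen) for _ in range(5)]
    print(names) # e.g. ['q', 'D', 'x', 'M', 'b']; unique and never reserved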

def apply_obfuscation(source):
    """
    Returns *source* fully obfuscated.
    """
    global keyword_args
    global imported_modules
    tokens = token_utils.listified_tokenizer(source)
    keyword_args = analyze.enumerate_keyword_args(tokens)
    imported_modules = analyze.enumerate_imports(tokens)
    variables = find_obfuscatables(tokens, obfuscatable_variable)
    classes = find_obfuscatables(tokens, obfuscatable_class)
    functions = find_obfuscatables(tokens, obfuscatable_function)
    # replace_obfuscatables() expects the name of the module being obfuscated
    # as its first argument; '__main__' is used because this function is only
    # called from the command-line entry point at the bottom of this file.
    for variable in variables:
        replace_obfuscatables(
            '__main__', tokens, obfuscate_variable, variable, name_generator)
    for function in functions:
        replace_obfuscatables(
            '__main__', tokens, obfuscate_function, function, name_generator)
    for _class in classes:
        replace_obfuscatables(
            '__main__', tokens, obfuscate_class, _class, name_generator)
    return token_utils.untokenize(tokens)
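
# Illustrative sketch (not part of the original source): apply_obfuscation()
# reads the module-level *name_generator*, so one has to be created first.
def _demo_apply_obfuscation():
    global name_generator
    name_generator = obfuscation_machine(identifier_length=1)
    source = (
        "def greet(person_name):\n"
        "    greeting_text = 'Hello, ' + person_name\n"
        "    return greeting_text\n"
    )
    # Prints the snippet with its obfuscatable names replaced; exactly which
    # names qualify depends on the analyze module's heuristics.
    print(apply_obfuscation(source))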

def find_obfuscatables(tokens, obfunc, ignore_length=False):
    """
    Iterates over *tokens*, which must be an equivalent output to what
    tokenize.generate_tokens() produces, calling *obfunc* on each with the
    following parameters:

        - **tokens:** The current list of tokens.
        - **index:**  The current position in the list.

    *obfunc* is expected to return the token string if that token can be
    safely obfuscated **or** one of the following optional values which will
    instruct find_obfuscatables() how to proceed:

        - **'__skipline__'** Keep skipping tokens until a newline is reached.
        - **'__skipnext__'** Skip the next token in the sequence.

    If *ignore_length* is ``True`` then single-character obfuscatables will
    be obfuscated anyway (even though it wouldn't save any space).
    """
    global keyword_args
    keyword_args = analyze.enumerate_keyword_args(tokens)
    global imported_modules
    imported_modules = analyze.enumerate_imports(tokens)
    skip_line = False
    skip_next = False
    obfuscatables = []
    for index, tok in enumerate(tokens):
        token_type = tok[0]
        if token_type == tokenize.NEWLINE:
            skip_line = False
        if skip_line:
            continue
        result = obfunc(tokens, index, ignore_length=ignore_length)
        if result:
            if skip_next:
                skip_next = False
            elif result == '__skipline__':
                skip_line = True
            elif result == '__skipnext__':
                skip_next = True
            elif result in obfuscatables:
                pass
            else:
                obfuscatables.append(result)
        else: # If result is empty we need to reset skip_next so we don't
            skip_next = False # accidentally skip the next identifier
    return obfuscatables

# Note: I'm using 'tok' instead of 'token' since 'token' is a built-in module
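
# Illustrative sketch (not part of the original source): list the variable
# names that find_obfuscatables() considers safe to rename in a tiny snippet.
def _demo_find_obfuscatables():
    source = (
        "def add(first_item, second_item):\n"
        "    total_value = first_item + second_item\n"
        "    return total_value\n"
    )
    tokens = token_utils.listified_tokenizer(source)
    # Expect something like ['total_value', ...]; the exact list depends on
    # the analyze module's notion of keyword arguments and imports.
    print(find_obfuscatables(tokens, obfuscatable_variable))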

def obfuscatable_variable(tokens, index, ignore_length=False):
    """
    Given a list of *tokens* and an *index* (representing the current
    position), returns the token string if it is a variable name that can be
    safely obfuscated.

    Returns '__skipline__' if the rest of the tokens on this line should be
    skipped.  Returns '__skipnext__' if the next token should be skipped.

    If *ignore_length* is ``True``, even variables that are already a single
    character will be obfuscated (typically only used with the ``--nonlatin``
    option).
    """
    tok = tokens[index]
    token_type = tok[0]
    token_string = tok[1]
    line = tok[4]
    if index > 0:
        prev_tok = tokens[index-1]
    else: # Pretend it's a newline (for simplicity)
        prev_tok = (54, '\n', (1, 1), (1, 2), '#\n')
    prev_tok_type = prev_tok[0]
    prev_tok_string = prev_tok[1]
    try:
        next_tok = tokens[index+1]
    except IndexError: # Pretend it's a newline
        next_tok = (54, '\n', (1, 1), (1, 2), '#\n')
    next_tok_string = next_tok[1]
    if token_string == "=":
        return '__skipline__'
    if token_type != tokenize.NAME:
        return None # Skip this token
    if token_string.startswith('__'):
        return None
    if next_tok_string == ".":
        if token_string in imported_modules:
            return None
    if prev_tok_string == ".":
        return '__skipnext__'
    if prev_tok_string == "for":
        if len(token_string) > 2:
            return token_string
    if token_string == "for":
        return None
    if token_string in keyword_args.keys():
        return None
    if token_string in ["def", "class", 'if', 'elif', 'import']:
        return '__skipline__'
    if prev_tok_type != tokenize.INDENT and '=' not in line:
        return '__skipline__'
    if not ignore_length:
        if len(token_string) < 3:
            return None
    if token_string in RESERVED_WORDS:
        return None
    return token_string

def obfuscatable_class(tokens, index, **kwargs):
    """
    Given a list of *tokens* and an *index* (representing the current
    position), returns the token string if it is a class name that can be
    safely obfuscated.
    """
    tok = tokens[index]
    token_type = tok[0]
    token_string = tok[1]
    if index > 0:
        prev_tok = tokens[index-1]
    else: # Pretend it's a newline (for simplicity)
        prev_tok = (54, '\n', (1, 1), (1, 2), '#\n')
    prev_tok_string = prev_tok[1]
    if token_type != tokenize.NAME:
        return None # Skip this token
    if token_string.startswith('__'): # Don't mess with specials
        return None
    if prev_tok_string == "class":
        return token_string

def obfuscatable_function(tokens, index, **kwargs):
    """
    Given a list of *tokens* and an *index* (representing the current
    position), returns the token string if it is a function or method name
    that can be safely obfuscated.
    """
    tok = tokens[index]
    token_type = tok[0]
    token_string = tok[1]
    if index > 0:
        prev_tok = tokens[index-1]
    else: # Pretend it's a newline (for simplicity)
        prev_tok = (54, '\n', (1, 1), (1, 2), '#\n')
    prev_tok_string = prev_tok[1]
    if token_type != tokenize.NAME:
        return None # Skip this token
    if token_string.startswith('__'): # Don't mess with specials
        return None
    if prev_tok_string == "def":
        return token_string

def replace_obfuscatables(module, tokens, obfunc, replace, name_generator,
                          table=None):
    """
    Iterates over *tokens*, which must be an equivalent output to what
    tokenize.generate_tokens() produces, replacing the given identifier name
    (*replace*) by calling *obfunc* on each token with the following
    parameters:

        - **module:**          The name of the script we're currently
          obfuscating.
        - **tokens:**          The current list of all tokens.
        - **index:**           The current position.
        - **replace:**         The token string that we're replacing.
        - **replacement:**     A randomly generated, unique value that will be
          used to replace *replace*.
        - **right_of_equal:**  A True or False value representing whether or
          not the token is to the right of an equal sign.  **Note:** This gets
          reset to False if a comma or open paren are encountered.
        - **inside_parens:**   An integer that is incremented whenever an open
          paren is encountered and decremented when a close paren is
          encountered.
        - **inside_function:** If not False, the name of the function
          definition we're inside of (used in conjunction with *keyword_args*
          to determine if a safe replacement can be made).

    *obfunc* is expected to return the token string if that token can be
    safely obfuscated **or** one of the following optional values which will
    instruct replace_obfuscatables() how to proceed:

        - **'__open_paren__'**     Increment the inside_parens value.
        - **'__close_paren__'**    Decrement the inside_parens value.
        - **'__comma__'**          Reset the right_of_equal value to False.
        - **'__right_of_equal__'** Set the right_of_equal value to True.

    **Note:** The right_of_equal and the inside_parens values are reset
    whenever a NEWLINE is encountered.

    When obfuscating a list of files, *table* is used to keep track of which
    obfuscatable identifiers are which inside each resulting file.  It must be
    a list containing a single dictionary that will be populated like so::

        {orig_name: obfuscated_name}

    This *table* of "what is what" will be used to ensure that references from
    one script/module that call another are kept in sync when they are
    replaced with obfuscated values.
    """
    # Pretend the first line is '#\n':
    skip_line = False
    skip_next = False
    right_of_equal = False
    inside_parens = 0
    inside_function = False
    indent = 0
    function_indent = 0
    replacement = next(name_generator)
    for index, tok in enumerate(tokens):
        token_type = tok[0]
        token_string = tok[1]
        if token_type == tokenize.NEWLINE:
            skip_line = False
            right_of_equal = False
            inside_parens = 0
        elif token_type == tokenize.INDENT:
            indent += 1
        elif token_type == tokenize.DEDENT:
            indent -= 1
            if inside_function and function_indent == indent:
                function_indent = 0
                inside_function = False
        if token_string == "def":
            function_indent = indent
            function_name = tokens[index+1][1]
            inside_function = function_name
        result = obfunc(
            tokens,
            index,
            replace,
            replacement,
            right_of_equal,
            inside_parens,
            inside_function
        )
        if result:
            if skip_next:
                skip_next = False
            elif skip_line:
                pass
            elif result == '__skipline__':
                skip_line = True
            elif result == '__skipnext__':
                skip_next = True
            elif result == '__open_paren__':
                right_of_equal = False
                inside_parens += 1
            elif result == '__close_paren__':
                inside_parens -= 1
            elif result == '__comma__':
                right_of_equal = False
            elif result == '__right_of_equal__':
                right_of_equal = True
            else:
                if table: # Save it for later use in other files
                    combined_name = "%s.%s" % (module, token_string)
                    try: # Attempt to use an existing value
                        tokens[index][1] = table[0][combined_name]
                    except KeyError: # Doesn't exist yet; add it to the table
                        table[0].update({combined_name: result})
                        tokens[index][1] = result
                else:
                    tokens[index][1] = result

def obfuscate_variable(tokens, index, replace, replacement, right_of_equal,
                       inside_parens, inside_function):
    """
    If the token string inside *tokens[index]* matches *replace*, return
    *replacement*.  *right_of_equal* and *inside_parens* are used to determine
    whether or not this token is safe to obfuscate.
    """
    def return_replacement(replacement):
        VAR_REPLACEMENTS[replacement] = replace
        return replacement
    tok = tokens[index]
    token_type = tok[0]
    token_string = tok[1]
    if index > 0:
        prev_tok = tokens[index-1]
    else: # Pretend it's a newline (for simplicity)
        prev_tok = (54, '\n', (1, 1), (1, 2), '#\n')
    prev_tok_string = prev_tok[1]
    try:
        next_tok = tokens[index+1]
    except IndexError: # Pretend it's a newline
        next_tok = (54, '\n', (1, 1), (1, 2), '#\n')
    if token_string == "import":
        return '__skipline__'
    if next_tok[1] == '.':
        if token_string in imported_modules:
            return None
    if token_string == "=":
        return '__right_of_equal__'
    if token_string == "(":
        return '__open_paren__'
    if token_string == ")":
        return '__close_paren__'
    if token_string == ",":
        return '__comma__'
    if token_type != tokenize.NAME:
        return None # Skip this token
    if token_string.startswith('__'):
        return None
    if prev_tok_string == 'def':
        return '__skipnext__' # Don't want to touch functions
    if token_string == replace and prev_tok_string != '.':
        if inside_function:
            if token_string not in keyword_args[inside_function]:
                if not right_of_equal:
                    if not inside_parens:
                        return return_replacement(replacement)
                    else:
                        if next_tok[1] != '=':
                            return return_replacement(replacement)
                elif not inside_parens:
                    return return_replacement(replacement)
                else:
                    if next_tok[1] != '=':
                        return return_replacement(replacement)
        elif not right_of_equal:
            if not inside_parens:
                return return_replacement(replacement)
            else:
                if next_tok[1] != '=':
                    return return_replacement(replacement)
        elif right_of_equal and not inside_parens:
            return return_replacement(replacement)

def obfuscate_function(tokens, index, replace, replacement, *args):
    """
    If the token string (a function) inside *tokens[index]* matches *replace*,
    return *replacement*.
    """
    def return_replacement(replacement):
        FUNC_REPLACEMENTS[replacement] = replace
        return replacement
    tok = tokens[index]
    token_type = tok[0]
    token_string = tok[1]
    prev_tok = tokens[index-1]
    prev_tok_string = prev_tok[1]
    if token_type != tokenize.NAME:
        return None # Skip this token
    if token_string.startswith('__'):
        return None
    if token_string == replace:
        if prev_tok_string != '.':
            return return_replacement(replacement)
        else:
            parent_name = tokens[index-2][1]
            if parent_name in CLASS_REPLACEMENTS:
                # This should work for @classmethod methods
                return return_replacement(replacement)
            elif parent_name in VAR_REPLACEMENTS:
                # This covers regular ol' instance methods
                return return_replacement(replacement)

def obfuscate_class(tokens, index, replace, replacement, *args):
    """
    If the token string (a class) inside *tokens[index]* matches *replace*,
    return *replacement*.
    """
    def return_replacement(replacement):
        CLASS_REPLACEMENTS[replacement] = replace
        return replacement
    tok = tokens[index]
    token_type = tok[0]
    token_string = tok[1]
    prev_tok = tokens[index-1]
    prev_tok_string = prev_tok[1]
    if token_type != tokenize.NAME:
        return None # Skip this token
    if token_string.startswith('__'):
        return None
    if prev_tok_string != '.':
        if token_string == replace:
            return return_replacement(replacement)

def obfuscate_unique(tokens, index, replace, replacement, *args):
    """
    If the token string (a unique value anywhere) inside *tokens[index]*
    matches *replace*, return *replacement*.

    .. note::

        This function is only for replacing absolutely unique occurrences of
        *replace* (where we don't have to worry about their position).
    """
    def return_replacement(replacement):
        UNIQUE_REPLACEMENTS[replacement] = replace
        return replacement
    tok = tokens[index]
    token_type = tok[0]
    token_string = tok[1]
    if token_type != tokenize.NAME:
        return None # Skip this token
    if token_string == replace:
        return return_replacement(replacement)

def remap_name(name_generator, names, table=None):
    """
    Produces a series of variable assignments in the form of::

        <obfuscated name> = <some identifier>

    for each item in *names* using *name_generator* to come up with the
    replacement names.

    If *table* is provided, replacements will be looked up there before
    generating a new unique name.
    """
    out = ""
    for name in names:
        if table and name in table[0].keys():
            replacement = table[0][name]
        else:
            replacement = next(name_generator)
        out += "%s=%s\n" % (replacement, name)
    return out
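
# Illustrative sketch (not part of the original source): remap_name() only
# builds the assignment lines; it does not touch any tokens.
def _demo_remap_name():
    gen = obfuscation_machine(identifier_length=1)
    print(remap_name(gen, ['len', 'print'])) # e.g. "x=len\nJ=print\n"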

def insert_in_next_line(tokens, index, string):
    """
    Tokenizes *string* and inserts the resulting tokens into *tokens* right
    after the next newline that follows *tokens[index]*.
    """
    tokenized_string = token_utils.listified_tokenizer(string)
    for i, tok in enumerate(tokens[index:]):
        token_type = tok[0]
        if token_type in [tokenize.NL, tokenize.NEWLINE]:
            for count, item in enumerate(tokenized_string):
                tokens.insert(index+count+i+1, item)
            break
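
# Illustrative sketch (not part of the original source): insert a line of
# tokenized code after the first newline of a two-line snippet.
def _demo_insert_in_next_line():
    tokens = token_utils.listified_tokenizer(
        "first_line = 1\nsecond_line = 2\n")
    insert_in_next_line(tokens, 0, "inserted_line = 0\n")
    # Prints the snippet with inserted_line between the two originals
    print(token_utils.untokenize(tokens))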

def obfuscate_builtins(module, tokens, name_generator, table=None):
    """
    Inserts an assignment, '<obfuscated identifier> = <builtin function>' at
    the beginning of *tokens* (after the shebang and encoding if present) for
    every Python built-in function that is used inside *tokens*.  Also,
    replaces all of said built-in functions in *tokens* with each respective
    obfuscated identifier.

    Obfuscated identifier names are pulled out of *name_generator* via next().

    If *table* is provided, replacements will be looked up there before
    generating a new unique name.
    """
    used_builtins = analyze.enumerate_builtins(tokens)
    obfuscated_assignments = remap_name(name_generator, used_builtins, table)
    replacements = []
    for assignment in obfuscated_assignments.split('\n'):
        replacements.append(assignment.split('=')[0])
    replacement_dict = dict(zip(used_builtins, replacements))
    if table:
        table[0].update(replacement_dict)
    iter_replacements = iter(replacements)
    for builtin in used_builtins:
        replace_obfuscatables(
            module, tokens, obfuscate_unique, builtin, iter_replacements)
    # Check for shebangs and encodings before we do anything else
    skip_tokens = 0
    matched_shebang = False
    matched_encoding = False
    for tok in tokens[0:4]: # Will always be in the first four tokens
        line = tok[4]
        if analyze.shebang.match(line): # (e.g. '#!/usr/bin/env python')
            if not matched_shebang:
                matched_shebang = True
                skip_tokens += 1
        elif analyze.encoding.match(line): # (e.g. '# -*- coding: utf-8 -*-')
            if not matched_encoding:
                matched_encoding = True
                skip_tokens += 1
    insert_in_next_line(tokens, skip_tokens, obfuscated_assignments)
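
# Illustrative sketch (not part of the original source): hide the built-ins
# used by a one-liner behind generated aliases.  A shebang line is included
# so the alias assignments land before the code that uses them.
def _demo_obfuscate_builtins():
    tokens = token_utils.listified_tokenizer(
        "#!/usr/bin/env python\nprint(len('spam'))\n")
    obfuscate_builtins(
        'demo', tokens, obfuscation_machine(identifier_length=1))
    # Prints something like:
    #   #!/usr/bin/env python
    #   V=print
    #   k=len
    #   V(k('spam'))
    print(token_utils.untokenize(tokens))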

def obfuscate_global_import_methods(module, tokens, name_generator,
                                    table=None):
    """
    Replaces the used methods of globally-imported modules with obfuscated
    equivalents.  Updates *tokens* in-place.  *module* must be the name of the
    module we're currently obfuscating.

    If *table* is provided, replacements for import methods will be looked up
    there before generating a new unique name.
    """
    global_imports = analyze.enumerate_global_imports(tokens)
    local_imports = analyze.enumerate_local_modules(tokens, os.getcwd())
    module_methods = analyze.enumerate_import_methods(tokens)
    # Make a 1-to-1 mapping dict of module_method<->replacement:
    if table:
        replacement_dict = {}
        for module_method in module_methods:
            if module_method in table[0].keys():
                replacement_dict.update(
                    {module_method: table[0][module_method]})
            else:
                replacement_dict.update(
                    {module_method: next(name_generator)})
        # Update the global lookup table with the new entries:
        table[0].update(replacement_dict)
    else:
        method_map = [next(name_generator) for i in module_methods]
        replacement_dict = dict(zip(module_methods, method_map))
    import_line = False
    # Replace module methods with our obfuscated names in *tokens*
    for module_method in module_methods:
        for index, tok in enumerate(tokens):
            token_type = tok[0]
            token_string = tok[1]
            if token_type != tokenize.NAME:
                continue # Speedup
            if token_string == module_method.split('.')[0]:
                if tokens[index+1][1] == '.':
                    if tokens[index+2][1] == module_method.split('.')[1]:
                        if table: # Attempt to use an existing value
                            tokens[index][1] = table[0][module_method]
                            tokens[index+1][1] = ""
                            tokens[index+2][1] = ""
                        else:
                            tokens[index][1] = replacement_dict[module_method]
                            tokens[index+1][1] = ""
                            tokens[index+2][1] = ""
    # Insert our map of replacement=what after each respective module import
    for module_method, replacement in replacement_dict.items():
        indents = []
        index = 0
        for tok in tokens[:]:
            token_type = tok[0]
            token_string = tok[1]
            if token_type == tokenize.NEWLINE:
                import_line = False
            elif token_type == tokenize.INDENT:
                indents.append(tok)
            elif token_type == tokenize.DEDENT:
                indents.pop()
            elif token_string == "import":
                import_line = True
            elif import_line:
                if token_string == module_method.split('.')[0]:
                    # Insert the obfuscation assignment after the import
                    imported_module = ".".join(module_method.split('.')[:-1])
                    if table and imported_module in local_imports:
                        line = "%s=%s.%s\n" % ( # This ends up being 6 tokens
                            replacement_dict[module_method],
                            imported_module,
                            replacement_dict[module_method]
                        )
                    else:
                        line = "%s=%s\n" % ( # This ends up being 6 tokens
                            replacement_dict[module_method],
                            module_method
                        )
                    for indent in indents: # Fix indentation
                        line = "%s%s" % (indent[1], line)
                        index += 1
                    insert_in_next_line(tokens, index, line)
                    index += 6 # To make up for the six tokens we inserted
            index += 1
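
# Illustrative sketch (not part of the original source): alias a module
# method behind a generated name right after its import.
def _demo_obfuscate_global_import_methods():
    tokens = token_utils.listified_tokenizer(
        "import os\nprint(os.getcwd())\n")
    obfuscate_global_import_methods(
        'demo', tokens, obfuscation_machine(identifier_length=1))
    # Prints something like: "import os\nE=os.getcwd\nprint(E())"
    print(token_utils.untokenize(tokens))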

def obfuscate(module, tokens, options, name_generator=None, table=None):
    """
    Obfuscates *tokens* in-place.  *options* is expected to be the options
    variable passed through from pyminifier.py.

    *module* must be the name of the module we're currently obfuscating.

    If *name_generator* is provided it will be used to obtain replacement
    values for identifiers.  If not, a new instance of obfuscation_machine()
    will be created.

    If *table* is given (should be a list containing a single dictionary), it
    will be used to perform lookups of replacements and any new replacements
    will be added to it.
    """
    # Need a universal instance of our generator to avoid duplicates
    identifier_length = int(options.replacement_length)
    ignore_length = False
    if not name_generator:
        if options.use_nonlatin:
            ignore_length = True
            if sys.version_info[0] == 3:
                name_generator = obfuscation_machine(
                    use_unicode=True, identifier_length=identifier_length)
            else:
                print(
                    "ERROR: You can't use nonlatin characters without "
                    "Python 3")
        else:
            name_generator = obfuscation_machine(
                identifier_length=identifier_length)
    if options.obfuscate:
        variables = find_obfuscatables(
            tokens, obfuscatable_variable, ignore_length=ignore_length)
        classes = find_obfuscatables(tokens, obfuscatable_class)
        functions = find_obfuscatables(tokens, obfuscatable_function)
        for variable in variables:
            replace_obfuscatables(
                module, tokens, obfuscate_variable, variable,
                name_generator, table)
        for function in functions:
            replace_obfuscatables(
                module, tokens, obfuscate_function, function,
                name_generator, table)
        for _class in classes:
            replace_obfuscatables(
                module, tokens, obfuscate_class, _class,
                name_generator, table)
        obfuscate_global_import_methods(module, tokens, name_generator, table)
        obfuscate_builtins(module, tokens, name_generator, table)
    else:
        if options.obf_classes:
            classes = find_obfuscatables(tokens, obfuscatable_class)
            for _class in classes:
                replace_obfuscatables(
                    module, tokens, obfuscate_class, _class,
                    name_generator, table)
        if options.obf_functions:
            functions = find_obfuscatables(tokens, obfuscatable_function)
            for function in functions:
                replace_obfuscatables(
                    module, tokens, obfuscate_function, function,
                    name_generator, table)
        if options.obf_variables:
            variables = find_obfuscatables(tokens, obfuscatable_variable)
            for variable in variables:
                replace_obfuscatables(
                    module, tokens, obfuscate_variable, variable,
                    name_generator, table)
        if options.obf_import_methods:
            obfuscate_global_import_methods(
                module, tokens, name_generator, table)
        if options.obf_builtins:
            obfuscate_builtins(module, tokens, name_generator, table)
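
# Illustrative sketch (not part of the original source): driving obfuscate()
# without pyminifier's command-line parser.  The attribute names below mirror
# the options the function reads; in real use they come from pyminifier.py.
def _demo_obfuscate():
    class FakeOptions(object): # Stand-in for the parsed command-line options
        replacement_length = 1
        use_nonlatin = False
        obfuscate = True
        obf_classes = False
        obf_functions = False
        obf_variables = False
        obf_import_methods = False
        obf_builtins = False
    tokens = token_utils.listified_tokenizer(
        "#!/usr/bin/env python\n"
        "def compute_total(first_value, second_value):\n"
        "    running_total = first_value + second_value\n"
        "    return running_total\n")
    obfuscate('demo', tokens, FakeOptions())
    # Prints the snippet with its obfuscatable names replaced; exactly which
    # names qualify depends on the analyze module's heuristics.
    print(token_utils.untokenize(tokens))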
if __name__ == "__main__": global name_generator try: source = open(sys.argv[1]).read() except: print("Usage: %s <filename.py>" % sys.argv[0]) sys.exit(1) if sys.version_info[0] == 3: name_generator = obfuscation_machine(use_unicode=True) else: name_generator = obfuscation_machine(identifier_length=1) source = apply_obfuscation(source) print(source)