Python Optimization Techniques
Performance optimization strategies and CPython's built-in optimizations
CPython Optimizations
CPython includes several built-in optimizations that happen automatically during compilation and execution.
Before Optimization
if True:
    x = 1
else:
    x = 2
y = 2 * 3 + 4
After Optimization
x = 1
y = 10
Compile-time optimizations: dead code elimination, constant folding
Quick Wins
- Use list comprehensions over explicit loops
- Cache function results with @lru_cache
- Use built-in functions (written in C)
- Avoid repeated attribute lookups (see the sketch below)
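To illustrate the last point, binding a frequently used attribute (here, a bound method) to a local name once avoids repeating the lookup in a hot loop. A minimal sketch:

# Slower: result.append is looked up on every iteration
result = []
for i in range(100_000):
    result.append(i * 2)

# Faster: bind the bound method to a local name once
result = []
append = result.append
for i in range(100_000):
    append(i * 2)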
Python 3.11+ Features
- Adaptive bytecode specialization
- Zero-cost exception handling
- Faster frame creation
- Improved error messages
Compile-Time Optimizations
Peephole Optimizer
The peephole optimizer performs local optimizations on bytecode (in newer CPython versions, constant folding happens even earlier, on the AST):
# Constant folding
x = 2 * 3 + 4  # Becomes: x = 10

# Dead code elimination
if False:
    expensive_operation()  # Removed entirely

# Constant branch elimination
if True:
    x = 1
else:
    x = 2  # The else branch is removed
# Becomes: x = 1
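You can verify constant folding yourself with the standard dis module. A minimal check:

import dis

def folded():
    return 2 * 3 + 4

# The disassembly loads the precomputed constant 10 directly;
# no multiply or add instructions appear
dis.dis(folded)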
String Interning
# Strings that look like identifiers are interned
a = "hello"
b = "hello"
print(a is b)  # True - same object!

# Force interning for any string
import sys

c = sys.intern("hello world")
d = sys.intern("hello world")
print(c is d)  # True
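Why bother? In CPython, == on strings short-circuits when both operands are the same object, so interning speeds up code that compares the same strings repeatedly (dict key lookups benefit the same way). A rough micro-benchmark sketch:

import sys
import timeit

# Build two equal but distinct strings at runtime
s1 = "".join(["ab"] * 5000)
s2 = "".join(["ab"] * 5000)

# Interned copies refer to the same object
i1 = sys.intern(s1)
i2 = sys.intern(s2)

print(timeit.timeit(lambda: s1 == s2, number=100_000))  # Compares characters
print(timeit.timeit(lambda: i1 == i2, number=100_000))  # Identity shortcut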
Performance Best Practices
1. Use Built-in Functions
# Slow: Python loop
def sum_squares_slow(numbers):
    total = 0
    for n in numbers:
        total += n ** 2
    return total

# Fast: Built-in functions (C implementation)
def sum_squares_fast(numbers):
    return sum(n ** 2 for n in numbers)

# Even faster: NumPy for numerical work
import numpy as np

def sum_squares_numpy(numbers):
    arr = np.array(numbers)
    return np.sum(arr ** 2)
2. List Comprehensions vs Loops
import timeit

# Slow: Explicit loop
def loop_approach():
    result = []
    for i in range(1000):
        if i % 2 == 0:
            result.append(i ** 2)
    return result

# Fast: List comprehension
def comprehension_approach():
    return [i ** 2 for i in range(1000) if i % 2 == 0]

# Benchmark
loop_time = timeit.timeit(loop_approach, number=1000)
comp_time = timeit.timeit(comprehension_approach, number=1000)
print(f"Loop: {loop_time:.3f}s")
print(f"Comprehension: {comp_time:.3f}s")
print(f"Speedup: {loop_time/comp_time:.1f}x")
3. Local vs Global Variables
# Global variable access is slower
global_var = 10

def slow_function():
    total = 0
    for i in range(1000000):
        total += global_var  # Global lookup each time
    return total

def fast_function():
    local_var = global_var  # Copy to local
    total = 0
    for i in range(1000000):
        total += local_var  # Local access is faster
    return total
4. String Operations
# Bad: String concatenation in loop
def slow_string_build(items):
    result = ""
    for item in items:
        result += str(item) + ", "  # Creates a new string each time
    return result

# Good: Join
def fast_string_build(items):
    return ", ".join(str(item) for item in items)

# Also good: StringIO for complex building
from io import StringIO

def fast_complex_build(items):
    buffer = StringIO()
    for item in items:
        buffer.write(str(item))
        buffer.write(", ")
    return buffer.getvalue()
Memory Optimizations
Using __slots__
import sys

# Without __slots__: each instance carries a __dict__
# (roughly 296 bytes per instance; figures vary by Python version)
class Point:
    def __init__(self, x, y):
        self.x = x
        self.y = y

# With __slots__: no per-instance __dict__
# (roughly 56 bytes per instance)
class PointOptimized:
    __slots__ = ('x', 'y')

    def __init__(self, x, y):
        self.x = x
        self.y = y

# Memory comparison
p1 = Point(3, 4)
p2 = PointOptimized(3, 4)
print(f"Regular: {sys.getsizeof(p1.__dict__)} bytes")
print(f"Optimized: {sys.getsizeof(p2)} bytes")
Generator Expressions
# Memory-hungry: creates the entire list
def sum_squares_list(n):
    return sum([i**2 for i in range(n)])  # List in memory

# Memory-efficient: generator
def sum_squares_gen(n):
    return sum(i**2 for i in range(n))  # No list created

# Memory usage comparison
import tracemalloc

tracemalloc.start()
result = sum_squares_list(1000000)
current, peak = tracemalloc.get_traced_memory()
print(f"List: {peak / 1024 / 1024:.1f} MB")

tracemalloc.reset_peak()  # Requires Python 3.9+
result = sum_squares_gen(1000000)
current, peak = tracemalloc.get_traced_memory()
print(f"Generator: {peak / 1024 / 1024:.1f} MB")
Caching and Memoization
Using lru_cache
from functools import lru_cache
import time

# Without caching
def fibonacci_slow(n):
    if n < 2:
        return n
    return fibonacci_slow(n-1) + fibonacci_slow(n-2)

# With caching
@lru_cache(maxsize=128)
def fibonacci_fast(n):
    if n < 2:
        return n
    return fibonacci_fast(n-1) + fibonacci_fast(n-2)

# Performance comparison
start = time.time()
result = fibonacci_slow(35)
print(f"Without cache: {time.time() - start:.2f}s")

start = time.time()
result = fibonacci_fast(35)
print(f"With cache: {time.time() - start:.4f}s")
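On Python 3.9+, functools.cache offers an unbounded cache (equivalent to lru_cache(maxsize=None)) with slightly less bookkeeping. A minimal sketch:

from functools import cache  # Python 3.9+

@cache
def fibonacci(n):
    if n < 2:
        return n
    return fibonacci(n - 1) + fibonacci(n - 2)

print(fibonacci(100))          # 354224848179261915075, computed instantly
print(fibonacci.cache_info())  # Hits, misses, and current size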
Custom Memoization
import time
from functools import wraps

def memoize(func):
    cache = {}

    @wraps(func)  # Preserve the wrapped function's name and docstring
    def wrapper(*args):
        if args in cache:
            return cache[args]
        result = func(*args)
        cache[args] = result
        return result

    wrapper.cache = cache  # Expose cache for inspection
    return wrapper

@memoize
def expensive_function(x, y):
    time.sleep(1)  # Simulate expensive computation
    return x + y
Profiling Tools
Using cProfile
import cProfile
import pstats

def profile_code():
    # Your code to profile
    result = []
    for i in range(10000):
        result.append(i ** 2)
    return result

# Run profiler and save stats to a file
cProfile.run('profile_code()', 'profile_stats')

# Analyze results
p = pstats.Stats('profile_stats')
p.sort_stats('cumulative')
p.print_stats(10)  # Top 10 functions
Using line_profiler
# Install: pip install line_profiler
# The @profile decorator is injected by kernprof at runtime

@profile
def function_to_profile():
    total = 0
    for i in range(1000000):
        total += i
    return total

# Run with: kernprof -l -v script.py
Using memory_profiler
# Install: pip install memory-profiler
from memory_profiler import profile

@profile
def memory_intensive():
    a = [1] * (10 ** 6)
    b = [2] * (2 * 10 ** 7)
    del b
    return a

# Run with: python -m memory_profiler script.py
Algorithm Optimizations
Choose the Right Data Structure
# Membership testing

# List: O(n)
items_list = list(range(10000))

def check_in_list(item):
    return item in items_list

# Set: O(1) on average
items_set = set(range(10000))

def check_in_set(item):
    return item in items_set

# Dict for keyed lookups
lookup_dict = {i: i**2 for i in range(10000)}

def get_square(i):
    return lookup_dict.get(i, 0)
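Queues are another common trap: list.pop(0) shifts every remaining element and is O(n), while collections.deque pops from either end in O(1). A minimal sketch:

from collections import deque

items = range(100_000)

# List as a FIFO queue: each pop(0) is O(n)
queue_list = list(items)
first = queue_list.pop(0)

# deque as a FIFO queue: popleft() is O(1)
queue = deque(items)
first = queue.popleft()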
Early Exit
# Bad: always processes the entire list
def has_negative_slow(numbers):
    negatives = [n for n in numbers if n < 0]
    return len(negatives) > 0

# Good: exits on the first match
def has_negative_fast(numbers):
    return any(n < 0 for n in numbers)
Python 3.11+ Optimizations
Adaptive Bytecode
# Python 3.11 specializes bytecode based on the types it observes
def add(a, b):
    return a + b

# First calls: generic BINARY_OP
# After repeatedly seeing integers: specialized BINARY_OP_ADD_INT
# Result: up to 60% faster for the specialized operation
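On Python 3.11+ you can watch specialization happen with dis.dis(..., adaptive=True) (the adaptive parameter was added in 3.11). A minimal sketch:

import dis

def add(a, b):
    return a + b

# Warm the function up with integer arguments so the
# interpreter has a chance to specialize the bytecode
for _ in range(1_000):
    add(1, 2)

# With adaptive=True, the output shows specialized instructions
# such as BINARY_OP_ADD_INT instead of the generic BINARY_OP
dis.dis(add, adaptive=True)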
Zero-Cost Exceptions
# Python 3.11: try blocks cost nothing when no exception is raised
try:
    result = normal_operation()
except Exception:
    handle_error()
# No performance penalty for entering the try block!
Common Pitfalls
1. Premature Optimization
# Bad: optimizing without profiling
def over_optimized(x):
    # Obscure bit trick: computes floor(x * 1.5), integers only
    return ((x << 1) + x) >> 1

# Good: simple and readable
def readable(x):
    return x * 1.5
2. Mutable Default Arguments
# Bad: the default list is created once and shared across calls
def bad_append(item, lst=[]):
    lst.append(item)
    return lst

# Good: None default, fresh list per call
def good_append(item, lst=None):
    if lst is None:
        lst = []
    lst.append(item)
    return lst
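The failure mode shows up as soon as the bad version is called twice:

print(bad_append(1))   # [1]
print(bad_append(2))   # [1, 2] - the same default list was reused!
print(good_append(1))  # [1]
print(good_append(2))  # [2]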
Optimization Workflow
- Profile First: Identify actual bottlenecks
- Algorithmic Improvements: Better algorithms beat micro-optimizations
- Use Built-ins: Leverage C implementations
- Cache Results: Avoid redundant computations
- Consider NumPy/Cython: For numerical work
- Parallel Processing: For CPU-bound tasks (see the sketch below)
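For the parallel-processing step, the standard library's ProcessPoolExecutor sidesteps the GIL by running CPU-bound work in separate processes. A minimal sketch:

from concurrent.futures import ProcessPoolExecutor

def sum_of_squares(n):
    # CPU-bound work that would serialize on the GIL with threads
    return sum(i * i for i in range(n))

if __name__ == "__main__":
    inputs = [2_000_000] * 8
    # Each task runs in its own process and can use a separate core
    with ProcessPoolExecutor() as pool:
        results = list(pool.map(sum_of_squares, inputs))
    print(results)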
Key Takeaways
- Profile before optimizing - measure, don't guess
- Built-in functions are fast - they're implemented in C
- Local variables are faster than global
- List comprehensions beat explicit loops
- Generators save memory for large datasets
- Caching can provide huge speedups
- Choose the right data structure for your use case
- Python 3.11+ has significant performance improvements