Multiprocess N files: Pool


Analyze N files in parallel.


examples/multiprocess/multiprocess_files.py
from multiprocessing import Pool
import os
import sys

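# analyze() runs in the worker processes. It must be defined at the top
# level of the module so multiprocessing can pickle a reference to it.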
def analyze(filename):
    print("Process {:>5} analyzing {}".format(os.getpid(), filename))
    digits = 0
    spaces = 0
    total  = 0
    with open(filename) as fh:
        for line in fh:
            for char in line:
                total += 1
                if char.isdecimal():
                    digits += 1
                if char == ' ':
                    spaces += 1
    return {
        'filename': filename,
        'total': total,
        'digits': digits,
        'spaces': spaces,
    }

def main():
    if len(sys.argv) < 3:
        exit("Usage: {} POOL_SIZE FILEs")
    size  = int(sys.argv[1])
    files = sys.argv[2:]

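    # map() blocks until all the files have been processed; the results
    # come back in the same order as the input list of files.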
    with Pool(size) as p:
        results = p.map(analyze, files)
    for res in results:
        print(res)

if __name__ == '__main__':
    main()

$ python multiprocess_files.py 3 multiprocess_*

Process 22688 analyzing multiprocess_files.py
Process 22689 analyzing multiprocess_load.py
Process 22690 analyzing multiprocess_pool_async.py
Process 22688 analyzing multiprocess_pool.py
{'filename': 'multiprocess_files.py', 'total': 833, 'digits': 10, 'spaces': 275}
{'filename': 'multiprocess_load.py', 'total': 694, 'digits': 14, 'spaces': 163}
{'filename': 'multiprocess_pool_async.py', 'total': 695, 'digits': 8, 'spaces': 161}
{'filename': 'multiprocess_pool.py', 'total': 397, 'digits': 3, 'spaces': 80}

We asked for a pool of 3 processes, but passed in 4 files, so looking at the process IDs you can see that one of the workers (22688 in this run) analyzed two files. Note also that map returns the results in the order of the input files, regardless of which worker finished first. The worker function can return any Python data structure that can be pickled. A dictionary is usually a good choice.
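
If you do not care about the order of the results, you can receive each one as soon as its worker finishes by using imap_unordered instead of map. A minimal sketch of the changed main(), assuming the rest of the script above stays the same:

def main():
    if len(sys.argv) < 3:
        exit("Usage: {} POOL_SIZE FILES".format(sys.argv[0]))
    size  = int(sys.argv[1])
    files = sys.argv[2:]

    with Pool(size) as p:
        # imap_unordered yields each result as soon as a worker finishes,
        # so the order of the output depends on processing speed.
        for res in p.imap_unordered(analyze, files):
            print(res)

If you create the Pool without an argument, Pool() starts os.cpu_count() worker processes by default.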