Process CSV files


We expect two parameters on the command line: the number of parallel processes to run and how many files to process. For the number of parallel processes, 0 means not to use the forking mechanism at all. We use the number of files instead of accepting the list of files on the command line, because it is easier to select a subset of the files this way.

examples/forks/process_csv.pl
use strict;
use warnings;
use Time::HiRes qw(time);
use lib '.';
use Task;
use ForkedProcessCSV;

# Script entry point: all work happens inside main(), defined below.
main();

# Entry point of the script.
#
# Reads two values from @ARGV:
#   PARALLELS  - number of parallel processes to use; 0 means the files are
#                processed one after the other in this process, without
#                going through ForkedProcessCSV at all.
#   FILE_COUNT - how many of the data_*.csv files (taken in sorted order)
#                to process.
#
# Prints one "FILE RESULT" line per selected file, where RESULT is whatever
# Task::process_file / ForkedProcessCSV::process_csv reported for that file,
# followed by the elapsed time in seconds.
#
# Dies with a usage message when an argument is missing or is not a
# non-negative integer, and with "Not enough files" when FILE_COUNT is larger
# than the number of matching files.
sub main {
    my ($parallels, $file_count) = @ARGV;
    die "Usage $0 PARALLELS FILE_COUNT\n"  if not defined $file_count;

    # Both values are used numerically below (==, >, ..). Without this check
    # a typo such as "x" would only produce a warning, be treated as 0 and
    # let the script "succeed" without processing anything.
    for my $arg ($parallels, $file_count) {
        die "PARALLELS and FILE_COUNT must be non-negative integers\n"
          . "Usage $0 PARALLELS FILE_COUNT\n"
            if $arg !~ /\A[0-9]+\z/;
    }

    my %results;
    # Sorted so that the same FILE_COUNT always selects the same files.
    my @files = sort glob "data_*.csv";
    die "Not enough files\n" if $file_count > @files;
    # For FILE_COUNT == 0 the range 0 .. -1 is empty, so no file is selected.
    @files = @files[0 .. $file_count-1];

    my $start = time;
    if ($parallels == 0) {
        # Sequential mode: no forking involved.
        for my $file (@files) {
            my $total = Task::process_file($file);
            $results{$file} = $total;
        }
    } else {
        %results = ForkedProcessCSV::process_csv($parallels, @files);
    }

    # Report in the (sorted) order of @files rather than in hash order.
    for my $file (@files) {
        print "$file $results{$file}\n";
    }

    my $end = time;
    my $elapsed = $end-$start;
    printf "Elapsed time %.2f\n", $elapsed;
}

$ perl process_csv.pl 0 1
Elapsed time 1.51

$ perl process_csv.pl 0 4
Elapsed time 5.92

$ perl process_csv.pl 2 4
Elapsed time 4.02

$ perl process_csv.pl 4 4
Elapsed time 4.01

$ perl process_csv.pl 0 10
Elapsed time 15.18

$ perl process_csv.pl 4 10
Elapsed time 9.05