#!/usr/bin/perl 

use strict;
use constant
Version => '1.04';
use Getopt::Long;
use File::Spec::Functions qw/devnull catfile tmpdir rootdir path curdir/;

<<'DOC'
= summarize - calculate totals and subtotals of columns

= Synopsis
summarize [options] [file...]	

== Options
-h,--help		print this message and exit
-V,--version		print version and exit
-q,--quiet		suppress progress messages
   --rc=file		read file as a configuration file, instead of the default files (see below)
-c,--change=INT		changes in this column generate subtotals; default: 1
-g,--grand		print grand total; default: true if --change is not used, else false
-l,--line		print subtotals when the --change column changes
-o,--original		print the input records as well as (sub)total records
-r,--running		print running sum
-s,--sumcols=col,col...	columns to be summarized; default: all columns
   --tab=sepr_string	column separator; default: a tab
-t,--tex		run in TeX mode, see below
-w,--warn		suppress Perl warnings
   --test		run a self test

= Description
summarize calculates the totals and/or the subtotals of columns in a
file. The totals are printed as an extra record at the end. If more
files are given they are concatenated. If no file is given, standard
input is used.

Columns are defined by the separator string. The default is the tab.

Subtotals are printed if the |--change| option is given. Subtotals are printed
as extra records between records where the
change-field changes. 

In fields containing non-numerical data or illegal numerical characters,
the first of those characters plus all characters following it are removed. 
A warning is issued, unless you use the |--warn| option.
However, leading and trailing whitespace is removed without warning before
the value is used, and if an empty string remains it is counted as zero.

= Options
Options are shown in the Synopsis section in logically identical pairs,
with the full version in the first column and the minimum shorthand
(without any parameters) in the second. Options marked with an asterisk (|*|)
are boolean options. Default values are shown in the third column.

You can use either and you can bundle single character options. Thus:

        summarize --sumcols 2,3 --original --line

can also be done with:

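        summarize -ols2,3

(An option that takes a string value, such as |-s|, must come last in a
bundle, because the rest of the bundle is taken as its value.)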


Before evaluating any options, summarize will try to read a system
rc-file, a user rc-file, and, finally an rc-file in the current
directory. The default values for |*|-marked options and for string
options can be set in these files. See the section on RC FILES for more
information. 

You can also set option defaults in an alias. For example:
   alias summarize='summarize --quiet' 

-h,--help	
	prints help information and lets you type |m| to display the
	complete man page or anything else to quit.
-V,--version	
	prints name and (CVS-)version and then quits.
-q,--quiet	
	suppresses messages about the progress summarize is making.
--rc=rc-file	
	Read specified rc-file before processing. The contents of the
	rc-file may override options specified before the |--rc|
	option, therefore it is a good idea to have the habit of specifying
	the |--rc| option first.
-o,--original	
	print the input records as well as the total and/or subtotal
	records.
-c,--change=column	
	print subtotals where the given column changes; if no column number
	is given, column 1 is used
-s,--sumcols=column[,column...]	
	print totals for the columns in a comma-separated list; default:
	all columns.
-r,--running	
	insert a column with the running total, after each explicitly
	defined summarized column. This means that the |--sumcols|
	option is required. Implies |--original|.
-g,--grand	
	print grand total; if subtotals are not printed, this is
	automatically set (otherwise summarize would do nothing except
	perhaps printing the original data)
--tab=separatorstring	
	sets the string (or Perl expression) used to separate fields;
	default: the tab or, in tex mode (see |--tex|), the &. In
	the output, the same string is used as a separator, unless it
	contains one of the regular expression special characters:
	|[\{($^*.?+|
-l,--line	
	Before every summary line a line with |-------| is printed for every
	summarized field, and after the last of those, the total number of
	records that has been added for the sum is printed in parentheses;
	after this line, an empty line is inserted.
-w,--warn	
	suppresses the |-w| flag. Without this option the Perl |-w| flag will
	be enabled, and a warning will be printed for every use of a
	non-numerical value in a calculation. In all circumstances, such
	values will be assumed to be zero.
-t,--tex	
	Run in tex mode. This sets the field separator to |&| and filters the
	fields to be summarized, replacing long hyphens (|--|) with minus
	signs (|-|) (so that negative numbers may be represented with double
	dashes) and removing |$|-signs, and TeX-commands.
	This option comes in handy when editing a (La)TeX tabular. Using vi
	for example, you can select lines in the table, and feed the
	selection through summarize with
	        :%!summarize -tos2,3
	to insert totals of columns 2 and 3. The |-l| option then prints
	|\cline{2-3}| instead of |-------|.
--test	
	with this option, summarize runs a bunch of tests, see the
	section EXAMPLES.

= RC files
Unless the environment variable NORC has been set, three rc-files are
executed, if they exist, before reading the command line options, in the
following order:

/etc/summarizerc	the system rc-file
$HOME/.summarizerc	the user rc-file 
./.summarizerc	the local rc-file

You can use these rc-files to set the default values for the options, 
by setting the Perl variable named after
the long version of the options. For example:

   $quiet=1; # run in quiet mode

= Examples
You can run summarize with the |--test| option and thus let it run a
number of tests that are stored in the DATA section. If you do so, you
should get the following output:

Data file:
   a       12      23      34      1
   a       13      25      35      1
   b       34      23      36      1
   b       22      45      37      2
   c       -22     -13     -2      2
   c       -23     23      13      2

summarize all columns
use |-w| because col 1 contains non-numerical values:

   $ summarize -w
   0       36      126     153     9

same, but show the original values, too:
   $ summarize -wo
   a       12      23      34      1
   a       13      25      35      1
   b       34      23      36      1
   b       22      45      37      2
   c       -22     -13     -2      2
   c       -23     23      13      2
   0       36      126     153     9

print subtotals where column 1 changes:
   $ summarize -c
   a       25      48      69      2
   b       56      68      73      3
   c       -45     10      11      4

same, but show the original values, too:
   $ summarize -oc
   a       12      23      34      1
   a       13      25      35      1
   a       25      48      69      2
   b       34      23      36      1
   b       22      45      37      2
   b       56      68      73      3
   c       -22     -13     -2      2
   c       -23     23      13      2
   c       -45     10      11      4

same, but print |------| lines for clarity:
   $ summarize -ocl
   a       12      23      34      1
   a       13      25      35      1
   ------- ------- ------- ------- -------(2)
   a       25      48      69      2
   
   b       34      23      36      1
   b       22      45      37      2
   ------- ------- ------- ------- -------(2)
   b       56      68      73      3
   
   c       -22     -13     -2      2
   c       -23     23      13      2
   ------- ------- ------- ------- -------(2)
   c       -45     10      11      4

print subtotals where col 5 changes,
plus a grand total; include the original data:
   $ summarize -woc5g
   a       12      23      34      1
   a       13      25      35      1
   b       34      23      36      1
   0       59      71      105     1
   b       22      45      37      2
   c       -22     -13     -2      2
   c       -23     23      13      2
   0       -23     55      48      2
   0       36      126     153

same, but only summarize columns 2 and 4 (-w not needed
anymore, as we don't try to summarize column 1)
   $ summarize -oc5gls2,4
   a       12      23      34      1
   a       13      25      35      1
   b       34      23      36      1
           -------         -------(3)
           59              105     1
   
   b       22      45      37      2
   c       -22     -13     -2      2
   c       -23     23      13      2
           -------         -------(3)
           -23             48      2
   
           -------         -------(6)
           36              153

same, now using the long options and inserting running totals:
   $ summarize --sumcols 2,4 --running --line --grand
   a       12      12      23      34      34      1
   a       13      25      25      35      69      1
   b       34      59      23      36      105     1
   b       22      81      45      37      142     2
   c       -22     59      -13     -2      140     2
   c       -23     36      23      13      153     2
           -------                 -------(6)
           36                      153     

= Author
[Wybo Dekker](wybodekker@me.com)

= Copyright
Released under the [GNU General Public License](www.gnu.org/copyleft/gpl.html)
DOC
;

# Option variables and their built-in defaults.  They are package globals
# (declared with "our") rather than lexicals; the rc-files executed by
# handle_options() assign to them by name, e.g. "$quiet=1;".
our $change        = undef;    # change column; undef means "no subtotals"
our $grand         = 0;
our $line          = 0;
our $original      = 0;
our $quiet         = 1;
our $version       = 0;
our $install       = 0;
our $rc            = undef;
our $running       = 0;
our $sumcols       = undef;
our $tab           = undef;
our $tex           = undef;
our $suppress_warn = 0;

# Allow single-character options to be bundled (-ocl); long option names
# take precedence over a bundle with the same spelling.
Getopt::Long::Configure('bundling_override');

# Print the usage text: the Synopsis section (usage line plus option list)
# of the documentation embedded at the top of this script.
#
# Registered as the Getopt::Long handler for -h/--help.  The text is
# delivered with die(): Getopt::Long catches a die() raised inside an option
# handler, prints its message and makes GetOptions() return false, after
# which handle_options() exits.  The message ends in a newline so that Perl
# does not append "at FILE line N" to it.
sub help {
    open( my $script, '<', $0 ) or die "Usage: cannot read $0: $!\n";
    my $source = do { local $/; <$script> };    # slurp; $/ is restored afterwards
    close $script;

    # Everything between the "= Synopsis" and "= Description" headings.
    # The headings are anchored at line starts so that "== Options" (or any
    # "= " inside running text) cannot cut the section short.
    my ($usage) = $source =~ /^= Synopsis\n(.*?)^= Description\n/ms;
    $usage = "summarize [options] [file...]" unless defined $usage;

    $usage =~ s/^== Options$/Options:/m;    # plain heading instead of markup
    $usage =~ s/[ \t]+$//mg;                # trailing blanks on each line
    $usage =~ s/\s+\z//;                    # trailing blank lines
    die "Usage: $usage\n";
}

# Getopt::Long specification: option name(s) => where to store the value, or
# the handler to call.  A trailing "!" marks a negatable boolean, "=s" a
# required string value and ":i" an optional integer value.
my @option_spec = (
    'rc=s'        => \$rc,               # extra rc-file to execute
    'o|original!' => \$original,         # print the input records too
    'c|change:i'  => \$change,           # subtotal when this column changes
    'l|line!'     => \$line,             # rule before, blank line after a sum
    'g|grand!'    => \$grand,            # print the grand total
    'tab=s'       => \$tab,              # field separator
    't|tex!'      => \$tex,              # TeX mode
    's|sumcols=s' => \$sumcols,          # comma-separated columns to sum
    'r|running!'  => \$running,          # running total after each summed column
    'w|warn!'     => \$suppress_warn,    # switch Perl warnings off
    'h|help'      => \&help,             # usage message
    'q|quiet!'    => \$quiet,            # no report about rc-files read
    'test'        => \&testme,           # run the self test
    'I'           => \$install,          # run "instscript summarize"
    'V|version'   => \$version,          # print version and exit
);

# Execute the rc-files, then parse the command line into the variables above.
handle_options(@option_spec);

# ---------------------------------------------------------------------------
# Act on the parsed options and derive the settings used by the main loop.
# ---------------------------------------------------------------------------
if ($version) { print Version . "\n"; exit 0; }
if ($install) { system("instscript summarize"); exit 0; }
$^W = !$suppress_warn;

# Default separator: a tab, or "&" in TeX mode.  Test for "defined and not
# empty" rather than for truth, so that a separator of "0" is honoured.
$tab = $tex ? '&' : "\t" unless defined $tab && length $tab;

$running &&= 1;                # normalise to 1: print_line() does arithmetic with it
$running and $original = 1;    # we need the originals to insert running totals

# The separator is used as a regular expression when splitting the input.
# If it contains regex special characters it cannot serve as the output
# separator, so fall back to a tab for the output.
my $meta = quotemeta '+?.*^$()[{|\\';
my $outtab = $tab =~ /[$meta]/ ? "\t" : $tab;

$grand = 1 unless defined($change);    # we want at least one sum

# --change given (with or without a value): print subtotals.  The user counts
# columns from 1; internally they are counted from 0.
my $subtotals = 0;
if ( defined($change) ) {
    $change ||= 1;
    die "summarize: --change: column numbers start at 1\n" if $change < 1;
    $change--;
    $subtotals = 1;
}

# @sumcols: zero-based indices of the columns to summarize (empty = all).
# $printold: whether subtotal lines show the value of the change column;
# switched off when the change column is itself listed in --sumcols.
my ( $printold, @sumcols ) = (1);
if ( defined $sumcols && length $sumcols ) {
    for my $col ( split /,/, $sumcols ) {

        # Reject anything that is not a positive integer; such values would
        # otherwise silently turn into a wrong or negative column index.
        $col =~ /^\s*([1-9]\d*)\s*$/
          or die "summarize: --sumcols: '$col' is not a column number (columns count from 1)\n";
        my $index = $1 - 1;    # users start counting at column 1
        push @sumcols, $index;
        $printold = 0 if $subtotals && $index == $change;
    }
}
my $old = "";    # previous contents of changing column
my ( $subcount, $grandcount, @grand, @sub ) = ( 0, 0 );
my $maxcols = 0;    # if all columns are to be summed, we need to keep track
                    # of the maximum no of columns

# ---------------------------------------------------------------------------
# Main loop: read every record, accumulate grand totals and subtotals, and
# print subtotal lines whenever the contents of the change column change.
# ---------------------------------------------------------------------------
my %is_sumcol = map { $_ => 1 } @sumcols;    # fast "is this column summed?" test

while ( my $record = <> ) {
    chomp $record;
    my @f = split /$tab/, $record;
    $maxcols = @f if @f > $maxcols;

    $grandcount++;
    $subcount++;

    for my $c ( 0 .. $#f ) {    # for each column in the input
        # exclude the change column from summarizing
        next if defined $change && $c == $change;

        # if only certain columns are to be summarized, skip the others
        next if @sumcols && !$is_sumcol{$c};

        $grand[$c] ||= 0;       # initialize if necessary
        my $value = $f[$c];     # contents of the column
        if ($tex) {
            $value =~ s/\\[a-z]+//gi;     # remove whole commands: \bf, \sl and the like
            $value =~ s/\\,//g;           # remove spacers
            $value =~ s/--/-/g;           # change long dashes to minus sign
            $value =~ s/\$//g;            # remove $'s
            $value =~ s/\\(\\|tabularnewline)//;    # remove newline command
        }

        # remove leading and trailing whitespace (both ends, anchored)
        $value =~ s/^\s+//;
        $value =~ s/\s+$//;
        $value ||= 0;           # an empty value is counted as 0
        $grand[$c] += $value;

        if ($subtotals) {

            # A record too short to contain the change column counts as
            # having an empty value there.
            my $key = defined $f[$change] ? $f[$change] : '';
            if ( $key ne $old ) {
                $sub[$change] = $old if $printold;

                # The very first record starts the first group; there is no
                # previous group to print yet.
                print_line( $subcount - 1, '', @sub ) if $grandcount > 1;
                @sub      = ();
                $old      = $key;
                $subcount = 1;
            }
            $sub[$c] = 0 unless $sub[$c];
            $sub[$c] += $value;
        }
    }
    print_line( 0, \@grand, @f ) if ($original);
}

# Subtotal of the last group.  Printed whenever at least one record was
# read, so that input consisting of a single record gets its subtotal too.
if ($subtotals) {
    $sub[$change] = $old if $printold;
    print_line( $subcount, '', @sub ) if $grandcount > 0;
}
$grand and print_line( $grandcount, '', @grand );

# Print one output record.
#
#   print_line( $count, $arr, @fields )
#
#   $count   number of records that went into a (sub)total; shown next to the
#            rule when --line is active
#   $arr     reference to the array of running grand totals when an original
#            input record is printed; any non-reference value (the callers
#            pass '') marks @fields as a subtotal or total line
#   @fields  the column values to print; undefined entries print as empty
#
# With --line, a (sub)total line is preceded by a rule (dashes, or \cline
# commands in TeX mode) and followed by an empty line.  With --running, an
# extra cell is inserted after every summarized column: the running total on
# data records, an empty cell on (sub)total lines.
#
# Reads the file-level settings $line, $tex, $running, $outtab, @sumcols and
# $maxcols.  Column positions assume @sumcols is in ascending order.
sub print_line {
    my $count = shift;
    my $arr   = shift;
    my $is_data_row = ref($arr) eq 'ARRAY';
    my $lines       = $line && !$is_data_row;

    # we don't know the max no. of fields in advance:
    my @cols = @sumcols;
    @cols = ( 0 .. $maxcols - 1 ) unless @cols;

    if ($lines) {
        if ($tex) {

            # \cline counts columns from 1 and must skip the running-total
            # cells inserted before the column.  @cols itself is left
            # untouched because it is needed again below.
            my $inserted = 0;
            for my $c (@cols) {
                my $pos = $c + 1 + $inserted;
                print "\\cline{$pos-$pos}";
                $inserted += $running;
            }
            print "\%$count\n";
        } else {
            my $off = 0;
            for my $c (@cols) {
                print $outtab x ( $c - $off ), "-------";
                $off = $c - $running;
            }
            print "($count)\n";
        }
    }

    my @ar = map { defined $_ ? $_ : '' } @_;
    if ($running) {
        my $shift = 1;    # one more cell has been inserted for every earlier column
        for my $c (@cols) {
            my $pos  = $c + $shift++;
            my $cell = $is_data_row && defined $$arr[$c] ? $$arr[$c] : '';

            # Pad short records so that the cell lands in its own column
            # instead of being appended directly after the last field.
            push @ar, ('') x ( $pos - @ar ) if $pos > @ar;
            splice( @ar, $pos, 0, $cell );
        }
    }
    print join ( $outtab, @ar ), "\n";
    print "\n" if $lines;
}


# Execute the rc-files and parse the command line.
#
#   handle_options( spec => target, ... )
#
# The arguments are a Getopt::Long option specification and are passed to
# GetOptions() unchanged.
#
# Unless the environment variable NORC is defined, the system, user and
# local rc-files are executed first (each only if it exists and is not
# empty), so that the command line can override what they set.  A file given
# with --rc is executed after the command line has been parsed.  The process
# exits if GetOptions() reports an error, and dies if the --rc file is
# missing or empty.  Unless $quiet is set, the rc-files that were read are
# reported on standard error.
sub handle_options {
    require File::Spec;    # for rel2abs(); normally loaded already

    ( my $Myname = $0 ) =~ s/.*\///;    # program name

    my @candidates = ( catfile( rootdir, 'etc', "${Myname}rc" ) );

    # Without HOME there is no user rc-file; do not look for one in "/".
    push @candidates, catfile( $ENV{HOME}, ".${Myname}rc" )
      if defined $ENV{HOME};
    push @candidates, ".${Myname}rc";

    # here we'll remember which rc-files were read, so we can report
    # about them when $quiet appears to be false
    my @rcfiles = ();

    # Execute one rc-file.  "do FILE" searches @INC for a relative path that
    # does not start with ./ or ../, and @INC no longer contains "." since
    # Perl 5.26; an absolute path makes sure the named file itself is run.
    my $run_rcfile = sub {
        my ($file) = @_;
        push @rcfiles, $file;
        my $path   = File::Spec->rel2abs($file);
        my $result = do $path;
        warn "Error in rc-file $file: $@" if !defined $result && $@;
    };

    unless ( defined $ENV{NORC} ) {
        for my $file (@candidates) {
            $run_rcfile->($file) if -s $file;
        }
    }

    # A false result also covers --help: its handler die()s, which
    # GetOptions() reports and counts as an error.
    GetOptions(@_) or exit;

    if ($rc) {
        -s $rc or die "Could not find rcfile $rc\n";
        $run_rcfile->($rc);
    }

    if ( @rcfiles && !$quiet ) {
        warn "The following rc-files were read: @rcfiles\n";
    }
}

# Self test, run by the --test option.
#
# The DATA section starts with the test records, terminated by an empty
# line.  They are copied to a temporary file and echoed to standard output.
# Every following line is either a comment ("# text", echoed without the
# "# ") or a command line, which is run through the shell with the name of
# the temporary file appended; its output is printed indented by a tab.
# The commands start with "summarize", so the program must be found in PATH.
# Exits with status 0 when done.
sub testme {
    require File::Temp;

    # A uniquely named, exclusively created file instead of a predictable
    # name in the shared temporary directory.
    my ( $test_fh, $tmp ) =
      File::Temp::tempfile( 'summarizeXXXXXX', TMPDIR => 1, UNLINK => 1 );

    print "Data file:\n\n";
    while ( my $row = <DATA> ) {
        last if $row =~ /^$/;
        print {$test_fh} $row;
        print "\t$row";
    }
    close $test_fh or die "Cannot write test data to $tmp: $!\n";

    while ( my $entry = <DATA> ) {
        if ( $entry =~ /^# ?(.*)/ ) {
            print "$1\n";
            next;
        }
        chomp $entry;
        next unless $entry =~ /\S/;    # a blank line would run $tmp as a command
        print "\nB<$entry>\n\n";
        ( my $out = `$entry $tmp` ) =~ s/^/\t/mg;
        print $out;
    }
    unlink $tmp;
    exit 0;
}
__DATA__
a	12	23	34	1
a	13	25	35	1
b	34	23	36	1
b	22	45	37	2
c	-22	-13	-2	2
c	-23	23	13	2

#
# summarize all columns
# use -w because col 1 contains non-numerical values:
summarize -w
#
# same, but show the original values, too:
summarize -wo
#
# print subtotals where column 1 changes:
summarize -c
#
# same, but show the original values, too:
summarize -oc
#
# same, but print ------ lines for clarity:
summarize -ocl
#
# print subtotals where col 5 changes,
# plus a grand total; include the original data:
summarize -woc5g
#
# same, but only summarize columns 2 and 4 (-w not needed
# anymore, as we don't try to summarize column 1)
#
summarize -oc5gls2,4
#
# same, now using the long options and 
# inserting running totals:
summarize --sumcols 2,4 --running --line --grand
