#!/usr/bin/perl -w
use strict;

#
#_* WARNING: This is alpha quality code.
#
# See POD docs below.
#

# $Id: logstatsd 47 2006-04-11 23:34:19Z wu $
# The version is derived from the version-control Revision keyword below: the
# digits captured from that string are formatted as a zero-padded, three-digit
# minor number (revision 47 yields "0.047").
our $VERSION = sprintf "0.%03d", q$Revision: 47 $ =~ /(\d+)/g;

#
#_* Libraries
#

use Data::Dumper;
use Benchmark;
use File::Tail;
use Config::IniFiles;
use Log::Log4perl qw(:easy);

use Log::Statistics;

#
#_* Command-line options processing
#
BEGIN {

    # A BEGIN block runs at compile time, so the option variables below are
    # already populated when the main program body starts executing.
    use Getopt::Long qw[ :config gnu_getopt ];
    use Pod::Usage;

    # Declared as package globals (not lexicals) because they are read by
    # code outside this BEGIN block.
    use vars qw(
        %opt
        $opt_help $opt_debug $opt_verbose
        @opt_logfiles @opt_servers
        @opt_fields @opt_groups @opt_rrdupdate
        $opt_configfile $opt_section
    );

    # Scalar settings land in %opt, where get_config() looks first; list
    # valued options are collected into their own arrays.
    my $parsed_ok = GetOptions(
        '-l|logfile:s'     => \@opt_logfiles,
        '-c|conf:s'        => \$opt_configfile,
        '-s|section:s'     => \$opt_section,
        '-d|daemon'        => \$opt{'daemon'},
        '-dump'            => \$opt{'dump'},
        '-r|report'        => \$opt{'report'},
        '-a|all'           => \$opt{'report'},
        '-rrd:s'           => \$opt{'rrd'},
        '-rrdupdate:s'     => \@opt_rrdupdate,
        '-read:s'          => \$opt{'read'},
        '-servers:s'       => \@opt_servers,
        '-t|time-regexp:s' => \$opt{'time_regexp'},
        '-line-regexp:s'   => \$opt{'line_regexp'},
        '-f|field:s'       => \@opt_fields,
        '-group:s'         => \@opt_groups,
        '-xml:s'           => \$opt{'xml'},
        '-m|maxlines:i'    => \$opt{'maxlines'},
        '--sleep:i'        => \$opt{'sleep'},
        '--logtail:s'      => \$opt{'logtail'},
        '-u|update:i'      => \$opt{'update'},
        '-debug:i'         => \$opt_debug,
        '-ssh:s'           => \$opt{'ssh'},
        '-ssh-command:s'   => \$opt{'ssh_command'},
        '-ssh-prefilter:s' => \$opt{'ssh_pre_filter'},
        '-v|verbose!'      => \$opt_verbose,
        '-version'         => \$opt{'version'},
        '-help|?'          => \$opt_help,
    );

    # unparseable command line: short usage message, non-zero exit
    pod2usage( -exitval => 1, -verbose => 0 ) unless $parsed_ok;

    # --help prints the options summary; --help -v prints the whole manual
    if ( $opt_help ) {
        pod2usage( -exitval => 0, -verbose => ( $opt_verbose ? 2 : 1 ) );
    }

    # a -debug given without a level arrives as 0; treat it as level 1
    if ( defined $opt_debug && $opt_debug == 0 ) {
        $opt_debug = 1;
    }

}

# --version: report the version and stop before doing any other work
if ( $opt{'version'} ) {
    print "$0: VERSION: $VERSION\n\n";
    exit;
}

# a processing mode must be requested explicitly on the command line
unless ( $opt{'report'} || $opt{'daemon'} ) {
    warn "\nERROR: neither --daemon nor --report specified\n";
    warn "\tUse --report to process the entire log report\n";
    warn "\tUse --daemon to process new log entries as they enter the file\n";
    warn "\tUse --help to see full usage documentation\n\n";
    die;
}

#
#_* Logging
#
my $log_level = $opt_debug ? $DEBUG : $opt_verbose ? $INFO : $ERROR;
Log::Log4perl->easy_init( $log_level );
my $logger = get_logger( 'default' );

# Catch die for any reason
$SIG{__DIE__} = sub {
  # $^S is true while an eval is executing.  Exceptions raised there are
  # going to be caught by the surrounding code, so they must not be reported
  # as fatal errors (and must not bump the caller depth).
  return if $^S;

  $Log::Log4perl::caller_depth++;
  my $logger = get_logger( 'default' );
  $logger->fatal(@_);
};

$logger->info( "Logging Started" );



#
#_* config
#

my %ini;
if ( $opt_configfile ) {
    # a failed tie leaves %ini empty, which would otherwise surface below as
    # a misleading "section not found" error
    tie %ini, 'Config::IniFiles', ( -file => $opt_configfile )
        or die "Error: unable to load config ($opt_configfile): "
             . join( "; ", @Config::IniFiles::errors );
}

if ( $opt_configfile && $opt_section && ! $ini{$opt_section} ) {
    die "Error: section specified ($opt_section) not found in config ($opt_configfile)";
}

if ( $opt_configfile && ! $opt_section && ! $ini{'default'} ) {
    die "Error: section not specified, and no 'default' section found in config ($opt_configfile)";
}

# set up ssh command if specified.  This has to happen after the config file
# is loaded, otherwise an ssh_command defined there is never seen.
my $ssh = get_config('ssh_command') || "ssh";

# defaults
my $maxlines = get_config( "maxlines" );


#
#_* main
#

# unbuffered output
$| = 1;

# cache expensive date parsing
my $date_cache;

# rrd updates are a bit different in daemon mode
my $daemon_mode;

# the statistics engine; every parsed line is fed into this object
my $log = Log::Statistics->new();

if ( my $line_regexp = get_config( "line_regexp" ) ) {
    $log->add_line_regexp( $line_regexp );
}

# name => column mappings declared in the selected config section
if ( $opt_section && $ini{$opt_section}{'field_list'} ) {
    for my $entry ( @{ $ini{$opt_section}{'field_list'} } ) {
        my ( $name, $column ) = split /:/, $entry;
        $log->register_field( $name, $column );
    }
}

# fields to summarize, as name:column:thresholds
for my $field_def ( get_fields() ) {
    $logger->info( "got field def: $field_def" );
    my ( $name, $column, $thresholds ) = split /:/, $field_def;
    $log->add_field( $column, $name, $thresholds );
}

# groups to summarize, as name1:name2[...]|threshold1|threshold2...
for my $group_def ( get_groups() ) {
    $logger->info( "got group def: $group_def" );
    my ( $name_list, @thresholds ) = split /\|/, $group_def;
    my ( @names ) = split /:/, $name_list;
    $log->add_group( [ @names ], join "|", @thresholds );
}

if ( my $time_regexp = get_config( "time_regexp" ) ) {
    $logger->info( "Adding time regexp: " . $time_regexp );
    $log->add_time_regexp( $time_regexp );
}

# accumulator threaded through parse_line() by process_full_log()
my $data;

# parse the entire file
if ( get_config( "report" ) ) {
    $logger->info( "generating report" );
    process_full_log( );
    $logger->debug( "generating output" );
    dump_data();
}

# go into tail mode to process live incoming log file data
if ( get_config( "daemon" ) ) {
    $logger->info( "daemonizing" );
    daemonize( get_logfiles() );
}

#
#_* Subroutines
#

#
#__* Process Full Log
#

# Read every configured logfile from start to finish and feed each line to
# the statistics engine.
#
# Logfiles of the form "server:path" are read over ssh; everything else is
# read locally.  Files ending in .gz are read with gzcat, others with cat,
# unless an ssh_pre_filter command is configured for remote files.
#
# When maxlines is configured it is applied as a limit on the total number of
# lines parsed during this call: once it is reached, reading stops and any
# remaining logfiles are skipped.
#
# Takes no arguments.  Dies (via logconfess) when no logfile is configured.
sub process_full_log {
    my @logfiles = get_logfiles();

    unless ( @logfiles ) {
        $logger->logconfess( "Error: no logfiles specified?" );
    }

    # undef means "no limit"; the file-level $maxlines itself is left alone
    my $lines_left = ( $maxlines && $maxlines > 0 ) ? $maxlines : undef;

  LOGFILE:
    for my $logfile ( @logfiles ) {
        $logger->info( "processing logfile: $logfile" );

        my $command;
        if ( $logfile =~ m|^(.*?)\:(.*)$| ) {
            my ( $server, $remote_path ) = ( $1, $2 );

            if ( my $pre_filter = get_config( 'ssh_pre_filter' ) ) {
                # the literal text $logfile in the filter is a placeholder
                # for the remote path
                ( my $remote_command = $pre_filter ) =~ s|\$logfile|$remote_path|g;
                $logger->info( "ssh_pre_filter: $remote_command" );
                $command = "$ssh $server $remote_command";
            }
            elsif ( $remote_path =~ m|\.gz$| ) {
                $command = "$ssh $server gzcat $remote_path";
            }
            else {
                $command = "$ssh $server cat $remote_path";
            }
        }
        elsif ( $logfile =~ m|\.gz$| ) {
            $command = "gzcat $logfile";
        }
        else {
            $command = "cat $logfile";
        }
        $logger->info( $command );

        # stderr is merged into the stream, as before
        open my $fh, '-|', "$command 2>&1"
            or die "Unable to execute $command: $!";

        my $start         = Benchmark->new;
        my $limit_reached = 0;

      LINE:
        while ( my $line = <$fh> ) {
            chomp $line;
            $data = $log->parse_line( $line, $data );

            next LINE unless defined $lines_left;
            if ( --$lines_left <= 0 ) {
                $limit_reached = 1;
                last LINE;
            }
        }
        my $end = Benchmark->new;

        close $fh;

        # check exit status.  A command cut short by the line limit usually
        # exits abnormally (its output pipe was closed), so that case is not
        # reported as an error.
        if ( $? != 0 && ! $limit_reached ) {
            my $status = $? >> 8;
            my $signal = $? & 127;
            $logger->warn( "Error returned from command:\n\tcommand=$command\n\tstatus=$status\n\tsignal=$signal" );
        }

        my $diff = timediff($end, $start);
        my $text = "Time taken was " . timestr($diff, 'all') . " seconds";
        $logger->info( $text );

        last LOGFILE if $limit_reached;
    }
}

#
#__* Daemonize
#

# Enter daemon mode: follow one logfile and parse entries as they arrive.
#
# Arguments: the logfile to follow.  Daemon mode follows a single file, so
# any additional logfiles passed in are reported and ignored.
#
# Sets $daemon_mode (which changes how export_rrd() behaves) and installs a
# USR1 handler that exports the current data.  The tail implementation is
# chosen from the "ssh" and "logtail" settings.  Dies (via logconfess) when
# no logfile is given.
sub daemonize {
    my ( $logfile, @ignored ) = @_;

    unless ( defined $logfile && length $logfile ) {
        $logger->logconfess( "Error: no logfiles specified?" );
    }

    if ( @ignored ) {
        $logger->warn( "daemon mode follows a single logfile ($logfile); ignoring: @ignored" );
    }

    $logger->info( "Starting in daemon mode: $$" );
    $daemon_mode = 1;

    # set up signal handler to dump data on kill -USR1
    $SIG{USR1} = \&dump_data;

    # set up an alarm to update the log file regularly
    #if ( get_config( "update" ) ) {
    #    my $timeout = 60;
    #    local $SIG{ALRM} = sub { dump_data() };
    #    alarm $timeout;
    #}

    my $server = get_config( "ssh" );

    if ( ! $server ) {
        # local file
        tail_file_tail( $logfile );
    }
    elsif ( my $logtail = get_config( "logtail" ) ) {
        # remote file, read incrementally with logtail
        tail_ssh_logtail( $logfile, $server, $logtail );
    }
    else {
        # remote file, plain "tail -f" over ssh
        tail_ssh( $logfile, $server );
    }

    #alarm 0;

}

#
#__* tail implementations
#

# Follow a local logfile with File::Tail and hand every line it yields to
# the statistics engine.  Runs until File::Tail's read returns undef.
#
# Arguments: path of the logfile to follow.
sub tail_file_tail {
    my ( $logfile ) = @_;

    # maxinterval is passed straight through to File::Tail
    my $tail = File::Tail->new( name => $logfile, maxinterval => 30 );

    while ( 1 ) {
        my $entry = $tail->read;
        last unless defined $entry;

        $logger->debug( "TAIL: $entry" );
        $log->parse_line( $entry );
    }

    return;
}

# Follow a remote logfile by running the logtail program in a loop on the
# remote server over ssh, feeding every line received to the statistics
# engine.  When the remote loop ends cleanly the connection is re-opened
# after a pause; a non-zero exit status is fatal.
#
# Arguments: remote logfile path, server name, path to logtail on the server.
#
# The pause between polls comes from the "sleep" setting.  When it is not
# set, a 1 second interval is used so that neither the remote loop (which
# would run a bare "sleep") nor the local reconnect loop spins.
sub tail_ssh_logtail {
    my ( $logfile, $server, $logtail ) = @_;

    my $sleep = get_config( "sleep" );
    unless ( $sleep ) {
        $sleep = 1;
        $logger->info( "no sleep interval configured, using $sleep second(s)" );
    }

    my $logtail_offset = get_config( "logtail_offset" ) || "";
    my $command = "$ssh $server 'while $logtail $logfile $logtail_offset; do sleep $sleep; done'";

    while ( 1 ) {
        $logger->info( "Opening $logfile on $server:");
        $logger->info( $command );

        # stderr is merged into the stream, as before
        open my $run, '-|', "$command 2>&1"
            or die "Unable to execute $command: $!";
        while ( my $line = <$run> ) {
            $log->parse_line( $line );
        }
        close $run;

        # check exit status
        if ( $? != 0 ) {
            my $status = $? >> 8;
            my $signal = $? & 127;
            die "Error running command:$command\n\tstatus=$status\n\tsignal=$signal";
        }

        sleep $sleep;
    }
}

# not recommended at this time
# Follow a remote logfile with "tail -n 0 -f" over ssh, feeding every line
# received to the statistics engine.  Returns when the remote command ends
# cleanly and dies when it exits with a non-zero status.
#
# Arguments: remote logfile path, server name.
sub tail_ssh {
    my ( $logfile, $server ) = @_;

    my $command = "$ssh $server tail -n 0 -f $logfile";

    $logger->info( "Opening $logfile on $server:");
    $logger->info( $command );

    # lexical handle: nothing is shared with the other tail implementations;
    # stderr is merged into the stream, as before
    open my $run, '-|', "$command 2>&1"
        or die "Unable to execute $command: $!";
    while ( my $line = <$run> ) {
        $log->parse_line( $line );
    }
    close $run;

    # check exit status
    if ( $? != 0 ) {
        my $status = $? >> 8;
        my $signal = $? & 127;

        die "Error running command:$command\n\tstatus=$status\n\tsignal=$signal";
    }
}

#
#__* Data Export
#

# Export the collected statistics in every configured format.  Also installed
# as the USR1 signal handler in daemon mode.
#
# RRD output is written first when "rrd" is set.  When "dump" is set the raw
# data structure is printed and nothing further is exported; otherwise the
# XML report is written when "xml" is set.
sub dump_data {
    export_rrd( ) if get_config( "rrd" );

    if ( get_config( "dump" ) ) {
        print Dumper( $log->{'data'} );
        return;
    }

    export_xml( ) if get_config( "xml" );

    return;
}

#
#___* XML
#

# Write the XML report produced by the statistics engine.
#
# The destination is the "xml" setting: "-" prints to stdout, anything else
# is a file path.  Files are written to "<path>.bak" first and then moved
# into place, so a reader never sees a partially written report.
#
# Dies when the temporary file cannot be opened, written or closed, or when
# moving it into place fails.
sub export_xml {
    #my $xml = get_xml_from_data( $data );
    my $xml = $log->get_xml();

    my $xml_file = get_config( "xml" );

    if ( $xml_file eq "-" ) {
        print $xml;
        return;
    }

    my $tmp_file = "$xml_file.bak";

    open my $out, '>', $tmp_file or die "Unable to open $tmp_file: $!";
    print {$out} $xml            or die "Unable to write $tmp_file: $!";
    # buffered write errors only surface at close
    close $out                   or die "Unable to close $tmp_file: $!";

    # make writing xml into an atomic operation
    system( "mv", $tmp_file, $xml_file ) == 0
        or die "Unable to move $tmp_file to $xml_file (wait status $?)";

    $logger->info( "Wrote data to $xml_file" );
}


#
#___* RRD
#

# Build an "rrdtool update" command line for the current moment.
#
# Arguments:
#   $data    - hash ref of counters (count, duration, th_0 .. th_3);
#              missing counters are written as 0
#   $rrdfile - rrd file name without the ".rrd" extension
#
# Returns the command as a string: the current time followed by the six
# counters, colon separated.
sub get_current_rrd {
    my ( $data, $rrdfile ) = @_;

    my @counters = (
        $data->{'count'}    || 0,
        $data->{'duration'} || 0,
        $data->{'th_0'}     || 0,
        $data->{'th_1'}     || 0,
        $data->{'th_2'}     || 0,
        $data->{'th_3'}     || 0,
    );
    my $info = join( ":", time(), @counters );

    my $rrd_cmd = "rrdtool update $rrdfile.rrd $info";
    $logger->info( "current_rrd built: $rrd_cmd" );

    return $rrd_cmd;
}

# Emit a shell command that creates an rrd file if it is not already there.
#
# Arguments:
#   $filename - rrd file to create
#   $start    - value for rrdtool's --start option
#
# Nothing is emitted unless the "rrd_create" setting (a list of rrdtool
# create arguments) is present.  The "rrd_step" setting is required as well;
# without it a warning is issued and nothing is emitted.  The command goes
# to stdout when the "rrd" setting is "-", otherwise to the OUT handle, which
# the caller is expected to have opened.
sub rrd_create {
    my ( $filename, $start ) = @_;

    my $definitions = get_config( "rrd_create" );
    return unless $definitions;

    my $step = get_config( 'rrd_step' );
    if ( ! $step ) {
        warn "Error: rrd_step not defined in config file - cannot create rrd";
        return;
    }

    # one rrdtool argument per continuation line
    my $rrd_create_command =
          "[ ! -r $filename ] && rrdtool create $filename --start $start --step $step \\\n\t"
        . join( " \\\n\t", @{ $definitions } )
        . "\n\n";

    my $out = get_config( 'rrd' );
    if ( $out ne "-" ) {
        print OUT $rrd_create_command;
    }
    else {
        print $rrd_create_command;
    }
}

# Write a script of rrdtool commands for the collected statistics.
#
# The destination is the "rrd" setting: "-" prints to stdout, anything else
# is a file path.  The file is opened on the OUT handle, which rrd_create()
# also prints to.
#
# In daemon mode one "rrdtool update" line, stamped with the current time, is
# appended for every rrdupdate definition.  Definitions whose first key is
# not "total", "fields" or "groups" are reported and skipped.
#
# Otherwise the file is re-created from the historical data: for every data
# set that is grouped by time, an optional create command is emitted followed
# by one cumulative "rrdtool update" line per time slot, in chronological
# order.  Time slots whose time string cannot be converted are dropped, and a
# data set without any usable time slot is skipped.
sub export_rrd {
    my $data = $log->{'data'};

    $logger->info( "exporting RRD data" );

    my $filename  = get_config( 'rrd' );
    my $to_stdout = ( $filename eq "-" );

    if ( $daemon_mode ) {
        unless ( $to_stdout ) {
            $logger->info( "appending rrd data to $filename" );
            open ( OUT, ">>$filename" ) or die "Unable to open $filename: $!";
        }

      UPDATE:
        for my $update ( get_rrdupdate() ) {
            my @keys = split /\|/, $update;
            my $kind = defined $keys[0] ? $keys[0] : "";
            my $rrd_cmd;

            if ( $kind eq "total" ) {
                $rrd_cmd = get_current_rrd( $data->{'total'}, 'total' );
            }
            elsif ( $kind eq "fields" ) {
                $rrd_cmd = get_current_rrd( $data->{$keys[0]}->{$keys[1]}->{$keys[2]}, join( "_", @keys ) );
            }
            elsif ( $kind eq "groups" ) {
                $rrd_cmd = get_current_rrd( $data->{$keys[0]}->{$keys[1]}->{$keys[2]}->{$keys[3]}, join( "_", @keys ) );
            }
            else {
                # nothing to build a command from
                $logger->warn( "ignoring unrecognized rrdupdate definition: $update" );
                next UPDATE;
            }

            if ( $to_stdout ) {
                print "$rrd_cmd\n";
            }
            else {
                print OUT "$rrd_cmd\n";
            }
        }

    }
    else {
        unless ( $to_stdout ) {
            $logger->info( "re-creating rrd data file: $filename" );
            open ( OUT, ">$filename" ) or die "Unable to open $filename: $!";
        }

        # you can only build rrds from historical data from fields
        # that are grouped by time (obviously).

        # find all the fields that have been grouped by time.
        my $pointers = get_data_time_pointers( $data );

      NAME:
        for my $name ( keys %{ $pointers } ) {
            my $pointer = $pointers->{ $name };

            # Samples of this data set only, keyed by unix time.  This has to
            # start empty for every name, otherwise time slots of one data
            # set end up in the output of the next.
            my %rrd_data;
            for my $time ( keys %{ $pointer } ) {
                ( my $time_string = $time ) =~ s|\_| |g;
                my $utime = $log->get_utime_from_string( $time_string );

                # no usable time for this slot
                next unless $utime;

                $rrd_data{ $utime } = $pointer->{ $time };
            }

            # chronological order; the keys are numbers, so the sort has to
            # be numeric
            my @times = sort { $a <=> $b } keys %rrd_data;
            unless ( @times ) {
                $logger->warn( "no valid times found for $name - skipping" );
                next NAME;
            }

            # subtract one second from the first time (db must start
            # at least one second before first entry)
            my $rrd_start = $times[0] - 1;
            rrd_create( "$name.rrd", $rrd_start );

            # the rrd values are running totals
            my ( $count, $duration, $th_0, $th_1, $th_2, $th_3 ) =
                ( 0, 0, 0, 0, 0, 0 );
            for my $time ( @times ) {
                my $slot = $rrd_data{ $time };

                $count    += $slot->{'count'}    || 0;
                $duration += $slot->{'duration'} || 0;
                $th_0     += $slot->{'th_0'}     || 0;
                $th_1     += $slot->{'th_1'}     || 0;
                $th_2     += $slot->{'th_2'}     || 0;
                $th_3     += $slot->{'th_3'}     || 0;

                # rrd values are written as integers
                my @inserts = map { sprintf("%01d", $_) }
                    ( $count, $duration, $th_0, $th_1, $th_2, $th_3 );
                my $insert  = join ":", $time, @inserts;
                my $rrd_cmd = "rrdtool update $name.rrd $insert";

                if ( $to_stdout ) {
                    print "$rrd_cmd\n";
                }
                else {
                    print OUT "$rrd_cmd\n";
                }
            }
        }

    }

    unless ( $to_stdout ) {
        # buffered write errors only surface at close
        close OUT or $logger->warn( "Unable to close $filename: $!" );
    }
}

# Find every data set that is grouped by time and return a hash ref mapping
# an rrd name to the hash of per-time entries, so that RRD output can be
# built from them.
#
# The "time" field, when present, is returned under the name "total".  Group
# names are split on "-"; only groups whose last layer is "time" are
# considered, and only those with two or three layers produce entries.
#
# Arguments: the statistics data structure (hash ref).
sub get_data_time_pointers {
    my ( $data ) = @_;

    my %pointers;

    my $total_by_time = $data->{'fields'}->{'time'};
    $pointers{'total'} = $total_by_time if $total_by_time;

    for my $group ( keys %{ $data->{groups} } ) {
        my @layers = split /-/, $group;
        next if $layers[-1] ne "time";

        my $depth = scalar @layers;

        for my $outer ( keys %{ $data->{groups}->{ $group } } ) {
            for my $inner ( keys %{ $data->{groups}->{ $group }->{$outer} } ) {
                if ( $depth == 2 ) {
                    # value-by-time: the entries under $outer are the times
                    $pointers{ get_rrd_name( $group, $outer ) }
                        = $data->{'groups'}->{ $group }->{$outer};
                }
                elsif ( $depth == 3 ) {
                    # value-by-value-by-time: the times are one level down
                    $pointers{ get_rrd_name( $group, $outer, $inner ) }
                        = $data->{groups}->{ $group }->{$outer}->{$inner};
                }
            }
        }
    }

    return \%pointers;

}

# Build an rrd name from a list of name parts.
#
# Every character other than A-Z, a-z, 0-9 and "-" is removed from each
# part, and the cleaned parts are joined with "_".
#
# Arguments: one or more name parts.
# Returns the joined name.
# Dies when a part is undefined or empty.  A part of "0" is a valid value
# (for example a numeric status) and is accepted.
sub get_rrd_name {
    my ( @names ) = @_;

    my @parts;
    for my $name ( @names ) {
        die "Error: undefined or empty name passed to get_rrd_name"
            unless defined $name && length $name;

        ( my $clean = $name ) =~ tr/A-Za-z0-9\-//cd;
        push @parts, $clean;
    }

    return join( "_", @parts );
}

#
#__* Reading config params
#

# Look up a single configuration value.
#
# The first true value wins, searching in this order: the command line
# (%opt), the selected section of the config file, the "default" section of
# the config file.
#
# Arguments: name of the parameter.
# Returns the value, or nothing when no source has a true value for it.
# Dies when called without a parameter name.
sub get_config {
    my ( $param ) = @_;

    if ( ! $param ) {
        $logger->logconfess( "no param specified" );
        die "Error: get_config called but no param specified";
    }

    # command line
    return $opt{$param} if $opt{$param};

    # selected config section
    return $ini{$opt_section}{$param}
        if $opt_section && $ini{$opt_section}{$param};

    # config defaults
    return $ini{'default'}{$param} if $ini{'default'}{$param};

    return;
}

# Return the list of logfiles to process.
#
# Logfiles given on the command line take precedence over the "logfiles"
# entry of the selected config section.  When servers are configured, every
# logfile is returned once per server in the form "server:logfile".
#
# Returns nothing when no logfile is configured.
sub get_logfiles {
    my @logfiles;

    if ( @opt_logfiles ) {
        @logfiles = @opt_logfiles;
    }
    elsif ( $opt_section && $ini{$opt_section}{'logfiles'} ) {
        $logger->info( "got logfiles from ini file section: $opt_section" );
        @logfiles = @{ $ini{$opt_section}{'logfiles'} };
    }

    return unless @logfiles;

    # looked up once, so the lookup is not logged twice
    my @servers = get_servers();
    return @logfiles unless @servers;

    my @server_logfiles;
    for my $server ( @servers ) {
        push @server_logfiles, map { "$server:$_" } @logfiles;
    }

    return @server_logfiles;
}

# Return the list of servers the logfiles live on.
#
# Servers given on the command line take precedence over the "servers" entry
# of the selected config section.  Each command line value may itself hold
# several names separated by colons, commas or whitespace.
#
# Returns nothing when no server is configured.
sub get_servers {
    my @servers;

    if ( @opt_servers ) {
        for my $serverlist ( @opt_servers ) {
            push @servers, split /[\:\,\s]+/, $serverlist;
        }
    }
    elsif ( $opt_section && $ini{$opt_section}{'servers'} ) {
        $logger->info( "got servers from ini file section: $opt_section" );
        @servers = @{ $ini{$opt_section}{'servers'} };
    }

    return unless @servers;

    return @servers;
}

# todo - genericize get_fields and get_groups to use get_config
# Return the field definitions to summarize, each as a
# "name:column:threshold" string.
#
# Definitions come from the first source that has any: the command line, the
# selected config section, the "default" config section.  When the selected
# section also has a field_list (name:column entries), a definition given
# without a column gets its column from that list and the definitions are
# returned in normalized form; otherwise they are returned as given.
#
# Returns nothing when no field is defined.  Dies when the field_list is in
# use and a definition has no column and none is listed for its name.
sub get_fields {
    my @fields;

    if ( @opt_fields ) {
        $logger->info( "got fields from commmand line" );
        @fields = @opt_fields;
    }
    elsif ( $opt_section && $ini{$opt_section}{'fields'} ) {
        $logger->info( "got fields from ini file section: $opt_section" );
        @fields = @{ $ini{$opt_section}{'fields'} };
    }
    elsif ( $ini{'default'}{'fields'} ) {
        $logger->info( "got fields from ini file 'defaults' section" );
        @fields = @{ $ini{'default'}{'fields'} };
    }

    return unless scalar @fields;

    # without a field list there is nothing to look columns up in
    return @fields
        unless $opt_section && $ini{$opt_section}{'field_list'};

    # index the field list by name
    my %column_for;
    for my $entry ( @{ $ini{$opt_section}{'field_list'} } ) {
        my ( $name, $column ) = split /:/, $entry;
        $logger->debug( "indexing field $name => $column" );
        $column_for{ $name } = $column;
    }

    my @resolved;
    for my $field ( @fields ) {
        my ( $name, $column, $threshold ) = split /:/, $field;

        # an explicit column wins over the field list
        $column = $column_for{ $name }
            unless defined $column && length $column;

        if ( ! defined $column ) {
            $logger->logconfess( "no column defined for $field" );
            die "no column defined for $field";
        }

        push @resolved, join( ":", $name, $column, $threshold || "" );
    }

    return @resolved;
}

# Return the group definitions to summarize, taken from the first source that
# has any: the command line, the selected config section, the "default"
# config section.  Returns nothing when no group is defined.
sub get_groups {
    # command line
    if ( @opt_groups ) {
        $logger->info( "got groups from commmand line" );
        return @opt_groups;
    }

    # selected config section
    my $from_section = $opt_section && $ini{$opt_section}{'groups'};
    if ( $from_section ) {
        $logger->info( "got groups from ini file section: $opt_section" );
        return @{ $from_section };
    }

    # config defaults
    my $from_default = $ini{'default'}{'groups'};
    if ( $from_default ) {
        $logger->info( "got groups from ini file default section" );
        return @{ $from_default };
    }

    return;
}

# Return the rrdupdate definitions, taken from the first source that has any:
# the command line, the selected config section, the "default" config
# section.  Returns nothing when none is defined.
sub get_rrdupdate {
    # command line
    if ( @opt_rrdupdate ) {
        $logger->info( "got rrdupdate from commmand line" );
        return @opt_rrdupdate;
    }

    # selected config section
    my $from_section = $opt_section && $ini{$opt_section}{'rrdupdate'};
    if ( $from_section ) {
        $logger->info( "got rrdupdate from ini file section: $opt_section" );
        return @{ $from_section };
    }

    # config defaults
    my $from_default = $ini{'default'}{'rrdupdate'};
    if ( $from_default ) {
        $logger->info( "got rrdupdate from ini file default section" );
        return @{ $from_default };
    }

    return;
}

#
#
#_* POD
#
#

1;

__END__


=head1 NAME

  logstatsd - generate summary statistics from log files


=head1 SYNOPSIS

  logstatsd [OPTIONS]

  logstatsd -f status:0 -f duration:5 -l /path/to/logfile --xml
  logstatsd -f status:0 -f duration:5 -l /path/to/logfile --xml /path/to/report.xml

  # help message describing options
  logstatsd --help

  # full man page on logstatsd
  logstatsd --help -v

  # for more examples and explanations, see the EXAMPLES section below.

=head1 DESCRIPTION

Monitoring an application frequently involves monitoring its log
file(s).  Log files may contain hundreds or thousands of events per
minute.  Parsing the entire log file can be a very cpu intensive task
making near-real-time reporting or monitoring difficult to impossible.

logstatsd was designed to solve these problems and more while being
extremely simple to use and configure.  logstatsd can monitor log
files, parse entries as they enter the log, and store summary data.
logstatsd can then be signaled to export current summary data for
populating an RRD or feeding data to a monitoring application.

logstatsd parses log entries into fields and extracts fields that you
find interesting, e.g. transaction name, status, duration, date/time,
end user locations, back end server names, etc.  Summary data can be
collected for each interesting field.  So for example, if a
transaction field is specified, the number of hits for each unique
transaction will be counted.  If a duration field is available in the
log, then information about average response times of each transaction
will also be recorded.

Additionally, summary data may be collected for grouped fields.  For
example, if you collect summary statistics about transaction name
grouped with the status, you will see information about the numbers of
success and failures of each transaction.  If you collect summary
statistics about status grouped with time, you can then see statistics
about the successful and unsuccessful transactions per minute.

Also, thresholds may be defined to categorize response times (see
THRESHOLDS section below).

logstatsd is designed to run as a daemon on the server where the log
file resides.  When run in daemon mode, it will tail the log file and
process new entries as they arrive in the log.  Summary data may be
extracted by sending a "kill -USR1" to the logstatsd process id.

Data can be exported to an xml report, or to a script that can be used
to populate a RRD.

logstatsd is designed to parse formatted data in log files. Unlike
other log processing tools which run a series of regexps on each log
entry and count each match, logstatsd splits each entry into a series
of fields using a single regexp.  This makes it useful for files like
an apache access log or CSV files, but less useful for files with less
predictable contents like an apache error log.

=head2 OPTIONS

The following options are supported by this command:

=over 4

=item -l, --logfile=LOGFILE

Specify log file to be summarized.

=item --field=[NAME][:COLUMN][|THRESHOLD1][|THRESHOLD2...]

Specify a field from the log that should be summarized.  Multiple
field options may be specified.  The index for the first column should be 0.

For example, if your file is a csv, and the first column is "status",
the field definition would be --field status:0.

If a duration field was specified, thresholds can be associated with
the durations (see THRESHOLDS below).

Field names should not contain dashes.

=item --group=[NAME1]:[NAME2...][|THRESHOLD1][|THRESHOLD2...]

Define two fields which should be grouped for summary statistics.
Multiple group options may be specified.

For example, you might want to keep statistics about each
transaction based on status.  In this case, you can simply use the
option "--group transaction:status".

Note that order is important for display purposes.  transaction:status
would display each transaction, and then each status for the
transaction.  status:transaction will display each status, and then
list each transaction with the associated status.

For display purposes, it will always look better when you use the
field which has the least number of possible values first.

Log::Statistics will handle groups with any number of members, but at
this point logstatsd will only handle groups with two or three fields.

=item -t, --time-regexp <regexp>

Specify the regexp used to parse the time field, if specified.  The
regexp should include a single capture expression, which when run on
the date field, will return the date and time.

Ideally you should attempt to capture the year, month, day, hour, and
minute.  Do not capture seconds unless you really want summary data
broken down per second.

=item --line-regexp <regexp>

Specify regexp used to parse the entire log entry.  The regexp should
capture each field in the log, which can then be referenced using the
usual column number.  For a simple silly example,

  --line-regexp "^(.*?),(.*?),(.*?)"

This would capture the first three comma-separated fields from the log
entry, and make them available as column number 0, 1, and 2.

=item --version

Display version information.

=item -c, --conf=CONFIGFILE

Specify location of config file.  A config file is a convenient way to
store default information about a type of logfile.  For example,
create a section called "mylog" that contains your field definitions
and time regexp:

  [mylog]
  time_regexp = (\d\d\d\d\/\d\d\/\d\d\s\d\d\:\d\d)\:
  field_list =<<EOF
  status:0
  type:1
  system:2
  transaction:3
  duration:5
  time:7
  EOF

Then, from the command line, simply specify the config file and the
section "mylog", and you can reference fields by name without having
to specify the column number:

  logstatsd -c /path/to/config -s mylog --report -l /path/to/logfile --xml - -f transaction --group transaction:status

=item -s, --section=SECTION

Specify section to be read from config file.


=item --xml [file]

Generate an xml file containing all currently captured summary data.
If "-" is specified, the xml will be printed to stdout.

=item --rrd [file]

Experimental.  Generate a shell script of "rrdtool update" commands to
update a set of rrd files.  RRD files are not generated directly at
this time, since the script is much more efficient for transport to a
centralized monitoring server and updating rrd files there.  If "-" is
specified, the rrd commands will be printed to stdout.

Once in daemon mode, RRD commands will be generated using the current
time stamp and currently available summary data.

To specify which counters should be used to build rrd files, use the
--rrdupdate option.

If the --rrd option is used with --report, and if a "time" field and a
time-regexp were both defined, then times will be parsed from the
logs, and "rrd update" commands will be generated for each minute.
Currently this behaviour is only available for the total summary data
and not for any defined rrdupdate fields.

Note that currently all RRDs assume that you have defined 4
thresholds.  If you define fewer thresholds, your RRDs will be a little
larger than necessary.  If you define more, your RRDs will only track
the first 4.

=back

=head3 REPORT OPTIONS

The following options may be used for offline reports

=over 4

=item -r, --report

Read the entire log and generate a single report.  Useful for generating
offline reports.

=item --servers [servername1][,[servername2],...]

May only be used in combination with --report.  Specify a series of
servers on which the log file resides.  For each server, files will be
read using ssh and cat (or gzcat if file ends in .gz).  The log path
must be identical on each server, although globs are allowed in the
log path.

=item --ssh-prefilter [command]

May only be used with --servers.  Specify a command that should be run
on each target server to extract the appropriate records from the log.
The string '$logfile' will be replaced with the actual logfile name.
Example usage:

  --ssh-prefilter='gzgrep -h myTransactionName $logfile'

This can dramatically reduce cpu and bandwidth utilization.

Despite the similar name, this is not to be combined with the --ssh
option.

=back

=head3 DAEMON OPTIONS

The following options may be used for near-real-time reporting.

=over 4

=item -d, --daemon

Enable daemon mode.  In daemon mode, the log file will be opened in
tail mode (using File::Tail).  Each new line that arrives in the log
file will be processed.  Data may be obtained from the running daemon
by sending a USR1 signal (kill -USR1 <pid>).

When combined with -r, the entire log file will be read before opening
in tail mode.  As this is still a bit of a prototype, the log file is
actually opened and read, then closed, and then opened again using
File::Tail.  This leaves a short window where some log entries may not
be processed.

=item --ssh [servername]

Experimental.  May only be used with --daemon.  Specify the remote
server on which the log file lives.

When using this option, you should install Craig H. Rowland's program
'logtail' on the target server, and specify the location using the
logtail config param.  Using the ssh option without the logtail option
may be unstable and is not recommended.


=item --logtail [/path/to/logtail]

Experimental.  Can only be used with --ssh.  Specify the path to the
logtail program written in C by Craig H. Rowland.  From the logtail
documentation:

  This program will read in a standard text file and create an
  offset marker when it reads the end. The offset marker is read
  the next time logtail is run and the text file pointer is moved
  to the offset location. This allows logtail to read in the next
  lines of data following the marker. This is good for marking log
  files for automatic log file checkers to monitor system events.

Note that on the first processing of a new file using logtail, all log
entries will be read in and processed.  On subsequent restarts,
logtail will only process lines not previously seen.

It is recommended that you also define the config param logtail_offset
in your config file to specify the location of the offset file created
by logtail.  If this option is not defined, logtail will create a
number of offset files.

=item --rrdupdate [field1|field2|field3][|field4]

Specify the rrd databases that should be updated when running in
daemon mode.  Any number of rrdupdate options may be specified.

The fields in this option specify keys used to look up the option in
the internal group data.  To look up a *field* directly, use the
definition "fields|fieldname|fieldvalue".  For example, if you
specified a field called "status", you can build an RRD from all
entries with status "SUCCESS" by using this rrdupdate definition:

    fields|status|SUCCESS

In order to track *group* fields (i.e. those specified with --group),
use the definition "groups|name1-name2|value1|value2".  For example,
if you are grouping status by transaction, to build RRDs for all
transactions with status FAIL and name mytrans.do, use this:

    groups|status-transaction|FAIL|mytrans.do

=back

=head1 THRESHOLDS

Thresholds allow monitoring the number of long response times.  For
example, a given transaction might be expected to be complete within 5
seconds.  In addition to measuring the average response time of the
transaction, you may also wish to measure how many transactions are
not completed within 5 seconds.  You may define any number of
thresholds, so you could measure those that you consider to be fast
(under 3 seconds), good (under 5 seconds), slow (over 10 seconds), and
very slow (over 20 seconds).

NOTE: If a duration field is not defined, response time threshold
statistics cannot be calculated.


=head1 DIAGNOSTICS

Coming Soon...


=head1 CONFIGURATION AND ENVIRONMENT

The config file is a simple .ini style config file.  Here is an
example config file:

  [test]
  time_regexp = (\d\d\d\d\/\d\d\/\d\d\s\d\d\:\d\d)\:
  xml = /Users/wu/tmp/test.xml
  logfile = /Users/wu/projects/logs/test.log.mini
  field_list =<<EOF
  status:0
  type:1
  system:2
  transaction:3
  duration:5
  time:7
  EOF
  rrdupdate =<<EOF
  fields|status|GOOD
  fields|status|BAD
  groups|status-transaction|BAD|mytrans1
  groups|status-transaction|GOOD|mytrans2
  EOF
  rrd_step = 60
  rrd_create =<<EOF
  DS:duration:COUNTER:1200:0:5000
  DS:hits:COUNTER:1200:0:5000
  DS:over1:COUNTER:1200:0:5000
  DS:over2:COUNTER:1200:0:5000
  DS:over3:COUNTER:1200:0:5000
  DS:over4:COUNTER:1200:0:5000
  RRA:AVERAGE:0.5:1:1440
  RRA:AVERAGE:0.5:5:1440
  RRA:AVERAGE:0.5:30:1440
  RRA:AVERAGE:0.5:120:144
  EOF


Most params can be defined in the config file or on the command line.
Params on the command line override those in the config file.

A full explanation of any configuration system(s) used by the module,
including the names and locations of any configuration files, and the
meaning of any environment variables or properties that can be set. These
descriptions must also include details of any configuration language used.
(See also "Configuration Files" in Chapter 19.)

=head1 EXAMPLES

  # generate an xml report from a CSV file, column 1 contains status,
  # and column 6 contains duration.  generate an xml report of number
  # of responses and average response time data for each status.
  logstatsd -r -f status:0 -f duration:5 -l /path/to/logfile --xml -

  # generate an xml report from a CSV file, column 1 contains status,
  # and column 6 contains duration.  generate an xml report of number
  # of responses and average response time data for each status,
  # including the number of responses that were under 5 seconds, those
  # that were between 5-10 seconds, 10-20 seconds, and over 20
  # seconds.
  logstatsd -r -f 'status:0:5|10|20' -f duration:5 -l /path/to/logfile --xml -

  # generate an xml report from a CSV file.  Column 1 contains status,
  # column 3 contains transaction name, and column 6 contains
  # duration.  generate an xml report of responses for each status,
  # for each transaction, and also break down response data for each
  # transaction based on status.
  logstatsd -r -f transaction:3 -f status:0 -f duration:5 --group status:transaction -l /path/to/logfile --xml -

  # monitor CSV file for new incoming hits.  generate an xml report on
  # "kill -USR1 <logstats pid>"
  logstatsd -d -f status:0 -f duration:5 -l /path/to/logfile --xml /path/to/report.xml

  # monitor CSV file for new incoming hits.  generate a script to
  # update an RRD database on receipt of "kill -USR1 <logstats pid>"
  logstatsd -d -f status:0 -f duration:5 -l /path/to/logfile --rrd /path/to/rrd_script.sh

  # parse entire CSV file, and then begin monitoring for incoming
  # hits.  generate xml report on completion of full parsing, and then
  # update on each receipt of "kill -USR1 <logstats pid>"
  logstatsd -r -d -f status:0 -f duration:5 -l /path/to/logfile --xml /path/to/report.xml


=head1 DEPENDENCIES

Benchmark - generating stats about long parsing times

File::Tail - for monitoring incoming data in a log

Config::IniFiles - for parsing the logstatsd.conf config file

Log::Log4perl - logging

Log::Statistics - logstatsd comes bundled with Log::Statistics, available from CPAN

Getopt::Long - command line options processing

Pod::Usage - for command line help

=head1 SEE ALSO

http://www.geekfarm.org/wu/muse/LogStatistics.html


=head1 BUGS AND LIMITATIONS

There are no known bugs in this script. Please report problems to
VVu@geekfarm.org

Patches are welcome.

=head1 AUTHOR

VVu@geekfarm.org


=head1 LICENCE AND COPYRIGHT

Copyright (c) 2006, VVu@geekfarm.org
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

- Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.

- Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in the
  documentation and/or other materials provided with the distribution.

- Neither the name of geekfarm.org nor the names of its contributors
  may be used to endorse or promote products derived from this
  software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.



