#!/usr/bin/perl
##############################################################################
#      $URL: mishin.narod.ru $
#     $Date: 2011-04-11 20:53:20 +0300 (Mon, 14 Feb 2011) $
#   $Author: mishnik $
# $Revision: 1.02 $
#   $Source: split_big_xml.pl $
#   $Description: split a big XML file $
# This script works like
#csplit -ksf part. LIFE.20120131.PNL.20120201_20045128826.xml /\<book\>/ "{100}" 2>/dev/null
# but is more powerful: it can also split the file depending on size.
# It was created for splitting JMS message logs.

#
# Input parameters for script:
# - full path to file for splitting
# - full path to folder for splitting results
# - maximum size of splitting volume
# - start message expression
# - end message expression
#
# Limitations:
# - the script's execution time depends on the input file size; however, it must not exceed 1 hour
#
#example:
# perl ../split_big_xml.pl -f big.xml.txt -o archive -z 1900000000 -s '\<\?xml version="1.0" encoding="UTF-8"\?\>'
###############################################################################
use 5.006;
use strict;
use warnings;
use File::Basename;
use POSIX qw/strftime difftime mktime/;
use Getopt::Long;
use IO::File;
use English qw(-no_match_vars);
use Carp;
use Digest::MD5 qw(md5_hex);
use IPC::Open3 'open3';
use File::Find;

our $VERSION = '0.11';

# Named literals shared by the subs below.
my $EMPTY          = q{};
my $SPACE          = q{ };
my $COMMA          = q{,};
my $QUOTE          = q{'};
my $PLUS           = q{+};
my $DASH           = q{-};
my $TAB            = q{   };
my @empty          = ();
my $ref_arr_empty  = \@empty;
my %hash_empty     = ();
my $ref_hash_empty = \%hash_empty;    # was \@empty, i.e. an ARRAY reference
my (@dir2index);

# File-level scratch variables. Every one starts as the empty string except
# $p, which starts at 0. $ret is assigned by the subs below, so it must stay
# declared here.
my (
    $regexp2file,      $tradelist,      $dir2xml,
    $end_tag,          $start_tag,      $tradeid_tag,
    $tradeversion_tag, $rebuld_index,   $zipfile,
    $zipsize,          $p,              $len_hash,
    $hash_ref_cvs,     $txt,            $split,
    $split_start_cnt,  $split_filename, $split_cnt_trade_in_file,
    $utp_skip_format,  $ret,            $href_check_nofind,
    $start_time,       $elapsed_time,   $index_by_result,
    $file,             $outdir,         $size,
  )
  = (
    $EMPTY, $EMPTY, $EMPTY, $EMPTY, $EMPTY, $EMPTY, $EMPTY, $EMPTY, $EMPTY,
    $EMPTY, 0,      $EMPTY, $EMPTY, $EMPTY, $EMPTY, $EMPTY, $EMPTY, $EMPTY,
    $EMPTY, $EMPTY, $EMPTY, $EMPTY, $EMPTY, $EMPTY, $EMPTY, $EMPTY, $EMPTY,
  );

# Command line:
#   -f / --file       file to split
#   -o / --outdir     directory that receives the volumes
#   -z / --size       size threshold of one volume, in bytes
#   -s / --start_tag  regular expression marking the first line of a message
my $result = GetOptions(
    'file|f=s'      => \$file,
    'outdir|o=s'    => \$outdir,
    'size|z=s'      => \$size,
    'start_tag|s=s' => \$start_tag,
);

# Stop on a malformed command line. Without --file there is nothing to read,
# and without --outdir the volume path would start with "/" and land in the
# filesystem root.
if ( !$result || $file eq $EMPTY || $outdir eq $EMPTY ) {
    die "usage: $PROGRAM_NAME -f <file> -o <outdir>"
      . " [-z <max bytes>] [-s <start regexp>]\n";
}

# An omitted size used to be compared numerically as an empty string, which
# counts as 0 and raises a warning. Use an explicit 0: the comparison result
# is the same, the warning is gone.
if ( $size eq $EMPTY ) { $size = 0 }

if ( !$start_tag ) { $start_tag = '\<\?xml version="1.0" encoding="UTF-8"\?\>' }

$start_time = time;
my @split_vars = ( $file, $outdir, $size, $start_tag, );
$ret          = split_big_file( \@split_vars );
$elapsed_time = wdhms( time - $start_time );
$ret          = print "Time elapsed: $elapsed_time\n";

sub split_big_file {
    # Open the input file and hand it to gen_multi_file() for splitting.
    #
    # Parameter: reference to the array ( $file, $outdir, $size, $start_tag );
    #            only $file is used here, the reference itself is passed on
    #            unchanged.
    # Returns:   1 on success.
    # Croaks:    when the input file cannot be opened or closed; the message
    #            carries the operating-system error text.
    my ($arr_ref)  = @_;
    my ($file)     = @{$arr_ref};
    my ($filename) = fileparse($file);
    print "split file $filename \n";

    open my $FH, q{<}, $file or croak "unable to open: $file $ERRNO";
    gen_multi_file( $FH, $arr_ref );
    close $FH or croak "unable to close: $file $ERRNO";
    return 1;
}

sub gen_multi_file {
    # Copy the lines read from $FH into numbered volume files.
    #
    # Parameters:
    #   $FH      - open read handle on the input file
    #   $arr_ref - reference to ( $file, $outdir, $size, $start_tag )
    #
    # Volumes are named "$outdir/<stem>NN.txt". <stem> is the input file name
    # without its last six characters and NN is a two-digit counter starting
    # at 01. A line matching $start_tag begins a new message. At each such
    # boundary the current volume is measured, and once it is larger than
    # $size bytes the following message starts the next volume. A message is
    # never split, so a volume can exceed $size by up to one message.
    #
    # Returns 1. Croaks when a volume cannot be written or measured.
    my ( $FH, $arr_ref ) = @_;
    my ( $file, $outdir, $size, $start_tag ) = @{$arr_ref};

    # from 'rms.intraday.stp.messages.dd-Mon-YYYY_23-59-00.txt'
    # we get 'rms.intraday.stp.messages.dd-Mon-YYYY_23-59-'
    my ($filename) = fileparse($file);
    my $out_name = substr $filename, 0, length($filename) - 6;

    my $start_re      = qr/$start_tag/;    # loop-invariant, compile it once
    my $flush_lines   = 100;               # buffered lines that force a write
    my $volume_no     = 1;                 # numbering begins with 01
    my $volume_in_use = 0;                 # true once the volume holds data
    my $announced     = 0;                 # first "write file" line printed?
    my @pending       = ();                # lines not yet written out

    my $volume_path = sub {
        return "$outdir/$out_name" . sprintf( '%02d', $volume_no ) . '.txt';
    };

    # Append the buffered lines to the current volume. The target name is
    # computed at call time, so it can never be empty.
    my $flush_pending = sub {
        return if !@pending;
        add2file( $volume_path->(), join q{}, @pending );
        @pending       = ();
        $volume_in_use = 1;
        return;
    };

    while ( my $line = <$FH> ) {

        # Message boundary. It is handled even when the buffer has just been
        # emptied by the periodic flush, so the size check is not skipped.
        # Only a start tag seen before any data exists is left alone.
        if ( $line =~ $start_re && ( @pending || $volume_in_use ) ) {
            my $current = $volume_path->();
            if ( !$announced ) {
                print "write file $current\n";
                $announced = 1;
            }
            $flush_pending->();
            my $filesize = -s $current or croak "$current: $ERRNO";
            if ( $filesize > $size ) {
                $volume_no++;
                $volume_in_use = 0;
                print 'write file ', $volume_path->(), "\n";
            }
        }
        push @pending, $line;

        # Keep memory bounded: write out once more than $flush_lines lines
        # are buffered, even in the middle of a message.
        $flush_pending->() if @pending > $flush_lines;
    }

    # Write whatever is left. Does nothing for an empty input.
    $flush_pending->();

    return 1;
}

sub arch {
    # Print the zip command line for one split volume. The archive name is
    # the given path with its last four characters replaced by ".zip".
    # The command is only printed, never executed. Returns 1.
    my ($volume) = @_;
    my $stem     = substr $volume, 0, length($volume) - 4;
    my $command  = sprintf 'zip -jm9 %s.zip %s', $stem, $volume;
    print "$command\n";
    return 1;
}

sub add2file {
    # Append $message to $file, creating the file when it does not exist.
    #
    # Returns 1 on success. Croaks when the file cannot be opened, written
    # or closed. The write result used to be stored and never inspected, so
    # a failed print went unnoticed; it is now checked directly.
    my ( $file, $message ) = @_;
    open my $fh, q{>>}, $file or croak "unable to open:$file $ERRNO";
    print {$fh} $message or croak "unable to write: $file $ERRNO";
    close $fh or croak "unable to close: $file $ERRNO";
    return 1;
}

sub run_shell {
    # Run $cmd through open3(), send it the line "stuff" on stdin, then print
    # everything it wrote to stdout and stderr under " STDOUT:" / " STDERR:"
    # headings. A non-zero wait status is reported on a further line.
    #
    # Returns 1. Croaks when one of the pipes cannot be closed.
    my ($cmd) = @_;

    require Symbol;
    require IO::Select;

    # open3() creates the stdin/stdout handles itself when given undef, but
    # a false stderr argument makes it send the child's stderr to the stdout
    # handle. A real glob from gensym keeps the two streams apart and leaves
    # us with a handle that can actually be closed afterwards.
    my ( $child_in, $child_out );
    my $child_err = Symbol::gensym();
    my $childpid  = open3( $child_in, $child_out, $child_err, $cmd );

    print {$child_in} "stuff\n";
    close $child_in                 # Give end of file to kid.
      or croak "unable to close child stdin: $ERRNO";

    # Drain both pipes as data arrives. Reading one stream to EOF before
    # touching the other can block forever once the child fills the pipe
    # that is not being read.
    my $out_fd      = fileno $child_out;
    my $err_fd      = fileno $child_err;
    my %captured_of = ( $out_fd => q{}, $err_fd => q{} );
    my $open_pipes  = IO::Select->new( $child_out, $child_err );
    while ( $open_pipes->count ) {
        for my $handle ( $open_pipes->can_read ) {
            my $got = sysread $handle, my $chunk, 8192;
            if ($got) {
                $captured_of{ fileno $handle } .= $chunk;
            }
            else {
                # 0 means end of file, undef a read error: stop watching.
                $open_pipes->remove($handle);
            }
        }
    }

    print " STDOUT:\n", $captured_of{$out_fd}, "\n";
    print " STDERR:\n", $captured_of{$err_fd}, "\n";

    close $child_out or croak "unable to close child stdout: $ERRNO";
    close $child_err or croak "unable to close child stderr: $ERRNO";

    # open3() does not reap the child; waitpid does, and sets $CHILD_ERROR.
    waitpid $childpid, 0;
    if ($CHILD_ERROR) {
        print "That child exited with wait status of $CHILD_ERROR\n";
    }
    return 1;
}

sub wdhms {
    # Render a number of seconds as "<w>w <d>d <h>h <m>m <s>s ".
    # Week, day, hour and minute fields are left out when they are zero; the
    # seconds field is always present. A negative input is rendered from its
    # absolute value with $DASH in front. Every field, including the last
    # one, is followed by a single space.

    use constant M_IN_HOUR => 60;
    use constant H_IN_DAY  => 24;
    use constant D_IN_WEEK => 7;

    my ($elapsed) = @_;
    my $sign = $elapsed == abs $elapsed ? $EMPTY : $DASH;

    # Peel off seconds, minutes, hours and days in turn; what remains after
    # the last division is the number of whole weeks.
    my $left = abs $elapsed;
    my @counts;
    for my $base ( M_IN_HOUR, M_IN_HOUR, H_IN_DAY, D_IN_WEEK ) {
        push @counts, $left % $base;
        $left = int( $left / $base );
    }
    my ( $secs, $mins, $hrs, $dys ) = @counts;
    my $wks = $left;

    my $text = q{};
    for my $field (
        [ '%dw ', $wks ],
        [ '%dd ', $dys ],
        [ '%dh ', $hrs ],
        [ '%dm ', $mins ],
      )
    {
        my ( $format, $value ) = @{$field};
        $text .= sprintf $format, $value if $value;
    }
    $text .= sprintf '%ds ', $secs;

    return $sign . $text;
}
