#!/bin/env perl
# 
# This program splits a large file into multiple small ones. For example if you
# have a mailbox where messages start with a From at the beginning of the line
# (like Netscape uses) then specify the $sep_pat as "\nFrom" and tell the
# program not to deduce file names.
#
# This program was originally written to extract messages from an automated
# archiver. The archiver tagged each message with ":::Message:::" and saved all 
# the messages for a month in a single text file.  Once a month this program 
# would be run to split out all the messages into separate files and attempt to 
# deduce a name for each.  While there are many better ways of doing this under 
# UNIX this script was designed and tested under Windows.
#
# If you have some other use for it then feel free to modify it provided that the 
# portion
# 
# This code remains Copyright Steve Hawtin.  But may be freely copied under 
# the terms of the GNU Copyleft.
#
# Neither the author nor anyone else provides any warrenty
#
    use strict;

    my($a,$b,$sep_pat,$sep_action,$message,$name);
    my($sep_deduce,$sep_names,$sep_dir,$with_tk,$sep_file); 
    my($file_count,$max_name_len);

    # These have to be here or the methods can't find them
    my($main_win,$done_but,$cancel_but,$report_lab);
    my($z_frame,$a_frame,$a1_frame,$b_frame,$c_frame);
    my($d_frame,$e_frame,$f_frame);
    my($f_fsdialog,$fbrowse_but,$f_entry);
    my($d_fsdialog,$dbrowse_but,$d_entry);
    my($odeduce_cb,$onames_entry,$oact_option,$opat_entry);
    my($print_title,$split_by,$size_num,$size_units);
    my($generate_join_scripts);

    $sep_pat = ":::Message:::";
    $sep_action = "remove";
    $sep_deduce = 1;
    $sep_names = "part%03d.dat";
    $sep_dir = ".";
    $file_count = 1;
    $print_title = "";
    $split_by = "pattern";
    $size_num = 10;
    $size_units = "M";
    $max_name_len = 25;
    # Generate a join script if 
    #
    #    copy /b integrat.p* integrat.zip
    #
    #    cat integrat.p* > integrat.zip
    #
    $generate_join_scripts = "";

    if($#ARGV < 0)
      {
        # Assume that we want to use Tk;
        use Tk;
        use Tk::FileSelect;
        require Cwd;

        $sep_dir = Cwd::getcwd();
        $with_tk = 1;

        $main_win = new MainWindow(
            -title   => "TsorT File Splitter",
          );

        $z_frame = $main_win->Frame(
          )->pack(
                -side => 'bottom',
                -fill => 'x',
              );

        # Pick directories and source
        $a_frame = $main_win->Frame(
          )->pack(
                -side => 'top',
                -fill => 'x',
                -padx => 3,
                -pady => 3,
              );

        # Pick directories and source
        $a1_frame = $main_win->Frame(
          )->pack(
                -side => 'top',
                -fill => 'x',
                -padx => 3,
                -pady => 3,
              );

        # Split method
        $b_frame = $main_win->Frame(
          )->pack(
                -side => 'top',
                -fill => 'x',
                -ipadx => 3,
                -ipady => 3,
              );

        # Pattern args
        $c_frame = $main_win->Frame(
            -relief => 'groove',
          )->pack(
                -side => 'top',
                -fill => 'x',
                -ipadx => 3,
                -ipady => 3,
              );

        # Size arguments
        $d_frame = $main_win->Frame(
            -relief => 'groove',
          )->pack(
                -side => 'top',
                -fill => 'x',
                -ipadx => 3,
                -ipady => 3,
              );

        # Target name deduction
        $e_frame = $main_win->Frame(
            -relief => 'groove',
          )->pack(
                -side => 'top',
                -fill => 'x',
                -ipadx => 3,
                -ipady => 3,
              );

        # Target name deduction
        $f_frame = $main_win->Frame(
            -relief => 'groove',
          )->pack(
                -side => 'top',
                -fill => 'x',
                -ipadx => 3,
                -ipady => 3,
              );

        # File Selection frame
        $f_fsdialog = $main_win->FileSelect(
            -directory => $sep_dir,
          );

        $fbrowse_but = $a_frame->Button(
            -text => "Browse",
            -command => \&select_file,
          );

        $f_entry = $a_frame->Entry(
            -textvariable => \$sep_file,
          );

        $a_frame->Label(
            -text => "Source File:"
          )->grid($f_entry,$fbrowse_but);

        # Target Dir frame
        $d_fsdialog = $main_win->FileSelect(
            -directory => $sep_dir,
          );

        $d_entry = $a1_frame->Entry(
            -textvariable => \$sep_dir,
          );

        $dbrowse_but = $a1_frame->Button(
            -text => "Browse",
            -command => \&select_dir,
          );

        $a1_frame->Label(
            -text => "Target Dir:"
          )->grid(
              $d_entry,$dbrowse_but,
            );

        # Split method
        $b_frame->Label(
            -text => "Split By:"
          )->pack(
                -side => 'left',
              );
              
        $b_frame->Optionmenu(
            -options => ["pattern","size"],
            -variable => \$split_by,
          )->pack(
                -side => 'left',
              );
          
        # Pattern
        $oact_option = $c_frame->Optionmenu(
            -options => ["remove","append","prepend"],
            -variable => \$sep_action,
          )->pack(
                -side => 'right',
              );

        $c_frame->Label(-text => "Action:"
          )->pack(
                -side => 'right',
              );
              
        $opat_entry = $c_frame->Entry(
            -textvariable => \$sep_pat
          )->pack(
                -side => 'right',
              );

        $c_frame->Label(-text => "Pattern:"
          )->pack(
                -side => 'right',
              );

        # Size arguments
        $d_frame->Optionmenu(
                -options => ["G","M","k","bytes"],
                -variable => \$size_units,
          )->pack(
                -side => 'right',
              );

        $d_frame->Entry(
                -textvariable => \$size_num
          )->pack(
                -side => 'right',
              );
        $d_frame->Label(-text => "Size:"
          )->pack(
                -side => 'right',
              );

        # Target name
        $odeduce_cb = $e_frame->Checkbutton(
            -variable => \$sep_deduce,
            -text     => "Deduce file names",
          )->pack(
                -side => 'left',
              );
        $e_frame->Checkbutton(
            -variable => \$generate_join_scripts,
            -text     => "Create stitcher scripts",
          )->pack(
                -side => 'top',
              );

        $f_frame->Label(-text => "Target Names:"
          )->pack(
                -side => 'left',
              );
        $f_frame->Entry(
                    -textvariable => \$sep_names
          )->pack(
                -side => 'left',
              );
        $f_frame->Checkbutton(
                    -variable => \$print_title,
                    -text     => "Print File Names",
                  )->pack(
                -side => 'top',
              );

        # Contents of z_frame
        $done_but = $z_frame->Button(
            -text    => "Start",
            -command => \&start,
          )->pack(
                -side => 'left',
                -fill => 'y',
              );

        $cancel_but = $z_frame->Button(
            -text    => "Cancel",
            -command => sub {$main_win->destroy},
          )->pack(
                -side => 'right',
                -fill => 'y',
              );

        $report_lab = $z_frame->Label(
            -text    => "Select File",
          )->pack(
                -fill => 'both',
              );

        MainLoop;
        exit(0);
      }
    elsif($#ARGV != 0)
      {
        # 
        print STDERR "Too many args\n";
        exit(20);
      }
    if(!-r $ARGV[0])
      {
        print STDERR "Cannot open file $ARGV[0]\n";
        exit(20);
      }
    $sep_file = $ARGV[0];

    &split_file;
    exit(0);


sub select_file
  {
    my($new_file,$dir);

    if($sep_file)
      {
        $dir = $sep_file;
        $dir =~ s#/+[^/]+$##;
      }
    else
      {
        $dir = $sep_dir;
      }

    $f_fsdialog->configure(
        -directory => $dir,
      );

    $new_file = $f_fsdialog->Show;

    if($new_file)
      {
        # Remove the disk specifier
        &report($new_file);
        $new_file =~ s#.+(\S\:[^\:]+)$#$1#;
        $sep_file = $new_file;
      }
  }

sub select_dir
  {
    my($new_dir,$dir);

    $dir = $sep_dir;

    $d_fsdialog->configure(
        -directory => $dir,
      );

    $new_dir = $d_fsdialog->Show;
    if($new_dir)
      {
        # Remove the disk specifier
        $new_dir =~ s#.+(\S\:[^\:]+)$#$1#;
        if($new_dir =~ s#/+[^/]+$##)
          {
          }
        else
          {
            &report("Cannot parse dir from $new_dir\n");
            return;
          }
        $sep_dir = $new_dir;
        &report("Set dir $new_dir\n");
        $dir = $new_dir;
        chdir($new_dir);
      }
  }

sub report
  {
    my($val) = @_;

    if($with_tk)
      {
        $report_lab->configure(-text => $val);
      }
    else
      {
        print STDERR $val."\n";
      }
  }

sub start
  {
    if(!$sep_file)
      {
        &report("Source file not yet specified");
        return;
      }
    if(!-r $sep_file)
      {
        &report("Cannot read $sep_file");
        return;
      }
    if(-z $sep_file)
      {
        &report("File $sep_file is empty");
        return;
      }

    if(!-d $sep_dir)
      {
        &report("Cannot find dir $sep_dir");
        return;
      }
    &report("Running...");

    # Tidy things we may need later;
    &init_file_list();

    chdir($sep_dir);
    $done_but->configure(
        -state => 'disabled',
      );
    &split_file;

    if($generate_join_scripts)
      {
        # generate scripts to reassemble the files on a remote system
        generate_stitcher("assemble",$sep_file);
      }

    &report("Completed");
    $done_but->configure(
        -state => 'active',
        -text => "Rerun",
      );
    $cancel_but->configure(
        -state => 'active',
        -text => "Done",
        -command => sub {$main_win->destroy},
      );
  }
        
############################################################

sub split_file
  {
    if($split_by eq "pattern")
      {
        &split_file_pat;
      }
    else
      {
        &split_file_size;
      }
  }

sub split_file_size
  {
    local(*INPUT,*OUTPUT);
    my($len,$buf,$chunk,$togo,$target_size);
    my($writing);

    # Split file by size
    if($sep_deduce)
      {
        &report("Split by size cannot deduce names\n");
        $sep_deduce = "";
      }

    $chunk = 128*1024;

    open(INPUT,$sep_file);
    binmode(INPUT);
    $len = 1;
    if($size_units eq "G")
      {
        $target_size = $size_num*1024*1024*1024;
      }
    elsif($size_units eq "M")
      {
        $target_size = $size_num*1024*1024;
      }
    elsif($size_units eq "k")
      {
        $target_size = $size_num*1024;
      }
    else
      {
        $target_size = $size_num;
      }
    $togo = 0;
    
    while($len)
      {
        if($togo <= 0)
          {
            my($target);
            close(OUTPUT) if($writing);
            $target = &deduce_name;
            open(OUTPUT,">$target");
            binmode(OUTPUT);
            $writing = 1;
            $togo = $target_size;
          }
        $len = $chunk;
        $len = $togo if($len > $togo);
        $len = sysread INPUT,$buf,$len;
        syswrite OUTPUT,$buf,$len;
        $togo -= $len;
      }
    close(OUTPUT) if($writing);
    close(INPUT);
  }

sub split_file_pat
  {
    my($done);
    local(*INPUT,*OUTPUT);

    $/ = $sep_pat;
    open(INPUT,$sep_file);
    binmode(INPUT);

    $done = "";

    while(!$done)
      {
        $message = <INPUT>;
        if(eof(INPUT))
          {
            $done = 1;
          }
        next if($message =~ /^[\s\n\r]*$sep_pat[\s\n\r]*$/);
        
        # The whole message is now in $message, we need to do the following
        #     Try and deduce a name for the message
        #     Tidy up the newlines etc
        #     Save the message
        #
        # Because the $message string is liable to be large we will 
        # just use it as a global variable
        if($sep_action eq "remove")
          {
            $message =~ s/$sep_pat//;
          }
        elsif($sep_action eq "append")
          {
          }
        elsif($sep_action eq "prepend")
          {
            $message =~ s/$sep_pat//;
            $message = $sep_pat . $message;
          }

        $name = &deduce_name;
        print "$name\n" if($print_title);
        $message = &tidy_message($message);
        open(OUTPUT,">$name");
        binmode(OUTPUT);
        print OUTPUT $message;
        close(OUTPUT);
      }
    close(INPUT);
  }

sub deduce_name
  {
    my($n,$i,$file_name,$guess_part_num);

    if(!$sep_deduce)
      {
        my($c);
        $c = 0;
        while(-f sprintf($sep_names,$file_count) && $c < 1000)
          {
            $file_count++;
            $c++;
          }
        if($c >= 1000)
          {
            &report("Cannot find unique name with pattern \"$sep_names\"\n");
          }
        $file_name = sprintf($sep_names,$file_count);

        &remember_file($file_name);
        return($file_name);
      }

    $n = "default";

    # Try and catch any "part 7 of 23" type messages
    $guess_part_num = -1;
    if($message =~ /(\d+)/)
      {
        $guess_part_num = $1;
      }

    if($message =~ /(message|story|title|subject)\s*:\s*(.+)\n/i)
      {
        $n = $2;
      }
    elsif($message =~ /\n\s*\((.+)\)\n/)
      {
        $n = $1;
      }    
    elsif($message =~ /\<(.+)\>/)
      {
        $n = $1;
      }
    elsif($message =~ /\"(.+)\"/)
      {
        $n = $1;
      }    
    elsif($message =~ /\'(.+)\'/)
      {
        $n = $1;
      }
    elsif($message =~ /^(.+)[:\(\{\n]/)
      {
        $n = $1;
      }
    elsif($message =~ /(.+)\s+by\s/)
      {
        $n = $1;
      }

    # Tidy up the bare name
    $n = "\L$n";
    $n =~ s/\(.*\)//;
    $n =~ s/(.txt|.html|.htm)$//;
    $n =~ s/\'//g;
    $n =~ s/[^a-z0-9]/_/g;
    while($n =~ /__/)
      {
        $n =~ s/__/_/;
      }
    $n =~ s/^_+//;
    $n =~ s/_+$//;
    $n = sprintf($sep_names,$file_count++) if(!$n);

    $i = "";

    if(length($n) >= $max_name_len)
      {
        # If we trim the name because it is too long and the result
        # has no digits in it we always add on the 3 digit index
        # that we picked up earlier (this catches the "7 of 56" case) 
        $n = substr($n,0,$max_name_len);
        if($guess_part_num >= 0 && !($n =~ /\d/))
          {
            $i = sprintf("%03d",$guess_part_num);
          }
      }

    while(-r "$n$i.txt")
      {
        # If we have a guess part number we want to try that 
        # first, if we just tried it then reset everything and
        # start counting from 0
        if($guess_part_num > 0 && $i > 0)
          {
            $i = -1;
            $guess_part_num = -1;
          }
        elsif($guess_part_num > 0)
          {
            $i = $guess_part_num - 1;
          }
        elsif($i eq "")
          {
            $i = -1;
          }
        $i = sprintf("%03d",$i+1);
      }
    $file_name = "$n$i.txt";

    &remember_file($file_name);
    return($file_name);
  }

{
  my(@file_list);

sub init_file_list
  {
    @file_list = ();
  }

sub remember_file
  {
    push(@file_list,@_);
  }

sub generate_stitcher
  {
    my($script_name,$full_target_name) = @_;
    local(*SCRIPT);
    my($target_name,$later_pass);

    # Try and get just the file name (without the dir)
    # should use a module really
    $target_name = $full_target_name;
    $target_name = $1 if($full_target_name =~ m#[/\\]+([^/\\]+)$#);

    open(SCRIPT,">$script_name.bat");
    print SCRIPT <<"EndHeader";
# Generated stitcher file to reassemble $target_name
# 
# Not the best way to do it but it works 
IF EXIST $target_name DEL $target_name

EndHeader
    foreach my $file (@file_list)
      {
        my $mod;
        $mod = "$target_name+" if($later_pass);
        print SCRIPT "copy /b $mod$file $target_name\n";
        $later_pass = 1;
      }
    close(SCRIPT);

    open(SCRIPT,">$script_name.sh");
    print SCRIPT <<"EndHeader";
#!/bin/sh
rm $target_name
touch $target_name
EndHeader
    foreach my $file (@file_list)
      {
        print SCRIPT "cat $file >> $target_name\n";
      }
    close(SCRIPT);
  }
}

sub tidy_message
  {
    # Tidy up the file before we save it
    my($message) = @_;

    return($message);
  }

