#!/usr/local/bin/perl
#
#  @(#) Perl -- Re-assemble multipart mime email messages from mailboxes.
#  @(#) $Id: mime-multipart.pl,v 1.2 1999/02/08 19:22:44 jaalto Exp $
#
#  File id
#
#       .Copyright (C) 1998-1999 Jari Aalto
#       .Created: 1998-05
#       .$Contactid: <jari.aalto@poboxes.com> $
#       .$Keywords: Perl, merge, mail, mime, partials $
#       .$Url: http://www.netforward.com/poboxes/?jari.aalto $
#       .$Perl: 5.004 $
#
#       This program is free software; you can redistribute it and/or
#       modify it under the terms of the GNU General Public License as
#       published by the Free Software Foundation; either version 2 of
#       the License, or (at your option) any later version.
#
#       This program is distributed in the hope that it will be useful, but
#       WITHOUT ANY WARRANTY; without even the implied warranty of
#       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#       General Public License for more details.
#
#       You should have received a copy of the GNU General Public License along
#       with this program; if not, write to the Free Software Foundation,
#       Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
#   About program layout
#
#	Code written with Unix Emacs and indentation controlled with
#	Emacs package tinytab.el, a generic tab minor mode for programming.
#
#       The {{ }}} marks you see in this file are party of file "fold"
#       control package called folding.el (Unix Emacs lisp package).
#       ftp://ftp.csd.uu.se/pub/users/andersl/beta/ to get the latest.
#
#       There is also lines that look like # ....... &tag ... and they
#       are generated by Emacs Lisp package `tinybm.el', which is also
#       document structure tool. You can jump between the blocks with
#       Ctrl-up and Ctrl-down keys and create those "bookmarks" with
#       Emacs M-x tibm-insert. See www contact site below.
#
#   Funny identifiers at the top of file
#
#       The GNU RCS ident(1) program can print useful information out
#       of all variables that are in format $ IDENTIFIER: text $
#       See also Unix man pages for command what(1) which outputs all lines
#       matching @( # ). Try commands:
#
#       % what  PRGNAME
#       % ident PRGNAME
#
#   Introduction
#
#       Please start this perl script with options
#
#           --help      to get the help page
#
#   Www contact site
#
#       See http://www.netforward.com/poboxes/?jari.aalto and navigate
#       to html pages in the site to get more information about me
#	and my tools (Emacs, Perl, procmail mostly)
#
#   Description
#
#	This script reads mailboxes and moulds MIME multipart messages
#	to single files. The mltiparts may be spread over any number
#	of files. The assembling happens in memory, so prepare to have
#	plenty of if you are going to assemble big tar.gz kit multiparts.
#
#   Change Log
#
#	(none)

BEGIN { require 5.003 }

use integer;
use strict;

use Env;
use English;

use Getopt::Long;


    use vars qw ( $VERSION );

    #   This is for use of Makefile.PL and ExtUtils::MakeMaker
    #   So that it puts the tardist number in format YYYY.MMDD
    #   The REAL version number is defined later

    #   The following variable is updated by my Emacs setup whenever
    #   this file is saved

    $VERSION = '1999.0208';

# ****************************************************************************
#
#   DESCRIPTION
#
#       Set global variables for the program
#
#   INPUT PARAMETERS
#
#       none
#
#   RETURN VALUES
#
#       none
#
# ****************************************************************************

sub Initialize ()
{
    use vars qw
    (
        $PROGNAME
        $LIB

        $FILE_ID
        $VERSION
        $CONTACT
        $URL
    );

    $PROGNAME = "mime+.pl";     # Hard-coded. Not looked from $0
    $LIB      = $PROGNAME;      # library where each function belongs: PRGNAME

    $FILE_ID   = q$Id: mime-multipart.pl,v 1.2 1999/02/08 19:22:44 jaalto Exp $;
    $VERSION  = (split ' ', $FILE_ID)[2];    # version number in format N.NN+
    $CONTACT  = "<jari.aalto\@poboxes.com>"; # Who is the maintainer
    $URL      = "ftp://cs.uta.fi/pub/ssjaaa/";

    $OUTPUT_AUTOFLUSH = 1;
}


# ***************************************************************** &help ****
#
#   DESCRIPTION
#
#       Print help and exit.
#
#   INPUT PARAMETERS
#
#       $msg    [optional] Reason why function was called.
#
#   RETURN VALUES
#
#       none
#
# ****************************************************************************

=pod

=head1 NAME

mime-multipart.pl - Re-assemble multipart MIME email messages from mailboxes.

=head1 SYNOPSIS

    mime+.pl -b mailbox mailbox ..

=head1 OPTIONS

=head2 General options

=over 4

=item B<--base64> B<-b>

Normally all the lines in multipart message is saved. This options says
that only base64 encoded lines are saved. See better description later.

=back

=head2 Miscellaneous options

=over 4

=item B<--debug> B<-d>

Turn on debug.

=item B<--help> B<-h>

Print help page.

=item B<--verbose> [LEVEL] B<-v> [LEVEL]

Turn on verbose messages. If you give optional level 2, you get a bit
more messages. Eg. If you suspect that not all base64 lines were saved,
this level will print enough information to decide yourself.
Defualt verbose level is 1.

=item B<--Version> B<-V>

Print program version and contact info.

=back

=head1 README

This program assembles MIME partial mail messages back. All
of the I<bodies> for the same part id are saved to one file. If you
only want to save the base64 lines from the bodies, then use B<--base64>
switch.

The format of the file must be following: The beginning of header is
indicated with the field "From", which must match "@" and year
"[0-9][0-9][0-9][0-9]". This loosely matches typical Berkley mailbox format
which starts with line:

    From foo@bar.com  Mon May 25 14:51:28 1998

But is is also allowed to start like this, as Emacs Gnus newsreader
converts the incoming From_ field to X-From-Line.

    X-From-Line: foo@bar.com  Mon May 25 14:51:28 1998

There must also be header Content-Type which defines part id and number.
The assembled mesages are saved according to part id. Message normally has
it if it is correctly MIME encapsulted. Below you see example
of the required minimum headers:

    X-From-Line: foo\@bar.com  Mon May 25 14:51:28 1998
    Content-Type: message/partial; id="Mon_May_25_14:46:46_1998\@foo.bar.com"; number=2; total=8

=head2 Saving base64 lines only

When you send message as binary multipart, the body will contains extra
lines, like mime headers. If you turn on the B<--base64> switch, then only
the encoded lines from the body are written to file. This way you can decode
the file with

    % mmencode -u BODY > binary.tar.gz

Or if you want to decode everything in one pass

    % mmencode -u BODY | gzip -dz | tar -xvf -

And example of the multipart binary looks like this: the binary file
has been gzipped and base64 encoded here (notice mime type x-gzip):

    From: <foo\@bar.com>
    To: quux\@bar.com
    Subject: test
    Mime-Version: 1.0 (generated by tm-edit 7.106)
    Content-Type: message/partial; id="Mon_May_25_16:32:50_1998\@foo.bar.com"; number=2; total=16
    Content-Transfer-Encoding: 7bit

    --Multipart_Mon_May_25_16:32:45_1998-1
    Content-Type: text/plain; charset=US-ASCII

    --Multipart_Mon_May_25_16:32:45_1998-1
    Content-Type: application/octet-stream
    Content-Disposition: attachment; filename="binary_name"
    Content-Transfer-Encoding: x-gzip64

    H4sIAJRxaTUAA8ycf1hc1ZnHz8zAzPAjmYEMMDADcwcuvxJMJgZNrJgMCZOQFQMNJKJmN9Rg
    ...
    --Multipart_Mon_May_25_16:32:45_1998-1--

=head1 NOTES

This program will I<not> check if the mailboxes files contain all part fot
the distributed file. It simply concatenates all partials that belong to
same part id together. Turn on the B<--verbose> to see what parts is did
found from mailboxes.

All mailboxes are read to memory before writing compbined partials to
files. Make sure you have enough memory.

=head1 SEE ALSO

splitmail(1) to send out multipart MIME messages

=head1 AVAILABILITY

CPAN entry is http://www.perl.com/CPAN-local//scripts/
Reach author at jari.aalto@poboxes.com


=head1 SCRIPT CATEGORIES

CPAN/Administrative

=head1 PREREQUISITES

No CPAN modules required.

=head1 COREQUISITES

No optional CPAN modules needed.

=head1 OSNAMES

C<any>

=head1 VERSION

$Id: mime-multipart.pl,v 1.2 1999/02/08 19:22:44 jaalto Exp $

=head1 AUTHOR

(C) 1998-1999 Jari Aalto. All rights reserved. This program is free
software; you can redistribute it and/or modify it under the same terms as
Perl itself or in terms Gnu General Public licence v2 or later.

=cut

sub Help (;$)
{
    my $id  = "$LIB.Help";
    my $msg = shift;  # optional arg, why are we here...

    pod2text $PROGRAM_NAME;

    exit 1;
}

# ************************************************************** &args *******
#
#   DESCRIPTION
#
#       Read and interpret command line arguments
#
#   INPUT PARAMETERS
#
#       none
#
#   RETURN VALUES
#
#       none
#
# ****************************************************************************

sub HandleCommandLineArgs ()
{
    my $id = "$LIB.HandleCommandLineArgs";

    my $VERSION_OPTION;

    # .......................................... command line options ...

    use vars qw
    (
	$HELP
	$VERSION_OPTION

	$base64
	$debug
	$verb
    );

    # .................................................... read args ...

    Getopt::Long::config( qw
    (
	ignore_case
        require_order
    ));

    GetOptions      # Getopt::Long
    (
          "h|help"    => \$HELP
	, "base64"    => \$base64
	, "verbose:i" => \$verb
	, "Version"   => \$VERSION_OPTION
	, "debug"     => \$debug
    );

    $VERSION_OPTION and die "$VERSION $PROGNAME $CONTACT $URL\n";
    $HELP	    and Help();
    $verb = 1       if $debug;

    #	If user gave plain -v, then $verb got defined, but it has no value
    #	due to :i spec.

    $verb = 1	    if defined $verb;

    $base64 and $verb and warn "BASE64 activated\n";
}


# ****************************************************************************
#
#   DESCRIPTION
#
#	Delete list of files. Check if file exists before deleting it.
#
#   INPUT PARAMETERS
#
#	@array	    filenames
#
#   RETURN VALUES
#
#       none
#
# ****************************************************************************

sub DeleteFiles (@)
{
    my $id    = "$LIB.DeleteFiles";
    my @files = @ARG;

    local $ARG;

    for ( @files )
    {
	if ( -e )
	{
	    ($verb or $debug)  and  print "$id: $ARG\n";
	    unlink;
	}
    }
}

# ****************************************************************************
#
#   DESCRIPTION
#
#	Delete suspicious characters out of string. The result should be
#	suitable filename
#
#   INPUT PARAMETERS
#
#	$string
#
#   RETURN VALUES
#
#       $string
#
# ****************************************************************************

sub CleanString ($)
{
    my    $id  = "$LIB.ConvertString";
    local $ARG = shift;

    my $regexp = '[][*(){}<>\\!@#$%^&\s]';
    my $ret    = $ARG;

    if ( /$regexp/o  )
    {
	( $ret = $ARG ) =~  s/$regexp//go;
	$debug and print "$id: $ARG --> $ret ";
    }

    $ret;
}

# ****************************************************************************
#
#   DESCRIPTION
#
#	Sort partials and save add ">>" each line to own file. Remember
#	to delete old existing files first before calling this function.
#
#	Note: file is opened and closed after every line write. This
#	could be optimized into openeing file only when the previous
#	'id' (filename) has changed. The lines are already sorted by id.
#
#   INPUT PARAMETERS
#
#	\%lineHash  'id#partNumber#LineNumber' -- line
#	$base64	    if non-zero, then save only base64 lines.
#
#   RETURN VALUES
#
#       none
#
# ****************************************************************************

sub WritePartials ($$)
{
    my $id 			= "$LIB.WritePartials";
    my( $lineHashRef, $base64 ) = @ARG;

    local *F;
    my    ($count, $keyId, %hash );


    # base 64 block has exactly 72 characters
    # Pt61nMgR389xHaBxOZtpXO4HNM5HfG7vI8Z738s0TiM+vwx+Q+OKiPFeJ8aXYmI8LSE+vyx5
    #
    #	base64regexp = [+/=a-zA-Z0-9]

    for ( sort keys %$lineHashRef )
    {
	($keyId) = /^([^#]+)/;		# first string from key
	$keyId eq ''  and die "Can't parse line: $ARG\n";

	$ARG  = $lineHashRef->{$ARG};
	$count++;

	#   Each line may go to separate files

	if ( $base64 )
	{
	    #	Reject non-base64 lines

	    next if  /^\s*$/;

	    if ( /([-:._<>(){}!@#%^&*?\\ \t\f\b\$])/  )
	    {
		$verb > 1 and print "$id: base64 Reject '$1' $ARG";
		next;
	    }

	    $hash{ $keyId } .= $ARG;
	    print F;

	    $verb and ($count % 1000 ) == 0  and print "$count ";
	}
	else
	{
	    $verb and ($count % 1000 ) == 0  and print "$count ";
	    $hash{ $keyId } .= $ARG;
	}
    }

    for ( keys %hash )
    {
	open  F, ">>$ARG"		or die "$id: $ARG $ERRNO";
	print F  $hash{ $ARG };
	close F;
    }


    $verb and print "\n";	# terminate output
}


# ****************************************************************************
#
#   DESCRIPTION
#
#       Read all mailboxes and find partial messages from there.
#       The MIME_ID string is used as filename.
#
#   INPUT PARAMETERS
#
#	@array	    List of mailboxes to read
#
#   RETURN VALUES
#
#	\%lineHash  'id#partNumber#LineNumber' -- line
#
#	    The Hash key is combined string, where the ID is same for all part
#	    numbers. The partNumber always has 3 numbers with leading zeroes.
#	    The LineNumber has 5 digits with leading zeroes, so that an example
#	    key would look like
#
#		'Mon_May_25_16:32:50_1998@foo.bar.com#002#00001'
#
#	    The `line' is read body line from partial. NOTE: LineNumber
#	    is a running number and does not get resetted between partials.
#
#	\%idHash  'id' -- 1
#
#	    List of found mime_id's fromt he messages.
#
#	\%attachementHash 'id' -- attachementFilename
#
#	    If there was match filename="xxx" the attachementFilename
#	    will contain xxx.
#
# ****************************************************************************

sub ReadMailboxes (@)
{
    my $id = "$LIB.ReadMailboxes";
    local( @ARGV ) = @ARG;

    my( $head, $body, $partNumber, $counter, $tmp, $line , $prev , $idStr );

    my $regexp;

    my $re1     = '^(\S+From(: |\S+: )|From ).*\w+.*\d\d\d\d';
    my $re2     = '^From:? +';

    #  Returned variables

    my %lineHash;
    my %idHash;
    my %attachementHash;

    $debug  and print "$id: Reading: [@ARGV]\n";


    while ( <> )
    {

        # ............................................ find out type ...
	#	Detect the head of the message: "From" line

	if ( $regexp eq '' and /$re1/o )
	{
	    #	Standard MTA, berkey format, which had fdate infomation
	    $regexp = $re1;
	    $verb and print "$id: Using Standard MTA/Berkley regexp.\n";
	}

	if ( $regexp eq '' and /$re2/o and $prev =~ /^\s*$/ )
	{
	    #	Maybe just "\nFrom: foo@bar"

	    $regexp = $re2;
	    $verb and print "$id: Using simpe From regexp.\n";
	}

	$prev = $ARG;

        # ........................................... start matching ...
	#  When we know the Strart regexp, start matching messaeg start

        if ( $regexp ne '' and /$regexp/ )
	{
	    $head  = 1;
	    $body  = 0;
	    $idStr = "";

	    $debug and print  "$id: $regexp >> $ARG";

	}

	if ( $head and  /^\s*$/ )	    # When header ends, raise flag
	{
	    $body = 1; $head = 0;
	    next;
	}

        # .................................. read id and part-number ...

	if ( $head
	     and m
	     {
		^Content-Type:\s+message/partial;\s+id=\"([^\"]+)
		.*number\s*=\s*(\d+)
	      }xi
	   )
	{
	    $idStr = CleanString $1;

	    $partNumber  = sprintf "%04d", $2;

	    $debug and warn "$id: MIME_ID = $idStr $partNumber\n";

	    $verb and print "\n$id: $ARG";

	    $idHash{ $idStr } = 1;
	}

	#	Write body to the array

	if ( $body  and  $idStr ne '' )
	{

	    #	The first partial contains information abount the
	    #	attachement name if any. Try to grab it.

	    if ( $partNumber == 1
		 and m
		 {
		    Content-Disposition:\s*attachment;
		    \s*filename\s*=\s*\"?([^\"\n]+)
		 }xi
	       )
	    {
		$attachementHash{ $idStr } = $1;
		$verb  and print "$id: ATTACHEMENT: $1\n";
	    }


	    $counter++;
	    $tmp = sprintf "%05d", $counter;

	    $line = "$idStr#$partNumber$tmp";

	    $lineHash{ $line } = $ARG;

	    $debug and print "$id: $line $ARG";
	}
    }


    ( \%lineHash, \%idHash, \%attachementHash );
}


# .............................................................. main ...

    Initialize();
    HandleCommandLineArgs();

    my $id = "$LIB.main";

    unless ( @ARGV )
    {
	push @ARGV, "-";
	$verb and print "$id: reading stdin\n";
    }


    my( $lineHashRef, $idHashRef, $attachementHashRef) = ReadMailboxes @ARGV;
    my @files = keys %$idHashRef;

    unless ( @files )
    {
	print "$id: No multiparts found.\n";
	exit 0;
    }

    $verb and print "$id: Sorting partials and assembling....\n";

    #   Remove old files. Next command will ">>" to a file

    DeleteFiles @files;
    WritePartials $lineHashRef, $base64;


    print "$id: assembled partials to files:\n\n";

    for ( sort @files )
    {
	printf "%-50s %s\n",  $ARG, $attachementHashRef->{$ARG};
    }

    print "\n";

0;
__END__