#!/usr/local/bin/perl # # @(#) Perl -- Re-assemble multipart mime email messages from mailboxes. # @(#) $Id: mime-multipart.pl,v 1.2 1999/02/08 19:22:44 jaalto Exp $ # # File id # # .Copyright (C) 1998-1999 Jari Aalto # .Created: 1998-05 # .$Contactid: $ # .$Keywords: Perl, merge, mail, mime, partials $ # .$Url: http://www.netforward.com/poboxes/?jari.aalto $ # .$Perl: 5.004 $ # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of # the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program; if not, write to the Free Software Foundation, # Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # # About program layout # # Code written with Unix Emacs and indentation controlled with # Emacs package tinytab.el, a generic tab minor mode for programming. # # The {{ }}} marks you see in this file are party of file "fold" # control package called folding.el (Unix Emacs lisp package). # ftp://ftp.csd.uu.se/pub/users/andersl/beta/ to get the latest. # # There is also lines that look like # ....... &tag ... and they # are generated by Emacs Lisp package `tinybm.el', which is also # document structure tool. You can jump between the blocks with # Ctrl-up and Ctrl-down keys and create those "bookmarks" with # Emacs M-x tibm-insert. See www contact site below. # # Funny identifiers at the top of file # # The GNU RCS ident(1) program can print useful information out # of all variables that are in format $ IDENTIFIER: text $ # See also Unix man pages for command what(1) which outputs all lines # matching @( # ). Try commands: # # % what PRGNAME # % ident PRGNAME # # Introduction # # Please start this perl script with options # # --help to get the help page # # Www contact site # # See http://www.netforward.com/poboxes/?jari.aalto and navigate # to html pages in the site to get more information about me # and my tools (Emacs, Perl, procmail mostly) # # Description # # This script reads mailboxes and moulds MIME multipart messages # to single files. The mltiparts may be spread over any number # of files. The assembling happens in memory, so prepare to have # plenty of if you are going to assemble big tar.gz kit multiparts. # # Change Log # # (none) BEGIN { require 5.003 } use integer; use strict; use Env; use English; use Getopt::Long; use vars qw ( $VERSION ); # This is for use of Makefile.PL and ExtUtils::MakeMaker # So that it puts the tardist number in format YYYY.MMDD # The REAL version number is defined later # The following variable is updated by my Emacs setup whenever # this file is saved $VERSION = '1999.0208'; # **************************************************************************** # # DESCRIPTION # # Set global variables for the program # # INPUT PARAMETERS # # none # # RETURN VALUES # # none # # **************************************************************************** sub Initialize () { use vars qw ( $PROGNAME $LIB $FILE_ID $VERSION $CONTACT $URL ); $PROGNAME = "mime+.pl"; # Hard-coded. Not looked from $0 $LIB = $PROGNAME; # library where each function belongs: PRGNAME $FILE_ID = q$Id: mime-multipart.pl,v 1.2 1999/02/08 19:22:44 jaalto Exp $; $VERSION = (split ' ', $FILE_ID)[2]; # version number in format N.NN+ $CONTACT = ""; # Who is the maintainer $URL = "ftp://cs.uta.fi/pub/ssjaaa/"; $OUTPUT_AUTOFLUSH = 1; } # ***************************************************************** &help **** # # DESCRIPTION # # Print help and exit. # # INPUT PARAMETERS # # $msg [optional] Reason why function was called. # # RETURN VALUES # # none # # **************************************************************************** =pod =head1 NAME mime-multipart.pl - Re-assemble multipart MIME email messages from mailboxes. =head1 SYNOPSIS mime+.pl -b mailbox mailbox .. =head1 OPTIONS =head2 General options =over 4 =item B<--base64> B<-b> Normally all the lines in multipart message is saved. This options says that only base64 encoded lines are saved. See better description later. =back =head2 Miscellaneous options =over 4 =item B<--debug> B<-d> Turn on debug. =item B<--help> B<-h> Print help page. =item B<--verbose> [LEVEL] B<-v> [LEVEL] Turn on verbose messages. If you give optional level 2, you get a bit more messages. Eg. If you suspect that not all base64 lines were saved, this level will print enough information to decide yourself. Defualt verbose level is 1. =item B<--Version> B<-V> Print program version and contact info. =back =head1 README This program assembles MIME partial mail messages back. All of the I for the same part id are saved to one file. If you only want to save the base64 lines from the bodies, then use B<--base64> switch. The format of the file must be following: The beginning of header is indicated with the field "From", which must match "@" and year "[0-9][0-9][0-9][0-9]". This loosely matches typical Berkley mailbox format which starts with line: From foo@bar.com Mon May 25 14:51:28 1998 But is is also allowed to start like this, as Emacs Gnus newsreader converts the incoming From_ field to X-From-Line. X-From-Line: foo@bar.com Mon May 25 14:51:28 1998 There must also be header Content-Type which defines part id and number. The assembled mesages are saved according to part id. Message normally has it if it is correctly MIME encapsulted. Below you see example of the required minimum headers: X-From-Line: foo\@bar.com Mon May 25 14:51:28 1998 Content-Type: message/partial; id="Mon_May_25_14:46:46_1998\@foo.bar.com"; number=2; total=8 =head2 Saving base64 lines only When you send message as binary multipart, the body will contains extra lines, like mime headers. If you turn on the B<--base64> switch, then only the encoded lines from the body are written to file. This way you can decode the file with % mmencode -u BODY > binary.tar.gz Or if you want to decode everything in one pass % mmencode -u BODY | gzip -dz | tar -xvf - And example of the multipart binary looks like this: the binary file has been gzipped and base64 encoded here (notice mime type x-gzip): From: To: quux\@bar.com Subject: test Mime-Version: 1.0 (generated by tm-edit 7.106) Content-Type: message/partial; id="Mon_May_25_16:32:50_1998\@foo.bar.com"; number=2; total=16 Content-Transfer-Encoding: 7bit --Multipart_Mon_May_25_16:32:45_1998-1 Content-Type: text/plain; charset=US-ASCII --Multipart_Mon_May_25_16:32:45_1998-1 Content-Type: application/octet-stream Content-Disposition: attachment; filename="binary_name" Content-Transfer-Encoding: x-gzip64 H4sIAJRxaTUAA8ycf1hc1ZnHz8zAzPAjmYEMMDADcwcuvxJMJgZNrJgMCZOQFQMNJKJmN9Rg ... --Multipart_Mon_May_25_16:32:45_1998-1-- =head1 NOTES This program will I check if the mailboxes files contain all part fot the distributed file. It simply concatenates all partials that belong to same part id together. Turn on the B<--verbose> to see what parts is did found from mailboxes. All mailboxes are read to memory before writing compbined partials to files. Make sure you have enough memory. =head1 SEE ALSO splitmail(1) to send out multipart MIME messages =head1 AVAILABILITY CPAN entry is http://www.perl.com/CPAN-local//scripts/ Reach author at jari.aalto@poboxes.com =head1 SCRIPT CATEGORIES CPAN/Administrative =head1 PREREQUISITES No CPAN modules required. =head1 COREQUISITES No optional CPAN modules needed. =head1 OSNAMES C =head1 VERSION $Id: mime-multipart.pl,v 1.2 1999/02/08 19:22:44 jaalto Exp $ =head1 AUTHOR (C) 1998-1999 Jari Aalto. All rights reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself or in terms Gnu General Public licence v2 or later. =cut sub Help (;$) { my $id = "$LIB.Help"; my $msg = shift; # optional arg, why are we here... pod2text $PROGRAM_NAME; exit 1; } # ************************************************************** &args ******* # # DESCRIPTION # # Read and interpret command line arguments # # INPUT PARAMETERS # # none # # RETURN VALUES # # none # # **************************************************************************** sub HandleCommandLineArgs () { my $id = "$LIB.HandleCommandLineArgs"; my $VERSION_OPTION; # .......................................... command line options ... use vars qw ( $HELP $VERSION_OPTION $base64 $debug $verb ); # .................................................... read args ... Getopt::Long::config( qw ( ignore_case require_order )); GetOptions # Getopt::Long ( "h|help" => \$HELP , "base64" => \$base64 , "verbose:i" => \$verb , "Version" => \$VERSION_OPTION , "debug" => \$debug ); $VERSION_OPTION and die "$VERSION $PROGNAME $CONTACT $URL\n"; $HELP and Help(); $verb = 1 if $debug; # If user gave plain -v, then $verb got defined, but it has no value # due to :i spec. $verb = 1 if defined $verb; $base64 and $verb and warn "BASE64 activated\n"; } # **************************************************************************** # # DESCRIPTION # # Delete list of files. Check if file exists before deleting it. # # INPUT PARAMETERS # # @array filenames # # RETURN VALUES # # none # # **************************************************************************** sub DeleteFiles (@) { my $id = "$LIB.DeleteFiles"; my @files = @ARG; local $ARG; for ( @files ) { if ( -e ) { ($verb or $debug) and print "$id: $ARG\n"; unlink; } } } # **************************************************************************** # # DESCRIPTION # # Delete suspicious characters out of string. The result should be # suitable filename # # INPUT PARAMETERS # # $string # # RETURN VALUES # # $string # # **************************************************************************** sub CleanString ($) { my $id = "$LIB.ConvertString"; local $ARG = shift; my $regexp = '[][*(){}<>\\!@#$%^&\s]'; my $ret = $ARG; if ( /$regexp/o ) { ( $ret = $ARG ) =~ s/$regexp//go; $debug and print "$id: $ARG --> $ret "; } $ret; } # **************************************************************************** # # DESCRIPTION # # Sort partials and save add ">>" each line to own file. Remember # to delete old existing files first before calling this function. # # Note: file is opened and closed after every line write. This # could be optimized into openeing file only when the previous # 'id' (filename) has changed. The lines are already sorted by id. # # INPUT PARAMETERS # # \%lineHash 'id#partNumber#LineNumber' -- line # $base64 if non-zero, then save only base64 lines. # # RETURN VALUES # # none # # **************************************************************************** sub WritePartials ($$) { my $id = "$LIB.WritePartials"; my( $lineHashRef, $base64 ) = @ARG; local *F; my ($count, $keyId, %hash ); # base 64 block has exactly 72 characters # Pt61nMgR389xHaBxOZtpXO4HNM5HfG7vI8Z738s0TiM+vwx+Q+OKiPFeJ8aXYmI8LSE+vyx5 # # base64regexp = [+/=a-zA-Z0-9] for ( sort keys %$lineHashRef ) { ($keyId) = /^([^#]+)/; # first string from key $keyId eq '' and die "Can't parse line: $ARG\n"; $ARG = $lineHashRef->{$ARG}; $count++; # Each line may go to separate files if ( $base64 ) { # Reject non-base64 lines next if /^\s*$/; if ( /([-:._<>(){}!@#%^&*?\\ \t\f\b\$])/ ) { $verb > 1 and print "$id: base64 Reject '$1' $ARG"; next; } $hash{ $keyId } .= $ARG; print F; $verb and ($count % 1000 ) == 0 and print "$count "; } else { $verb and ($count % 1000 ) == 0 and print "$count "; $hash{ $keyId } .= $ARG; } } for ( keys %hash ) { open F, ">>$ARG" or die "$id: $ARG $ERRNO"; print F $hash{ $ARG }; close F; } $verb and print "\n"; # terminate output } # **************************************************************************** # # DESCRIPTION # # Read all mailboxes and find partial messages from there. # The MIME_ID string is used as filename. # # INPUT PARAMETERS # # @array List of mailboxes to read # # RETURN VALUES # # \%lineHash 'id#partNumber#LineNumber' -- line # # The Hash key is combined string, where the ID is same for all part # numbers. The partNumber always has 3 numbers with leading zeroes. # The LineNumber has 5 digits with leading zeroes, so that an example # key would look like # # 'Mon_May_25_16:32:50_1998@foo.bar.com#002#00001' # # The `line' is read body line from partial. NOTE: LineNumber # is a running number and does not get resetted between partials. # # \%idHash 'id' -- 1 # # List of found mime_id's fromt he messages. # # \%attachementHash 'id' -- attachementFilename # # If there was match filename="xxx" the attachementFilename # will contain xxx. # # **************************************************************************** sub ReadMailboxes (@) { my $id = "$LIB.ReadMailboxes"; local( @ARGV ) = @ARG; my( $head, $body, $partNumber, $counter, $tmp, $line , $prev , $idStr ); my $regexp; my $re1 = '^(\S+From(: |\S+: )|From ).*\w+.*\d\d\d\d'; my $re2 = '^From:? +'; # Returned variables my %lineHash; my %idHash; my %attachementHash; $debug and print "$id: Reading: [@ARGV]\n"; while ( <> ) { # ............................................ find out type ... # Detect the head of the message: "From" line if ( $regexp eq '' and /$re1/o ) { # Standard MTA, berkey format, which had fdate infomation $regexp = $re1; $verb and print "$id: Using Standard MTA/Berkley regexp.\n"; } if ( $regexp eq '' and /$re2/o and $prev =~ /^\s*$/ ) { # Maybe just "\nFrom: foo@bar" $regexp = $re2; $verb and print "$id: Using simpe From regexp.\n"; } $prev = $ARG; # ........................................... start matching ... # When we know the Strart regexp, start matching messaeg start if ( $regexp ne '' and /$regexp/ ) { $head = 1; $body = 0; $idStr = ""; $debug and print "$id: $regexp >> $ARG"; } if ( $head and /^\s*$/ ) # When header ends, raise flag { $body = 1; $head = 0; next; } # .................................. read id and part-number ... if ( $head and m { ^Content-Type:\s+message/partial;\s+id=\"([^\"]+) .*number\s*=\s*(\d+) }xi ) { $idStr = CleanString $1; $partNumber = sprintf "%04d", $2; $debug and warn "$id: MIME_ID = $idStr $partNumber\n"; $verb and print "\n$id: $ARG"; $idHash{ $idStr } = 1; } # Write body to the array if ( $body and $idStr ne '' ) { # The first partial contains information abount the # attachement name if any. Try to grab it. if ( $partNumber == 1 and m { Content-Disposition:\s*attachment; \s*filename\s*=\s*\"?([^\"\n]+) }xi ) { $attachementHash{ $idStr } = $1; $verb and print "$id: ATTACHEMENT: $1\n"; } $counter++; $tmp = sprintf "%05d", $counter; $line = "$idStr#$partNumber$tmp"; $lineHash{ $line } = $ARG; $debug and print "$id: $line $ARG"; } } ( \%lineHash, \%idHash, \%attachementHash ); } # .............................................................. main ... Initialize(); HandleCommandLineArgs(); my $id = "$LIB.main"; unless ( @ARGV ) { push @ARGV, "-"; $verb and print "$id: reading stdin\n"; } my( $lineHashRef, $idHashRef, $attachementHashRef) = ReadMailboxes @ARGV; my @files = keys %$idHashRef; unless ( @files ) { print "$id: No multiparts found.\n"; exit 0; } $verb and print "$id: Sorting partials and assembling....\n"; # Remove old files. Next command will ">>" to a file DeleteFiles @files; WritePartials $lineHashRef, $base64; print "$id: assembled partials to files:\n\n"; for ( sort @files ) { printf "%-50s %s\n", $ARG, $attachementHashRef->{$ARG}; } print "\n"; 0; __END__