#!/usr/bin/perl -w # Exgen 1.0 -- A script to generate an XML parser from an XML document instance ############################################################################### # Module Name : exgen.pl # Version : 1.0 # Description : This application contains all of the subroutines necessary # : generates a perl EXPAT parser from an XML file # Author : Jeremy Ellman # # Copyright (c) 2004, Jeremy Ellman. All Rights Reserved. # This module is free software. It may be used, redistributed # and/or modified under the terms of the Perl Artistic License # (see http://www.perl.com/perl/misc/Artistic.html) # ################ Change History ################# # # 10/12/03: Re-written parser to use EXPAT rather than twig parser # ################End Change History#################################### # use strict; use utf8; use Getopt::Long; #use Data::Dumper; #print Dumper($reference); my %tags = (); # my $debug = 0; # sub main { my ($line, $first, $rest); my $count = 0; my $infile; my $outfile; GetOptions( 'xmldoc=s' => \$infile, 'parser=s' => \$outfile, 'debug' => sub { $debug = 1 }); die "exgen --infile --outfile [--debug]\n\n Stopping " unless defined $infile && defined $outfile && $infile && $outfile; print "IN: $infile, OUT: $outfile, DEBUG: $debug\n"; my $debug_mode = 1; open (IN, $infile ) || die "Can't read $infile"; open (OUT, ">" . $outfile) || die "Can't write $outfile"; &XML_read(); &XML_write(); $infile = quotemeta $infile; print OUT "\n\n&run_generated_parser( \"$infile\") if \$debug;"; } # Read an XML file instance noting tags and their attributes # sub XML_read { my $line; my $first; my $count = 0; while ($line = ) { $line =~ s/[\n\r]/ /g; print "$count: >>$line<<\n" if $debug; $count++; while ($line =~ /^.*?<(.+?)>(.*)/) { print "Found: $1 (rest: >>$2<<)\n" if $debug; $first = $1; &process_tag( $first ); if (defined $2) { $line = $2; } else { last; } } } print "$count lines processed\n"; } # Process a tag. We can automatically generate close tags, so we're only # interested in open tags. For these, we need to check for attributes... # sub process_tag { my $tag_line = shift; my $tag; my $rest; my $tag_value = {} ; $tag_line =~ /^([\w_]+)\W+(.*)/; $tag = $1; $rest = $2; if ($tag =~ /^\w/) { # it's an open tag print "$tag STORED\n" if $debug; my $tv = $tags{ $tag }; $tag_value = $tv if defined $tv; # get old tags value while ($rest =~ /^\s*(\w+).*?=\s*\"(.+?)\"(.*)/) { my $attribute = $1; print "Attribute: $attribute\n" if $debug; $tag_value->{ $attribute } = 1; $rest = $3; } $tags{ $tag } = $tag_value; # print Dumper(\%tags); } } # Write out the Subs, an open and a close for each XML tag. # sub XML_write { # my $file = shift; &print_header (); for my $tag (sort keys %tags) { print "Processing: $tag\n" if $debug; &print_open_callback( $tag ); &print_close_callback( $tag ); } } # Write out the parser headings. sub print_header { print OUT "# Generated (in part) by Jeremy Ellman's EXGen\n"; print OUT "# -- for efficiency, delete the callbacks you don't need\n\n"; print OUT "use XML::Parser;\nuse strict;\n"; print OUT "\n\nmy \$debug = 1;\nmy \$text = \"\";\n"; print OUT 'sub run_generated_parser { my ($infile) = @_; # select parse call backs my $parser = new XML::Parser(\'Style\' => \'Subs\'); $parser->setHandlers(Char => \\&text_handler); if ($parser->parsefile( $infile )){ print "Parse Completed\n"; } else { if ($@ =~ /line\\s*(\\d+),/) { print "Bad_Data: >>$1<< \n"; } else { print "Illegal XML file: $infile \n"; } } } sub text_handler { my ($Expat, $String) = @_; $text .= $String if $String =~ /\\w/; }'; } # Write the OPEN sub for the XML tag # sub print_open_callback { my $tag = shift; print "OPEN: $tag\n" if $debug; print OUT "\n\nsub $tag { my( \$expat, \$element, \%", "$tag", "_hh) = \@_; my \$$tag = \\\%", "$tag", "_hh;\n print \"\\nopen: $tag\\n\" if \$debug; "; my $attributes = $tags{ $tag }; if (defined $attributes) { for my $attribute (sort (keys %{ $attributes })) { print "ATTR: $attribute\n" if $debug; print OUT "\n\tmy \$$attribute = \$$tag->\{ $attribute \};"; } for my $attribute (sort (keys %{ $attributes })) { print OUT "\n\tprint \"ATTRIBUTE ($tag) $attribute: \$$attribute\\n\" if \$debug && defined \$$attribute;"; } } print OUT "\n}\n"; } sub print_close_callback { my $tag = shift; print "CLOSE: $tag\n" if $debug; print OUT "\n\nsub $tag", "_ { my( \$expat, \$element) = \@_; print \"close: $tag\", ((\$text) ? \", Text: \\\"\$text\\\"\\n\" : \"\\n\") if \$debug; \$text = \"\"; }" } &main(); 1; =head1 NAME Exgen 1.0 -- A script to generate an XML parser from an XML document instance =head1 DESCRIPTION Exgen takes an XML document instance and creates a 'stubs' style XML parser from it. This parser may then be adapted as necessary. This greatly reduces the work needed in to create an XML parser. Exgen is also useful where there is no access to the relevant DTD or schema. The design objective of Exgen is that it is less tedious to delete callbacks that are not needed than to generate them by hand. Exgen may also be a help for novice XML parser writers. =head1 Synopsis perl -w exgen_1_0.pl --xmldoc myxml.xml --parser myparser.pl perl -w myparser.pl myxml.xml =head1 PREREQUISITES This script requires Getopt::Long =head1 COREQUISITES The generated XML parser requires XML::Parser =head1 Limitations Namespaces are not supported yet. =head1 AUTHOR Jeremy Ellman (jeremy at ellman dot freeserve dot co dot uk) =pod OSNAMES any =cut =pod SCRIPT CATEGORIES CPAN/Administrative Fun/Educational =cut