#!/usr/bin/perl
#
# Converts the Reuters SGML source into valid XML files.
# USAGE: unpack the reuters tar.gz file and then run:
#   perl 2xml.pl *sgm
#
# For every input file "NAME.sgm" (or "NAME.sgml") a directory "NAME_xml" is
# created, and every <reuters ... newid="N"> ... </reuters> element found in
# the file is written to "NAME_xml/N.xml".
#
use strict;
use warnings;
use File::Slurp;
use File::Path;

# Name of the element that wraps one document (matched case-insensitively).
my $magic_tag = 'reuters';

# One whole document element: opening tag carrying newid="N", the body, and
# the matching closing tag.  Attribute matching is limited to [^>]* so that a
# tag without a newid attribute cannot make the match run on into the next
# document.  Capture 1 is the whole element, capture 2 is the numeric id.
my $doc_re = qr{
    (
        <\Q$magic_tag\E\b [^>]*? \bnewid="(\d+)" [^>]* >
        .*?
        </\Q$magic_tag\E\s*>
    )
}xsi;

FILE:
for my $file (@ARGV) {

    # Derive the output directory from the file name.  If the name does not
    # end in .sgm/.sgml the directory would collide with the input file
    # itself, so skip such arguments instead of failing in mkpath.
    my $dir = $file;
    unless ($dir =~ s/\.sgml?$/_xml/) {
        warn "skipping $file: name does not end in .sgm or .sgml\n";
        next FILE;
    }

    my $buf = read_file($file);
    mkpath([$dir], 1);    # second argument: print each directory created

    while ($buf =~ m/$doc_re/g) {
        my $doc = $1;
        my $id  = $2;

        # get rid of control chars that aren't xml compliant:
        # numeric character references 0..32 first, then raw control bytes
        $doc =~ s/&#\d;/\n/g;
        $doc =~ s/&#(?:[12]\d|30|31|32);/\n/g;
        $doc =~ s/[\x00-\x1f]/\n/g;

        # lowercase all tags
        $doc =~ s/(<.+?>)/lc($1)/ges;

        # no decls.  The pattern must be spelled out: an empty pattern would
        # make perl reuse the last successful regex (the tag matcher above)
        # and strip every tag from the document.
        $doc =~ s/<![^>]*>//g;

        my $new = "$dir/$id.xml";

        #print "writing doc to $new\n";
        write_file($new, $doc);
    }
}