#!/usr/bin/perl -w
# char2ent.pl
#
# Simple utility to convert files with &#ddd; to/from 8bit chars
# See usage at end of this file ( or ./char2ent -h )
# PS works only with 8bit chars, not talking about UTF-16 Unicode here
#
# mode=html (default)
#   Convert 8bit chars (with high bit set) to html entity &#ddd;
#
# mode=work:
#   Convert html entities &#ddd; to the corresponding 8bit char
#
# Christophe Chisogne

use Getopt::Long;
use strict;
use warnings;

my $PROG    = 'char2ent';      # prog name to display
my $VERSION = '0.02';
my $DATE    = '2003/11/07';
my $BACK    = 'bak';           # extension for backup files

# vars from CLI options
my ($mode, $backup, $confirm, $keep, $version, $help);
$mode = 'html';

# Any option Getopt::Long rejects sends the user to the usage text.
my $resopt = GetOptions(
    'version|v' => \$version,
    'help|h'    => \$help,
    'mode=s'    => \$mode,
    'backup|b'  => \$backup,
    'confirm|c' => \$confirm,
    'keep|k'    => \$keep,
) or usage();

version() if defined $version;
usage()   if (@ARGV != 1) || (defined $help);

# $conv holds a reference to the per-line converter selected by --mode.
my $conv;
if ($mode =~ /html/i) {
    print "Conversion from 8bit chars to &#ddd; entities\n";
    $conv = \&char2ent;
}
elsif ($mode =~ /work/i) {
    print "Conversion from &#ddd; entities to 8bit chars\n";
    $conv = \&ent2char;
}
else {
    usage();
}

# Latin1 convert table taken (thanks awk ;-) from
# http://www.w3.org/TR/html401/sgml/entities.html
#
# Portions © International Organization for Standardization 1986
# Permission to copy in any form is granted for use with
# conforming SGML systems and applications as defined in
# ISO 8879, provided this notice is included in all copies.
# warning, case sensitive for matches
#
# Names of the HTML 4.01 Latin-1 character entities in code-point order:
# the first name stands for character 160, the last for character 255.
my @latin1_names = qw(
    nbsp   iexcl  cent   pound  curren yen    brvbar sect
    uml    copy   ordf   laquo  not    shy    reg    macr
    deg    plusmn sup2   sup3   acute  micro  para   middot
    cedil  sup1   ordm   raquo  frac14 frac12 frac34 iquest
    Agrave Aacute Acirc  Atilde Auml   Aring  AElig  Ccedil
    Egrave Eacute Ecirc  Euml   Igrave Iacute Icirc  Iuml
    ETH    Ntilde Ograve Oacute Ocirc  Otilde Ouml   times
    Oslash Ugrave Uacute Ucirc  Uuml   Yacute THORN  szlig
    agrave aacute acirc  atilde auml   aring  aelig  ccedil
    egrave eacute ecirc  euml   igrave iacute icirc  iuml
    eth    ntilde ograve oacute ocirc  otilde ouml   divide
    oslash ugrave uacute ucirc  uuml   yacute thorn  yuml
);

# Maps each named entity (e.g. "&eacute;") to its numeric form ("&#233;").
my %latin1 = map { ( "&$latin1_names[$_];" => '&#' . ( 160 + $_ ) . ';' ) }
    0 .. $#latin1_names;

# Main loop: convert every file named on the command line, asking first
# when --confirm was given. Any answer containing "n" skips the file.
my $ok = 'y';
foreach my $filename (@ARGV) {
    if (defined $confirm) {
        print "Convert file [$filename]? [Yn] ";
        $ok = <STDIN>;
        $ok = '' unless defined $ok;    # EOF on STDIN: fall back to the default (yes)
    }
    unless ($ok =~ /n/i) {
        print "Converting file [$filename]...\n";
        convertfile($filename);
    }
}

exit 0;

# convertfile($filename)
#
# Runs every line of $filename through the converter referenced by $conv,
# writing the result to "$filename.<pid>", then renames that temporary file
# over the original. When --backup was given the original is first renamed
# to "$filename.$BACK". Dies with the system error text on any I/O failure.
sub convertfile {
    my $filename = shift;
    my $tmpname  = "$filename.$$";

    # Three-argument open: the file name can never be taken for a mode.
    open my $in,  '<', $filename or die "Cant open $filename: $!\n";
    open my $out, '>', $tmpname  or die "Cant write $tmpname: $!\n";

    while (my $line = <$in>) {
        print {$out} $conv->($line);
    }

    close $in;
    # Buffered write errors only surface at close time, so check it.
    close $out or die "Cant write $tmpname: $!\n";

    if ($backup) {
        rename($filename, "$filename.$BACK")
            or die "Cant backup $filename.$BACK: $!\n";
    }
    rename($tmpname, $filename)
        or die "Cant write $filename from $tmpname: $!\n";
}

# $line2 = char2ent($line)
#
# Returns a copy of $line in which every character with a code above 127
# is replaced by its decimal entity &#ddd;.
sub char2ent {
    my $line = shift;
    $line =~ s/([^\x00-\x7f])/'&#' . ord($1) . ';'/ge;
    return $line;
}

# $line2 = ent2char($line)
#
# Returns a copy of $line with three-digit &#ddd; entities turned back into
# single 8bit chars. Unless --keep was given, the named Latin-1 entities
# (&eacute; etc) are first rewritten to their &#ddd; form so that they are
# converted as well.
sub ent2char {
    my $line = shift;

    # first change all &name; entities to &#ddd; unless told otherwise;
    # names that are not in the table are left untouched
    unless (defined $keep) {
        $line =~ s{(&[A-Za-z0-9]+;)}{ exists $latin1{$1} ? $latin1{$1} : $1 }ge;
    }

    # then &#ddd; to 8bit char; values above 255 do not fit in one byte
    # and are left as entities
    $line =~ s{&#(\d\d\d);}{ $1 <= 255 ? chr($1) : "&#$1;" }ge;

    return $line;
}

# version()
#
# Prints the program banner, then hands over to usage(); exits with
# status 0 if usage() returns.
sub version {
    print "$PROG v$VERSION, $DATE\n\n",
          "Convert files with 8bit chars to/from &#ddd; entities\n",
          "Can convert &name; entities from latin1 (160-255)\n",
          "\n";
    usage();
    exit 0;
}

# usage()
# NOTE(review): the body of usage() is cut off in the visible source; its
# opening is kept verbatim below so that whatever follows still attaches.
sub usage { print <