Pimp My HTML

#!\perl\bin\perl

# Searches a directory tree for *.html, *.htm or *.jsp files and
# beautifies the HTML code of these files in two passes.
# NOTE: as currently configured in iterate(), recursion is switched off
# (RECURSE => 0) and the filter only accepts file names ending in
# "Spec.html", "Spec.htm" or "Spec.jsp".
# Pass 1: HTML Tidy
#   - DOCTYPE HTML 4.01 transitional
#   - lower-case element and attribute names, attribute values in ""
#   - output in UTF-8
#   - umlauts written directly, not as entities (the text stays readable)
#   - uniform line breaks
# Pass 2: HTML::Parser
#   - replace <body> by <body><div id="content"> (to match the CSS)
#   - clean up line breaks inside comments (Tidy does not do that)
#   - remove <meta name="generator"> (including multiple occurrences)

use strict;
use warnings;

use HTML::Parser;
use File::Iterator;

# Entry point: hand every matching file below the backup directory to pimp().
iterate('\astrotexte\sources_bak', \&pimp );
# Disabled alternative: only print each path relative to \astrotexte\sources.
#iterate('\astrotexte\sources', sub {print "$1\n" if (shift =~ /\\astrotexte\\sources\\(.*)/) ; } );

# iterate($startdir, $do)
#
# Calls the code reference $do once for every file that File::Iterator
# delivers for $startdir, passing the file path as the only argument.
#
#   $startdir - directory handed to File::Iterator as DIR; the iterator is
#               created with RECURSE => 0 (no descent into sub-directories)
#   $do       - code reference, invoked as $do->($file)
#
# Only paths ending in "Spec.html", "Spec.htm" or "Spec.jsp" pass the
# filter.  Returns nothing.
sub iterate {
  my ($startdir,$do) = @_;

  # Direct method call; the indirect form "new Class(...)" is ambiguous
  # to the parser and discouraged.
  my $it = File::Iterator->new(
             DIR     => $startdir,
             RECURSE => 0,
             FILTER  => sub { $_[0] =~ /Spec\.(html?|jsp)$/ },
          );

  # The loop ends on the first false value returned by next()
  while (my $file = $it->next()) {
    $do->($file);
    }

  return;
}

# pimp($source)
#
# Beautifies one HTML/JSP file in two passes and writes the result to
# $target, which is the path of $source with the first "sources_bak"
# replaced by "sources".
#
#   Pass 1: HTML Tidy writes a normalised copy to a temporary file.
#   Pass 2: HTML::Parser streams that copy through the handle_* callbacks,
#           which print to the global RESULT file handle opened here.
#
# Progress goes to STDOUT: the file name, then one dot per finished pass.
# Returns nothing.  A file that Tidy cannot process is reported with warn
# and skipped; failures to open, remove or close a file are fatal.
sub pimp {

  my $source = shift;
  my $target = $source;
  $target =~ s/sources_bak/sources/;
  my $tmpfile = "\\astrotexte\\sources\\_temp";

  print "$source";

# First pass: HTML Tidy

  # The here-document doubles as documentation: its "# ..." remarks are part
  # of the string and are stripped by the substitution, which also joins
  # all options onto a single command line.
  ( my $options = <<OPTIONS ) =~ s/\s*(#[^\n]*)?\n/ /gm;
-o "$tmpfile"      # In temporäres File schreiben
-i                 # indent = Einrückungen
-w 80              # Line wrap bei ca. Position 80
--doctype loose    # HTML 4.01, Tidy erzeugt keine URL der DTD (ist aber OK)
--join-classes 1   # Mehrere CSS-Class-Attributes zu einem zusammenfassen
--output-encoding UTF8  # Ausgabe in Unicode UTF-8
--quiet 1          # Nicht schwätzen
OPTIONS

  # The temp file is shared by all files.  Remove the previous one first,
  # otherwise a failed Tidy run would make us write the content of the
  # previously processed file into this target.
  if (-e $tmpfile) {
    unlink $tmpfile or die "$! - Can't remove $tmpfile";
    }

  `\\tidy\\tidy $options "$source" 2>NUL`;

  # Tidy exit status: 0 = ok, 1 = warnings, 2 = errors (no usable output)
  my $status = $? == -1 ? -1 : $? >> 8;
  if ($status < 0 or $status > 1 or not -e $tmpfile) {
    print "\n";
    warn "HTML Tidy failed for $source (exit status $status) - skipped\n";
    return;
    }

  print ".";

# Second pass: HTML::Parser

# Always just <body> or <body onload="">, all other attributes are removed
# <body>  -->  <body><div id="content">
# </body> -->  </div></body>
# <meta name="generator"> (including multiple occurrences) is removed

  # Open the input first so that an unreadable temp file does not leave a
  # truncated target behind.
  open(my $fh, "<:utf8", $tmpfile )  or die "$! - Can't open $tmpfile for read";
  open(RESULT, ">:utf8", $target)  or die "$! - Can't open $target for write";

  my $p = HTML::Parser->new(
    default_h => [ \&handle_default, 'text'],
    comment_h => [ \&handle_comment, 'text'],
    start_h   => [ \&handle_start,   'tagname,attr,text'],
    end_h     => [ \&handle_end,     'tagname'],
    text_h    => [ \&handle_text,    'text'],
    );

  $p->parse_file($fh) or die "$! - Can't parse $tmpfile";
  close $fh;

  # Buffered write errors only surface when the handle is closed
  close(RESULT) or die "$! - Can't close $target";

  print ".\n";

  return;

}

{

# HTML::Parser callbacks.  All of them write to the global file handle
# RESULT, which the caller must have opened for writing.
#
# State shared by the handlers, private to this block.  The handlers can be
# called before this block itself has been executed at run time (the
# iterate() call sits further up in the file), so neither variable relies
# on its initialiser: an undefined value means "not in <body>" and
# "not in <script>" respectively.
my $body = 0;       # true between <body> and </body>
my $script_text;    # text collected inside the current <script>, else undef

# handle_start($tagname, $attr, $text)
#   $tagname - element name
#   $attr    - hash reference with the attributes of the tag
#   $text    - the start tag as it appeared in the input
# Rewrites <body>, drops <meta name="...generator...">, starts collecting
# the content of <script> and copies every other start tag unchanged.
sub handle_start {

  my ($tagname,$attr,$text) = @_;

  # Script content is delivered as text only, so text still pending at a
  # start tag belongs to a script that was never closed: emit it as is.
  if (defined $script_text) {
    print RESULT $script_text;
    undef $script_text;
    }

  if ($tagname eq "body") {
    # Keep only the onload attribute, drop everything else
    my $onload = $attr->{onload};
    my $att = "";
    if (defined $onload and length $onload) {
      # Attribute values arrive with entities decoded; re-encode the
      # characters that would break a double-quoted attribute.
      $onload =~ s/&/&amp;/g;
      $onload =~ s/"/&quot;/g;
      $att = qq( onload="$onload");
      }
    print RESULT qq(<body$att><div id="content">);
    $body = 1;
    }
  elsif ($tagname eq "meta"
           and $attr->{name}
           and $attr->{name} =~ /generator/i) {
    return;  # suppress the generator tag
    }
  elsif ($tagname eq "script") {
    handle_default( $text );
    $script_text = "";   # handle_end decides where </script> goes
    }
  else {
    handle_default( $text );
    }

  return;
  }

# handle_end($tagname)
# Writes the end tag.  </body> additionally closes the content <div>.
# For </script> the collected content is written in front of the end tag;
# if the content is whitespace only, the end tag directly follows the start
# tag and the whitespace is written after it.
sub handle_end {
  my ($tagname) = @_;
  if ($tagname eq 'body') {
    print RESULT qq(</div></body>);
    $body = 0;
    }
  elsif ($tagname eq 'script') {
    return unless defined $script_text;  # stray </script>, nothing open
    my $content = $script_text;
    undef $script_text;
    if ($content =~ /\S/) {
      print RESULT $content, "</script>";
      }
    else {
      print RESULT "</script>", $content;
      }
    }
  else {
    print RESULT qq(</$tagname>);
    }

  return;
  }

# handle_text($text)
# Copies text to the output, or collects it while inside a <script>.
sub handle_text {
  my $text = shift;

# Outside of <body> (header area) collapse doubled line breaks
  $text =~ s/(?:  )?\n(?:  )?\n/\n/g unless $body;

  if (defined $script_text) {
    $script_text .= $text;
    return;
    }

  print RESULT $text;

  return;
  }

# handle_default($text)
# Copies its argument to the output unchanged.
sub handle_default {
  print RESULT shift;
  }

# handle_comment($text)
# Tidy does not repair 0D/0A (CR/LF) problems inside comments, so line
# breaks are normalised here and doubled line breaks are collapsed.
sub handle_comment {
  my $text = shift;
  $text =~ s/\r\n/\n/g;
  $text =~ s/\n\n/\n/g;
  print RESULT $text;
  }

}