Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/perl
- # Extract HTML content from a web site (here: Robert Spencer's list of Islamic terror attacks)
- # This page is a slightly changed version of the former pastie.org/1857587
- # Reason for change: thereligionofpeace.com blocked requests with unusual user agents with HTTP 403
- # Therefore, I had to switch from LWP::Simple to LWP::UserAgent
- use open IO => ':utf8';
- use utf8;
- binmode STDOUT, ":encoding(UTF-8)";
- use strict;
- use CGI::Carp qw(fatalsToBrowser);
- use HTML::Parser;
- use LWP::UserAgent;
- my ($printstack,$coreTable) = (0,0);
- my $p = HTML::Parser->new( start_h => [\&start, "tagname, attr, text"],
- text_h => [\&text, "text"] ,
- end_h => [\&end, "tagname, text"] );
- my $year;
- print qq(Content-Type:text/xml\n\n);
- print qq(<?xml version="1.0" encoding="utf-8"?>\n);
- print qq(<?xml-stylesheet type="text/xsl" href="/attacks/attacks.xsl" ?>);
- $year = (localtime)[5]+1900;
- my $content = get( "http://www.thereligionofpeace.com/attacks/attacks.aspx?Yr=$year" );
- $p->parse( $content );
- exit 0;
- # Einen HTTP-Content abholen, dabei User Agent "Mozilla" angeben
- sub get {
- my $url = shift;
- my $ua = LWP::UserAgent->new;
- $ua->agent('Mozilla/5.0');
- $ua->timeout(10);
- my $response = $ua->get($url);
- if ($response->is_success) {
- return $response->decoded_content;
- }
- else {
- die $response->status_line;
- }
- }
- # --- Behandler für öffnende Tags
- sub start {
- my ($tagname,$attr,$text) = @_;
- $printstack++ if (($attr->{class} eq "quran-table") or $printstack);
- $coreTable = $printstack if ($printstack and is_core_table($tagname,$attr));
- print prepareStartTag($tagname,$attr) if inside_core_table($tagname);
- }
- # --- Ausgabe des öffnenden Tags - ohne Attribute
- sub prepareStartTag {
- my ($tagname, $attr) = @_;
- return "<".uc $tagname.">";
- }
- # --- Textinhalt ausgeben
- sub text {
- print shift if $printstack and $coreTable;
- }
- # --- Schliessende Tags
- sub end {
- my ($tagname,$text) = @_;
- print "</".uc $tagname.">" if inside_core_table($tagname);
- $printstack-- if $printstack;
- $coreTable = 0 if ($printstack < $coreTable);
- }
- sub is_core_table {
- my ($tagname,$attr) = @_;
- return ( ($tagname eq "table") and $attr->{cellpadding} );
- }
- sub inside_core_table {
- my $tagname = shift;
- return ( $coreTable and $printstack and ($tagname ne "font") );
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement