Advertisement
rplantiko

Extract HTML content from other websites

Dec 10th, 2016
846
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 2.42 KB | None | 0 0
  1. #!/usr/bin/perl
  2.  
  3. # Extract HTML content from a web site (here: Robert Spencer's list of Islamic terror attacks)
  4. # This page is a slightly changed version of the former pastie.org/1857587
  5. # Reason for change: thereligionofpeace.com blocked requests with unusual user agents with HTTP 403
  6. # Therefore, I had to switch from LWP::Simple to LWP::UserAgent
  7.  
  8. use open IO  => ':utf8';
  9. use utf8;
  10. binmode STDOUT, ":encoding(UTF-8)";
  11.  
  12. use strict;
  13. use CGI::Carp qw(fatalsToBrowser);
  14. use HTML::Parser;
  15. use LWP::UserAgent;
  16.  
  17. my ($printstack,$coreTable) = (0,0);
  18.  
  19. my $p = HTML::Parser->new( start_h => [\&start, "tagname, attr, text"],
  20.                            text_h  => [\&text, "text"] ,
  21.                            end_h   => [\&end, "tagname, text"] );
  22. my $year;
  23.  
  24. print qq(Content-Type:text/xml\n\n);
  25.  
  26. print qq(<?xml version="1.0" encoding="utf-8"?>\n);
  27. print qq(<?xml-stylesheet type="text/xsl" href="/attacks/attacks.xsl" ?>);
  28.              
  29. $year = (localtime)[5]+1900;  
  30. my $content = get( "http://www.thereligionofpeace.com/attacks/attacks.aspx?Yr=$year" );
  31. $p->parse(  $content  );
  32.  
  33. exit 0;
  34.  
  35. # Einen HTTP-Content abholen, dabei User Agent "Mozilla" angeben
  36. sub get {
  37.  my $url = shift;
  38.  my $ua = LWP::UserAgent->new;
  39.  $ua->agent('Mozilla/5.0');
  40.  $ua->timeout(10);
  41.  
  42.  my $response = $ua->get($url);
  43.  
  44.  if ($response->is_success) {
  45.      return $response->decoded_content;
  46.  }
  47.  else {
  48.      die $response->status_line;
  49.  }
  50. }
  51.  
  52. # --- Behandler für öffnende Tags
  53. sub start {
  54.   my ($tagname,$attr,$text) = @_;
  55.   $printstack++ if (($attr->{class} eq "quran-table") or $printstack);
  56.   $coreTable = $printstack if ($printstack and is_core_table($tagname,$attr));  
  57.   print prepareStartTag($tagname,$attr) if inside_core_table($tagname);
  58.   }
  59.  
  60. # --- Ausgabe des öffnenden Tags - ohne Attribute
  61. sub prepareStartTag {
  62.   my ($tagname, $attr) = @_;
  63.   return "<".uc $tagname.">";
  64.   }
  65.  
  66. # --- Textinhalt ausgeben  
  67. sub text {
  68.   print shift if $printstack and $coreTable;
  69.   }  
  70.  
  71. # --- Schliessende Tags  
  72. sub end {
  73.   my ($tagname,$text) = @_;
  74.   print "</".uc $tagname.">" if inside_core_table($tagname);
  75.   $printstack-- if $printstack;
  76.   $coreTable = 0 if ($printstack < $coreTable);
  77.  }
  78.  
  79. sub is_core_table {
  80.   my ($tagname,$attr) = @_;
  81.   return ( ($tagname eq "table") and $attr->{cellpadding} );
  82. }
  83.  
  84. sub inside_core_table {
  85.   my $tagname = shift;
  86.   return ( $coreTable and $printstack and ($tagname ne "font") );
  87. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement