Advertisement
NokitaKaze

Парсер и сабмиттер rss для booru

Jul 5th, 2013
187
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
PHP 6.72 KB | None | 0 0
  1. <?php
  2. /*
  3.  http://danbooru.donmai.us/post/atom?tags=
  4.  http://chan.sankakucomplex.com/post/atom?tags=
  5.  http://gelbooru.com/index.php?page=atom&tags=
  6. */
  7.  $id      =(int)($_GET['id']);
  8.  $sad_hide=true;
  9.  
  10.  $q2_c=mysql_fetch_assoc(mysql_query('select * from `'.$sad_prefix.'booru_feed` where `id`='.$id));
  11.  if ((int)$q2_c['id']<1){
  12.   header('http/1.0 404 not found');
  13.   echo 'not found';
  14.   return;
  15.  }
  16.  
  17.  if ((int)($q2_c['ttl'])==0){$q2_c['ttl']=3*3600;}
  18.  if (($q2_c['time']+$q2_c['ttl']>=time() and strlen($q2_c['rss'])>0) or
  19.      (!iam() and ($_SERVER['REMOTE_ADDR']!==my_ip())) ){
  20.   $sad_contype='text/xml';
  21.   echo '<'.'?xml version="1.0" encoding="utf-8"?'.'>';
  22.   echo $q2_c['rss'];
  23.   echo '<!-- cached: '.date('Y-m-d H:i:sO',$q2_c['time']).' -->';
  24.   return;
  25.  }
  26.  
  27.  if (!iam() and ($_SERVER['REMOTE_ADDR']!==my_ip()) and (int)sad_option('dis_booru')==1){header('http/1.0 503');return;}
  28.  
  29.  function numnum($img){
  30.   if (!preg_match('|https?://([0-9a-z.-]+)|',$img,$a)){return 0;}
  31.   if ($a[1]=='danbooru.donmai.us'){return 11;}
  32.   if ($a[1]=='kanaria.ru'){return 10;}
  33.   if ($a[1]=='gelbooru.com'){return 9;}
  34.   if ($a[1]=='chan.sankakucomplex.com'){return 8;}
  35.  
  36.  
  37.  
  38.   return 1;
  39.  }
  40.  
  41.  
  42.  $feed=array();set_time_limit(20*60);$debug=true;
  43.  
  44.  if ($debug){$fo=fopen($sad_prefix.'/debug','a');
  45.   fwrite($fo,date('Y-m-d H:i:s').': start = '.$id."\r\n");fclose($fo);}
  46.  
  47. // Перебираем все дочерные фиды
  48.  $q   =mysql_query('select * from `'.$sad_prefix.'booru_rss` where `parent`='.$id);
  49.  while ($q_c=mysql_fetch_assoc($q)){
  50.   if ($debug){
  51.    $fo=fopen($sad_prefix.'/debug','a');
  52.    fwrite($fo, date('Y-m-d H:i:s').': q_c id = '.$q_c['id'].'; url = "'.
  53.                $q_c['url'].'"; time = '.date('Y-m-d H:iO',$q_c['time'])."\r\n");
  54.    fclose($fo);
  55.   }
  56.  
  57.   //pull a feed
  58.   if ($q_c['time']+6*3600<time() or strlen($q_c['rss'])<1){
  59.    $ch=curl_init($q_c['url']);
  60.    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
  61.    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
  62.    curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
  63.    $q_c['rss']=curl_exec($ch);
  64.    curl_close($ch);
  65.    mysql_query('update `'.$sad_prefix.'booru_rss` set `time`=UNIX_TIMESTAMP(), `rss`="'.
  66.                sad_safe_mysql($q_c['rss']).'" where `id`='.$q_c['id']);
  67.    
  68.   }
  69.  
  70.   //danbooru pool
  71.   if (preg_match('|^https?://'.$sad_domain.'/booru/danpool\\.php\\?id=([0-9]+)|',$q_c['url'],$a)){
  72.    // Делаем финт ушами и читаем не из rss, а из своей же базы данных
  73.    $q3=mysql_query('select * from `'.$sad_prefix.'booru_pool_e` where `parent`='.$a[1]);
  74.    
  75.    if ($debug){$fo=fopen($sad_prefix.'/debug','a');
  76.     fwrite($fo,date('Y-m-d H:i:s').': q_c id = '.$q_c['id'].'; danbooru pool = '.(int)$a[1].'; count = '.mysql_num_rows($q2)."\r\n");fclose($fo);}
  77.    
  78.    while ($q3_c=mysql_fetch_assoc($q3)){
  79.     $feed[$q3_c['md5']]=array(
  80.      'title'=>$q3_c['title'],
  81.      'links'=>array('http://danbooru.donmai.us/posts/'.$q3_c['dan_id'].'/'),
  82.      'href' =>'http://danbooru.donmai.us/posts/'.$q3_c['dan_id'].'/',
  83.      'img'  =>'http://danbooru.donmai.us'.$q3_c['src'],
  84.      'time' =>0
  85.     );
  86.    }
  87.    
  88.    continue;
  89.   }
  90.  
  91.   //other places
  92.   if ($q_c['time']+6*3600<time() or strlen($q_c['rss'])<1){
  93.    $ch=curl_init($q_c['url']);
  94.    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
  95.    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
  96.    curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
  97.    $q_c['rss']=curl_exec($ch);
  98.    curl_close($ch);
  99.    mysql_query('update `'.$sad_prefix.'booru_rss` set `time`=UNIX_TIMESTAMP(), `rss`="'.sad_safe_mysql($q_c['rss']).
  100.                '" where `id`='.$q_c['id']);
  101.    
  102.   }
  103.  
  104.   preg_match('|^https?://([a-z0-9.-]+)|',$q_c['url'],$a);$domain=$a[1];
  105.   $xml=sad_xml_rawtext($q_c['rss']);
  106.   for ($i=1;true;$i++){
  107.    $ent=sad_xml_getnode($xml,'entry|'.$i);
  108.    if (gettype($ent)!=='object'){break;}
  109.  
  110.    $tmp  = new DOMDocument('1.0');
  111.    $tmp->appendChild($tmp->importNode(sad_xml_getnode($ent,'content'),true));
  112.    $buf = $tmp->saveHTML();
  113.    
  114.    preg_match('|([a-f0-9]{32,32})|',$buf,$a); $md5=$a[1];
  115.    if (!isset($feed[$md5])){$feed[$md5]=array('time'=>0,'links'=>array());}
  116.    preg_match('|src="(.+?)"|i',$buf,$a);$img=$a[1];
  117.  
  118.    if (preg_match('|^/|',$img)){$img='http://'.$domain.$img;}
  119.    if (numnum($feed[$md5]['img'])<numnum($img)){$feed[$md5]['img']=$img;}
  120.  
  121.    preg_match('|([0-9]{4,4})\-([0-9]{1,2})\-([0-9]{1,2})T([0-9]{1,2})\:([0-9]{1,2})\:([0-9]{1,2})Z|',sad_xml_getnode($ent,'updated#'),$a);//updated>2011-09-13T04:12:12Z
  122.    $time=gmmktime($a[4],$a[5],$a[6], $a[2],$a[3],$a[1]);
  123.    if ($feed[$md5]['time']>$time or $feed[$md5]['time']<1){$feed[$md5]['time']=$time;}
  124.  
  125.    if (strlen($feed[$md5]['title'])<1){$feed[$md5]['title']=sad_xml_getnode($ent,'title#'); }
  126.  
  127.    preg_match('|href="(.+?)"|i',$buf,$a);$href=$a[1];
  128.    if (numnum($feed[$md5]['href'])<numnum($href)){$feed[$md5]['href']=$href;}
  129.    
  130.    $xml=sad_xml_rawtext($q_c['rss']);
  131.    for ($j=1;true;$j++){
  132.     $link=sad_xml_getnode($ent,'link|'.$j);
  133.     if (gettype($link)!=='object'){break;}
  134.     array_push($feed[$md5]['links'],sad_xml_attribute($link,'href'));
  135.    }
  136.    
  137.   }
  138.   if ($debug){$fo=fopen($sad_prefix.'/debug','a');
  139.    fwrite($fo,date('Y-m-d H:i:s').': q_c id = '.$q_c['id'].'; i = '.$i."\r\n");fclose($fo);}
  140.  } // while $q_c
  141.  
  142.  if ($debug){$fo=fopen($sad_prefix.'/debug','a');
  143.   fwrite($fo,date('Y-m-d H:i:s').': count of feed = '.count($feed)."\r\n");fclose($fo);}
  144. // echo '<pre>';print_r($feed);
  145.  
  146.  $buf='';unset($md5);$time=0;
  147.  foreach ($feed as $md5 => $e){
  148.   if ($md5==''){continue;}
  149.   $descr='<a href="'.$e['href'].'" target="_blank"><img src="'.$e['img'].'"></a><br>';
  150.   foreach (array_unique($e['links']) as $link){
  151.    if (strlen($link)>0){$descr.='<br><a href="'.$link.'" target="_blank">'.$link.'</a>';}
  152.   }
  153.   $buf.='<item>
  154.      <title>'.sad_safe_xml($e['title']).'</title>
  155.      <link>'.sad_safe_xml($e['href']).'</link>
  156.      <description>'.sad_safe_xml($descr).'</description>
  157.      <pubDate>'.gmdate("D, d M Y H:i:s \G\M\T",$e['time']).'</pubDate>
  158.      <guid>booru-'.$md5.'</guid>
  159.    </item>';
  160.  
  161.   $time=max($time,$e['time']);
  162.  }
  163.  
  164.  if ($debug){$fo=fopen($sad_prefix.'/debug','a');
  165.   fwrite($fo,date('Y-m-d H:i:s').': strlen($buf) = '.strlen($buf)."\r\n");fclose($fo);}
  166.  $buf='<rss version="2.0"><channel><title>'.sad_safe_xml($q2_c['title'].' / Meta Booru').'</title><pubDate>'.
  167.        gmdate("D, d M Y H:i:s \G\M\T",$time).'</pubDate><ttl>'.round($q2_c['ttl']/60).'</ttl>'.$buf.'</channel></rss>';
  168.  
  169.  $sad_contype='text/xml';
  170.  echo '<'.'?xml version="1.0" encoding="utf-8"?'.'>';
  171.  echo $buf;
  172.  echo '<!-- generated now: '.date('Y-m-d H:i:sO').' -->';
  173.  mysql_query('update `'.$sad_prefix.'booru_feed` set `rss`="'.sad_safe_mysql($buf).'", `time`=UNIX_TIMESTAMP() where `id`='.$id);
  174. ?>
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement