Advertisement
here2share

# basic_webcrawler_demo.py

Jul 25th, 2015
394
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 0.32 KB | None | 0 0
  # basic_webcrawler_demo.py
"""Basic web crawler demo: fetch one page and print each newly seen link."""

import contextlib
import re
import urllib

try:  # Python 3 moved urlopen into urllib.request
    from urllib.request import urlopen
except ImportError:  # Python 2
    urlopen = urllib.urlopen

# Quoted href attribute value of one or more non-quote characters.
# The capture group must not start with a bare "." wildcard: that would
# skip one-character links such as href="/" and let an empty href=""
# swallow the closing quote plus whatever attribute follows it.
_HREF_RE = re.compile(r'''href=["']([^"']+)["']''')

# Every link reported so far, in discovery order; shared across crawl() calls.
crawled_urls = []


def crawl(url):
    """Fetch *url* and print every href value not reported before.

    Each newly found link is printed on its own line and appended to the
    module-level ``crawled_urls`` list, so repeated calls never report the
    same link twice. Links are reported exactly as written in the page
    (relative links are not resolved) and are not followed.

    Args:
        url: Address of the page to fetch; any scheme ``urlopen`` accepts.

    Raises:
        IOError: If the page cannot be fetched (``URLError`` on Python 3
            is a subclass of ``OSError``/``IOError``).
    """
    # closing() guarantees the connection is released even if read() fails.
    with contextlib.closing(urlopen(url)) as response:
        html = response.read()

    # Python 3 returns bytes; decode so the text pattern applies and the
    # printed links are plain strings. On Python 2 the body is already str.
    if not isinstance(html, str):
        html = html.decode("utf-8", "replace")

    # Set lookups keep de-duplication O(1) per link; the list keeps order.
    seen = set(crawled_urls)
    for new_url in _HREF_RE.findall(html):
        if new_url not in seen:
            print(new_url)
            seen.add(new_url)
            crawled_urls.append(new_url)


url = 'http://www.yahoo.com/'

# Only hit the network when run as a script, not when imported.
if __name__ == "__main__":
    crawl(url)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement