Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# basic_webcrawler_demo.py
#
# Minimal link-extraction demo: fetch one page and print each href value
# that has not been printed before.
import contextlib
import re

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2, which the original script targeted
    from urllib import urlopen

# Matches a quoted href attribute and captures its non-empty value.
# The capture group must not start with a wildcard: a leading "." would
# swallow a closing quote and would also reject one-character links like "/".
_HREF_RE = re.compile(r'''href=["']([^"']+)["']''')

# Every link printed so far, in discovery order; shared across crawl() calls.
crawled_urls = []


def crawl(url):
    """Fetch *url* and print each href value not already in ``crawled_urls``.

    Newly seen values are appended to the module-level ``crawled_urls`` list,
    so repeated calls never print the same link twice. Links are reported
    exactly as written in the page (relative links are not resolved) and are
    not followed.

    Args:
        url: Address of the page to fetch; anything ``urlopen`` accepts.

    Returns:
        None. Results are printed and recorded in ``crawled_urls``.

    Raises:
        IOError: Propagated from ``urlopen`` when the page cannot be fetched.
    """
    # closing() releases the connection on both Python 2 and 3; the Python 2
    # response object is not a context manager on its own.
    with contextlib.closing(urlopen(url)) as response:
        html = response.read()

    if not isinstance(html, str):
        # Python 3 hands back bytes. Decode leniently so a single invalid
        # byte cannot abort the whole page.
        html = html.decode("utf-8", "replace")

    for new_url in _HREF_RE.findall(html):
        if new_url not in crawled_urls:
            print(new_url)
            crawled_urls.append(new_url)


url = 'http://www.yahoo.com/'

# Only hit the network when run as a script, not when imported.
if __name__ == "__main__":
    crawl(url)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement