# basic_webcrawler_demo.py

# Fetches a single page and prints each distinct href target found in it.

import re, urllib
# Compiled once at import time. The capture group takes one or more non-quote
# characters, so one-character targets such as "/" are matched and an empty
# href="" cannot swallow the attribute that follows it.
_HREF_RE = re.compile(r'''href=["']([^"']+)["']''')

# Every href target printed so far, in first-seen order; shared across calls.
crawled_urls = []


def crawl(url):
    """Fetch *url* and print each href target not already in ``crawled_urls``.

    The page body is scanned with a regular expression for quoted ``href``
    attribute values. Each value that has not been recorded before is printed
    on its own line and appended to the module-level ``crawled_urls`` list.
    Links are only collected from this one page; they are not followed.

    Args:
        url: Address of the page to fetch, in any form ``urlopen`` accepts.

    Returns:
        None. Results are reported through stdout and ``crawled_urls``.

    Errors raised while opening or reading *url* are not caught here.
    """
    # Function-scope import so the block works on both major Python versions
    # without touching the file's top-level imports.
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    response = urlopen(url)
    try:
        page = response.read()
    finally:
        # Release the connection even when read() fails.
        response.close()

    # Python 3 returns bytes; Python 2 already returns str and is left as is.
    if not isinstance(page, str):
        page = page.decode("utf-8", "replace")

    # Snapshot for O(1) membership tests; the list itself keeps the order.
    seen = set(crawled_urls)
    for new_url in _HREF_RE.findall(page):
        if new_url in seen:
            continue
        seen.add(new_url)
        print(new_url)
        crawled_urls.append(new_url)
# Demo run: scan the start page below as soon as the script is executed.
url = "http://www.yahoo.com/"
crawl(url=url)