Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import urllib2
- import re
def get_html(host):
    """Fetch ``https://<host>`` and return the response body.

    The request is sent with a desktop Chrome ``User-Agent`` header.

    Args:
        host: Text appended verbatim to ``'https://'`` to form the URL.

    Returns:
        The body of the response exactly as returned by ``read()``.

    Raises:
        Any exception raised by ``urllib2.urlopen`` or ``read()`` propagates
        to the caller unchanged.
    """
    # The fetched page is also published as the module-level name ``html``.
    # Kept for backward compatibility with code that may read it.
    global html
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_6) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.698.0 Safari/534.24',
    }
    # TODO: pick the scheme from an SSL check instead of hard-coding 'https'.
    req = urllib2.Request('https://' + host, None, headers)
    response = urllib2.urlopen(req)
    try:
        html = response.read()
    finally:
        # Always release the connection, even when read() fails.
        response.close()
    return html
# - TODO: e-mail addresses are still mixed into the returned list; figure out
# - a way to hand them to the output separately
def extract_url(host):
    """Return the ``href`` targets on the page of *host* that mention *host*.

    The page is downloaded with ``get_html(host)`` and scanned with a regular
    expression for ``href=`` attribute values. Only values containing *host*
    as a substring are kept. A value that starts with ``mailto:`` is returned
    without that prefix.

    Args:
        host: Host to download; also the substring used to filter links.

    Returns:
        A list of matching link strings in document order. Duplicates are
        kept.
    """
    html = get_html(host)
    found = []
    for link in re.findall(r'href=[\'"]?([^\'" >]+)', html):
        if host not in link:
            continue
        # Strip the scheme only when it really is the prefix. A plain
        # substring test would truncate ordinary URLs that merely contain
        # 'mailto:' somewhere later in the string.
        if link.startswith('mailto:'):
            link = link[len('mailto:'):]
        found.append(link)
    return found
# - /
# - Add ssl identification
# - Temporary interactive input: the host is read from the prompt until this
# - module gets imported into the framework
host = raw_input('Host: ')
# - /
# - Fix this so it also visits the html of every url it gets from the first run
for link in extract_url(host):
    # One extracted link per output line.
    print(link)
# - /
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement