Advertisement
urksiful

Web Crawler

Nov 7th, 2015
247
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 0.97 KB | None | 0 0
  1. #$ python myPythonCrawler.py http://creadpag.com.ar
  2.  
  3. import sys, thread, Queue, re, urllib, urlparse, time, os, sys
  # Module-level crawler state shared by every worker thread.
  # dupcheck: URLs that have already been queued, so each is fetched once.
  dupcheck = set()
  # q: bounded work queue (at most 100 pending URLs); put() blocks when full.
  q = Queue.Queue(100)
  if len(sys.argv) < 2:
      # Without a start URL there is nothing to crawl; exit with a usage
      # message instead of an IndexError traceback.
      sys.exit("usage: python myPythonCrawler.py <start-url>")
  # Record the seed as seen so a page linking back to exactly this URL does
  # not cause it to be downloaded a second time.
  dupcheck.add(sys.argv[1])
  q.put(sys.argv[1])
  def queueURLs(html, origLink):
      """Extract the anchor links from *html* and queue the ones not seen yet.

      html     -- page source to scan for ``<a ... href="...">`` attributes.
      origLink -- URL the page was fetched from; relative hrefs are resolved
                  against it.

      Each href is resolved to an absolute URL, its ``#fragment`` is removed,
      and the result is put on the module-level queue ``q`` unless it is
      already recorded in the module-level set ``dupcheck``.  Links that do
      not resolve to http/https (``mailto:``, ``javascript:`` ...) are skipped.
      """
      # The capture group accepts any non-empty href; the previous
      # ``.[^"']+`` form needed at least two characters and so missed "/".
      for href in re.findall(r'''<a[^>]+href=["']([^"']+)["']''', html, re.I):
          # urljoin handles absolute, root-relative, document-relative and
          # protocol-relative hrefs; urldefrag drops the "#fragment" part.
          link = urlparse.urldefrag(urlparse.urljoin(origLink, href))[0]
          if not link.lower().startswith(("http://", "https://")):
              continue
          if link in dupcheck:
              continue
          dupcheck.add(link)
          # Bound the memory used by the seen-set: once it grows past 99999
          # entries it is reset, at the cost of possibly revisiting pages.
          if len(dupcheck) > 99999:
              dupcheck.clear()
          q.put(link)
  def getHTML(link):
      """Download *link*, save the page to disk and queue the links it contains.

      The page body is written to a new ``<time.time()>.html`` file in the
      current directory, preceded by one line recording the source URL, and
      is then passed to ``queueURLs``.  This function is used as a thread
      entry point, so a failure on one page is reported on stderr and the
      page is skipped rather than the error propagating.
      """
      try:
          response = urllib.urlopen(link)
          try:
              html = response.read()
          finally:
              response.close()
          # The header used to be built with ``"" % link``, which always
          # raises TypeError (no format specifier), so nothing was ever saved
          # or queued.  NOTE(review): the intended header text is not
          # recoverable from this paste; an HTML comment holding the URL is
          # assumed -- confirm.
          with open(str(time.time()) + ".html", "w") as page_file:
              page_file.write("<!-- %s -->" % link + "\n" + html)
          queueURLs(html, link)
      except Exception as err:
          # KeyboardInterrupt and SystemExit do not derive from Exception, so
          # they still propagate without an explicit re-raise clause.
          sys.stderr.write("getHTML: skipping %s (%s)\n" % (link, err))
  # Dispatcher: take the next pending URL off the queue (blocking until one
  # is available), hand it to a fresh worker thread running getHTML, then
  # pause half a second before starting the next one.
  while True:
      next_url = q.get()
      thread.start_new_thread(getHTML, (next_url,))
      time.sleep(0.5)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement