#!/usr/bin/env python3
# -*- coding: utf-8 -*-
########################################################################
#
# google-hidden.py
# Find results hidden by Google, but accessible via other search engines.
#
# Copy? Right! 2014 Elias Schwerdtfeger, http://www.tamagothi.de/
#
# This program is free software, licensed under the terms of the pirate's
# license. You can do with it whatever you want, as long as you do not
# sue me. If you want to use this program and to sue me for it, please
# buy a commercial license. You can read the full terms of the license
# (in German) at http://www.tamagothi.de/impressum/lizenz/
#
# Share and enjoy!
#
# $Id: google-hidden.py,v 1.2 2014/05/31 15:50:55 elias Exp $
#
# (All helpful comments are intentionally removed.)
#
########################################################################
RESULTS = 200

import sys
import urllib.parse
import urllib.request
import html.parser


class LinkExtractor(html.parser.HTMLParser):

    def __init__(self, htmldoc):
        super().__init__()
        self.links = []
        self.feed(htmldoc)

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for attr, content in attrs:
                if attr == 'href':
                    self.links.append(content)
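# Illustrative sketch (not part of the original script; the input string is an
# assumed example): LinkExtractor collects the href value of every <a> tag it
# is fed.
#
#   LinkExtractor('<a href="http://example.org/">hit</a>').links
#   --> ['http://example.org/']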
class BaseSearchResult(object):

    def __init__(self, search_term):
        super().__init__()
        self.result_links = []
        for uri in self.perform_search(search_term):
            if self.filter_link(uri) and uri not in self.result_links:
                self.result_links.append(uri)
        self.result_links = self.postprocess_links(self.result_links)
        self.result_links.sort()

    def filter_link(self, uri):
        # Accept every link by default; subclasses override this.
        return True

    def perform_search(self, search_term):
        raise NotImplementedError()

    def postprocess_links(self, linklist):
        return linklist

    def get_links_from_uri(self, uri):
        req = urllib.request.Request(uri)
        req.add_header('User-agent', 'Mozilla/5.0')
        httpdocument = urllib.request.urlopen(req)
        link_extractor = LinkExtractor(httpdocument.read().decode('utf-8'))
        return link_extractor.links
class GoogleCommon(BaseSearchResult):

    def common_search_part(self, domain, search_term):
        params = urllib.parse.urlencode({'q': search_term, 'num': RESULTS})
        uri = 'http://{}/search?{}'.format(domain, params)
        return self.get_links_from_uri(uri)

    def postprocess_links(self, linklist):
        newlist = []
        for link in linklist:
            parse_res = urllib.parse.urlparse(link)
            params = urllib.parse.parse_qs(parse_res.query)
            # The '//' check is a hack to keep only complete URIs...
            if 'q' in params and '//' in params['q'][0]:
                newlist.append(params['q'][0])
        return newlist
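# Note (assumption, not stated in the original): at the time this was written,
# a Google result page wrapped each hit in a redirect link roughly like
#
#   /url?q=http://example.org/page&sa=U&ei=...
#
# postprocess_links() above relies on that layout: parse_qs() pulls the 'q'
# parameter out of the query string, and the '//' test keeps only absolute
# URIs while dropping relative navigation links.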
class GoogleDe(GoogleCommon):

    def perform_search(self, search_term):
        return self.common_search_part('www.google.de', search_term)

    def filter_link(self, uri):
        return ('google.de' not in uri and
                'google.com' not in uri and
                'googleusercontent' not in uri and
                not uri.startswith('/search'))


class GoogleCom(GoogleCommon):

    def perform_search(self, search_term):
        return self.common_search_part('www.google.com', search_term)

    def filter_link(self, uri):
        return ('google.com' not in uri and
                'googleusercontent' not in uri and
                not uri.startswith('/search'))


class Yahoo(BaseSearchResult):
    pass


class Bing(BaseSearchResult):
    pass
def not_found_in_google_de(term, engines=(GoogleCom, )):
    not_found = []
    google_de = GoogleDe(term)
    for engine in engines:
        other_engine = engine(term)
        for uri in other_engine.result_links:
            if uri not in google_de.result_links and uri not in not_found:
                not_found.append(uri)
    not_found.sort()
    return not_found


def main():
    search = '+'.join([urllib.parse.quote(i.lower()) for i in sys.argv[1:]])
    for i in not_found_in_google_de(search):
        print(i)


if __name__ == '__main__':
    main()
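# Usage sketch (added for illustration; the search terms and the output line
# are assumed, not real results). Invoked from a shell, the script joins its
# arguments into one search term and prints every URI that www.google.com
# returned but www.google.de did not:
#
#   $ python3 google-hidden.py freie software
#   http://example.org/some-page-missing-from-google.de/
#
# The same comparison can be run from Python directly:
#
#   for uri in not_found_in_google_de('freie+software'):
#       print(uri)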