Advertisement
nitestryker

dork downloader (PDFs)

Dec 19th, 2023
1,139
0
Never
1
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.80 KB | None | 0 0
  1. #!/bin/python
  2. # Copyright 2023 Nitestryker
  3.  
  4. # This software is provided 'as-is', without any express or implied
  5. # warranty.  In no event will the authors be held liable for any damages
  6. #  arising from the use of this software.
  7.  
  8. #  Permission is granted to anyone to use this software for any purpose,
  9. #  including commercial applications, and to alter it and redistribute it
  10. #  freely, subject to the following restrictions:
  11.  
  12. #  1. The origin of this software must not be misrepresented; you must not
  13. #     claim that you wrote the original software. If you use this software
  14. #     in a product, an acknowledgment in the product documentation would be
  15. #    appreciated but is not required.
  16. #  2. Altered source versions must be plainly marked as such, and must not be
  17. #     misrepresented as being the original software.
  18. #  3. This notice may not be removed or altered from any source distribution.
  19.  
import os
from urllib.parse import quote_plus, unquote

import requests
  22.  
  23. def download_pdf(url, output_path):
  24.     #response = requests.get(url, stream=True)
  25.     response = requests.get(url, stream=True, verify=False) # Disable SSL certificate verification
  26.     if response.status_code == 200:
  27.         with open(output_path, 'wb') as file:
  28.             for chunk in response.iter_content(chunk_size=1024):
  29.                 if chunk:
  30.                     file.write(chunk)
  31.         print(f"Downloaded: {output_path}")
  32.     else:
  33.         print(f"Failed to download: {url} (Status Code: {response.status_code})")
  34.  
  35. def search_and_download(query, num_results, base_url, output_dir):
  36.     # Perform the Google search using the query
  37.     search_url = f"{base_url}/search?q={query}"
  38.     response = requests.get(search_url)
  39.     search_results = response.text
  40.  
  41.     # Create the output directory if it doesn't exist
  42.     if not os.path.exists(output_dir):
  43.         os.makedirs(output_dir)
  44.  
  45.     # Keep track of the number of downloaded PDFs
  46.     downloaded_count = 0
  47.  
  48.     # Extract and download PDF links
  49.     start_index = 0
  50.     while downloaded_count < num_results:
  51.         start_link = search_results.find('<a href="/url?q=', start_index)
  52.         end_link = search_results.find('&amp;', start_link + 1)
  53.         if start_link == -1 or end_link == -1:
  54.             break
  55.  
  56.         url = search_results[start_link + 16: end_link]
  57.         if url.endswith('.pdf'):
  58.             filename = url.split("/")[-1]
  59.             output_path = os.path.join(output_dir, filename)
  60.             download_pdf(url, output_path)
  61.             downloaded_count += 1
  62.  
  63.         start_index = end_link
  64.  
  65. # Set the query, number of results to retrieve, base URL, and output directory
  66. query = 'filetype:pdf programming'
  67. num_results = 10
  68. base_url = "https://www.google.com"
  69. output_dir = "pdfs"
  70.  
  71. # Call the search_and_download function
  72. search_and_download(query, num_results, base_url, output_dir)
  73.  
Advertisement
Comments
  • nitestryker
    1 year (edited)
    # text 0.19 KB | 0 0
    1. This version only works for PDFs because it searches for files that end with the .pdf extension. I am going to make another version in which you will be able to select which file type to download — stay tuned.
    2.  
Add Comment
Please, Sign In to add comment
Advertisement