metalx1000

Very Basic BASH Webcrawler

Dec 14th, 2014
#!/bin/bash

#get the seed URL from the command line
url="$1"
#create a unique file name for this crawl
searched="$(date +%s).lst"
touch "$searched"
echo "Saving to $searched"

crawl(){
    local url="$1"

    #s will be used to check if the url has already been searched
    local s=0

    #check the searched file for the current url (quiet, exact fixed-string line match)
    grep -qxF "$url" "$searched" && s=1 || s=0

    #if the current url has already been searched, skip it
    if [[ $s == 0 ]];
    then
        #add the current url to the searched list
        echo "$url" >> "$searched"
        echo "Crawling ${url}..."
        #extract all urls on the page, one per line, deduplicated
        lynx --dump "$url"|\
            sed 's/http/\nhttp/g'|\
            grep -e "^http:" -e "^https:"|\
            sed 's/%3A%2F%2F/:\/\//g'|\
            sort -u| while read -r line
                do
                    #recurse into each url found on the page
                    crawl "$line"
                done
    else
        #reset the searched check
        s=0
    fi
}

#start crawling from the seed URL
crawl "$url"
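The extraction pipeline works because lynx --dump ends its output with a numbered References section, one URL per line: the first sed moves each occurrence of http to the start of its own line, grep keeps only lines that begin with a scheme, the second sed decodes any percent-encoded :// left inside a kept URL, and sort -u removes duplicates. Here is a minimal sketch of the same pipeline run over hand-written sample input (the a.example and b.example URLs are made up for illustration):

#feed References-style sample lines through the script's pipeline
printf '   1. http://a.example/one\n   2. https://b.example/two\n   3. http://a.example/one\n'|\
    sed 's/http/\nhttp/g'|\
    grep -e "^http:" -e "^https:"|\
    sed 's/%3A%2F%2F/:\/\//g'|\
    sort -u
#prints:
#http://a.example/one
#https://b.example/two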
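To try it out, a quick sketch assuming the paste is saved as crawler.sh (a name chosen here for illustration) and that lynx is installed:

#make the script executable and point it at a seed URL
chmod +x crawler.sh
./crawler.sh "http://example.com"

The crawl recurses into every new URL it finds and only stops once it runs out of unseen links, so expect to end it yourself with Ctrl-C.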