Advertisement
illpastethat

Craigslist python search

Jun 4th, 2014
582
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.05 KB | None | 0 0
import fnmatch
import io
import re
import time
import urllib2

from bs4 import BeautifulSoup

  7. cities = ["washingtondc", "miami", "sfbay", "newyork", "losangeles", "sandiego"] #Craigslist subdomains
  8. keywords = ["web*", "wordpress", "site"] # What you want to search for...
  9. category = "cpg" # Craigslist category key
  10.  
  11. header = """<!DOCTYPE html>
  12.        <html lang="en">
  13.        <head>
  14.             <meta charset="utf-8"> 
  15.            <title>New Craigslist listings</title>
  16.            <link href="http://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.1.1/css/bootstrap.min.css" rel="stylesheet">
  17.            <link href='http://fonts.googleapis.com/css?family=Milonga' rel='stylesheet' type='text/css'>
  18.        </head>
  19.        <body>
  20.        """
  21. html = ""
  22.  
  23. for city in cities:
  24.     link = "http://" + city + ".craigslist.org"
  25.     url = urllib2.urlopen(link + "/" + category) # Fetches category page
  26.     content = url.read()
  27.     soup = BeautifulSoup(content)
  28.  
  29.     for a in soup.findAll('a',href=True): # This checks each link on the page
  30.         if re.findall('/cpg/*', a['href']):
  31.             for word in keywords: # This checks each keyword in the link
  32.                 matches = fnmatch.filter(str(a.contents).split(), word)
  33.                 if matches:
  34.                     listing = link + str(a['href'])
  35.                     print listing
  36.                    
  37.                     url = urllib2.urlopen(listing) # individual listing
  38.                     content = url.read()
  39.                     soup = BeautifulSoup(content)
  40.                    
  41.                     title = soup.title.string
  42.                     for match in matches:
  43.                         title = title.replace(str(match), '<b>' + str(match) + '</b>')
  44.                     body = ""
  45.                     body = str(soup.find(id="postingbody"))
  46.                    
  47.                     try:
  48.                         # Finds email address to reply to
  49.                         reply = soup.find("a", {"id": "replylink"})['href']
  50.                         url = urllib2.urlopen(link + reply)
  51.                         content = url.read()
  52.                         soup = BeautifulSoup(content)
  53.                        
  54.                         email = str(soup.find(class_='gmail')).replace(">gmail", ">Reply by email").replace(";su=", ";su=RE: ")
  55.                         email = email.replace("&amp;body=%0A%0A", " - I can help&amp;body=")
  56.                         email = email.replace('%0A"', '%0A%0AHello,%0A%0A%0A%0A--%0AAdam Bloom%0A571.969.ADAM (2326)%0Aadamc.bloom@gmail.com"')
  57.                     except:
  58.                         email = "Contact info in post"
  59.                    
  60.                     title = '<h2><a href="' + listing + '">' + title + '</a> [' + city + ']</h2>'
  61.                     email = '<h5>' + email + '</h5>'
  62.                     html = html + title + unicode(body, "utf-8") + email + '<hr>'
  63.                    
  64.     print "finished " + city
  65. f = open('C:\Users\Adam\Desktop\craigslist.html', 'w')
  66. html = header + html
  67. f.write(html.encode('utf8'))
  68. print "looks like we finished!"
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement