Advertisement
LightProgrammer000

Noticias [globo]

Sep 28th, 2019
1,323
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.41 KB | None | 0 0
  1. '''
  2. Python: 2
  3. Programa: Crawler
  4. '''
  5.  
  6. # Bibliotecas
  7. import sys
  8. import bs4
  9. import time
  10. import requests
  11.  
  12.  
  13. # Metodo
  14. def utf8():
  15.     reload(sys)
  16.     sys.setdefaultencoding('utf-8')
  17.  
  18.  
  19. while True:
  20.  
  21.     try:
  22.  
  23.         utf8()
  24.  
  25.         # Configuracoes
  26.         url = "https://g1.globo.com"
  27.         cabecalho = {"user-agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0)"}
  28.  
  29.         # Html aperfeicoado
  30.         html = requests.get(url=url, headers=cabecalho).text
  31.         html_super = bs4.BeautifulSoup(html, "html.parser")
  32.  
  33.         # Crawler
  34.         noticias = html_super.find_all(class_="feed-post-link gui-color-primary gui-color-hover")
  35.         imagem = html_super.select("picture > img")
  36.         #imagem = html_super.find_all("img", attrs={"class": "bstn-fd-picture-image"})
  37.  
  38.         print("\n *** Principais noticias *** ")
  39.         for i in range(0, len(noticias)):
  40.  
  41.             print("\n * Noticia [" + str(i+1) + "] : " + noticias[i].get_text())
  42.             print(" - Link: " + noticias[i].get("href"))
  43.  
  44.  
  45.         print("\n *** Principais imagens *** ")
  46.         for j in range(0, len(imagem)):
  47.  
  48.             print("\n - Titulo: " + str(imagem[j].get("title")))
  49.             print(" * Link: " + imagem[j].get("src"))
  50.  
  51.         print("\n\n - Dentro de 10 minutos site atualizara...")
  52.         time.sleep(10)
  53.  
  54.     except Exception as e:
  55.         print e
  56.         pass
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement