fahadkalil

edd_nltk_sumarizacao

Jun 23rd, 2020
278
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.69 KB | None | 0 0
  1. import nltk
  2. from nltk.corpus import stopwords
  3. from nltk.tokenize import word_tokenize
  4. from nltk.tokenize import sent_tokenize
  5. from nltk.probability import FreqDist
  6.  
  7. from bs4 import BeautifulSoup
  8. from urllib.request import Request, urlopen
  9. from io import StringIO
  10. from string import punctuation
  11.  
  12. from collections import defaultdict
  13. from heapq import nlargest
  14.  
  15. #nltk.download()
  16.  
  ## Fetch the page with urllib's Request and parse it with bs4.

  # The request carries an explicit browser-like User-Agent header
  # (presumably because the site rejects urllib's default one -- TODO confirm).
  link = Request('https://rduirapuru.com.br/geral/fim-do-impasse-empresa-confirma-execucao-do-projeto-para-o-aeroporto-de-passo-fundo-e-estado-da-prazo-para-documentos/',
                 headers={'User-Agent': 'Mozilla/5.0'})

  # Read the body inside a `with` block so the HTTP response is always closed.
  with urlopen(link) as resposta:
      # Bytes that are not valid UTF-8 are dropped ('ignore') instead of raising.
      pagina = resposta.read().decode('utf-8', 'ignore')

  soup = BeautifulSoup(pagina, "lxml")

  # find() returns None when the tag is absent; fail with a clear message
  # rather than an AttributeError on the chained find_all() call.
  artigo = soup.find("article")
  if artigo is None:
      raise ValueError("no <article> element found in the downloaded page")

  paragrafos = artigo.find_all('p')

  # Concatenate the text of every <p>, each followed by a single space.
  texto = "".join(p.text + " " for p in paragrafos)

  # Strip the period from the abbreviation "Ltda." (presumably so the sentence
  # tokenizer does not treat it as the end of a sentence).
  texto = texto.replace("Ltda.", "Ltda")
  35.  
  ## Tokenize with NLTK and score every sentence by word frequency.

  # The stop-word list used below is Portuguese, so tokenize with the matching
  # Punkt model instead of NLTK's English default.
  sentencas = sent_tokenize(texto, language='portuguese')
  palavras = word_tokenize(texto.lower(), language='portuguese')

  # Kept under its own name so the imported `stopwords` corpus reader is not
  # rebound to a set (which would break any later stopwords.words(...) call).
  palavras_ignoradas = set(stopwords.words('portuguese')) | set(punctuation)

  palavras_sem_stopwords = [palavra for palavra in palavras if palavra not in palavras_ignoradas]

  print(palavras_sem_stopwords)

  # Frequency of each remaining (non stop-word, non punctuation) token.
  frequencia = FreqDist(palavras_sem_stopwords)

  # Maps sentence index -> sum of the frequencies of the counted words it contains.
  sentencas_importantes = defaultdict(int)

  for i, sentenca in enumerate(sentencas):
      for palavra in word_tokenize(sentenca.lower(), language='portuguese'):
          # Only tokens present in the frequency table contribute to the score.
          if palavra in frequencia:
              sentencas_importantes[i] += frequencia[palavra]
  54.  
  55.  
  # Indices of the four sentences with the highest accumulated score.
  idx_sentencas_importantes = nlargest(
      4,
      sentencas_importantes,
      key=lambda indice: sentencas_importantes[indice],
  )

  # Emit the header followed by the chosen sentences in their original
  # document order (ascending index), one per line.
  linhas_resumo = ["<<<< RESUMO >>>>"]
  linhas_resumo.extend(sentencas[indice] for indice in sorted(idx_sentencas_importantes))
  print("\n".join(linhas_resumo))
Add Comment
Please, Sign In to add comment