import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
from io import StringIO
from string import punctuation
from collections import defaultdict
from heapq import nlargest
# First run only: download the tokenizer models and the stopword lists
# nltk.download('punkt')
# nltk.download('stopwords')
## using Request and bs4
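# A browser-like User-Agent header avoids the HTTP 403 that some sites
# return to the default urllib user agent.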
link = Request('https://rduirapuru.com.br/geral/fim-do-impasse-empresa-confirma-execucao-do-projeto-para-o-aeroporto-de-passo-fundo-e-estado-da-prazo-para-documentos/',
               headers={'User-Agent': 'Mozilla/5.0'})
pagina = urlopen(link).read().decode('utf-8', 'ignore')
soup = BeautifulSoup(pagina, "lxml")
paragrafos = soup.find("article").find_all('p')
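# This assumes the article body lives inside an <article> tag; if the site
# layout differs, the selector above needs adjusting.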
texto_buf = StringIO()
for p in paragrafos:
    texto_buf.write(p.text + " ")
texto = texto_buf.getvalue()
# Drop the period from the "Ltda." abbreviation so sent_tokenize doesn't
# split a sentence in the middle of it.
texto = texto.replace("Ltda.", "Ltda")
## using NLTK
sentencas = sent_tokenize(texto)
palavras = word_tokenize(texto.lower())
# Use a distinct name so the imported nltk.corpus.stopwords module
# isn't shadowed by the resulting set.
palavras_vazias = set(stopwords.words('portuguese') + list(punctuation))
palavras_sem_stopwords = [palavra for palavra in palavras if palavra not in palavras_vazias]
print(palavras_sem_stopwords)
frequencia = FreqDist(palavras_sem_stopwords)
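# Luhn-style extractive summarization: score each sentence by the summed
# frequency of its non-stopword words, then keep the top scorers.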
sentencas_importantes = defaultdict(int)
for i, sentenca in enumerate(sentencas):
    for palavra in word_tokenize(sentenca.lower()):
        if palavra in frequencia:
            sentencas_importantes[i] += frequencia[palavra]
# Indices of the 4 highest-scoring sentences
idx_sentencas_importantes = nlargest(4, sentencas_importantes, key=sentencas_importantes.get)
- print("<<<< RESUMO >>>>")
- for i in sorted(idx_sentencas_importantes):
- print(sentencas[i])
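# Possible refinement (a sketch, not part of the original paste): normalize
# each score by the sentence's word count so long sentences aren't favored
# merely for their length, e.g.:
# for i in sentencas_importantes:
#     sentencas_importantes[i] /= len(word_tokenize(sentencas[i]))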