# Web scraper: collects blog articles and saves them as an RSS-style XML file
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_website(url):
    article_list = []
    try:
        r = requests.get(url)
        print(f'Connection established: {r.status_code}')
        # The blog page is HTML, so use an HTML parser; the original
        # features='xml' would mis-handle the markup.
        soup = BeautifulSoup(r.content, 'html.parser')
        articles = soup.find_all('li', class_='news')
        for art in articles:
            # Strip newlines and tabs from the article title
            title = art.find('h3', class_='title').text.replace('\n', '')
            title = title.replace('\t', '')
            # Rebuild an absolute link from the last path segment
            # of the "read more" anchor
            link = art.find('div', class_='read-more')
            link = link.find('a')
            link = link.get('href').rsplit('/', 1)[1]
            link = url + '/' + link
            # Trim surrounding whitespace from the publication date
            date = art.find('p', class_='date').text.replace('\n', ' ')
            date = date.strip()
            # Cut the description back to its last complete sentence
            desc = art.find('div', class_='desc').text.replace('\n', '')
            desc = desc.replace('\t', '')
            desc = desc.rsplit('.', 1)[0] + '.'
            article = {
                'title': title,
                'description': desc,
                'link': link,
                'date': date
            }
            article_list.append(article)
        print(article_list)
        return save_rss(article_list)
    except Exception as e:
        print(f'Connection failed. See what went wrong: {e}')

def save_rss(article_list):
    df = pd.DataFrame(article_list, columns=['title', 'description', 'link', 'date'])
    print(df.head())  # quick preview of the scraped rows
    # DataFrame.to_xml needs the lxml package; it writes the file and returns None
    return df.to_xml('RSS_data.xml', index=False, encoding='utf-8')

url = 'https://uke.gov.pl/blog'
print(f'Connecting to {url.rsplit("//", 1)[1]}...')
get_website(url)
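
# A minimal read-back sanity check, a sketch assuming save_rss above has
# written RSS_data.xml successfully; pandas.read_xml also requires lxml.
import os

if os.path.exists('RSS_data.xml'):
    check = pd.read_xml('RSS_data.xml')
    print(check[['title', 'date']].head())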