Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/env python
"""
=====================================
Webscraping 1
=====================================
Usage: %prog

Fetch the Galactapedia index page, collect every link found on it, then
fetch each of those links and collect the links found on those pages too.
Both lists are printed once the crawl has finished.

:Author: MerhuBerahu, https://github.com/MerhuBerahu
:Date: 21/08/2020
"""
# import statements
from urllib.parse import urljoin

import requests
import bs4

url = "https://robertsspaceindustries.com/galactapedia"

# Seconds to wait for a response; without a timeout requests can block forever.
REQUEST_TIMEOUT = 10


def _page_links(page_soup, page_url):
    """Return the absolute http(s) URL of every ``<a href>`` in *page_soup*.

    :param page_soup: parsed document (``bs4.BeautifulSoup``) to search.
    :param page_url: URL the document was fetched from; relative hrefs are
        resolved against it.
    :returns: list of absolute URLs, in document order (duplicates kept).
    """
    found = []
    for anchor in page_soup.find_all('a'):
        href = anchor.get('href')
        if not href:
            # <a> tags without an href (or with an empty one) have no target.
            continue
        absolute = urljoin(page_url, href)
        # Drop mailto:, javascript:, tel: etc. -- requests cannot fetch them.
        if absolute.startswith(("http://", "https://")):
            found.append(absolute)
    return found


# The index page is required: fail loudly if it cannot be fetched.
html_doc = requests.get(url, timeout=REQUEST_TIMEOUT)
html_doc.raise_for_status()
soup = bs4.BeautifulSoup(html_doc.content, 'html.parser')

# First level: every link on the index page.
links = _page_links(soup, url)

# Second level: every link on each page the index links to.
links2 = []
for i in links:
    try:
        html_doc = requests.get(i, timeout=REQUEST_TIMEOUT)
        html_doc.raise_for_status()
    except requests.RequestException as err:
        # One dead or slow link should not abort the whole crawl.
        print("Skipping {}: {}".format(i, err))
        continue
    soup = bs4.BeautifulSoup(html_doc.content, 'html.parser')
    links2.extend(_page_links(soup, i))

# TODO: for each page, extract the title ("strong" with class
# "c-title c-title--x-large") and the card titles ("div" with class
# "c-card__title"); disabled for the time being.

print(links)
print(links2)
Add Comment
Please, Sign In to add comment