Advertisement
Zgragselus

Robots.py

Feb 19th, 2023
831
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.47 KB | None | 0 0
  1. ## @package robots
  2. #
  3. #  Defines robots parser for crawlers
  4.  
  5. from Directive import Directive
  6. from Directive import DirectiveType
  7. from UserAgent import UserAgent
  8. from SystemLog import SystemLog
  9. import requests
  10.  
  11. ## Robots class for processing robots.txt files
  12. #
  13. #  Processes robots.txt file, storing all its rules per user-agent
  14. class Robots:
  15.     ## Constructor from user agent name
  16.     #  @param self The object pointer
  17.     #  @param url string, url with robots.txt file or domain under which it should be
  18.     def __init__(self, url):
  19.         self.agents = {}
  20.         self.__parserState = { 'user-agent': None }
  21.  
  22.         self.url = url
  23.         if (not url.endswith(".txt")):
  24.             self.url += "/robots.txt"
  25.  
  26.     ## Request and partse target url
  27.     #  @param self The object pointer
  28.     def request(self):
  29.         req = requests.get(self.url)
  30.  
  31.         if (req.status_code == 200):
  32.             lines = req.text.split("\n")
  33.             for l in lines:
  34.                 self.__parseLine(l)
  35.         else:
  36.             SystemLog.log({'login_name':'Zgragselus','password':'Test!234'}, 'Scrapper.Robots', 'Error opening robots.txt file at \'' + self.url + '\'', 'true')
  37.  
  38.     ## Parse single line from robots.txt
  39.     #  @param self The object pointer
  40.     #  @param line string, holds single line from robots.txt to parse
  41.     def __parseLine(self, line):
  42.         # Ignore comments
  43.         if line.startswith("#"):
  44.             return
  45.  
  46.         # Always work in lower case (case insensitivity of parser)
  47.         l = line.lower()
  48.  
  49.         # Split line into tokens
  50.         tokens = l.split(":")
  51.  
  52.         # Rule requires at least 2 tokens, otherwise it's not a rule
  53.         if (len(tokens) < 2):
  54.             return
  55.  
  56.         # Handle user agent, which switches state
  57.         if (tokens[0].strip() == "user-agent"):
  58.             agentName = tokens[1].strip()
  59.  
  60.             agent = UserAgent(agentName)
  61.  
  62.             self.agents[agentName] = agent
  63.             self.__parserState = agentName
  64.         # Handle allow directive
  65.         elif (tokens[0].strip() == "allow"):
  66.             d = Directive(DirectiveType.ALLOW, tokens[1].strip())
  67.             self.agents[self.__parserState].addRule(d)
  68.         # Hanlde disallow directive
  69.         elif (tokens[0].strip() == "disallow"):
  70.             d = Directive(DirectiveType.DISALLOW, tokens[1].strip())
  71.             self.agents[self.__parserState].addRule(d)
  72.         # Handle sitemap directive
  73.         elif (tokens[0].strip() == "sitemap"):
  74.             tokens.pop(0)
  75.             url = ":".join(tokens)
  76.  
  77.             d = Directive(DirectiveType.SITEMAP, url)
  78.             self.agents[self.__parserState].addRule(d)
  79.         # Handle crawl-delay directive
  80.         elif (tokens[0].strip() == "crawl-delay"):
  81.             d = Directive(DirectiveType.CRAWL_DELAY, tokens[1].strip())
  82.             self.agents[self.__parserState].addRule(d)
  83.  
  84.     ## Return string representation of object
  85.     #  @param self The object pointer
  86.     def __repr__(self):
  87.         return str(self.__class__) + '\n\t' + '\n\t'.join((str(item) + ' = ' + str(self.__dict__[item]) for item in sorted(self.__dict__)))
  88.  
  89.     ## Return readable representation of object
  90.     #  @param self The object pointer
  91.     def __str__(self):
  92.         return 'Robots { \'url\' : \'' + str(self.url) + '\', \'agents\' : \'' + str(self.agents) + '\' }'
  93.  
  94.     ## @var url
  95.     #  Robots URL
  96.  
  97.     ## @var agents
  98.     #  Robots URL
  99.  
  100.     ## @var __parserState
  101.     #  Private variable for parser state
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement