Zgragselus

Robots.py

Nov 17th, 2023
## @package robots
#
#  Defines robots parser for crawlers

from Directive import Directive
from Directive import DirectiveType
from UserAgent import UserAgent
from SystemLog import SystemLog
import requests

## Robots class for processing robots.txt files
#
#  Processes robots.txt file, storing all its rules per user-agent
class Robots:
    ## Constructor from target URL and eshop ID
    #  @param self The object pointer
    #  @param url string, URL of the robots.txt file, or the domain under which it should be found
    #  @param eshop_id integer, eshop ID in database
    def __init__(self, url, eshop_id):
        self.reset()

        self.url = url
        if not self.url.endswith(".txt"):
            if self.url.endswith("/"):
                self.url += "robots.txt"
            else:
                self.url += "/robots.txt"
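        # Normalization sketch (illustrative inputs, not from the project):
        #   'https://example.com'  -> 'https://example.com/robots.txt'
        #   'https://example.com/' -> 'https://example.com/robots.txt'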

        self.eshop_id = eshop_id

    ## Resets state variables for class instance
    #  @param self The object pointer
    def reset(self):
        self.agents = {}
        self.__parserState = None

    ## Request and parse target URL
    #  @param self The object pointer
    #  @param credentials List, each record holds base URL and then Credentials object instance
    #  @param botName String, name of the bot (used as User-Agent string)
    def request(self, credentials, botName):
        self.reset()

        headers = {"User-Agent": botName}
        # A timeout keeps the crawler from hanging on an unresponsive host
        req = requests.get(self.url, headers=headers, timeout=30)

        if req.status_code == 200:
            lines = req.text.split("\n")
            for l in lines:
                self.__parseLine(l)
        else:
            SystemLog.log(credentials, 'Scrapper.Robots', 'Error opening robots.txt file at \'' + self.url + '\'', 'true', self.eshop_id)

    ## Parse single line from robots.txt
    #  @param self The object pointer
    #  @param line string, holds single line from robots.txt to parse
    def __parseLine(self, line):
        # Ignore comment lines
        if line.startswith("#"):
            return

        # Split the line into field and value; split only on the first
        # colon so values containing ':' (e.g. sitemap URLs) stay intact
        tokens = line.split(":", 1)

        # A rule requires both a field and a value, otherwise it's not a rule
        if len(tokens) < 2:
            return

        # Field names and agent names are case-insensitive; directive values
        # keep their original case because paths are case-sensitive
        field = tokens[0].strip().lower()
        value = tokens[1].strip()

        # Handle user-agent, which switches parser state
        if field == "user-agent":
            agentName = value.lower()
            self.agents[agentName] = UserAgent(agentName)
            self.__parserState = agentName
        # Any other directive needs an active user-agent to attach to
        elif self.__parserState is None:
            return
        # Handle allow directive
        elif field == "allow":
            self.agents[self.__parserState].addRule(Directive(DirectiveType.ALLOW, value))
        # Handle disallow directive
        elif field == "disallow":
            self.agents[self.__parserState].addRule(Directive(DirectiveType.DISALLOW, value))
        # Handle sitemap directive
        elif field == "sitemap":
            self.agents[self.__parserState].addRule(Directive(DirectiveType.SITEMAP, value))
        # Handle crawl-delay directive
        elif field == "crawl-delay":
            self.agents[self.__parserState].addRule(Directive(DirectiveType.CRAWL_DELAY, value))

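    # Parsing sketch (illustrative input lines, not from the original project):
    #   'User-agent: ExampleBot'                   -> new agent/state 'examplebot'
    #   'Disallow: /private'                       -> DISALLOW rule for that agent
    #   'Sitemap: https://example.com/sitemap.xml' -> SITEMAP rule, URL ':' kept
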
    ## Serialize Robots class instance
    #  @param self The object pointer
    #  @returns Class instance serialized into string
    def serialize(self):
        # Serialize attributes
        res = self.url + "\n"

        # Serialize number of agents
        res += str(len(self.agents)) + "\n"

        # Serialize agents
        for agent in self.agents.values():
            res += agent.serialize()

        return res

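    # Layout sketch of the serialized string (assumes UserAgent.serialize()
    # emits a block starting with the literal 'UserAgent', which
    # deserialize() below relies on when splitting):
    #   <url>
    #   <agent count>
    #   UserAgent ...   (one block per agent)
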
    ## Deserialize Robots class instance
    #  @param self The object pointer
    #  @param string String holding serialized Robots class instance
    def deserialize(self, string):
        # Start from a clean state so agents from a previous parse are dropped
        self.reset()

        # Split file by lines
        lines = string.split('\n')

        # Deserialize attributes
        self.url = lines[0]

        count = int(lines[1])

        # Deserialize agents
        blob = '\n'.join(lines[2:])

        agents = [chunk for chunk in blob.split('UserAgent') if chunk != '']

        for chunk in agents:
            agent = UserAgent('')
            agent.deserialize(chunk)
            self.agents[agent.userAgent] = agent

    ## Return string representation of object
    #  @param self The object pointer
    def __repr__(self):
        return str(self.__class__) + '\n\t' + '\n\t'.join(str(item) + ' = ' + str(self.__dict__[item]) for item in sorted(self.__dict__))

    ## Return readable representation of object
    #  @param self The object pointer
    def __str__(self):
        return "Robots { 'url' : '" + str(self.url) + "', 'agents' : '" + str(self.agents) + "' }"

    ## @var url
    #  Robots URL

    ## @var agents
    #  Dictionary of parsed user agents, keyed by agent name

    ## @var eshop_id
    #  ID of eshop record in database

    ## @var __parserState
    #  Private variable for parser state

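
# A minimal usage sketch (assumption: the Directive, UserAgent and SystemLog
# modules imported above are available; the URL, eshop ID and empty
# credentials list below are placeholders, not values from the original project).
if __name__ == "__main__":
    robots = Robots("https://example.com", 1)
    robots.request([], "ExampleBot")  # fetch robots.txt and parse its rules
    print(robots)                     # readable summary via __str__

    # Round-trip through the string serialization format
    blob = robots.serialize()
    restored = Robots("https://example.com", 1)
    restored.deserialize(blob)
    print(restored)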