Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
## @package robots
#
#  Defines the robots.txt parser used by crawlers
- from Directive import Directive
- from Directive import DirectiveType
- from UserAgent import UserAgent
- from SystemLog import SystemLog
- import requests
## Robots class for processing robots.txt files
#
#  Downloads a robots.txt file and parses it, storing all of its rules
#  per user-agent in the agents dictionary.
class Robots:
    ## Seconds to wait for the robots.txt server before giving up
    REQUEST_TIMEOUT = 30

    ## Constructor from robots.txt URL
    #  @param self The object pointer
    #  @param url string, url with robots.txt file or domain under which it should be
    #  @param eshop_id integer, eshop ID in database
    def __init__(self, url, eshop_id):
        self.reset()
        self.url = url
        # Anything not already pointing at a .txt file is treated as a base
        # URL and gets "robots.txt" appended (with exactly one slash).
        if not self.url.endswith(".txt"):
            if not self.url.endswith("/"):
                self.url += "/"
            self.url += "robots.txt"
        self.eshop_id = eshop_id

    ## Resets state variables for class instance
    #  @param self The object pointer
    def reset(self):
        self.agents = {}
        # Name of the user-agent group currently being parsed; None until the
        # first User-agent line is seen. It is always either None or a key of
        # self.agents, so it can safely be used for lookups.
        self.__parserState = None

    ## Request and parse target url
    #
    #  Previously parsed agents are discarded first. A non-200 response is
    #  reported through SystemLog and leaves the agents dictionary empty.
    #  Network errors raised by requests are not caught here.
    #  @param self The object pointer
    #  @param credentials List, each record holds base URL and then Credentials object instance
    #  @param botName String, name of the bot (used as User-Agent string)
    def request(self, credentials, botName):
        self.reset()
        headers = {"User-Agent": botName}
        # Without a timeout requests may wait forever on an unresponsive host.
        req = requests.get(self.url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        if req.status_code == 200:
            # splitlines() copes with LF, CRLF and bare CR line endings
            for line in req.text.splitlines():
                self.__parseLine(line)
        else:
            SystemLog.log(
                credentials,
                'Scrapper.Robots',
                'Error opening robots.txt file at \'' + self.url + '\'',
                'true',
                self.eshop_id
            )

    ## Split a robots.txt line into its field name and value
    #
    #  Only the field name is lower-cased: paths and sitemap URLs are case
    #  sensitive and must be kept exactly as written. The line is split on
    #  the first colon only, so values containing colons (URLs) stay intact.
    #  @param line string, holds single line from robots.txt
    #  @returns Tuple (field, value), or None when the line holds no rule
    @staticmethod
    def _splitLine(line):
        # Everything after '#' is a comment, whether it starts the line or
        # trails a rule.
        content = line.split("#", 1)[0]
        field, separator, value = content.partition(":")
        # A rule needs a "field: value" shape, otherwise it's not a rule
        if not separator:
            return None
        return field.strip().lower(), value.strip()

    ## Parse single line from robots.txt
    #
    #  A User-agent line switches the current group; allow, disallow, sitemap
    #  and crawl-delay lines add a Directive to the current group. Rules that
    #  appear before any User-agent line have no group to belong to and are
    #  ignored, as are unknown fields.
    #  @param self The object pointer
    #  @param line string, holds single line from robots.txt to parse
    def __parseLine(self, line):
        parsed = self._splitLine(line)
        if parsed is None:
            return
        field, value = parsed

        # Handle user agent, which switches state
        if field == "user-agent":
            # Agent names are matched case-insensitively, so key them lower-case
            agentName = value.lower()
            # Reuse an already known agent so that a second group for the same
            # user-agent extends its rules instead of discarding them.
            if agentName not in self.agents:
                self.agents[agentName] = UserAgent(agentName)
            self.__parserState = agentName
            return

        agent = self.agents.get(self.__parserState)
        if agent is None:
            # No User-agent line seen yet: nothing to attach the rule to
            return

        if field == "allow":
            agent.addRule(Directive(DirectiveType.ALLOW, value))
        elif field == "disallow":
            agent.addRule(Directive(DirectiveType.DISALLOW, value))
        elif field == "sitemap":
            agent.addRule(Directive(DirectiveType.SITEMAP, value))
        elif field == "crawl-delay":
            agent.addRule(Directive(DirectiveType.CRAWL_DELAY, value))

    ## Serialize Robots class instance
    #
    #  The output is the URL line, a line with the number of agents, and then
    #  the serialized form of every agent as produced by its own serialize().
    #  @param self The object pointer
    #  @returns Class instance serialized into string
    def serialize(self):
        parts = [self.url + "\n", str(len(self.agents)) + "\n"]
        parts.extend(agent.serialize() for agent in self.agents.values())
        return "".join(parts)

    ## Deserialize Robots class instance
    #
    #  Replaces the URL and the agents of this instance with the ones stored
    #  in the given string.
    #  @param self The object pointer
    #  @param string String holding serialized Robots class instance
    def deserialize(self, string):
        # Split file by lines
        lines = string.split('\n')
        # Deserialize attributes; the stored agent count is only used as a
        # sanity check of the header (raises ValueError when malformed), the
        # real number of agents follows from the chunks found below.
        url = lines[0]
        int(lines[1])
        # Start from a clean state so agents from an earlier parse do not
        # leak into the restored instance.
        self.reset()
        self.url = url
        # Deserialize agents. The blob is cut at every 'UserAgent' marker and
        # each non-empty piece is handed to UserAgent.deserialize().
        # NOTE(review): this relies on the marker never occurring inside a
        # serialized rule - confirm against UserAgent.serialize().
        blob = '\n'.join(lines[2:])
        for chunk in blob.split('UserAgent'):
            if chunk == '':
                continue
            agent = UserAgent('')
            agent.deserialize(chunk)
            self.agents[agent.userAgent] = agent

    ## Return string representation of object
    #
    #  Lists the class followed by every instance attribute, sorted by name.
    #  @param self The object pointer
    def __repr__(self):
        attributes = (
            str(name) + ' = ' + str(self.__dict__[name])
            for name in sorted(self.__dict__)
        )
        return str(self.__class__) + '\n\t' + '\n\t'.join(attributes)

    ## Return readable representation of object
    #  @param self The object pointer
    def __str__(self):
        return 'Robots { \'url\' : \'' + str(self.url) + '\', \'agents\' : \'' + str(self.agents) + '\' }'

    ## @var url
    #  Robots URL
    ## @var agents
    #  Dictionary of UserAgent instances keyed by user-agent name
    ## @var eshop_id
    #  ID of eshop record in database
    ## @var __parserState
    #  Private variable for parser state (name of the current user-agent or None)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement