  1. """ robotparser.py
  2. Copyright (C) 2000 Bastian Kleineidam
  3. You can choose between two licenses when using this package:
  4. 1) GNU GPLv2
  5. 2) PSF license for Python 2.2
  6. The robots.txt Exclusion Protocol is implemented as specified in
  7. http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
  8. """
  9. import urlparse,urllib
  10. __all__ = ["RobotFileParser"]
  11. debug = 0
  12. def _debug(msg):
  13. if debug: print msg
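
# The parser understands the conventional robots.txt syntax, for example
# (an illustrative sample, not tied to any real site):
#
#   User-agent: *
#   Disallow: /cgi-bin/
#   Allow: /cgi-bin/public
#
# Blank lines separate entries and '#' starts a comment.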

class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.
    """

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.
        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.
        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.
        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]
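
    # Note on read() below: the HTTP status code steers the policy -- 401/403
    # are taken to mean the whole site is off limits (disallow_all), any other
    # status >= 400 is treated as "no robots.txt, allow everything"
    # (allow_all), and a 200 response with content is handed to parse().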

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        opener = URLopener()
        f = opener.open(self.url)
        lines = []
        line = f.readline()
        while line:
            lines.append(line.strip())
            line = f.readline()
        self.errcode = opener.errcode
        if self.errcode == 401 or self.errcode == 403:
            self.disallow_all = True
            _debug("disallow all")
        elif self.errcode >= 400:
            self.allow_all = True
            _debug("allow all")
        elif self.errcode == 200 and lines:
            _debug("parse lines")
            self.parse(lines)

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            self.default_entry = entry
        else:
            self.entries.append(entry)
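
    # parse() below is a small state machine over the input lines:
    #   state 0 -- start of file/entry, waiting for a user-agent: line
    #   state 1 -- one or more user-agent: lines seen, no rules yet
    #   state 2 -- allow:/disallow: rules seen for the current entry
    # A blank line in state 2 closes the current entry; unknown keys only
    # trigger a debug warning.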

    def parse(self, lines):
        """parse the input lines from a robots.txt file.
           We allow that a user-agent: line is not preceded by
           one or more blank lines."""
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            linenumber = linenumber + 1
            if not line:
                if state==1:
                    _debug("line %d: warning: you should insert"
                           " allow: or disallow: directives below any"
                           " user-agent: line" % linenumber)
                    entry = Entry()
                    state = 0
                elif state==2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i>=0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state==2:
                        _debug("line %d: warning: you should insert a blank"
                               " line before any user-agent"
                               " directive" % linenumber)
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state==0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state==0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], True))
                        # an allow: rule also marks the entry as having rules,
                        # so it is not discarded at the next blank line
                        state = 2
                else:
                    _debug("line %d: warning: unknown key %s" % (linenumber,
                           line[0]))
            else:
                _debug("line %d: error: malformed line %s"%(linenumber, line))
        if state==2:
            self.entries.append(entry)
        _debug("Parsed rules:\n%s" % str(self))

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        _debug("Checking robots.txt allowance for:\n user agent: %s\n url: %s" %
               (useragent, url))
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
        # normalize the URL: keep only the path component, decoded and then
        # re-quoted so it compares consistently with the quoted rule paths
        url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def __str__(self):
        ret = ""
        for entry in self.entries:
            ret = ret + str(entry) + "\n"
        return ret

class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path."""
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path=="*" or filename.startswith(self.path)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow")+": "+self.path
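
# For illustration (the paths are made up): RuleLine("/cgi-bin/", False)
# matches any filename that starts with "/cgi-bin/", e.g. "/cgi-bin/search",
# and reports allowance False for it; RuleLine("", False) is normalized to an
# allow-all rule, because an empty Disallow value means "allow everything".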

class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = ""
        for agent in self.useragents:
            ret = ret + "User-agent: "+agent+"\n"
        for line in self.rulelines:
            ret = ret + str(line) + "\n"
        return ret

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent=='*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            _debug((filename, str(line), line.allowance))
            if line.applies_to(filename):
                return line.allowance
        return True
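
# For illustration, a robots.txt block such as
#
#   User-agent: CherryPickerSE
#   Disallow: /cgi-bin/event-search
#
# is parsed into one Entry whose useragents list is ['CherryPickerSE'] and
# whose rulelines list holds a single disallowing RuleLine for
# "/cgi-bin/event-search".  applies_to() matches agent names case-insensitively
# and by substring of the name token, so "CherryPickerSE/1.0" is covered too.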

class URLopener(urllib.FancyURLopener):
    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        self.errcode = 200

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)
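
# Minimal usage sketch.  The host, agent name and path below are hypothetical
# and only illustrate the intended calling sequence; fetching needs network
# access, so this helper is not called by _test().
def _example_usage():
    rp = RobotFileParser()
    rp.set_url('http://www.example.com/robots.txt')
    rp.read()
    # True if the named crawler may fetch the given URL under the parsed rules
    return rp.can_fetch('ExampleBot/1.0', 'http://www.example.com/some/page.html')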

def _check(a,b):
    if not b:
        ac = "access denied"
    else:
        ac = "access allowed"
    if a!=b:
        print "failed"
    else:
        print "ok (%s)" % ac
    print
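
# Offline sketch: parse() can also be fed robots.txt lines obtained without
# going through read().  The agent name and URLs below are made up.
def _example_offline():
    rp = RobotFileParser()
    rp.parse(["User-agent: *", "Disallow: /private/"])
    # disallowed path -> False, anything else -> True
    return (rp.can_fetch("SomeBot", "http://example.com/private/x"),
            rp.can_fetch("SomeBot", "http://example.com/other/x"))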

def _test():
    global debug
    rp = RobotFileParser()
    debug = 1

    # robots.txt that exists, gotten to by redirection
    rp.set_url('http://www.musi-cal.com/robots.txt')
    rp.read()

    # test for re.escape
    _check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
    # this should match the first rule, which is a disallow
    _check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
    # various cherry pickers
    _check(rp.can_fetch('CherryPickerSE',
                        'http://www.musi-cal.com/cgi-bin/event-search'
                        '?city=San+Francisco'), 0)
    _check(rp.can_fetch('CherryPickerSE/1.0',
                        'http://www.musi-cal.com/cgi-bin/event-search'
                        '?city=San+Francisco'), 0)
    _check(rp.can_fetch('CherryPickerSE/1.5',
                        'http://www.musi-cal.com/cgi-bin/event-search'
                        '?city=San+Francisco'), 0)
    # case sensitivity
    _check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
    _check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
    # substring test
    _check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
    # tests for catch-all * agent
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/search'), 0)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)

    # robots.txt that does not exist
    rp.set_url('http://www.lycos.com/robots.txt')
    rp.read()
    _check(rp.can_fetch('Mozilla', 'http://www.lycos.com/search'), 1)

if __name__ == '__main__':
    _test()