  1. # -*- coding: iso-8859-1 -*-
  2. """A lexical analyzer class for simple shell-like syntaxes."""
  3. # Module and documentation by Eric S. Raymond, 21 Dec 1998
  4. # Input stacking and error message cleanup added by ESR, March 2000
  5. # push_source() and pop_source() made explicit by ESR, January 2001.
  6. # Posix compliance, split(), string arguments, and
  7. # iterator interface by Gustavo Niemeyer, April 2003.
  8. import os.path
  9. import sys
  10. from collections import deque
  11. try:
  12. from cStringIO import StringIO
  13. except ImportError:
  14. from StringIO import StringIO
  15. __all__ = ["shlex", "split"]
class shlex:
    "A lexical analyzer class for simple shell-like syntaxes."

    def __init__(self, instream=None, infile=None, posix=False):
        # A plain string argument is wrapped so it can be read like a file.
        if isinstance(instream, basestring):
            instream = StringIO(instream)
        if instream is not None:
            self.instream = instream
            self.infile = infile
        else:
            # No input given: lex standard input interactively.
            self.instream = sys.stdin
            self.infile = None
        self.posix = posix
        # In POSIX mode EOF is None so that an empty quoted token ('')
        # remains distinguishable from end-of-input.
        if posix:
            self.eof = None
        else:
            self.eof = ''
        self.commenters = '#'
        # Characters that may appear inside a word token.
        self.wordchars = ('abcdfeghijklmnopqrstuvwxyz'
                          'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_')
        if self.posix:
            # POSIX mode additionally accepts Latin-1 accented letters
            # (hence the iso-8859-1 coding declaration at the top of file).
            self.wordchars += ('ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ'
                               'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ')
        self.whitespace = ' \t\r\n'
        # When True, words are delimited by whitespace only (shell-style
        # splitting) instead of by word-character classification.
        self.whitespace_split = False
        self.quotes = '\'"'
        self.escape = '\\'
        # Quote characters within which escape processing is applied.
        self.escapedquotes = '"'
        # read_token() state: ' ' = skipping whitespace, 'a' = in a word,
        # a quote char = inside that quote, escape char = after an escape,
        # None = past end of file.
        self.state = ' '
        self.pushback = deque()      # tokens pushed back for re-reading
        self.lineno = 1              # current line number (1-based)
        self.debug = 0               # verbosity level for trace output
        self.token = ''              # token currently being accumulated
        self.filestack = deque()     # saved (infile, instream, lineno) frames
        self.source = None           # keyword enabling source-file inclusion
        if self.debug:
            # NOTE: self.debug was set to 0 just above, so this trace only
            # fires if the default is edited.
            print 'shlex: reading from %s, line %d' \
                  % (self.instream, self.lineno)

    def push_token(self, tok):
        "Push a token onto the stack popped by the get_token method"
        if self.debug >= 1:
            print "shlex: pushing token " + repr(tok)
        self.pushback.appendleft(tok)

    def push_source(self, newstream, newfile=None):
        "Push an input source onto the lexer's input source stack."
        if isinstance(newstream, basestring):
            newstream = StringIO(newstream)
        # Save the current source so pop_source() can restore it.
        self.filestack.appendleft((self.infile, self.instream, self.lineno))
        self.infile = newfile
        self.instream = newstream
        self.lineno = 1
        if self.debug:
            if newfile is not None:
                print 'shlex: pushing to file %s' % (self.infile,)
            else:
                print 'shlex: pushing to stream %s' % (self.instream,)

    def pop_source(self):
        "Pop the input source stack."
        self.instream.close()
        (self.infile, self.instream, self.lineno) = self.filestack.popleft()
        if self.debug:
            print 'shlex: popping to %s, line %d' \
                  % (self.instream, self.lineno)
        # Restart tokenizing in the whitespace-skipping state.
        self.state = ' '

    def get_token(self):
        "Get a token from the input stream (or from stack if it's nonempty)"
        if self.pushback:
            tok = self.pushback.popleft()
            if self.debug >= 1:
                print "shlex: popping token " + repr(tok)
            return tok
        # No pushback.  Get a token.
        raw = self.read_token()
        # Handle inclusions: a token equal to self.source introduces a
        # nested input source named by the following token.
        if self.source is not None:
            while raw == self.source:
                spec = self.sourcehook(self.read_token())
                if spec:
                    (newfile, newstream) = spec
                    self.push_source(newstream, newfile)
                raw = self.get_token()
        # Maybe we got EOF instead?  Pop back to the including source, if any.
        while raw == self.eof:
            if not self.filestack:
                return self.eof
            else:
                self.pop_source()
                raw = self.get_token()
        # Neither inclusion nor EOF
        if self.debug >= 1:
            if raw != self.eof:
                print "shlex: token=" + repr(raw)
            else:
                print "shlex: token=EOF"
        return raw

    def read_token(self):
        # Core state machine; see the self.state comment in __init__ for
        # the meaning of each state value.
        quoted = False          # True once any part of the token was quoted
        escapedstate = ' '      # state to return to after an escape sequence
        while True:
            nextchar = self.instream.read(1)
            if nextchar == '\n':
                self.lineno = self.lineno + 1
            if self.debug >= 3:
                print "shlex: in state", repr(self.state), \
                      "I see character:", repr(nextchar)
            if self.state is None:
                self.token = ''        # past end of file
                break
            elif self.state == ' ':
                if not nextchar:
                    self.state = None  # end of file
                    break
                elif nextchar in self.whitespace:
                    if self.debug >= 2:
                        print "shlex: I see whitespace in whitespace state"
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
                elif nextchar in self.commenters:
                    # Discard the rest of the comment line.
                    self.instream.readline()
                    self.lineno = self.lineno + 1
                elif self.posix and nextchar in self.escape:
                    escapedstate = 'a'
                    self.state = nextchar
                elif nextchar in self.wordchars:
                    self.token = nextchar
                    self.state = 'a'
                elif nextchar in self.quotes:
                    if not self.posix:
                        # Non-POSIX mode keeps quote characters in the token.
                        self.token = nextchar
                    self.state = nextchar
                elif self.whitespace_split:
                    self.token = nextchar
                    self.state = 'a'
                else:
                    # Punctuation: emitted as a one-character token.
                    self.token = nextchar
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
            elif self.state in self.quotes:
                quoted = True
                if not nextchar:      # end of file
                    if self.debug >= 2:
                        print "shlex: I see EOF in quotes state"
                    # XXX what error should be raised here?
                    raise ValueError, "No closing quotation"
                if nextchar == self.state:
                    # Closing quote seen.
                    if not self.posix:
                        self.token = self.token + nextchar
                        self.state = ' '
                        break
                    else:
                        # POSIX: quoted section ends but the word may go on.
                        self.state = 'a'
                elif self.posix and nextchar in self.escape and \
                        self.state in self.escapedquotes:
                    escapedstate = self.state
                    self.state = nextchar
                else:
                    self.token = self.token + nextchar
            elif self.state in self.escape:
                if not nextchar:      # end of file
                    if self.debug >= 2:
                        print "shlex: I see EOF in escape state"
                    # XXX what error should be raised here?
                    raise ValueError, "No escaped character"
                # In posix shells, only the quote itself or the escape
                # character may be escaped within quotes.
                if escapedstate in self.quotes and \
                        nextchar != self.state and nextchar != escapedstate:
                    self.token = self.token + self.state
                self.token = self.token + nextchar
                self.state = escapedstate
            elif self.state == 'a':
                if not nextchar:
                    self.state = None   # end of file
                    break
                elif nextchar in self.whitespace:
                    if self.debug >= 2:
                        print "shlex: I see whitespace in word state"
                    self.state = ' '
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
                elif nextchar in self.commenters:
                    self.instream.readline()
                    self.lineno = self.lineno + 1
                    if self.posix:
                        # POSIX: a comment also terminates the current word.
                        self.state = ' '
                        if self.token or (self.posix and quoted):
                            break   # emit current token
                        else:
                            continue
                elif self.posix and nextchar in self.quotes:
                    self.state = nextchar
                elif self.posix and nextchar in self.escape:
                    escapedstate = 'a'
                    self.state = nextchar
                elif nextchar in self.wordchars or nextchar in self.quotes \
                        or self.whitespace_split:
                    self.token = self.token + nextchar
                else:
                    # Punctuation ends the word; push the character back so
                    # the next call returns it as its own token.
                    self.pushback.appendleft(nextchar)
                    if self.debug >= 2:
                        print "shlex: I see punctuation in word state"
                    self.state = ' '
                    if self.token:
                        break   # emit current token
                    else:
                        continue
        result = self.token
        self.token = ''
        # In POSIX mode an unquoted empty result means end-of-input.
        if self.posix and not quoted and result == '':
            result = None
        if self.debug > 1:
            if result:
                print "shlex: raw token=" + repr(result)
            else:
                print "shlex: raw token=EOF"
        return result

    def sourcehook(self, newfile):
        "Hook called on a filename to be sourced."
        # Strip surrounding double quotes, if present.
        if newfile[0] == '"':
            newfile = newfile[1:-1]
        # This implements cpp-like semantics for relative-path inclusion.
        if isinstance(self.infile, basestring) and not os.path.isabs(newfile):
            newfile = os.path.join(os.path.dirname(self.infile), newfile)
        return (newfile, open(newfile, "r"))

    def error_leader(self, infile=None, lineno=None):
        "Emit a C-compiler-like, Emacs-friendly error-message leader."
        if infile is None:
            infile = self.infile
        if lineno is None:
            lineno = self.lineno
        return "\"%s\", line %d: " % (infile, lineno)

    def __iter__(self):
        return self

    def next(self):
        # Python 2 iterator protocol: yield tokens until EOF is reached.
        token = self.get_token()
        if token == self.eof:
            raise StopIteration
        return token
  259. def split(s, comments=False):
  260. lex = shlex(s, posix=True)
  261. lex.whitespace_split = True
  262. if not comments:
  263. lex.commenters = ''
  264. return list(lex)
  265. if __name__ == '__main__':
  266. if len(sys.argv) == 1:
  267. lexer = shlex()
  268. else:
  269. file = sys.argv[1]
  270. lexer = shlex(open(file), file)
  271. while 1:
  272. tt = lexer.get_token()
  273. if tt:
  274. print "Token: " + repr(tt)
  275. else:
  276. break