# mimetypes.py
"""Guess the MIME type of a file.

This module defines two useful functions:

guess_type(url, strict=1) -- guess the MIME type and encoding of a URL.

guess_extension(type, strict=1) -- guess the extension for a given MIME type.

It also contains the following, for tuning the behavior:

Data:

knownfiles -- list of files to parse
inited -- flag set when init() has been called
suffix_map -- dictionary mapping suffixes to suffixes
encodings_map -- dictionary mapping suffixes to encodings
types_map -- dictionary mapping suffixes to types

Functions:

init([files]) -- parse a list of files, default knownfiles
read_mime_types(file) -- parse one file, return a dictionary or None
"""
  16. import os
  17. import posixpath
  18. import urllib
  19. __all__ = [
  20. "guess_type","guess_extension","guess_all_extensions",
  21. "add_type","read_mime_types","init"
  22. ]
  23. knownfiles = [
  24. "/etc/mime.types",
  25. "/usr/local/etc/httpd/conf/mime.types",
  26. "/usr/local/lib/netscape/mime.types",
  27. "/usr/local/etc/httpd/conf/mime.types", # Apache 1.2
  28. "/usr/local/etc/mime.types", # Apache 1.3
  29. ]
  30. inited = False
  31. class MimeTypes:
  32. """MIME-types datastore.
  33. This datastore can handle information from mime.types-style files
  34. and supports basic determination of MIME type from a filename or
  35. URL, and can guess a reasonable extension given a MIME type.
  36. """
  37. def __init__(self, filenames=(), strict=True):
  38. if not inited:
  39. init()
  40. self.encodings_map = encodings_map.copy()
  41. self.suffix_map = suffix_map.copy()
  42. self.types_map = ({}, {}) # dict for (non-strict, strict)
  43. self.types_map_inv = ({}, {})
  44. for (ext, type) in types_map.items():
  45. self.add_type(type, ext, True)
  46. for (ext, type) in common_types.items():
  47. self.add_type(type, ext, False)
  48. for name in filenames:
  49. self.read(name, strict)
  50. def add_type(self, type, ext, strict=True):
  51. """Add a mapping between a type and an extension.
  52. When the extension is already known, the new
  53. type will replace the old one. When the type
  54. is already known the extension will be added
  55. to the list of known extensions.
  56. If strict is true, information will be added to
  57. list of standard types, else to the list of non-standard
  58. types.
  59. """
  60. self.types_map[strict][ext] = type
  61. exts = self.types_map_inv[strict].setdefault(type, [])
  62. if ext not in exts:
  63. exts.append(ext)
  64. def guess_type(self, url, strict=True):
  65. """Guess the type of a file based on its URL.
  66. Return value is a tuple (type, encoding) where type is None if
  67. the type can't be guessed (no or unknown suffix) or a string
  68. of the form type/subtype, usable for a MIME Content-type
  69. header; and encoding is None for no encoding or the name of
  70. the program used to encode (e.g. compress or gzip). The
  71. mappings are table driven. Encoding suffixes are case
  72. sensitive; type suffixes are first tried case sensitive, then
  73. case insensitive.
  74. The suffixes .tgz, .taz and .tz (case sensitive!) are all
  75. mapped to '.tar.gz'. (This is table-driven too, using the
  76. dictionary suffix_map.)
  77. Optional `strict' argument when False adds a bunch of commonly found,
  78. but non-standard types.
  79. """
  80. scheme, url = urllib.splittype(url)
  81. if scheme == 'data':
  82. # syntax of data URLs:
  83. # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
  84. # mediatype := [ type "/" subtype ] *( ";" parameter )
  85. # data := *urlchar
  86. # parameter := attribute "=" value
  87. # type/subtype defaults to "text/plain"
  88. comma = url.find(',')
  89. if comma < 0:
  90. # bad data URL
  91. return None, None
  92. semi = url.find(';', 0, comma)
  93. if semi >= 0:
  94. type = url[:semi]
  95. else:
  96. type = url[:comma]
  97. if '=' in type or '/' not in type:
  98. type = 'text/plain'
  99. return type, None # never compressed, so encoding is None
  100. base, ext = posixpath.splitext(url)
  101. while ext in self.suffix_map:
  102. base, ext = posixpath.splitext(base + self.suffix_map[ext])
  103. if ext in self.encodings_map:
  104. encoding = self.encodings_map[ext]
  105. base, ext = posixpath.splitext(base)
  106. else:
  107. encoding = None
  108. types_map = self.types_map[True]
  109. if ext in types_map:
  110. return types_map[ext], encoding
  111. elif ext.lower() in types_map:
  112. return types_map[ext.lower()], encoding
  113. elif strict:
  114. return None, encoding
  115. types_map = self.types_map[False]
  116. if ext in types_map:
  117. return types_map[ext], encoding
  118. elif ext.lower() in types_map:
  119. return types_map[ext.lower()], encoding
  120. else:
  121. return None, encoding
  122. def guess_all_extensions(self, type, strict=True):
  123. """Guess the extensions for a file based on its MIME type.
  124. Return value is a list of strings giving the possible filename
  125. extensions, including the leading dot ('.'). The extension is not
  126. guaranteed to have been associated with any particular data stream,
  127. but would be mapped to the MIME type `type' by guess_type().
  128. Optional `strict' argument when false adds a bunch of commonly found,
  129. but non-standard types.
  130. """
  131. type = type.lower()
  132. extensions = self.types_map_inv[True].get(type, [])
  133. if not strict:
  134. for ext in self.types_map_inv[False].get(type, []):
  135. if ext not in extensions:
  136. extensions.append(ext)
  137. return extensions
  138. def guess_extension(self, type, strict=True):
  139. """Guess the extension for a file based on its MIME type.
  140. Return value is a string giving a filename extension,
  141. including the leading dot ('.'). The extension is not
  142. guaranteed to have been associated with any particular data
  143. stream, but would be mapped to the MIME type `type' by
  144. guess_type(). If no extension can be guessed for `type', None
  145. is returned.
  146. Optional `strict' argument when false adds a bunch of commonly found,
  147. but non-standard types.
  148. """
  149. extensions = self.guess_all_extensions(type, strict)
  150. if not extensions:
  151. return None
  152. return extensions[0]
  153. def read(self, filename, strict=True):
  154. """
  155. Read a single mime.types-format file, specified by pathname.
  156. If strict is true, information will be added to
  157. list of standard types, else to the list of non-standard
  158. types.
  159. """
  160. fp = open(filename)
  161. self.readfp(fp, strict)
  162. fp.close()
  163. def readfp(self, fp, strict=True):
  164. """
  165. Read a single mime.types-format file.
  166. If strict is true, information will be added to
  167. list of standard types, else to the list of non-standard
  168. types.
  169. """
  170. while 1:
  171. line = fp.readline()
  172. if not line:
  173. break
  174. words = line.split()
  175. for i in range(len(words)):
  176. if words[i][0] == '#':
  177. del words[i:]
  178. break
  179. if not words:
  180. continue
  181. type, suffixes = words[0], words[1:]
  182. for suff in suffixes:
  183. self.add_type(type, '.' + suff, strict)
  184. def guess_type(url, strict=True):
  185. """Guess the type of a file based on its URL.
  186. Return value is a tuple (type, encoding) where type is None if the
  187. type can't be guessed (no or unknown suffix) or a string of the
  188. form type/subtype, usable for a MIME Content-type header; and
  189. encoding is None for no encoding or the name of the program used
  190. to encode (e.g. compress or gzip). The mappings are table
  191. driven. Encoding suffixes are case sensitive; type suffixes are
  192. first tried case sensitive, then case insensitive.
  193. The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped
  194. to ".tar.gz". (This is table-driven too, using the dictionary
  195. suffix_map).
  196. Optional `strict' argument when false adds a bunch of commonly found, but
  197. non-standard types.
  198. """
  199. init()
  200. return guess_type(url, strict)
  201. def guess_all_extensions(type, strict=True):
  202. """Guess the extensions for a file based on its MIME type.
  203. Return value is a list of strings giving the possible filename
  204. extensions, including the leading dot ('.'). The extension is not
  205. guaranteed to have been associated with any particular data
  206. stream, but would be mapped to the MIME type `type' by
  207. guess_type(). If no extension can be guessed for `type', None
  208. is returned.
  209. Optional `strict' argument when false adds a bunch of commonly found,
  210. but non-standard types.
  211. """
  212. init()
  213. return guess_all_extensions(type, strict)
  214. def guess_extension(type, strict=True):
  215. """Guess the extension for a file based on its MIME type.
  216. Return value is a string giving a filename extension, including the
  217. leading dot ('.'). The extension is not guaranteed to have been
  218. associated with any particular data stream, but would be mapped to the
  219. MIME type `type' by guess_type(). If no extension can be guessed for
  220. `type', None is returned.
  221. Optional `strict' argument when false adds a bunch of commonly found,
  222. but non-standard types.
  223. """
  224. init()
  225. return guess_extension(type, strict)
  226. def add_type(type, ext, strict=True):
  227. """Add a mapping between a type and an extension.
  228. When the extension is already known, the new
  229. type will replace the old one. When the type
  230. is already known the extension will be added
  231. to the list of known extensions.
  232. If strict is true, information will be added to
  233. list of standard types, else to the list of non-standard
  234. types.
  235. """
  236. init()
  237. return add_type(type, ext, strict)
  238. def init(files=None):
  239. global guess_all_extensions, guess_extension, guess_type
  240. global suffix_map, types_map, encodings_map, common_types
  241. global add_type, inited
  242. inited = True
  243. db = MimeTypes()
  244. if files is None:
  245. files = knownfiles
  246. for file in files:
  247. if os.path.isfile(file):
  248. db.readfp(open(file))
  249. encodings_map = db.encodings_map
  250. suffix_map = db.suffix_map
  251. types_map = db.types_map[True]
  252. guess_all_extensions = db.guess_all_extensions
  253. guess_extension = db.guess_extension
  254. guess_type = db.guess_type
  255. add_type = db.add_type
  256. common_types = db.types_map[False]
  257. def read_mime_types(file):
  258. try:
  259. f = open(file)
  260. except IOError:
  261. return None
  262. db = MimeTypes()
  263. db.readfp(f, True)
  264. return db.types_map[True]
  265. def _default_mime_types():
  266. global suffix_map
  267. global encodings_map
  268. global types_map
  269. global common_types
  270. suffix_map = {
  271. '.tgz': '.tar.gz',
  272. '.taz': '.tar.gz',
  273. '.tz': '.tar.gz',
  274. }
  275. encodings_map = {
  276. '.gz': 'gzip',
  277. '.Z': 'compress',
  278. }
  279. # Before adding new types, make sure they are either registered with IANA,
  280. # at http://www.isi.edu/in-notes/iana/assignments/media-types
  281. # or extensions, i.e. using the x- prefix
  282. # If you add to these, please keep them sorted!
  283. types_map = {
  284. '.a' : 'application/octet-stream',
  285. '.ai' : 'application/postscript',
  286. '.aif' : 'audio/x-aiff',
  287. '.aifc' : 'audio/x-aiff',
  288. '.aiff' : 'audio/x-aiff',
  289. '.au' : 'audio/basic',
  290. '.avi' : 'video/x-msvideo',
  291. '.bat' : 'text/plain',
  292. '.bcpio' : 'application/x-bcpio',
  293. '.bin' : 'application/octet-stream',
  294. '.bmp' : 'image/x-ms-bmp',
  295. '.c' : 'text/plain',
  296. # Duplicates :(
  297. '.cdf' : 'application/x-cdf',
  298. '.cdf' : 'application/x-netcdf',
  299. '.cpio' : 'application/x-cpio',
  300. '.csh' : 'application/x-csh',
  301. '.css' : 'text/css',
  302. '.dll' : 'application/octet-stream',
  303. '.doc' : 'application/msword',
  304. '.dot' : 'application/msword',
  305. '.dvi' : 'application/x-dvi',
  306. '.eml' : 'message/rfc822',
  307. '.eps' : 'application/postscript',
  308. '.etx' : 'text/x-setext',
  309. '.exe' : 'application/octet-stream',
  310. '.gif' : 'image/gif',
  311. '.gtar' : 'application/x-gtar',
  312. '.h' : 'text/plain',
  313. '.hdf' : 'application/x-hdf',
  314. '.htm' : 'text/html',
  315. '.html' : 'text/html',
  316. '.ief' : 'image/ief',
  317. '.jpe' : 'image/jpeg',
  318. '.jpeg' : 'image/jpeg',
  319. '.jpg' : 'image/jpeg',
  320. '.js' : 'application/x-javascript',
  321. '.ksh' : 'text/plain',
  322. '.latex' : 'application/x-latex',
  323. '.m1v' : 'video/mpeg',
  324. '.man' : 'application/x-troff-man',
  325. '.me' : 'application/x-troff-me',
  326. '.mht' : 'message/rfc822',
  327. '.mhtml' : 'message/rfc822',
  328. '.mif' : 'application/x-mif',
  329. '.mov' : 'video/quicktime',
  330. '.movie' : 'video/x-sgi-movie',
  331. '.mp2' : 'audio/mpeg',
  332. '.mp3' : 'audio/mpeg',
  333. '.mpa' : 'video/mpeg',
  334. '.mpe' : 'video/mpeg',
  335. '.mpeg' : 'video/mpeg',
  336. '.mpg' : 'video/mpeg',
  337. '.ms' : 'application/x-troff-ms',
  338. '.nc' : 'application/x-netcdf',
  339. '.nws' : 'message/rfc822',
  340. '.o' : 'application/octet-stream',
  341. '.obj' : 'application/octet-stream',
  342. '.oda' : 'application/oda',
  343. '.p12' : 'application/x-pkcs12',
  344. '.p7c' : 'application/pkcs7-mime',
  345. '.pbm' : 'image/x-portable-bitmap',
  346. '.pdf' : 'application/pdf',
  347. '.pfx' : 'application/x-pkcs12',
  348. '.pgm' : 'image/x-portable-graymap',
  349. '.pl' : 'text/plain',
  350. '.png' : 'image/png',
  351. '.pnm' : 'image/x-portable-anymap',
  352. '.pot' : 'application/vnd.ms-powerpoint',
  353. '.ppa' : 'application/vnd.ms-powerpoint',
  354. '.ppm' : 'image/x-portable-pixmap',
  355. '.pps' : 'application/vnd.ms-powerpoint',
  356. '.ppt' : 'application/vnd.ms-powerpoint',
  357. '.ps' : 'application/postscript',
  358. '.pwz' : 'application/vnd.ms-powerpoint',
  359. '.py' : 'text/x-python',
  360. '.pyc' : 'application/x-python-code',
  361. '.pyo' : 'application/x-python-code',
  362. '.qt' : 'video/quicktime',
  363. '.ra' : 'audio/x-pn-realaudio',
  364. '.ram' : 'application/x-pn-realaudio',
  365. '.ras' : 'image/x-cmu-raster',
  366. '.rdf' : 'application/xml',
  367. '.rgb' : 'image/x-rgb',
  368. '.roff' : 'application/x-troff',
  369. '.rtx' : 'text/richtext',
  370. '.sgm' : 'text/x-sgml',
  371. '.sgml' : 'text/x-sgml',
  372. '.sh' : 'application/x-sh',
  373. '.shar' : 'application/x-shar',
  374. '.snd' : 'audio/basic',
  375. '.so' : 'application/octet-stream',
  376. '.src' : 'application/x-wais-source',
  377. '.sv4cpio': 'application/x-sv4cpio',
  378. '.sv4crc' : 'application/x-sv4crc',
  379. '.swf' : 'application/x-shockwave-flash',
  380. '.t' : 'application/x-troff',
  381. '.tar' : 'application/x-tar',
  382. '.tcl' : 'application/x-tcl',
  383. '.tex' : 'application/x-tex',
  384. '.texi' : 'application/x-texinfo',
  385. '.texinfo': 'application/x-texinfo',
  386. '.tif' : 'image/tiff',
  387. '.tiff' : 'image/tiff',
  388. '.tr' : 'application/x-troff',
  389. '.tsv' : 'text/tab-separated-values',
  390. '.txt' : 'text/plain',
  391. '.ustar' : 'application/x-ustar',
  392. '.vcf' : 'text/x-vcard',
  393. '.wav' : 'audio/x-wav',
  394. '.wiz' : 'application/msword',
  395. '.xbm' : 'image/x-xbitmap',
  396. '.xlb' : 'application/vnd.ms-excel',
  397. # Duplicates :(
  398. '.xls' : 'application/excel',
  399. '.xls' : 'application/vnd.ms-excel',
  400. '.xml' : 'text/xml',
  401. '.xpm' : 'image/x-xpixmap',
  402. '.xsl' : 'application/xml',
  403. '.xwd' : 'image/x-xwindowdump',
  404. '.zip' : 'application/zip',
  405. }
  406. # These are non-standard types, commonly found in the wild. They will only
  407. # match if strict=0 flag is given to the API methods.
  408. # Please sort these too
  409. common_types = {
  410. '.jpg' : 'image/jpg',
  411. '.mid' : 'audio/midi',
  412. '.midi': 'audio/midi',
  413. '.pct' : 'image/pict',
  414. '.pic' : 'image/pict',
  415. '.pict': 'image/pict',
  416. '.rtf' : 'application/rtf',
  417. '.xul' : 'text/xul'
  418. }
  419. _default_mime_types()
  420. if __name__ == '__main__':
  421. import sys
  422. import getopt
  423. USAGE = """\
  424. Usage: mimetypes.py [options] type
  425. Options:
  426. --help / -h -- print this message and exit
  427. --lenient / -l -- additionally search of some common, but non-standard
  428. types.
  429. --extension / -e -- guess extension instead of type
  430. More than one type argument may be given.
  431. """
  432. def usage(code, msg=''):
  433. print USAGE
  434. if msg: print msg
  435. sys.exit(code)
  436. try:
  437. opts, args = getopt.getopt(sys.argv[1:], 'hle',
  438. ['help', 'lenient', 'extension'])
  439. except getopt.error, msg:
  440. usage(1, msg)
  441. strict = 1
  442. extension = 0
  443. for opt, arg in opts:
  444. if opt in ('-h', '--help'):
  445. usage(0)
  446. elif opt in ('-l', '--lenient'):
  447. strict = 0
  448. elif opt in ('-e', '--extension'):
  449. extension = 1
  450. for gtype in args:
  451. if extension:
  452. guess = guess_extension(gtype, strict)
  453. if not guess: print "I don't know anything about type", gtype
  454. else: print guess
  455. else:
  456. guess, encoding = guess_type(gtype, strict)
  457. if not guess: print "I don't know anything about type", gtype
  458. else: print 'type:', guess, 'encoding:', encoding