grid_file.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633
  1. # Copyright 2009-2015 MongoDB, Inc.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """Tools for representing files stored in GridFS."""
  15. import datetime
  16. import math
  17. import os
  18. from bson.binary import Binary
  19. from bson.objectid import ObjectId
  20. from bson.py3compat import text_type, StringIO
  21. from gridfs.errors import (CorruptGridFile,
  22. FileExists,
  23. NoFile)
  24. from pymongo import ASCENDING
  25. from pymongo.collection import Collection
  26. from pymongo.cursor import Cursor
  27. from pymongo.errors import ConfigurationError, DuplicateKeyError
  28. from pymongo.read_preferences import ReadPreference
  29. try:
  30. _SEEK_SET = os.SEEK_SET
  31. _SEEK_CUR = os.SEEK_CUR
  32. _SEEK_END = os.SEEK_END
  33. # before 2.5
  34. except AttributeError:
  35. _SEEK_SET = 0
  36. _SEEK_CUR = 1
  37. _SEEK_END = 2
  38. EMPTY = b""
  39. NEWLN = b"\n"
  40. """Default chunk size, in bytes."""
  41. # Slightly under a power of 2, to work well with server's record allocations.
  42. DEFAULT_CHUNK_SIZE = 255 * 1024
  43. def _grid_in_property(field_name, docstring, read_only=False,
  44. closed_only=False):
  45. """Create a GridIn property."""
  46. def getter(self):
  47. if closed_only and not self._closed:
  48. raise AttributeError("can only get %r on a closed file" %
  49. field_name)
  50. # Protect against PHP-237
  51. if field_name == 'length':
  52. return self._file.get(field_name, 0)
  53. return self._file.get(field_name, None)
  54. def setter(self, value):
  55. if self._closed:
  56. self._coll.files.update_one({"_id": self._file["_id"]},
  57. {"$set": {field_name: value}})
  58. self._file[field_name] = value
  59. if read_only:
  60. docstring += "\n\nThis attribute is read-only."
  61. elif closed_only:
  62. docstring = "%s\n\n%s" % (docstring, "This attribute is read-only and "
  63. "can only be read after :meth:`close` "
  64. "has been called.")
  65. if not read_only and not closed_only:
  66. return property(getter, setter, doc=docstring)
  67. return property(getter, doc=docstring)
  68. def _grid_out_property(field_name, docstring):
  69. """Create a GridOut property."""
  70. def getter(self):
  71. self._ensure_file()
  72. # Protect against PHP-237
  73. if field_name == 'length':
  74. return self._file.get(field_name, 0)
  75. return self._file.get(field_name, None)
  76. docstring += "\n\nThis attribute is read-only."
  77. return property(getter, doc=docstring)
  78. class GridIn(object):
  79. """Class to write data to GridFS.
  80. """
  81. def __init__(self, root_collection, **kwargs):
  82. """Write a file to GridFS
  83. Application developers should generally not need to
  84. instantiate this class directly - instead see the methods
  85. provided by :class:`~gridfs.GridFS`.
  86. Raises :class:`TypeError` if `root_collection` is not an
  87. instance of :class:`~pymongo.collection.Collection`.
  88. Any of the file level options specified in the `GridFS Spec
  89. <http://dochub.mongodb.org/core/gridfsspec>`_ may be passed as
  90. keyword arguments. Any additional keyword arguments will be
  91. set as additional fields on the file document. Valid keyword
  92. arguments include:
  93. - ``"_id"``: unique ID for this file (default:
  94. :class:`~bson.objectid.ObjectId`) - this ``"_id"`` must
  95. not have already been used for another file
  96. - ``"filename"``: human name for the file
  97. - ``"contentType"`` or ``"content_type"``: valid mime-type
  98. for the file
  99. - ``"chunkSize"`` or ``"chunk_size"``: size of each of the
  100. chunks, in bytes (default: 255 kb)
  101. - ``"encoding"``: encoding used for this file. In Python 2,
  102. any :class:`unicode` that is written to the file will be
  103. converted to a :class:`str`. In Python 3, any :class:`str`
  104. that is written to the file will be converted to
  105. :class:`bytes`.
  106. :Parameters:
  107. - `root_collection`: root collection to write to
  108. - `**kwargs` (optional): file level options (see above)
  109. .. versionchanged:: 3.0
  110. `root_collection` must use an acknowledged
  111. :attr:`~pymongo.collection.Collection.write_concern`
  112. """
  113. if not isinstance(root_collection, Collection):
  114. raise TypeError("root_collection must be an "
  115. "instance of Collection")
  116. # With w=0, 'filemd5' might run before the final chunks are written.
  117. if not root_collection.write_concern.acknowledged:
  118. raise ConfigurationError('root_collection must use '
  119. 'acknowledged write_concern')
  120. # Handle alternative naming
  121. if "content_type" in kwargs:
  122. kwargs["contentType"] = kwargs.pop("content_type")
  123. if "chunk_size" in kwargs:
  124. kwargs["chunkSize"] = kwargs.pop("chunk_size")
  125. # Defaults
  126. kwargs["_id"] = kwargs.get("_id", ObjectId())
  127. kwargs["chunkSize"] = kwargs.get("chunkSize", DEFAULT_CHUNK_SIZE)
  128. object.__setattr__(self, "_coll", root_collection)
  129. object.__setattr__(self, "_chunks", root_collection.chunks)
  130. object.__setattr__(self, "_file", kwargs)
  131. object.__setattr__(self, "_buffer", StringIO())
  132. object.__setattr__(self, "_position", 0)
  133. object.__setattr__(self, "_chunk_number", 0)
  134. object.__setattr__(self, "_closed", False)
  135. object.__setattr__(self, "_ensured_index", False)
  136. def _ensure_index(self):
  137. if not object.__getattribute__(self, "_ensured_index"):
  138. self._coll.chunks.create_index(
  139. [("files_id", ASCENDING), ("n", ASCENDING)],
  140. unique=True)
  141. object.__setattr__(self, "_ensured_index", True)
  142. @property
  143. def closed(self):
  144. """Is this file closed?
  145. """
  146. return self._closed
  147. _id = _grid_in_property("_id", "The ``'_id'`` value for this file.",
  148. read_only=True)
  149. filename = _grid_in_property("filename", "Name of this file.")
  150. name = _grid_in_property("filename", "Alias for `filename`.")
  151. content_type = _grid_in_property("contentType", "Mime-type for this file.")
  152. length = _grid_in_property("length", "Length (in bytes) of this file.",
  153. closed_only=True)
  154. chunk_size = _grid_in_property("chunkSize", "Chunk size for this file.",
  155. read_only=True)
  156. upload_date = _grid_in_property("uploadDate",
  157. "Date that this file was uploaded.",
  158. closed_only=True)
  159. md5 = _grid_in_property("md5", "MD5 of the contents of this file "
  160. "(generated on the server).",
  161. closed_only=True)
  162. def __getattr__(self, name):
  163. if name in self._file:
  164. return self._file[name]
  165. raise AttributeError("GridIn object has no attribute '%s'" % name)
  166. def __setattr__(self, name, value):
  167. # For properties of this instance like _buffer, or descriptors set on
  168. # the class like filename, use regular __setattr__
  169. if name in self.__dict__ or name in self.__class__.__dict__:
  170. object.__setattr__(self, name, value)
  171. else:
  172. # All other attributes are part of the document in db.fs.files.
  173. # Store them to be sent to server on close() or if closed, send
  174. # them now.
  175. self._file[name] = value
  176. if self._closed:
  177. self._coll.files.update_one({"_id": self._file["_id"]},
  178. {"$set": {name: value}})
  179. def __flush_data(self, data):
  180. """Flush `data` to a chunk.
  181. """
  182. # Ensure the index, even if there's nothing to write, so
  183. # the filemd5 command always succeeds.
  184. self._ensure_index()
  185. if not data:
  186. return
  187. assert(len(data) <= self.chunk_size)
  188. chunk = {"files_id": self._file["_id"],
  189. "n": self._chunk_number,
  190. "data": Binary(data)}
  191. try:
  192. self._chunks.insert_one(chunk)
  193. except DuplicateKeyError:
  194. self._raise_file_exists(self._file['_id'])
  195. self._chunk_number += 1
  196. self._position += len(data)
  197. def __flush_buffer(self):
  198. """Flush the buffer contents out to a chunk.
  199. """
  200. self.__flush_data(self._buffer.getvalue())
  201. self._buffer.close()
  202. self._buffer = StringIO()
  203. def __flush(self):
  204. """Flush the file to the database.
  205. """
  206. try:
  207. self.__flush_buffer()
  208. db = self._coll.database
  209. md5 = db.command(
  210. "filemd5", self._id, root=self._coll.name,
  211. read_preference=ReadPreference.PRIMARY)["md5"]
  212. self._file["md5"] = md5
  213. self._file["length"] = self._position
  214. self._file["uploadDate"] = datetime.datetime.utcnow()
  215. return self._coll.files.insert_one(self._file)
  216. except DuplicateKeyError:
  217. self._raise_file_exists(self._id)
  218. def _raise_file_exists(self, file_id):
  219. """Raise a FileExists exception for the given file_id."""
  220. raise FileExists("file with _id %r already exists" % file_id)
  221. def close(self):
  222. """Flush the file and close it.
  223. A closed file cannot be written any more. Calling
  224. :meth:`close` more than once is allowed.
  225. """
  226. if not self._closed:
  227. self.__flush()
  228. object.__setattr__(self, "_closed", True)
  229. def write(self, data):
  230. """Write data to the file. There is no return value.
  231. `data` can be either a string of bytes or a file-like object
  232. (implementing :meth:`read`). If the file has an
  233. :attr:`encoding` attribute, `data` can also be a
  234. :class:`unicode` (:class:`str` in python 3) instance, which
  235. will be encoded as :attr:`encoding` before being written.
  236. Due to buffering, the data may not actually be written to the
  237. database until the :meth:`close` method is called. Raises
  238. :class:`ValueError` if this file is already closed. Raises
  239. :class:`TypeError` if `data` is not an instance of
  240. :class:`str` (:class:`bytes` in python 3), a file-like object,
  241. or an instance of :class:`unicode` (:class:`str` in python 3).
  242. Unicode data is only allowed if the file has an :attr:`encoding`
  243. attribute.
  244. :Parameters:
  245. - `data`: string of bytes or file-like object to be written
  246. to the file
  247. """
  248. if self._closed:
  249. raise ValueError("cannot write to a closed file")
  250. try:
  251. # file-like
  252. read = data.read
  253. except AttributeError:
  254. # string
  255. if not isinstance(data, (text_type, bytes)):
  256. raise TypeError("can only write strings or file-like objects")
  257. if isinstance(data, text_type):
  258. try:
  259. data = data.encode(self.encoding)
  260. except AttributeError:
  261. raise TypeError("must specify an encoding for file in "
  262. "order to write %s" % (text_type.__name__,))
  263. read = StringIO(data).read
  264. if self._buffer.tell() > 0:
  265. # Make sure to flush only when _buffer is complete
  266. space = self.chunk_size - self._buffer.tell()
  267. if space:
  268. to_write = read(space)
  269. self._buffer.write(to_write)
  270. if len(to_write) < space:
  271. return # EOF or incomplete
  272. self.__flush_buffer()
  273. to_write = read(self.chunk_size)
  274. while to_write and len(to_write) == self.chunk_size:
  275. self.__flush_data(to_write)
  276. to_write = read(self.chunk_size)
  277. self._buffer.write(to_write)
  278. def writelines(self, sequence):
  279. """Write a sequence of strings to the file.
  280. Does not add seperators.
  281. """
  282. for line in sequence:
  283. self.write(line)
  284. def __enter__(self):
  285. """Support for the context manager protocol.
  286. """
  287. return self
  288. def __exit__(self, exc_type, exc_val, exc_tb):
  289. """Support for the context manager protocol.
  290. Close the file and allow exceptions to propagate.
  291. """
  292. self.close()
  293. # propagate exceptions
  294. return False
  295. class GridOut(object):
  296. """Class to read data out of GridFS.
  297. """
  298. def __init__(self, root_collection, file_id=None, file_document=None):
  299. """Read a file from GridFS
  300. Application developers should generally not need to
  301. instantiate this class directly - instead see the methods
  302. provided by :class:`~gridfs.GridFS`.
  303. Either `file_id` or `file_document` must be specified,
  304. `file_document` will be given priority if present. Raises
  305. :class:`TypeError` if `root_collection` is not an instance of
  306. :class:`~pymongo.collection.Collection`.
  307. :Parameters:
  308. - `root_collection`: root collection to read from
  309. - `file_id` (optional): value of ``"_id"`` for the file to read
  310. - `file_document` (optional): file document from
  311. `root_collection.files`
  312. .. versionchanged:: 3.0
  313. Creating a GridOut does not immediately retrieve the file metadata
  314. from the server. Metadata is fetched when first needed.
  315. """
  316. if not isinstance(root_collection, Collection):
  317. raise TypeError("root_collection must be an "
  318. "instance of Collection")
  319. self.__chunks = root_collection.chunks
  320. self.__files = root_collection.files
  321. self.__file_id = file_id
  322. self.__buffer = EMPTY
  323. self.__position = 0
  324. self._file = file_document
  325. _id = _grid_out_property("_id", "The ``'_id'`` value for this file.")
  326. filename = _grid_out_property("filename", "Name of this file.")
  327. name = _grid_out_property("filename", "Alias for `filename`.")
  328. content_type = _grid_out_property("contentType", "Mime-type for this file.")
  329. length = _grid_out_property("length", "Length (in bytes) of this file.")
  330. chunk_size = _grid_out_property("chunkSize", "Chunk size for this file.")
  331. upload_date = _grid_out_property("uploadDate",
  332. "Date that this file was first uploaded.")
  333. aliases = _grid_out_property("aliases", "List of aliases for this file.")
  334. metadata = _grid_out_property("metadata", "Metadata attached to this file.")
  335. md5 = _grid_out_property("md5", "MD5 of the contents of this file "
  336. "(generated on the server).")
  337. def _ensure_file(self):
  338. if not self._file:
  339. self._file = self.__files.find_one({"_id": self.__file_id})
  340. if not self._file:
  341. raise NoFile("no file in gridfs collection %r with _id %r" %
  342. (self.__files, self.__file_id))
  343. def __getattr__(self, name):
  344. self._ensure_file()
  345. if name in self._file:
  346. return self._file[name]
  347. raise AttributeError("GridOut object has no attribute '%s'" % name)
  348. def readchunk(self):
  349. """Reads a chunk at a time. If the current position is within a
  350. chunk the remainder of the chunk is returned.
  351. """
  352. received = len(self.__buffer)
  353. chunk_data = EMPTY
  354. chunk_size = int(self.chunk_size)
  355. if received > 0:
  356. chunk_data = self.__buffer
  357. elif self.__position < int(self.length):
  358. chunk_number = int((received + self.__position) / chunk_size)
  359. chunk = self.__chunks.find_one({"files_id": self._id,
  360. "n": chunk_number})
  361. if not chunk:
  362. raise CorruptGridFile("no chunk #%d" % chunk_number)
  363. chunk_data = chunk["data"][self.__position % chunk_size:]
  364. self.__position += len(chunk_data)
  365. self.__buffer = EMPTY
  366. return chunk_data
  367. def read(self, size=-1):
  368. """Read at most `size` bytes from the file (less if there
  369. isn't enough data).
  370. The bytes are returned as an instance of :class:`str` (:class:`bytes`
  371. in python 3). If `size` is negative or omitted all data is read.
  372. :Parameters:
  373. - `size` (optional): the number of bytes to read
  374. """
  375. self._ensure_file()
  376. if size == 0:
  377. return EMPTY
  378. remainder = int(self.length) - self.__position
  379. if size < 0 or size > remainder:
  380. size = remainder
  381. received = 0
  382. data = StringIO()
  383. while received < size:
  384. chunk_data = self.readchunk()
  385. received += len(chunk_data)
  386. data.write(chunk_data)
  387. self.__position -= received - size
  388. # Return 'size' bytes and store the rest.
  389. data.seek(size)
  390. self.__buffer = data.read()
  391. data.seek(0)
  392. return data.read(size)
  393. def readline(self, size=-1):
  394. """Read one line or up to `size` bytes from the file.
  395. :Parameters:
  396. - `size` (optional): the maximum number of bytes to read
  397. """
  398. if size == 0:
  399. return b''
  400. remainder = int(self.length) - self.__position
  401. if size < 0 or size > remainder:
  402. size = remainder
  403. received = 0
  404. data = StringIO()
  405. while received < size:
  406. chunk_data = self.readchunk()
  407. pos = chunk_data.find(NEWLN, 0, size)
  408. if pos != -1:
  409. size = received + pos + 1
  410. received += len(chunk_data)
  411. data.write(chunk_data)
  412. if pos != -1:
  413. break
  414. self.__position -= received - size
  415. # Return 'size' bytes and store the rest.
  416. data.seek(size)
  417. self.__buffer = data.read()
  418. data.seek(0)
  419. return data.read(size)
  420. def tell(self):
  421. """Return the current position of this file.
  422. """
  423. return self.__position
  424. def seek(self, pos, whence=_SEEK_SET):
  425. """Set the current position of this file.
  426. :Parameters:
  427. - `pos`: the position (or offset if using relative
  428. positioning) to seek to
  429. - `whence` (optional): where to seek
  430. from. :attr:`os.SEEK_SET` (``0``) for absolute file
  431. positioning, :attr:`os.SEEK_CUR` (``1``) to seek relative
  432. to the current position, :attr:`os.SEEK_END` (``2``) to
  433. seek relative to the file's end.
  434. """
  435. if whence == _SEEK_SET:
  436. new_pos = pos
  437. elif whence == _SEEK_CUR:
  438. new_pos = self.__position + pos
  439. elif whence == _SEEK_END:
  440. new_pos = int(self.length) + pos
  441. else:
  442. raise IOError(22, "Invalid value for `whence`")
  443. if new_pos < 0:
  444. raise IOError(22, "Invalid value for `pos` - must be positive")
  445. self.__position = new_pos
  446. self.__buffer = EMPTY
  447. def __iter__(self):
  448. """Return an iterator over all of this file's data.
  449. The iterator will return chunk-sized instances of
  450. :class:`str` (:class:`bytes` in python 3). This can be
  451. useful when serving files using a webserver that handles
  452. such an iterator efficiently.
  453. """
  454. return GridOutIterator(self, self.__chunks)
  455. def close(self):
  456. """Make GridOut more generically file-like."""
  457. pass
  458. def __enter__(self):
  459. """Makes it possible to use :class:`GridOut` files
  460. with the context manager protocol.
  461. """
  462. return self
  463. def __exit__(self, exc_type, exc_val, exc_tb):
  464. """Makes it possible to use :class:`GridOut` files
  465. with the context manager protocol.
  466. """
  467. return False
  468. class GridOutIterator(object):
  469. def __init__(self, grid_out, chunks):
  470. self.__id = grid_out._id
  471. self.__chunks = chunks
  472. self.__current_chunk = 0
  473. self.__max_chunk = math.ceil(float(grid_out.length) /
  474. grid_out.chunk_size)
  475. def __iter__(self):
  476. return self
  477. def next(self):
  478. if self.__current_chunk >= self.__max_chunk:
  479. raise StopIteration
  480. chunk = self.__chunks.find_one({"files_id": self.__id,
  481. "n": self.__current_chunk})
  482. if not chunk:
  483. raise CorruptGridFile("no chunk #%d" % self.__current_chunk)
  484. self.__current_chunk += 1
  485. return bytes(chunk["data"])
  486. __next__ = next
  487. class GridOutCursor(Cursor):
  488. """A cursor / iterator for returning GridOut objects as the result
  489. of an arbitrary query against the GridFS files collection.
  490. """
  491. def __init__(self, collection, filter=None, skip=0, limit=0,
  492. no_cursor_timeout=False, sort=None):
  493. """Create a new cursor, similar to the normal
  494. :class:`~pymongo.cursor.Cursor`.
  495. Should not be called directly by application developers - see
  496. the :class:`~gridfs.GridFS` method :meth:`~gridfs.GridFS.find` instead.
  497. .. versionadded 2.7
  498. .. mongodoc:: cursors
  499. """
  500. # Hold on to the base "fs" collection to create GridOut objects later.
  501. self.__root_collection = collection
  502. super(GridOutCursor, self).__init__(
  503. collection.files, filter, skip=skip, limit=limit,
  504. no_cursor_timeout=no_cursor_timeout, sort=sort)
  505. def next(self):
  506. """Get next GridOut object from cursor.
  507. """
  508. # Work around "super is not iterable" issue in Python 3.x
  509. next_file = super(GridOutCursor, self).next()
  510. return GridOut(self.__root_collection, file_document=next_file)
  511. __next__ = next
  512. def add_option(self, *args, **kwargs):
  513. raise NotImplementedError("Method does not exist for GridOutCursor")
  514. def remove_option(self, *args, **kwargs):
  515. raise NotImplementedError("Method does not exist for GridOutCursor")
  516. def _clone_base(self):
  517. """Creates an empty GridOutCursor for information to be copied into.
  518. """
  519. return GridOutCursor(self.__root_collection)