Diffstat (limited to 'webapp/django/http/multipartparser.py')
-rw-r--r-- | webapp/django/http/multipartparser.py | 639
1 files changed, 639 insertions, 0 deletions
diff --git a/webapp/django/http/multipartparser.py b/webapp/django/http/multipartparser.py
new file mode 100644
index 0000000000..2049289a0b
--- /dev/null
+++ b/webapp/django/http/multipartparser.py
@@ -0,0 +1,639 @@
+"""
+Multi-part parsing for file uploads.
+
+Exposes one class, ``MultiPartParser``, which feeds chunks of uploaded data to
+file upload handlers for processing.
+"""
+
+import cgi
+from django.conf import settings
+from django.core.exceptions import SuspiciousOperation
+from django.utils.datastructures import MultiValueDict
+from django.utils.encoding import force_unicode
+from django.utils.text import unescape_entities
+from django.core.files.uploadhandler import StopUpload, SkipFile, StopFutureHandlers
+
+__all__ = ('MultiPartParser', 'MultiPartParserError', 'InputStreamExhausted')
+
+class MultiPartParserError(Exception):
+ pass
+
+class InputStreamExhausted(Exception):
+ """
+ No more reads are allowed from this device.
+ """
+ pass
+
+RAW = "raw"
+FILE = "file"
+FIELD = "field"
+
+class MultiPartParser(object):
+ """
+    An RFC 2388 multipart/form-data parser.
+
+    ``MultiPartParser.parse()`` reads the input stream in ``chunk_size`` chunks
+    and returns a tuple of ``(MultiValueDict(POST), MultiValueDict(FILES))``.
+ """
+ def __init__(self, META, input_data, upload_handlers, encoding=None):
+ """
+ Initialize the MultiPartParser object.
+
+ :META:
+ The standard ``META`` dictionary in Django request objects.
+ :input_data:
+ The raw post data, as a bytestring.
+        :upload_handlers:
+            A list of UploadHandler instances that perform operations on the
+            uploaded data.
+ :encoding:
+ The encoding with which to treat the incoming data.
+ """
+
+ #
+        # Content-Type should contain multipart and the boundary information.
+ #
+
+ content_type = META.get('HTTP_CONTENT_TYPE', META.get('CONTENT_TYPE', ''))
+ if not content_type.startswith('multipart/'):
+ raise MultiPartParserError('Invalid Content-Type: %s' % content_type)
+
+ # Parse the header to get the boundary to split the parts.
+ ctypes, opts = parse_header(content_type)
+ boundary = opts.get('boundary')
+ if not boundary or not cgi.valid_boundary(boundary):
+ raise MultiPartParserError('Invalid boundary in multipart: %s' % boundary)
+
+
+ #
+ # Content-Length should contain the length of the body we are about
+ # to receive.
+ #
+ try:
+ content_length = int(META.get('HTTP_CONTENT_LENGTH', META.get('CONTENT_LENGTH',0)))
+ except (ValueError, TypeError):
+            # For now set it to 0; the check below will then reject the request.
+ content_length = 0
+
+ if content_length <= 0:
+ # This means we shouldn't continue...raise an error.
+ raise MultiPartParserError("Invalid content length: %r" % content_length)
+
+ self._boundary = boundary
+ self._input_data = input_data
+
+ # For compatibility with low-level network APIs (with 32-bit integers),
+ # the chunk size should be < 2^31, but still divisible by 4.
+ self._chunk_size = min(2**31-4, *[x.chunk_size for x in upload_handlers if x.chunk_size])
+
+ self._meta = META
+ self._encoding = encoding or settings.DEFAULT_CHARSET
+ self._content_length = content_length
+ self._upload_handlers = upload_handlers
+
+ def parse(self):
+ """
+ Parse the POST data and break it into a FILES MultiValueDict and a POST
+ MultiValueDict.
+
+        Returns a tuple containing the POST and FILES dictionaries, respectively.
+ """
+ # We have to import QueryDict down here to avoid a circular import.
+ from django.http import QueryDict
+
+ encoding = self._encoding
+ handlers = self._upload_handlers
+
+ limited_input_data = LimitBytes(self._input_data, self._content_length)
+
+ # See if the handler will want to take care of the parsing.
+ # This allows overriding everything if somebody wants it.
+ for handler in handlers:
+ result = handler.handle_raw_input(limited_input_data,
+ self._meta,
+ self._content_length,
+ self._boundary,
+ encoding)
+ if result is not None:
+ return result[0], result[1]
+
+ # Create the data structures to be used later.
+ self._post = QueryDict('', mutable=True)
+ self._files = MultiValueDict()
+
+ # Instantiate the parser and stream:
+ stream = LazyStream(ChunkIter(limited_input_data, self._chunk_size))
+
+ # Whether or not to signal a file-completion at the beginning of the loop.
+ old_field_name = None
+ counters = [0] * len(handlers)
+
+ try:
+ for item_type, meta_data, field_stream in Parser(stream, self._boundary):
+ if old_field_name:
+ # We run this at the beginning of the next loop
+ # since we cannot be sure a file is complete until
+ # we hit the next boundary/part of the multipart content.
+ self.handle_file_complete(old_field_name, counters)
+ old_field_name = None
+
+ try:
+ disposition = meta_data['content-disposition'][1]
+ field_name = disposition['name'].strip()
+ except (KeyError, IndexError, AttributeError):
+ continue
+
+ transfer_encoding = meta_data.get('content-transfer-encoding')
+ field_name = force_unicode(field_name, encoding, errors='replace')
+
+ if item_type == FIELD:
+ # This is a post field, we can just set it in the post
+ if transfer_encoding == 'base64':
+ raw_data = field_stream.read()
+ try:
+ data = str(raw_data).decode('base64')
+ except:
+ data = raw_data
+ else:
+ data = field_stream.read()
+
+ self._post.appendlist(field_name,
+ force_unicode(data, encoding, errors='replace'))
+ elif item_type == FILE:
+ # This is a file, use the handler...
+ file_name = disposition.get('filename')
+ if not file_name:
+ continue
+ file_name = force_unicode(file_name, encoding, errors='replace')
+ file_name = self.IE_sanitize(unescape_entities(file_name))
+
+ content_type = meta_data.get('content-type', ('',))[0].strip()
+ try:
+ charset = meta_data.get('content-type', (0,{}))[1].get('charset', None)
+ except:
+ charset = None
+
+ try:
+ content_length = int(meta_data.get('content-length')[0])
+ except (IndexError, TypeError, ValueError):
+ content_length = None
+
+ counters = [0] * len(handlers)
+ try:
+ for handler in handlers:
+ try:
+ handler.new_file(field_name, file_name,
+ content_type, content_length,
+ charset)
+ except StopFutureHandlers:
+ break
+
+ for chunk in field_stream:
+ if transfer_encoding == 'base64':
+ # We only special-case base64 transfer encoding
+ try:
+ chunk = str(chunk).decode('base64')
+ except Exception, e:
+ # Since this is only a chunk, any error is an unfixable error.
+ raise MultiPartParserError("Could not decode base64 data: %r" % e)
+
+ for i, handler in enumerate(handlers):
+ chunk_length = len(chunk)
+ chunk = handler.receive_data_chunk(chunk,
+ counters[i])
+ counters[i] += chunk_length
+ if chunk is None:
+ # If the chunk received by the handler is None, then don't continue.
+ break
+
+ except SkipFile, e:
+ # Just use up the rest of this file...
+ exhaust(field_stream)
+ else:
+ # Handle file upload completions on next iteration.
+ old_field_name = field_name
+ else:
+                    # If this is neither a FIELD nor a FILE, just exhaust the stream.
+ exhaust(stream)
+ except StopUpload, e:
+ if not e.connection_reset:
+ exhaust(limited_input_data)
+ else:
+ # Make sure that the request data is all fed
+ exhaust(limited_input_data)
+
+ # Signal that the upload has completed.
+ for handler in handlers:
+ retval = handler.upload_complete()
+ if retval:
+ break
+
+ return self._post, self._files
+
+ def handle_file_complete(self, old_field_name, counters):
+ """
+ Handle all the signalling that takes place when a file is complete.
+ """
+ for i, handler in enumerate(self._upload_handlers):
+ file_obj = handler.file_complete(counters[i])
+ if file_obj:
+ # If it returns a file object, then set the files dict.
+ self._files.appendlist(force_unicode(old_field_name,
+ self._encoding,
+ errors='replace'),
+ file_obj)
+ break
+
+ def IE_sanitize(self, filename):
+ """Cleanup filename from Internet Explorer full paths."""
+ return filename and filename[filename.rfind("\\")+1:].strip()
+
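+# A rough usage sketch: parse() is normally driven by Django's request
+# handling, but the class can be exercised directly given a WSGI-style META
+# dict, the raw body stream and a list of upload handlers. The names
+# ``environ_meta`` and ``wsgi_input`` below are hypothetical placeholders;
+# MemoryFileUploadHandler is assumed from django.core.files.uploadhandler.
+#
+#     >>> from django.core.files.uploadhandler import MemoryFileUploadHandler
+#     >>> parser = MultiPartParser(environ_meta, wsgi_input,
+#     ...                          [MemoryFileUploadHandler()], encoding='utf-8')
+#     >>> post, files = parser.parse()   # (QueryDict, MultiValueDict)
+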
+class LazyStream(object):
+ """
+ The LazyStream wrapper allows one to get and "unget" bytes from a stream.
+
+ Given a producer object (an iterator that yields bytestrings), the
+ LazyStream object will support iteration, reading, and keeping a "look-back"
+ variable in case you need to "unget" some bytes.
+ """
+ def __init__(self, producer, length=None):
+ """
+ Every LazyStream must have a producer when instantiated.
+
+        A producer is an iterator that yields a bytestring each time it
+        is advanced.
+ """
+ self._producer = producer
+ self._empty = False
+ self._leftover = ''
+ self.length = length
+ self.position = 0
+ self._remaining = length
+ self._unget_history = []
+
+ def tell(self):
+ return self.position
+
+ def read(self, size=None):
+ def parts():
+ remaining = (size is not None and [size] or [self._remaining])[0]
+ # do the whole thing in one shot if no limit was provided.
+ if remaining is None:
+ yield ''.join(self)
+ return
+
+            # otherwise do some bookkeeping to return exactly enough
+            # of the stream, stashing any extra content we get from
+            # the producer
+ while remaining != 0:
+ assert remaining > 0, 'remaining bytes to read should never go negative'
+
+ chunk = self.next()
+
+ emitting = chunk[:remaining]
+ self.unget(chunk[remaining:])
+ remaining -= len(emitting)
+ yield emitting
+
+ out = ''.join(parts())
+ return out
+
+ def next(self):
+ """
+ Used when the exact number of bytes to read is unimportant.
+
+        This just returns whatever chunk is conveniently yielded by the
+        producer instead. Useful to avoid unnecessary bookkeeping if
+        performance is an issue.
+ """
+ if self._leftover:
+ output = self._leftover
+ self._leftover = ''
+ else:
+ output = self._producer.next()
+ self._unget_history = []
+ self.position += len(output)
+ return output
+
+ def close(self):
+ """
+ Used to invalidate/disable this lazy stream.
+
+        Replaces the producer with an empty list. Any leftover bytes that have
+        been ungotten will still be reported upon read() and/or next().
+ """
+ self._producer = []
+
+ def __iter__(self):
+ return self
+
+ def unget(self, bytes):
+ """
+ Places bytes back onto the front of the lazy stream.
+
+ Future calls to read() will return those bytes first. The
+ stream position and thus tell() will be rewound.
+ """
+ if not bytes:
+ return
+ self._update_unget_history(len(bytes))
+ self.position -= len(bytes)
+ self._leftover = ''.join([bytes, self._leftover])
+
+ def _update_unget_history(self, num_bytes):
+ """
+        Updates the unget history as a sanity check to see if we keep pushing
+        back the same number of bytes. If we unget the same number of bytes in
+        more than 40 of the last 50 ungets, we're most likely in an infinite
+        loop of some sort. This is usually caused by a
+        maliciously-malformed MIME request.
+ """
+ self._unget_history = [num_bytes] + self._unget_history[:49]
+ number_equal = len([current_number for current_number in self._unget_history
+ if current_number == num_bytes])
+
+ if number_equal > 40:
+ raise SuspiciousOperation(
+ "The multipart parser got stuck, which shouldn't happen with"
+ " normal uploaded files. Check for malicious upload activity;"
+ " if there is none, report this to the Django developers."
+ )
+
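+# A short illustrative sketch of the read/unget behaviour, using a made-up
+# one-chunk producer:
+#
+#     >>> stream = LazyStream(iter(['abcdef']))
+#     >>> stream.read(3)
+#     'abc'
+#     >>> stream.unget('abc')     # push the bytes back; tell() is rewound
+#     >>> stream.read()
+#     'abcdef'
+#     >>> stream.tell()
+#     6
+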
+class ChunkIter(object):
+ """
+    An iterable that will yield chunks of data. Given a file-like object as its
+    constructor argument, this object will yield the results of successive read
+    operations on that object.
+ """
+ def __init__(self, flo, chunk_size=64 * 1024):
+ self.flo = flo
+ self.chunk_size = chunk_size
+
+ def next(self):
+ try:
+ data = self.flo.read(self.chunk_size)
+ except InputStreamExhausted:
+ raise StopIteration()
+ if data:
+ return data
+ else:
+ raise StopIteration()
+
+ def __iter__(self):
+ return self
+
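+# For example (sketch; StringIO stands in for any file-like object):
+#
+#     >>> from StringIO import StringIO
+#     >>> list(ChunkIter(StringIO('abcdef'), chunk_size=4))
+#     ['abcd', 'ef']
+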
+class LimitBytes(object):
+ """ Limit bytes for a file object. """
+ def __init__(self, fileobject, length):
+ self._file = fileobject
+ self.remaining = length
+
+ def read(self, num_bytes=None):
+ """
+        Read data from the underlying file.
+        If there isn't anything left to read, this raises an
+        InputStreamExhausted error; reads past the limit are truncated
+        to the number of bytes remaining.
+ """
+ if self.remaining <= 0:
+ raise InputStreamExhausted()
+ if num_bytes is None:
+ num_bytes = self.remaining
+ else:
+ num_bytes = min(num_bytes, self.remaining)
+ self.remaining -= num_bytes
+ return self._file.read(num_bytes)
+
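+# For example (sketch, reusing StringIO from above): reads are capped at the
+# given length, and reading past it raises InputStreamExhausted.
+#
+#     >>> limited = LimitBytes(StringIO('abcdefgh'), 4)
+#     >>> limited.read()
+#     'abcd'
+#     >>> limited.read()
+#     Traceback (most recent call last):
+#         ...
+#     InputStreamExhausted
+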
+class InterBoundaryIter(object):
+ """
+ A Producer that will iterate over boundaries.
+ """
+ def __init__(self, stream, boundary):
+ self._stream = stream
+ self._boundary = boundary
+
+ def __iter__(self):
+ return self
+
+ def next(self):
+ try:
+ return LazyStream(BoundaryIter(self._stream, self._boundary))
+ except InputStreamExhausted:
+ raise StopIteration()
+
+class BoundaryIter(object):
+ """
+ A Producer that is sensitive to boundaries.
+
+ Will happily yield bytes until a boundary is found. Will yield the bytes
+ before the boundary, throw away the boundary bytes themselves, and push the
+ post-boundary bytes back on the stream.
+
+    Future calls to .next() after the boundary has been located will raise a
+    StopIteration exception.
+ """
+
+ def __init__(self, stream, boundary):
+ self._stream = stream
+ self._boundary = boundary
+ self._done = False
+ # rollback an additional six bytes because the format is like
+ # this: CRLF<boundary>[--CRLF]
+ self._rollback = len(boundary) + 6
+
+ # Try to use mx fast string search if available. Otherwise
+ # use Python find. Wrap the latter for consistency.
+ unused_char = self._stream.read(1)
+ if not unused_char:
+ raise InputStreamExhausted()
+ self._stream.unget(unused_char)
+ try:
+ from mx.TextTools import FS
+ self._fs = FS(boundary).find
+ except ImportError:
+ self._fs = lambda data: data.find(boundary)
+
+ def __iter__(self):
+ return self
+
+ def next(self):
+ if self._done:
+ raise StopIteration()
+
+ stream = self._stream
+ rollback = self._rollback
+
+ bytes_read = 0
+ chunks = []
+ for bytes in stream:
+ bytes_read += len(bytes)
+ chunks.append(bytes)
+ if bytes_read > rollback:
+ break
+ if not bytes:
+ break
+ else:
+ self._done = True
+
+ if not chunks:
+ raise StopIteration()
+
+ chunk = ''.join(chunks)
+ boundary = self._find_boundary(chunk, len(chunk) < self._rollback)
+
+ if boundary:
+ end, next = boundary
+ stream.unget(chunk[next:])
+ self._done = True
+ return chunk[:end]
+ else:
+            # make sure we don't treat a partial boundary (and
+            # its separators) as data
+            if not chunk[:-rollback]:
+ # There's nothing left, we should just return and mark as done.
+ self._done = True
+ return chunk
+ else:
+ stream.unget(chunk[-rollback:])
+ return chunk[:-rollback]
+
+ def _find_boundary(self, data, eof = False):
+ """
+ Finds a multipart boundary in data.
+
+        Should no boundary exist in the data, None is returned. Otherwise, a
+        tuple containing the indices of the following is returned:
+
+ * the end of current encapsulation
+ * the start of the next encapsulation
+ """
+ index = self._fs(data)
+ if index < 0:
+ return None
+ else:
+ end = index
+ next = index + len(self._boundary)
+ # backup over CRLF
+ if data[max(0,end-1)] == '\n':
+ end -= 1
+ if data[max(0,end-1)] == '\r':
+ end -= 1
+ return end, next
+
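+# A rough sketch of the behaviour (boundary and payload are made up): bytes
+# before the boundary are yielded, the boundary itself is dropped, and
+# everything after it is pushed back onto the wrapped stream.
+#
+#     >>> stream = LazyStream(iter(['first part\r\n--split\r\nsecond part']))
+#     >>> ''.join(BoundaryIter(stream, '--split'))
+#     'first part'
+#     >>> stream.read()     # post-boundary bytes (including the CRLF) remain
+#     '\r\nsecond part'
+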
+def exhaust(stream_or_iterable):
+ """
+ Completely exhausts an iterator or stream.
+
+ Raise a MultiPartParserError if the argument is not a stream or an iterable.
+ """
+ iterator = None
+ try:
+ iterator = iter(stream_or_iterable)
+ except TypeError:
+ iterator = ChunkIter(stream_or_iterable, 16384)
+
+ if iterator is None:
+ raise MultiPartParserError('multipartparser.exhaust() was passed a non-iterable or stream parameter')
+
+ for __ in iterator:
+ pass
+
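+# For example (sketch):
+#
+#     >>> it = iter(['a', 'b'])
+#     >>> exhaust(it)
+#     >>> list(it)
+#     []
+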
+def parse_boundary_stream(stream, max_header_size):
+ """
+ Parses one and exactly one stream that encapsulates a boundary.
+ """
+ # Stream at beginning of header, look for end of header
+ # and parse it if found. The header must fit within one
+ # chunk.
+ chunk = stream.read(max_header_size)
+
+    # 'find' returns the index of the start of these four bytes, so we'll
+    # need to munch them later to prevent them from polluting
+    # the payload.
+ header_end = chunk.find('\r\n\r\n')
+
+ def _parse_header(line):
+ main_value_pair, params = parse_header(line)
+ try:
+ name, value = main_value_pair.split(':', 1)
+ except:
+ raise ValueError("Invalid header: %r" % line)
+ return name, (value, params)
+
+ if header_end == -1:
+        # we found no header, so just mark this fact and pass on
+        # the stream verbatim
+ stream.unget(chunk)
+ return (RAW, {}, stream)
+
+ header = chunk[:header_end]
+
+ # here we place any excess chunk back onto the stream, as
+ # well as throwing away the CRLFCRLF bytes from above.
+ stream.unget(chunk[header_end + 4:])
+
+ TYPE = RAW
+ outdict = {}
+
+ # Eliminate blank lines
+ for line in header.split('\r\n'):
+ # This terminology ("main value" and "dictionary of
+ # parameters") is from the Python docs.
+ try:
+ name, (value, params) = _parse_header(line)
+ except:
+ continue
+
+ if name == 'content-disposition':
+ TYPE = FIELD
+ if params.get('filename'):
+ TYPE = FILE
+
+ outdict[name] = value, params
+
+ if TYPE == RAW:
+ stream.unget(chunk)
+
+ return (TYPE, outdict, stream)
+
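+# A rough sketch (hypothetical single-part stream): the headers are parsed off
+# the front of the stream and the remainder is left for the caller to read.
+#
+#     >>> part = LazyStream(iter(
+#     ...     ['Content-Disposition: form-data; name="q"\r\n\r\nvalue']))
+#     >>> item_type, headers, rest = parse_boundary_stream(part, 1024)
+#     >>> item_type, rest.read()
+#     ('field', 'value')
+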
+class Parser(object):
+ def __init__(self, stream, boundary):
+ self._stream = stream
+ self._separator = '--' + boundary
+
+ def __iter__(self):
+ boundarystream = InterBoundaryIter(self._stream, self._separator)
+ for sub_stream in boundarystream:
+ # Iterate over each part
+ yield parse_boundary_stream(sub_stream, 1024)
+
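+# A rough usage sketch (hypothetical body; 'bndry' is the raw boundary from
+# the Content-Type header, without the leading '--'). Each boundary-delimited
+# section comes back as an (item_type, headers, stream) triple; the empty
+# preamble and the closing '--' epilogue show up as RAW items, which
+# MultiPartParser.parse() simply exhausts.
+#
+#     body = ('--bndry\r\n'
+#             'Content-Disposition: form-data; name="title"\r\n'
+#             '\r\n'
+#             'hello\r\n'
+#             '--bndry--\r\n')
+#     for item_type, meta, sub_stream in Parser(LazyStream(iter([body])), 'bndry'):
+#         print item_type    # 'raw', 'field', 'raw'
+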
+def parse_header(line):
+ """ Parse the header into a key-value. """
+ plist = _parse_header_params(';' + line)
+ key = plist.pop(0).lower()
+ pdict = {}
+ for p in plist:
+ i = p.find('=')
+ if i >= 0:
+ name = p[:i].strip().lower()
+ value = p[i+1:].strip()
+ if len(value) >= 2 and value[0] == value[-1] == '"':
+ value = value[1:-1]
+ value = value.replace('\\\\', '\\').replace('\\"', '"')
+ pdict[name] = value
+ return key, pdict
+
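+# For example (sketch): quoting is honoured, so semicolons inside a quoted
+# parameter value do not split the header.
+#
+#     >>> parse_header('form-data; name="files"; filename="a;b.txt"')
+#     ('form-data', {'name': 'files', 'filename': 'a;b.txt'})
+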
+def _parse_header_params(s):
+ plist = []
+ while s[:1] == ';':
+ s = s[1:]
+ end = s.find(';')
+ while end > 0 and s.count('"', 0, end) % 2:
+ end = s.find(';', end + 1)
+ if end < 0:
+ end = len(s)
+ f = s[:end]
+ plist.append(f.strip())
+ s = s[end:]
+ return plist