multipart.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323
  1. from __future__ import annotations
  2. import re
  3. import typing as t
  4. from dataclasses import dataclass
  5. from enum import auto
  6. from enum import Enum
  7. from ..datastructures import Headers
  8. from ..exceptions import RequestEntityTooLarge
  9. from ..http import parse_options_header
  10. class Event:
  11. pass
  12. @dataclass(frozen=True)
  13. class Preamble(Event):
  14. data: bytes
  15. @dataclass(frozen=True)
  16. class Field(Event):
  17. name: str
  18. headers: Headers
  19. @dataclass(frozen=True)
  20. class File(Event):
  21. name: str
  22. filename: str
  23. headers: Headers
  24. @dataclass(frozen=True)
  25. class Data(Event):
  26. data: bytes
  27. more_data: bool
  28. @dataclass(frozen=True)
  29. class Epilogue(Event):
  30. data: bytes
  31. class NeedData(Event):
  32. pass
  33. NEED_DATA = NeedData()
  34. class State(Enum):
  35. PREAMBLE = auto()
  36. PART = auto()
  37. DATA = auto()
  38. DATA_START = auto()
  39. EPILOGUE = auto()
  40. COMPLETE = auto()
  41. # Multipart line breaks MUST be CRLF (\r\n) by RFC-7578, except that
  42. # many implementations break this and either use CR or LF alone.
  43. LINE_BREAK = b"(?:\r\n|\n|\r)"
  44. BLANK_LINE_RE = re.compile(b"(?:\r\n\r\n|\r\r|\n\n)", re.MULTILINE)
  45. LINE_BREAK_RE = re.compile(LINE_BREAK, re.MULTILINE)
  46. # Header values can be continued via a space or tab after the linebreak, as
  47. # per RFC2231
  48. HEADER_CONTINUATION_RE = re.compile(b"%s[ \t]" % LINE_BREAK, re.MULTILINE)
  49. # This must be long enough to contain any line breaks plus any
  50. # additional boundary markers (--) such that they will be found in a
  51. # subsequent search
  52. SEARCH_EXTRA_LENGTH = 8
  53. class MultipartDecoder:
  54. """Decodes a multipart message as bytes into Python events.
  55. The part data is returned as available to allow the caller to save
  56. the data from memory to disk, if desired.
  57. """
  58. def __init__(
  59. self,
  60. boundary: bytes,
  61. max_form_memory_size: int | None = None,
  62. *,
  63. max_parts: int | None = None,
  64. ) -> None:
  65. self.buffer = bytearray()
  66. self.complete = False
  67. self.max_form_memory_size = max_form_memory_size
  68. self.max_parts = max_parts
  69. self.state = State.PREAMBLE
  70. self.boundary = boundary
  71. # Note in the below \h i.e. horizontal whitespace is used
  72. # as [^\S\n\r] as \h isn't supported in python.
  73. # The preamble must end with a boundary where the boundary is
  74. # prefixed by a line break, RFC2046. Except that many
  75. # implementations including Werkzeug's tests omit the line
  76. # break prefix. In addition the first boundary could be the
  77. # epilogue boundary (for empty form-data) hence the matching
  78. # group to understand if it is an epilogue boundary.
  79. self.preamble_re = re.compile(
  80. rb"%s?--%s(--[^\S\n\r]*%s?|[^\S\n\r]*%s)"
  81. % (LINE_BREAK, re.escape(boundary), LINE_BREAK, LINE_BREAK),
  82. re.MULTILINE,
  83. )
  84. # A boundary must include a line break prefix and suffix, and
  85. # may include trailing whitespace. In addition the boundary
  86. # could be the epilogue boundary hence the matching group to
  87. # understand if it is an epilogue boundary.
  88. self.boundary_re = re.compile(
  89. rb"%s--%s(--[^\S\n\r]*%s?|[^\S\n\r]*%s)"
  90. % (LINE_BREAK, re.escape(boundary), LINE_BREAK, LINE_BREAK),
  91. re.MULTILINE,
  92. )
  93. self._search_position = 0
  94. self._parts_decoded = 0
  95. def last_newline(self, data: bytes) -> int:
  96. try:
  97. last_nl = data.rindex(b"\n")
  98. except ValueError:
  99. last_nl = len(data)
  100. try:
  101. last_cr = data.rindex(b"\r")
  102. except ValueError:
  103. last_cr = len(data)
  104. return min(last_nl, last_cr)
  105. def receive_data(self, data: bytes | None) -> None:
  106. if data is None:
  107. self.complete = True
  108. elif (
  109. self.max_form_memory_size is not None
  110. and len(self.buffer) + len(data) > self.max_form_memory_size
  111. ):
  112. # Ensure that data within single event does not exceed limit.
  113. # Also checked across accumulated events in MultiPartParser.
  114. raise RequestEntityTooLarge()
  115. else:
  116. self.buffer.extend(data)
  117. def next_event(self) -> Event:
  118. event: Event = NEED_DATA
  119. if self.state == State.PREAMBLE:
  120. match = self.preamble_re.search(self.buffer, self._search_position)
  121. if match is not None:
  122. if match.group(1).startswith(b"--"):
  123. self.state = State.EPILOGUE
  124. else:
  125. self.state = State.PART
  126. data = bytes(self.buffer[: match.start()])
  127. del self.buffer[: match.end()]
  128. event = Preamble(data=data)
  129. self._search_position = 0
  130. else:
  131. # Update the search start position to be equal to the
  132. # current buffer length (already searched) minus a
  133. # safe buffer for part of the search target.
  134. self._search_position = max(
  135. 0, len(self.buffer) - len(self.boundary) - SEARCH_EXTRA_LENGTH
  136. )
  137. elif self.state == State.PART:
  138. match = BLANK_LINE_RE.search(self.buffer, self._search_position)
  139. if match is not None:
  140. headers = self._parse_headers(self.buffer[: match.start()])
  141. # The final header ends with a single CRLF, however a
  142. # blank line indicates the start of the
  143. # body. Therefore the end is after the first CRLF.
  144. headers_end = (match.start() + match.end()) // 2
  145. del self.buffer[:headers_end]
  146. if "content-disposition" not in headers:
  147. raise ValueError("Missing Content-Disposition header")
  148. disposition, extra = parse_options_header(
  149. headers["content-disposition"]
  150. )
  151. name = t.cast(str, extra.get("name"))
  152. filename = extra.get("filename")
  153. if filename is not None:
  154. event = File(
  155. filename=filename,
  156. headers=headers,
  157. name=name,
  158. )
  159. else:
  160. event = Field(
  161. headers=headers,
  162. name=name,
  163. )
  164. self.state = State.DATA_START
  165. self._search_position = 0
  166. self._parts_decoded += 1
  167. if self.max_parts is not None and self._parts_decoded > self.max_parts:
  168. raise RequestEntityTooLarge()
  169. else:
  170. # Update the search start position to be equal to the
  171. # current buffer length (already searched) minus a
  172. # safe buffer for part of the search target.
  173. self._search_position = max(0, len(self.buffer) - SEARCH_EXTRA_LENGTH)
  174. elif self.state == State.DATA_START:
  175. data, del_index, more_data = self._parse_data(self.buffer, start=True)
  176. del self.buffer[:del_index]
  177. event = Data(data=data, more_data=more_data)
  178. if more_data:
  179. self.state = State.DATA
  180. elif self.state == State.DATA:
  181. data, del_index, more_data = self._parse_data(self.buffer, start=False)
  182. del self.buffer[:del_index]
  183. if data or not more_data:
  184. event = Data(data=data, more_data=more_data)
  185. elif self.state == State.EPILOGUE and self.complete:
  186. event = Epilogue(data=bytes(self.buffer))
  187. del self.buffer[:]
  188. self.state = State.COMPLETE
  189. if self.complete and isinstance(event, NeedData):
  190. raise ValueError(f"Invalid form-data cannot parse beyond {self.state}")
  191. return event
  192. def _parse_headers(self, data: bytes) -> Headers:
  193. headers: list[tuple[str, str]] = []
  194. # Merge the continued headers into one line
  195. data = HEADER_CONTINUATION_RE.sub(b" ", data)
  196. # Now there is one header per line
  197. for line in data.splitlines():
  198. line = line.strip()
  199. if line != b"":
  200. name, _, value = line.decode().partition(":")
  201. headers.append((name.strip(), value.strip()))
  202. return Headers(headers)
  203. def _parse_data(self, data: bytes, *, start: bool) -> tuple[bytes, int, bool]:
  204. # Body parts must start with CRLF (or CR or LF)
  205. if start:
  206. match = LINE_BREAK_RE.match(data)
  207. data_start = t.cast(t.Match[bytes], match).end()
  208. else:
  209. data_start = 0
  210. boundary = b"--" + self.boundary
  211. if self.buffer.find(boundary) == -1:
  212. # No complete boundary in the buffer, but there may be
  213. # a partial boundary at the end. As the boundary
  214. # starts with either a nl or cr find the earliest and
  215. # return up to that as data.
  216. data_end = del_index = self.last_newline(data[data_start:]) + data_start
  217. # If amount of data after last newline is far from
  218. # possible length of partial boundary, we should
  219. # assume that there is no partial boundary in the buffer
  220. # and return all pending data.
  221. if (len(data) - data_end) > len(b"\n" + boundary):
  222. data_end = del_index = len(data)
  223. more_data = True
  224. else:
  225. match = self.boundary_re.search(data)
  226. if match is not None:
  227. if match.group(1).startswith(b"--"):
  228. self.state = State.EPILOGUE
  229. else:
  230. self.state = State.PART
  231. data_end = match.start()
  232. del_index = match.end()
  233. else:
  234. data_end = del_index = self.last_newline(data[data_start:]) + data_start
  235. more_data = match is None
  236. return bytes(data[data_start:data_end]), del_index, more_data
  237. class MultipartEncoder:
  238. def __init__(self, boundary: bytes) -> None:
  239. self.boundary = boundary
  240. self.state = State.PREAMBLE
  241. def send_event(self, event: Event) -> bytes:
  242. if isinstance(event, Preamble) and self.state == State.PREAMBLE:
  243. self.state = State.PART
  244. return event.data
  245. elif isinstance(event, (Field, File)) and self.state in {
  246. State.PREAMBLE,
  247. State.PART,
  248. State.DATA,
  249. }:
  250. data = b"\r\n--" + self.boundary + b"\r\n"
  251. data += b'Content-Disposition: form-data; name="%s"' % event.name.encode()
  252. if isinstance(event, File):
  253. data += b'; filename="%s"' % event.filename.encode()
  254. data += b"\r\n"
  255. for name, value in t.cast(Field, event).headers:
  256. if name.lower() != "content-disposition":
  257. data += f"{name}: {value}\r\n".encode()
  258. self.state = State.DATA_START
  259. return data
  260. elif isinstance(event, Data) and self.state == State.DATA_START:
  261. self.state = State.DATA
  262. if len(event.data) > 0:
  263. return b"\r\n" + event.data
  264. else:
  265. return event.data
  266. elif isinstance(event, Data) and self.state == State.DATA:
  267. return event.data
  268. elif isinstance(event, Epilogue):
  269. self.state = State.COMPLETE
  270. return b"\r\n--" + self.boundary + b"--\r\n" + event.data
  271. else:
  272. raise ValueError(f"Cannot generate {event} in state: {self.state}")