diff --git a/doc/design-thoughts/buffer-redesign.txt b/doc/design-thoughts/buffer-redesign.txt new file mode 100644 index 000000000..c7d4345e7 --- /dev/null +++ b/doc/design-thoughts/buffer-redesign.txt @@ -0,0 +1,129 @@ +2012/02/27 - redesigning buffers for better simplicity - w@1wt.eu + +1) Analysis +----------- + +Buffer handling becomes complex because buffers are circular but many of their +users don't support wrapping operations (eg: HTTP parsing). Due to this fact, +some buffer operations automatically realign buffers as soon as possible when +the buffer is empty, which makes it very hard to track buffer pointers outside +of the buffer struct itself. The buffer contains a pointer to last processed +data (buf->lr) which is automatically realigned with such operations. But in +the end, its semantics are often unclear and whether it's safe or not to use it +isn't always obvious, as it has acquired multiple roles over time. + +A "struct buffer" is declared this way : + + struct buffer { + unsigned int flags; /* BF_* */ + int rex; /* expiration date for a read, in ticks */ + int wex; /* expiration date for a write or connect, in ticks */ + int rto; /* read timeout, in ticks */ + int wto; /* write timeout, in ticks */ + unsigned int l; /* data length */ + char *r, *w, *lr; /* read ptr, write ptr, last read */ + unsigned int size; /* buffer size in bytes */ + unsigned int send_max; /* number of bytes the sender can consume from this buffer, <= l */ + unsigned int to_forward; /* number of bytes to forward after send_max without a wake-up */ + unsigned int analysers; /* bit field indicating what to do on the buffer */ + int analyse_exp; /* expiration date for current analysers (if set) */ + void (*hijacker)(struct session *, struct buffer *); /* alternative content producer */ + unsigned char xfer_large; /* number of consecutive large xfers */ + unsigned char xfer_small; /* number of consecutive small xfers */ + unsigned long long total; /* total data read */ + 
struct stream_interface *prod; /* producer attached to this buffer */ + struct stream_interface *cons; /* consumer attached to this buffer */ + struct pipe *pipe; /* non-NULL only when data present */ + char data[0]; /* bytes */ + }; + +In order to address this, a struct http_msg was created with other pointers to +the buffer. The issue is that some of these pointers are absolute and other +ones are relative, sometimes one to another, sometimes to the beginning of the +buffer, which doesn't help at all for the case where buffers get realigned. + +A "struct http_msg" is defined this way : + + struct http_msg { + unsigned int msg_state; + unsigned int flags; + unsigned int col, sov; /* current header: colon, start of value */ + unsigned int eoh; /* End Of Headers, relative to buffer */ + char *sol; /* start of line, also start of message when fully parsed */ + char *eol; /* end of line */ + unsigned int som; /* Start Of Message, relative to buffer */ + int err_pos; /* err handling: -2=block, -1=pass, 0+=detected */ + union { /* useful start line pointers, relative to ->sol */ + struct { + int l; /* request line length (not including CR) */ + int m_l; /* METHOD length (method starts at ->som) */ + int u, u_l; /* URI, length */ + int v, v_l; /* VERSION, length */ + } rq; /* request line : field, length */ + struct { + int l; /* status line length (not including CR) */ + int v_l; /* VERSION length (version starts at ->som) */ + int c, c_l; /* CODE, length */ + int r, r_l; /* REASON, length */ + } st; /* status line : field, length */ + } sl; /* start line */ + unsigned long long chunk_len; + unsigned long long body_len; + char **cap; + }; + + +The first immediate observation is that nothing in a buffer should be relative +to the beginning of the storage area, everything should be relative to the +buffer's origin as a floating location. Right now the buffer's origin is equal +to (buf->w + buf->send_max). 
It is the place where the first byte of data not +yet scheduled for being forwarded is found. + + - buf->w is an absolute pointer, just like buf->data. + - buf->send_max is a relative value which oscillates between 0 when nothing + has to be forwarded, and buf->l when the whole buffer must be forwarded. + + +2) Proposal +----------- + +By having such an origin, we could have everything in http_msg relative to this +origin. This would resist buffer realigns much better than right now. + +At the moment we have msg->som which is relative to buf->data and which points +to the beginning of the message. The beginning of the message should *always* +be the buffer's origin. If data are to be skipped in the message, just wait for +send_max to become zero and move the origin forwards ; this would definitely get +rid of msg->som. This is already what is done in the HTTP parser except that it +has to move both buf->lr and msg->som. + +Following the same principle, we should then have a relative pointer in +http_msg to replace buf->lr. It would be relative to the buffer's origin and +would simply recall what location was last visited. + +Doing all this could result in more complex operations where more time is spent +adding buf->w to buf->send_max and then to msg->anything. It would probably make +more sense to define the buffer's origin as an absolute pointer and to have +both the buf->h (head) and buf->t (tail) pointers be positive and negative +positions relative to this origin. Operating on the buffer would then look like +this : + + - no buf->l anymore. buf->l is replaced by (head + tail) + - no buf->lr anymore. Use origin + msg->last for instance + - recv() : head += recv(origin + head); + - send() : tail -= send(origin - tail, tail); + thus, buf->o effectively replaces buf->send_max. 
+ - forward(N) : tail += N; origin += N; + - realign() : origin = data + - detect risk of wrapping of input : origin + head > data + size + +In general it looks like fewer pointers are manipulated for common operations +and that maybe an additional wrapping test (hand-made modulo) will have to be +added to send() and recv() operations. + + +3) Caveats +---------- + +The first caveat is that the elements to modify appear in a very large number +of places. diff --git a/doc/internals/buffer-operations.txt b/doc/internals/buffer-operations.txt new file mode 100644 index 000000000..1692f2115 --- /dev/null +++ b/doc/internals/buffer-operations.txt @@ -0,0 +1,128 @@ +2012/02/27 - Operations on haproxy buffers - w@1wt.eu + + +1) Definitions +-------------- + +A buffer is a unidirectional storage between two stream interfaces which are +most often composed of a socket file descriptor. This storage is fixed sized +and circular, which means that once data reach the end of the buffer, it loops +back at the beginning of the buffer : + + + Representation of a non-wrapping buffer + --------------------------------------- + + + beginning end + | -------- length --------> | + V V + +-------------------------------------------+ + | <--------------- size ----------------> | + +-------------------------------------------+ + + + Representation of a wrapping buffer + ----------------------------------- + + end beginning + +------> | | -------------+ + | V V | + | +-------------------------------------------+ | + | | <--------------- size ----------------> | | + | +-------------------------------------------+ | + | | + +--------------------- length -----------------------+ + + +Buffers are read by two entities : + - stream interfaces + - analysers + +Buffers are filled by two entities : + - stream interfaces + - hijackers + +A stream interface writes at the input of a buffer and reads at its output. An +analyser has to parse incoming buffer contents, so it reads the input. 
It does +not really write the output though it may change the buffer's contents at the +input, possibly causing data moves. A hijacker is able to write at the output +of a buffer. Hijackers are not used anymore at the moment though error outputs +still work the same way. + +Buffers are referenced in the session. Each session has two buffers which +interconnect the two stream interfaces. One buffer is called the request +buffer, it sees traffic flowing from the client to the server. The other buffer +is the response buffer, it sees traffic flowing from the server to the client. + +By convention, sessions are represented as 2 buffers one on top of the other, +and with 2 stream interfaces connected to the two buffers. The client connects +to the left stream interface (which then acts as a server), and the right +stream interface (which acts as a client) connects to the server. The data +circulate clockwise, so the upper buffer is the request buffer and the lower +buffer is the response buffer : + + ,------------------------. + ,-----> | request buffer | ------. + from ,--./ `------------------------' \,--. to + client ( ) ( ) server + `--' ,------------------------. /`--' + ^------- | response buffer | <-----' + `------------------------' + +2) Operations +------------- + +Socket-based stream interfaces write to buffers directly from the I/O layer +without relying on any specific function. + +Function-based stream interfaces do use a number of non-uniform functions to +read from the buffer's output and to write to the buffer's input. 
More suited +names could be : + + int buffer_output_peek_at(buf, ofs, ptr, size); + int buffer_output_peek(buf, ptr, size); + int buffer_output_read(buf, ptr, size); + int buffer_output_skip(buf, size); + int buffer_input_write(buf, ptr, size); + +Right now some stream interfaces use the following functions which also happen +to automatically schedule the response for automatic forward : + + buffer_put_block() [peers] + buffer_put_chunk() -> buffer_put_block() + buffer_feed_chunk() -> buffer_put_chunk() -> buffer_put_block() [dumpstats] + buffer_feed() -> buffer_put_string() -> buffer_put_block() [dumpstats] + + +The following stream-interface oriented functions are not used : + + buffer_get_char() + buffer_write_chunk() + + +Analysers read data from the buffers' input, and may sometimes write data +there too (or trim data). More suited names could be : + + int buffer_input_peek_at(buf, ofs, ptr, size); + int buffer_input_truncate_at(buf, ofs); + int buffer_input_peek(buf, ptr, size); + int buffer_input_read(buf, ptr, size); + int buffer_input_skip(buf, size); + int buffer_input_cut(buf, size); + int buffer_input_truncate(buf); + + +Functions that are available and need to be renamed : + - buffer_skip : buffer_output_skip + - buffer_ignore : buffer_input_skip ? => not exactly, more like + buffer_output_skip() without affecting sendmax ! + - buffer_cut_tail : deletes all pending data after sendmax. + -> buffer_input_truncate(). Used by si_retnclose() only. + - buffer_contig_data : buffer_output_contig_data + - buffer_pending : buffer_input_pending_data + - buffer_contig_space : buffer_input_contig_space + + +It looks like buf->lr could be removed and be stored in the HTTP message struct +since it's only used at the HTTP level. 
diff --git a/doc/internals/buffer-ops.fig b/doc/internals/buffer-ops.fig new file mode 100644 index 000000000..5cca1b606 --- /dev/null +++ b/doc/internals/buffer-ops.fig @@ -0,0 +1,152 @@ +#FIG 3.2 +Portrait +Center +Metric +A4 +100.00 +Single +-2 +1200 2 +5 1 0 1 0 7 50 -1 -1 0.000 0 1 1 1 3133.105 2868.088 2385 3465 3150 3825 3915 3420 + 0 0 1.00 60.00 120.00 + 0 0 1.00 60.00 120.00 +5 1 0 1 0 7 50 -1 -1 0.000 0 0 1 1 3134.312 2832.717 2340 3420 3150 1845 3960 3375 + 0 0 1.00 60.00 120.00 + 0 0 1.00 60.00 120.00 +5 1 1 1 0 7 50 -1 -1 3.000 0 0 0 0 3150.000 2848.393 2115 3510 3150 1620 4185 3510 +5 1 0 1 0 7 50 -1 -1 0.000 0 1 1 1 3133.105 6423.088 2385 7020 3150 7380 3915 6975 + 0 0 1.00 60.00 120.00 + 0 0 1.00 60.00 120.00 +5 1 0 1 0 7 50 -1 -1 0.000 0 0 1 1 3134.312 6387.717 2340 6975 3150 5400 3960 6930 + 0 0 1.00 60.00 120.00 + 0 0 1.00 60.00 120.00 +5 1 1 1 0 7 50 -1 -1 3.000 0 0 0 0 3150.000 6403.393 2115 7065 3150 5175 4185 7065 +1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 3150 2835 1126 1126 3150 2835 3195 3960 +1 3 0 1 0 7 51 -1 -1 0.000 1 0.0000 3150 2835 1350 1350 3150 2835 4500 2835 +1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 3150 6390 1126 1126 3150 6390 3195 7515 +1 3 0 1 0 7 51 -1 -1 0.000 1 0.0000 3150 6390 1350 1350 3150 6390 4500 6390 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 3150 3960 3150 4185 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 4050 3510 4230 3690 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 2250 3510 2070 3690 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 4410 3285 4455 3150 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 4500 2655 4455 2475 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 4185 1980 4050 1845 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 3645 1575 3510 1530 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 2295 1800 2160 1890 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 1980 2160 1935 2250 +2 1 0 1 0 7 50 -1 -1 0.000 
0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 1800 2655 1800 2790 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 1800 3105 1845 3240 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 2877 1519 2697 1564 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 3150 7515 3150 7740 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 4050 7065 4230 7245 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 2250 7065 2070 7245 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 4410 6840 4455 6705 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 4500 6210 4455 6030 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 4185 5535 4050 5400 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 3645 5130 3510 5085 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 2295 5355 2160 5445 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 1980 5715 1935 5805 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 1800 6210 1800 6345 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 1800 6660 1845 6795 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 2877 5074 2697 5119 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 4 + 1 1 1.00 60.00 120.00 + 4950 3510 4635 3690 4545 3375 4185 3600 + 0.000 1.000 1.000 0.000 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 4 + 1 1 1.00 60.00 120.00 + 2115 3600 1800 3330 1305 3285 1260 3780 + 0.000 1.000 1.000 0.000 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 3 + 1 1 1.00 60.00 120.00 + 4635 2205 4545 1890 4185 2115 + 0.000 1.000 0.000 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 4 + 1 1 1.00 60.00 120.00 + 4950 7065 4635 7245 4545 6930 4185 7155 + 0.000 1.000 1.000 0.000 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 4 + 1 1 1.00 60.00 120.00 + 2115 7155 1800 6885 1305 6840 1260 7335 + 0.000 1.000 1.000 0.000 +3 0 0 1 0 7 50 -1 -1 0.000 0 1 0 3 + 1 1 1.00 60.00 120.00 + 4635 5760 4545 5445 4185 5670 + 0.000 1.000 0.000 +4 1 0 50 -1 12 8 0.0000 4 75 75 3150 3690 l\001 +4 1 0 50 
-1 12 8 0.0000 4 75 450 3150 2025 size-l\001 +4 1 0 50 -1 12 8 5.4978 4 45 75 1935 3780 w\001 +4 1 0 50 -1 12 8 0.7854 4 45 75 4365 3825 r\001 +4 1 0 50 -1 12 8 0.0000 4 90 300 3150 4365 (lr)\001 +4 1 0 50 -1 14 10 5.7596 4 90 270 2520 3960 OUT\001 +4 1 0 50 -1 12 8 5.7596 4 75 525 2430 4140 sendmax\001 +4 1 0 50 -1 14 10 0.5236 4 90 180 3690 4005 IN\001 +4 1 0 50 -1 12 8 0.5236 4 75 675 3870 4185 l-sendmax\001 +4 0 0 50 -1 12 8 0.0000 4 90 750 4545 2340 free space\001 +4 0 0 50 -1 12 8 0.0000 4 90 975 4950 3555 [eg: recv()]\001 +4 1 0 50 -1 12 8 0.0000 4 90 900 1260 4095 [eg: send()]\001 +4 1 0 50 -1 16 12 0.0000 4 165 2370 3150 855 Principle of the circular buffer\001 +4 1 0 50 -1 12 8 0.0000 4 90 600 1260 3960 buffer_*\001 +4 0 0 50 -1 12 8 0.0000 4 90 600 4950 3420 buffer_*\001 +4 1 0 50 -1 16 12 0.0000 4 165 1605 3150 1125 Current (since v1.3)\001 +4 0 0 50 -1 12 8 0.0000 4 90 1050 4950 6975 buffer_input_*\001 +4 1 0 50 -1 14 10 5.7596 4 90 270 2520 7515 OUT\001 +4 1 0 50 -1 14 10 0.5236 4 90 180 3690 7560 IN\001 +4 1 0 50 -1 12 8 0.0000 4 90 1125 1260 7515 buffer_output_*\001 +4 0 0 50 -1 12 8 0.0000 4 90 750 4545 5895 free space\001 +4 0 0 50 -1 12 8 0.0000 4 90 975 4950 7110 [eg: recv()]\001 +4 1 0 50 -1 12 8 0.0000 4 90 900 1260 7650 [eg: send()]\001 +4 0 0 50 -1 0 10 0.0000 4 135 1860 6075 1755 Some http_msg fields point to\001 +4 0 0 50 -1 0 10 0.0000 4 120 2175 6075 1950 absolute locations within the buffer,\001 +4 0 0 50 -1 0 10 0.0000 4 135 2040 6075 2145 making realignments quite tricky.\001 +4 0 0 50 -1 0 10 0.0000 4 135 1890 6075 5400 http_msg owns a pointer to the\001 +4 0 0 50 -1 0 10 0.0000 4 135 2055 6075 5595 struct_buffer and only uses offsets\001 +4 0 0 50 -1 0 10 0.0000 4 135 1095 6075 5790 relative to buf->p.\001 +4 1 0 50 -1 12 8 0.0000 4 75 600 3150 5760 size-i-o\001 +4 1 0 50 -1 12 8 0.0000 4 75 225 3150 7200 o+i\001 +4 1 0 50 -1 12 8 5.7596 4 45 75 2430 7695 o\001 +4 1 0 50 -1 12 8 0.5236 4 75 75 3870 7740 i\001 +4 1 0 50 -1 16 12 
0.0000 4 180 1965 3150 4905 New design (1.5-dev9+)\001 +4 1 0 50 -1 12 8 0.0000 4 60 75 3150 7920 p\001