1/* $NetBSD: tcp_var.h,v 1.190 2018/12/27 16:59:17 maxv Exp $ */
2
3/*
4 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the project nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32/*
33 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
34 *
35 * NRL grants permission for redistribution and use in source and binary
36 * forms, with or without modification, of the software and documentation
37 * created at NRL provided that the following conditions are met:
38 *
39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution.
44 * 3. All advertising materials mentioning features or use of this software
45 * must display the following acknowledgements:
46 * This product includes software developed by the University of
47 * California, Berkeley and its contributors.
48 * This product includes software developed at the Information
49 * Technology Division, US Naval Research Laboratory.
50 * 4. Neither the name of the NRL nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
55 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
56 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
57 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
58 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
59 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
60 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
61 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
62 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
63 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
64 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
65 *
66 * The views and conclusions contained in the software and documentation
67 * are those of the authors and should not be interpreted as representing
68 * official policies, either expressed or implied, of the US Naval
69 * Research Laboratory (NRL).
70 */
71
72/*-
73 * Copyright (c) 1997, 1998, 1999, 2001, 2005 The NetBSD Foundation, Inc.
74 * All rights reserved.
75 *
76 * This code is derived from software contributed to The NetBSD Foundation
77 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
78 * NASA Ames Research Center.
79 * This code is derived from software contributed to The NetBSD Foundation
80 * by Charles M. Hannum.
81 *
82 * Redistribution and use in source and binary forms, with or without
83 * modification, are permitted provided that the following conditions
84 * are met:
85 * 1. Redistributions of source code must retain the above copyright
86 * notice, this list of conditions and the following disclaimer.
87 * 2. Redistributions in binary form must reproduce the above copyright
88 * notice, this list of conditions and the following disclaimer in the
89 * documentation and/or other materials provided with the distribution.
90 *
91 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
92 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
93 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
94 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
95 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
96 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
97 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
98 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
99 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
100 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
101 * POSSIBILITY OF SUCH DAMAGE.
102 */
103
104/*
105 * Copyright (c) 1982, 1986, 1993, 1994, 1995
106 * The Regents of the University of California. All rights reserved.
107 *
108 * Redistribution and use in source and binary forms, with or without
109 * modification, are permitted provided that the following conditions
110 * are met:
111 * 1. Redistributions of source code must retain the above copyright
112 * notice, this list of conditions and the following disclaimer.
113 * 2. Redistributions in binary form must reproduce the above copyright
114 * notice, this list of conditions and the following disclaimer in the
115 * documentation and/or other materials provided with the distribution.
116 * 3. Neither the name of the University nor the names of its contributors
117 * may be used to endorse or promote products derived from this software
118 * without specific prior written permission.
119 *
120 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
121 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
122 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
123 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
124 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
125 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
126 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
127 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
128 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
129 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
130 * SUCH DAMAGE.
131 *
132 * @(#)tcp_var.h 8.4 (Berkeley) 5/24/95
133 */
134
135#ifndef _NETINET_TCP_VAR_H_
136#define _NETINET_TCP_VAR_H_
137
138#if defined(_KERNEL_OPT)
139#include "opt_inet.h"
140#include "opt_mbuftrace.h"
141
142#endif
143
144/*
145 * TCP kernel structures and variables.
146 */
147
148#include <sys/callout.h>
149
#ifdef TCP_SIGNATURE
/*
 * Defines which are needed by the xform_tcp module and tcp_[in|out]put
 * for SADB verification and lookup.  They are only visible when the
 * including code is built with TCP_SIGNATURE defined.
 */
#define	TCP_SIGLEN	16	/* length of computed digest in bytes */
#define	TCP_KEYLEN_MIN	1	/* minimum length of TCP-MD5 key */
#define	TCP_KEYLEN_MAX	80	/* maximum length of TCP-MD5 key */
/*
 * Only a single SA per host may be specified at this time.  An SPI is
 * needed in order for the KEY_LOOKUP_SA() lookup to work.
 */
#define	TCP_SIG_SPI	0x1000	/* SPI value used for that lookup */
#endif /* TCP_SIGNATURE */
164
/*
 * Tcp+ip header, after ip options removed.
 *
 * The structure is declared __packed and simply places a struct ipovly
 * directly in front of a struct tcphdr.  The ti_* macros that follow are
 * shorthand for members of those two embedded structures, so that code
 * can write ti->ti_seq rather than ti->ti_t.th_seq.
 */
struct tcpiphdr {
	struct ipovly	ti_i;		/* overlaid ip structure */
	struct tcphdr	ti_t;		/* tcp header */
} __packed;
/* Shorthand for members of the overlaid ip structure (ti_i). */
#define	ti_x1		ti_i.ih_x1
#define	ti_pr		ti_i.ih_pr
#define	ti_len		ti_i.ih_len
#define	ti_src		ti_i.ih_src
#define	ti_dst		ti_i.ih_dst
/* Shorthand for members of the tcp header (ti_t). */
#define	ti_sport	ti_t.th_sport
#define	ti_dport	ti_t.th_dport
#define	ti_seq		ti_t.th_seq
#define	ti_ack		ti_t.th_ack
#define	ti_x2		ti_t.th_x2
#define	ti_off		ti_t.th_off
#define	ti_flags	ti_t.th_flags
#define	ti_win		ti_t.th_win
#define	ti_sum		ti_t.th_sum
#define	ti_urp		ti_t.th_urp
187
/*
 * SACK option block.
 *
 * A pair of sequence numbers delimiting one block.  struct tcpcb embeds
 * one of these as rcv_dsack_block.
 * NOTE(review): whether `right' is inclusive or one-past-the-end is not
 * visible in this header -- confirm against the SACK code.
 */
struct sackblk {
	tcp_seq left;		/* Left edge of sack block. */
	tcp_seq right;		/* Right edge of sack block. */
};
195
/*
 * Tail queue of sackhole entries.  struct tcpcb keeps one such queue in
 * snd_holes ("TX SACK holes") together with an entry count in
 * snd_numholes.
 */
TAILQ_HEAD(sackhead, sackhole);
struct sackhole {
	tcp_seq start;		/* presumably first seq of the hole -- confirm */
	tcp_seq end;		/* presumably end seq of the hole -- confirm */
	tcp_seq rxmit;		/* presumably next seq to retransmit -- confirm */

	TAILQ_ENTRY(sackhole) sackhole_q;	/* linkage on a sackhead queue */
};
204
205/*
206 * Tcp control block, one per tcp; fields:
207 */
208struct tcpcb {
209 int t_family; /* address family on the wire */
210 struct ipqehead segq; /* sequencing queue */
211 int t_segqlen; /* length of the above */
212 callout_t t_timer[TCPT_NTIMERS];/* tcp timers */
213 short t_state; /* state of this connection */
214 short t_rxtshift; /* log(2) of rexmt exp. backoff */
215 uint32_t t_rxtcur; /* current retransmit value */
216 short t_dupacks; /* consecutive dup acks recd */
217 /*
218 * t_partialacks:
219 * <0 not in fast recovery.
220 * ==0 in fast recovery. has not received partial acks
221 * >0 in fast recovery. has received partial acks
222 */
223 short t_partialacks; /* partials acks during fast rexmit */
224 u_short t_peermss; /* peer's maximum segment size */
225 u_short t_ourmss; /* our's maximum segment size */
226 u_short t_segsz; /* current segment size in use */
227 char t_force; /* 1 if forcing out a byte */
228 u_int t_flags;
229#define TF_ACKNOW 0x0001 /* ack peer immediately */
230#define TF_DELACK 0x0002 /* ack, but try to delay it */
231#define TF_NODELAY 0x0004 /* don't delay packets to coalesce */
232#define TF_NOOPT 0x0008 /* don't use tcp options */
233#define TF_REQ_SCALE 0x0020 /* have/will request window scaling */
234#define TF_RCVD_SCALE 0x0040 /* other side has requested scaling */
235#define TF_REQ_TSTMP 0x0080 /* have/will request timestamps */
236#define TF_RCVD_TSTMP 0x0100 /* a timestamp was received in SYN */
237#define TF_SACK_PERMIT 0x0200 /* other side said I could SACK */
238#define TF_SYN_REXMT 0x0400 /* rexmit timer fired on SYN */
239#define TF_WILL_SACK 0x0800 /* try to use SACK */
240#define TF_REASSEMBLING 0x1000 /* we're busy reassembling */
241#define TF_DEAD 0x2000 /* dead and to-be-released */
242#define TF_PMTUD_PEND 0x4000 /* Path MTU Discovery pending */
243#define TF_ECN_PERMIT 0x10000 /* other side said is ECN-ready */
244#define TF_ECN_SND_CWR 0x20000 /* ECN CWR in queue */
245#define TF_ECN_SND_ECE 0x40000 /* ECN ECE in queue */
246#define TF_SIGNATURE 0x400000 /* require MD5 digests (RFC2385) */
247
248
249 struct mbuf *t_template; /* skeletal packet for transmit */
250 struct inpcb *t_inpcb; /* back pointer to internet pcb */
251 struct in6pcb *t_in6pcb; /* back pointer to internet pcb */
252 callout_t t_delack_ch; /* delayed ACK callout */
253/*
254 * The following fields are used as in the protocol specification.
255 * See RFC793, Dec. 1981, page 21.
256 */
257/* send sequence variables */
258 tcp_seq snd_una; /* send unacknowledged */
259 tcp_seq snd_nxt; /* send next */
260 tcp_seq snd_up; /* send urgent pointer */
261 tcp_seq snd_wl1; /* window update seg seq number */
262 tcp_seq snd_wl2; /* window update seg ack number */
263 tcp_seq iss; /* initial send sequence number */
264 u_long snd_wnd; /* send window */
265/*
266 * snd_recover
267 * it's basically same as the "recover" variable in RFC 2852 (NewReno).
268 * when entering fast retransmit, it's set to snd_max.
269 * newreno uses this to detect partial ack.
270 * snd_high
271 * it's basically same as the "send_high" variable in RFC 2852 (NewReno).
272 * on each RTO, it's set to snd_max.
273 * newreno uses this to avoid false fast retransmits.
274 */
275 tcp_seq snd_recover;
276 tcp_seq snd_high;
277/* receive sequence variables */
278 u_long rcv_wnd; /* receive window */
279 tcp_seq rcv_nxt; /* receive next */
280 tcp_seq rcv_up; /* receive urgent pointer */
281 tcp_seq irs; /* initial receive sequence number */
282/*
283 * Additional variables for this implementation.
284 */
285/* receive variables */
286 tcp_seq rcv_adv; /* advertised window */
287
288/*
289 * retransmit variables
290 *
291 * snd_max
292 * the highest sequence number we've ever sent.
293 * used to recognize retransmits.
294 */
295 tcp_seq snd_max;
296
297/* congestion control (for slow start, source quench, retransmit after loss) */
298 u_long snd_cwnd; /* congestion-controlled window */
299 u_long snd_ssthresh; /* snd_cwnd size threshhold for
300 * for slow start exponential to
301 * linear switch
302 */
303/* auto-sizing variables */
304 u_int rfbuf_cnt; /* recv buffer autoscaling byte count */
305 uint32_t rfbuf_ts; /* recv buffer autoscaling timestamp */
306
307/*
308 * transmit timing stuff. See below for scale of srtt and rttvar.
309 * "Variance" is actually smoothed difference.
310 */
311 uint32_t t_rcvtime; /* time last segment received */
312 uint32_t t_rtttime; /* time we started measuring rtt */
313 tcp_seq t_rtseq; /* sequence number being timed */
314 int32_t t_srtt; /* smoothed round-trip time */
315 int32_t t_rttvar; /* variance in round-trip time */
316 uint32_t t_rttmin; /* minimum rtt allowed */
317 u_long max_sndwnd; /* largest window peer has offered */
318
319/* out-of-band data */
320 char t_oobflags; /* have some */
321 char t_iobc; /* input character */
322#define TCPOOB_HAVEDATA 0x01
323#define TCPOOB_HADDATA 0x02
324 short t_softerror; /* possible error not yet reported */
325
326/* RFC 1323 variables */
327 u_char snd_scale; /* window scaling for send window */
328 u_char rcv_scale; /* window scaling for recv window */
329 u_char request_r_scale; /* pending window scaling */
330 u_char requested_s_scale;
331 u_int32_t ts_recent; /* timestamp echo data */
332 u_int32_t ts_recent_age; /* when last updated */
333 u_int32_t ts_timebase; /* our timebase */
334 tcp_seq last_ack_sent;
335
336/* RFC 3465 variables */
337 u_long t_bytes_acked; /* ABC "bytes_acked" parameter */
338
339/* SACK stuff */
340#define TCP_SACK_MAX 3
341#define TCPSACK_NONE 0
342#define TCPSACK_HAVED 1
343 u_char rcv_sack_flags; /* SACK flags. */
344 struct sackblk rcv_dsack_block; /* RX D-SACK block. */
345 struct ipqehead timeq; /* time sequenced queue. */
346 struct sackhead snd_holes; /* TX SACK holes. */
347 int snd_numholes; /* Number of TX SACK holes. */
348 tcp_seq rcv_lastsack; /* last seq number(+1) sack'd by rcv'r*/
349 tcp_seq sack_newdata; /* New data xmitted in this recovery
350 episode starts at this seq number*/
351 tcp_seq snd_fack; /* FACK TCP. Forward-most data held by
352 peer. */
353
354/* CUBIC variables */
355 ulong snd_cubic_wmax; /* W_max */
356 ulong snd_cubic_wmax_last; /* Used for fast convergence */
357 ulong snd_cubic_ctime; /* Last congestion time */
358
359/* pointer for syn cache entries*/
360 LIST_HEAD(, syn_cache) t_sc; /* list of entries by this tcb */
361
362/* prediction of next mbuf when using large window sizes */
363 struct mbuf *t_lastm; /* last mbuf that data was sent from */
364 int t_inoff; /* data offset in previous mbuf */
365 int t_lastoff; /* last data address in mbuf chain */
366 int t_lastlen; /* last length read from mbuf chain */
367
368/* Path-MTU discovery blackhole detection */
369 int t_mtudisc; /* perform mtudisc for this tcb */
370/* Path-MTU Discovery Information */
371 u_int t_pmtud_mss_acked; /* MSS acked, lower bound for MTU */
372 u_int t_pmtud_mtu_sent; /* MTU used, upper bound for MTU */
373 tcp_seq t_pmtud_th_seq; /* TCP SEQ from ICMP payload */
374 u_int t_pmtud_nextmtu; /* Advertised Next-Hop MTU from ICMP */
375 u_short t_pmtud_ip_len; /* IP length from ICMP payload */
376 u_short t_pmtud_ip_hl; /* IP header length from ICMP payload */
377
378 uint8_t t_ecn_retries; /* # of ECN setup retries */
379
380 const struct tcp_congctl *t_congctl; /* per TCB congctl algorithm */
381
382 /* Keepalive per socket */
383 u_int t_keepinit;
384 u_int t_keepidle;
385 u_int t_keepintvl;
386 u_int t_keepcnt;
387 u_int t_maxidle; /* t_keepcnt * t_keepintvl */
388
389 u_int t_msl; /* MSL to use for this connexion */
390
391 /* maintain a few stats per connection: */
392 uint32_t t_rcvoopack; /* out-of-order packets received */
393 uint32_t t_sndrexmitpack; /* retransmit packets sent */
394 uint32_t t_sndzerowin; /* zero-window updates sent */
395};
396
/*
 * Macros to aid ECN TCP.
 *
 * TCP_ECN_ALLOWED(tp) yields a non-zero value (the TF_ECN_PERMIT bit)
 * when that flag is set in tp->t_flags, and 0 otherwise.  The argument
 * is parenthesized so that any pointer-valued expression, e.g. `p + 1',
 * may be passed.
 */
#define	TCP_ECN_ALLOWED(tp)	((tp)->t_flags & TF_ECN_PERMIT)
401
/*
 * Macros to aid SACK/FACK TCP.
 *
 * TCP_SACK_ENABLED(tp) yields the TF_WILL_SACK bit of tp->t_flags
 * (non-zero when set).
 *
 * TCP_FACK_FASTRECOV(tp) is true when SACK is enabled and snd_fack is
 * beyond snd_una by more than tcprexmtthresh segments of t_segsz bytes,
 * compared with SEQ_GT().
 *
 * The argument is parenthesized at every use so that any pointer-valued
 * expression may be passed; note that it is evaluated more than once.
 */
#define	TCP_SACK_ENABLED(tp)	((tp)->t_flags & TF_WILL_SACK)
#define	TCP_FACK_FASTRECOV(tp)						\
	(TCP_SACK_ENABLED(tp) &&					\
	(SEQ_GT((tp)->snd_fack,						\
	    (tp)->snd_una + tcprexmtthresh * (tp)->t_segsz)))
409
410#ifdef _KERNEL
/*
 * TCP reassembly queue locks.
 *
 * Forward declarations of the two inline helpers defined just below.
 * Both are tagged __unused; presumably this keeps files that include
 * this header without calling them free of unused-function warnings --
 * confirm against the build flags.
 */
static __inline int tcp_reass_lock_try (struct tcpcb *)
	__unused;
static __inline void tcp_reass_unlock (struct tcpcb *)
	__unused;
418
419static __inline int
420tcp_reass_lock_try(struct tcpcb *tp)
421{
422 int s;
423
424 /*
425 * Use splvm() -- we're blocking things that would cause
426 * mbuf allocation.
427 */
428 s = splvm();
429 if (tp->t_flags & TF_REASSEMBLING) {
430 splx(s);
431 return (0);
432 }
433 tp->t_flags |= TF_REASSEMBLING;
434 splx(s);
435 return (1);
436}
437
438static __inline void
439tcp_reass_unlock(struct tcpcb *tp)
440{
441 int s;
442
443 s = splvm();
444 KASSERT((tp->t_flags & TF_REASSEMBLING) != 0);
445 tp->t_flags &= ~TF_REASSEMBLING;
446 splx(s);
447}
448
/*
 * Wrappers around the reassembly lock helpers.
 *
 * TCP_REASS_LOCK(tp)
 *	Takes the lock with tcp_reass_lock_try().  With DIAGNOSTIC, a
 *	failed attempt prints the call site and panics; without
 *	DIAGNOSTIC the result of the attempt is discarded.
 * TCP_REASS_LOCK_CHECK(tp)
 *	With DIAGNOSTIC, prints the call site and panics when
 *	TF_REASSEMBLING is not set in tp->t_flags; otherwise expands to
 *	nothing.
 * TCP_REASS_UNLOCK(tp)
 *	Calls tcp_reass_unlock().
 *
 * The DIAGNOSTIC variants evaluate `tp' more than once.
 */
#ifdef DIAGNOSTIC
#define	TCP_REASS_LOCK(tp)						\
do {									\
	if (tcp_reass_lock_try(tp) == 0) {				\
		printf("%s:%d: tcpcb %p reass already locked\n",	\
		    __FILE__, __LINE__, tp);				\
		panic("tcp_reass_lock");				\
	}								\
} while (/*CONSTCOND*/ 0)
#define	TCP_REASS_LOCK_CHECK(tp)					\
do {									\
	if (((tp)->t_flags & TF_REASSEMBLING) == 0) {			\
		printf("%s:%d: tcpcb %p reass lock not held\n",		\
		    __FILE__, __LINE__, tp);				\
		panic("tcp reass lock check");				\
	}								\
} while (/*CONSTCOND*/ 0)
#else
#define	TCP_REASS_LOCK(tp)	(void) tcp_reass_lock_try((tp))
#define	TCP_REASS_LOCK_CHECK(tp) /* nothing */
#endif

#define	TCP_REASS_UNLOCK(tp)	tcp_reass_unlock((tp))
472#endif /* _KERNEL */
473
/*
 * Queue for delayed ACK processing.
 *
 * TCP_RESTART_DELACK(tp)
 *	(Re)arms the per-connection callout t_delack_ch to call
 *	tcp_delack(tp) after tcp_delack_ticks ticks.
 * TCP_SET_DELACK(tp)
 *	If TF_DELACK is not yet set, sets it and arms the callout;
 *	does nothing when the flag is already set.
 * TCP_CLEAR_DELACK(tp)
 *	If TF_DELACK is set, clears it and stops the callout.
 *
 * All three evaluate `tp' more than once.
 */
#ifdef _KERNEL
extern int tcp_delack_ticks;
void	tcp_delack(void *);

#define TCP_RESTART_DELACK(tp)						\
	callout_reset(&(tp)->t_delack_ch, tcp_delack_ticks,		\
	    tcp_delack, tp)

#define	TCP_SET_DELACK(tp)						\
do {									\
	if (((tp)->t_flags & TF_DELACK) == 0) {				\
		(tp)->t_flags |= TF_DELACK;				\
		TCP_RESTART_DELACK(tp);					\
	}								\
} while (/*CONSTCOND*/0)

#define	TCP_CLEAR_DELACK(tp)						\
do {									\
	if ((tp)->t_flags & TF_DELACK) {				\
		(tp)->t_flags &= ~TF_DELACK;				\
		callout_stop(&(tp)->t_delack_ch);			\
	}								\
} while (/*CONSTCOND*/0)
#endif /* _KERNEL */
501
/*
 * Compute the current timestamp for a connection.
 *
 * The result is the global tcp_now counter relative to the connection's
 * own ts_timebase.  Both are declared as 32-bit unsigned values in this
 * header, so the subtraction wraps modulo 2^32.
 */
#define	TCP_TIMESTAMP(tp)	(tcp_now - (tp)->ts_timebase)
506
/*
 * Handy way of passing around TCP option info.
 *
 * NOTE(review): the per-field notes below are inferred from the member
 * names only; confirm against the option-parsing code.
 */
struct tcp_opt_info {
	int		ts_present;	/* presumably non-zero if a timestamp
					   option was seen */
	u_int32_t	ts_val;		/* presumably the option's TSval */
	u_int32_t	ts_ecr;		/* presumably the option's TSecr */
	u_int16_t	maxseg;		/* presumably the MSS option value */
};
516
/*
 * Option flag bits.
 * NOTE(review): the field that carries these bits is not visible in
 * this part of the header -- confirm at the use sites.
 */
#define	TOF_SIGNATURE	0x0040		/* signature option present */
#define	TOF_SIGLEN	0x0080		/* signature length valid (RFC2385) */
519
/*
 * Data for the TCP compressed state engine.
 *
 * syn_cache_sa overlays the generic, IPv4 and IPv6 socket address
 * layouts in one object.  The IPv6 member is compiled in
 * unconditionally (the INET6 guard is replaced by `#if 1'), so the
 * union has the same size whether or not INET6 is defined.
 */
union syn_cache_sa {
	struct sockaddr sa;
	struct sockaddr_in sin;
#if 1 /*def INET6*/
	struct sockaddr_in6 sin6;
#endif
};
530
/*
 * One entry of the compressed state engine (syn cache).
 *
 * An entry is linked on a hash bucket (sc_bucketq) and on the list kept
 * by the tcb of its listening socket (sc_tpq / sc_tp).  The SCF_* bits
 * defined inline are the values of sc_flags.
 */
struct syn_cache {
	TAILQ_ENTRY(syn_cache) sc_bucketq;	/* link on bucket list */
	callout_t sc_timer;			/* rexmt timer */
	struct route sc_route;
	long sc_win;				/* advertised window */
	int sc_bucketidx;			/* our bucket index */
	u_int32_t sc_hash;
	u_int32_t sc_timestamp;			/* timestamp from SYN */
	u_int32_t sc_timebase;			/* our local timebase */
	union syn_cache_sa sc_src;
	union syn_cache_sa sc_dst;
	tcp_seq sc_irs;
	tcp_seq sc_iss;
	u_int sc_rxtcur;			/* current rxt timeout */
	u_int sc_rxttot;			/* total time spent on queues */
	u_short sc_rxtshift;			/* for computing backoff */
	u_short sc_flags;

#define	SCF_UNREACH		0x0001		/* we've had an unreach error */
#define	SCF_TIMESTAMP		0x0002		/* peer will do timestamps */
#define	SCF_DEAD		0x0004		/* this entry to be released */
#define SCF_SACK_PERMIT		0x0008		/* peer will do SACK */
#define SCF_ECN_PERMIT		0x0010		/* peer will do ECN */
#define SCF_SIGNATURE	0x40			/* send MD5 digests */

	struct mbuf *sc_ipopts;			/* IP options */
	u_int16_t sc_peermaxseg;
	u_int16_t sc_ourmaxseg;
	/* Two 4-bit fields packed into a single byte. */
	u_int8_t sc_request_r_scale	: 4,
		 sc_requested_s_scale	: 4;

	struct tcpcb *sc_tp;			/* tcb for listening socket */
	LIST_ENTRY(syn_cache) sc_tpq;		/* list of entries by same tp */
};
565
/*
 * One hash bucket of the syn cache: the queue of entries (linked via
 * syn_cache.sc_bucketq) plus a count of how many are on it.
 */
struct syn_cache_head {
	TAILQ_HEAD(, syn_cache) sch_bucket;	/* bucket entries */
	u_short sch_length;			/* # entries in bucket */
};
570
/*
 * Conversions from a pcb or socket to its tcpcb.
 *
 * intotcpcb(ip)   casts the inp_ppcb member of an inpcb.
 * in6totcpcb(ip)  casts the in6p_ppcb member of an in6pcb (INET6 only).
 * sototcpcb(so)   goes via sotoinpcb(); when INET6 is defined it
 *                 instead selects the inpcb or in6pcb path according to
 *                 whether the socket's protocol domain family is
 *                 AF_INET, and in that form evaluates `so' twice.
 */
#define	intotcpcb(ip)	((struct tcpcb *)(ip)->inp_ppcb)
#ifdef INET6
#define	in6totcpcb(ip)	((struct tcpcb *)(ip)->in6p_ppcb)
#endif
#ifndef INET6
#define	sototcpcb(so)	(intotcpcb(sotoinpcb(so)))
#else
#define	sototcpcb(so)	(((so)->so_proto->pr_domain->dom_family == AF_INET) \
				? intotcpcb(sotoinpcb(so)) \
				: in6totcpcb(sotoin6pcb(so)))
#endif
582
583/*
584 * See RFC2988 for a discussion of RTO calculation; comments assume
585 * familiarity with that document.
586 *
587 * The smoothed round-trip time and estimated variance are stored as
588 * fixed point numbers. Historically, srtt was scaled by
589 * TCP_RTT_SHIFT bits, and rttvar by TCP_RTTVAR_SHIFT bits. Because
590 * the values coincide with the alpha and beta parameters suggested
591 * for RTO calculation (1/8 for srtt, 1/4 for rttvar), the combination
592 * of computing 1/8 of the new value and transforming it to the
593 * fixed-point representation required zero instructions. However,
594 * the storage representations no longer coincide with the alpha/beta
595 * shifts; instead, more fractional bits are present.
596 *
597 * The storage representation of srtt is 1/32 slow ticks, or 1/64 s.
598 * (The assumption that a slow tick is 500 ms should not be present in
599 * the code.)
600 *
601 * The storage representation of rttvar is 1/16 slow ticks, or 1/32 s.
602 * There may be some confusion about this in the code.
603 *
604 * For historical reasons, these scales are also used in smoothing the
605 * average (smoothed = (1/scale)sample + ((scale-1)/scale)smoothed).
606 * This results in alpha of 0.125 and beta of 0.25, following RFC2988
607 * section 2.3
608 *
609 * XXX Change SHIFT values to LGWEIGHT and REP_SHIFT, and adjust
610 * the code to use the correct ones.
611 */
/* As shifts these give smoothing gains of 1/8 (srtt) and 1/4 (rttvar). */
#define	TCP_RTT_SHIFT		3	/* shift for srtt; 3 bits frac. */
#define	TCP_RTTVAR_SHIFT	2	/* multiplier for rttvar; 2 bits */
614
615/*
616 * Compute TCP retransmission timer, following RFC2988.
617 * This macro returns a value in slow timeout ticks.
618 *
619 * Section 2.2 requires that the RTO value be
620 * srtt + max(G, 4*RTTVAR)
621 * where G is the clock granularity.
622 *
623 * This comment has not necessarily been updated for the new storage
624 * representation:
625 *
626 * Because of the way we do the smoothing, srtt and rttvar
627 * will each average +1/2 tick of bias. When we compute
628 * the retransmit timer, we want 1/2 tick of rounding and
629 * 1 extra tick because of +-1/2 tick uncertainty in the
630 * firing of the timer. The bias will give us exactly the
631 * 1.5 tick we need. But, because the bias is
632 * statistical, we have to test that we don't drop below
633 * the minimum feasible timer (which is 2 ticks).
634 * This macro assumes that the value of 1<<TCP_RTTVAR_SHIFT
635 * is the same as the multiplier for rttvar.
636 *
637 * This macro appears to be wrong; it should be checking rttvar*4 in
638 * ticks and making sure we use 1 instead if rttvar*4 rounds to 0. It
639 * appears to be treating srtt as being in the old storage
640 * representation, resulting in a factor of 4 extra.
641 */
/*
 * As written: (t_srtt >> TCP_RTT_SHIFT) plus t_rttvar, with the sum
 * then shifted right by 2.  `tp' is evaluated twice, so do not pass an
 * expression with side effects.
 */
#define	TCP_REXMTVAL(tp) \
	((((tp)->t_srtt >> TCP_RTT_SHIFT) + (tp)->t_rttvar) >> 2)
644
/*
 * Compute the initial window for slow start.
 *
 * The result is iw segments of segsz bytes, capped at the larger of two
 * segments and tcp_init_win_max[iw].
 *
 * `iw' is used directly as an index into tcp_init_win_max[], which this
 * header declares with 11 elements; the macro performs no bounds check,
 * so callers must pass 0 <= iw <= 10.  Both arguments are evaluated more
 * than once.
 */
#define	TCP_INITIAL_WINDOW(iw, segsz) \
	uimin((iw) * (segsz), uimax(2 * (segsz), tcp_init_win_max[(iw)]))
650
/*
 * TCP statistics.
 * Each counter is an unsigned 64-bit value.
 *
 * Many of these should be kept per connection, but that's inconvenient
 * at the moment.
 *
 * The TCP_STAT_* values are consecutive indices 0 .. TCP_NSTATS-1;
 * TCP_NSTATS must be updated whenever a counter is added.
 */
#define	TCP_STAT_CONNATTEMPT	0	/* connections initiated */
#define	TCP_STAT_ACCEPTS	1	/* connections accepted */
#define	TCP_STAT_CONNECTS	2	/* connections established */
#define	TCP_STAT_DROPS		3	/* connections dropped */
#define	TCP_STAT_CONNDROPS	4	/* embryonic connections dropped */
#define	TCP_STAT_CLOSED		5	/* conn. closed (includes drops) */
#define	TCP_STAT_SEGSTIMED	6	/* segs where we tried to get rtt */
#define	TCP_STAT_RTTUPDATED	7	/* times we succeeded */
#define	TCP_STAT_DELACK		8	/* delayed ACKs sent */
#define	TCP_STAT_TIMEOUTDROP	9	/* conn. dropped in rxmt timeout */
#define	TCP_STAT_REXMTTIMEO	10	/* retransmit timeouts */
#define	TCP_STAT_PERSISTTIMEO	11	/* persist timeouts */
#define	TCP_STAT_KEEPTIMEO	12	/* keepalive timeouts */
#define	TCP_STAT_KEEPPROBE	13	/* keepalive probes sent */
#define	TCP_STAT_KEEPDROPS	14	/* connections dropped in keepalive */
#define	TCP_STAT_PERSISTDROPS	15	/* connections dropped in persist */
#define	TCP_STAT_CONNSDRAINED	16	/* connections drained due to memory
					   shortage */
#define	TCP_STAT_PMTUBLACKHOLE	17	/* PMTUD blackhole detected */
#define	TCP_STAT_SNDTOTAL	18	/* total packets sent */
#define	TCP_STAT_SNDPACK	19	/* data packets sent */
#define	TCP_STAT_SNDBYTE	20	/* data bytes sent */
#define	TCP_STAT_SNDREXMITPACK	21	/* data packets retransmitted */
#define	TCP_STAT_SNDREXMITBYTE	22	/* data bytes retransmitted */
#define	TCP_STAT_SNDACKS	23	/* ACK-only packets sent */
#define	TCP_STAT_SNDPROBE	24	/* window probes sent */
#define	TCP_STAT_SNDURG		25	/* packets sent with URG only */
#define	TCP_STAT_SNDWINUP	26	/* window update-only packets sent */
#define	TCP_STAT_SNDCTRL	27	/* control (SYN|FIN|RST) packets sent */
#define	TCP_STAT_RCVTOTAL	28	/* total packets received */
#define	TCP_STAT_RCVPACK	29	/* packets received in sequence */
#define	TCP_STAT_RCVBYTE	30	/* bytes received in sequence */
#define	TCP_STAT_RCVBADSUM	31	/* packets received with cksum errs */
#define	TCP_STAT_RCVBADOFF	32	/* packets received with bad offset */
#define	TCP_STAT_RCVMEMDROP	33	/* packets dropped for lack of memory */
#define	TCP_STAT_RCVSHORT	34	/* packets received too short */
#define	TCP_STAT_RCVDUPPACK	35	/* duplicate-only packets received */
#define	TCP_STAT_RCVDUPBYTE	36	/* duplicate-only bytes received */
#define	TCP_STAT_RCVPARTDUPPACK	37	/* packets with some duplicate data */
#define	TCP_STAT_RCVPARTDUPBYTE	38	/* dup. bytes in part-dup. packets */
#define	TCP_STAT_RCVOOPACK	39	/* out-of-order packets received */
#define	TCP_STAT_RCVOOBYTE	40	/* out-of-order bytes received */
#define	TCP_STAT_RCVPACKAFTERWIN 41	/* packets with data after window */
#define	TCP_STAT_RCVBYTEAFTERWIN 42	/* bytes received after window */
#define	TCP_STAT_RCVAFTERCLOSE	43	/* packets received after "close" */
#define	TCP_STAT_RCVWINPROBE	44	/* rcvd window probe packets */
#define	TCP_STAT_RCVDUPACK	45	/* rcvd duplicate ACKs */
#define	TCP_STAT_RCVACKTOOMUCH	46	/* rcvd ACKs for unsent data */
#define	TCP_STAT_RCVACKPACK	47	/* rcvd ACK packets */
#define	TCP_STAT_RCVACKBYTE	48	/* bytes ACKed by rcvd ACKs */
#define	TCP_STAT_RCVWINUPD	49	/* rcvd window update packets */
#define	TCP_STAT_PAWSDROP	50	/* segments dropped due to PAWS */
#define	TCP_STAT_PREDACK	51	/* times hdr predict OK for ACKs */
#define	TCP_STAT_PREDDAT	52	/* times hdr predict OK for data pkts */
#define	TCP_STAT_PCBHASHMISS	53	/* input packets missing PCB hash */
#define	TCP_STAT_NOPORT		54	/* no socket on port */
#define	TCP_STAT_BADSYN		55	/* received ACK for which we have
					   no SYN in compressed state */
#define	TCP_STAT_DELAYED_FREE	56	/* delayed pool_put() of tcpcb */
#define	TCP_STAT_SC_ADDED	57	/* # of sc entries added */
#define	TCP_STAT_SC_COMPLETED	58	/* # of sc connections completed */
#define	TCP_STAT_SC_TIMED_OUT	59	/* # of sc entries timed out */
#define	TCP_STAT_SC_OVERFLOWED	60	/* # of sc drops due to overflow */
#define	TCP_STAT_SC_RESET	61	/* # of sc drops due to RST */
#define	TCP_STAT_SC_UNREACH	62	/* # of sc drops due to ICMP unreach */
#define	TCP_STAT_SC_BUCKETOVERFLOW 63	/* # of sc drops due to bucket ovflow */
#define	TCP_STAT_SC_ABORTED	64	/* # of sc entries aborted (no mem) */
#define	TCP_STAT_SC_DUPESYN	65	/* # of duplicate SYNs received */
#define	TCP_STAT_SC_DROPPED	66	/* # of SYNs dropped (no route/mem) */
#define	TCP_STAT_SC_COLLISIONS	67	/* # of sc hash collisions */
#define	TCP_STAT_SC_RETRANSMITTED 68	/* # of sc retransmissions */
#define	TCP_STAT_SC_DELAYED_FREE 69	/* # of delayed pool_put()s */
#define	TCP_STAT_SELFQUENCH	70	/* # of ENOBUFS we get on output */
#define	TCP_STAT_BADSIG		71	/* # of drops due to bad signature */
#define	TCP_STAT_GOODSIG	72	/* # of packets with good signature */
#define	TCP_STAT_ECN_SHS	73	/* # of successful ECN handshakes */
#define	TCP_STAT_ECN_CE		74	/* # of packets with CE bit */
#define	TCP_STAT_ECN_ECT	75	/* # of packets with ECT(0) bit */

#define	TCP_NSTATS		76	/* number of TCP_STAT_* counters */
738
/*
 * Names for TCP sysctl objects.
 *
 * Numbers 7, 13 and 23 belong to obsoleted objects; their definitions
 * are kept under `#if 0' so that the numbers stay reserved and visible.
 */
#define	TCPCTL_RFC1323		1	/* RFC1323 timestamps/scaling */
#define	TCPCTL_SENDSPACE	2	/* default send buffer */
#define	TCPCTL_RECVSPACE	3	/* default recv buffer */
#define	TCPCTL_MSSDFLT		4	/* default seg size */
#define	TCPCTL_SYN_CACHE_LIMIT	5	/* max size of comp. state engine */
#define	TCPCTL_SYN_BUCKET_LIMIT	6	/* max size of hash bucket */
#if 0	/*obsoleted*/
#define	TCPCTL_SYN_CACHE_INTER	7	/* interval of comp. state timer */
#endif
#define	TCPCTL_INIT_WIN		8	/* initial window */
#define	TCPCTL_MSS_IFMTU	9	/* mss from interface, not in_maxmtu */
#define	TCPCTL_SACK		10	/* RFC2018 selective acknowledgement */
#define	TCPCTL_WSCALE		11	/* RFC1323 window scaling */
#define	TCPCTL_TSTAMP		12	/* RFC1323 timestamps */
#if 0	/*obsoleted*/
#define	TCPCTL_COMPAT_42	13	/* 4.2BSD TCP bug work-arounds */
#endif
#define	TCPCTL_CWM		14	/* Congestion Window Monitoring */
#define	TCPCTL_CWM_BURSTSIZE	15	/* burst size allowed by CWM */
#define	TCPCTL_ACK_ON_PUSH	16	/* ACK immediately on PUSH */
#define	TCPCTL_KEEPIDLE		17	/* keepalive idle time */
#define	TCPCTL_KEEPINTVL	18	/* keepalive probe interval */
#define	TCPCTL_KEEPCNT		19	/* keepalive count */
#define	TCPCTL_SLOWHZ		20	/* PR_SLOWHZ (read-only) */
#define	TCPCTL_NEWRENO		21	/* NewReno Congestion Control */
#define	TCPCTL_LOG_REFUSED	22	/* Log refused connections */
#if 0	/*obsoleted*/
#define	TCPCTL_RSTRATELIMIT	23	/* RST rate limit */
#endif
#define	TCPCTL_RSTPPSLIMIT	24	/* RST pps limit */
#define	TCPCTL_DELACK_TICKS	25	/* # ticks to delay ACK */
#define	TCPCTL_INIT_WIN_LOCAL	26	/* initial window for local nets */
#define	TCPCTL_IDENT		27	/* RFC 931 identd */
#define	TCPCTL_ACKDROPRATELIMIT	28	/* SYN/RST -> ACK rate limit */
#define	TCPCTL_LOOPBACKCKSUM	29	/* do TCP checksum on loopback */
#define	TCPCTL_STATS		30	/* TCP statistics */
#define	TCPCTL_DEBUG		31	/* TCP debug sockets */
#define	TCPCTL_DEBX		32	/* # of tcp debug sockets */
#define	TCPCTL_DROP		33	/* drop tcp connection */
#define	TCPCTL_MSL		34	/* Max Segment Life */
782
783#ifdef _KERNEL
784
/*
 * Kernel-wide TCP variables.  Everything in this group is an extern
 * declaration; no storage is allocated by these lines.
 */
785extern struct inpcbtable tcbtable; /* head of queue of active tcpcb's */
786extern const struct pr_usrreqs tcp_usrreqs; /* TCP's struct pr_usrreqs object */
787
788extern u_int32_t tcp_now; /* for RFC 1323 timestamps */
789extern int tcp_do_rfc1323; /* RFC1323 extensions enabled/disabled? */
790extern int tcp_do_sack; /* SACK enabled/disabled? */
791extern int tcp_do_win_scale; /* RFC1323 window scaling enabled/disabled? */
792extern int tcp_do_timestamps; /* RFC1323 timestamps enabled/disabled? */
793extern int tcp_mssdflt; /* default seg size */
794extern int tcp_minmss; /* minimal seg size */
795extern int tcp_msl; /* max segment life */
796extern int tcp_init_win; /* initial window */
797extern int tcp_init_win_local; /* initial window for local nets */
798extern int tcp_init_win_max[11];/* max sizes for values of tcp_init_win_* */
799extern int tcp_mss_ifmtu; /* take MSS from interface, not in_maxmtu */
800extern int tcp_cwm; /* enable Congestion Window Monitoring */
801extern int tcp_cwm_burstsize; /* burst size allowed by CWM */
802extern int tcp_ack_on_push; /* ACK immediately on PUSH */
803extern int tcp_syn_cache_limit; /* max entries for compressed state engine */
804extern int tcp_syn_bucket_limit;/* max entries per hash bucket */
805extern int tcp_log_refused; /* log refused connections */
806extern int tcp_do_ecn; /* TCP ECN enabled/disabled? */
807extern int tcp_ecn_maxretries; /* Max ECN setup retries */
808extern int tcp_do_rfc1948; /* ISS by cryptographic hash */
809extern int tcp_sack_tp_maxholes; /* Max holes per connection. */
810extern int tcp_sack_globalmaxholes; /* Max holes per system. */
811extern int tcp_sack_globalholes; /* Number of holes present. */
812extern int tcp_do_abc; /* RFC3465 ABC enabled/disabled? */
813extern int tcp_abc_aggressive; /* 1: L=2*SMSS 0: L=1*SMSS */
814
/* Per-class MSL settings used for TIME_WAIT truncation. */
815extern int tcp_msl_enable; /* enable TIME_WAIT truncation */
816extern int tcp_msl_loop; /* MSL for loopback */
817extern int tcp_msl_local; /* MSL for 'local' */
818extern int tcp_msl_remote; /* MSL otherwise */
819extern int tcp_msl_remote_threshold; /* RTT threshold */
820extern int tcp_rttlocal; /* Use RTT to decide who's 'local' */
/*
 * NOTE(review): "vtw" is not expanded anywhere in this part of the header;
 * presumably "vestigial TIME_WAIT" -- confirm against the implementation.
 * The tcp4_/tcp6_ prefixes suggest separate IPv4 and IPv6 switches.
 */
821extern int tcp4_vtw_enable;
822extern int tcp6_vtw_enable;
823extern int tcp_vtw_was_enabled;
824extern int tcp_vtw_entries;
825
/*
 * NOTE(review): the "ppslim" names look like packets-per-second limits,
 * matching the TCPCTL_RSTPPSLIMIT ("RST pps limit") and
 * TCPCTL_ACKDROPRATELIMIT ("SYN/RST -> ACK rate limit") sysctl names --
 * confirm which sysctl node backs each variable.
 */
826extern int tcp_rst_ppslim;
827extern int tcp_ackdrop_ppslim;
828
/*
 * SYN cache storage.  tcp_syn_cache is declared as an array of
 * struct syn_cache_head with no bound given here.
 * NOTE(review): presumably tcp_syn_cache_size is the number of buckets
 * and syn_cache_count the number of cached entries -- confirm.
 */
829extern int tcp_syn_cache_size;
830extern struct syn_cache_head tcp_syn_cache[];
831extern u_long syn_cache_count;
832
/*
 * struct mowner objects, declared only when MBUFTRACE is defined.
 * NOTE(review): presumably mbuf ownership tags for the rx, tx,
 * reassembly and socket paths, going by the names -- confirm.
 */
833#ifdef MBUFTRACE
834extern struct mowner tcp_rx_mowner;
835extern struct mowner tcp_tx_mowner;
836extern struct mowner tcp_reass_mowner;
837extern struct mowner tcp_sock_mowner;
838extern struct mowner tcp_sock_rx_mowner;
839extern struct mowner tcp_sock_tx_mowner;
840extern struct mowner tcp_mowner;
841#endif /* MBUFTRACE */
842
/*
 * NOTE(review): the names suggest an on/off switch (_do_), a growth
 * increment (_inc) and an upper bound (_max) for automatic sizing of
 * the receive and send socket buffers; units are not stated here --
 * confirm against the code that reads them.
 */
843extern int tcp_do_autorcvbuf;
844extern int tcp_autorcvbuf_inc;
845extern int tcp_autorcvbuf_max;
846extern int tcp_do_autosndbuf;
847extern int tcp_autosndbuf_inc;
848extern int tcp_autosndbuf_max;
849
/*
 * Forward declaration: the tcp_signature*() prototypes below only need
 * pointers to struct secasvar, so the full definition is not required.
 */
850struct secasvar;
851
/*
 * Core TCP prototypes.
 *
 * tcp_close(), tcp_disconnect1() and tcp_drop() take a struct tcpcb *
 * and also return one.
 * NOTE(review): presumably the result is NULL once the control block
 * has been released -- confirm in the implementation.
 *
 * The IPv6 variants are declared only when INET6 is defined, and the
 * signature helpers only when TCP_SIGNATURE is defined.
 */
852void tcp_canceltimers(struct tcpcb *);
853struct tcpcb *
854 tcp_close(struct tcpcb *);
855int tcp_isdead(struct tcpcb *);
856#ifdef INET6
857void *tcp6_ctlinput(int, const struct sockaddr *, void *);
858#endif /* INET6 */
859void *tcp_ctlinput(int, const struct sockaddr *, void *);
860int tcp_ctloutput(int, struct socket *, struct sockopt *);
861struct tcpcb *
862 tcp_disconnect1(struct tcpcb *);
863struct tcpcb *
864 tcp_drop(struct tcpcb *, int);
865#ifdef TCP_SIGNATURE
866int tcp_signature_apply(void *, void *, u_int);
867struct secasvar *tcp_signature_getsav(struct mbuf *);
868int tcp_signature(struct mbuf *, struct tcphdr *, int, struct secasvar *,
869 char *);
870#endif /* TCP_SIGNATURE */
871void tcp_drain(void);
872void tcp_drainstub(void);
873void tcp_established(struct tcpcb *);
874void tcp_init(void);
875void tcp_init_common(unsigned);
/*
 * Note the differing input signatures: tcp6_input() takes
 * (struct mbuf **, int *, int) and returns int, whereas tcp_input()
 * takes (struct mbuf *, int, int) and returns nothing.
 */
876#ifdef INET6
877int tcp6_input(struct mbuf **, int *, int);
878#endif /* INET6 */
879void tcp_input(struct mbuf *, int, int);
880u_int tcp_hdrsz(struct tcpcb *);
881u_long tcp_mss_to_advertise(const struct ifnet *, int);
882void tcp_mss_from_peer(struct tcpcb *, int);
883void tcp_tcpcb_template(void);
884struct tcpcb *
885 tcp_newtcpcb(int, void *);
886void tcp_notify(struct inpcb *, int);
887#ifdef INET6
888void tcp6_notify(struct in6pcb *, int);
889#endif /* INET6 */
890u_int tcp_optlen(struct tcpcb *);
891int tcp_output(struct tcpcb *);
892void tcp_pulloutofband(struct socket *,
893 struct tcphdr *, struct mbuf *, int);
894void tcp_quench(struct inpcb *);
895#ifdef INET6
896void tcp6_quench(struct in6pcb *);
897#endif /* INET6 */
898void tcp_mtudisc(struct inpcb *, int);
899#ifdef INET6
900void tcp6_mtudisc_callback(struct in6_addr *);
901#endif /* INET6 */
902
/*
 * Allocation interface for struct ipqent: one-time init, an allocator
 * returning a struct ipqent *, and the matching release routine.
 * NOTE(review): whether tcpipqent_alloc() can return NULL is not
 * visible here -- confirm before relying on it.
 */
903void tcpipqent_init(void);
904struct ipqent *tcpipqent_alloc(void);
905void tcpipqent_free(struct ipqent *);
906
/*
 * Further TCP prototypes: segment response, timers, templates,
 * tracing and initial-sequence-number generation.
 * tcp_signature_compute() is declared only when TCP_SIGNATURE is defined.
 */
907int tcp_respond(struct tcpcb *, struct mbuf *, struct mbuf *,
908 struct tcphdr *, tcp_seq, tcp_seq, int);
909void tcp_rmx_rtt(struct tcpcb *);
910void tcp_setpersist(struct tcpcb *);
911#ifdef TCP_SIGNATURE
912int tcp_signature_compute(struct mbuf *, struct tcphdr *, int, int,
913 int, u_char *, u_int);
914#endif /* TCP_SIGNATURE */
915void tcp_fasttimo(void);
916struct mbuf *
917 tcp_template(struct tcpcb *);
918void tcp_trace(short, short, struct tcpcb *, struct mbuf *, int);
919struct tcpcb *
920 tcp_usrclosed(struct tcpcb *);
921void tcp_usrreq_init(void);
922void tcp_xmit_timer(struct tcpcb *, uint32_t);
/*
 * Both ISS helpers return a tcp_seq and take a tcp_seq as last argument.
 * NOTE(review): the meaning of tcp_new_iss1()'s two void * and two
 * u_int16_t arguments (likely addresses and ports) and of the size_t
 * is not stated here -- confirm against the definition.
 */
923tcp_seq tcp_new_iss(struct tcpcb *, tcp_seq);
924tcp_seq tcp_new_iss1(void *, void *, u_int16_t, u_int16_t, size_t,
925 tcp_seq);
926
/* Selective acknowledgement (SACK) support routines. */
927void tcp_sack_init(void);
928void tcp_new_dsack(struct tcpcb *, tcp_seq, u_int32_t);
929void tcp_sack_option(struct tcpcb *, const struct tcphdr *,
930 const u_char *, int);
931void tcp_del_sackholes(struct tcpcb *, const struct tcphdr *);
932void tcp_free_sackholes(struct tcpcb *);
933void tcp_sack_adjust(struct tcpcb *tp);
934struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt);
935int tcp_sack_numblks(const struct tcpcb *);
/*
 * Expands to 8 * nblks + 4; the argument is parenthesized and evaluated
 * exactly once.
 * NOTE(review): presumably 8 bytes per SACK block (two 32-bit sequence
 * numbers), 2 bytes of option kind/length and 2 bytes of padding --
 * confirm against RFC 2018 and the option-building code.
 */
936#define TCP_SACK_OPTLEN(nblks) ((nblks) * 8 + 2 + 2)
937
/*
 * Statistics update helpers: tcp_statinc() takes a single u_int,
 * tcp_statadd() a u_int plus a 64-bit amount.
 * NOTE(review): the u_int is presumably a TCP_STAT_* index below
 * TCP_NSTATS -- confirm against the definitions.
 */
938void tcp_statinc(u_int);
939void tcp_statadd(u_int, uint64_t);
940
/*
 * SYN cache interface.  Most routines identify an entry by a pair of
 * struct sockaddr pointers; syn_cache_lookup() additionally takes a
 * struct syn_cache_head ** argument.
 * NOTE(review): which sockaddr is the source and which the destination,
 * and whether the struct syn_cache_head ** is an out-parameter for the
 * matching bucket, cannot be told from the prototypes -- confirm
 * against the definitions.
 */
941int syn_cache_add(struct sockaddr *, struct sockaddr *,
942 struct tcphdr *, unsigned int, struct socket *,
943 struct mbuf *, u_char *, int, struct tcp_opt_info *);
944void syn_cache_unreach(const struct sockaddr *, const struct sockaddr *,
945 struct tcphdr *);
946struct socket *syn_cache_get(struct sockaddr *, struct sockaddr *,
947 struct tcphdr *, struct socket *so, struct mbuf *);
948void syn_cache_init(void);
949void syn_cache_insert(struct syn_cache *, struct tcpcb *);
950struct syn_cache *syn_cache_lookup(const struct sockaddr *, const struct sockaddr *,
951 struct syn_cache_head **);
952void syn_cache_reset(struct sockaddr *, struct sockaddr *,
953 struct tcphdr *);
954int syn_cache_respond(struct syn_cache *);
955void syn_cache_cleanup(struct tcpcb *);
956
/*
 * Checksum verification for an incoming segment.
 * NOTE(review): the meaning of the four int arguments and of the
 * return value is not visible here -- confirm against the definition.
 */
957int tcp_input_checksum(int, struct mbuf *, const struct tcphdr *, int, int,
958 int);
959#endif /* _KERNEL */
960
961#endif /* !_NETINET_TCP_VAR_H_ */
962