VirtualBox

source: vbox/trunk/src/VBox/Devices/Network/slirp/tcp_input.c@ 15890

最後變更 在這個檔案從15890是 15890,由 vboxsync 提交於 16 年 前

NAT: 1. The branch without the sync enhancement is still functional (it had been broken by using the ICMP file handler in select(1)).

  1. After sending, the send queue no longer needs to synchronize with the NAT thread to free the mbuf; instead, the NAT queue is used to call the slirp freeing routine.
  2. No more copying on the slirp-to-guest send path.


  • 屬性 svn:eol-style 設為 native
檔案大小: 59.3 KB
 
1/*
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
3 * The Regents of the University of California. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 * must display the following acknowledgement:
15 * This product includes software developed by the University of
16 * California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 * @(#)tcp_input.c 8.5 (Berkeley) 4/10/94
34 * tcp_input.c,v 1.10 1994/10/13 18:36:32 wollman Exp
35 */
36
37/*
38 * Changes and additions relating to SLiRP
39 * Copyright (c) 1995 Danny Gasparovski.
40 *
41 * Please read the file COPYRIGHT for the
42 * terms and conditions of the copyright.
43 */
44
45#include <slirp.h>
46#include "ip_icmp.h"
47
48
/*
 * Idle limit used by the (currently compiled-out) PAWS check in this file,
 * which describes it as "24 days": 24 * 24 * 60 * 60 scaled by PR_SLOWHZ.
 * NOTE(review): PR_SLOWHZ is presumably slow-timer ticks per second - confirm.
 */
#define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * PR_SLOWHZ)

/*
 * Modulo comparisons of timestamps: the difference is taken first and then
 * interpreted as a signed int, so the result stays meaningful when the
 * timestamp counter wraps around.
 */
#define TSTMP_LT(a,b) ((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0)

/*
 * DELAY_ACK(tp, ti) - decide how the segment described by ti is acknowledged.
 *
 * Without TCP_ACK_HACK: a segment carrying TH_PUSH sets TF_ACKNOW on tp,
 * any other segment sets TF_DELACK.
 * With TCP_ACK_HACK: TF_DELACK is always set and the second argument is
 * ignored.
 *
 * The arguments are parenthesized so that expressions such as `&cb` can be
 * passed safely.  The expansion intentionally keeps its historical shape
 * (a bare if/else ending in ';') because call sites outside this excerpt may
 * rely on it; as a consequence, do not use it as the unbraced body of an
 * outer `if` that has its own `else`.
 */
#ifndef TCP_ACK_HACK
#define DELAY_ACK(tp, ti) \
    if ((ti)->ti_flags & TH_PUSH) \
        (tp)->t_flags |= TF_ACKNOW; \
    else \
        (tp)->t_flags |= TF_DELACK;
#else /* TCP_ACK_HACK */
#define DELAY_ACK(tp, ign) \
    (tp)->t_flags |= TF_DELACK;
#endif /* TCP_ACK_HACK */
65
66
67/*
68 * deps: netinet/tcp_reass.c
69 * tcp_reass_maxqlen = 48 (default)
70 * tcp_reass_maxseg = nmbclusters/16 (nmbclusters = 1024 + maxusers * 64 from kern/kern_mbuf.c let's say 256)
71 */
72int
73tcp_reass(PNATState pData, struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
74{
75 struct tseg_qent *q;
76 struct tseg_qent *p = NULL;
77 struct tseg_qent *nq;
78 struct tseg_qent *te = NULL;
79 struct socket *so = tp->t_socket;
80 int flags;
81
82 /*
83 * XXX: tcp_reass() is rather inefficient with its data structures
84 * and should be rewritten (see NetBSD for optimizations). While
85 * doing that it should move to its own file tcp_reass.c.
86 */
87
88 /*
89 * Call with th==NULL after become established to
90 * force pre-ESTABLISHED data up to user socket.
91 */
92 if (th == NULL)
93 goto present;
94
95 /*
96 * Limit the number of segments in the reassembly queue to prevent
97 * holding on to too many segments (and thus running out of mbufs).
98 * Make sure to let the missing segment through which caused this
99 * queue. Always keep one global queue entry spare to be able to
100 * process the missing segment.
101 */
102 if ( th->th_seq != tp->rcv_nxt
103 && ( tcp_reass_qsize + 1 >= tcp_reass_maxseg
104 || tp->t_segqlen >= tcp_reass_maxqlen))
105 {
106 tcp_reass_overflows++;
107 tcpstat.tcps_rcvmemdrop++;
108 m_freem(pData, m);
109 *tlenp = 0;
110 return (0);
111 }
112
113 /*
114 * Allocate a new queue entry. If we can't, or hit the zone limit
115 * just drop the pkt.
116 */
117 te = RTMemAlloc(sizeof(struct tseg_qent));
118 if (te == NULL)
119 {
120 tcpstat.tcps_rcvmemdrop++;
121 m_freem(pData, m);
122 *tlenp = 0;
123 return (0);
124 }
125 tp->t_segqlen++;
126 tcp_reass_qsize++;
127
128 /*
129 * Find a segment which begins after this one does.
130 */
131 LIST_FOREACH(q, &tp->t_segq, tqe_q)
132 {
133 if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
134 break;
135 p = q;
136 }
137
138 /*
139 * If there is a preceding segment, it may provide some of
140 * our data already. If so, drop the data from the incoming
141 * segment. If it provides all of our data, drop us.
142 */
143 if (p != NULL)
144 {
145 int i;
146 /* conversion to int (in i) handles seq wraparound */
147 i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
148 if (i > 0)
149 {
150 if (i >= *tlenp)
151 {
152 tcpstat.tcps_rcvduppack++;
153 tcpstat.tcps_rcvdupbyte += *tlenp;
154 m_freem(pData, m);
155 RTMemFree(te);
156 tp->t_segqlen--;
157 tcp_reass_qsize--;
158 /*
159 * Try to present any queued data
160 * at the left window edge to the user.
161 * This is needed after the 3-WHS
162 * completes.
163 */
164 goto present; /* ??? */
165 }
166 m_adj(m, i);
167 *tlenp -= i;
168 th->th_seq += i;
169 }
170 }
171 tcpstat.tcps_rcvoopack++;
172 tcpstat.tcps_rcvoobyte += *tlenp;
173
174 /*
175 * While we overlap succeeding segments trim them or,
176 * if they are completely covered, dequeue them.
177 */
178 while (q)
179 {
180 int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
181 if (i <= 0)
182 break;
183 if (i < q->tqe_len)
184 {
185 q->tqe_th->th_seq += i;
186 q->tqe_len -= i;
187 m_adj(q->tqe_m, i);
188 break;
189 }
190
191 nq = LIST_NEXT(q, tqe_q);
192 LIST_REMOVE(q, tqe_q);
193 m_freem(pData, q->tqe_m);
194 RTMemFree(q);
195 tp->t_segqlen--;
196 tcp_reass_qsize--;
197 q = nq;
198 }
199
200 /* Insert the new segment queue entry into place. */
201 te->tqe_m = m;
202 te->tqe_th = th;
203 te->tqe_len = *tlenp;
204
205 if (p == NULL)
206 {
207 LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
208 }
209 else
210 {
211 LIST_INSERT_AFTER(p, te, tqe_q);
212 }
213
214present:
215 /*
216 * Present data to user, advancing rcv_nxt through
217 * completed sequence space.
218 */
219 if (!TCPS_HAVEESTABLISHED(tp->t_state))
220 return (0);
221 q = LIST_FIRST(&tp->t_segq);
222 if (!q || q->tqe_th->th_seq != tp->rcv_nxt)
223 return (0);
224 do
225 {
226 tp->rcv_nxt += q->tqe_len;
227 flags = q->tqe_th->th_flags & TH_FIN;
228 nq = LIST_NEXT(q, tqe_q);
229 LIST_REMOVE(q, tqe_q);
230 /* XXX: This place should be checked for the same code in
231 * original BSD code for Slirp and current BSD used SS_FCANTRCVMORE
232 */
233 if (so->so_state & SS_FCANTSENDMORE)
234 m_freem(pData, q->tqe_m);
235 else
236 sbappend(pData, so, q->tqe_m);
237 RTMemFree(q);
238 tp->t_segqlen--;
239 tcp_reass_qsize--;
240 q = nq;
241 }
242 while (q && q->tqe_th->th_seq == tp->rcv_nxt);
243
244 return flags;
245}
246
247/*
248 * TCP input routine, follows pages 65-76 of the
249 * protocol specification dated September, 1981 very closely.
250 */
251void
252tcp_input(PNATState pData, register struct mbuf *m, int iphlen, struct socket *inso)
253{
254 struct ip save_ip, *ip;
255 register struct tcpiphdr *ti;
256 caddr_t optp = NULL;
257 int optlen = 0;
258 int len, tlen, off;
259 register struct tcpcb *tp = 0;
260 register int tiflags;
261 struct socket *so = 0;
262 int todrop, acked, ourfinisacked, needoutput = 0;
263/* int dropsocket = 0; */
264 int iss = 0;
265 u_long tiwin;
266/* int ts_present = 0; */
267
268 DEBUG_CALL("tcp_input");
269 DEBUG_ARGS((dfd," m = %8lx iphlen = %2d inso = %lx\n",
270 (long )m, iphlen, (long )inso ));
271
272 /*
273 * If called with m == 0, then we're continuing the connect
274 */
275 if (m == NULL)
276 {
277 so = inso;
278
279 /* Re-set a few variables */
280 tp = sototcpcb(so);
281 m = so->so_m;
282 so->so_m = 0;
283 ti = so->so_ti;
284 tiwin = ti->ti_win;
285 tiflags = ti->ti_flags;
286
287 goto cont_conn;
288 }
289
290 tcpstat.tcps_rcvtotal++;
291 /*
292 * Get IP and TCP header together in first mbuf.
293 * Note: IP leaves IP header in first mbuf.
294 */
295 ti = mtod(m, struct tcpiphdr *);
296 if (iphlen > sizeof(struct ip ))
297 {
298 ip_stripoptions(m, (struct mbuf *)0);
299 iphlen = sizeof(struct ip );
300 }
301 /* XXX Check if too short */
302
303
304 /*
305 * Save a copy of the IP header in case we want restore it
306 * for sending an ICMP error message in response.
307 */
308 ip = mtod(m, struct ip *);
309 save_ip = *ip;
310 save_ip.ip_len+= iphlen;
311
312 /*
313 * Checksum extended TCP header and data.
314 */
315 tlen = ((struct ip *)ti)->ip_len;
316 memset(ti->ti_x1, 0, 9);
317 ti->ti_len = htons((u_int16_t)tlen);
318 len = sizeof(struct ip ) + tlen;
319 /* keep checksum for ICMP reply
320 * ti->ti_sum = cksum(m, len);
321 * if (ti->ti_sum) { */
322 if (cksum(m, len))
323 {
324 tcpstat.tcps_rcvbadsum++;
325 Log2(("checksum is invalid => drop\n"));
326 goto drop;
327 }
328
329 /*
330 * Check that TCP offset makes sense,
331 * pull out TCP options and adjust length. XXX
332 */
333 off = ti->ti_off << 2;
334 if ( off < sizeof (struct tcphdr)
335 || off > tlen)
336 {
337 tcpstat.tcps_rcvbadoff++;
338 Log2(("ti_off(tlen(%d)<%d<(tcphdr(%d))) is invalid =>drop\n", tlen, off, sizeof(struct tcphdr)));
339 goto drop;
340 }
341 tlen -= off;
342 ti->ti_len = tlen;
343 if (off > sizeof (struct tcphdr))
344 {
345 optlen = off - sizeof (struct tcphdr);
346 optp = mtod(m, caddr_t) + sizeof (struct tcpiphdr);
347
348 /*
349 * Do quick retrieval of timestamp options ("options
350 * prediction?"). If timestamp is the only option and it's
351 * formatted as recommended in RFC 1323 appendix A, we
352 * quickly get the values now and not bother calling
353 * tcp_dooptions(), etc.
354 */
355#if 0
356 if (( optlen == TCPOLEN_TSTAMP_APPA
357 || ( optlen > TCPOLEN_TSTAMP_APPA
358 && optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
359 *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
360 (ti->ti_flags & TH_SYN) == 0)
361 {
362 ts_present = 1;
363 ts_val = ntohl(*(u_int32_t *)(optp + 4));
364 ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
365 optp = NULL; / * we have parsed the options * /
366 }
367#endif
368 }
369 tiflags = ti->ti_flags;
370
371 /*
372 * Convert TCP protocol specific fields to host format.
373 */
374 NTOHL(ti->ti_seq);
375 NTOHL(ti->ti_ack);
376 NTOHS(ti->ti_win);
377 NTOHS(ti->ti_urp);
378
379 /*
380 * Drop TCP, IP headers and TCP options.
381 */
382 m->m_data += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
383 m->m_len -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
384
385 /*
386 * Locate pcb for segment.
387 */
388findso:
389 so = tcp_last_so;
390 if ( so->so_fport != ti->ti_dport
391 || so->so_lport != ti->ti_sport
392 || so->so_laddr.s_addr != ti->ti_src.s_addr
393 || so->so_faddr.s_addr != ti->ti_dst.s_addr)
394 {
395 so = solookup(&tcb, ti->ti_src, ti->ti_sport,
396 ti->ti_dst, ti->ti_dport);
397 if (so)
398 tcp_last_so = so;
399 ++tcpstat.tcps_socachemiss;
400 }
401
402 /*
403 * If the state is CLOSED (i.e., TCB does not exist) then
404 * all data in the incoming segment is discarded.
405 * If the TCB exists but is in CLOSED state, it is embryonic,
406 * but should either do a listen or a connect soon.
407 *
408 * state == CLOSED means we've done socreate() but haven't
409 * attached it to a protocol yet...
410 *
411 * XXX If a TCB does not exist, and the TH_SYN flag is
412 * the only flag set, then create a session, mark it
413 * as if it was LISTENING, and continue...
414 */
415 if (so == 0)
416 {
417 if ((tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) != TH_SYN)
418 goto dropwithreset;
419
420 if ((so = socreate()) == NULL)
421 goto dropwithreset;
422 if (tcp_attach(pData, so) < 0)
423 {
424 RTMemFree(so); /* Not sofree (if it failed, it's not insqued) */
425 goto dropwithreset;
426 }
427
428 sbreserve(&so->so_snd, tcp_sndspace);
429 sbreserve(&so->so_rcv, tcp_rcvspace);
430
431/* tcp_last_so = so; */ /* XXX ? */
432/* tp = sototcpcb(so); */
433
434 so->so_laddr = ti->ti_src;
435 so->so_lport = ti->ti_sport;
436 so->so_faddr = ti->ti_dst;
437 so->so_fport = ti->ti_dport;
438
439 if ((so->so_iptos = tcp_tos(so)) == 0)
440 so->so_iptos = ((struct ip *)ti)->ip_tos;
441
442 tp = sototcpcb(so);
443 tp->t_state = TCPS_LISTEN;
444 }
445
446 /*
447 * If this is a still-connecting socket, this probably
448 * a retransmit of the SYN. Whether it's a retransmit SYN
449 * or something else, we nuke it.
450 */
451 if (so->so_state & SS_ISFCONNECTING)
452 {
453 Log2(("so_state(%x) of %R[natsock] is still connecting =>drop\n", so->so_state, so));
454 goto drop;
455 }
456
457 tp = sototcpcb(so);
458
459 /* XXX Should never fail */
460 if (tp == 0)
461 goto dropwithreset;
462 if (tp->t_state == TCPS_CLOSED)
463 {
464 Log2(("t_state(%x) is closed =>drop\n", tp->t_state));
465 goto drop;
466 }
467
468 /* Unscale the window into a 32-bit value. */
469/* if ((tiflags & TH_SYN) == 0)
470 * tiwin = ti->ti_win << tp->snd_scale;
471 * else
472 */
473 tiwin = ti->ti_win;
474
475 /*
476 * Segment received on connection.
477 * Reset idle time and keep-alive timer.
478 */
479 tp->t_idle = 0;
480 if (so_options)
481 tp->t_timer[TCPT_KEEP] = tcp_keepintvl;
482 else
483 tp->t_timer[TCPT_KEEP] = tcp_keepidle;
484
485 /*
486 * Process options if not in LISTEN state,
487 * else do it below (after getting remote address).
488 */
489 if (optp && tp->t_state != TCPS_LISTEN)
490 tcp_dooptions(pData, tp, (u_char *)optp, optlen, ti);
491/* , */
492/* &ts_present, &ts_val, &ts_ecr); */
493
494 /*
495 * Header prediction: check for the two common cases
496 * of a uni-directional data xfer. If the packet has
497 * no control flags, is in-sequence, the window didn't
498 * change and we're not retransmitting, it's a
499 * candidate. If the length is zero and the ack moved
500 * forward, we're the sender side of the xfer. Just
501 * free the data acked & wake any higher level process
502 * that was blocked waiting for space. If the length
503 * is non-zero and the ack didn't move, we're the
504 * receiver side. If we're getting packets in-order
505 * (the reassembly queue is empty), add the data to
506 * the socket buffer and note that we need a delayed ack.
507 *
508 * XXX Some of these tests are not needed
509 * eg: the tiwin == tp->snd_wnd prevents many more
510 * predictions.. with no *real* advantage..
511 */
512 if ( tp->t_state == TCPS_ESTABLISHED
513 && (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK
514/* && (!ts_present || TSTMP_GEQ(ts_val, tp->ts_recent)) */
515 && ti->ti_seq == tp->rcv_nxt
516 && tiwin && tiwin == tp->snd_wnd
517 && tp->snd_nxt == tp->snd_max)
518 {
519 /*
520 * If last ACK falls within this segment's sequence numbers,
521 * record the timestamp.
522 */
523#if 0
524 if (ts_present && SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) &&
525 SEQ_LT(tp->last_ack_sent, ti->ti_seq + ti->ti_len))
526 {
527 tp->ts_recent_age = tcp_now;
528 tp->ts_recent = ts_val;
529 }
530#endif
531
532 if (ti->ti_len == 0)
533 {
534 if ( SEQ_GT(ti->ti_ack, tp->snd_una)
535 && SEQ_LEQ(ti->ti_ack, tp->snd_max)
536 && tp->snd_cwnd >= tp->snd_wnd)
537 {
538 /*
539 * this is a pure ack for outstanding data.
540 */
541 ++tcpstat.tcps_predack;
542#if 0
543 if (ts_present)
544 tcp_xmit_timer(tp, tcp_now-ts_ecr+1);
545 else
546#endif
547 if ( tp->t_rtt
548 && SEQ_GT(ti->ti_ack, tp->t_rtseq))
549 tcp_xmit_timer(pData, tp, tp->t_rtt);
550 acked = ti->ti_ack - tp->snd_una;
551 tcpstat.tcps_rcvackpack++;
552 tcpstat.tcps_rcvackbyte += acked;
553 sbdrop(&so->so_snd, acked);
554 tp->snd_una = ti->ti_ack;
555 m_freem(pData, m);
556
557 /*
558 * If all outstanding data are acked, stop
559 * retransmit timer, otherwise restart timer
560 * using current (possibly backed-off) value.
561 * If process is waiting for space,
562 * wakeup/selwakeup/signal. If data
563 * are ready to send, let tcp_output
564 * decide between more output or persist.
565 */
566 if (tp->snd_una == tp->snd_max)
567 tp->t_timer[TCPT_REXMT] = 0;
568 else if (tp->t_timer[TCPT_PERSIST] == 0)
569 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
570
571 /*
572 * There's room in so_snd, sowwakup will read()
573 * from the socket if we can
574 */
575#if 0
576 if (so->so_snd.sb_flags & SB_NOTIFY)
577 sowwakeup(so);
578#endif
579 /*
580 * This is called because sowwakeup might have
581 * put data into so_snd. Since we don't so sowwakeup,
582 * we don't need this.. XXX???
583 */
584 if (so->so_snd.sb_cc)
585 (void) tcp_output(pData, tp);
586
587 return;
588 }
589 }
590 else if ( ti->ti_ack == tp->snd_una
591 && LIST_FIRST(&tp->t_segq)
592 && ti->ti_len <= sbspace(&so->so_rcv))
593 {
594 /*
595 * this is a pure, in-sequence data packet
596 * with nothing on the reassembly queue and
597 * we have enough buffer space to take it.
598 */
599 ++tcpstat.tcps_preddat;
600 tp->rcv_nxt += ti->ti_len;
601 tcpstat.tcps_rcvpack++;
602 tcpstat.tcps_rcvbyte += ti->ti_len;
603 /*
604 * Add data to socket buffer.
605 */
606 if (so->so_emu)
607 {
608 if (tcp_emu(pData, so,m)) sbappend(pData, so, m);
609 }
610 else
611 sbappend(pData, so, m);
612
613 /*
614 * XXX This is called when data arrives. Later, check
615 * if we can actually write() to the socket
616 * XXX Need to check? It's be NON_BLOCKING
617 */
618/* sorwakeup(so); */
619
620 /*
621 * If this is a short packet, then ACK now - with Nagel
622 * congestion avoidance sender won't send more until
623 * he gets an ACK.
624 *
625 * It is better to not delay acks at all to maximize
626 * TCP throughput. See RFC 2581.
627 */
628 tp->t_flags |= TF_ACKNOW;
629 tcp_output(pData, tp);
630 return;
631 }
632 } /* header prediction */
633 /*
634 * Calculate amount of space in receive window,
635 * and then do TCP input processing.
636 * Receive window is amount of space in rcv queue,
637 * but not less than advertised window.
638 */
639 {
640 int win;
641 win = sbspace(&so->so_rcv);
642 if (win < 0)
643 win = 0;
644 tp->rcv_wnd = max(win, (int)(tp->rcv_adv - tp->rcv_nxt));
645 }
646
647 switch (tp->t_state)
648 {
649 /*
650 * If the state is LISTEN then ignore segment if it contains an RST.
651 * If the segment contains an ACK then it is bad and send a RST.
652 * If it does not contain a SYN then it is not interesting; drop it.
653 * Don't bother responding if the destination was a broadcast.
654 * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial
655 * tp->iss, and send a segment:
656 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
657 * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
658 * Fill in remote peer address fields if not previously specified.
659 * Enter SYN_RECEIVED state, and process any other fields of this
660 * segment in this state.
661 */
662 case TCPS_LISTEN:
663 {
664 if (tiflags & TH_RST) {
665 Log2(("RST(%x) is on listen =>drop\n", tiflags));
666 goto drop;
667 }
668 if (tiflags & TH_ACK)
669 goto dropwithreset;
670 if ((tiflags & TH_SYN) == 0)
671 {
672 Log2(("SYN(%x) is off on listen =>drop\n", tiflags));
673 goto drop;
674 }
675
676 /*
677 * This has way too many gotos...
678 * But a bit of spaghetti code never hurt anybody :)
679 */
680
681 if (so->so_emu & EMU_NOCONNECT)
682 {
683 so->so_emu &= ~EMU_NOCONNECT;
684 goto cont_input;
685 }
686
687 if ( (tcp_fconnect(pData, so) == -1)
688 && errno != EINPROGRESS
689 && errno != EWOULDBLOCK)
690 {
691 u_char code = ICMP_UNREACH_NET;
692 DEBUG_MISC((dfd," tcp fconnect errno = %d-%s\n",
693 errno,strerror(errno)));
694 if (errno == ECONNREFUSED)
695 {
696 /* ACK the SYN, send RST to refuse the connection */
697 tcp_respond(pData, tp, ti, m, ti->ti_seq+1, (tcp_seq)0,
698 TH_RST|TH_ACK);
699 }
700 else
701 {
702 if (errno == EHOSTUNREACH)
703 code = ICMP_UNREACH_HOST;
704 HTONL(ti->ti_seq); /* restore tcp header */
705 HTONL(ti->ti_ack);
706 HTONS(ti->ti_win);
707 HTONS(ti->ti_urp);
708 m->m_data -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
709 m->m_len += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
710 *ip = save_ip;
711 icmp_error(pData, m, ICMP_UNREACH,code, 0,strerror(errno));
712 }
713 tp = tcp_close(pData, tp);
714 m_free(pData, m);
715 }
716 else
717 {
718 /*
719 * Haven't connected yet, save the current mbuf
720 * and ti, and return
721 * XXX Some OS's don't tell us whether the connect()
722 * succeeded or not. So we must time it out.
723 */
724 so->so_m = m;
725 so->so_ti = ti;
726 tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
727 tp->t_state = TCPS_SYN_RECEIVED;
728 }
729 return;
730
731cont_conn:
732 /* m==NULL
733 * Check if the connect succeeded
734 */
735 if (so->so_state & SS_NOFDREF)
736 {
737 tp = tcp_close(pData, tp);
738 goto dropwithreset;
739 }
740cont_input:
741 tcp_template(tp);
742
743 if (optp)
744 tcp_dooptions(pData, tp, (u_char *)optp, optlen, ti);
745
746 if (iss)
747 tp->iss = iss;
748 else
749 tp->iss = tcp_iss;
750 tcp_iss += TCP_ISSINCR/2;
751 tp->irs = ti->ti_seq;
752 tcp_sendseqinit(tp);
753 tcp_rcvseqinit(tp);
754 tp->t_flags |= TF_ACKNOW;
755 tp->t_state = TCPS_SYN_RECEIVED;
756 tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
757 tcpstat.tcps_accepts++;
758 goto trimthenstep6;
759 } /* case TCPS_LISTEN */
760
761 /*
762 * If the state is SYN_SENT:
763 * if seg contains an ACK, but not for our SYN, drop the input.
764 * if seg contains a RST, then drop the connection.
765 * if seg does not contain SYN, then drop it.
766 * Otherwise this is an acceptable SYN segment
767 * initialize tp->rcv_nxt and tp->irs
768 * if seg contains ack then advance tp->snd_una
769 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
770 * arrange for segment to be acked (eventually)
771 * continue processing rest of data/controls, beginning with URG
772 */
773 case TCPS_SYN_SENT:
774 if ( (tiflags & TH_ACK)
775 && ( SEQ_LEQ(ti->ti_ack, tp->iss)
776 || SEQ_GT(ti->ti_ack, tp->snd_max)))
777 goto dropwithreset;
778
779 if (tiflags & TH_RST)
780 {
781 if (tiflags & TH_ACK)
782 tp = tcp_drop(pData, tp,0); /* XXX Check t_softerror! */
783 Log2(("RST(%x) is on SYN_SENT =>drop\n", tiflags));
784 goto drop;
785 }
786
787 if ((tiflags & TH_SYN) == 0)
788 {
789 Log2(("SYN(%x) bit is off on SYN_SENT =>drop\n", tiflags));
790 goto drop;
791 }
792 if (tiflags & TH_ACK)
793 {
794 tp->snd_una = ti->ti_ack;
795 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
796 tp->snd_nxt = tp->snd_una;
797 }
798
799 tp->t_timer[TCPT_REXMT] = 0;
800 tp->irs = ti->ti_seq;
801 tcp_rcvseqinit(tp);
802 tp->t_flags |= TF_ACKNOW;
803 if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss))
804 {
805 tcpstat.tcps_connects++;
806 soisfconnected(so);
807 tp->t_state = TCPS_ESTABLISHED;
808
809 /* Do window scaling on this connection? */
810#if 0
811 if (( tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE))
812 == (TF_RCVD_SCALE|TF_REQ_SCALE))
813 {
814 tp->snd_scale = tp->requested_s_scale;
815 tp->rcv_scale = tp->request_r_scale;
816 }
817#endif
818 (void) tcp_reass(pData, tp, (struct tcphdr *)0, NULL, (struct mbuf *)0);
819 /*
820 * if we didn't have to retransmit the SYN,
821 * use its rtt as our initial srtt & rtt var.
822 */
823 if (tp->t_rtt)
824 tcp_xmit_timer(pData, tp, tp->t_rtt);
825 }
826 else
827 tp->t_state = TCPS_SYN_RECEIVED;
828
829trimthenstep6:
830 /*
831 * Advance ti->ti_seq to correspond to first data byte.
832 * If data, trim to stay within window,
833 * dropping FIN if necessary.
834 */
835 ti->ti_seq++;
836 if (ti->ti_len > tp->rcv_wnd)
837 {
838 todrop = ti->ti_len - tp->rcv_wnd;
839 m_adj(m, -todrop);
840 ti->ti_len = tp->rcv_wnd;
841 tiflags &= ~TH_FIN;
842 tcpstat.tcps_rcvpackafterwin++;
843 tcpstat.tcps_rcvbyteafterwin += todrop;
844 }
845 tp->snd_wl1 = ti->ti_seq - 1;
846 tp->rcv_up = ti->ti_seq;
847 goto step6;
848 } /* switch tp->t_state */
849 /*
850 * States other than LISTEN or SYN_SENT.
851 * First check timestamp, if present.
852 * Then check that at least some bytes of segment are within
853 * receive window. If segment begins before rcv_nxt,
854 * drop leading data (and SYN); if nothing left, just ack.
855 *
856 * RFC 1323 PAWS: If we have a timestamp reply on this segment
857 * and it's less than ts_recent, drop it.
858 */
859#if 0
860 if ( ts_present
861 && (tiflags & TH_RST) == 0
862 && tp->ts_recent
863 && TSTMP_LT(ts_val, tp->ts_recent))
864 {
865 /* Check to see if ts_recent is over 24 days old. */
866 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE)
867 {
868 /*
869 * Invalidate ts_recent. If this segment updates
870 * ts_recent, the age will be reset later and ts_recent
871 * will get a valid value. If it does not, setting
872 * ts_recent to zero will at least satisfy the
873 * requirement that zero be placed in the timestamp
874 * echo reply when ts_recent isn't valid. The
875 * age isn't reset until we get a valid ts_recent
876 * because we don't want out-of-order segments to be
877 * dropped when ts_recent is old.
878 */
879 tp->ts_recent = 0;
880 }
881 else
882 {
883 tcpstat.tcps_rcvduppack++;
884 tcpstat.tcps_rcvdupbyte += ti->ti_len;
885 tcpstat.tcps_pawsdrop++;
886 goto dropafterack;
887 }
888 }
889#endif
890
891 todrop = tp->rcv_nxt - ti->ti_seq;
892 if (todrop > 0)
893 {
894 if (tiflags & TH_SYN)
895 {
896 tiflags &= ~TH_SYN;
897 ti->ti_seq++;
898 if (ti->ti_urp > 1)
899 ti->ti_urp--;
900 else
901 tiflags &= ~TH_URG;
902 todrop--;
903 }
904 /*
905 * Following if statement from Stevens, vol. 2, p. 960.
906 */
907 if ( todrop > ti->ti_len
908 || ( todrop == ti->ti_len
909 && (tiflags & TH_FIN) == 0))
910 {
911 /*
912 * Any valid FIN must be to the left of the window.
913 * At this point the FIN must be a duplicate or out
914 * of sequence; drop it.
915 */
916 tiflags &= ~TH_FIN;
917
918 /*
919 * Send an ACK to resynchronize and drop any data.
920 * But keep on processing for RST or ACK.
921 */
922 tp->t_flags |= TF_ACKNOW;
923 todrop = ti->ti_len;
924 tcpstat.tcps_rcvduppack++;
925 tcpstat.tcps_rcvdupbyte += todrop;
926 }
927 else
928 {
929 tcpstat.tcps_rcvpartduppack++;
930 tcpstat.tcps_rcvpartdupbyte += todrop;
931 }
932 m_adj(m, todrop);
933 ti->ti_seq += todrop;
934 ti->ti_len -= todrop;
935 if (ti->ti_urp > todrop)
936 ti->ti_urp -= todrop;
937 else
938 {
939 tiflags &= ~TH_URG;
940 ti->ti_urp = 0;
941 }
942 }
943 /*
944 * If new data are received on a connection after the
945 * user processes are gone, then RST the other end.
946 */
947 if ( (so->so_state & SS_NOFDREF)
948 && tp->t_state > TCPS_CLOSE_WAIT && ti->ti_len)
949 {
950 tp = tcp_close(pData, tp);
951 tcpstat.tcps_rcvafterclose++;
952 goto dropwithreset;
953 }
954
955 /*
956 * If segment ends after window, drop trailing data
957 * (and PUSH and FIN); if nothing left, just ACK.
958 */
959 todrop = (ti->ti_seq+ti->ti_len) - (tp->rcv_nxt+tp->rcv_wnd);
960 if (todrop > 0)
961 {
962 tcpstat.tcps_rcvpackafterwin++;
963 if (todrop >= ti->ti_len)
964 {
965 tcpstat.tcps_rcvbyteafterwin += ti->ti_len;
966 /*
967 * If a new connection request is received
968 * while in TIME_WAIT, drop the old connection
969 * and start over if the sequence numbers
970 * are above the previous ones.
971 */
972 if ( tiflags & TH_SYN
973 && tp->t_state == TCPS_TIME_WAIT
974 && SEQ_GT(ti->ti_seq, tp->rcv_nxt))
975 {
976 iss = tp->rcv_nxt + TCP_ISSINCR;
977 tp = tcp_close(pData, tp);
978 goto findso;
979 }
980 /*
981 * If window is closed can only take segments at
982 * window edge, and have to drop data and PUSH from
983 * incoming segments. Continue processing, but
984 * remember to ack. Otherwise, drop segment
985 * and ack.
986 */
987 if (tp->rcv_wnd == 0 && ti->ti_seq == tp->rcv_nxt)
988 {
989 tp->t_flags |= TF_ACKNOW;
990 tcpstat.tcps_rcvwinprobe++;
991 }
992 else
993 goto dropafterack;
994 }
995 else
996 tcpstat.tcps_rcvbyteafterwin += todrop;
997 m_adj(m, -todrop);
998 ti->ti_len -= todrop;
999 tiflags &= ~(TH_PUSH|TH_FIN);
1000 }
1001
1002 /*
1003 * If last ACK falls within this segment's sequence numbers,
1004 * record its timestamp.
1005 */
1006#if 0
1007 if ( ts_present
1008 && SEQ_LEQ(ti->ti_seq, tp->last_ack_sent)
1009 && SEQ_LT(tp->last_ack_sent, ti->ti_seq + ti->ti_len + ((tiflags & (TH_SYN|TH_FIN)) != 0)))
1010 {
1011 tp->ts_recent_age = tcp_now;
1012 tp->ts_recent = ts_val;
1013 }
1014#endif
1015
1016 /*
1017 * If the RST bit is set examine the state:
1018 * SYN_RECEIVED STATE:
1019 * If passive open, return to LISTEN state.
1020 * If active open, inform user that connection was refused.
1021 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
1022 * Inform user that connection was reset, and close tcb.
1023 * CLOSING, LAST_ACK, TIME_WAIT STATES
1024 * Close the tcb.
1025 */
1026 if (tiflags&TH_RST)
1027 switch (tp->t_state)
1028 {
1029 case TCPS_SYN_RECEIVED:
1030/* so->so_error = ECONNREFUSED; */
1031 goto close;
1032
1033 case TCPS_ESTABLISHED:
1034 case TCPS_FIN_WAIT_1:
1035 case TCPS_FIN_WAIT_2:
1036 case TCPS_CLOSE_WAIT:
1037/* so->so_error = ECONNRESET; */
1038close:
1039 Log2(("closing...=>drop\n", tp->t_state));
1040 tp->t_state = TCPS_CLOSED;
1041 tcpstat.tcps_drops++;
1042 tp = tcp_close(pData, tp);
1043 goto drop;
1044
1045 case TCPS_CLOSING:
1046 case TCPS_LAST_ACK:
1047 case TCPS_TIME_WAIT:
1048 Log2(("t_state is (%x) sort of close =>drop\n", tp->t_state));
1049 tp = tcp_close(pData, tp);
1050 goto drop;
1051 }
1052
1053 /*
1054 * If a SYN is in the window, then this is an
1055 * error and we send an RST and drop the connection.
1056 */
1057 if (tiflags & TH_SYN)
1058 {
1059 tp = tcp_drop(pData, tp,0);
1060 goto dropwithreset;
1061 }
1062
1063 /*
1064 * If the ACK bit is off we drop the segment and return.
1065 */
1066 if ((tiflags & TH_ACK) == 0)
1067 {
1068 Log2(("ACK(%x) bit is off =>drop\n", tiflags));
1069 goto drop;
1070 }
1071
1072 /*
1073 * Ack processing.
1074 */
1075 switch (tp->t_state)
1076 {
1077 /*
1078 * In SYN_RECEIVED state if the ack ACKs our SYN then enter
1079 * ESTABLISHED state and continue processing, otherwise
1080 * send an RST. una<=ack<=max
1081 */
1082 case TCPS_SYN_RECEIVED:
1083 if ( SEQ_GT(tp->snd_una, ti->ti_ack)
1084 || SEQ_GT(ti->ti_ack, tp->snd_max))
1085 goto dropwithreset;
1086 tcpstat.tcps_connects++;
1087 tp->t_state = TCPS_ESTABLISHED;
1088 /*
1089 * The sent SYN is ack'ed with our sequence number +1
1090 * The first data byte already in the buffer will get
1091 * lost if no correction is made. This is only needed for
1092 * SS_CTL since the buffer is empty otherwise.
1093 * tp->snd_una++; or:
1094 */
1095 tp->snd_una = ti->ti_ack;
1096 soisfconnected(so);
1097
1098 /* Do window scaling? */
1099#if 0
1100 if ( (tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE))
1101 == (TF_RCVD_SCALE|TF_REQ_SCALE))
1102 {
1103 tp->snd_scale = tp->requested_s_scale;
1104 tp->rcv_scale = tp->request_r_scale;
1105 }
1106#endif
1107 (void) tcp_reass(pData, tp, (struct tcphdr *)0, (int *)0, (struct mbuf *)0);
1108 tp->snd_wl1 = ti->ti_seq - 1;
1109 /* Avoid ack processing; snd_una==ti_ack => dup ack */
1110 goto synrx_to_est;
1111 /* fall into ... */
1112
1113 /*
1114 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
1115 * ACKs. If the ack is in the range
1116 * tp->snd_una < ti->ti_ack <= tp->snd_max
1117 * then advance tp->snd_una to ti->ti_ack and drop
1118 * data from the retransmission queue. If this ACK reflects
1119 * more up to date window information we update our window information.
1120 */
1121 case TCPS_ESTABLISHED:
1122 case TCPS_FIN_WAIT_1:
1123 case TCPS_FIN_WAIT_2:
1124 case TCPS_CLOSE_WAIT:
1125 case TCPS_CLOSING:
1126 case TCPS_LAST_ACK:
1127 case TCPS_TIME_WAIT:
1128 if (SEQ_LEQ(ti->ti_ack, tp->snd_una))
1129 {
1130 if (ti->ti_len == 0 && tiwin == tp->snd_wnd)
1131 {
1132 tcpstat.tcps_rcvdupack++;
1133 DEBUG_MISC((dfd," dup ack m = %lx so = %lx \n",
1134 (long )m, (long )so));
1135 /*
1136 * If we have outstanding data (other than
1137 * a window probe), this is a completely
1138 * duplicate ack (ie, window info didn't
1139 * change), the ack is the biggest we've
1140 * seen and we've seen exactly our rexmt
1141 * threshold of them, assume a packet
1142 * has been dropped and retransmit it.
1143 * Kludge snd_nxt & the congestion
1144 * window so we send only this one
1145 * packet.
1146 *
1147 * We know we're losing at the current
1148 * window size so do congestion avoidance
1149 * (set ssthresh to half the current window
1150 * and pull our congestion window back to
1151 * the new ssthresh).
1152 *
1153 * Dup acks mean that packets have left the
1154 * network (they're now cached at the receiver)
1155 * so bump cwnd by the amount in the receiver
1156 * to keep a constant cwnd packets in the
1157 * network.
1158 */
1159 if ( tp->t_timer[TCPT_REXMT] == 0
1160 || ti->ti_ack != tp->snd_una)
1161 tp->t_dupacks = 0;
1162 else if (++tp->t_dupacks == tcprexmtthresh)
1163 {
1164 tcp_seq onxt = tp->snd_nxt;
1165 u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
1166 if (win < 2)
1167 win = 2;
1168 tp->snd_ssthresh = win * tp->t_maxseg;
1169 tp->t_timer[TCPT_REXMT] = 0;
1170 tp->t_rtt = 0;
1171 tp->snd_nxt = ti->ti_ack;
1172 tp->snd_cwnd = tp->t_maxseg;
1173 (void) tcp_output(pData, tp);
1174 tp->snd_cwnd = tp->snd_ssthresh +
1175 tp->t_maxseg * tp->t_dupacks;
1176 if (SEQ_GT(onxt, tp->snd_nxt))
1177 tp->snd_nxt = onxt;
1178 Log2(("t_dupacks(%d) == tcprexmtthresh(%d)=>drop\n", tp->t_dupacks, tcprexmtthresh));
1179 goto drop;
1180 }
1181 else if (tp->t_dupacks > tcprexmtthresh)
1182 {
1183 tp->snd_cwnd += tp->t_maxseg;
1184 (void) tcp_output(pData, tp);
1185 Log2(("t_dupacks(%d) > tcprexmtthresh(%d)=>drop\n", tp->t_dupacks, tcprexmtthresh));
1186 goto drop;
1187 }
1188 }
1189 else
1190 tp->t_dupacks = 0;
1191 break;
1192 }
1193synrx_to_est:
1194 /*
1195 * If the congestion window was inflated to account
1196 * for the other side's cached packets, retract it.
1197 */
1198 if ( tp->t_dupacks > tcprexmtthresh
1199 && tp->snd_cwnd > tp->snd_ssthresh)
1200 tp->snd_cwnd = tp->snd_ssthresh;
1201 tp->t_dupacks = 0;
1202 if (SEQ_GT(ti->ti_ack, tp->snd_max))
1203 {
1204 tcpstat.tcps_rcvacktoomuch++;
1205 goto dropafterack;
1206 }
1207 acked = ti->ti_ack - tp->snd_una;
1208 tcpstat.tcps_rcvackpack++;
1209 tcpstat.tcps_rcvackbyte += acked;
1210
1211 /*
1212 * If we have a timestamp reply, update smoothed
1213 * round trip time. If no timestamp is present but
1214 * transmit timer is running and timed sequence
1215 * number was acked, update smoothed round trip time.
1216 * Since we now have an rtt measurement, cancel the
1217 * timer backoff (cf., Phil Karn's retransmit alg.).
1218 * Recompute the initial retransmit timer.
1219 */
1220#if 0
1221 if (ts_present)
1222 tcp_xmit_timer(tp, tcp_now-ts_ecr+1);
1223 else
1224#endif
1225 if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq))
1226 tcp_xmit_timer(pData, tp,tp->t_rtt);
1227
1228 /*
1229 * If all outstanding data is acked, stop retransmit
1230 * timer and remember to restart (more output or persist).
1231 * If there is more data to be acked, restart retransmit
1232 * timer, using current (possibly backed-off) value.
1233 */
1234 if (ti->ti_ack == tp->snd_max)
1235 {
1236 tp->t_timer[TCPT_REXMT] = 0;
1237 needoutput = 1;
1238 }
1239 else if (tp->t_timer[TCPT_PERSIST] == 0)
1240 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
1241 /*
1242 * When new data is acked, open the congestion window.
1243 * If the window gives us less than ssthresh packets
1244 * in flight, open exponentially (maxseg per packet).
1245 * Otherwise open linearly: maxseg per window
1246 * (maxseg^2 / cwnd per packet).
1247 */
1248 {
1249 register u_int cw = tp->snd_cwnd;
1250 register u_int incr = tp->t_maxseg;
1251
1252 if (cw > tp->snd_ssthresh)
1253 incr = incr * incr / cw;
1254 tp->snd_cwnd = min(cw + incr, TCP_MAXWIN<<tp->snd_scale);
1255 }
1256 if (acked > so->so_snd.sb_cc)
1257 {
1258 tp->snd_wnd -= so->so_snd.sb_cc;
1259 sbdrop(&so->so_snd, (int )so->so_snd.sb_cc);
1260 ourfinisacked = 1;
1261 }
1262 else
1263 {
1264 sbdrop(&so->so_snd, acked);
1265 tp->snd_wnd -= acked;
1266 ourfinisacked = 0;
1267 }
1268 /*
1269 * XXX sowwakup is called when data is acked and there's room for
1270 * for more data... it should read() the socket
1271 */
1272#if 0
1273 if (so->so_snd.sb_flags & SB_NOTIFY)
1274 sowwakeup(so);
1275#endif
1276 tp->snd_una = ti->ti_ack;
1277 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
1278 tp->snd_nxt = tp->snd_una;
1279
1280 switch (tp->t_state)
1281 {
1282 /*
1283 * In FIN_WAIT_1 STATE in addition to the processing
1284 * for the ESTABLISHED state if our FIN is now acknowledged
1285 * then enter FIN_WAIT_2.
1286 */
1287 case TCPS_FIN_WAIT_1:
1288 if (ourfinisacked)
1289 {
1290 /*
1291 * If we can't receive any more
1292 * data, then closing user can proceed.
1293 * Starting the timer is contrary to the
1294 * specification, but if we don't get a FIN
1295 * we'll hang forever.
1296 */
1297 if (so->so_state & SS_FCANTRCVMORE)
1298 {
1299 soisfdisconnected(so);
1300 tp->t_timer[TCPT_2MSL] = tcp_maxidle;
1301 }
1302 tp->t_state = TCPS_FIN_WAIT_2;
1303 }
1304 break;
1305
1306 /*
1307 * In CLOSING STATE in addition to the processing for
1308 * the ESTABLISHED state if the ACK acknowledges our FIN
1309 * then enter the TIME-WAIT state, otherwise ignore
1310 * the segment.
1311 */
1312 case TCPS_CLOSING:
1313 if (ourfinisacked)
1314 {
1315 tp->t_state = TCPS_TIME_WAIT;
1316 tcp_canceltimers(tp);
1317 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1318 soisfdisconnected(so);
1319 }
1320 break;
1321
1322 /*
1323 * In LAST_ACK, we may still be waiting for data to drain
1324 * and/or to be acked, as well as for the ack of our FIN.
1325 * If our FIN is now acknowledged, delete the TCB,
1326 * enter the closed state and return.
1327 */
1328 case TCPS_LAST_ACK:
1329 if (ourfinisacked)
1330 {
1331 Log2(("ourfinisacked=>drop\n"));
1332 tp = tcp_close(pData, tp);
1333 goto drop;
1334 }
1335 break;
1336
1337 /*
1338 * In TIME_WAIT state the only thing that should arrive
1339 * is a retransmission of the remote FIN. Acknowledge
1340 * it and restart the finack timer.
1341 */
1342 case TCPS_TIME_WAIT:
1343 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1344 goto dropafterack;
1345 }
1346 } /* switch(tp->t_state) */
1347
1348step6:
1349 /*
1350 * Update window information.
1351 * Don't look at window if no ACK: TAC's send garbage on first SYN.
1352 */
1353 if ( (tiflags & TH_ACK)
1354 && ( SEQ_LT(tp->snd_wl1, ti->ti_seq)
1355 || ( tp->snd_wl1 == ti->ti_seq
1356 && ( SEQ_LT(tp->snd_wl2, ti->ti_ack)
1357 || ( tp->snd_wl2 == ti->ti_ack
1358 && tiwin > tp->snd_wnd)))))
1359 {
1360 /* keep track of pure window updates */
1361 if ( ti->ti_len == 0
1362 && tp->snd_wl2 == ti->ti_ack
1363 && tiwin > tp->snd_wnd)
1364 tcpstat.tcps_rcvwinupd++;
1365 tp->snd_wnd = tiwin;
1366 tp->snd_wl1 = ti->ti_seq;
1367 tp->snd_wl2 = ti->ti_ack;
1368 if (tp->snd_wnd > tp->max_sndwnd)
1369 tp->max_sndwnd = tp->snd_wnd;
1370 needoutput = 1;
1371 }
1372
1373 /*
1374 * Process segments with URG.
1375 */
1376 if ((tiflags & TH_URG) && ti->ti_urp &&
1377 TCPS_HAVERCVDFIN(tp->t_state) == 0)
1378 {
1379 /*
1380 * This is a kludge, but if we receive and accept
1381 * random urgent pointers, we'll crash in
1382 * soreceive. It's hard to imagine someone
1383 * actually wanting to send this much urgent data.
1384 */
1385 if (ti->ti_urp + so->so_rcv.sb_cc > so->so_rcv.sb_datalen)
1386 {
1387 ti->ti_urp = 0;
1388 tiflags &= ~TH_URG;
1389 goto dodata;
1390 }
1391 /*
1392 * If this segment advances the known urgent pointer,
1393 * then mark the data stream. This should not happen
1394 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
1395 * a FIN has been received from the remote side.
1396 * In these states we ignore the URG.
1397 *
1398 * According to RFC961 (Assigned Protocols),
1399 * the urgent pointer points to the last octet
1400 * of urgent data. We continue, however,
1401 * to consider it to indicate the first octet
1402 * of data past the urgent section as the original
1403 * spec states (in one of two places).
1404 */
1405 if (SEQ_GT(ti->ti_seq+ti->ti_urp, tp->rcv_up))
1406 {
1407 tp->rcv_up = ti->ti_seq + ti->ti_urp;
1408 so->so_urgc = so->so_rcv.sb_cc +
1409 (tp->rcv_up - tp->rcv_nxt); /* -1; */
1410 tp->rcv_up = ti->ti_seq + ti->ti_urp;
1411 }
1412 }
1413 else
1414 /*
1415 * If no out of band data is expected,
1416 * pull receive urgent pointer along
1417 * with the receive window.
1418 */
1419 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
1420 tp->rcv_up = tp->rcv_nxt;
1421dodata:
1422
1423 /*
1424 * If this is a small packet, then ACK now - with Nagel
1425 * congestion avoidance sender won't send more until
1426 * he gets an ACK.
1427 *
1428 * See above.
1429 */
1430 if ( ti->ti_len
1431 && (unsigned)ti->ti_len <= 5
1432 && ((struct tcpiphdr_2 *)ti)->first_char == (char)27)
1433 {
1434 tp->t_flags |= TF_ACKNOW;
1435 }
1436
1437 /*
1438 * Process the segment text, merging it into the TCP sequencing queue,
1439 * and arranging for acknowledgment of receipt if necessary.
1440 * This process logically involves adjusting tp->rcv_wnd as data
1441 * is presented to the user (this happens in tcp_usrreq.c,
1442 * case PRU_RCVD). If a FIN has already been received on this
1443 * connection then we just ignore the text.
1444 */
1445 if ( (ti->ti_len || (tiflags&TH_FIN))
1446 && TCPS_HAVERCVDFIN(tp->t_state) == 0)
1447 {
1448 if ( ti->ti_seq == tp->rcv_nxt
1449 && LIST_EMPTY(&tp->t_segq)
1450 && tp->t_state == TCPS_ESTABLISHED)
1451 {
1452 DELAY_ACK(tp, ti); /* little bit different from BSD declaration see netinet/tcp_input.c */
1453 tp->rcv_nxt += tlen;
1454 tiflags = ti->ti_t.th_flags & TH_FIN;
1455 tcpstat.tcps_rcvpack++;
1456 tcpstat.tcps_rcvbyte += tlen;
1457 if (so->so_state & SS_FCANTRCVMORE)
1458 m_freem(pData, m);
1459 else
1460 sbappend(pData, so, m);
1461 }
1462 else
1463 {
1464 tiflags = tcp_reass(pData, tp, &ti->ti_t, &tlen, m);
1465 tiflags |= TF_ACKNOW;
1466 }
1467 /*
1468 * Note the amount of data that peer has sent into
1469 * our window, in order to estimate the sender's
1470 * buffer size.
1471 */
1472 len = so->so_rcv.sb_datalen - (tp->rcv_adv - tp->rcv_nxt);
1473 }
1474 else
1475 {
1476 m_free(pData, m);
1477 tiflags &= ~TH_FIN;
1478 }
1479
1480 /*
1481 * If FIN is received ACK the FIN and let the user know
1482 * that the connection is closing.
1483 */
1484 if (tiflags & TH_FIN)
1485 {
1486 if (TCPS_HAVERCVDFIN(tp->t_state) == 0)
1487 {
1488 /*
1489 * If we receive a FIN we can't send more data,
1490 * set it SS_FDRAIN
1491 * Shutdown the socket if there is no rx data in the
1492 * buffer.
1493 * soread() is called on completion of shutdown() and
1494 * will got to TCPS_LAST_ACK, and use tcp_output()
1495 * to send the FIN.
1496 */
1497/* sofcantrcvmore(so); */
1498 sofwdrain(so);
1499
1500 tp->t_flags |= TF_ACKNOW;
1501 tp->rcv_nxt++;
1502 }
1503 switch (tp->t_state)
1504 {
1505 /*
1506 * In SYN_RECEIVED and ESTABLISHED STATES
1507 * enter the CLOSE_WAIT state.
1508 */
1509 case TCPS_SYN_RECEIVED:
1510 case TCPS_ESTABLISHED:
1511 if(so->so_emu == EMU_CTL) /* no shutdown on socket */
1512 tp->t_state = TCPS_LAST_ACK;
1513 else
1514 tp->t_state = TCPS_CLOSE_WAIT;
1515 break;
1516
1517 /*
1518 * If still in FIN_WAIT_1 STATE FIN has not been acked so
1519 * enter the CLOSING state.
1520 */
1521 case TCPS_FIN_WAIT_1:
1522 tp->t_state = TCPS_CLOSING;
1523 break;
1524
1525 /*
1526 * In FIN_WAIT_2 state enter the TIME_WAIT state,
1527 * starting the time-wait timer, turning off the other
1528 * standard timers.
1529 */
1530 case TCPS_FIN_WAIT_2:
1531 tp->t_state = TCPS_TIME_WAIT;
1532 tcp_canceltimers(tp);
1533 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1534 soisfdisconnected(so);
1535 break;
1536
1537 /*
1538 * In TIME_WAIT state restart the 2 MSL time_wait timer.
1539 */
1540 case TCPS_TIME_WAIT:
1541 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1542 break;
1543 }
1544 }
1545
1546 /*
1547 * Return any desired output.
1548 */
1549 if (needoutput || (tp->t_flags & TF_ACKNOW))
1550 tcp_output(pData, tp);
1551
1552 return;
1553
1554dropafterack:
1555 Log2(("drop after ack\n"));
1556 /*
1557 * Generate an ACK dropping incoming segment if it occupies
1558 * sequence space, where the ACK reflects our state.
1559 */
1560 if (tiflags & TH_RST)
1561 goto drop;
1562 m_freem(pData, m);
1563 tp->t_flags |= TF_ACKNOW;
1564 (void) tcp_output(pData, tp);
1565 return;
1566
1567dropwithreset:
1568 /* reuses m if m!=NULL, m_free() unnecessary */
1569 Log2(("drop with reset\n"));
1570 if (tiflags & TH_ACK)
1571 tcp_respond(pData, tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST);
1572 else
1573 {
1574 if (tiflags & TH_SYN) ti->ti_len++;
1575 tcp_respond(pData, tp, ti, m, ti->ti_seq+ti->ti_len, (tcp_seq)0,
1576 TH_RST|TH_ACK);
1577 }
1578
1579 return;
1580
1581drop:
1582 /*
1583 * Drop space held by incoming segment and return.
1584 */
1585 Log2(("drop\n"));
1586 m_free(pData, m);
1587
1588 return;
1589}
1590
1591void
1592tcp_dooptions(PNATState pData, struct tcpcb *tp, u_char *cp, int cnt, struct tcpiphdr *ti)
1593{
1594 u_int16_t mss;
1595 int opt, optlen;
1596
1597 DEBUG_CALL("tcp_dooptions");
1598 DEBUG_ARGS((dfd," tp = %lx cnt=%i \n", (long )tp, cnt));
1599
1600 for (; cnt > 0; cnt -= optlen, cp += optlen)
1601 {
1602 opt = cp[0];
1603 if (opt == TCPOPT_EOL)
1604 break;
1605 if (opt == TCPOPT_NOP)
1606 optlen = 1;
1607 else
1608 {
1609 optlen = cp[1];
1610 if (optlen <= 0)
1611 break;
1612 }
1613 switch (opt)
1614 {
1615 default:
1616 continue;
1617
1618 case TCPOPT_MAXSEG:
1619 if (optlen != TCPOLEN_MAXSEG)
1620 continue;
1621 if (!(ti->ti_flags & TH_SYN))
1622 continue;
1623 memcpy((char *) &mss, (char *) cp + 2, sizeof(mss));
1624 NTOHS(mss);
1625 (void) tcp_mss(pData, tp, mss); /* sets t_maxseg */
1626 break;
1627
1628#if 0
1629 case TCPOPT_WINDOW:
1630 if (optlen != TCPOLEN_WINDOW)
1631 continue;
1632 if (!(ti->ti_flags & TH_SYN))
1633 continue;
1634 tp->t_flags |= TF_RCVD_SCALE;
1635 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
1636 break;
1637
1638 case TCPOPT_TIMESTAMP:
1639 if (optlen != TCPOLEN_TIMESTAMP)
1640 continue;
1641 *ts_present = 1;
1642 memcpy((char *) ts_val, (char *)cp + 2, sizeof(*ts_val));
1643 NTOHL(*ts_val);
1644 memcpy((char *) ts_ecr, (char *)cp + 6, sizeof(*ts_ecr));
1645 NTOHL(*ts_ecr);
1646
1647 /*
1648 * A timestamp received in a SYN makes
1649 * it ok to send timestamp requests and replies.
1650 */
1651 if (ti->ti_flags & TH_SYN)
1652 {
1653 tp->t_flags |= TF_RCVD_TSTMP;
1654 tp->ts_recent = *ts_val;
1655 tp->ts_recent_age = tcp_now;
1656 }
1657 break;
1658#endif
1659 }
1660 }
1661}
1662
1663
/*
 * Pull the out of band byte out of a segment so
 * it doesn't appear in the user's data queue.
 * It is still reflected in the segment length for
 * sequencing purposes.
 *
 * so - socket whose control block receives the urgent byte.
 * ti - segment header; ti_urp - 1 is the offset of the urgent byte.
 * m  - buffer holding the segment data.
 *
 * This routine is compiled out (#if 0).  The walk along m->m_next is
 * flagged as wrong by the XXX note below and has to be sorted out
 * before the code is ever enabled.
 */

#if 0
void
tcp_pulloutofband(struct socket *so, struct tcpiphdr *ti, struct mbuf *m)
{
    int cnt = ti->ti_urp - 1;

    while (cnt >= 0)
    {
        if (m->m_len > cnt)
        {
            char *cp = mtod(m, caddr_t) + cnt;
            struct tcpcb *tp = sototcpcb(so);

            /* Stash the urgent byte in the control block ... */
            tp->t_iobc = *cp;
            tp->t_oobflags |= TCPOOB_HAVEDATA;
            /*
             * ... and close the gap it leaves behind.  Source and
             * destination overlap, hence memmove rather than memcpy.
             */
            memmove(cp, cp + 1, (unsigned)(m->m_len - cnt - 1));
            m->m_len--;
            return;
        }
        cnt -= m->m_len;
        m = m->m_next; /* XXX WRONG! Fix it! */
        if (m == 0)
            break;
    }
    panic("tcp_pulloutofband");
}
#endif
1698
1699/*
1700 * Collect new round-trip time estimate
1701 * and update averages and current timeout.
1702 */
1703
1704void
1705tcp_xmit_timer(PNATState pData, register struct tcpcb *tp, int rtt)
1706{
1707 register short delta;
1708
1709 DEBUG_CALL("tcp_xmit_timer");
1710 DEBUG_ARG("tp = %lx", (long)tp);
1711 DEBUG_ARG("rtt = %d", rtt);
1712
1713 tcpstat.tcps_rttupdated++;
1714 if (tp->t_srtt != 0)
1715 {
1716 /*
1717 * srtt is stored as fixed point with 3 bits after the
1718 * binary point (i.e., scaled by 8). The following magic
1719 * is equivalent to the smoothing algorithm in rfc793 with
1720 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
1721 * point). Adjust rtt to origin 0.
1722 */
1723 delta = rtt - 1 - (tp->t_srtt >> TCP_RTT_SHIFT);
1724 if ((tp->t_srtt += delta) <= 0)
1725 tp->t_srtt = 1;
1726 /*
1727 * We accumulate a smoothed rtt variance (actually, a
1728 * smoothed mean difference), then set the retransmit
1729 * timer to smoothed rtt + 4 times the smoothed variance.
1730 * rttvar is stored as fixed point with 2 bits after the
1731 * binary point (scaled by 4). The following is
1732 * equivalent to rfc793 smoothing with an alpha of .75
1733 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces
1734 * rfc793's wired-in beta.
1735 */
1736 if (delta < 0)
1737 delta = -delta;
1738 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
1739 if ((tp->t_rttvar += delta) <= 0)
1740 tp->t_rttvar = 1;
1741 }
1742 else
1743 {
1744 /*
1745 * No rtt measurement yet - use the unsmoothed rtt.
1746 * Set the variance to half the rtt (so our first
1747 * retransmit happens at 3*rtt).
1748 */
1749 tp->t_srtt = rtt << TCP_RTT_SHIFT;
1750 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
1751 }
1752 tp->t_rtt = 0;
1753 tp->t_rxtshift = 0;
1754
1755 /*
1756 * the retransmit should happen at rtt + 4 * rttvar.
1757 * Because of the way we do the smoothing, srtt and rttvar
1758 * will each average +1/2 tick of bias. When we compute
1759 * the retransmit timer, we want 1/2 tick of rounding and
1760 * 1 extra tick because of +-1/2 tick uncertainty in the
1761 * firing of the timer. The bias will give us exactly the
1762 * 1.5 tick we need. But, because the bias is
1763 * statistical, we have to test that we don't drop below
1764 * the minimum feasible timer (which is 2 ticks).
1765 */
1766 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
1767 (short)tp->t_rttmin, TCPTV_REXMTMAX); /* XXX */
1768
1769 /*
1770 * We received an ack for a packet that wasn't retransmitted;
1771 * it is probably safe to discard any error indications we've
1772 * received recently. This isn't quite right, but close enough
1773 * for now (a route might have failed after we sent a segment,
1774 * and the return path might not be symmetrical).
1775 */
1776 tp->t_softerror = 0;
1777}
1778
1779/*
1780 * Determine a reasonable value for maxseg size.
1781 * If the route is known, check route for mtu.
1782 * If none, use an mss that can be handled on the outgoing
1783 * interface without forcing IP to fragment; if bigger than
1784 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
1785 * to utilize large mbufs. If no route is found, route has no mtu,
1786 * or the destination isn't local, use a default, hopefully conservative
1787 * size (usually 512 or the default IP max size, but no more than the mtu
1788 * of the interface), as we can't discover anything about intervening
1789 * gateways or networks. We also initialize the congestion/slow start
1790 * window to be a single segment if the destination isn't local.
1791 * While looking at the routing entry, we also initialize other path-dependent
1792 * parameters from pre-set or cached values in the routing entry.
1793 */
1794
1795int
1796tcp_mss(PNATState pData, register struct tcpcb *tp, u_int offer)
1797{
1798 struct socket *so = tp->t_socket;
1799 int mss;
1800
1801 DEBUG_CALL("tcp_mss");
1802 DEBUG_ARG("tp = %lx", (long)tp);
1803 DEBUG_ARG("offer = %d", offer);
1804
1805 mss = min(if_mtu, if_mru) - sizeof(struct tcpiphdr);
1806 if (offer)
1807 mss = min(mss, offer);
1808 mss = max(mss, 32);
1809 if (mss < tp->t_maxseg || offer != 0)
1810 tp->t_maxseg = mss;
1811
1812 tp->snd_cwnd = mss;
1813
1814 sbreserve(&so->so_snd, tcp_sndspace+((tcp_sndspace%mss)?(mss-(tcp_sndspace%mss)):0));
1815 sbreserve(&so->so_rcv, tcp_rcvspace+((tcp_rcvspace%mss)?(mss-(tcp_rcvspace%mss)):0));
1816
1817 DEBUG_MISC((dfd, " returning mss = %d\n", mss));
1818
1819 return mss;
1820}
注意: 瀏覽 TracBrowser 來幫助您使用儲存庫瀏覽器

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette