VirtualBox

source: vbox/trunk/src/VBox/Devices/Network/slirp/socket.c@40285

Last change on this file since 40285 was 40120, checked in by vboxsync, 13 years ago

NAT: sorecvoob() should regard return value of soread() operation.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 49.9 KB
 
1/* $Id: socket.c 40120 2012-02-14 07:22:20Z vboxsync $ */
2/** @file
3 * NAT - socket handling.
4 */
5
6/*
7 * Copyright (C) 2006-2010 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 */
17
18/*
19 * This code is based on:
20 *
21 * Copyright (c) 1995 Danny Gasparovski.
22 *
23 * Please read the file COPYRIGHT for the
24 * terms and conditions of the copyright.
25 */
26
27#define WANT_SYS_IOCTL_H
28#include <slirp.h>
29#include "ip_icmp.h"
30#include "main.h"
31#ifdef __sun__
32#include <sys/filio.h>
33#endif
34#include <VBox/vmm/pdmdrv.h>
35#if defined (RT_OS_WINDOWS)
36#include <iphlpapi.h>
37#include <icmpapi.h>
38#endif
39
40#ifdef VBOX_WITH_NAT_UDP_SOCKET_CLONE
41/**
42 * Clone pSo as a UDP socket for the given foreign address: either bind a fresh host socket (fBindSocket) or share pSo's host socket and link the clone into udb.
43 */
44struct socket * soCloneUDPSocketWithForegnAddr(PNATState pData, bool fBindSocket, struct socket *pSo, uint32_t u32ForeignAddr)
45{
46 struct socket *pNewSocket = NULL;
47 LogFlowFunc(("Enter: fBindSocket:%RTbool, so:%R[natsock], u32ForeignAddr:%RTnaipv4\n", fBindSocket, pSo, u32ForeignAddr));
48 pNewSocket = socreate();
49 if (!pNewSocket)
50 {
51 LogFunc(("Can't create socket\n"));
52 LogFlowFunc(("Leave: NULL\n"));
53 return NULL;
54 }
55 if (fBindSocket)
56 {
57 if (udp_attach(pData, pNewSocket, 0) <= 0)
58 {
59 sofree(pData, pNewSocket);
60 LogFunc(("Can't attach fresh created socket\n"));
61 return NULL;
62 }
63 }
64 else
65 {
66 pNewSocket->so_cloneOf = (struct socket *)pSo;
67 pNewSocket->s = pSo->s;
68 insque(pData, pNewSocket, &udb);
69 }
70 pNewSocket->so_laddr = pSo->so_laddr;
71 pNewSocket->so_lport = pSo->so_lport;
72 pNewSocket->so_faddr.s_addr = u32ForeignAddr;
73 pNewSocket->so_fport = pSo->so_fport;
74 pSo->so_cCloneCounter++;
75 LogFlowFunc(("Leave: %R[natsock]\n", pNewSocket));
76 return pNewSocket;
77}
78
79struct socket *soLookUpClonedUDPSocket(PNATState pData, const struct socket *pcSo, uint32_t u32ForeignAddress)
80{
81 struct socket *pSoClone = NULL;
82 LogFlowFunc(("Enter: pcSo:%R[natsock], u32ForeignAddress:%RTnaipv4\n", pcSo, u32ForeignAddress));
83 for (pSoClone = udb.so_next; pSoClone != &udb; pSoClone = pSoClone->so_next)
84 {
85 if ( pSoClone->so_cloneOf
86 && pSoClone->so_cloneOf == pcSo
87 && pSoClone->so_lport == pcSo->so_lport
88 && pSoClone->so_fport == pcSo->so_fport
89 && pSoClone->so_laddr.s_addr == pcSo->so_laddr.s_addr
90 && pSoClone->so_faddr.s_addr == u32ForeignAddress)
91 goto done;
92 }
93 pSoClone = NULL;
94done:
95 LogFlowFunc(("Leave: pSoClone: %R[natsock]\n", pSoClone));
96 return pSoClone;
97}
98#endif
99
100#ifdef VBOX_WITH_NAT_SEND2HOME
101DECLINLINE(bool) slirpSend2Home(PNATState pData, struct socket *pSo, const void *pvBuf, uint32_t cbBuf, int iFlags)
102{
103 int idxAddr;
104 int ret = 0;
105 bool fSendDone = false;
106 LogFlowFunc(("Enter pSo:%R[natsock] pvBuf: %p, cbBuf: %d, iFlags: %d\n", pSo, pvBuf, cbBuf, iFlags));
107 for (idxAddr = 0; idxAddr < pData->cInHomeAddressSize; ++idxAddr)
108 {
109
110 struct socket *pNewSocket = soCloneUDPSocketWithForegnAddr(pData, false /* fBindSocket (assumed) */, pSo, pData->pInSockAddrHomeAddress[idxAddr].sin_addr.s_addr);
111 AssertReturn((pNewSocket, false));
112 pData->pInSockAddrHomeAddress[idxAddr].sin_port = pSo->so_fport;
113 /** @todo Be more verbose on errors.
114 * @note we shouldn't care whether this send fails or not (we're broadcasting).
115 */
116 LogFunc(("send %d bytes to %RTnaipv4 from %R[natsock]\n", cbBuf, pData->pInSockAddrHomeAddress[idxAddr].sin_addr.s_addr, pNewSocket));
117 ret = sendto(pNewSocket->s, pvBuf, cbBuf, iFlags, (struct sockaddr *)&pData->pInSockAddrHomeAddress[idxAddr], sizeof(struct sockaddr_in));
118 if (ret < 0)
119 LogFunc(("Failed to send %d bytes to %RTnaipv4\n", cbBuf, pData->pInSockAddrHomeAddress[idxAddr].sin_addr.s_addr));
120 fSendDone |= ret > 0;
121 }
122 LogFlowFunc(("Leave %RTbool\n", fSendDone));
123 return fSendDone;
124}
125#endif /* !VBOX_WITH_NAT_SEND2HOME */
126static void send_icmp_to_guest(PNATState, char *, size_t, const struct sockaddr_in *);
127#ifdef RT_OS_WINDOWS
128static void sorecvfrom_icmp_win(PNATState, struct socket *);
129#else /* RT_OS_WINDOWS */
130static void sorecvfrom_icmp_unix(PNATState, struct socket *);
131#endif /* !RT_OS_WINDOWS */
132
133void
134so_init()
135{
136}
137
138struct socket *
139solookup(struct socket *head, struct in_addr laddr,
140 u_int lport, struct in_addr faddr, u_int fport)
141{
142 struct socket *so;
143
144 for (so = head->so_next; so != head; so = so->so_next)
145 {
146 if ( so->so_lport == lport
147 && so->so_laddr.s_addr == laddr.s_addr
148 && so->so_faddr.s_addr == faddr.s_addr
149 && so->so_fport == fport)
150 return so;
151 }
152
153 return (struct socket *)NULL;
154}
155
156/*
157 * Create a new socket, initialise the fields
158 * It is the responsibility of the caller to
159 * insque() it into the correct linked-list
160 */
161struct socket *
162socreate()
163{
164 struct socket *so;
165
166 so = (struct socket *)RTMemAllocZ(sizeof(struct socket));
167 if (so)
168 {
169 so->so_state = SS_NOFDREF;
170 so->s = -1;
171#if !defined(RT_OS_WINDOWS)
172 so->so_poll_index = -1;
173#endif
174 }
175 return so;
176}
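/*
 * Typical usage (sketch): the caller queues the new socket itself, e.g. as
 * solisten() does further down in this file:
 *     so = socreate();
 *     if (so && (so->so_tcpcb = tcp_newtcpcb(pData, so)) != NULL)
 *         insque(pData, so, &tcb);   // or &udb for UDP sockets
 */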
177
178/*
179 * remque and free a socket, clobber cache
180 * VBOX_WITH_SLIRP_MT: the queue should be locked before calling sofree, because
181 * in sofree we don't know from which queue the item is being removed.
182 */
183void
184sofree(PNATState pData, struct socket *so)
185{
186 if (so == tcp_last_so)
187 tcp_last_so = &tcb;
188 else if (so == udp_last_so)
189 udp_last_so = &udb;
190
191 /* check that the mbuf hasn't already been freed */
192 if (so->so_m != NULL)
193 m_freem(pData, so->so_m);
194#ifndef VBOX_WITH_SLIRP_MT
195 if (so->so_next && so->so_prev)
196 {
197 remque(pData, so); /* crashes if so is not in a queue */
198 NSOCK_DEC();
199 }
200
201 RTMemFree(so);
202#else
203 so->so_deleted = 1;
204#endif
205}
206
207#ifdef VBOX_WITH_SLIRP_MT
208void
209soread_queue(PNATState pData, struct socket *so, int *ret)
210{
211 *ret = soread(pData, so);
212}
213#endif
214
215/*
216 * Read from so's socket into sb_snd, updating all relevant sbuf fields
217 * NOTE: This will only be called if it is select()ed for reading, so
218 * a read() of 0 (or less) means it's disconnected
219 */
220#ifndef VBOX_WITH_SLIRP_BSD_SBUF
221int
222soread(PNATState pData, struct socket *so)
223{
224 int n, nn, lss, total;
225 struct sbuf *sb = &so->so_snd;
226 size_t len = sb->sb_datalen - sb->sb_cc;
227 struct iovec iov[2];
228 int mss = so->so_tcpcb->t_maxseg;
229
230 STAM_PROFILE_START(&pData->StatIOread, a);
231 STAM_COUNTER_RESET(&pData->StatIORead_in_1);
232 STAM_COUNTER_RESET(&pData->StatIORead_in_2);
233
234 QSOCKET_LOCK(tcb);
235 SOCKET_LOCK(so);
236 QSOCKET_UNLOCK(tcb);
237
238 LogFlow(("soread: so = %R[natsock]\n", so));
239 Log2(("%s: so = %R[natsock] so->so_snd = %R[sbuf]\n", __PRETTY_FUNCTION__, so, sb));
240
241 /*
242 * No need to check if there's enough room to read.
243 * soread wouldn't have been called if there weren't
244 */
245
246 len = sb->sb_datalen - sb->sb_cc;
247
248 iov[0].iov_base = sb->sb_wptr;
249 iov[1].iov_base = 0;
250 iov[1].iov_len = 0;
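    /*
     * so_snd is a circular buffer: if the write pointer sits behind the read
     * pointer, the free space is one contiguous run (one iovec); otherwise the
     * free space wraps around the end of the buffer and needs two iovecs. The
     * lengths are then trimmed down to a multiple of the TCP MSS.
     */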
251 if (sb->sb_wptr < sb->sb_rptr)
252 {
253 iov[0].iov_len = sb->sb_rptr - sb->sb_wptr;
254 /* Should never succeed, but... */
255 if (iov[0].iov_len > len)
256 iov[0].iov_len = len;
257 if (iov[0].iov_len > mss)
258 iov[0].iov_len -= iov[0].iov_len%mss;
259 n = 1;
260 }
261 else
262 {
263 iov[0].iov_len = (sb->sb_data + sb->sb_datalen) - sb->sb_wptr;
264 /* Should never succeed, but... */
265 if (iov[0].iov_len > len)
266 iov[0].iov_len = len;
267 len -= iov[0].iov_len;
268 if (len)
269 {
270 iov[1].iov_base = sb->sb_data;
271 iov[1].iov_len = sb->sb_rptr - sb->sb_data;
272 if (iov[1].iov_len > len)
273 iov[1].iov_len = len;
274 total = iov[0].iov_len + iov[1].iov_len;
275 if (total > mss)
276 {
277 lss = total % mss;
278 if (iov[1].iov_len > lss)
279 {
280 iov[1].iov_len -= lss;
281 n = 2;
282 }
283 else
284 {
285 lss -= iov[1].iov_len;
286 iov[0].iov_len -= lss;
287 n = 1;
288 }
289 }
290 else
291 n = 2;
292 }
293 else
294 {
295 if (iov[0].iov_len > mss)
296 iov[0].iov_len -= iov[0].iov_len%mss;
297 n = 1;
298 }
299 }
300
301#ifdef HAVE_READV
302 nn = readv(so->s, (struct iovec *)iov, n);
303#else
304 nn = recv(so->s, iov[0].iov_base, iov[0].iov_len, (so->so_tcpcb->t_force? MSG_OOB:0));
305#endif
306 Log2(("%s: read(1) nn = %d bytes\n", __PRETTY_FUNCTION__, nn));
307 Log2(("%s: so = %R[natsock] so->so_snd = %R[sbuf]\n", __PRETTY_FUNCTION__, so, sb));
308 if (nn <= 0)
309 {
310 /*
311 * Special case for WSAEnumNetworkEvents: If we receive 0 bytes that
312 * _could_ mean that the connection is closed. But we will receive an
313 * FD_CLOSE event later if the connection was _really_ closed. With
314 * www.youtube.com I see this very often. Closing the socket too early
315 * would be dangerous.
316 */
317 int status;
318 unsigned long pending = 0;
319 status = ioctlsocket(so->s, FIONREAD, &pending);
320 if (status < 0)
321 Log(("NAT:%s: error in WSAIoctl: %d\n", __PRETTY_FUNCTION__, errno));
322 if (nn == 0 && (pending != 0))
323 {
324 SOCKET_UNLOCK(so);
325 STAM_PROFILE_STOP(&pData->StatIOread, a);
326 return 0;
327 }
328 if ( nn < 0
329 && ( errno == EINTR
330 || errno == EAGAIN
331 || errno == EWOULDBLOCK))
332 {
333 SOCKET_UNLOCK(so);
334 STAM_PROFILE_STOP(&pData->StatIOread, a);
335 return 0;
336 }
337 else
338 {
339 /* nn == 0 means peer has performed an orderly shutdown */
340 Log2(("%s: disconnected, nn = %d, errno = %d (%s)\n",
341 __PRETTY_FUNCTION__, nn, errno, strerror(errno)));
342 sofcantrcvmore(so);
343 tcp_sockclosed(pData, sototcpcb(so));
344 SOCKET_UNLOCK(so);
345 STAM_PROFILE_STOP(&pData->StatIOread, a);
346 return -1;
347 }
348 }
349 STAM_STATS(
350 if (n == 1)
351 {
352 STAM_COUNTER_INC(&pData->StatIORead_in_1);
353 STAM_COUNTER_ADD(&pData->StatIORead_in_1_bytes, nn);
354 }
355 else
356 {
357 STAM_COUNTER_INC(&pData->StatIORead_in_2);
358 STAM_COUNTER_ADD(&pData->StatIORead_in_2_1st_bytes, nn);
359 }
360 );
361
362#ifndef HAVE_READV
363 /*
364 * If there was no error, try and read the second time round
365 * We read again if n = 2 (ie, there's another part of the buffer)
366 * and we read as much as we could in the first read
367 * We don't test for <= 0 this time, because there legitimately
368 * might not be any more data (since the socket is non-blocking),
369 * a close will be detected on next iteration.
370 * A return of -1 won't (shouldn't) happen, since it didn't happen above
371 */
372 if (n == 2 && nn == iov[0].iov_len)
373 {
374 int ret;
375 ret = recv(so->s, iov[1].iov_base, iov[1].iov_len, 0);
376 if (ret > 0)
377 nn += ret;
378 STAM_STATS(
379 if (ret > 0)
380 {
381 STAM_COUNTER_INC(&pData->StatIORead_in_2);
382 STAM_COUNTER_ADD(&pData->StatIORead_in_2_2nd_bytes, ret);
383 }
384 );
385 }
386
387 Log2(("%s: read(2) nn = %d bytes\n", __PRETTY_FUNCTION__, nn));
388#endif
389
390 /* Update fields */
391 sb->sb_cc += nn;
392 sb->sb_wptr += nn;
393 Log2(("%s: update so_snd (readed nn = %d) %R[sbuf]\n", __PRETTY_FUNCTION__, nn, sb));
394 if (sb->sb_wptr >= (sb->sb_data + sb->sb_datalen))
395 {
396 sb->sb_wptr -= sb->sb_datalen;
397 Log2(("%s: alter sb_wptr so_snd = %R[sbuf]\n", __PRETTY_FUNCTION__, sb));
398 }
399 STAM_PROFILE_STOP(&pData->StatIOread, a);
400 SOCKET_UNLOCK(so);
401 return nn;
402}
403#else /* VBOX_WITH_SLIRP_BSD_SBUF */
404int
405soread(PNATState pData, struct socket *so)
406{
407 int n;
408 char *buf;
409 struct sbuf *sb = &so->so_snd;
410 size_t len = sbspace(sb);
411 int mss = so->so_tcpcb->t_maxseg;
412
413 STAM_PROFILE_START(&pData->StatIOread, a);
414 STAM_COUNTER_RESET(&pData->StatIORead_in_1);
415 STAM_COUNTER_RESET(&pData->StatIORead_in_2);
416
417 QSOCKET_LOCK(tcb);
418 SOCKET_LOCK(so);
419 QSOCKET_UNLOCK(tcb);
420
421 LogFlowFunc(("soread: so = %lx\n", (long)so));
422
423 if (len > mss)
424 len -= len % mss;
425 buf = RTMemAlloc(len);
426 if (buf == NULL)
427 {
428 Log(("NAT: can't alloc enough memory\n"));
429 return -1;
430 }
431
432 n = recv(so->s, buf, len, (so->so_tcpcb->t_force? MSG_OOB:0));
433 if (n <= 0)
434 {
435 /*
436 * Special case for WSAEnumNetworkEvents: If we receive 0 bytes that
437 * _could_ mean that the connection is closed. But we will receive an
438 * FD_CLOSE event later if the connection was _really_ closed. With
439 * www.youtube.com I see this very often. Closing the socket too early
440 * would be dangerous.
441 */
442 int status;
443 unsigned long pending = 0;
444 status = ioctlsocket(so->s, FIONREAD, &pending);
445 if (status < 0)
446 Log(("NAT:error in WSAIoctl: %d\n", errno));
447 if (n == 0 && (pending != 0))
448 {
449 SOCKET_UNLOCK(so);
450 STAM_PROFILE_STOP(&pData->StatIOread, a);
451 RTMemFree(buf);
452 return 0;
453 }
454 if ( n < 0
455 && ( errno == EINTR
456 || errno == EAGAIN
457 || errno == EWOULDBLOCK))
458 {
459 SOCKET_UNLOCK(so);
460 STAM_PROFILE_STOP(&pData->StatIOread, a);
461 RTMemFree(buf);
462 return 0;
463 }
464 else
465 {
466 Log2((" --- soread() disconnected, n = %d, errno = %d (%s)\n",
467 n, errno, strerror(errno)));
468 sofcantrcvmore(so);
469 tcp_sockclosed(pData, sototcpcb(so));
470 SOCKET_UNLOCK(so);
471 STAM_PROFILE_STOP(&pData->StatIOread, a);
472 RTMemFree(buf);
473 return -1;
474 }
475 }
476
477 sbuf_bcat(sb, buf, n);
478 RTMemFree(buf);
479 return n;
480}
481#endif
482
483/*
484 * Get urgent data
485 *
486 * When the socket is created, we set it SO_OOBINLINE,
487 * so when OOB data arrives, we soread() it and everything
488 * in the send buffer is sent as urgent data
489 */
490void
491sorecvoob(PNATState pData, struct socket *so)
492{
493 struct tcpcb *tp = sototcpcb(so);
494 ssize_t ret;
495
496 LogFlowFunc(("sorecvoob: so = %R[natsock]\n", so));
497
498 /*
499 * We take a guess at how much urgent data has arrived.
500 * In most situations, when urgent data arrives, the next
501 * read() should get all the urgent data. This guess will
502 * be wrong however if more data arrives just after the
503 * urgent data, or the read() doesn't return all the
504 * urgent data.
505 */
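    /*
     * Only force out urgent data if soread() actually read something; on error
     * soread() has already torn the connection down. (This is the return-value
     * check referred to by the r40120 commit message above.)
     */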
506 ret = soread(pData, so);
507 if (RT_LIKELY(ret > 0))
508 {
509 tp->snd_up = tp->snd_una + SBUF_LEN(&so->so_snd);
510 tp->t_force = 1;
511 tcp_output(pData, tp);
512 tp->t_force = 0;
513 }
514}
515#ifndef VBOX_WITH_SLIRP_BSD_SBUF
516/*
517 * Send urgent data
518 * There's a lot of duplicated code here, but...
519 */
520int
521sosendoob(struct socket *so)
522{
523 struct sbuf *sb = &so->so_rcv;
524 char buff[2048]; /* XXX Shouldn't be sending more oob data than this */
525
526 int n, len;
527
528 LogFlowFunc(("sosendoob so = %R[natsock]\n", so));
529
530 if (so->so_urgc > sizeof(buff))
531 so->so_urgc = sizeof(buff); /* XXX */
532
533 if (sb->sb_rptr < sb->sb_wptr)
534 {
535 /* We can send it directly */
536 n = send(so->s, sb->sb_rptr, so->so_urgc, (MSG_OOB)); /* |MSG_DONTWAIT)); */
537 so->so_urgc -= n;
538
539 Log2((" --- sent %d bytes urgent data, %d urgent bytes left\n",
540 n, so->so_urgc));
541 }
542 else
543 {
544 /*
545 * Since there's no sendv or sendtov like writev,
546 * we must copy all data to a linear buffer then
547 * send it all
548 */
549 len = (sb->sb_data + sb->sb_datalen) - sb->sb_rptr;
550 if (len > so->so_urgc)
551 len = so->so_urgc;
552 memcpy(buff, sb->sb_rptr, len);
553 so->so_urgc -= len;
554 if (so->so_urgc)
555 {
556 n = sb->sb_wptr - sb->sb_data;
557 if (n > so->so_urgc)
558 n = so->so_urgc;
559 memcpy(buff + len, sb->sb_data, n);
560 so->so_urgc -= n;
561 len += n;
562 }
563 n = send(so->s, buff, len, (MSG_OOB)); /* |MSG_DONTWAIT)); */
564#ifdef DEBUG
565 if (n != len)
566 Log(("Didn't send all data urgently XXXXX\n"));
567#endif
568 Log2((" ---2 sent %d bytes urgent data, %d urgent bytes left\n",
569 n, so->so_urgc));
570 }
571
572 sb->sb_cc -= n;
573 sb->sb_rptr += n;
574 if (sb->sb_rptr >= (sb->sb_data + sb->sb_datalen))
575 sb->sb_rptr -= sb->sb_datalen;
576
577 return n;
578}
579
580/*
581 * Write data from so_rcv to so's socket,
582 * updating all sbuf field as necessary
583 */
584int
585sowrite(PNATState pData, struct socket *so)
586{
587 int n, nn;
588 struct sbuf *sb = &so->so_rcv;
589 size_t len = sb->sb_cc;
590 struct iovec iov[2];
591
592 STAM_PROFILE_START(&pData->StatIOwrite, a);
593 STAM_COUNTER_RESET(&pData->StatIOWrite_in_1);
594 STAM_COUNTER_RESET(&pData->StatIOWrite_in_1_bytes);
595 STAM_COUNTER_RESET(&pData->StatIOWrite_in_2);
596 STAM_COUNTER_RESET(&pData->StatIOWrite_in_2_1st_bytes);
597 STAM_COUNTER_RESET(&pData->StatIOWrite_in_2_2nd_bytes);
598 STAM_COUNTER_RESET(&pData->StatIOWrite_no_w);
599 STAM_COUNTER_RESET(&pData->StatIOWrite_rest);
600 STAM_COUNTER_RESET(&pData->StatIOWrite_rest_bytes);
601 LogFlowFunc(("so = %R[natsock]\n", so));
602 Log2(("%s: so = %R[natsock] so->so_rcv = %R[sbuf]\n", __PRETTY_FUNCTION__, so, sb));
603 QSOCKET_LOCK(tcb);
604 SOCKET_LOCK(so);
605 QSOCKET_UNLOCK(tcb);
606 if (so->so_urgc)
607 {
608 sosendoob(so);
609 if (sb->sb_cc == 0)
610 {
611 SOCKET_UNLOCK(so);
612 STAM_PROFILE_STOP(&pData->StatIOwrite, a);
613 return 0;
614 }
615 }
616
617 /*
618 * No need to check if there's something to write,
619 * sowrite wouldn't have been called otherwise
620 */
621
622 len = sb->sb_cc;
623
624 iov[0].iov_base = sb->sb_rptr;
625 iov[1].iov_base = 0;
626 iov[1].iov_len = 0;
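    /*
     * Mirror image of soread(): so_rcv is a circular buffer, so the pending
     * data is either one contiguous run (read pointer before write pointer) or
     * wraps around the end of the buffer and needs two iovecs.
     */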
627 if (sb->sb_rptr < sb->sb_wptr)
628 {
629 iov[0].iov_len = sb->sb_wptr - sb->sb_rptr;
630 /* Should never succeed, but... */
631 if (iov[0].iov_len > len)
632 iov[0].iov_len = len;
633 n = 1;
634 }
635 else
636 {
637 iov[0].iov_len = (sb->sb_data + sb->sb_datalen) - sb->sb_rptr;
638 if (iov[0].iov_len > len)
639 iov[0].iov_len = len;
640 len -= iov[0].iov_len;
641 if (len)
642 {
643 iov[1].iov_base = sb->sb_data;
644 iov[1].iov_len = sb->sb_wptr - sb->sb_data;
645 if (iov[1].iov_len > len)
646 iov[1].iov_len = len;
647 n = 2;
648 }
649 else
650 n = 1;
651 }
652 STAM_STATS({
653 if (n == 1)
654 {
655 STAM_COUNTER_INC(&pData->StatIOWrite_in_1);
656 STAM_COUNTER_ADD(&pData->StatIOWrite_in_1_bytes, iov[0].iov_len);
657 }
658 else
659 {
660 STAM_COUNTER_INC(&pData->StatIOWrite_in_2);
661 STAM_COUNTER_ADD(&pData->StatIOWrite_in_2_1st_bytes, iov[0].iov_len);
662 STAM_COUNTER_ADD(&pData->StatIOWrite_in_2_2nd_bytes, iov[1].iov_len);
663 }
664 });
665 /* Now write out the buffered data (any urgent data was already sent above) */
666#ifdef HAVE_READV
667 nn = writev(so->s, (const struct iovec *)iov, n);
668#else
669 nn = send(so->s, iov[0].iov_base, iov[0].iov_len, 0);
670#endif
671 Log2(("%s: wrote(1) nn = %d bytes\n", __PRETTY_FUNCTION__, nn));
672 /* This should never happen, but people tell me it does *shrug* */
673 if ( nn < 0
674 && ( errno == EAGAIN
675 || errno == EINTR
676 || errno == EWOULDBLOCK))
677 {
678 SOCKET_UNLOCK(so);
679 STAM_PROFILE_STOP(&pData->StatIOwrite, a);
680 return 0;
681 }
682
683 if (nn < 0 || (nn == 0 && iov[0].iov_len > 0))
684 {
685 Log2(("%s: disconnected, so->so_state = %x, errno = %d\n",
686 __PRETTY_FUNCTION__, so->so_state, errno));
687 sofcantsendmore(so);
688 tcp_sockclosed(pData, sototcpcb(so));
689 SOCKET_UNLOCK(so);
690 STAM_PROFILE_STOP(&pData->StatIOwrite, a);
691 return -1;
692 }
693
694#ifndef HAVE_READV
695 if (n == 2 && nn == iov[0].iov_len)
696 {
697 int ret;
698 ret = send(so->s, iov[1].iov_base, iov[1].iov_len, 0);
699 if (ret > 0)
700 nn += ret;
701 STAM_STATS({
702 if (ret > 0 && ret != iov[1].iov_len)
703 {
704 STAM_COUNTER_INC(&pData->StatIOWrite_rest);
705 STAM_COUNTER_ADD(&pData->StatIOWrite_rest_bytes, (iov[1].iov_len - ret));
706 }
707 });
708 }
709 Log2(("%s: wrote(2) nn = %d bytes\n", __PRETTY_FUNCTION__, nn));
710#endif
711
712 /* Update sbuf */
713 sb->sb_cc -= nn;
714 sb->sb_rptr += nn;
715 Log2(("%s: update so_rcv (written nn = %d) %R[sbuf]\n", __PRETTY_FUNCTION__, nn, sb));
716 if (sb->sb_rptr >= (sb->sb_data + sb->sb_datalen))
717 {
718 sb->sb_rptr -= sb->sb_datalen;
719 Log2(("%s: alter sb_rptr of so_rcv %R[sbuf]\n", __PRETTY_FUNCTION__, sb));
720 }
721
722 /*
723 * If in DRAIN mode, and there's no more data, set
724 * it CANTSENDMORE
725 */
726 if ((so->so_state & SS_FWDRAIN) && sb->sb_cc == 0)
727 sofcantsendmore(so);
728
729 SOCKET_UNLOCK(so);
730 STAM_PROFILE_STOP(&pData->StatIOwrite, a);
731 return nn;
732}
733#else /* VBOX_WITH_SLIRP_BSD_SBUF */
734static int
735do_sosend(struct socket *so, int fUrg)
736{
737 struct sbuf *sb = &so->so_rcv;
738
739 int n, len;
740
741 LogFlowFunc(("sosendoob: so = %R[natsock]\n", so));
742
743 len = sbuf_len(sb);
744
745 n = send(so->s, sbuf_data(sb), len, (fUrg ? MSG_OOB : 0));
746 if (n < 0)
747 Log(("NAT: Can't sent sbuf via socket.\n"));
748 if (fUrg)
749 so->so_urgc -= n;
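    /*
     * Partial send: copy the unsent tail back to the start of the sbuf so the
     * next sowrite()/sosendoob() call retries it; on a full send the sbuf is
     * simply cleared below.
     */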
750 if (n > 0 && n < len)
751 {
752 char *ptr;
753 char *buff;
754 buff = RTMemAlloc(len);
755 if (buff == NULL)
756 {
757 Log(("NAT: No space to allocate temporal buffer\n"));
758 return -1;
759 }
760 ptr = sbuf_data(sb);
761 memcpy(buff, &ptr[n], len - n);
762 sbuf_bcpy(sb, buff, len - n);
763 RTMemFree(buff);
764 return n;
765 }
766 sbuf_clear(sb);
767 return n;
768}
769int
770sosendoob(struct socket *so)
771{
772 return do_sosend(so, 1);
773}
774
775/*
776 * Write data from so_rcv to so's socket,
777 * updating all sbuf field as necessary
778 */
779int
780sowrite(PNATState pData, struct socket *so)
781{
782 return do_sosend(so, 0);
783}
784#endif
785
786/*
787 * recvfrom() a UDP socket
788 */
789void
790sorecvfrom(PNATState pData, struct socket *so)
791{
792 ssize_t ret = 0;
793 struct sockaddr_in addr;
794 socklen_t addrlen = sizeof(struct sockaddr_in);
795
796 LogFlowFunc(("sorecvfrom: so = %lx\n", (long)so));
797
798 if (so->so_type == IPPROTO_ICMP)
799 {
800 /* This is a "ping" reply */
801#ifdef RT_OS_WINDOWS
802 sorecvfrom_icmp_win(pData, so);
803#else /* RT_OS_WINDOWS */
804 sorecvfrom_icmp_unix(pData, so);
805#endif /* !RT_OS_WINDOWS */
806 udp_detach(pData, so);
807 }
808 else
809 {
810 /* A "normal" UDP packet */
811 struct mbuf *m;
812 ssize_t len;
813 u_long n = 0;
814 int rc = 0;
815 static int signalled = 0;
816 char *pchBuffer = NULL;
817 bool fWithTemporalBuffer = false;
818
819 QSOCKET_LOCK(udb);
820 SOCKET_LOCK(so);
821 QSOCKET_UNLOCK(udb);
822
823 /* How much data has been received? */
824 /*
825 * 1. calculate how much we can read
826 * 2. read as much as possible
827 * 3. attach buffer to allocated header mbuf
828 */
829 rc = ioctlsocket(so->s, FIONREAD, &n);
830 if (rc == -1)
831 {
832 if ( errno == EAGAIN
833 || errno == EWOULDBLOCK
834 || errno == EINPROGRESS
835 || errno == ENOTCONN)
836 return;
837 else if (signalled == 0)
838 {
839 LogRel(("NAT: can't fetch amount of bytes on socket %R[natsock], so message will be truncated.\n", so));
840 signalled = 1;
841 }
842 return;
843 }
844
845 len = sizeof(struct udpiphdr);
846 m = m_getjcl(pData, M_NOWAIT, MT_HEADER, M_PKTHDR, slirp_size(pData));
847 if (m == NULL)
848 return;
849
850 len += n;
851 m->m_data += ETH_HLEN;
852 m->m_pkthdr.header = mtod(m, void *);
853 m->m_data += sizeof(struct udpiphdr);
854
855 pchBuffer = mtod(m, char *);
856 fWithTemporalBuffer = false;
857 /*
858 * Even if the number of bytes pending on the socket is greater than the MTU,
859 * slirp will be able to fragment it; a temporary buffer is allocated below
860 * only when the data doesn't fit into the mbuf.
861 */
862 if (n > (slirp_size(pData) - sizeof(struct udpiphdr)))
863 {
864 pchBuffer = RTMemAlloc((n) * sizeof(char));
865 if (!pchBuffer)
866 {
867 m_freem(pData, m);
868 return;
869 }
870 fWithTemporalBuffer = true;
871 }
872 ret = recvfrom(so->s, pchBuffer, n, 0,
873 (struct sockaddr *)&addr, &addrlen);
874 if (fWithTemporalBuffer)
875 {
876 if (ret > 0)
877 {
878 m_copyback(pData, m, 0, ret, pchBuffer);
879 /*
880 * If the comparison below holds, our size prediction failed;
881 * that's not fatal, we've just allocated for nothing. (@todo add a counter
882 * here to see how rarely we get here)
883 */
884 if(ret < slirp_size(pData) && !m->m_next)
885 Log(("NAT:udp: Expected size(%d) lesser than real(%d) and less minimal mbuf size(%d)\n",
886 n, ret, slirp_size(pData)));
887 }
888 /* we're freeing buffer anyway */
889 RTMemFree(pchBuffer);
890 }
891 else
892 m->m_len = ret;
893
894 if (ret < 0)
895 {
896 u_char code = ICMP_UNREACH_PORT;
897
898 if (errno == EHOSTUNREACH)
899 code = ICMP_UNREACH_HOST;
900 else if (errno == ENETUNREACH)
901 code = ICMP_UNREACH_NET;
902
903 m_freem(pData, m);
904 if ( errno == EAGAIN
905 || errno == EWOULDBLOCK
906 || errno == EINPROGRESS
907 || errno == ENOTCONN)
908 {
909 return;
910 }
911
912 Log2((" rx error, tx icmp ICMP_UNREACH:%i\n", code));
913 icmp_error(pData, so->so_m, ICMP_UNREACH, code, 0, strerror(errno));
914 so->so_m = NULL;
915 }
916 else
917 {
918 Assert((m_length(m,NULL) == ret));
919 /*
920 * Hack: domain name lookup will be used the most for UDP,
921 * and since they'll only be used once there's no need
922 * for the 4 minute (or whatever) timeout... So we time them
923 * out much quicker (10 seconds for now...)
924 */
925 if (so->so_expire)
926 {
927 if (so->so_fport != RT_H2N_U16_C(53))
928 so->so_expire = curtime + SO_EXPIRE;
929 }
930 /*
931 * the last argument should be changed if slirp ever injects IP attributes
932 * Note: here we can't check whether dnsproxy sent the initial request
933 */
934 if ( pData->fUseDnsProxy
935 && so->so_fport == RT_H2N_U16_C(53))
936 dnsproxy_answer(pData, so, m);
937
938#if 0
939 if (m->m_len == len)
940 {
941 m_inc(m, MINCSIZE);
942 m->m_len = 0;
943 }
944#endif
945
946 /* the packet will definitely be fragmented, which could confuse the receiving peer. */
947 if (m_length(m, NULL) > if_mtu)
948 m->m_flags |= M_SKIP_FIREWALL;
949 /*
950 * If this packet was destined for CTL_ADDR,
951 * make it look like that's where it came from, done by udp_output
952 */
953 udp_output(pData, so, m, &addr);
954 SOCKET_UNLOCK(so);
955 } /* rx error */
956 } /* if ping packet */
957}
958
959/*
960 * sendto() a socket
961 */
962int
963sosendto(PNATState pData, struct socket *so, struct mbuf *m)
964{
965 int ret;
966 struct sockaddr_in *paddr;
967 struct sockaddr addr;
968#if 0
969 struct sockaddr_in host_addr;
970#endif
971 caddr_t buf = 0;
972 int mlen;
973
974 LogFlowFunc(("sosendto: so = %R[natsock], m = %lx\n", so, (long)m));
975
976 memset(&addr, 0, sizeof(struct sockaddr));
977#ifdef RT_OS_DARWIN
978 addr.sa_len = sizeof(struct sockaddr_in);
979#endif
980 paddr = (struct sockaddr_in *)&addr;
981 paddr->sin_family = AF_INET;
982 if ((so->so_faddr.s_addr & RT_H2N_U32(pData->netmask)) == pData->special_addr.s_addr)
983 {
984 /* It's an alias */
985 uint32_t last_byte = RT_N2H_U32(so->so_faddr.s_addr) & ~pData->netmask;
986 switch(last_byte)
987 {
988#if 0
989 /* handle this case at 'default:' */
990 case CTL_BROADCAST:
991 addr.sin_addr.s_addr = INADDR_BROADCAST;
992 /* Send the packet to host to fully emulate broadcast */
993 /** @todo r=klaus: on Linux host this causes the host to receive
994 * the packet twice for some reason. And I cannot find any place
995 * in the man pages which states that sending a broadcast does not
996 * reach the host itself. */
997 host_addr.sin_family = AF_INET;
998 host_addr.sin_port = so->so_fport;
999 host_addr.sin_addr = our_addr;
1000 sendto(so->s, m->m_data, m->m_len, 0,
1001 (struct sockaddr *)&host_addr, sizeof (struct sockaddr));
1002 break;
1003#endif
1004 case CTL_DNS:
1005 case CTL_ALIAS:
1006 default:
1007 if (last_byte == ~pData->netmask)
1008 paddr->sin_addr.s_addr = INADDR_BROADCAST;
1009 else
1010 paddr->sin_addr = loopback_addr;
1011 break;
1012 }
1013 }
1014 else
1015 paddr->sin_addr = so->so_faddr;
1016 paddr->sin_port = so->so_fport;
1017
1018 Log2((" sendto()ing, addr.sin_port=%d, addr.sin_addr.s_addr=%.16s\n",
1019 RT_N2H_U16(paddr->sin_port), inet_ntoa(paddr->sin_addr)));
1020
1021 /* Don't care what port we get */
1022 /*
1023 * > nmap -sV -T4 -O -A -v -PU3483 255.255.255.255
1024 * generates bodyless messages, annoying the memory management system.
1025 */
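    /*
     * mlen may legitimately be 0 for such bodyless datagrams; in that case no
     * temporary buffer is allocated and sendto() is issued with a zero length.
     */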
1026 mlen = m_length(m, NULL);
1027 if (mlen > 0)
1028 {
1029 buf = RTMemAlloc(mlen);
1030 if (buf == NULL)
1031 {
1032 return -1;
1033 }
1034 m_copydata(m, 0, mlen, buf);
1035 }
1036 ret = sendto(so->s, buf, mlen, 0,
1037 (struct sockaddr *)&addr, sizeof (struct sockaddr));
1038#ifdef VBOX_WITH_NAT_SEND2HOME
1039 if (slirpIsWideCasting(pData, so->so_faddr.s_addr))
1040 {
1041 slirpSend2Home(pData, so, buf, mlen, 0);
1042 }
1043#endif
1044 if (buf)
1045 RTMemFree(buf);
1046 if (ret < 0)
1047 {
1048 Log2(("UDP: sendto fails (%s)\n", strerror(errno)));
1049 return -1;
1050 }
1051
1052 /*
1053 * Kill the socket if there's no reply in 4 minutes,
1054 * but only if it's an expirable socket
1055 */
1056 if (so->so_expire)
1057 so->so_expire = curtime + SO_EXPIRE;
1058 so->so_state = SS_ISFCONNECTED; /* So that it gets select()ed */
1059 return 0;
1060}
1061
1062/*
1063 * XXX This should really be tcp_listen
1064 */
1065struct socket *
1066solisten(PNATState pData, u_int32_t bind_addr, u_int port, u_int32_t laddr, u_int lport, int flags)
1067{
1068 struct sockaddr_in addr;
1069 struct socket *so;
1070 socklen_t addrlen = sizeof(addr);
1071 int s, opt = 1;
1072 int status;
1073
1074 LogFlowFunc(("solisten: port = %d, laddr = %x, lport = %d, flags = %x\n", port, laddr, lport, flags));
1075
1076 if ((so = socreate()) == NULL)
1077 {
1078 /* RTMemFree(so); Not sofree() ??? free(NULL) == NOP */
1079 return NULL;
1080 }
1081
1082 /* Don't tcp_attach... we don't need so_snd nor so_rcv */
1083 if ((so->so_tcpcb = tcp_newtcpcb(pData, so)) == NULL)
1084 {
1085 RTMemFree(so);
1086 return NULL;
1087 }
1088
1089 SOCKET_LOCK_CREATE(so);
1090 SOCKET_LOCK(so);
1091 QSOCKET_LOCK(tcb);
1092 insque(pData, so,&tcb);
1093 NSOCK_INC();
1094 QSOCKET_UNLOCK(tcb);
1095
1096 /*
1097 * SS_FACCEPTONCE sockets must time out.
1098 */
1099 if (flags & SS_FACCEPTONCE)
1100 so->so_tcpcb->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT*2;
1101
1102 so->so_state = (SS_FACCEPTCONN|flags);
1103 so->so_lport = lport; /* Kept in network format */
1104 so->so_laddr.s_addr = laddr; /* Ditto */
1105
1106 memset(&addr, 0, sizeof(addr));
1107#ifdef RT_OS_DARWIN
1108 addr.sin_len = sizeof(addr);
1109#endif
1110 addr.sin_family = AF_INET;
1111 addr.sin_addr.s_addr = bind_addr;
1112 addr.sin_port = port;
1113
1114 /**
1115 * changing listen(..., 1 -> SOMAXCONN) shouldn't be harmful for the NAT TCP/IP stack;
1116 * the kernel will choose an optimal value for the request queue length.
1117 * @note MSDN recommends low (2-4) values for Bluetooth networking devices.
1118 */
1119 if ( ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0)
1120 || (setsockopt(s, SOL_SOCKET, SO_REUSEADDR,(char *)&opt, sizeof(int)) < 0)
1121 || (bind(s,(struct sockaddr *)&addr, sizeof(addr)) < 0)
1122 || (listen(s, pData->soMaxConn) < 0))
1123 {
1124#ifdef RT_OS_WINDOWS
1125 int tmperrno = WSAGetLastError(); /* Don't clobber the real reason we failed */
1126 closesocket(s);
1127 QSOCKET_LOCK(tcb);
1128 sofree(pData, so);
1129 QSOCKET_UNLOCK(tcb);
1130 /* Restore the real errno */
1131 WSASetLastError(tmperrno);
1132#else
1133 int tmperrno = errno; /* Don't clobber the real reason we failed */
1134 close(s);
1135 QSOCKET_LOCK(tcb);
1136 sofree(pData, so);
1137 QSOCKET_UNLOCK(tcb);
1138 /* Restore the real errno */
1139 errno = tmperrno;
1140#endif
1141 return NULL;
1142 }
1143 fd_nonblock(s);
1144 setsockopt(s, SOL_SOCKET, SO_OOBINLINE,(char *)&opt, sizeof(int));
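    /*
     * SO_OOBINLINE makes urgent data arrive in the normal stream, so soread()
     * picks it up together with ordinary data (see the sorecvoob() comment above).
     */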
1145
1146 getsockname(s,(struct sockaddr *)&addr,&addrlen);
1147 so->so_fport = addr.sin_port;
1148 /* set socket buffers */
1149 opt = pData->socket_rcv;
1150 status = setsockopt(s, SOL_SOCKET, SO_RCVBUF, (char *)&opt, sizeof(int));
1151 if (status < 0)
1152 {
1153 LogRel(("NAT: Error(%d) while setting RCV capacity to (%d)\n", errno, opt));
1154 goto no_sockopt;
1155 }
1156 opt = pData->socket_snd;
1157 status = setsockopt(s, SOL_SOCKET, SO_SNDBUF, (char *)&opt, sizeof(int));
1158 if (status < 0)
1159 {
1160 LogRel(("NAT: Error(%d) while setting SND capacity to (%d)\n", errno, opt));
1161 goto no_sockopt;
1162 }
1163no_sockopt:
1164 if (addr.sin_addr.s_addr == 0 || addr.sin_addr.s_addr == loopback_addr.s_addr)
1165 so->so_faddr = alias_addr;
1166 else
1167 so->so_faddr = addr.sin_addr;
1168
1169 so->s = s;
1170 SOCKET_UNLOCK(so);
1171 return so;
1172}
1173
1174/*
1175 * Data is available in so_rcv
1176 * Just write() the data to the socket
1177 * XXX not yet...
1178 * @todo do we really need this function? What is it intended to do?
1179 */
1180void
1181sorwakeup(struct socket *so)
1182{
1183 NOREF(so);
1184#if 0
1185 sowrite(so);
1186 FD_CLR(so->s,&writefds);
1187#endif
1188}
1189
1190/*
1191 * Data has been freed in so_snd
1192 * We have room for a read() if we want to
1193 * For now, don't read, it'll be done in the main loop
1194 */
1195void
1196sowwakeup(struct socket *so)
1197{
1198 NOREF(so);
1199}
1200
1201/*
1202 * Various session state calls
1203 * XXX Should be #define's
1204 * The socket state stuff needs work, these often get called 2 or 3
1205 * times each when only 1 call was needed
1206 */
1207void
1208soisfconnecting(struct socket *so)
1209{
1210 so->so_state &= ~(SS_NOFDREF|SS_ISFCONNECTED|SS_FCANTRCVMORE|
1211 SS_FCANTSENDMORE|SS_FWDRAIN);
1212 so->so_state |= SS_ISFCONNECTING; /* Clobber other states */
1213}
1214
1215void
1216soisfconnected(struct socket *so)
1217{
1218 LogFlowFunc(("ENTER: so:%R[natsock]\n", so));
1219 so->so_state &= ~(SS_ISFCONNECTING|SS_FWDRAIN|SS_NOFDREF);
1220 so->so_state |= SS_ISFCONNECTED; /* Clobber other states */
1221 LogFlowFunc(("LEAVE: so:%R[natsock]\n", so));
1222}
1223
1224void
1225sofcantrcvmore(struct socket *so)
1226{
1227 if ((so->so_state & SS_NOFDREF) == 0)
1228 {
1229 shutdown(so->s, 0);
1230 }
1231 so->so_state &= ~(SS_ISFCONNECTING);
1232 if (so->so_state & SS_FCANTSENDMORE)
1233 so->so_state = SS_NOFDREF; /* Don't select it */
1234 /* XXX close() here as well? */
1235 else
1236 so->so_state |= SS_FCANTRCVMORE;
1237}
1238
1239void
1240sofcantsendmore(struct socket *so)
1241{
1242 if ((so->so_state & SS_NOFDREF) == 0)
1243 shutdown(so->s, 1); /* send FIN to fhost */
1244
1245 so->so_state &= ~(SS_ISFCONNECTING);
1246 if (so->so_state & SS_FCANTRCVMORE)
1247 so->so_state = SS_NOFDREF; /* as above */
1248 else
1249 so->so_state |= SS_FCANTSENDMORE;
1250}
1251
1252void
1253soisfdisconnected(struct socket *so)
1254{
1255 NOREF(so);
1256#if 0
1257 so->so_state &= ~(SS_ISFCONNECTING|SS_ISFCONNECTED);
1258 close(so->s);
1259 so->so_state = SS_ISFDISCONNECTED;
1260 /*
1261 * XXX Do nothing ... ?
1262 */
1263#endif
1264}
1265
1266/*
1267 * Set write drain mode
1268 * Set CANTSENDMORE once all data has been write()n
1269 */
1270void
1271sofwdrain(struct socket *so)
1272{
1273 if (SBUF_LEN(&so->so_rcv))
1274 so->so_state |= SS_FWDRAIN;
1275 else
1276 sofcantsendmore(so);
1277}
1278
1279static void
1280send_icmp_to_guest(PNATState pData, char *buff, size_t len, const struct sockaddr_in *addr)
1281{
1282 struct ip *ip;
1283 uint32_t dst, src;
1284 char ip_copy[256];
1285 struct icmp *icp;
1286 int old_ip_len = 0;
1287 int hlen, original_hlen = 0;
1288 struct mbuf *m;
1289 struct icmp_msg *icm;
1290 uint8_t proto;
1291 int type = 0;
1292
1293 ip = (struct ip *)buff;
1294 /* Fix ip->ip_len to contain the total packet length including the header
1295 * in _host_ byte order for all OSes. On Darwin, that value already is in
1296 * host byte order. Solaris and Darwin report only the payload. */
1297#ifndef RT_OS_DARWIN
1298 ip->ip_len = RT_N2H_U16(ip->ip_len);
1299#endif
1300 hlen = (ip->ip_hl << 2);
1301#if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN)
1302 ip->ip_len += hlen;
1303#endif
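    /*
     * Worked example (hypothetical 84-byte echo reply: 20-byte IP header +
     * 64-byte ICMP part): Linux delivers ip_len as htons(84), normalized to 84
     * above; Solaris/Darwin deliver just the 64-byte payload, so hlen (20) is
     * added below to arrive at the same 84.
     */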
1304 if (ip->ip_len < hlen + ICMP_MINLEN)
1305 {
1306 Log(("send_icmp_to_guest: ICMP header is too small to understand which type/subtype of the datagram\n"));
1307 return;
1308 }
1309 icp = (struct icmp *)((char *)ip + hlen);
1310
1311 Log(("ICMP:received msg(t:%d, c:%d)\n", icp->icmp_type, icp->icmp_code));
1312 if ( icp->icmp_type != ICMP_ECHOREPLY
1313 && icp->icmp_type != ICMP_TIMXCEED
1314 && icp->icmp_type != ICMP_UNREACH)
1315 {
1316 return;
1317 }
1318
1319 /*
1320 * ICMP_ECHOREPLY, ICMP_TIMXCEED, ICMP_UNREACH minimal header size is
1321 * ICMP_ECHOREPLY assuming data 0
1322 * icmp_{type(8), code(8), cksum(16),identifier(16),seqnum(16)}
1323 */
1324 if (ip->ip_len < hlen + 8)
1325 {
1326 Log(("send_icmp_to_guest: NAT accept ICMP_{ECHOREPLY, TIMXCEED, UNREACH} the minimum size is 64 (see rfc792)\n"));
1327 return;
1328 }
1329
1330 type = icp->icmp_type;
1331 if ( type == ICMP_TIMXCEED
1332 || type == ICMP_UNREACH)
1333 {
1334 /*
1335 * ICMP_TIMXCEED, ICMP_UNREACH minimal header size is
1336 * icmp_{type(8), code(8), cksum(16),unused(32)} + IP header + 64 bit of original datagram
1337 */
1338 if (ip->ip_len < hlen + 2*8 + sizeof(struct ip))
1339 {
1340 Log(("send_icmp_to_guest: NAT accept ICMP_{TIMXCEED, UNREACH} the minimum size of ipheader + 64 bit of data (see rfc792)\n"));
1341 return;
1342 }
1343 ip = &icp->icmp_ip;
1344 }
1345
1346 icm = icmp_find_original_mbuf(pData, ip);
1347 if (icm == NULL)
1348 {
1349 Log(("NAT: Can't find the corresponding packet for the received ICMP\n"));
1350 return;
1351 }
1352
1353 m = icm->im_m;
1354 Assert(m != NULL);
1355
1356 src = addr->sin_addr.s_addr;
1357 if (type == ICMP_ECHOREPLY)
1358 {
1359 struct ip *ip0 = mtod(m, struct ip *);
1360 struct icmp *icp0 = (struct icmp *)((char *)ip0 + (ip0->ip_hl << 2));
1361 if (icp0->icmp_type != ICMP_ECHO)
1362 {
1363 Log(("NAT: we haven't found echo for this reply\n"));
1364 return;
1365 }
1366 /*
1367 * While combining the buffer to send (see ip_icmp.c) we control the ICMP header only;
1368 * the IP header is assembled by the OS network stack. Our local copy of the IP header
1369 * contains values in host byte order, so no byte-order conversion is required here.
1370 * IP header fields are converted in the ip_output0 routine only.
1371 */
1372 if ( (ip->ip_len - hlen)
1373 != (ip0->ip_len - (ip0->ip_hl << 2)))
1374 {
1375 Log(("NAT: ECHO(%d) lenght doesn't match ECHOREPLY(%d)\n",
1376 (ip->ip_len - hlen), (ip0->ip_len - (ip0->ip_hl << 2))));
1377 return;
1378 }
1379 }
1380
1381 /* ip points at the original IP header */
1382 ip = mtod(m, struct ip *);
1383 proto = ip->ip_p;
1384 /* Now ip is pointing at the header we've sent from the guest */
1385 if ( icp->icmp_type == ICMP_TIMXCEED
1386 || icp->icmp_type == ICMP_UNREACH)
1387 {
1388 old_ip_len = (ip->ip_hl << 2) + 64;
1389 if (old_ip_len > sizeof(ip_copy))
1390 old_ip_len = sizeof(ip_copy);
1391 memcpy(ip_copy, ip, old_ip_len);
1392 }
1393
1394 /* the source address of the original IP packet (it becomes the reply's destination) */
1395 dst = ip->ip_src.s_addr;
1396
1397 /* override the tail of the old packet */
1398 ip = mtod(m, struct ip *); /* ip is from the mbuf we've overridden */
1399 original_hlen = ip->ip_hl << 2;
1400 /* saves original ip header and options */
1401 m_copyback(pData, m, original_hlen, len - hlen, buff + hlen);
1402 ip->ip_len = m_length(m, NULL);
1403 ip->ip_p = IPPROTO_ICMP; /* the original packet could be anything, but we respond via ICMP */
1404
1405 icp = (struct icmp *)((char *)ip + (ip->ip_hl << 2));
1406 type = icp->icmp_type;
1407 if ( type == ICMP_TIMXCEED
1408 || type == ICMP_UNREACH)
1409 {
1410 /* according to RFC 792, error messages require a copy of the initial IP header + 64 bits of data */
1411 memcpy(&icp->icmp_ip, ip_copy, old_ip_len);
1412 ip->ip_tos = ((ip->ip_tos & 0x1E) | 0xC0); /* high priority for errors */
1413 }
1414
1415 ip->ip_src.s_addr = src;
1416 ip->ip_dst.s_addr = dst;
1417 icmp_reflect(pData, m);
1418 LIST_REMOVE(icm, im_list);
1419 pData->cIcmpCacheSize--;
1420 /* Don't call m_free here*/
1421
1422 if ( type == ICMP_TIMXCEED
1423 || type == ICMP_UNREACH)
1424 {
1425 icm->im_so->so_m = NULL;
1426 switch (proto)
1427 {
1428 case IPPROTO_UDP:
1429 /*XXX: so->so_m already freed so we shouldn't call sofree */
1430 udp_detach(pData, icm->im_so);
1431 break;
1432 case IPPROTO_TCP:
1433 /* closing the TCP connection should happen here */
1434 break;
1435 default:
1436 /* do nothing */
1437 break;
1438 }
1439 }
1440 RTMemFree(icm);
1441}
1442
1443#ifdef RT_OS_WINDOWS
1444static void
1445sorecvfrom_icmp_win(PNATState pData, struct socket *so)
1446{
1447 int len;
1448 int i;
1449 struct ip *ip;
1450 struct mbuf *m;
1451 struct icmp *icp;
1452 struct icmp_msg *icm;
1453 struct ip *ip_broken; /* ICMP returns header + 64 bit of packet */
1454 uint32_t src;
1455 ICMP_ECHO_REPLY *icr;
1456 int hlen = 0;
1457 int nbytes = 0;
1458 u_char code = ~0;
1459 int out_len;
1460 int size;
1461
1462 len = pData->pfIcmpParseReplies(pData->pvIcmpBuffer, pData->szIcmpBuffer);
1463 if (len < 0)
1464 {
1465 LogRel(("NAT: Error (%d) occurred on ICMP receiving\n", GetLastError()));
1466 return;
1467 }
1468 if (len == 0)
1469 return; /* no error */
1470
1471 icr = (ICMP_ECHO_REPLY *)pData->pvIcmpBuffer;
1472 for (i = 0; i < len; ++i)
1473 {
1474 LogFunc(("icr[%d] Data:%p, DataSize:%d\n",
1475 i, icr[i].Data, icr[i].DataSize));
1476 switch(icr[i].Status)
1477 {
1478 case IP_DEST_HOST_UNREACHABLE:
1479 code = (code != ~0 ? code : ICMP_UNREACH_HOST);
1480 case IP_DEST_NET_UNREACHABLE:
1481 code = (code != ~0 ? code : ICMP_UNREACH_NET);
1482 case IP_DEST_PROT_UNREACHABLE:
1483 code = (code != ~0 ? code : ICMP_UNREACH_PROTOCOL);
1484 /* UNREACH error inject here */
1485 case IP_DEST_PORT_UNREACHABLE:
1486 code = (code != ~0 ? code : ICMP_UNREACH_PORT);
1487 icmp_error(pData, so->so_m, ICMP_UNREACH, code, 0, "Error occurred!!!");
1488 so->so_m = NULL;
1489 break;
1490 case IP_SUCCESS: /* echo replied */
1491 out_len = ETH_HLEN + sizeof(struct ip) + 8;
1492 size;
1493 size = MCLBYTES;
1494 if (out_len < MSIZE)
1495 size = MCLBYTES;
1496 else if (out_len < MCLBYTES)
1497 size = MCLBYTES;
1498 else if (out_len < MJUM9BYTES)
1499 size = MJUM9BYTES;
1500 else if (out_len < MJUM16BYTES)
1501 size = MJUM16BYTES;
1502 else
1503 AssertMsgFailed(("Unsupported size"));
1504
1505 m = m_getjcl(pData, M_NOWAIT, MT_HEADER, M_PKTHDR, size);
1506 LogFunc(("m_getjcl returns m: %p\n", m));
1507 if (m == NULL)
1508 return;
1509 m->m_len = 0;
1510 m->m_data += if_maxlinkhdr;
1511 m->m_pkthdr.header = mtod(m, void *);
1512
1513 ip = mtod(m, struct ip *);
1514 ip->ip_src.s_addr = icr[i].Address;
1515 ip->ip_p = IPPROTO_ICMP;
1516 ip->ip_dst.s_addr = so->so_laddr.s_addr; /*XXX: still the hack*/
1517 ip->ip_hl = sizeof(struct ip) >> 2; /* required for icmp_reflect, no IP options */
1518 ip->ip_ttl = icr[i].Options.Ttl;
1519
1520 icp = (struct icmp *)&ip[1]; /* no options */
1521 icp->icmp_type = ICMP_ECHOREPLY;
1522 icp->icmp_code = 0;
1523 icp->icmp_id = so->so_icmp_id;
1524 icp->icmp_seq = so->so_icmp_seq;
1525
1526 icm = icmp_find_original_mbuf(pData, ip);
1527 if (icm)
1528 {
1529 /* on this branch we don't need stored variant */
1530 m_freem(pData, icm->im_m);
1531 LIST_REMOVE(icm, im_list);
1532 pData->cIcmpCacheSize--;
1533 RTMemFree(icm);
1534 }
1535
1536
1537 hlen = (ip->ip_hl << 2);
1538 Assert((hlen >= sizeof(struct ip)));
1539
1540 m->m_data += hlen + ICMP_MINLEN;
1541 if (!RT_VALID_PTR(icr[i].Data))
1542 {
1543 m_freem(pData, m);
1544 break;
1545 }
1546 m_copyback(pData, m, 0, icr[i].DataSize, icr[i].Data);
1547 m->m_data -= hlen + ICMP_MINLEN;
1548 m->m_len += hlen + ICMP_MINLEN;
1549
1550
1551 ip->ip_len = m_length(m, NULL);
1552 Assert((ip->ip_len == hlen + ICMP_MINLEN + icr[i].DataSize));
1553
1554 icmp_reflect(pData, m);
1555 break;
1556 case IP_TTL_EXPIRED_TRANSIT: /* TTL expired */
1557
1558 ip_broken = icr[i].Data;
1559 icm = icmp_find_original_mbuf(pData, ip_broken);
1560 if (icm == NULL) {
1561 Log(("ICMP: can't find original package (first double word %x)\n", *(uint32_t *)ip_broken));
1562 return;
1563 }
1564 m = icm->im_m;
1565 ip = mtod(m, struct ip *);
1566 Assert(((ip_broken->ip_hl << 2) >= sizeof(struct ip)));
1567 ip->ip_ttl = icr[i].Options.Ttl;
1568 src = ip->ip_src.s_addr;
1569 ip->ip_dst.s_addr = src;
1570 ip->ip_dst.s_addr = icr[i].Address;
1571
1572 hlen = (ip->ip_hl << 2);
1573 icp = (struct icmp *)((char *)ip + hlen);
1574 ip_broken->ip_src.s_addr = src; /* the packet was sent from the host, not from the guest */
1575
1576 m->m_len = (ip_broken->ip_hl << 2) + 64;
1577 m->m_pkthdr.header = mtod(m, void *);
1578 m_copyback(pData, m, ip->ip_hl >> 2, icr[i].DataSize, icr[i].Data);
1579 icmp_reflect(pData, m);
1580 /* This situation differs from the Unix world, where we can receive ICMP in response to TCP/UDP */
1581 LIST_REMOVE(icm, im_list);
1582 pData->cIcmpCacheSize--;
1583 RTMemFree(icm);
1584 break;
1585 default:
1586 Log(("ICMP(default): message with Status: %x was received from %x\n", icr[i].Status, icr[i].Address));
1587 break;
1588 }
1589 }
1590}
1591#else /* !RT_OS_WINDOWS */
1592static void sorecvfrom_icmp_unix(PNATState pData, struct socket *so)
1593{
1594 struct sockaddr_in addr;
1595 socklen_t addrlen = sizeof(struct sockaddr_in);
1596 struct ip ip;
1597 char *buff;
1598 int len = 0;
1599
1600 /* step 1: read the IP header */
1601 len = recvfrom(so->s, &ip, sizeof(struct ip), MSG_PEEK,
1602 (struct sockaddr *)&addr, &addrlen);
1603 if ( len < 0
1604 && ( errno == EAGAIN
1605 || errno == EWOULDBLOCK
1606 || errno == EINPROGRESS
1607 || errno == ENOTCONN))
1608 {
1609 Log(("sorecvfrom_icmp_unix: 1 - step can't read IP datagramm (would block)\n"));
1610 return;
1611 }
1612
1613 if ( len < sizeof(struct ip)
1614 || len < 0
1615 || len == 0)
1616 {
1617 u_char code;
1618 code = ICMP_UNREACH_PORT;
1619
1620 if (errno == EHOSTUNREACH)
1621 code = ICMP_UNREACH_HOST;
1622 else if (errno == ENETUNREACH)
1623 code = ICMP_UNREACH_NET;
1624
1625 LogRel((" udp icmp rx errno = %d (%s)\n", errno, strerror(errno)));
1626 icmp_error(pData, so->so_m, ICMP_UNREACH, code, 0, strerror(errno));
1627 so->so_m = NULL;
1628 Log(("sorecvfrom_icmp_unix: 1 - step can't read IP datagramm\n"));
1629 return;
1630 }
1631 /* basic check of IP header */
1632 if ( ip.ip_v != IPVERSION
1633# ifndef RT_OS_DARWIN
1634 || ip.ip_p != IPPROTO_ICMP
1635# endif
1636 )
1637 {
1638 Log(("sorecvfrom_icmp_unix: 1 - step IP isn't IPv4\n"));
1639 return;
1640 }
1641# ifndef RT_OS_DARWIN
1642 /* Darwin reports the IP length already in host byte order. */
1643 ip.ip_len = RT_N2H_U16(ip.ip_len);
1644# endif
1645# if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN)
1646 /* Solaris and Darwin report the payload only */
1647 ip.ip_len += (ip.ip_hl << 2);
1648# endif
1649 /* Note: ip->ip_len in host byte order (all OS) */
1650 len = ip.ip_len;
1651 buff = RTMemAlloc(len);
1652 if (buff == NULL)
1653 {
1654 Log(("sorecvfrom_icmp_unix: 1 - step can't allocate enought room for datagram\n"));
1655 return;
1656 }
1657 /* step 2: read the rest of the datagram into the buffer */
1658 addrlen = sizeof(struct sockaddr_in);
1659 memset(&addr, 0, addrlen);
1660 len = recvfrom(so->s, buff, len, 0,
1661 (struct sockaddr *)&addr, &addrlen);
1662 if ( len < 0
1663 && ( errno == EAGAIN
1664 || errno == EWOULDBLOCK
1665 || errno == EINPROGRESS
1666 || errno == ENOTCONN))
1667 {
1668 Log(("sorecvfrom_icmp_unix: 2 - step can't read IP body (would block expected:%d)\n",
1669 ip.ip_len));
1670 RTMemFree(buff);
1671 return;
1672 }
1673 if ( len < 0
1674 || len == 0)
1675 {
1676 Log(("sorecvfrom_icmp_unix: 2 - step read of the rest of datagramm is fallen (errno:%d, len:%d expected: %d)\n",
1677 errno, len, (ip.ip_len - sizeof(struct ip))));
1678 RTMemFree(buff);
1679 return;
1680 }
1681 /* len is updated by the 2nd read, when the rest of the datagram was read */
1682 send_icmp_to_guest(pData, buff, len, &addr);
1683 RTMemFree(buff);
1684}
1685#endif /* !RT_OS_WINDOWS */