VirtualBox

source: vbox/trunk/src/VBox/Devices/Network/slirp/socket.c@ 40423

Last change on this file since 40423 was 40423, checked in by vboxsync, 13 years ago

NAT: warnings [-Wunused-macros]

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 49.9 KB
 
1/* $Id: socket.c 40423 2012-03-11 03:22:22Z vboxsync $ */
2/** @file
3 * NAT - socket handling.
4 */
5
6/*
7 * Copyright (C) 2006-2010 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 */
17
18/*
19 * This code is based on:
20 *
21 * Copyright (c) 1995 Danny Gasparovski.
22 *
23 * Please read the file COPYRIGHT for the
24 * terms and conditions of the copyright.
25 */
26
27#include <slirp.h>
28#include "ip_icmp.h"
29#include "main.h"
30#ifdef __sun__
31#include <sys/filio.h>
32#endif
33#include <VBox/vmm/pdmdrv.h>
34#if defined (RT_OS_WINDOWS)
35#include <iphlpapi.h>
36#include <icmpapi.h>
37#endif
38
39#ifdef VBOX_WITH_NAT_UDP_SOCKET_CLONE
40/**
41 * Clones pSo for the given foreign address: with fBindSocket a fresh host socket is attached, otherwise the clone shares pSo's host socket and is linked into udb.
42 */
43struct socket * soCloneUDPSocketWithForegnAddr(PNATState pData, bool fBindSocket, struct socket *pSo, uint32_t u32ForeignAddr)
44{
45 struct socket *pNewSocket = NULL;
46 LogFlowFunc(("Enter: fBindSocket:%RTbool, so:%R[natsock], u32ForeignAddr:%RTnaipv4\n", fBindSocket, pSo, u32ForeignAddr));
47 pNewSocket = socreate();
48 if (!pNewSocket)
49 {
50 LogFunc(("Can't create socket\n"));
51 LogFlowFunc(("Leave: NULL\n"));
52 return NULL;
53 }
54 if (fBindSocket)
55 {
56 if (udp_attach(pData, pNewSocket, 0) <= 0)
57 {
58 sofree(pData, pNewSocket);
59 LogFunc(("Can't attach fresh created socket\n"));
60 return NULL;
61 }
62 }
63 else
64 {
65 pNewSocket->so_cloneOf = (struct socket *)pSo;
66 pNewSocket->s = pSo->s;
67 insque(pData, pNewSocket, &udb);
68 }
69 pNewSocket->so_laddr = pSo->so_laddr;
70 pNewSocket->so_lport = pSo->so_lport;
71 pNewSocket->so_faddr.s_addr = u32ForeignAddr;
72 pNewSocket->so_fport = pSo->so_fport;
73 pSo->so_cCloneCounter++;
74 LogFlowFunc(("Leave: %R[natsock]\n", pNewSocket));
75 return pNewSocket;
76}
77
78struct socket *soLookUpClonedUDPSocket(PNATState pData, const struct socket *pcSo, uint32_t u32ForeignAddress)
79{
80 struct socket *pSoClone = NULL;
81 LogFlowFunc(("Enter: pcSo:%R[natsock], u32ForeignAddress:%RTnaipv4\n", pcSo, u32ForeignAddress));
82 for (pSoClone = udb.so_next; pSoClone != &udb; pSoClone = pSoClone->so_next)
83 {
84 if ( pSoClone->so_cloneOf
85 && pSoClone->so_cloneOf == pcSo
86 && pSoClone->so_lport == pcSo->so_lport
87 && pSoClone->so_fport == pcSo->so_fport
88 && pSoClone->so_laddr.s_addr == pcSo->so_laddr.s_addr
89 && pSoClone->so_faddr.s_addr == u32ForeignAddress)
90 goto done;
91 }
92 pSoClone = NULL;
93done:
94 LogFlowFunc(("Leave: pSoClone: %R[natsock]\n", pSoClone));
95 return pSoClone;
96}
97#endif
98
99#ifdef VBOX_WITH_NAT_SEND2HOME
100DECLINLINE(bool) slirpSend2Home(PNATState pData, struct socket *pSo, const void *pvBuf, uint32_t cbBuf, int iFlags)
101{
102 int idxAddr;
103 int ret = 0;
104 bool fSendDone = false;
105 LogFlowFunc(("Enter pSo:%R[natsock] pvBuf: %p, cbBuf: %d, iFlags: %d\n", pSo, pvBuf, cbBuf, iFlags));
106 for (idxAddr = 0; idxAddr < pData->cInHomeAddressSize; ++idxAddr)
107 {
108
109 struct socket *pNewSocket = soCloneUDPSocketWithForegnAddr(pData, pSo, pData->pInSockAddrHomeAddress[idxAddr].sin_addr);
110 AssertReturn((pNewSocket, false));
111 pData->pInSockAddrHomeAddress[idxAddr].sin_port = pSo->so_fport;
112 /* @todo: be more verbose on errors,
113 * @note: we shouldn't care whether this send fails or not (we're broadcasting).
114 */
115 LogFunc(("send %d bytes to %RTnaipv4 from %R[natsock]\n", cbBuf, pData->pInSockAddrHomeAddress[idxAddr].sin_addr.s_addr, pNewSocket));
116 ret = sendto(pNewSocket->s, pvBuf, cbBuf, iFlags, (struct sockaddr *)&pData->pInSockAddrHomeAddress[idxAddr], sizeof(struct sockaddr_in));
117 if (ret < 0)
118 LogFunc(("Failed to send %d bytes to %RTnaipv4\n", cbBuf, pData->pInSockAddrHomeAddress[idxAddr].sin_addr.s_addr));
119 fSendDone |= ret > 0;
120 }
121 LogFlowFunc(("Leave %RTbool\n", fSendDone));
122 return fSendDone;
123}
124#endif /* !VBOX_WITH_NAT_SEND2HOME */
125static void send_icmp_to_guest(PNATState, char *, size_t, const struct sockaddr_in *);
126#ifdef RT_OS_WINDOWS
127static void sorecvfrom_icmp_win(PNATState, struct socket *);
128#else /* RT_OS_WINDOWS */
129static void sorecvfrom_icmp_unix(PNATState, struct socket *);
130#endif /* !RT_OS_WINDOWS */
131
132void
133so_init()
134{
135}
136
137struct socket *
138solookup(struct socket *head, struct in_addr laddr,
139 u_int lport, struct in_addr faddr, u_int fport)
140{
141 struct socket *so;
142
143 for (so = head->so_next; so != head; so = so->so_next)
144 {
145 if ( so->so_lport == lport
146 && so->so_laddr.s_addr == laddr.s_addr
147 && so->so_faddr.s_addr == faddr.s_addr
148 && so->so_fport == fport)
149 return so;
150 }
151
152 return (struct socket *)NULL;
153}
154
155/*
156 * Create a new socket, initialise the fields
157 * It is the responsibility of the caller to
158 * insque() it into the correct linked-list
159 */
160struct socket *
161socreate()
162{
163 struct socket *so;
164
165 so = (struct socket *)RTMemAllocZ(sizeof(struct socket));
166 if (so)
167 {
168 so->so_state = SS_NOFDREF;
169 so->s = -1;
170#if !defined(RT_OS_WINDOWS)
171 so->so_poll_index = -1;
172#endif
173 }
174 return so;
175}
176
177/*
178 * remque and free a socket, clobber cache
179 * VBOX_WITH_SLIRP_MT: the queue should be locked before sofree, because
180 * in sofree we don't know from which queue the item is being removed.
181 */
182void
183sofree(PNATState pData, struct socket *so)
184{
185 if (so == tcp_last_so)
186 tcp_last_so = &tcb;
187 else if (so == udp_last_so)
188 udp_last_so = &udb;
189
190 /* check that the mbuf hasn't already been freed */
191 if (so->so_m != NULL)
192 m_freem(pData, so->so_m);
193#ifndef VBOX_WITH_SLIRP_MT
194 if (so->so_next && so->so_prev)
195 {
196 remque(pData, so); /* crashes if so is not in a queue */
197 NSOCK_DEC();
198 }
199
200 RTMemFree(so);
201#else
202 so->so_deleted = 1;
203#endif
204}
205
206#ifdef VBOX_WITH_SLIRP_MT
207void
208soread_queue(PNATState pData, struct socket *so, int *ret)
209{
210 *ret = soread(pData, so);
211}
212#endif
213
214/*
215 * Read from so's socket into sb_snd, updating all relevant sbuf fields
216 * NOTE: This will only be called if it is select()ed for reading, so
217 * a read() of 0 (or less) means it's disconnected
218 */
219#ifndef VBOX_WITH_SLIRP_BSD_SBUF
220int
221soread(PNATState pData, struct socket *so)
222{
223 int n, nn, lss, total;
224 struct sbuf *sb = &so->so_snd;
225 size_t len = sb->sb_datalen - sb->sb_cc;
226 struct iovec iov[2];
227 int mss = so->so_tcpcb->t_maxseg;
228
229 STAM_PROFILE_START(&pData->StatIOread, a);
230 STAM_COUNTER_RESET(&pData->StatIORead_in_1);
231 STAM_COUNTER_RESET(&pData->StatIORead_in_2);
232
233 QSOCKET_LOCK(tcb);
234 SOCKET_LOCK(so);
235 QSOCKET_UNLOCK(tcb);
236
237 LogFlow(("soread: so = %R[natsock]\n", so));
238 Log2(("%s: so = %R[natsock] so->so_snd = %R[sbuf]\n", __PRETTY_FUNCTION__, so, sb));
239
240 /*
241 * No need to check if there's enough room to read.
242 * soread wouldn't have been called if there weren't
243 */
244
245 len = sb->sb_datalen - sb->sb_cc;
246
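    /*
     * so_snd is a circular buffer: sb_wptr is where newly received data is written,
     * sb_rptr is where it is consumed.  The read below is split into at most two
     * iovecs: iov[0] runs from sb_wptr up to sb_rptr or to the end of the buffer,
     * and iov[1], if needed, covers the wrap-around at the start of sb_data.  The
     * lengths are additionally trimmed so that, whenever more than one segment's
     * worth is available, the amount read stays a multiple of the TCP maximum
     * segment size.
     */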
247 iov[0].iov_base = sb->sb_wptr;
248 iov[1].iov_base = 0;
249 iov[1].iov_len = 0;
250 if (sb->sb_wptr < sb->sb_rptr)
251 {
252 iov[0].iov_len = sb->sb_rptr - sb->sb_wptr;
253 /* Should never succeed, but... */
254 if (iov[0].iov_len > len)
255 iov[0].iov_len = len;
256 if (iov[0].iov_len > mss)
257 iov[0].iov_len -= iov[0].iov_len%mss;
258 n = 1;
259 }
260 else
261 {
262 iov[0].iov_len = (sb->sb_data + sb->sb_datalen) - sb->sb_wptr;
263 /* Should never succeed, but... */
264 if (iov[0].iov_len > len)
265 iov[0].iov_len = len;
266 len -= iov[0].iov_len;
267 if (len)
268 {
269 iov[1].iov_base = sb->sb_data;
270 iov[1].iov_len = sb->sb_rptr - sb->sb_data;
271 if (iov[1].iov_len > len)
272 iov[1].iov_len = len;
273 total = iov[0].iov_len + iov[1].iov_len;
274 if (total > mss)
275 {
276 lss = total % mss;
277 if (iov[1].iov_len > lss)
278 {
279 iov[1].iov_len -= lss;
280 n = 2;
281 }
282 else
283 {
284 lss -= iov[1].iov_len;
285 iov[0].iov_len -= lss;
286 n = 1;
287 }
288 }
289 else
290 n = 2;
291 }
292 else
293 {
294 if (iov[0].iov_len > mss)
295 iov[0].iov_len -= iov[0].iov_len%mss;
296 n = 1;
297 }
298 }
299
300#ifdef HAVE_READV
301 nn = readv(so->s, (struct iovec *)iov, n);
302#else
303 nn = recv(so->s, iov[0].iov_base, iov[0].iov_len, (so->so_tcpcb->t_force? MSG_OOB:0));
304#endif
305 Log2(("%s: read(1) nn = %d bytes\n", __PRETTY_FUNCTION__, nn));
306 Log2(("%s: so = %R[natsock] so->so_snd = %R[sbuf]\n", __PRETTY_FUNCTION__, so, sb));
307 if (nn <= 0)
308 {
309 /*
310 * Special case for WSAEnumNetworkEvents: If we receive 0 bytes that
311 * _could_ mean that the connection is closed. But we will receive an
312 * FD_CLOSE event later if the connection was _really_ closed. With
313 * www.youtube.com I see this very often. Closing the socket too early
314 * would be dangerous.
315 */
316 int status;
317 unsigned long pending = 0;
318 status = ioctlsocket(so->s, FIONREAD, &pending);
319 if (status < 0)
320 Log(("NAT:%s: error in WSAIoctl: %d\n", __PRETTY_FUNCTION__, errno));
321 if (nn == 0 && (pending != 0))
322 {
323 SOCKET_UNLOCK(so);
324 STAM_PROFILE_STOP(&pData->StatIOread, a);
325 return 0;
326 }
327 if ( nn < 0
328 && ( errno == EINTR
329 || errno == EAGAIN
330 || errno == EWOULDBLOCK))
331 {
332 SOCKET_UNLOCK(so);
333 STAM_PROFILE_STOP(&pData->StatIOread, a);
334 return 0;
335 }
336 else
337 {
338 /* nn == 0 means peer has performed an orderly shutdown */
339 Log2(("%s: disconnected, nn = %d, errno = %d (%s)\n",
340 __PRETTY_FUNCTION__, nn, errno, strerror(errno)));
341 sofcantrcvmore(so);
342 tcp_sockclosed(pData, sototcpcb(so));
343 SOCKET_UNLOCK(so);
344 STAM_PROFILE_STOP(&pData->StatIOread, a);
345 return -1;
346 }
347 }
348 STAM_STATS(
349 if (n == 1)
350 {
351 STAM_COUNTER_INC(&pData->StatIORead_in_1);
352 STAM_COUNTER_ADD(&pData->StatIORead_in_1_bytes, nn);
353 }
354 else
355 {
356 STAM_COUNTER_INC(&pData->StatIORead_in_2);
357 STAM_COUNTER_ADD(&pData->StatIORead_in_2_1st_bytes, nn);
358 }
359 );
360
361#ifndef HAVE_READV
362 /*
363 * If there was no error, try and read the second time round
364 * We read again if n = 2 (ie, there's another part of the buffer)
365 * and we read as much as we could in the first read
366 * We don't test for <= 0 this time, because there legitimately
367 * might not be any more data (since the socket is non-blocking),
368 * a close will be detected on next iteration.
369 * A return of -1 won't (shouldn't) happen, since it didn't happen above
370 */
371 if (n == 2 && nn == iov[0].iov_len)
372 {
373 int ret;
374 ret = recv(so->s, iov[1].iov_base, iov[1].iov_len, 0);
375 if (ret > 0)
376 nn += ret;
377 STAM_STATS(
378 if (ret > 0)
379 {
380 STAM_COUNTER_INC(&pData->StatIORead_in_2);
381 STAM_COUNTER_ADD(&pData->StatIORead_in_2_2nd_bytes, ret);
382 }
383 );
384 }
385
386 Log2(("%s: read(2) nn = %d bytes\n", __PRETTY_FUNCTION__, nn));
387#endif
388
389 /* Update fields */
390 sb->sb_cc += nn;
391 sb->sb_wptr += nn;
392 Log2(("%s: update so_snd (readed nn = %d) %R[sbuf]\n", __PRETTY_FUNCTION__, nn, sb));
393 if (sb->sb_wptr >= (sb->sb_data + sb->sb_datalen))
394 {
395 sb->sb_wptr -= sb->sb_datalen;
396 Log2(("%s: alter sb_wptr so_snd = %R[sbuf]\n", __PRETTY_FUNCTION__, sb));
397 }
398 STAM_PROFILE_STOP(&pData->StatIOread, a);
399 SOCKET_UNLOCK(so);
400 return nn;
401}
402#else /* VBOX_WITH_SLIRP_BSD_SBUF */
403int
404soread(PNATState pData, struct socket *so)
405{
406 int n;
407 char *buf;
408 struct sbuf *sb = &so->so_snd;
409 size_t len = sbspace(sb);
410 int mss = so->so_tcpcb->t_maxseg;
411
412 STAM_PROFILE_START(&pData->StatIOread, a);
413 STAM_COUNTER_RESET(&pData->StatIORead_in_1);
414 STAM_COUNTER_RESET(&pData->StatIORead_in_2);
415
416 QSOCKET_LOCK(tcb);
417 SOCKET_LOCK(so);
418 QSOCKET_UNLOCK(tcb);
419
420 LogFlowFunc(("soread: so = %lx\n", (long)so));
421
422 if (len > mss)
423 len -= len % mss;
424 buf = RTMemAlloc(len);
425 if (buf == NULL)
426 {
427 Log(("NAT: can't alloc enough memory\n"));
428 return -1;
429 }
430
431 n = recv(so->s, buf, len, (so->so_tcpcb->t_force? MSG_OOB:0));
432 if (n <= 0)
433 {
434 /*
435 * Special case for WSAEnumNetworkEvents: If we receive 0 bytes that
436 * _could_ mean that the connection is closed. But we will receive an
437 * FD_CLOSE event later if the connection was _really_ closed. With
438 * www.youtube.com I see this very often. Closing the socket too early
439 * would be dangerous.
440 */
441 int status;
442 unsigned long pending = 0;
443 status = ioctlsocket(so->s, FIONREAD, &pending);
444 if (status < 0)
445 Log(("NAT:error in WSAIoctl: %d\n", errno));
446 if (n == 0 && (pending != 0))
447 {
448 SOCKET_UNLOCK(so);
449 STAM_PROFILE_STOP(&pData->StatIOread, a);
450 RTMemFree(buf);
451 return 0;
452 }
453 if ( n < 0
454 && ( errno == EINTR
455 || errno == EAGAIN
456 || errno == EWOULDBLOCK))
457 {
458 SOCKET_UNLOCK(so);
459 STAM_PROFILE_STOP(&pData->StatIOread, a);
460 RTMemFree(buf);
461 return 0;
462 }
463 else
464 {
465 Log2((" --- soread() disconnected, n = %d, errno = %d (%s)\n",
466 n, errno, strerror(errno)));
467 sofcantrcvmore(so);
468 tcp_sockclosed(pData, sototcpcb(so));
469 SOCKET_UNLOCK(so);
470 STAM_PROFILE_STOP(&pData->StatIOread, a);
471 RTMemFree(buf);
472 return -1;
473 }
474 }
475
476 sbuf_bcat(sb, buf, n);
477 RTMemFree(buf);
478 return n;
479}
480#endif
481
482/*
483 * Get urgent data
484 *
485 * When the socket is created, we set it SO_OOBINLINE,
486 * so when OOB data arrives, we soread() it and everything
487 * in the send buffer is sent as urgent data
488 */
489void
490sorecvoob(PNATState pData, struct socket *so)
491{
492 struct tcpcb *tp = sototcpcb(so);
493 ssize_t ret;
494
495 LogFlowFunc(("sorecvoob: so = %R[natsock]\n", so));
496
497 /*
498 * We take a guess at how much urgent data has arrived.
499 * In most situations, when urgent data arrives, the next
500 * read() should get all the urgent data. This guess will
501 * be wrong however if more data arrives just after the
502 * urgent data, or the read() doesn't return all the
503 * urgent data.
504 */
505 ret = soread(pData, so);
506 if (RT_LIKELY(ret > 0))
507 {
508 tp->snd_up = tp->snd_una + SBUF_LEN(&so->so_snd);
509 tp->t_force = 1;
510 tcp_output(pData, tp);
511 tp->t_force = 0;
512 }
513}
514#ifndef VBOX_WITH_SLIRP_BSD_SBUF
515/*
516 * Send urgent data
517 * There's a lot duplicated code here, but...
518 */
519int
520sosendoob(struct socket *so)
521{
522 struct sbuf *sb = &so->so_rcv;
523 char buff[2048]; /* XXX Shouldn't be sending more oob data than this */
524
525 int n, len;
526
527 LogFlowFunc(("sosendoob so = %R[natsock]\n", so));
528
529 if (so->so_urgc > sizeof(buff))
530 so->so_urgc = sizeof(buff); /* XXX */
531
532 if (sb->sb_rptr < sb->sb_wptr)
533 {
534 /* We can send it directly */
535 n = send(so->s, sb->sb_rptr, so->so_urgc, (MSG_OOB)); /* |MSG_DONTWAIT)); */
536 so->so_urgc -= n;
537
538 Log2((" --- sent %d bytes urgent data, %d urgent bytes left\n",
539 n, so->so_urgc));
540 }
541 else
542 {
543 /*
544 * Since there's no sendv or sendtov like writev,
545 * we must copy all data to a linear buffer then
546 * send it all
547 */
548 len = (sb->sb_data + sb->sb_datalen) - sb->sb_rptr;
549 if (len > so->so_urgc)
550 len = so->so_urgc;
551 memcpy(buff, sb->sb_rptr, len);
552 so->so_urgc -= len;
553 if (so->so_urgc)
554 {
555 n = sb->sb_wptr - sb->sb_data;
556 if (n > so->so_urgc)
557 n = so->so_urgc;
558 memcpy(buff + len, sb->sb_data, n);
559 so->so_urgc -= n;
560 len += n;
561 }
562 n = send(so->s, buff, len, (MSG_OOB)); /* |MSG_DONTWAIT)); */
563#ifdef DEBUG
564 if (n != len)
565 Log(("Didn't send all data urgently XXXXX\n"));
566#endif
567 Log2((" ---2 sent %d bytes urgent data, %d urgent bytes left\n",
568 n, so->so_urgc));
569 }
570
571 sb->sb_cc -= n;
572 sb->sb_rptr += n;
573 if (sb->sb_rptr >= (sb->sb_data + sb->sb_datalen))
574 sb->sb_rptr -= sb->sb_datalen;
575
576 return n;
577}
578
579/*
580 * Write data from so_rcv to so's socket,
581 * updating all sbuf field as necessary
582 */
583int
584sowrite(PNATState pData, struct socket *so)
585{
586 int n, nn;
587 struct sbuf *sb = &so->so_rcv;
588 size_t len = sb->sb_cc;
589 struct iovec iov[2];
590
591 STAM_PROFILE_START(&pData->StatIOwrite, a);
592 STAM_COUNTER_RESET(&pData->StatIOWrite_in_1);
593 STAM_COUNTER_RESET(&pData->StatIOWrite_in_1_bytes);
594 STAM_COUNTER_RESET(&pData->StatIOWrite_in_2);
595 STAM_COUNTER_RESET(&pData->StatIOWrite_in_2_1st_bytes);
596 STAM_COUNTER_RESET(&pData->StatIOWrite_in_2_2nd_bytes);
597 STAM_COUNTER_RESET(&pData->StatIOWrite_no_w);
598 STAM_COUNTER_RESET(&pData->StatIOWrite_rest);
599 STAM_COUNTER_RESET(&pData->StatIOWrite_rest_bytes);
600 LogFlowFunc(("so = %R[natsock]\n", so));
601 Log2(("%s: so = %R[natsock] so->so_rcv = %R[sbuf]\n", __PRETTY_FUNCTION__, so, sb));
602 QSOCKET_LOCK(tcb);
603 SOCKET_LOCK(so);
604 QSOCKET_UNLOCK(tcb);
605 if (so->so_urgc)
606 {
607 sosendoob(so);
608 if (sb->sb_cc == 0)
609 {
610 SOCKET_UNLOCK(so);
611 STAM_PROFILE_STOP(&pData->StatIOwrite, a);
612 return 0;
613 }
614 }
615
616 /*
617 * No need to check if there's something to write,
618 * sowrite wouldn't have been called otherwise
619 */
620
621 len = sb->sb_cc;
622
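    /*
     * so_rcv is the same kind of circular buffer: sb_rptr marks the start of the
     * unsent data and sb_wptr its end.  If the pending data wraps past the end of
     * sb_data, the write is split into two iovecs; otherwise one iovec suffices.
     */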
623 iov[0].iov_base = sb->sb_rptr;
624 iov[1].iov_base = 0;
625 iov[1].iov_len = 0;
626 if (sb->sb_rptr < sb->sb_wptr)
627 {
628 iov[0].iov_len = sb->sb_wptr - sb->sb_rptr;
629 /* Should never succeed, but... */
630 if (iov[0].iov_len > len)
631 iov[0].iov_len = len;
632 n = 1;
633 }
634 else
635 {
636 iov[0].iov_len = (sb->sb_data + sb->sb_datalen) - sb->sb_rptr;
637 if (iov[0].iov_len > len)
638 iov[0].iov_len = len;
639 len -= iov[0].iov_len;
640 if (len)
641 {
642 iov[1].iov_base = sb->sb_data;
643 iov[1].iov_len = sb->sb_wptr - sb->sb_data;
644 if (iov[1].iov_len > len)
645 iov[1].iov_len = len;
646 n = 2;
647 }
648 else
649 n = 1;
650 }
651 STAM_STATS({
652 if (n == 1)
653 {
654 STAM_COUNTER_INC(&pData->StatIOWrite_in_1);
655 STAM_COUNTER_ADD(&pData->StatIOWrite_in_1_bytes, iov[0].iov_len);
656 }
657 else
658 {
659 STAM_COUNTER_INC(&pData->StatIOWrite_in_2);
660 STAM_COUNTER_ADD(&pData->StatIOWrite_in_2_1st_bytes, iov[0].iov_len);
661 STAM_COUNTER_ADD(&pData->StatIOWrite_in_2_2nd_bytes, iov[1].iov_len);
662 }
663 });
664 /* Check if there's urgent data to send, and if so, send it */
665#ifdef HAVE_READV
666 nn = writev(so->s, (const struct iovec *)iov, n);
667#else
668 nn = send(so->s, iov[0].iov_base, iov[0].iov_len, 0);
669#endif
670 Log2(("%s: wrote(1) nn = %d bytes\n", __PRETTY_FUNCTION__, nn));
671 /* This should never happen, but people tell me it does *shrug* */
672 if ( nn < 0
673 && ( errno == EAGAIN
674 || errno == EINTR
675 || errno == EWOULDBLOCK))
676 {
677 SOCKET_UNLOCK(so);
678 STAM_PROFILE_STOP(&pData->StatIOwrite, a);
679 return 0;
680 }
681
682 if (nn < 0 || (nn == 0 && iov[0].iov_len > 0))
683 {
684 Log2(("%s: disconnected, so->so_state = %x, errno = %d\n",
685 __PRETTY_FUNCTION__, so->so_state, errno));
686 sofcantsendmore(so);
687 tcp_sockclosed(pData, sototcpcb(so));
688 SOCKET_UNLOCK(so);
689 STAM_PROFILE_STOP(&pData->StatIOwrite, a);
690 return -1;
691 }
692
693#ifndef HAVE_READV
694 if (n == 2 && nn == iov[0].iov_len)
695 {
696 int ret;
697 ret = send(so->s, iov[1].iov_base, iov[1].iov_len, 0);
698 if (ret > 0)
699 nn += ret;
700 STAM_STATS({
701 if (ret > 0 && ret != iov[1].iov_len)
702 {
703 STAM_COUNTER_INC(&pData->StatIOWrite_rest);
704 STAM_COUNTER_ADD(&pData->StatIOWrite_rest_bytes, (iov[1].iov_len - ret));
705 }
706 });
707 }
708 Log2(("%s: wrote(2) nn = %d bytes\n", __PRETTY_FUNCTION__, nn));
709#endif
710
711 /* Update sbuf */
712 sb->sb_cc -= nn;
713 sb->sb_rptr += nn;
714 Log2(("%s: update so_rcv (written nn = %d) %R[sbuf]\n", __PRETTY_FUNCTION__, nn, sb));
715 if (sb->sb_rptr >= (sb->sb_data + sb->sb_datalen))
716 {
717 sb->sb_rptr -= sb->sb_datalen;
718 Log2(("%s: alter sb_rptr of so_rcv %R[sbuf]\n", __PRETTY_FUNCTION__, sb));
719 }
720
721 /*
722 * If in DRAIN mode, and there's no more data, set
723 * it CANTSENDMORE
724 */
725 if ((so->so_state & SS_FWDRAIN) && sb->sb_cc == 0)
726 sofcantsendmore(so);
727
728 SOCKET_UNLOCK(so);
729 STAM_PROFILE_STOP(&pData->StatIOwrite, a);
730 return nn;
731}
732#else /* VBOX_WITH_SLIRP_BSD_SBUF */
733static int
734do_sosend(struct socket *so, int fUrg)
735{
736 struct sbuf *sb = &so->so_rcv;
737
738 int n, len;
739
740 LogFlowFunc(("sosendoob: so = %R[natsock]\n", so));
741
742 len = sbuf_len(sb);
743
744 n = send(so->s, sbuf_data(sb), len, (fUrg ? MSG_OOB : 0));
745 if (n < 0)
746 Log(("NAT: Can't sent sbuf via socket.\n"));
747 if (fUrg)
748 so->so_urgc -= n;
749 if (n > 0 && n < len)
750 {
751 char *ptr;
752 char *buff;
753 buff = RTMemAlloc(len);
754 if (buff == NULL)
755 {
756 Log(("NAT: No space to allocate temporal buffer\n"));
757 return -1;
758 }
759 ptr = sbuf_data(sb);
760 memcpy(buff, &ptr[n], len - n);
761 sbuf_bcpy(sb, buff, len - n);
762 RTMemFree(buff);
763 return n;
764 }
765 sbuf_clear(sb);
766 return n;
767}
768int
769sosendoob(struct socket *so)
770{
771 return do_sosend(so, 1);
772}
773
774/*
775 * Write data from so_rcv to so's socket,
776 * updating all sbuf field as necessary
777 */
778int
779sowrite(PNATState pData, struct socket *so)
780{
781 return do_sosend(so, 0);
782}
783#endif
784
785/*
786 * recvfrom() a UDP socket
787 */
788void
789sorecvfrom(PNATState pData, struct socket *so)
790{
791 ssize_t ret = 0;
792 struct sockaddr_in addr;
793 socklen_t addrlen = sizeof(struct sockaddr_in);
794
795 LogFlowFunc(("sorecvfrom: so = %lx\n", (long)so));
796
797 if (so->so_type == IPPROTO_ICMP)
798 {
799 /* This is a "ping" reply */
800#ifdef RT_OS_WINDOWS
801 sorecvfrom_icmp_win(pData, so);
802#else /* RT_OS_WINDOWS */
803 sorecvfrom_icmp_unix(pData, so);
804#endif /* !RT_OS_WINDOWS */
805 udp_detach(pData, so);
806 }
807 else
808 {
809 /* A "normal" UDP packet */
810 struct mbuf *m;
811 ssize_t len;
812 u_long n = 0;
813 int rc = 0;
814 static int signalled = 0;
815 char *pchBuffer = NULL;
816 bool fWithTemporalBuffer = false;
817
818 QSOCKET_LOCK(udb);
819 SOCKET_LOCK(so);
820 QSOCKET_UNLOCK(udb);
821
822 /* How much data has been received? */
823 /*
824 * 1. calculate how much we can read
825 * 2. read as much as possible
826 * 3. attach buffer to allocated header mbuf
827 */
828 rc = ioctlsocket(so->s, FIONREAD, &n);
829 if (rc == -1)
830 {
831 if ( errno == EAGAIN
832 || errno == EWOULDBLOCK
833 || errno == EINPROGRESS
834 || errno == ENOTCONN)
835 return;
836 else if (signalled == 0)
837 {
838 LogRel(("NAT: can't fetch amount of bytes on socket %R[natsock], so message will be truncated.\n", so));
839 signalled = 1;
840 }
841 return;
842 }
843
844 len = sizeof(struct udpiphdr);
845 m = m_getjcl(pData, M_NOWAIT, MT_HEADER, M_PKTHDR, slirp_size(pData));
846 if (m == NULL)
847 return;
848
849 len += n;
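    /*
     * Reserve room at the front of the mbuf for the Ethernet header and the
     * UDP/IP header: m_pkthdr.header records where the udpiphdr will start,
     * and the payload read below lands right after it.
     */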
850 m->m_data += ETH_HLEN;
851 m->m_pkthdr.header = mtod(m, void *);
852 m->m_data += sizeof(struct udpiphdr);
853
854 pchBuffer = mtod(m, char *);
855 fWithTemporalBuffer = false;
856 /*
857 * Even if the amount of bytes on the socket is greater than the MTU value,
858 * Slirp will be able to fragment it, but we won't create a temporary buffer
859 * here.
860 */
861 if (n > (slirp_size(pData) - sizeof(struct udpiphdr)))
862 {
863 pchBuffer = RTMemAlloc((n) * sizeof(char));
864 if (!pchBuffer)
865 {
866 m_freem(pData, m);
867 return;
868 }
869 fWithTemporalBuffer = true;
870 }
871 ret = recvfrom(so->s, pchBuffer, n, 0,
872 (struct sockaddr *)&addr, &addrlen);
873 if (fWithTemporalBuffer)
874 {
875 if (ret > 0)
876 {
877 m_copyback(pData, m, 0, ret, pchBuffer);
878 /*
879 * If the comparison below holds, our size prediction failed;
880 * that's not fatal, we've just allocated for nothing. (@todo add a counter here
881 * to see how rarely we get here)
882 */
883 if(ret < slirp_size(pData) && !m->m_next)
884 Log(("NAT:udp: Expected size(%d) lesser than real(%d) and less minimal mbuf size(%d)\n",
885 n, ret, slirp_size(pData)));
886 }
887 /* we're freeing buffer anyway */
888 RTMemFree(pchBuffer);
889 }
890 else
891 m->m_len = ret;
892
893 if (ret < 0)
894 {
895 u_char code = ICMP_UNREACH_PORT;
896
897 if (errno == EHOSTUNREACH)
898 code = ICMP_UNREACH_HOST;
899 else if (errno == ENETUNREACH)
900 code = ICMP_UNREACH_NET;
901
902 m_freem(pData, m);
903 if ( errno == EAGAIN
904 || errno == EWOULDBLOCK
905 || errno == EINPROGRESS
906 || errno == ENOTCONN)
907 {
908 return;
909 }
910
911 Log2((" rx error, tx icmp ICMP_UNREACH:%i\n", code));
912 icmp_error(pData, so->so_m, ICMP_UNREACH, code, 0, strerror(errno));
913 so->so_m = NULL;
914 }
915 else
916 {
917 Assert((m_length(m,NULL) == ret));
918 /*
919 * Hack: domain name lookup will be used the most for UDP,
920 * and since they'll only be used once there's no need
921 * for the 4 minute (or whatever) timeout... So we time them
922 * out much quicker (10 seconds for now...)
923 */
924 if (so->so_expire)
925 {
926 if (so->so_fport != RT_H2N_U16_C(53))
927 so->so_expire = curtime + SO_EXPIRE;
928 }
929 /*
930 * the last argument should be changed if Slirp ever injects IP attributes
931 * Note: here we can't check whether dnsproxy sent the initial request
932 */
933 if ( pData->fUseDnsProxy
934 && so->so_fport == RT_H2N_U16_C(53))
935 dnsproxy_answer(pData, so, m);
936
937#if 0
938 if (m->m_len == len)
939 {
940 m_inc(m, MINCSIZE);
941 m->m_len = 0;
942 }
943#endif
944
945 /* the packet will definitely be fragmented and could confuse the receiving peer. */
946 if (m_length(m, NULL) > if_mtu)
947 m->m_flags |= M_SKIP_FIREWALL;
948 /*
949 * If this packet was destined for CTL_ADDR,
950 * make it look like that's where it came from, done by udp_output
951 */
952 udp_output(pData, so, m, &addr);
953 SOCKET_UNLOCK(so);
954 } /* rx error */
955 } /* if ping packet */
956}
957
958/*
959 * sendto() a socket
960 */
961int
962sosendto(PNATState pData, struct socket *so, struct mbuf *m)
963{
964 int ret;
965 struct sockaddr_in *paddr;
966 struct sockaddr addr;
967#if 0
968 struct sockaddr_in host_addr;
969#endif
970 caddr_t buf = 0;
971 int mlen;
972
973 LogFlowFunc(("sosendto: so = %R[natsock], m = %lx\n", so, (long)m));
974
975 memset(&addr, 0, sizeof(struct sockaddr));
976#ifdef RT_OS_DARWIN
977 addr.sa_len = sizeof(struct sockaddr_in);
978#endif
979 paddr = (struct sockaddr_in *)&addr;
980 paddr->sin_family = AF_INET;
981 if ((so->so_faddr.s_addr & RT_H2N_U32(pData->netmask)) == pData->special_addr.s_addr)
982 {
983 /* It's an alias */
984 uint32_t last_byte = RT_N2H_U32(so->so_faddr.s_addr) & ~pData->netmask;
985 switch(last_byte)
986 {
987#if 0
988 /* handle this case at 'default:' */
989 case CTL_BROADCAST:
990 addr.sin_addr.s_addr = INADDR_BROADCAST;
991 /* Send the packet to host to fully emulate broadcast */
992 /** @todo r=klaus: on Linux host this causes the host to receive
993 * the packet twice for some reason. And I cannot find any place
994 * in the man pages which states that sending a broadcast does not
995 * reach the host itself. */
996 host_addr.sin_family = AF_INET;
997 host_addr.sin_port = so->so_fport;
998 host_addr.sin_addr = our_addr;
999 sendto(so->s, m->m_data, m->m_len, 0,
1000 (struct sockaddr *)&host_addr, sizeof (struct sockaddr));
1001 break;
1002#endif
1003 case CTL_DNS:
1004 case CTL_ALIAS:
1005 default:
1006 if (last_byte == ~pData->netmask)
1007 paddr->sin_addr.s_addr = INADDR_BROADCAST;
1008 else
1009 paddr->sin_addr = loopback_addr;
1010 break;
1011 }
1012 }
1013 else
1014 paddr->sin_addr = so->so_faddr;
1015 paddr->sin_port = so->so_fport;
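    /*
     * At this point paddr holds the real destination: foreign addresses on the
     * guest's special network were remapped above to the host loopback address
     * (or to INADDR_BROADCAST for the network's broadcast address); everything
     * else keeps the guest-specified foreign address and port.
     */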
1016
1017 Log2((" sendto()ing, addr.sin_port=%d, addr.sin_addr.s_addr=%.16s\n",
1018 RT_N2H_U16(paddr->sin_port), inet_ntoa(paddr->sin_addr)));
1019
1020 /* Don't care what port we get */
1021 /*
1022 * > nmap -sV -T4 -O -A -v -PU3483 255.255.255.255
1023 * generates bodyless messages, annoying the memory management system.
1024 */
1025 mlen = m_length(m, NULL);
1026 if (mlen > 0)
1027 {
1028 buf = RTMemAlloc(mlen);
1029 if (buf == NULL)
1030 {
1031 return -1;
1032 }
1033 m_copydata(m, 0, mlen, buf);
1034 }
1035 ret = sendto(so->s, buf, mlen, 0,
1036 (struct sockaddr *)&addr, sizeof (struct sockaddr));
1037#ifdef VBOX_WITH_NAT_SEND2HOME
1038 if (slirpIsWideCasting(pData, so->so_faddr.s_addr))
1039 {
1040 slirpSend2Home(pData, so, buf, mlen, 0);
1041 }
1042#endif
1043 if (buf)
1044 RTMemFree(buf);
1045 if (ret < 0)
1046 {
1047 Log2(("UDP: sendto fails (%s)\n", strerror(errno)));
1048 return -1;
1049 }
1050
1051 /*
1052 * Kill the socket if there's no reply in 4 minutes,
1053 * but only if it's an expirable socket
1054 */
1055 if (so->so_expire)
1056 so->so_expire = curtime + SO_EXPIRE;
1057 so->so_state = SS_ISFCONNECTED; /* So that it gets select()ed */
1058 return 0;
1059}
1060
1061/*
1062 * XXX This should really be tcp_listen
1063 */
1064struct socket *
1065solisten(PNATState pData, u_int32_t bind_addr, u_int port, u_int32_t laddr, u_int lport, int flags)
1066{
1067 struct sockaddr_in addr;
1068 struct socket *so;
1069 socklen_t addrlen = sizeof(addr);
1070 int s, opt = 1;
1071 int status;
1072
1073 LogFlowFunc(("solisten: port = %d, laddr = %x, lport = %d, flags = %x\n", port, laddr, lport, flags));
1074
1075 if ((so = socreate()) == NULL)
1076 {
1077 /* RTMemFree(so); Not sofree() ??? free(NULL) == NOP */
1078 return NULL;
1079 }
1080
1081 /* Don't tcp_attach... we don't need so_snd nor so_rcv */
1082 if ((so->so_tcpcb = tcp_newtcpcb(pData, so)) == NULL)
1083 {
1084 RTMemFree(so);
1085 return NULL;
1086 }
1087
1088 SOCKET_LOCK_CREATE(so);
1089 SOCKET_LOCK(so);
1090 QSOCKET_LOCK(tcb);
1091 insque(pData, so,&tcb);
1092 NSOCK_INC();
1093 QSOCKET_UNLOCK(tcb);
1094
1095 /*
1096 * SS_FACCEPTONCE sockets must time out.
1097 */
1098 if (flags & SS_FACCEPTONCE)
1099 so->so_tcpcb->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT*2;
1100
1101 so->so_state = (SS_FACCEPTCONN|flags);
1102 so->so_lport = lport; /* Kept in network format */
1103 so->so_laddr.s_addr = laddr; /* Ditto */
1104
1105 memset(&addr, 0, sizeof(addr));
1106#ifdef RT_OS_DARWIN
1107 addr.sin_len = sizeof(addr);
1108#endif
1109 addr.sin_family = AF_INET;
1110 addr.sin_addr.s_addr = bind_addr;
1111 addr.sin_port = port;
1112
1113 /**
1114 * changing listen(,1->SOMAXCONN) shouldn't be harmful for NAT's TCP/IP stack;
1115 * the kernel will choose the optimal value for the request queue length.
1116 * @note: MSDN recommends low (2-4) values for Bluetooth networking devices.
1117 */
1118 if ( ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0)
1119 || (setsockopt(s, SOL_SOCKET, SO_REUSEADDR,(char *)&opt, sizeof(int)) < 0)
1120 || (bind(s,(struct sockaddr *)&addr, sizeof(addr)) < 0)
1121 || (listen(s, pData->soMaxConn) < 0))
1122 {
1123#ifdef RT_OS_WINDOWS
1124 int tmperrno = WSAGetLastError(); /* Don't clobber the real reason we failed */
1125 closesocket(s);
1126 QSOCKET_LOCK(tcb);
1127 sofree(pData, so);
1128 QSOCKET_UNLOCK(tcb);
1129 /* Restore the real errno */
1130 WSASetLastError(tmperrno);
1131#else
1132 int tmperrno = errno; /* Don't clobber the real reason we failed */
1133 close(s);
1134 QSOCKET_LOCK(tcb);
1135 sofree(pData, so);
1136 QSOCKET_UNLOCK(tcb);
1137 /* Restore the real errno */
1138 errno = tmperrno;
1139#endif
1140 return NULL;
1141 }
1142 fd_nonblock(s);
1143 setsockopt(s, SOL_SOCKET, SO_OOBINLINE,(char *)&opt, sizeof(int));
1144
1145 getsockname(s,(struct sockaddr *)&addr,&addrlen);
1146 so->so_fport = addr.sin_port;
1147 /* set socket buffers */
1148 opt = pData->socket_rcv;
1149 status = setsockopt(s, SOL_SOCKET, SO_RCVBUF, (char *)&opt, sizeof(int));
1150 if (status < 0)
1151 {
1152 LogRel(("NAT: Error(%d) while setting RCV capacity to (%d)\n", errno, opt));
1153 goto no_sockopt;
1154 }
1155 opt = pData->socket_snd;
1156 status = setsockopt(s, SOL_SOCKET, SO_SNDBUF, (char *)&opt, sizeof(int));
1157 if (status < 0)
1158 {
1159 LogRel(("NAT: Error(%d) while setting SND capacity to (%d)\n", errno, opt));
1160 goto no_sockopt;
1161 }
1162no_sockopt:
1163 if (addr.sin_addr.s_addr == 0 || addr.sin_addr.s_addr == loopback_addr.s_addr)
1164 so->so_faddr = alias_addr;
1165 else
1166 so->so_faddr = addr.sin_addr;
1167
1168 so->s = s;
1169 SOCKET_UNLOCK(so);
1170 return so;
1171}
1172
1173/*
1174 * Data is available in so_rcv
1175 * Just write() the data to the socket
1176 * XXX not yet...
1177 * @todo do we really need this function, and what is it intended to do?
1178 */
1179void
1180sorwakeup(struct socket *so)
1181{
1182 NOREF(so);
1183#if 0
1184 sowrite(so);
1185 FD_CLR(so->s,&writefds);
1186#endif
1187}
1188
1189/*
1190 * Data has been freed in so_snd
1191 * We have room for a read() if we want to
1192 * For now, don't read, it'll be done in the main loop
1193 */
1194void
1195sowwakeup(struct socket *so)
1196{
1197 NOREF(so);
1198}
1199
1200/*
1201 * Various session state calls
1202 * XXX Should be #define's
1203 * The socket state stuff needs work, these often get called 2 or 3
1204 * times each when only 1 was needed
1205 */
1206void
1207soisfconnecting(struct socket *so)
1208{
1209 so->so_state &= ~(SS_NOFDREF|SS_ISFCONNECTED|SS_FCANTRCVMORE|
1210 SS_FCANTSENDMORE|SS_FWDRAIN);
1211 so->so_state |= SS_ISFCONNECTING; /* Clobber other states */
1212}
1213
1214void
1215soisfconnected(struct socket *so)
1216{
1217 LogFlowFunc(("ENTER: so:%R[natsock]\n", so));
1218 so->so_state &= ~(SS_ISFCONNECTING|SS_FWDRAIN|SS_NOFDREF);
1219 so->so_state |= SS_ISFCONNECTED; /* Clobber other states */
1220 LogFlowFunc(("LEAVE: so:%R[natsock]\n", so));
1221}
1222
1223void
1224sofcantrcvmore(struct socket *so)
1225{
1226 if ((so->so_state & SS_NOFDREF) == 0)
1227 {
1228 shutdown(so->s, 0);
1229 }
1230 so->so_state &= ~(SS_ISFCONNECTING);
1231 if (so->so_state & SS_FCANTSENDMORE)
1232 so->so_state = SS_NOFDREF; /* Don't select it */
1233 /* XXX close() here as well? */
1234 else
1235 so->so_state |= SS_FCANTRCVMORE;
1236}
1237
1238void
1239sofcantsendmore(struct socket *so)
1240{
1241 if ((so->so_state & SS_NOFDREF) == 0)
1242 shutdown(so->s, 1); /* send FIN to fhost */
1243
1244 so->so_state &= ~(SS_ISFCONNECTING);
1245 if (so->so_state & SS_FCANTRCVMORE)
1246 so->so_state = SS_NOFDREF; /* as above */
1247 else
1248 so->so_state |= SS_FCANTSENDMORE;
1249}
1250
1251void
1252soisfdisconnected(struct socket *so)
1253{
1254 NOREF(so);
1255#if 0
1256 so->so_state &= ~(SS_ISFCONNECTING|SS_ISFCONNECTED);
1257 close(so->s);
1258 so->so_state = SS_ISFDISCONNECTED;
1259 /*
1260 * XXX Do nothing ... ?
1261 */
1262#endif
1263}
1264
1265/*
1266 * Set write drain mode
1267 * Set CANTSENDMORE once all data has been write()n
1268 */
1269void
1270sofwdrain(struct socket *so)
1271{
1272 if (SBUF_LEN(&so->so_rcv))
1273 so->so_state |= SS_FWDRAIN;
1274 else
1275 sofcantsendmore(so);
1276}
1277
1278static void
1279send_icmp_to_guest(PNATState pData, char *buff, size_t len, const struct sockaddr_in *addr)
1280{
1281 struct ip *ip;
1282 uint32_t dst, src;
1283 char ip_copy[256];
1284 struct icmp *icp;
1285 int old_ip_len = 0;
1286 int hlen, original_hlen = 0;
1287 struct mbuf *m;
1288 struct icmp_msg *icm;
1289 uint8_t proto;
1290 int type = 0;
1291
1292 ip = (struct ip *)buff;
1293 /* Fix ip->ip_len to contain the total packet length including the header
1294 * in _host_ byte order for all OSes. On Darwin, that value already is in
1295 * host byte order. Solaris and Darwin report only the payload. */
1296#ifndef RT_OS_DARWIN
1297 ip->ip_len = RT_N2H_U16(ip->ip_len);
1298#endif
1299 hlen = (ip->ip_hl << 2);
1300#if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN)
1301 ip->ip_len += hlen;
1302#endif
1303 if (ip->ip_len < hlen + ICMP_MINLEN)
1304 {
1305 Log(("send_icmp_to_guest: ICMP header is too small to understand which type/subtype of the datagram\n"));
1306 return;
1307 }
1308 icp = (struct icmp *)((char *)ip + hlen);
1309
1310 Log(("ICMP:received msg(t:%d, c:%d)\n", icp->icmp_type, icp->icmp_code));
1311 if ( icp->icmp_type != ICMP_ECHOREPLY
1312 && icp->icmp_type != ICMP_TIMXCEED
1313 && icp->icmp_type != ICMP_UNREACH)
1314 {
1315 return;
1316 }
1317
1318 /*
1319 * ICMP_ECHOREPLY, ICMP_TIMXCEED, ICMP_UNREACH minimal header size is
1320 * ICMP_ECHOREPLY assuming data 0
1321 * icmp_{type(8), code(8), cksum(16),identifier(16),seqnum(16)}
1322 */
1323 if (ip->ip_len < hlen + 8)
1324 {
1325 Log(("send_icmp_to_guest: NAT accept ICMP_{ECHOREPLY, TIMXCEED, UNREACH} the minimum size is 64 (see rfc792)\n"));
1326 return;
1327 }
1328
1329 type = icp->icmp_type;
1330 if ( type == ICMP_TIMXCEED
1331 || type == ICMP_UNREACH)
1332 {
1333 /*
1334 * ICMP_TIMXCEED, ICMP_UNREACH minimal header size is
1335 * icmp_{type(8), code(8), cksum(16),unused(32)} + IP header + 64 bit of original datagram
1336 */
1337 if (ip->ip_len < hlen + 2*8 + sizeof(struct ip))
1338 {
1339 Log(("send_icmp_to_guest: NAT accept ICMP_{TIMXCEED, UNREACH} the minimum size of ipheader + 64 bit of data (see rfc792)\n"));
1340 return;
1341 }
1342 ip = &icp->icmp_ip;
1343 }
1344
1345 icm = icmp_find_original_mbuf(pData, ip);
1346 if (icm == NULL)
1347 {
1348 Log(("NAT: Can't find the corresponding packet for the received ICMP\n"));
1349 return;
1350 }
1351
1352 m = icm->im_m;
1353 Assert(m != NULL);
1354
1355 src = addr->sin_addr.s_addr;
1356 if (type == ICMP_ECHOREPLY)
1357 {
1358 struct ip *ip0 = mtod(m, struct ip *);
1359 struct icmp *icp0 = (struct icmp *)((char *)ip0 + (ip0->ip_hl << 2));
1360 if (icp0->icmp_type != ICMP_ECHO)
1361 {
1362 Log(("NAT: we haven't found echo for this reply\n"));
1363 return;
1364 }
1365 /*
1366 * While combining the buffer to send (see ip_icmp.c) we control the ICMP header only;
1367 * the IP header is assembled by the OS network stack. Our local copy of the IP header contains values
1368 * in host byte order, so no byte order conversion is required. The IP header fields are converted
1369 * in the ip_output0 routine only.
1370 */
1371 if ( (ip->ip_len - hlen)
1372 != (ip0->ip_len - (ip0->ip_hl << 2)))
1373 {
1374 Log(("NAT: ECHO(%d) lenght doesn't match ECHOREPLY(%d)\n",
1375 (ip->ip_len - hlen), (ip0->ip_len - (ip0->ip_hl << 2))));
1376 return;
1377 }
1378 }
1379
1380 /* ip points to the original IP header */
1381 ip = mtod(m, struct ip *);
1382 proto = ip->ip_p;
1383 /* Now ip points at the header we've sent from the guest */
1384 if ( icp->icmp_type == ICMP_TIMXCEED
1385 || icp->icmp_type == ICMP_UNREACH)
1386 {
1387 old_ip_len = (ip->ip_hl << 2) + 64;
1388 if (old_ip_len > sizeof(ip_copy))
1389 old_ip_len = sizeof(ip_copy);
1390 memcpy(ip_copy, ip, old_ip_len);
1391 }
1392
1393 /* source address from original IP packet*/
1394 dst = ip->ip_src.s_addr;
1395
1396 /* override the tail of the old packet */
1397 ip = mtod(m, struct ip *); /* ip is from the mbuf we've overridden */
1398 original_hlen = ip->ip_hl << 2;
1399 /* saves original ip header and options */
1400 m_copyback(pData, m, original_hlen, len - hlen, buff + hlen);
1401 ip->ip_len = m_length(m, NULL);
1402 ip->ip_p = IPPROTO_ICMP; /* the original packet could be whatever, but we respond via ICMP */
1403
1404 icp = (struct icmp *)((char *)ip + (ip->ip_hl << 2));
1405 type = icp->icmp_type;
1406 if ( type == ICMP_TIMXCEED
1407 || type == ICMP_UNREACH)
1408 {
1409 /* according to RFC 792, error messages require a copy of the initial IP header + 64 bits */
1410 memcpy(&icp->icmp_ip, ip_copy, old_ip_len);
1411 ip->ip_tos = ((ip->ip_tos & 0x1E) | 0xC0); /* high priority for errors */
1412 }
1413
1414 ip->ip_src.s_addr = src;
1415 ip->ip_dst.s_addr = dst;
1416 icmp_reflect(pData, m);
1417 LIST_REMOVE(icm, im_list);
1418 pData->cIcmpCacheSize--;
1419 /* Don't call m_free here*/
1420
1421 if ( type == ICMP_TIMXCEED
1422 || type == ICMP_UNREACH)
1423 {
1424 icm->im_so->so_m = NULL;
1425 switch (proto)
1426 {
1427 case IPPROTO_UDP:
1428 /*XXX: so->so_m already freed so we shouldn't call sofree */
1429 udp_detach(pData, icm->im_so);
1430 break;
1431 case IPPROTO_TCP:
1432 /*close tcp should be here */
1433 break;
1434 default:
1435 /* do nothing */
1436 break;
1437 }
1438 }
1439 RTMemFree(icm);
1440}
1441
1442#ifdef RT_OS_WINDOWS
1443static void
1444sorecvfrom_icmp_win(PNATState pData, struct socket *so)
1445{
1446 int len;
1447 int i;
1448 struct ip *ip;
1449 struct mbuf *m;
1450 struct icmp *icp;
1451 struct icmp_msg *icm;
1452 struct ip *ip_broken; /* ICMP returns header + 64 bit of packet */
1453 uint32_t src;
1454 ICMP_ECHO_REPLY *icr;
1455 int hlen = 0;
1456 int nbytes = 0;
1457 u_char code = ~0;
1458 int out_len;
1459 int size;
1460
1461 len = pData->pfIcmpParseReplies(pData->pvIcmpBuffer, pData->szIcmpBuffer);
1462 if (len < 0)
1463 {
1464 LogRel(("NAT: Error (%d) occurred on ICMP receiving\n", GetLastError()));
1465 return;
1466 }
1467 if (len == 0)
1468 return; /* no error */
1469
1470 icr = (ICMP_ECHO_REPLY *)pData->pvIcmpBuffer;
1471 for (i = 0; i < len; ++i)
1472 {
1473 LogFunc(("icr[%d] Data:%p, DataSize:%d\n",
1474 i, icr[i].Data, icr[i].DataSize));
1475 switch(icr[i].Status)
1476 {
1477 case IP_DEST_HOST_UNREACHABLE:
1478 code = (code != ~0 ? code : ICMP_UNREACH_HOST);
1479 case IP_DEST_NET_UNREACHABLE:
1480 code = (code != ~0 ? code : ICMP_UNREACH_NET);
1481 case IP_DEST_PROT_UNREACHABLE:
1482 code = (code != ~0 ? code : ICMP_UNREACH_PROTOCOL);
1483 /* UNREACH error inject here */
1484 case IP_DEST_PORT_UNREACHABLE:
1485 code = (code != ~0 ? code : ICMP_UNREACH_PORT);
1486 icmp_error(pData, so->so_m, ICMP_UNREACH, code, 0, "Error occurred!!!");
1487 so->so_m = NULL;
1488 break;
1489 case IP_SUCCESS: /* echo replied */
1490 out_len = ETH_HLEN + sizeof(struct ip) + 8;
1491 size;
1492 size = MCLBYTES;
1493 if (out_len < MSIZE)
1494 size = MCLBYTES;
1495 else if (out_len < MCLBYTES)
1496 size = MCLBYTES;
1497 else if (out_len < MJUM9BYTES)
1498 size = MJUM9BYTES;
1499 else if (out_len < MJUM16BYTES)
1500 size = MJUM16BYTES;
1501 else
1502 AssertMsgFailed(("Unsupported size"));
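            /*
             * The ladder above picks the smallest mbuf cluster class that can hold
             * out_len (note that the first two branches both select MCLBYTES); the
             * reply mbuf is then allocated with that size.
             */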
1503
1504 m = m_getjcl(pData, M_NOWAIT, MT_HEADER, M_PKTHDR, size);
1505 LogFunc(("m_getjcl returns m: %p\n", m));
1506 if (m == NULL)
1507 return;
1508 m->m_len = 0;
1509 m->m_data += if_maxlinkhdr;
1510 m->m_pkthdr.header = mtod(m, void *);
1511
1512 ip = mtod(m, struct ip *);
1513 ip->ip_src.s_addr = icr[i].Address;
1514 ip->ip_p = IPPROTO_ICMP;
1515 ip->ip_dst.s_addr = so->so_laddr.s_addr; /*XXX: still the hack*/
1516 ip->ip_hl = sizeof(struct ip) >> 2; /* required for icmp_reflect, no IP options */
1517 ip->ip_ttl = icr[i].Options.Ttl;
1518
1519 icp = (struct icmp *)&ip[1]; /* no options */
1520 icp->icmp_type = ICMP_ECHOREPLY;
1521 icp->icmp_code = 0;
1522 icp->icmp_id = so->so_icmp_id;
1523 icp->icmp_seq = so->so_icmp_seq;
1524
1525 icm = icmp_find_original_mbuf(pData, ip);
1526 if (icm)
1527 {
1528 /* on this branch we don't need stored variant */
1529 m_freem(pData, icm->im_m);
1530 LIST_REMOVE(icm, im_list);
1531 pData->cIcmpCacheSize--;
1532 RTMemFree(icm);
1533 }
1534
1535
1536 hlen = (ip->ip_hl << 2);
1537 Assert((hlen >= sizeof(struct ip)));
1538
1539 m->m_data += hlen + ICMP_MINLEN;
1540 if (!RT_VALID_PTR(icr[i].Data))
1541 {
1542 m_freem(pData, m);
1543 break;
1544 }
1545 m_copyback(pData, m, 0, icr[i].DataSize, icr[i].Data);
1546 m->m_data -= hlen + ICMP_MINLEN;
1547 m->m_len += hlen + ICMP_MINLEN;
1548
1549
1550 ip->ip_len = m_length(m, NULL);
1551 Assert((ip->ip_len == hlen + ICMP_MINLEN + icr[i].DataSize));
1552
1553 icmp_reflect(pData, m);
1554 break;
1555 case IP_TTL_EXPIRED_TRANSIT: /* TTL expired */
1556
1557 ip_broken = icr[i].Data;
1558 icm = icmp_find_original_mbuf(pData, ip_broken);
1559 if (icm == NULL) {
1560 Log(("ICMP: can't find original package (first double word %x)\n", *(uint32_t *)ip_broken));
1561 return;
1562 }
1563 m = icm->im_m;
1564 ip = mtod(m, struct ip *);
1565 Assert(((ip_broken->ip_hl >> 2) >= sizeof(struct ip)));
1566 ip->ip_ttl = icr[i].Options.Ttl;
1567 src = ip->ip_src.s_addr;
1568 ip->ip_dst.s_addr = src;
1569 ip->ip_dst.s_addr = icr[i].Address;
1570
1571 hlen = (ip->ip_hl << 2);
1572 icp = (struct icmp *)((char *)ip + hlen);
1573 ip_broken->ip_src.s_addr = src; /* the packet was sent from the host, not from the guest */
1574
1575 m->m_len = (ip_broken->ip_hl << 2) + 64;
1576 m->m_pkthdr.header = mtod(m, void *);
1577 m_copyback(pData, m, ip->ip_hl >> 2, icr[i].DataSize, icr[i].Data);
1578 icmp_reflect(pData, m);
1579 /* This differs from the Unix world, where we can receive ICMP in response to TCP/UDP */
1580 LIST_REMOVE(icm, im_list);
1581 pData->cIcmpCacheSize--;
1582 RTMemFree(icm);
1583 break;
1584 default:
1585 Log(("ICMP(default): message with Status: %x was received from %x\n", icr[i].Status, icr[i].Address));
1586 break;
1587 }
1588 }
1589}
1590#else /* !RT_OS_WINDOWS */
1591static void sorecvfrom_icmp_unix(PNATState pData, struct socket *so)
1592{
1593 struct sockaddr_in addr;
1594 socklen_t addrlen = sizeof(struct sockaddr_in);
1595 struct ip ip;
1596 char *buff;
1597 int len = 0;
1598
1599 /* 1- step: read the ip header */
1600 len = recvfrom(so->s, &ip, sizeof(struct ip), MSG_PEEK,
1601 (struct sockaddr *)&addr, &addrlen);
1602 if ( len < 0
1603 && ( errno == EAGAIN
1604 || errno == EWOULDBLOCK
1605 || errno == EINPROGRESS
1606 || errno == ENOTCONN))
1607 {
1608 Log(("sorecvfrom_icmp_unix: 1 - step can't read IP datagramm (would block)\n"));
1609 return;
1610 }
1611
1612 if ( len < sizeof(struct ip)
1613 || len < 0
1614 || len == 0)
1615 {
1616 u_char code;
1617 code = ICMP_UNREACH_PORT;
1618
1619 if (errno == EHOSTUNREACH)
1620 code = ICMP_UNREACH_HOST;
1621 else if (errno == ENETUNREACH)
1622 code = ICMP_UNREACH_NET;
1623
1624 LogRel((" udp icmp rx errno = %d (%s)\n", errno, strerror(errno)));
1625 icmp_error(pData, so->so_m, ICMP_UNREACH, code, 0, strerror(errno));
1626 so->so_m = NULL;
1627 Log(("sorecvfrom_icmp_unix: 1 - step can't read IP datagramm\n"));
1628 return;
1629 }
1630 /* basic check of IP header */
1631 if ( ip.ip_v != IPVERSION
1632# ifndef RT_OS_DARWIN
1633 || ip.ip_p != IPPROTO_ICMP
1634# endif
1635 )
1636 {
1637 Log(("sorecvfrom_icmp_unix: 1 - step IP isn't IPv4\n"));
1638 return;
1639 }
1640# ifndef RT_OS_DARWIN
1641 /* Darwin reports the IP length already in host byte order. */
1642 ip.ip_len = RT_N2H_U16(ip.ip_len);
1643# endif
1644# if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN)
1645 /* Solaris and Darwin report the payload only */
1646 ip.ip_len += (ip.ip_hl << 2);
1647# endif
1648 /* Note: ip->ip_len in host byte order (all OS) */
1649 len = ip.ip_len;
1650 buff = RTMemAlloc(len);
1651 if (buff == NULL)
1652 {
1653 Log(("sorecvfrom_icmp_unix: 1 - step can't allocate enought room for datagram\n"));
1654 return;
1655 }
1656 /* 2 - step: we're reading the rest of the datagram into the buffer */
1657 addrlen = sizeof(struct sockaddr_in);
1658 memset(&addr, 0, addrlen);
1659 len = recvfrom(so->s, buff, len, 0,
1660 (struct sockaddr *)&addr, &addrlen);
1661 if ( len < 0
1662 && ( errno == EAGAIN
1663 || errno == EWOULDBLOCK
1664 || errno == EINPROGRESS
1665 || errno == ENOTCONN))
1666 {
1667 Log(("sorecvfrom_icmp_unix: 2 - step can't read IP body (would block expected:%d)\n",
1668 ip.ip_len));
1669 RTMemFree(buff);
1670 return;
1671 }
1672 if ( len < 0
1673 || len == 0)
1674 {
1675 Log(("sorecvfrom_icmp_unix: 2 - step read of the rest of datagramm is fallen (errno:%d, len:%d expected: %d)\n",
1676 errno, len, (ip.ip_len - sizeof(struct ip))));
1677 RTMemFree(buff);
1678 return;
1679 }
1680 /* len is modified in the 2nd read, when the rest of the datagram was read */
1681 send_icmp_to_guest(pData, buff, len, &addr);
1682 RTMemFree(buff);
1683}
1684#endif /* !RT_OS_WINDOWS */