VirtualBox

source: vbox/trunk/src/VBox/Devices/Network/slirp/socket.c@ 52798

Last change on this file since 52798 was 52798, checked in by vboxsync, 10 years ago

NAT: when an outgoing connect(2) fails, decide what to do based on the
errno. The code that calls icmp_error() is a bit icky because
tcp_input() vivisects the incoming datagram and the original mbuf can't
be used.
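
For context, the errno-based decision described above follows the same mapping that sorecvfrom() applies further down in this file when a UDP send fails. A minimal, hypothetical sketch of that mapping, assuming only the ICMP_UNREACH_* constants from ip_icmp.h, could look like this:

/* Illustrative sketch only, not part of socket.c: pick an ICMP "destination
 * unreachable" code from the errno of a failed socket operation, mirroring
 * the mapping used in sorecvfrom() below. */
#include <errno.h>
#include "ip_icmp.h"          /* ICMP_UNREACH_* constants */

static unsigned char icmp_unreach_code_from_errno(int err)
{
    switch (err)
    {
        case EHOSTUNREACH: return ICMP_UNREACH_HOST;  /* host unreachable */
        case ENETUNREACH:  return ICMP_UNREACH_NET;   /* network unreachable */
        default:           return ICMP_UNREACH_PORT;  /* catch-all: port unreachable */
    }
}

The chosen code would then be handed to icmp_error() together with the mbuf of the offending datagram.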

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 49.7 KB
 
1/* $Id: socket.c 52798 2014-09-21 21:19:38Z vboxsync $ */
2/** @file
3 * NAT - socket handling.
4 */
5
6/*
7 * Copyright (C) 2006-2012 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.alldomusa.eu.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 */
17
18/*
19 * This code is based on:
20 *
21 * Copyright (c) 1995 Danny Gasparovski.
22 *
23 * Please read the file COPYRIGHT for the
24 * terms and conditions of the copyright.
25 */
26
27#include <slirp.h>
28#include "ip_icmp.h"
29#include "main.h"
30#ifdef __sun__
31#include <sys/filio.h>
32#endif
33#include <VBox/vmm/pdmdrv.h>
34#if defined (RT_OS_WINDOWS)
35#include <iphlpapi.h>
36#include <icmpapi.h>
37#endif
38
39#if defined(DECLARE_IOVEC) && defined(RT_OS_WINDOWS)
40AssertCompileMembersSameSizeAndOffset(struct iovec, iov_base, WSABUF, buf);
41AssertCompileMembersSameSizeAndOffset(struct iovec, iov_len, WSABUF, len);
42#endif
43
44#ifdef VBOX_WITH_NAT_UDP_SOCKET_CLONE
45/**
46 * Clone pSo into a new UDP socket with the given foreign address (optionally binding a fresh host socket).
47 */
48struct socket * soCloneUDPSocketWithForegnAddr(PNATState pData, bool fBindSocket, struct socket *pSo, uint32_t u32ForeignAddr)
49{
50 struct socket *pNewSocket = NULL;
51 LogFlowFunc(("Enter: fBindSocket:%RTbool, so:%R[natsock], u32ForeignAddr:%RTnaipv4\n", fBindSocket, pSo, u32ForeignAddr));
52 pNewSocket = socreate();
53 if (!pNewSocket)
54 {
55 LogFunc(("Can't create socket\n"));
56 LogFlowFunc(("Leave: NULL\n"));
57 return NULL;
58 }
59 if (fBindSocket)
60 {
61 if (udp_attach(pData, pNewSocket, 0) <= 0)
62 {
63 sofree(pData, pNewSocket);
64 LogFunc(("Can't attach fresh created socket\n"));
65 return NULL;
66 }
67 }
68 else
69 {
70 pNewSocket->so_cloneOf = (struct socket *)pSo;
71 pNewSocket->s = pSo->s;
72 insque(pData, pNewSocket, &udb);
73 }
74 pNewSocket->so_laddr = pSo->so_laddr;
75 pNewSocket->so_lport = pSo->so_lport;
76 pNewSocket->so_faddr.s_addr = u32ForeignAddr;
77 pNewSocket->so_fport = pSo->so_fport;
78 pSo->so_cCloneCounter++;
79 LogFlowFunc(("Leave: %R[natsock]\n", pNewSocket));
80 return pNewSocket;
81}
82
83struct socket *soLookUpClonedUDPSocket(PNATState pData, const struct socket *pcSo, uint32_t u32ForeignAddress)
84{
85 struct socket *pSoClone = NULL;
86 LogFlowFunc(("Enter: pcSo:%R[natsock], u32ForeignAddress:%RTnaipv4\n", pcSo, u32ForeignAddress));
87 for (pSoClone = udb.so_next; pSoClone != &udb; pSoClone = pSoClone->so_next)
88 {
89 if ( pSoClone->so_cloneOf
90 && pSoClone->so_cloneOf == pcSo
91 && pSoClone->so_lport == pcSo->so_lport
92 && pSoClone->so_fport == pcSo->so_fport
93 && pSoClone->so_laddr.s_addr == pcSo->so_laddr.s_addr
94 && pSoClone->so_faddr.s_addr == u32ForeignAddress)
95 goto done;
96 }
97 pSoClone = NULL;
98done:
99 LogFlowFunc(("Leave: pSoClone: %R[natsock]\n", pSoClone));
100 return pSoClone;
101}
102#endif
103
104#ifdef VBOX_WITH_NAT_SEND2HOME
105DECLINLINE(bool) slirpSend2Home(PNATState pData, struct socket *pSo, const void *pvBuf, uint32_t cbBuf, int iFlags)
106{
107 int idxAddr;
108 int ret = 0;
109 bool fSendDone = false;
110 LogFlowFunc(("Enter pSo:%R[natsock] pvBuf: %p, cbBuf: %d, iFlags: %d\n", pSo, pvBuf, cbBuf, iFlags));
111 for (idxAddr = 0; idxAddr < pData->cInHomeAddressSize; ++idxAddr)
112 {
113
114 struct socket *pNewSocket = soCloneUDPSocketWithForegnAddr(pData, pSo, pData->pInSockAddrHomeAddress[idxAddr].sin_addr);
115 AssertReturn((pNewSocket, false));
116 pData->pInSockAddrHomeAddress[idxAddr].sin_port = pSo->so_fport;
117 /* @todo: be more verbose on errors,
118 * @note: we shouldn't care whether this send fails or not (we're broadcasting).
119 */
120 LogFunc(("send %d bytes to %RTnaipv4 from %R[natsock]\n", cbBuf, pData->pInSockAddrHomeAddress[idxAddr].sin_addr.s_addr, pNewSocket));
121 ret = sendto(pNewSocket->s, pvBuf, cbBuf, iFlags, (struct sockaddr *)&pData->pInSockAddrHomeAddress[idxAddr], sizeof(struct sockaddr_in));
122 if (ret < 0)
123 LogFunc(("Failed to send %d bytes to %RTnaipv4\n", cbBuf, pData->pInSockAddrHomeAddress[idxAddr].sin_addr.s_addr));
124 fSendDone |= ret > 0;
125 }
126 LogFlowFunc(("Leave %RTbool\n", fSendDone));
127 return fSendDone;
128}
129#endif /* !VBOX_WITH_NAT_SEND2HOME */
130static void send_icmp_to_guest(PNATState, char *, size_t, const struct sockaddr_in *);
131#ifdef RT_OS_WINDOWS
132static void sorecvfrom_icmp_win(PNATState, struct socket *);
133#else /* RT_OS_WINDOWS */
134static void sorecvfrom_icmp_unix(PNATState, struct socket *);
135#endif /* !RT_OS_WINDOWS */
136
137void
138so_init()
139{
140}
141
142struct socket *
143solookup(struct socket *head, struct in_addr laddr,
144 u_int lport, struct in_addr faddr, u_int fport)
145{
146 struct socket *so;
147
148 for (so = head->so_next; so != head; so = so->so_next)
149 {
150 if ( so->so_lport == lport
151 && so->so_laddr.s_addr == laddr.s_addr
152 && so->so_faddr.s_addr == faddr.s_addr
153 && so->so_fport == fport)
154 return so;
155 }
156
157 return (struct socket *)NULL;
158}
159
160/*
161 * Create a new socket, initialise the fields
162 * It is the responsibility of the caller to
163 * insque() it into the correct linked-list
164 */
165struct socket *
166socreate()
167{
168 struct socket *so;
169
170 so = (struct socket *)RTMemAllocZ(sizeof(struct socket));
171 if (so)
172 {
173 so->so_state = SS_NOFDREF;
174 so->s = -1;
175#if !defined(RT_OS_WINDOWS)
176 so->so_poll_index = -1;
177#endif
178 }
179 return so;
180}
181
182/*
183 * remque and free a socket, clobber cache
184 */
185void
186sofree(PNATState pData, struct socket *so)
187{
188 LogFlowFunc(("ENTER:%R[natsock]\n", so));
189 /*
190 * We should not remove the socket while the polling routine is doing the polling;
191 * instead we mark it for deletion.
192 */
193 if (so->fUnderPolling)
194 {
195 so->fShouldBeRemoved = 1;
196 LogFlowFunc(("LEAVE:%R[natsock] postponed deletion\n", so));
197 return;
198 }
199 /**
200 * Check that we aren't freeing a socket that still has a tcpcb
201 */
202 Assert(!sototcpcb(so));
203 /* udp checks */
204 Assert(!so->so_timeout);
205 Assert(!so->so_timeout_arg);
206 if (so == tcp_last_so)
207 tcp_last_so = &tcb;
208 else if (so == udp_last_so)
209 udp_last_so = &udb;
210
211 /* check if mbuf haven't been already freed */
212 if (so->so_m != NULL)
213 {
214 m_freem(pData, so->so_m);
215 so->so_m = NULL;
216 }
217
218 if (so->so_ohdr != NULL)
219 {
220 RTMemFree(so->so_ohdr);
221 so->so_ohdr = NULL;
222 }
223
224 if (so->so_next && so->so_prev)
225 {
226 remque(pData, so); /* crashes if so is not in a queue */
227 NSOCK_DEC();
228 }
229
230 RTMemFree(so);
231 LogFlowFuncLeave();
232}
233
234/*
235 * Read from so's socket into sb_snd, updating all relevant sbuf fields
236 * NOTE: This will only be called if it is select()ed for reading, so
237 * a read() of 0 (or less) means it's disconnected
238 */
239#ifndef VBOX_WITH_SLIRP_BSD_SBUF
240int
241soread(PNATState pData, struct socket *so)
242{
243 int n, nn, lss, total;
244 struct sbuf *sb = &so->so_snd;
245 size_t len = sb->sb_datalen - sb->sb_cc;
246 struct iovec iov[2];
247 int mss = so->so_tcpcb->t_maxseg;
248
249 STAM_PROFILE_START(&pData->StatIOread, a);
250 STAM_COUNTER_RESET(&pData->StatIORead_in_1);
251 STAM_COUNTER_RESET(&pData->StatIORead_in_2);
252
253 QSOCKET_LOCK(tcb);
254 SOCKET_LOCK(so);
255 QSOCKET_UNLOCK(tcb);
256
257 LogFlow(("soread: so = %R[natsock]\n", so));
258 Log2(("%s: so = %R[natsock] so->so_snd = %R[sbuf]\n", __PRETTY_FUNCTION__, so, sb));
259
260 /*
261 * No need to check if there's enough room to read.
262 * soread wouldn't have been called if there weren't
263 */
264
265 len = sb->sb_datalen - sb->sb_cc;
266
267 iov[0].iov_base = sb->sb_wptr;
268 iov[1].iov_base = 0;
269 iov[1].iov_len = 0;
270 if (sb->sb_wptr < sb->sb_rptr)
271 {
272 iov[0].iov_len = sb->sb_rptr - sb->sb_wptr;
273 /* Should never succeed, but... */
274 if (iov[0].iov_len > len)
275 iov[0].iov_len = len;
276 if (iov[0].iov_len > mss)
277 iov[0].iov_len -= iov[0].iov_len%mss;
278 n = 1;
279 }
280 else
281 {
282 iov[0].iov_len = (sb->sb_data + sb->sb_datalen) - sb->sb_wptr;
283 /* Should never succeed, but... */
284 if (iov[0].iov_len > len)
285 iov[0].iov_len = len;
286 len -= iov[0].iov_len;
287 if (len)
288 {
289 iov[1].iov_base = sb->sb_data;
290 iov[1].iov_len = sb->sb_rptr - sb->sb_data;
291 if (iov[1].iov_len > len)
292 iov[1].iov_len = len;
293 total = iov[0].iov_len + iov[1].iov_len;
294 if (total > mss)
295 {
296 lss = total % mss;
297 if (iov[1].iov_len > lss)
298 {
299 iov[1].iov_len -= lss;
300 n = 2;
301 }
302 else
303 {
304 lss -= iov[1].iov_len;
305 iov[0].iov_len -= lss;
306 n = 1;
307 }
308 }
309 else
310 n = 2;
311 }
312 else
313 {
314 if (iov[0].iov_len > mss)
315 iov[0].iov_len -= iov[0].iov_len%mss;
316 n = 1;
317 }
318 }
319
320#ifdef HAVE_READV
321 nn = readv(so->s, (struct iovec *)iov, n);
322#else
323 nn = recv(so->s, iov[0].iov_base, iov[0].iov_len, (so->so_tcpcb->t_force? MSG_OOB:0));
324#endif
325 Log2(("%s: read(1) nn = %d bytes\n", __PRETTY_FUNCTION__, nn));
326 Log2(("%s: so = %R[natsock] so->so_snd = %R[sbuf]\n", __PRETTY_FUNCTION__, so, sb));
327 if (nn <= 0)
328 {
329 /*
330 * Special case for WSAEnumNetworkEvents: If we receive 0 bytes that
331 * _could_ mean that the connection is closed. But we will receive an
332 * FD_CLOSE event later if the connection was _really_ closed. With
333 * www.youtube.com I see this very often. Closing the socket too early
334 * would be dangerous.
335 */
336 int status;
337 unsigned long pending = 0;
338 status = ioctlsocket(so->s, FIONREAD, &pending);
339 if (status < 0)
340 Log(("NAT:%s: error in WSAIoctl: %d\n", __PRETTY_FUNCTION__, errno));
341 if (nn == 0 && (pending != 0))
342 {
343 SOCKET_UNLOCK(so);
344 STAM_PROFILE_STOP(&pData->StatIOread, a);
345 return 0;
346 }
347 if ( nn < 0
348 && soIgnorableErrorCode(errno))
349 {
350 SOCKET_UNLOCK(so);
351 STAM_PROFILE_STOP(&pData->StatIOread, a);
352 return 0;
353 }
354 else
355 {
356 int fUninitiolizedTemplate = 0;
357 fUninitiolizedTemplate = RT_BOOL(( sototcpcb(so)
358 && ( sototcpcb(so)->t_template.ti_src.s_addr == INADDR_ANY
359 || sototcpcb(so)->t_template.ti_dst.s_addr == INADDR_ANY)));
360 /* nn == 0 means peer has performed an orderly shutdown */
361 Log2(("%s: disconnected, nn = %d, errno = %d (%s)\n",
362 __PRETTY_FUNCTION__, nn, errno, strerror(errno)));
363 sofcantrcvmore(so);
364 if (!fUninitiolizedTemplate)
365 tcp_sockclosed(pData, sototcpcb(so));
366 else
367 tcp_drop(pData, sototcpcb(so), errno);
368 SOCKET_UNLOCK(so);
369 STAM_PROFILE_STOP(&pData->StatIOread, a);
370 return -1;
371 }
372 }
373 STAM_STATS(
374 if (n == 1)
375 {
376 STAM_COUNTER_INC(&pData->StatIORead_in_1);
377 STAM_COUNTER_ADD(&pData->StatIORead_in_1_bytes, nn);
378 }
379 else
380 {
381 STAM_COUNTER_INC(&pData->StatIORead_in_2);
382 STAM_COUNTER_ADD(&pData->StatIORead_in_2_1st_bytes, nn);
383 }
384 );
385
386#ifndef HAVE_READV
387 /*
388 * If there was no error, try and read the second time round
389 * We read again if n = 2 (ie, there's another part of the buffer)
390 * and we read as much as we could in the first read
391 * We don't test for <= 0 this time, because there legitimately
392 * might not be any more data (since the socket is non-blocking),
393 * a close will be detected on next iteration.
394 * A return of -1 wont (shouldn't) happen, since it didn't happen above
395 */
396 if (n == 2 && nn == iov[0].iov_len)
397 {
398 int ret;
399 ret = recv(so->s, iov[1].iov_base, iov[1].iov_len, 0);
400 if (ret > 0)
401 nn += ret;
402 STAM_STATS(
403 if (ret > 0)
404 {
405 STAM_COUNTER_INC(&pData->StatIORead_in_2);
406 STAM_COUNTER_ADD(&pData->StatIORead_in_2_2nd_bytes, ret);
407 }
408 );
409 }
410
411 Log2(("%s: read(2) nn = %d bytes\n", __PRETTY_FUNCTION__, nn));
412#endif
413
414 /* Update fields */
415 sb->sb_cc += nn;
416 sb->sb_wptr += nn;
417 Log2(("%s: update so_snd (readed nn = %d) %R[sbuf]\n", __PRETTY_FUNCTION__, nn, sb));
418 if (sb->sb_wptr >= (sb->sb_data + sb->sb_datalen))
419 {
420 sb->sb_wptr -= sb->sb_datalen;
421 Log2(("%s: alter sb_wptr so_snd = %R[sbuf]\n", __PRETTY_FUNCTION__, sb));
422 }
423 STAM_PROFILE_STOP(&pData->StatIOread, a);
424 SOCKET_UNLOCK(so);
425 return nn;
426}
427#else /* VBOX_WITH_SLIRP_BSD_SBUF */
428int
429soread(PNATState pData, struct socket *so)
430{
431 int n;
432 char *buf;
433 struct sbuf *sb = &so->so_snd;
434 size_t len = sbspace(sb);
435 int mss = so->so_tcpcb->t_maxseg;
436
437 STAM_PROFILE_START(&pData->StatIOread, a);
438 STAM_COUNTER_RESET(&pData->StatIORead_in_1);
439 STAM_COUNTER_RESET(&pData->StatIORead_in_2);
440
441 QSOCKET_LOCK(tcb);
442 SOCKET_LOCK(so);
443 QSOCKET_UNLOCK(tcb);
444
445 LogFlowFunc(("soread: so = %lx\n", (long)so));
446
447 if (len > mss)
448 len -= len % mss;
449 buf = RTMemAlloc(len);
450 if (buf == NULL)
451 {
452 Log(("NAT: can't alloc enough memory\n"));
453 return -1;
454 }
455
456 n = recv(so->s, buf, len, (so->so_tcpcb->t_force? MSG_OOB:0));
457 if (n <= 0)
458 {
459 /*
460 * Special case for WSAEnumNetworkEvents: If we receive 0 bytes that
461 * _could_ mean that the connection is closed. But we will receive an
462 * FD_CLOSE event later if the connection was _really_ closed. With
463 * www.youtube.com I see this very often. Closing the socket too early
464 * would be dangerous.
465 */
466 int status;
467 unsigned long pending = 0;
468 status = ioctlsocket(so->s, FIONREAD, &pending);
469 if (status < 0)
470 Log(("NAT:error in WSAIoctl: %d\n", errno));
471 if (n == 0 && (pending != 0))
472 {
473 SOCKET_UNLOCK(so);
474 STAM_PROFILE_STOP(&pData->StatIOread, a);
475 RTMemFree(buf);
476 return 0;
477 }
478 if ( n < 0
479 && soIgnorableErrorCode(errno))
480 {
481 SOCKET_UNLOCK(so);
482 STAM_PROFILE_STOP(&pData->StatIOread, a);
483 RTMemFree(buf);
484 return 0;
485 }
486 else
487 {
488 Log2((" --- soread() disconnected, n = %d, errno = %d (%s)\n",
489 n, errno, strerror(errno)));
490 sofcantrcvmore(so);
491 tcp_sockclosed(pData, sototcpcb(so));
492 SOCKET_UNLOCK(so);
493 STAM_PROFILE_STOP(&pData->StatIOread, a);
494 RTMemFree(buf);
495 return -1;
496 }
497 }
498
499 sbuf_bcat(sb, buf, n);
500 RTMemFree(buf);
501 return n;
502}
503#endif
504
505/*
506 * Get urgent data
507 *
508 * When the socket is created, we set it SO_OOBINLINE,
509 * so when OOB data arrives, we soread() it and everything
510 * in the send buffer is sent as urgent data
511 */
512void
513sorecvoob(PNATState pData, struct socket *so)
514{
515 struct tcpcb *tp = sototcpcb(so);
516 ssize_t ret;
517
518 LogFlowFunc(("sorecvoob: so = %R[natsock]\n", so));
519
520 /*
521 * We take a guess at how much urgent data has arrived.
522 * In most situations, when urgent data arrives, the next
523 * read() should get all the urgent data. This guess will
524 * be wrong however if more data arrives just after the
525 * urgent data, or the read() doesn't return all the
526 * urgent data.
527 */
528 ret = soread(pData, so);
529 if (RT_LIKELY(ret > 0))
530 {
531 tp->snd_up = tp->snd_una + SBUF_LEN(&so->so_snd);
532 tp->t_force = 1;
533 tcp_output(pData, tp);
534 tp->t_force = 0;
535 }
536}
537#ifndef VBOX_WITH_SLIRP_BSD_SBUF
538/*
539 * Send urgent data
540 * There's a lot of duplicated code here, but...
541 */
542int
543sosendoob(struct socket *so)
544{
545 struct sbuf *sb = &so->so_rcv;
546 char buff[2048]; /* XXX Shouldn't be sending more oob data than this */
547
548 int n, len;
549
550 LogFlowFunc(("sosendoob so = %R[natsock]\n", so));
551
552 if (so->so_urgc > sizeof(buff))
553 so->so_urgc = sizeof(buff); /* XXX */
554
555 if (sb->sb_rptr < sb->sb_wptr)
556 {
557 /* We can send it directly */
558 n = send(so->s, sb->sb_rptr, so->so_urgc, (MSG_OOB)); /* |MSG_DONTWAIT)); */
559 so->so_urgc -= n;
560
561 Log2((" --- sent %d bytes urgent data, %d urgent bytes left\n",
562 n, so->so_urgc));
563 }
564 else
565 {
566 /*
567 * Since there's no sendv or sendtov like writev,
568 * we must copy all data to a linear buffer then
569 * send it all
570 */
571 len = (sb->sb_data + sb->sb_datalen) - sb->sb_rptr;
572 if (len > so->so_urgc)
573 len = so->so_urgc;
574 memcpy(buff, sb->sb_rptr, len);
575 so->so_urgc -= len;
576 if (so->so_urgc)
577 {
578 n = sb->sb_wptr - sb->sb_data;
579 if (n > so->so_urgc)
580 n = so->so_urgc;
581 memcpy(buff + len, sb->sb_data, n);
582 so->so_urgc -= n;
583 len += n;
584 }
585 n = send(so->s, buff, len, (MSG_OOB)); /* |MSG_DONTWAIT)); */
586#ifdef DEBUG
587 if (n != len)
588 Log(("Didn't send all data urgently XXXXX\n"));
589#endif
590 Log2((" ---2 sent %d bytes urgent data, %d urgent bytes left\n",
591 n, so->so_urgc));
592 }
593
594 sb->sb_cc -= n;
595 sb->sb_rptr += n;
596 if (sb->sb_rptr >= (sb->sb_data + sb->sb_datalen))
597 sb->sb_rptr -= sb->sb_datalen;
598
599 return n;
600}
601
602/*
603 * Write data from so_rcv to so's socket,
604 * updating all sbuf field as necessary
605 */
606int
607sowrite(PNATState pData, struct socket *so)
608{
609 int n, nn;
610 struct sbuf *sb = &so->so_rcv;
611 size_t len = sb->sb_cc;
612 struct iovec iov[2];
613
614 STAM_PROFILE_START(&pData->StatIOwrite, a);
615 STAM_COUNTER_RESET(&pData->StatIOWrite_in_1);
616 STAM_COUNTER_RESET(&pData->StatIOWrite_in_1_bytes);
617 STAM_COUNTER_RESET(&pData->StatIOWrite_in_2);
618 STAM_COUNTER_RESET(&pData->StatIOWrite_in_2_1st_bytes);
619 STAM_COUNTER_RESET(&pData->StatIOWrite_in_2_2nd_bytes);
620 STAM_COUNTER_RESET(&pData->StatIOWrite_no_w);
621 STAM_COUNTER_RESET(&pData->StatIOWrite_rest);
622 STAM_COUNTER_RESET(&pData->StatIOWrite_rest_bytes);
623 LogFlowFunc(("so = %R[natsock]\n", so));
624 Log2(("%s: so = %R[natsock] so->so_rcv = %R[sbuf]\n", __PRETTY_FUNCTION__, so, sb));
625 QSOCKET_LOCK(tcb);
626 SOCKET_LOCK(so);
627 QSOCKET_UNLOCK(tcb);
628 if (so->so_urgc)
629 {
630 sosendoob(so);
631 if (sb->sb_cc == 0)
632 {
633 SOCKET_UNLOCK(so);
634 STAM_PROFILE_STOP(&pData->StatIOwrite, a);
635 return 0;
636 }
637 }
638
639 /*
640 * No need to check if there's something to write,
641 * sowrite wouldn't have been called otherwise
642 */
643
644 len = sb->sb_cc;
645
646 iov[0].iov_base = sb->sb_rptr;
647 iov[1].iov_base = 0;
648 iov[1].iov_len = 0;
649 if (sb->sb_rptr < sb->sb_wptr)
650 {
651 iov[0].iov_len = sb->sb_wptr - sb->sb_rptr;
652 /* Should never succeed, but... */
653 if (iov[0].iov_len > len)
654 iov[0].iov_len = len;
655 n = 1;
656 }
657 else
658 {
659 iov[0].iov_len = (sb->sb_data + sb->sb_datalen) - sb->sb_rptr;
660 if (iov[0].iov_len > len)
661 iov[0].iov_len = len;
662 len -= iov[0].iov_len;
663 if (len)
664 {
665 iov[1].iov_base = sb->sb_data;
666 iov[1].iov_len = sb->sb_wptr - sb->sb_data;
667 if (iov[1].iov_len > len)
668 iov[1].iov_len = len;
669 n = 2;
670 }
671 else
672 n = 1;
673 }
674 STAM_STATS({
675 if (n == 1)
676 {
677 STAM_COUNTER_INC(&pData->StatIOWrite_in_1);
678 STAM_COUNTER_ADD(&pData->StatIOWrite_in_1_bytes, iov[0].iov_len);
679 }
680 else
681 {
682 STAM_COUNTER_INC(&pData->StatIOWrite_in_2);
683 STAM_COUNTER_ADD(&pData->StatIOWrite_in_2_1st_bytes, iov[0].iov_len);
684 STAM_COUNTER_ADD(&pData->StatIOWrite_in_2_2nd_bytes, iov[1].iov_len);
685 }
686 });
687 /* Check if there's urgent data to send, and if so, send it */
688#ifdef HAVE_READV
689 nn = writev(so->s, (const struct iovec *)iov, n);
690#else
691 nn = send(so->s, iov[0].iov_base, iov[0].iov_len, 0);
692#endif
693 Log2(("%s: wrote(1) nn = %d bytes\n", __PRETTY_FUNCTION__, nn));
694 /* This should never happen, but people tell me it does *shrug* */
695 if ( nn < 0
696 && soIgnorableErrorCode(errno))
697 {
698 SOCKET_UNLOCK(so);
699 STAM_PROFILE_STOP(&pData->StatIOwrite, a);
700 return 0;
701 }
702
703 if (nn < 0 || (nn == 0 && iov[0].iov_len > 0))
704 {
705 Log2(("%s: disconnected, so->so_state = %x, errno = %d\n",
706 __PRETTY_FUNCTION__, so->so_state, errno));
707 sofcantsendmore(so);
708 tcp_sockclosed(pData, sototcpcb(so));
709 SOCKET_UNLOCK(so);
710 STAM_PROFILE_STOP(&pData->StatIOwrite, a);
711 return -1;
712 }
713
714#ifndef HAVE_READV
715 if (n == 2 && nn == iov[0].iov_len)
716 {
717 int ret;
718 ret = send(so->s, iov[1].iov_base, iov[1].iov_len, 0);
719 if (ret > 0)
720 nn += ret;
721 STAM_STATS({
722 if (ret > 0 && ret != iov[1].iov_len)
723 {
724 STAM_COUNTER_INC(&pData->StatIOWrite_rest);
725 STAM_COUNTER_ADD(&pData->StatIOWrite_rest_bytes, (iov[1].iov_len - ret));
726 }
727 });
728 }
729 Log2(("%s: wrote(2) nn = %d bytes\n", __PRETTY_FUNCTION__, nn));
730#endif
731
732 /* Update sbuf */
733 sb->sb_cc -= nn;
734 sb->sb_rptr += nn;
735 Log2(("%s: update so_rcv (written nn = %d) %R[sbuf]\n", __PRETTY_FUNCTION__, nn, sb));
736 if (sb->sb_rptr >= (sb->sb_data + sb->sb_datalen))
737 {
738 sb->sb_rptr -= sb->sb_datalen;
739 Log2(("%s: alter sb_rptr of so_rcv %R[sbuf]\n", __PRETTY_FUNCTION__, sb));
740 }
741
742 /*
743 * If in DRAIN mode, and there's no more data, set
744 * it CANTSENDMORE
745 */
746 if ((so->so_state & SS_FWDRAIN) && sb->sb_cc == 0)
747 sofcantsendmore(so);
748
749 SOCKET_UNLOCK(so);
750 STAM_PROFILE_STOP(&pData->StatIOwrite, a);
751 return nn;
752}
753#else /* VBOX_WITH_SLIRP_BSD_SBUF */
754static int
755do_sosend(struct socket *so, int fUrg)
756{
757 struct sbuf *sb = &so->so_rcv;
758
759 int n, len;
760
761 LogFlowFunc(("sosendoob: so = %R[natsock]\n", so));
762
763 len = sbuf_len(sb);
764
765 n = send(so->s, sbuf_data(sb), len, (fUrg ? MSG_OOB : 0));
766 if (n < 0)
767 Log(("NAT: Can't sent sbuf via socket.\n"));
768 if (fUrg)
769 so->so_urgc -= n;
770 if (n > 0 && n < len)
771 {
772 char *ptr;
773 char *buff;
774 buff = RTMemAlloc(len);
775 if (buff == NULL)
776 {
777 Log(("NAT: No space to allocate temporal buffer\n"));
778 return -1;
779 }
780 ptr = sbuf_data(sb);
781 memcpy(buff, &ptr[n], len - n);
782 sbuf_bcpy(sb, buff, len - n);
783 RTMemFree(buff);
784 return n;
785 }
786 sbuf_clear(sb);
787 return n;
788}
789int
790sosendoob(struct socket *so)
791{
792 return do_sosend(so, 1);
793}
794
795/*
796 * Write data from so_rcv to so's socket,
797 * updating all sbuf field as necessary
798 */
799int
800sowrite(PNATState pData, struct socket *so)
801{
802 return do_sosend(so, 0);
803}
804#endif
805
806/*
807 * recvfrom() a UDP socket
808 */
809void
810sorecvfrom(PNATState pData, struct socket *so)
811{
812 LogFlowFunc(("sorecvfrom: so = %lx\n", (long)so));
813
814 if (so->so_type == IPPROTO_ICMP)
815 {
816 /* This is a "ping" reply */
817#ifdef RT_OS_WINDOWS
818 sorecvfrom_icmp_win(pData, so);
819#else /* RT_OS_WINDOWS */
820 sorecvfrom_icmp_unix(pData, so);
821#endif /* !RT_OS_WINDOWS */
822 udp_detach(pData, so);
823 }
824 else
825 {
826 static uint8_t au8Buf[64 * 1024];
827
828 /* A "normal" UDP packet */
829 struct sockaddr_in addr;
830 socklen_t addrlen = sizeof(struct sockaddr_in);
831 struct iovec iov[2];
832 ssize_t nread;
833 struct mbuf *m;
834
835 QSOCKET_LOCK(udb);
836 SOCKET_LOCK(so);
837 QSOCKET_UNLOCK(udb);
838
839 m = m_getjcl(pData, M_NOWAIT, MT_HEADER, M_PKTHDR, slirp_size(pData));
840 if (m == NULL)
841 {
842 SOCKET_UNLOCK(so);
843 return;
844 }
845
846 m->m_data += ETH_HLEN;
847 m->m_pkthdr.header = mtod(m, void *);
848
849 m->m_data += sizeof(struct udpiphdr);
850
851 /* small packets will fit without copying */
852 iov[0].iov_base = mtod(m, char *);
853 iov[0].iov_len = M_TRAILINGSPACE(m);
854
855 /* large packets will spill into a temp buffer */
856 iov[1].iov_base = au8Buf;
857 iov[1].iov_len = sizeof(au8Buf);
858
859#if !defined(RT_OS_WINDOWS)
860 {
861 struct msghdr mh;
862 memset(&mh, 0, sizeof(mh));
863
864 mh.msg_iov = iov;
865 mh.msg_iovlen = 2;
866 mh.msg_name = &addr;
867 mh.msg_namelen = addrlen;
868
869 nread = recvmsg(so->s, &mh, 0);
870 }
871#else /* RT_OS_WINDOWS */
872 {
873 DWORD nbytes; /* NB: can't use nread b/c of different size */
874 DWORD flags;
875 int status;
876
877 flags = 0;
878 status = WSARecvFrom(so->s, iov, 2, &nbytes, &flags,
879 (struct sockaddr *)&addr, &addrlen,
880 NULL, NULL);
881 nread = (status != SOCKET_ERROR) ? nbytes : -1;
882 }
883#endif
884 if (nread >= 0)
885 {
886 if (nread <= iov[0].iov_len)
887 m->m_len = nread;
888 else
889 {
890 m->m_len = iov[0].iov_len;
891 m_append(pData, m, nread - iov[0].iov_len, iov[1].iov_base);
892 }
893 Assert((m_length(m, NULL) == nread));
894
895 /*
896 * Hack: domain name lookup will be used the most for UDP,
897 * and since they'll only be used once there's no need
898 * for the 4 minute (or whatever) timeout... So we time them
899 * out much quicker (10 seconds for now...)
900 */
901 if (so->so_expire)
902 {
903 if (so->so_fport != RT_H2N_U16_C(53))
904 so->so_expire = curtime + SO_EXPIRE;
905 }
906
907 /*
908 * the last argument should be changed if Slirp ever injects IP attributes
909 * Note: here we can't check whether dnsproxy sent the initial request
910 */
911 if ( pData->fUseDnsProxy
912 && so->so_fport == RT_H2N_U16_C(53))
913 dnsproxy_answer(pData, so, m);
914
915 /* packets will definitely be fragmented, which could confuse the receiving peer. */
916 if (nread > if_mtu)
917 m->m_flags |= M_SKIP_FIREWALL;
918
919 /*
920 * If this packet was destined for CTL_ADDR,
921 * make it look like that's where it came from, done by udp_output
922 */
923 udp_output(pData, so, m, &addr);
924 }
925 else
926 {
927 m_freem(pData, m);
928 so->so_m = NULL;
929
930 if (!soIgnorableErrorCode(errno))
931 {
932 u_char code;
933 if (errno == EHOSTUNREACH)
934 code = ICMP_UNREACH_HOST;
935 else if (errno == ENETUNREACH)
936 code = ICMP_UNREACH_NET;
937 else
938 code = ICMP_UNREACH_PORT;
939
940 Log2((" rx error, tx icmp ICMP_UNREACH:%i\n", code));
941 icmp_error(pData, so->so_m, ICMP_UNREACH, code, 0, strerror(errno));
942 }
943 }
944
945 SOCKET_UNLOCK(so);
946 }
947}
948
949/*
950 * sendto() a socket
951 */
952int
953sosendto(PNATState pData, struct socket *so, struct mbuf *m)
954{
955 int ret;
956 struct sockaddr_in *paddr;
957 struct sockaddr addr;
958#if 0
959 struct sockaddr_in host_addr;
960#endif
961 caddr_t buf = 0;
962 int mlen;
963
964 LogFlowFunc(("sosendto: so = %R[natsock], m = %lx\n", so, (long)m));
965
966 memset(&addr, 0, sizeof(struct sockaddr));
967#ifdef RT_OS_DARWIN
968 addr.sa_len = sizeof(struct sockaddr_in);
969#endif
970 paddr = (struct sockaddr_in *)&addr;
971 paddr->sin_family = AF_INET;
972 if ((so->so_faddr.s_addr & RT_H2N_U32(pData->netmask)) == pData->special_addr.s_addr)
973 {
974 /* It's an alias */
975 uint32_t last_byte = RT_N2H_U32(so->so_faddr.s_addr) & ~pData->netmask;
976 switch(last_byte)
977 {
978#if 0
979 /* handle this case at 'default:' */
980 case CTL_BROADCAST:
981 addr.sin_addr.s_addr = INADDR_BROADCAST;
982 /* Send the packet to host to fully emulate broadcast */
983 /** @todo r=klaus: on Linux host this causes the host to receive
984 * the packet twice for some reason. And I cannot find any place
985 * in the man pages which states that sending a broadcast does not
986 * reach the host itself. */
987 host_addr.sin_family = AF_INET;
988 host_addr.sin_port = so->so_fport;
989 host_addr.sin_addr = our_addr;
990 sendto(so->s, m->m_data, m->m_len, 0,
991 (struct sockaddr *)&host_addr, sizeof (struct sockaddr));
992 break;
993#endif
994 case CTL_DNS:
995 case CTL_ALIAS:
996 default:
997 if (last_byte == ~pData->netmask)
998 paddr->sin_addr.s_addr = INADDR_BROADCAST;
999 else
1000 paddr->sin_addr = loopback_addr;
1001 break;
1002 }
1003 }
1004 else
1005 paddr->sin_addr = so->so_faddr;
1006 paddr->sin_port = so->so_fport;
1007
1008 Log2((" sendto()ing, addr.sin_port=%d, addr.sin_addr.s_addr=%.16s\n",
1009 RT_N2H_U16(paddr->sin_port), inet_ntoa(paddr->sin_addr)));
1010
1011 /* Don't care what port we get */
1012 /*
1013 * > nmap -sV -T4 -O -A -v -PU3483 255.255.255.255
1014 * generates bodyless messages, which annoys the memory management system.
1015 */
1016 mlen = m_length(m, NULL);
1017 if (mlen > 0)
1018 {
1019 buf = RTMemAlloc(mlen);
1020 if (buf == NULL)
1021 {
1022 return -1;
1023 }
1024 m_copydata(m, 0, mlen, buf);
1025 }
1026 ret = sendto(so->s, buf, mlen, 0,
1027 (struct sockaddr *)&addr, sizeof (struct sockaddr));
1028#ifdef VBOX_WITH_NAT_SEND2HOME
1029 if (slirpIsWideCasting(pData, so->so_faddr.s_addr))
1030 {
1031 slirpSend2Home(pData, so, buf, mlen, 0);
1032 }
1033#endif
1034 if (buf)
1035 RTMemFree(buf);
1036 if (ret < 0)
1037 {
1038 Log2(("UDP: sendto fails (%s)\n", strerror(errno)));
1039 return -1;
1040 }
1041
1042 /*
1043 * Kill the socket if there's no reply in 4 minutes,
1044 * but only if it's an expirable socket
1045 */
1046 if (so->so_expire)
1047 so->so_expire = curtime + SO_EXPIRE;
1048 so->so_state = SS_ISFCONNECTED; /* So that it gets select()ed */
1049 return 0;
1050}
1051
1052/*
1053 * XXX This should really be tcp_listen
1054 */
1055struct socket *
1056solisten(PNATState pData, u_int32_t bind_addr, u_int port, u_int32_t laddr, u_int lport, int flags)
1057{
1058 struct sockaddr_in addr;
1059 struct socket *so;
1060 socklen_t addrlen = sizeof(addr);
1061 int s, opt = 1;
1062 int status;
1063
1064 LogFlowFunc(("solisten: port = %d, laddr = %x, lport = %d, flags = %x\n", port, laddr, lport, flags));
1065
1066 if ((so = socreate()) == NULL)
1067 {
1068 /* RTMemFree(so); Not sofree() ??? free(NULL) == NOP */
1069 return NULL;
1070 }
1071
1072 /* Don't tcp_attach... we don't need so_snd nor so_rcv */
1073 if ((so->so_tcpcb = tcp_newtcpcb(pData, so)) == NULL)
1074 {
1075 RTMemFree(so);
1076 return NULL;
1077 }
1078
1079 SOCKET_LOCK_CREATE(so);
1080 SOCKET_LOCK(so);
1081 QSOCKET_LOCK(tcb);
1082 insque(pData, so,&tcb);
1083 NSOCK_INC();
1084 QSOCKET_UNLOCK(tcb);
1085
1086 /*
1087 * SS_FACCEPTONCE sockets must time out.
1088 */
1089 if (flags & SS_FACCEPTONCE)
1090 so->so_tcpcb->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT*2;
1091
1092 so->so_state = (SS_FACCEPTCONN|flags);
1093 so->so_lport = lport; /* Kept in network format */
1094 so->so_laddr.s_addr = laddr; /* Ditto */
1095
1096 memset(&addr, 0, sizeof(addr));
1097#ifdef RT_OS_DARWIN
1098 addr.sin_len = sizeof(addr);
1099#endif
1100 addr.sin_family = AF_INET;
1101 addr.sin_addr.s_addr = bind_addr;
1102 addr.sin_port = port;
1103
1104 /**
1105 * Changing listen(,1->SOMAXCONN) shouldn't be harmful to the NAT's TCP/IP stack;
1106 * the kernel will choose the optimal length for the request queue.
1107 * @note: MSDN recommends low (2-4) values for Bluetooth networking devices.
1108 */
1109 if ( ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0)
1110 || (setsockopt(s, SOL_SOCKET, SO_REUSEADDR,(char *)&opt, sizeof(int)) < 0)
1111 || (bind(s,(struct sockaddr *)&addr, sizeof(addr)) < 0)
1112 || (listen(s, pData->soMaxConn) < 0))
1113 {
1114#ifdef RT_OS_WINDOWS
1115 int tmperrno = WSAGetLastError(); /* Don't clobber the real reason we failed */
1116 closesocket(s);
1117 QSOCKET_LOCK(tcb);
1118 sofree(pData, so);
1119 QSOCKET_UNLOCK(tcb);
1120 /* Restore the real errno */
1121 WSASetLastError(tmperrno);
1122#else
1123 int tmperrno = errno; /* Don't clobber the real reason we failed */
1124 close(s);
1125 if (sototcpcb(so))
1126 tcp_close(pData, sototcpcb(so));
1127 else
1128 sofree(pData, so);
1129 /* Restore the real errno */
1130 errno = tmperrno;
1131#endif
1132 return NULL;
1133 }
1134 fd_nonblock(s);
1135 setsockopt(s, SOL_SOCKET, SO_OOBINLINE,(char *)&opt, sizeof(int));
1136
1137 getsockname(s,(struct sockaddr *)&addr,&addrlen);
1138 so->so_fport = addr.sin_port;
1139 /* set socket buffers */
1140 opt = pData->socket_rcv;
1141 status = setsockopt(s, SOL_SOCKET, SO_RCVBUF, (char *)&opt, sizeof(int));
1142 if (status < 0)
1143 {
1144 LogRel(("NAT: Error(%d) while setting RCV capacity to (%d)\n", errno, opt));
1145 goto no_sockopt;
1146 }
1147 opt = pData->socket_snd;
1148 status = setsockopt(s, SOL_SOCKET, SO_SNDBUF, (char *)&opt, sizeof(int));
1149 if (status < 0)
1150 {
1151 LogRel(("NAT: Error(%d) while setting SND capacity to (%d)\n", errno, opt));
1152 goto no_sockopt;
1153 }
1154no_sockopt:
1155 if (addr.sin_addr.s_addr == 0 || addr.sin_addr.s_addr == loopback_addr.s_addr)
1156 so->so_faddr = alias_addr;
1157 else
1158 so->so_faddr = addr.sin_addr;
1159
1160 so->s = s;
1161 SOCKET_UNLOCK(so);
1162 return so;
1163}
1164
1165/*
1166 * Data is available in so_rcv
1167 * Just write() the data to the socket
1168 * XXX not yet...
1169 * @todo do we really need this function, what it's intended to do?
1170 */
1171void
1172sorwakeup(struct socket *so)
1173{
1174 NOREF(so);
1175#if 0
1176 sowrite(so);
1177 FD_CLR(so->s,&writefds);
1178#endif
1179}
1180
1181/*
1182 * Data has been freed in so_snd
1183 * We have room for a read() if we want to
1184 * For now, don't read, it'll be done in the main loop
1185 */
1186void
1187sowwakeup(struct socket *so)
1188{
1189 NOREF(so);
1190}
1191
1192/*
1193 * Various session state calls
1194 * XXX Should be #define's
1195 * The socket state stuff needs work, these often get call 2 or 3
1196 * times each when only 1 was needed
1197 */
1198void
1199soisfconnecting(struct socket *so)
1200{
1201 so->so_state &= ~(SS_NOFDREF|SS_ISFCONNECTED|SS_FCANTRCVMORE|
1202 SS_FCANTSENDMORE|SS_FWDRAIN);
1203 so->so_state |= SS_ISFCONNECTING; /* Clobber other states */
1204}
1205
1206void
1207soisfconnected(struct socket *so)
1208{
1209 LogFlowFunc(("ENTER: so:%R[natsock]\n", so));
1210 so->so_state &= ~(SS_ISFCONNECTING|SS_FWDRAIN|SS_NOFDREF);
1211 so->so_state |= SS_ISFCONNECTED; /* Clobber other states */
1212 LogFlowFunc(("LEAVE: so:%R[natsock]\n", so));
1213}
1214
1215void
1216sofcantrcvmore(struct socket *so)
1217{
1218 LogFlowFunc(("ENTER: so:%R[natsock]\n", so));
1219 if ((so->so_state & SS_NOFDREF) == 0)
1220 {
1221 shutdown(so->s, 0);
1222 }
1223 so->so_state &= ~(SS_ISFCONNECTING);
1224 if (so->so_state & SS_FCANTSENDMORE)
1225 so->so_state = SS_NOFDREF; /* Don't select it */
1226 /* XXX close() here as well? */
1227 else
1228 so->so_state |= SS_FCANTRCVMORE;
1229 LogFlowFuncLeave();
1230}
1231
1232void
1233sofcantsendmore(struct socket *so)
1234{
1235 LogFlowFunc(("ENTER: so:%R[natsock]\n", so));
1236 if ((so->so_state & SS_NOFDREF) == 0)
1237 shutdown(so->s, 1); /* send FIN to fhost */
1238
1239 so->so_state &= ~(SS_ISFCONNECTING);
1240 if (so->so_state & SS_FCANTRCVMORE)
1241 so->so_state = SS_NOFDREF; /* as above */
1242 else
1243 so->so_state |= SS_FCANTSENDMORE;
1244 LogFlowFuncLeave();
1245}
1246
1247void
1248soisfdisconnected(struct socket *so)
1249{
1250 NOREF(so);
1251#if 0
1252 so->so_state &= ~(SS_ISFCONNECTING|SS_ISFCONNECTED);
1253 close(so->s);
1254 so->so_state = SS_ISFDISCONNECTED;
1255 /*
1256 * XXX Do nothing ... ?
1257 */
1258#endif
1259}
1260
1261/*
1262 * Set write drain mode
1263 * Set CANTSENDMORE once all data has been write()n
1264 */
1265void
1266sofwdrain(struct socket *so)
1267{
1268 if (SBUF_LEN(&so->so_rcv))
1269 so->so_state |= SS_FWDRAIN;
1270 else
1271 sofcantsendmore(so);
1272}
1273
1274static void
1275send_icmp_to_guest(PNATState pData, char *buff, size_t len, const struct sockaddr_in *addr)
1276{
1277 struct ip *ip;
1278 uint32_t dst, src;
1279 char ip_copy[256];
1280 struct icmp *icp;
1281 int old_ip_len = 0;
1282 int hlen, original_hlen = 0;
1283 struct mbuf *m;
1284 struct icmp_msg *icm;
1285 uint8_t proto;
1286 int type = 0;
1287
1288 ip = (struct ip *)buff;
1289 /* Fix ip->ip_len to contain the total packet length including the header
1290 * in _host_ byte order for all OSes. On Darwin, that value already is in
1291 * host byte order. Solaris and Darwin report only the payload. */
1292#ifndef RT_OS_DARWIN
1293 ip->ip_len = RT_N2H_U16(ip->ip_len);
1294#endif
1295 hlen = (ip->ip_hl << 2);
1296#if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN)
1297 ip->ip_len += hlen;
1298#endif
1299 if (ip->ip_len < hlen + ICMP_MINLEN)
1300 {
1301 Log(("send_icmp_to_guest: ICMP header is too small to understand which type/subtype of the datagram\n"));
1302 return;
1303 }
1304 icp = (struct icmp *)((char *)ip + hlen);
1305
1306 Log(("ICMP:received msg(t:%d, c:%d)\n", icp->icmp_type, icp->icmp_code));
1307 if ( icp->icmp_type != ICMP_ECHOREPLY
1308 && icp->icmp_type != ICMP_TIMXCEED
1309 && icp->icmp_type != ICMP_UNREACH)
1310 {
1311 return;
1312 }
1313
1314 /*
1315 * ICMP_ECHOREPLY, ICMP_TIMXCEED, ICMP_UNREACH minimal header size is
1316 * ICMP_ECHOREPLY assuming data 0
1317 * icmp_{type(8), code(8), cksum(16),identifier(16),seqnum(16)}
1318 */
1319 if (ip->ip_len < hlen + 8)
1320 {
1321 Log(("send_icmp_to_guest: NAT accept ICMP_{ECHOREPLY, TIMXCEED, UNREACH} the minimum size is 64 (see rfc792)\n"));
1322 return;
1323 }
1324
1325 type = icp->icmp_type;
1326 if ( type == ICMP_TIMXCEED
1327 || type == ICMP_UNREACH)
1328 {
1329 /*
1330 * ICMP_TIMXCEED, ICMP_UNREACH minimal header size is
1331 * icmp_{type(8), code(8), cksum(16),unused(32)} + IP header + 64 bit of original datagram
1332 */
1333 if (ip->ip_len < hlen + 2*8 + sizeof(struct ip))
1334 {
1335 Log(("send_icmp_to_guest: NAT accept ICMP_{TIMXCEED, UNREACH} the minimum size of ipheader + 64 bit of data (see rfc792)\n"));
1336 return;
1337 }
1338 ip = &icp->icmp_ip;
1339 }
1340
1341 icm = icmp_find_original_mbuf(pData, ip);
1342 if (icm == NULL)
1343 {
1344 Log(("NAT: Can't find the corresponding packet for the received ICMP\n"));
1345 return;
1346 }
1347
1348 m = icm->im_m;
1349 if (!m)
1350 {
1351 LogFunc(("%R[natsock] hasn't stored it's mbuf on sent\n", icm->im_so));
1352 LIST_REMOVE(icm, im_list);
1353 RTMemFree(icm);
1354 return;
1355 }
1356
1357 src = addr->sin_addr.s_addr;
1358 if (type == ICMP_ECHOREPLY)
1359 {
1360 struct ip *ip0 = mtod(m, struct ip *);
1361 struct icmp *icp0 = (struct icmp *)((char *)ip0 + (ip0->ip_hl << 2));
1362 if (icp0->icmp_type != ICMP_ECHO)
1363 {
1364 Log(("NAT: we haven't found echo for this reply\n"));
1365 return;
1366 }
1367 /*
1368 * While combining the buffer to send (see ip_icmp.c) we control the ICMP header only;
1369 * the IP header is assembled by the OS network stack. Our local copy of the IP header contains values
1370 * in host byte order, so no byte order conversion is required. IP header fields are converted
1371 * in the ip_output0 routine only.
1372 */
1373 if ( (ip->ip_len - hlen)
1374 != (ip0->ip_len - (ip0->ip_hl << 2)))
1375 {
1376 Log(("NAT: ECHO(%d) lenght doesn't match ECHOREPLY(%d)\n",
1377 (ip->ip_len - hlen), (ip0->ip_len - (ip0->ip_hl << 2))));
1378 return;
1379 }
1380 }
1381
1382 /* ip points at the original IP header */
1383 ip = mtod(m, struct ip *);
1384 proto = ip->ip_p;
1385 /* Now ip points at the header we sent from the guest */
1386 if ( icp->icmp_type == ICMP_TIMXCEED
1387 || icp->icmp_type == ICMP_UNREACH)
1388 {
1389 old_ip_len = (ip->ip_hl << 2) + 64;
1390 if (old_ip_len > sizeof(ip_copy))
1391 old_ip_len = sizeof(ip_copy);
1392 memcpy(ip_copy, ip, old_ip_len);
1393 }
1394
1395 /* source address from original IP packet*/
1396 dst = ip->ip_src.s_addr;
1397
1398 /* override the tail of the old packet */
1399 ip = mtod(m, struct ip *); /* ip is from the mbuf we've overridden */
1400 original_hlen = ip->ip_hl << 2;
1401 /* saves original ip header and options */
1402 m_copyback(pData, m, original_hlen, len - hlen, buff + hlen);
1403 ip->ip_len = m_length(m, NULL);
1404 ip->ip_p = IPPROTO_ICMP; /* the original packet could be anything, but we respond via ICMP */
1405
1406 icp = (struct icmp *)((char *)ip + (ip->ip_hl << 2));
1407 type = icp->icmp_type;
1408 if ( type == ICMP_TIMXCEED
1409 || type == ICMP_UNREACH)
1410 {
1411 /* according to RFC 792, error messages must include a copy of the original IP header + 64 bits */
1412 memcpy(&icp->icmp_ip, ip_copy, old_ip_len);
1413 ip->ip_tos = ((ip->ip_tos & 0x1E) | 0xC0); /* high priority for errors */
1414 }
1415
1416 ip->ip_src.s_addr = src;
1417 ip->ip_dst.s_addr = dst;
1418 icmp_reflect(pData, m);
1419 LIST_REMOVE(icm, im_list);
1420 pData->cIcmpCacheSize--;
1421 /* Don't call m_free here*/
1422
1423 if ( type == ICMP_TIMXCEED
1424 || type == ICMP_UNREACH)
1425 {
1426 icm->im_so->so_m = NULL;
1427 switch (proto)
1428 {
1429 case IPPROTO_UDP:
1430 /*XXX: so->so_m already freed so we shouldn't call sofree */
1431 udp_detach(pData, icm->im_so);
1432 break;
1433 case IPPROTO_TCP:
1434 /* closing the TCP connection should be done here */
1435 break;
1436 default:
1437 /* do nothing */
1438 break;
1439 }
1440 }
1441 RTMemFree(icm);
1442}
1443
1444#ifdef RT_OS_WINDOWS
1445static void
1446sorecvfrom_icmp_win(PNATState pData, struct socket *so)
1447{
1448 int len;
1449 int i;
1450 struct ip *ip;
1451 struct mbuf *m;
1452 struct icmp *icp;
1453 struct icmp_msg *icm;
1454 struct ip *ip_broken; /* ICMP returns header + 64 bit of packet */
1455 uint32_t src;
1456 ICMP_ECHO_REPLY *icr;
1457 int hlen = 0;
1458 int nbytes = 0;
1459 u_char code = ~0;
1460 int out_len;
1461 int size;
1462
1463 len = pData->pfIcmpParseReplies(pData->pvIcmpBuffer, pData->cbIcmpBuffer);
1464 if (len < 0)
1465 {
1466 LogRel(("NAT: Error (%d) occurred on ICMP receiving\n", GetLastError()));
1467 return;
1468 }
1469 if (len == 0)
1470 return; /* no error */
1471
1472 icr = (ICMP_ECHO_REPLY *)pData->pvIcmpBuffer;
1473 for (i = 0; i < len; ++i)
1474 {
1475 LogFunc(("icr[%d] Data:%p, DataSize:%d\n",
1476 i, icr[i].Data, icr[i].DataSize));
1477 switch(icr[i].Status)
1478 {
1479 case IP_DEST_HOST_UNREACHABLE:
1480 code = (code != ~0 ? code : ICMP_UNREACH_HOST);
1481 case IP_DEST_NET_UNREACHABLE:
1482 code = (code != ~0 ? code : ICMP_UNREACH_NET);
1483 case IP_DEST_PROT_UNREACHABLE:
1484 code = (code != ~0 ? code : ICMP_UNREACH_PROTOCOL);
1485 /* UNREACH error inject here */
1486 case IP_DEST_PORT_UNREACHABLE:
1487 code = (code != ~0 ? code : ICMP_UNREACH_PORT);
1488 icmp_error(pData, so->so_m, ICMP_UNREACH, code, 0, "Error occurred!!!");
1489 so->so_m = NULL;
1490 break;
1491 case IP_SUCCESS: /* echo replied */
1492 out_len = ETH_HLEN + sizeof(struct ip) + 8;
1493 size;
1494 size = MCLBYTES;
1495 if (out_len < MSIZE)
1496 size = MCLBYTES;
1497 else if (out_len < MCLBYTES)
1498 size = MCLBYTES;
1499 else if (out_len < MJUM9BYTES)
1500 size = MJUM9BYTES;
1501 else if (out_len < MJUM16BYTES)
1502 size = MJUM16BYTES;
1503 else
1504 AssertMsgFailed(("Unsupported size"));
1505
1506 m = m_getjcl(pData, M_NOWAIT, MT_HEADER, M_PKTHDR, size);
1507 LogFunc(("m_getjcl returns m: %p\n", m));
1508 if (m == NULL)
1509 return;
1510 m->m_len = 0;
1511 m->m_data += if_maxlinkhdr;
1512 m->m_pkthdr.header = mtod(m, void *);
1513
1514 ip = mtod(m, struct ip *);
1515 ip->ip_src.s_addr = icr[i].Address;
1516 ip->ip_p = IPPROTO_ICMP;
1517 ip->ip_dst.s_addr = so->so_laddr.s_addr; /*XXX: still the hack*/
1518 ip->ip_hl = sizeof(struct ip) >> 2; /* required for icmp_reflect, no IP options */
1519 ip->ip_ttl = icr[i].Options.Ttl;
1520
1521 icp = (struct icmp *)&ip[1]; /* no options */
1522 icp->icmp_type = ICMP_ECHOREPLY;
1523 icp->icmp_code = 0;
1524 icp->icmp_id = so->so_icmp_id;
1525 icp->icmp_seq = so->so_icmp_seq;
1526
1527 icm = icmp_find_original_mbuf(pData, ip);
1528 if (icm)
1529 {
1530 /* on this branch we don't need stored variant */
1531 m_freem(pData, icm->im_m);
1532 LIST_REMOVE(icm, im_list);
1533 pData->cIcmpCacheSize--;
1534 RTMemFree(icm);
1535 }
1536
1537
1538 hlen = (ip->ip_hl << 2);
1539 Assert((hlen >= sizeof(struct ip)));
1540
1541 m->m_data += hlen + ICMP_MINLEN;
1542 if (!RT_VALID_PTR(icr[i].Data))
1543 {
1544 m_freem(pData, m);
1545 break;
1546 }
1547 m_copyback(pData, m, 0, icr[i].DataSize, icr[i].Data);
1548 m->m_data -= hlen + ICMP_MINLEN;
1549 m->m_len += hlen + ICMP_MINLEN;
1550
1551
1552 ip->ip_len = m_length(m, NULL);
1553 Assert((ip->ip_len == hlen + ICMP_MINLEN + icr[i].DataSize));
1554
1555 icmp_reflect(pData, m);
1556 break;
1557 case IP_TTL_EXPIRED_TRANSIT: /* TTL expired */
1558
1559 ip_broken = icr[i].Data;
1560 icm = icmp_find_original_mbuf(pData, ip_broken);
1561 if (icm == NULL) {
1562 Log(("ICMP: can't find original package (first double word %x)\n", *(uint32_t *)ip_broken));
1563 return;
1564 }
1565 m = icm->im_m;
1566 ip = mtod(m, struct ip *);
1567 Assert(((ip_broken->ip_hl >> 2) >= sizeof(struct ip)));
1568 ip->ip_ttl = icr[i].Options.Ttl;
1569 src = ip->ip_src.s_addr;
1570 ip->ip_dst.s_addr = src;
1571 ip->ip_dst.s_addr = icr[i].Address;
1572
1573 hlen = (ip->ip_hl << 2);
1574 icp = (struct icmp *)((char *)ip + hlen);
1575 ip_broken->ip_src.s_addr = src; /* the packet was sent from the host, not from the guest */
1576
1577 m->m_len = (ip_broken->ip_hl << 2) + 64;
1578 m->m_pkthdr.header = mtod(m, void *);
1579 m_copyback(pData, m, ip->ip_hl >> 2, icr[i].DataSize, icr[i].Data);
1580 icmp_reflect(pData, m);
1581 /* This differs from the Unix world, where we can receive ICMP in response to TCP/UDP */
1582 LIST_REMOVE(icm, im_list);
1583 pData->cIcmpCacheSize--;
1584 RTMemFree(icm);
1585 break;
1586 default:
1587 Log(("ICMP(default): message with Status: %x was received from %x\n", icr[i].Status, icr[i].Address));
1588 break;
1589 }
1590 }
1591}
1592#else /* !RT_OS_WINDOWS */
1593static void sorecvfrom_icmp_unix(PNATState pData, struct socket *so)
1594{
1595 struct sockaddr_in addr;
1596 socklen_t addrlen = sizeof(struct sockaddr_in);
1597 struct ip ip;
1598 char *buff;
1599 int len = 0;
1600
1601 /* step 1: read the IP header */
1602 len = recvfrom(so->s, &ip, sizeof(struct ip), MSG_PEEK,
1603 (struct sockaddr *)&addr, &addrlen);
1604 if ( len < 0
1605 && ( soIgnorableErrorCode(errno)
1606 || errno == ENOTCONN))
1607 {
1608 Log(("sorecvfrom_icmp_unix: 1 - step can't read IP datagramm (would block)\n"));
1609 return;
1610 }
1611
1612 if ( len < sizeof(struct ip)
1613 || len < 0
1614 || len == 0)
1615 {
1616 u_char code;
1617 code = ICMP_UNREACH_PORT;
1618
1619 if (errno == EHOSTUNREACH)
1620 code = ICMP_UNREACH_HOST;
1621 else if (errno == ENETUNREACH)
1622 code = ICMP_UNREACH_NET;
1623
1624 LogRel((" udp icmp rx errno = %d (%s)\n", errno, strerror(errno)));
1625 icmp_error(pData, so->so_m, ICMP_UNREACH, code, 0, strerror(errno));
1626 so->so_m = NULL;
1627 Log(("sorecvfrom_icmp_unix: 1 - step can't read IP datagramm\n"));
1628 return;
1629 }
1630 /* basic check of IP header */
1631 if ( ip.ip_v != IPVERSION
1632# ifndef RT_OS_DARWIN
1633 || ip.ip_p != IPPROTO_ICMP
1634# endif
1635 )
1636 {
1637 Log(("sorecvfrom_icmp_unix: 1 - step IP isn't IPv4\n"));
1638 return;
1639 }
1640# ifndef RT_OS_DARWIN
1641 /* Darwin reports the IP length already in host byte order. */
1642 ip.ip_len = RT_N2H_U16(ip.ip_len);
1643# endif
1644# if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN)
1645 /* Solaris and Darwin report the payload only */
1646 ip.ip_len += (ip.ip_hl << 2);
1647# endif
1648 /* Note: ip->ip_len in host byte order (all OS) */
1649 len = ip.ip_len;
1650 buff = RTMemAlloc(len);
1651 if (buff == NULL)
1652 {
1653 Log(("sorecvfrom_icmp_unix: 1 - step can't allocate enought room for datagram\n"));
1654 return;
1655 }
1656 /* step 2: read the rest of the datagram into the buffer */
1657 addrlen = sizeof(struct sockaddr_in);
1658 memset(&addr, 0, addrlen);
1659 len = recvfrom(so->s, buff, len, 0,
1660 (struct sockaddr *)&addr, &addrlen);
1661 if ( len < 0
1662 && ( soIgnorableErrorCode(errno)
1663 || errno == ENOTCONN))
1664 {
1665 Log(("sorecvfrom_icmp_unix: 2 - step can't read IP body (would block expected:%d)\n",
1666 ip.ip_len));
1667 RTMemFree(buff);
1668 return;
1669 }
1670 if ( len < 0
1671 || len == 0)
1672 {
1673 Log(("sorecvfrom_icmp_unix: 2 - step read of the rest of datagramm is fallen (errno:%d, len:%d expected: %d)\n",
1674 errno, len, (ip.ip_len - sizeof(struct ip))));
1675 RTMemFree(buff);
1676 return;
1677 }
1678 /* len is modified by the 2nd read, when the rest of the datagram was read */
1679 send_icmp_to_guest(pData, buff, len, &addr);
1680 RTMemFree(buff);
1681}
1682#endif /* !RT_OS_WINDOWS */
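
A note on the buffer handling above: soread() and sowrite() split the circular sbuf into at most two iovecs whenever the readable or writable region wraps past the end of the backing store. A simplified, stand-alone sketch of that splitting idea, using a hypothetical ring type rather than the real struct sbuf:

/* Illustrative sketch only: build iovecs covering the readable bytes of a
 * circular buffer, wrapping around the end of the storage when needed,
 * in the spirit of sowrite()'s iov[0]/iov[1] handling. */
#include <stddef.h>
#include <sys/uio.h>

struct ring
{
    char   *data;     /* backing store */
    size_t  datalen;  /* size of the backing store */
    char   *rptr;     /* next byte to read */
    size_t  cc;       /* number of readable bytes */
};

static int ring_readable_iov(const struct ring *rb, struct iovec iov[2])
{
    char *end = rb->data + rb->datalen;

    if (rb->rptr + rb->cc <= end)
    {
        iov[0].iov_base = rb->rptr;              /* contiguous: one iovec is enough */
        iov[0].iov_len  = rb->cc;
        return 1;
    }
    iov[0].iov_base = rb->rptr;                  /* tail of the buffer... */
    iov[0].iov_len  = (size_t)(end - rb->rptr);
    iov[1].iov_base = rb->data;                  /* ...then wrap to the start */
    iov[1].iov_len  = rb->cc - iov[0].iov_len;
    return 2;
}

The resulting iovec pair can be handed to readv()/writev(), which is why the HAVE_READV paths in soread()/sowrite() need only a single system call even when the sbuf wraps.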