#include #include #include #include #include #include #include #include #include #include #include #include #include static uint16_t tfe_get_ipid() { static __thread uint16_t ipid = 0; if (ipid == 0) { ipid = random(); } else { ipid++; } return ipid; } void tfe_tcp_restore_info_dump(const struct tcp_restore_info *info) { char str_client_addr[64] = {0}; char str_server_addr[64] = {0}; const struct tcp_restore_endpoint *client = &info->client; const struct tcp_restore_endpoint *server = &info->server; assert(client->addr.ss_family == server->addr.ss_family); if (client->addr.ss_family == AF_INET) { struct sockaddr_in *sk_client = (struct sockaddr_in *)&client->addr; struct sockaddr_in *sk_server = (struct sockaddr_in *)&server->addr; uint16_t port_client = ntohs(sk_client->sin_port); uint16_t port_server = ntohs(sk_server->sin_port); inet_ntop(AF_INET, &sk_client->sin_addr, str_client_addr, sizeof(str_client_addr)); inet_ntop(AF_INET, &sk_server->sin_addr, str_server_addr, sizeof(str_client_addr)); TFE_LOG_DEBUG(g_default_logger, "tcp_restore_info %p: cur_dir=%u, %s:%hu->%s:%hu, seq=%u, ack=%u, " "client={ mss=%u, wscale_perm=%u, wscale=%u, ts=%u, sack=%u }, " "server={ mss=%u, wscale_perm=%u, wscale=%u, ts=%u, sack=%u }", info, info->cur_dir, str_client_addr, port_client, str_server_addr, port_server, info->client.seq, info->client.ack, client->mss, (client->wscale_perm ? 1 : 0), client->wscale, (client->timestamp_perm ? 1 : 0), (client->sack_perm ? 1 : 0), server->mss, (server->wscale_perm ? 1 : 0), server->wscale, (server->timestamp_perm ? 1 : 0), (server->sack_perm ? 1 : 0)); } else if (client->addr.ss_family == AF_INET6) { struct sockaddr_in6 *sk_client = (struct sockaddr_in6 *)&client->addr; struct sockaddr_in6 *sk_server = (struct sockaddr_in6 *)&server->addr; uint16_t port_client = ntohs(sk_client->sin6_port); uint16_t port_server = ntohs(sk_server->sin6_port); inet_ntop(AF_INET6, &sk_client->sin6_addr, str_client_addr, sizeof(str_client_addr)); inet_ntop(AF_INET6, &sk_server->sin6_addr, str_server_addr, sizeof(str_client_addr)); TFE_LOG_DEBUG(g_default_logger, "tcp_restore_info %p: cur_dir=%u, %s:%hu->%s:%hu, seq=%u, ack=%u, " "client={ mss=%u, wscale_perm=%u, wscale=%u, ts=%u, sack=%u }, " "server={ mss=%u, wscale_perm=%u, wscale=%u, ts=%u, sack=%u }", info, info->cur_dir, str_client_addr, port_client, str_server_addr, port_server, info->client.seq, info->client.ack, client->mss, (client->wscale_perm ? 1 : 0), client->wscale, (client->timestamp_perm ? 1 : 0), (client->sack_perm ? 1 : 0), server->mss, (server->wscale_perm ? 1 : 0), server->wscale, (server->timestamp_perm ? 1 : 0), (server->sack_perm ? 1 : 0)); } } int tfe_tcp_restore_fd_create(const struct tcp_restore_endpoint *endpoint, const struct tcp_restore_endpoint *peer, const char *devname, unsigned int fd_so_mask) { int result = 0; int sockopt = 0; int sockfd = 0; char buffer[IFNAMSIZ] = {0}; socklen_t buffer_len = sizeof(buffer); unsigned int nr_tcp_repair_opts = 0; struct tcp_repair_opt tcp_repair_opts[8]; struct tcp_repair_window tcp_repair_window = {0}; if (endpoint->addr.ss_family == AF_INET) { sockfd = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP); } else if (endpoint->addr.ss_family == AF_INET6) { sockfd = socket(AF_INET6, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP); } else { errno = EINVAL; TFE_LOG_ERROR(g_default_logger, "failed at tcp_restore_fd_create(), %d: %s", errno, strerror(errno)); goto errout; } if (sockfd < 0) { TFE_LOG_ERROR(g_default_logger, "failed at socket(), %d: %s", errno, strerror(errno)); goto errout; } sockopt = fd_so_mask; result = setsockopt(sockfd, SOL_SOCKET, SO_MARK, (char *)&sockopt, sizeof(sockopt)); if (result < 0) { TFE_LOG_ERROR(g_default_logger, "failed at setsockopt(SO_MARK), %d: %s", errno, strerror(errno)); goto errout; } if (strlen(devname)) { result = setsockopt(sockfd, SOL_SOCKET, SO_BINDTODEVICE, devname, strlen(devname)); if (result < 0) { TFE_LOG_ERROR(g_default_logger, "failed at setsockopt(SO_BINDTODEVICE) on %s, %d: %s", devname, errno, strerror(errno)); goto errout; } result = getsockopt(sockfd, SOL_SOCKET, SO_BINDTODEVICE, buffer, &buffer_len); if (result < 0) { TFE_LOG_ERROR(g_default_logger, "failed at getsockopt(SO_BINDTODEVICE) on %s, %d: %s", devname, errno, strerror(errno)); goto errout; } TFE_LOG_DEBUG(g_default_logger, "sockfd %d successfully bound to %s device, so_mask: %x", sockfd, buffer, fd_so_mask); } // Setup TCP REPAIR Status sockopt = 1; result = setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, (char *)&sockopt, sizeof(sockopt)); if (result < 0) { TFE_LOG_ERROR(g_default_logger, "failed at setsockopt(SO_REUSEADDR), %d: %s", errno, strerror(errno)); goto errout; } sockopt = 1; result = setsockopt(sockfd, SOL_IP, IP_TRANSPARENT, (char *)&sockopt, sizeof(sockopt)); if (result < 0) { TFE_LOG_ERROR(g_default_logger, "failed at setsockopt(IP_TRANSPARENT), %d: %s", errno, strerror(errno)); goto errout; } sockopt = 1; result = setsockopt(sockfd, IPPROTO_TCP, TCP_REPAIR, (char *)&sockopt, sizeof(sockopt)); if (result < 0) { TFE_LOG_ERROR(g_default_logger, "failed at setsockopt(TCP_REPAIR), %d: %s", errno, strerror(errno)); goto errout; } // Setup SEQ/ACK and TCP options sockopt = TCP_SEND_QUEUE; result = setsockopt(sockfd, IPPROTO_TCP, TCP_REPAIR_QUEUE, (char *)&sockopt, sizeof(sockopt)); if (result < 0) { TFE_LOG_ERROR(g_default_logger, "failed at setsockopt(TCP_REPAIR_QUEUE), %d: %s", errno, strerror(errno)); goto errout; } sockopt = endpoint->seq; result = setsockopt(sockfd, IPPROTO_TCP, TCP_QUEUE_SEQ, (char *)&sockopt, sizeof(sockopt)); if (result < 0) { TFE_LOG_ERROR(g_default_logger, "failed at setsockopt(TCP_QUEUE_SEQ), %d: %s", errno, strerror(errno)); goto errout; } sockopt = TCP_RECV_QUEUE; result = setsockopt(sockfd, IPPROTO_TCP, TCP_REPAIR_QUEUE, (char *)&sockopt, sizeof(sockopt)); if (result < 0) { TFE_LOG_ERROR(g_default_logger, "failed at setsockopt(TCP_REPAIR_QUEUE), %d: %s", errno, strerror(errno)); goto errout; } sockopt = endpoint->ack; result = setsockopt(sockfd, IPPROTO_TCP, TCP_QUEUE_SEQ, (char *)&sockopt, sizeof(sockopt)); if (result < 0) { TFE_LOG_ERROR(g_default_logger, "failed at setsockopt(TCP_QUEUE_SEQ), %d: %s", errno, strerror(errno)); goto errout; } #ifndef TCPOPT_MAXSEG #define TCPOPT_MAXSEG 2 #endif #ifndef TCPOPT_WINDOW #define TCPOPT_WINDOW 3 #endif #ifndef TCPOPT_SACK_PERMITTED #define TCPOPT_SACK_PERMITTED 4 #endif #ifndef TCPOPT_TIMESTAMP #define TCPOPT_TIMESTAMP 8 #endif tcp_repair_opts[nr_tcp_repair_opts].opt_code = TCPOPT_MAXSEG; tcp_repair_opts[nr_tcp_repair_opts].opt_val = MIN(endpoint->mss, peer->mss); nr_tcp_repair_opts++; if (endpoint->sack_perm && peer->sack_perm) { tcp_repair_opts[nr_tcp_repair_opts].opt_code = TCPOPT_SACK_PERMITTED; tcp_repair_opts[nr_tcp_repair_opts].opt_val = 0; nr_tcp_repair_opts++; } if (endpoint->wscale_perm && peer->wscale_perm) { tcp_repair_opts[nr_tcp_repair_opts].opt_code = TCPOPT_WINDOW; tcp_repair_opts[nr_tcp_repair_opts].opt_val = (endpoint->wscale << 16) | peer->wscale; nr_tcp_repair_opts++; } if (endpoint->timestamp_perm && peer->timestamp_perm) { tcp_repair_opts[nr_tcp_repair_opts].opt_code = TCPOPT_TIMESTAMP; tcp_repair_opts[nr_tcp_repair_opts].opt_val = 0; nr_tcp_repair_opts++; } // Bind address and connect to peer endpoint result = bind(sockfd, (struct sockaddr *)&endpoint->addr, sizeof(endpoint->addr)); if (result < 0) { TFE_LOG_ERROR(g_default_logger, "failed at bind(), %d: %s", errno, strerror(errno)); goto errout; } result = connect(sockfd, (struct sockaddr *)&peer->addr, sizeof(peer->addr)); if (result < 0) { TFE_LOG_ERROR(g_default_logger, "failed at connect(), %d: %s", errno, strerror(errno)); goto errout; } result = setsockopt(sockfd, IPPROTO_TCP, TCP_REPAIR_OPTIONS, (char *)tcp_repair_opts, nr_tcp_repair_opts * sizeof(struct tcp_repair_opt)); if (result < 0) { TFE_LOG_ERROR(g_default_logger, "failed at setsockopt(TCP_REPAIR_OPTIONS), %d: %s", errno, strerror(errno)); goto errout; } if (endpoint->timestamp_perm && peer->timestamp_perm) { result = setsockopt(sockfd, IPPROTO_TCP, TCP_TIMESTAMP, &(endpoint->ts_val), sizeof(endpoint->ts_val)); if (result < 0) { TFE_LOG_ERROR(g_default_logger, "failed at setsockopt(TCP_TIMESTAMP), %d: %s", errno, strerror(errno)); goto errout; } } // Perpare Window Setup tcp_repair_window.snd_wl1 = peer->seq; tcp_repair_window.snd_wnd = peer->window; tcp_repair_window.max_window = peer->window; tcp_repair_window.rcv_wnd = endpoint->window; tcp_repair_window.rcv_wup = endpoint->ack; result = setsockopt(sockfd, IPPROTO_TCP, TCP_REPAIR_WINDOW, (char *)&tcp_repair_window, sizeof(tcp_repair_window)); if (result < 0) { TFE_LOG_ERROR(g_default_logger, "failed at setsockopt(TCP_REPAIR_WINDOW), %d: %s", errno, strerror(errno)); goto errout; } sockopt = 0; result = setsockopt(sockfd, IPPROTO_TCP, TCP_REPAIR, (char *)&sockopt, sizeof(sockopt)); if (result < 0) { TFE_LOG_ERROR(g_default_logger, "failed at setsockopt(TCP_REPAIR), %d: %s", errno, strerror(errno)); goto errout; } return sockfd; errout: if (sockfd > 0) { close(sockfd); } return -1; } struct tcp_option_mss { uint8_t kind; uint8_t length; uint16_t mss_value; } __attribute__((__packed__)); struct tcp_option_window_scale { uint8_t kind; uint8_t length; uint8_t shift_count; } __attribute__((__packed__)); struct tcp_option_sack { uint8_t kind; uint8_t length; } __attribute__((__packed__)); struct tcp_option_time_stamp { uint8_t kind; uint8_t length; uint32_t tsval; uint32_t tsecr; } __attribute__((__packed__)); int tfe_tcp_restore_syn_packet(struct tcp_restore_info *restore_info, struct ether_addr *client_mac, struct ether_addr *server_mac, char *buffer, int size) { int length = 0; char tcp_option_buff[40] = {0}; int tcp_option_len = 0; const struct tcp_restore_endpoint *client = &restore_info->client; const struct tcp_restore_endpoint *server = &restore_info->server; uint32_t c_seq = client->seq - 1; /* * Maximum segment size: Kind: 2, Length: 4 * +---------+---------+---------+ * | Kind=2 |Length=4 |mss.value| * +---------+---------+---------+ * 1 1 2 */ if (client->mss) { struct tcp_option_mss *option = (struct tcp_option_mss *)(tcp_option_buff + tcp_option_len); option->kind = 2; option->length = 4; option->mss_value = htons(client->mss); tcp_option_len += sizeof(struct tcp_option_mss); } /* * Window Scale option: Kind: 3, Length: 3 * +---------+---------+---------+ * | Kind=3 |Length=3 |shift.cnt| * +---------+---------+---------+ * 1 1 1 */ if (client->wscale_perm) { // padding memset(tcp_option_buff + tcp_option_len, 1, 1); tcp_option_len += 1; struct tcp_option_window_scale *option = (struct tcp_option_window_scale *)(tcp_option_buff + tcp_option_len); option->kind = 3; option->length = 3; option->shift_count = client->wscale; tcp_option_len += sizeof(struct tcp_option_window_scale); } /* * SACK option: Kind: 4, Length: 2 * +---------+---------+ * | Kind=4 |Length=2 | * +---------+---------+ * 1 1 */ if (client->sack_perm) { // padding memset(tcp_option_buff + tcp_option_len, 1, 2); tcp_option_len += 2; struct tcp_option_sack *option = (struct tcp_option_sack *)(tcp_option_buff + tcp_option_len); option->kind = 4; option->length = 2; tcp_option_len += sizeof(struct tcp_option_sack); } /* * Time Stamp option: Kind: 8, Length: 10 * +---------+---------+-----+-----+ * | Kind=8 |Length=10|tsval|tsecr| * +---------+---------+-----+-----+ * 1 1 4 4 */ if (client->timestamp_perm) { // padding memset(tcp_option_buff + tcp_option_len, 1, 2); tcp_option_len += 2; struct tcp_option_time_stamp *option = (struct tcp_option_time_stamp *)(tcp_option_buff + tcp_option_len); option->kind = 8; option->length = 10; option->tsval = htonl(client->ts_val); option->tsecr = htonl(0); tcp_option_len += sizeof(struct tcp_option_time_stamp); } if (client->addr.ss_family == AF_INET6) { struct sockaddr_in6 *sk_client = (struct sockaddr_in6 *)&client->addr; struct sockaddr_in6 *sk_server = (struct sockaddr_in6 *)&server->addr; uint16_t port_client = sk_client->sin6_port; uint16_t port_server = sk_server->sin6_port; // C -> S length = tcp_packet_v6_construct( buffer, // buffer client_mac, server_mac, 0, ETH_P_IPV6, // Ether &sk_client->sin6_addr, &sk_server->sin6_addr, TFE_FAKE_C_DEFAULT_TTL, // IPv6 port_client, port_server, c_seq, 0, TCP_SYN_FLAG, client->window, // TCP Header tcp_option_buff, tcp_option_len, // TCP Options NULL, 0); // Payload } else { struct sockaddr_in *sk_client = (struct sockaddr_in *)&client->addr; struct sockaddr_in *sk_server = (struct sockaddr_in *)&server->addr; uint16_t port_client = sk_client->sin_port; uint16_t port_server = sk_server->sin_port; // C -> S length = tcp_packet_v4_construct( buffer, // buffer client_mac, server_mac, 0, ETH_P_IP, // Ether &sk_client->sin_addr, &sk_server->sin_addr, 0, TFE_FAKE_C_DEFAULT_TTL, tfe_get_ipid(), // IPv4 port_client, port_server, c_seq, 0, TCP_SYN_FLAG, client->window, // TCP Header tcp_option_buff, tcp_option_len, // TCP Options NULL, 0); } return length; } int tfe_tcp_restore_synack_packet(struct tcp_restore_info *restore_info, struct ether_addr *client_mac, struct ether_addr *server_mac, char *buffer, int size) { int length = 0; char tcp_option_buff[40] = {0}; int tcp_option_len = 0; const struct tcp_restore_endpoint *client = &restore_info->client; const struct tcp_restore_endpoint *server = &restore_info->server; uint32_t c_seq = client->seq - 1; uint32_t s_seq = server->seq - 1; /* * Maximum segment size: Kind: 2, Length: 4 * +---------+---------+---------+ * | Kind=2 |Length=4 |mss.value| * +---------+---------+---------+ * 1 1 2 */ if (server->mss) { struct tcp_option_mss *option = (struct tcp_option_mss *)(tcp_option_buff + tcp_option_len); option->kind = 2; option->length = 4; option->mss_value = htons(server->mss); tcp_option_len += sizeof(struct tcp_option_mss); } /* * Window Scale option: Kind: 3, Length: 3 * +---------+---------+---------+ * | Kind=3 |Length=3 |shift.cnt| * +---------+---------+---------+ * 1 1 1 */ if (server->wscale_perm) { // padding memset(tcp_option_buff + tcp_option_len, 1, 1); tcp_option_len += 1; struct tcp_option_window_scale *option = (struct tcp_option_window_scale *)(tcp_option_buff + tcp_option_len); option->kind = 3; option->length = 3; option->shift_count = server->wscale; tcp_option_len += sizeof(struct tcp_option_window_scale); } /* * SACK option: Kind: 4, Length: 2 * +---------+---------+ * | Kind=4 |Length=2 | * +---------+---------+ * 1 1 */ if (server->sack_perm) { // padding memset(tcp_option_buff + tcp_option_len, 1, 2); tcp_option_len += 2; struct tcp_option_sack *option = (struct tcp_option_sack *)(tcp_option_buff + tcp_option_len); option->kind = 4; option->length = 2; tcp_option_len += sizeof(struct tcp_option_sack); } /* * Time Stamp option: Kind: 8, Length: 10 * +---------+---------+-----+-----+ * | Kind=8 |Length=10|tsval|tsecr| * +---------+---------+-----+-----+ * 1 1 4 4 */ if (server->timestamp_perm) { // padding memset(tcp_option_buff + tcp_option_len, 1, 2); tcp_option_len += 2; struct tcp_option_time_stamp *option = (struct tcp_option_time_stamp *)(tcp_option_buff + tcp_option_len); option->kind = 8; option->length = 10; option->tsval = htonl(server->ts_val); option->tsecr = client->timestamp_perm ? htonl(client->ts_val) : htonl(0); tcp_option_len += sizeof(struct tcp_option_time_stamp); } if (client->addr.ss_family == AF_INET6) { struct sockaddr_in6 *sk_client = (struct sockaddr_in6 *)&client->addr; struct sockaddr_in6 *sk_server = (struct sockaddr_in6 *)&server->addr; uint16_t port_client = sk_client->sin6_port; uint16_t port_server = sk_server->sin6_port; c_seq += 1; // S -> C length = tcp_packet_v6_construct( buffer, // buffer server_mac, client_mac, 0, ETH_P_IPV6, // Ether &sk_server->sin6_addr, &sk_client->sin6_addr, TFE_FAKE_S_DEFAULT_TTL, // IPv6 port_server, port_client, s_seq, c_seq, TCP_SYN_FLAG | TCP_ACK_FLAG, server->window, // TCP Header tcp_option_buff, tcp_option_len, // TCP Options NULL, 0); // Payload } else { struct sockaddr_in *sk_client = (struct sockaddr_in *)&client->addr; struct sockaddr_in *sk_server = (struct sockaddr_in *)&server->addr; uint16_t port_client = sk_client->sin_port; uint16_t port_server = sk_server->sin_port; c_seq += 1; // S -> C length = tcp_packet_v4_construct( buffer, // buffer server_mac, client_mac, 0, ETH_P_IP, // Ether &sk_server->sin_addr, &sk_client->sin_addr, 0, TFE_FAKE_S_DEFAULT_TTL, tfe_get_ipid(), // IPv4 port_server, port_client, s_seq, c_seq, TCP_SYN_FLAG | TCP_ACK_FLAG, server->window, // TCP Header tcp_option_buff, tcp_option_len, // TCP Options NULL, 0); } return length; } int tfe_tcp_restore_ack_packet(struct tcp_restore_info *restore_info, struct ether_addr *client_mac, struct ether_addr *server_mac, char *buffer, int size) { int length = 0; char tcp_option_buff[40] = {0}; int tcp_option_len = 0; const struct tcp_restore_endpoint *client = &restore_info->client; const struct tcp_restore_endpoint *server = &restore_info->server; uint32_t c_seq = client->seq - 1; uint32_t s_seq = server->seq - 1; uint16_t ack_window = 0; /* * Time Stamp option: Kind: 8, Length: 10 * +---------+---------+-----+-----+ * | Kind=8 |Length=10|tsval|tsecr| * +---------+---------+-----+-----+ * 1 1 4 4 */ if (client->timestamp_perm && server->timestamp_perm) { // padding memset(tcp_option_buff + tcp_option_len, 1, 2); tcp_option_len += 2; struct tcp_option_time_stamp *option = (struct tcp_option_time_stamp *)(tcp_option_buff + tcp_option_len); option->kind = 8; option->length = 10; option->tsval = htonl(client->ts_val); option->tsecr = htonl(server->ts_val); tcp_option_len += sizeof(struct tcp_option_time_stamp); } ack_window = MIN(client->window, server->window); if (client->wscale_perm && server->wscale_perm) { ack_window = ack_window / (1 << server->wscale); } if (client->addr.ss_family == AF_INET6) { struct sockaddr_in6 *sk_client = (struct sockaddr_in6 *)&client->addr; struct sockaddr_in6 *sk_server = (struct sockaddr_in6 *)&server->addr; uint16_t port_client = sk_client->sin6_port; uint16_t port_server = sk_server->sin6_port; c_seq += 1; s_seq += 1; // C -> S length = tcp_packet_v6_construct( buffer, // buffer client_mac, server_mac, 0, ETH_P_IPV6, // Ether &sk_client->sin6_addr, &sk_server->sin6_addr, TFE_FAKE_C_DEFAULT_TTL, // IPv6 port_client, port_server, c_seq, s_seq, TCP_ACK_FLAG, ack_window, // TCP Header tcp_option_buff, tcp_option_len, // TCP Options NULL, 0); // Payload } else { struct sockaddr_in *sk_client = (struct sockaddr_in *)&client->addr; struct sockaddr_in *sk_server = (struct sockaddr_in *)&server->addr; uint16_t port_client = sk_client->sin_port; uint16_t port_server = sk_server->sin_port; c_seq += 1; s_seq += 1; // C -> S length = tcp_packet_v4_construct( buffer, // buffer client_mac, server_mac, 0, ETH_P_IP, // Ether &sk_client->sin_addr, &sk_server->sin_addr, 0, TFE_FAKE_C_DEFAULT_TTL, tfe_get_ipid(), // IPv4 port_client, port_server, c_seq, s_seq, TCP_ACK_FLAG, ack_window, // TCP Header tcp_option_buff, tcp_option_len, // TCP Options NULL, 0); } return length; }