WIP: eBPF XDP/TC series #3

Draft
baez90 wants to merge 1 commit from ebpf-xdp-tc-series into main
20 changed files with 1205 additions and 3 deletions
Showing only changes of commit 95918a26b2 - Show all commits

5
.gitignore vendored
View file

@ -13,4 +13,7 @@ hugo.linux
/.hugo_build.lock
# IntelliJ project files
.idea/
.idea/
# Vagrant files
.vagrant/

5
code/ebpf-xdp-tc/.gitignore vendored Normal file
View file

@ -0,0 +1,5 @@
*.o
CMakeLists.txt
out/
cmake-build-debug/

View file

@ -0,0 +1,7 @@
# Runtime image: contains only the prebuilt binary copied from out/ebpf-xdp-tc.
FROM docker.io/alpine:3.15
WORKDIR /app
COPY out/ebpf-xdp-tc ./
# The binary is the container's entrypoint.
ENTRYPOINT ["/app/ebpf-xdp-tc"]

View file

@ -0,0 +1,58 @@
# https://taskfile.dev
version: '3'
tasks:
default:
# Compiles ebpf/main.c to LLVM IR with clang and lowers it to a BPF object
# (ebpf/bin/probe.o) with llc.
build-ebpf:
cmds:
- mkdir -p ebpf/bin
- |
clang \
-Wno-unused-value \
-Wno-pointer-sign \
-Wno-compare-distinct-pointer-types \
-Wunused \
-Wall \
-fno-stack-protector \
-fno-ident \
-g \
-O2 -emit-llvm \
ebpf/main.c \
-c -o - | llc -march=bpf -mcpu=probe -filetype=obj -o ebpf/bin/probe.o
# Builds the Go binary for linux/amd64 into out/ebpf-xdp-tc; runs build-ebpf first.
build-bin:
deps:
- build-ebpf
env:
GOOS: linux
GOARCH: amd64
cmds:
- mkdir -p out/
- go build -o out/ebpf-xdp-tc -trimpath -a -installsuffix=cgo -ldflags "-w -s -linkmode external -extldflags -static" ./
# Builds the container image "ebpf-xdp-tc" with buildah; runs build-bin first.
build-docker:
deps:
- build-bin
cmds:
- buildah bud -t ebpf-xdp-tc .
# Runs the image with podman on the "libvirt" network with the fixed IP 10.10.1.1
# and the listed capabilities added.
run-in-container:
deps:
- build-docker
cmds:
- |
podman run \
--rm \
-ti \
-v /sys:/sys:ro \
--security-opt=seccomp=unconfined \
--network=libvirt \
--ip "10.10.1.1" \
--name ebpf-xdp-tc \
--cap-add=CAP_SYS_ADMIN \
--cap-add=CAP_NET_RAW \
--cap-add=CAP_NET_BIND_SERVICE \
--cap-add=CAP_NET_ADMIN \
ebpf-xdp-tc

22
code/ebpf-xdp-tc/Vagrantfile vendored Normal file
View file

@ -0,0 +1,22 @@
Vagrant.configure("2") do |config|
  # The most common configuration options are documented and commented below.
  # For a complete reference, please see the online documentation at
  # https://docs.vagrantup.com.

  # Every Vagrant development environment requires a box. You can search for
  # boxes at https://vagrantcloud.com/search.
  config.vm.box = "peru/windows-10-enterprise-x64-eval"
  config.vm.box_version = "20220202.01"

  config.vm.provider "libvirt" do |libvirt|
    # The vagrant-libvirt mode is spelled 'veryisolated'; the previous value
    # 'veryisoled' is not a known management network mode.
    libvirt.management_network_mode = 'veryisolated'
  end

  # Windows guest attached to the libvirt network named "containers".
  config.vm.define :win_victim do |win_victim|
    win_victim.vm.network :private_network,
                          :libvirt__network_name => "containers"
  end

  config.vm.box_check_update = false
end

View file

@ -0,0 +1,53 @@
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>
#include "types.h"
/*
 * One's-complement checksum over `bufsz` bytes starting at `buf`.
 *
 * The buffer is summed as 16-bit words; a trailing odd byte is added as-is.
 * The accumulator is folded into 16 bits twice and the bitwise complement of
 * the result is returned (truncated to 16 bits by the return type).
 */
static inline unsigned short checksum(unsigned short *buf, int bufsz) {
    unsigned long acc = 0;
    int remaining;

    for (remaining = bufsz; remaining > 1; remaining -= 2) {
        acc += *buf++;
    }
    /* odd length: add the single remaining byte */
    if (remaining == 1) {
        acc += *(unsigned char *) buf;
    }

    /* fold the carries back into the low 16 bits (done twice, as a fold can carry again) */
    acc = (acc >> 16) + (acc & 0xffff);
    acc = (acc >> 16) + (acc & 0xffff);
    return ~acc;
}
/*
 * Locates the TCP header `off` bytes after `iph` and copies its ports into `pkt`.
 *
 * Returns NULL (leaving `pkt` untouched) when the full header does not fit
 * before `data_end`; otherwise marks `pkt` as TCP, stores both ports converted
 * with bpf_ntohs() and returns the header pointer.
 */
static inline struct tcphdr *extract_tcp_meta(struct observed_packet *pkt, void *iph, __u64 off, void *data_end) {
    struct tcphdr *tcph = iph + off;

    /* bounds check: the whole header must lie inside the packet */
    if ((void *) (tcph + 1) > data_end) {
        return NULL;
    }

    pkt->destPort = bpf_ntohs(tcph->dest);
    pkt->sourcePort = bpf_ntohs(tcph->source);
    pkt->transport_proto = TCP;
    return tcph;
}
/*
 * Locates the UDP header `off` bytes after `iph` and copies its ports into `pkt`.
 *
 * Returns NULL (leaving `pkt` untouched) when the full header does not fit
 * before `data_end`; otherwise marks `pkt` as UDP, stores both ports converted
 * with bpf_ntohs() and returns the header pointer.
 */
static inline struct udphdr *extract_udp_meta(struct observed_packet *pkt, void *iph, __u64 off, void *data_end) {
    struct udphdr *udph = iph + off;

    /* bounds check: the whole header must lie inside the packet */
    if ((void *) (udph + 1) > data_end) {
        return NULL;
    }

    pkt->destPort = bpf_ntohs(udph->dest);
    pkt->sourcePort = bpf_ntohs(udph->source);
    pkt->transport_proto = UDP;
    return udph;
}

View file

@ -0,0 +1,222 @@
#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include "helpers.h"
/*
 * Mask tested against iphdr.frag_off, which the programs below read without
 * byte swapping. 65343 == 0xFF3F, i.e. 0x3FFF byte-swapped for a little-endian
 * host. NOTE(review): 0x3FFF presumably stands for "more fragments" flag
 * (0x2000) | 13-bit fragment offset (0x1FFF) - confirm; the literal is only
 * correct on little-endian targets.
 */
#define IP_FRAGMENTED 65343
/* License string emitted into the "license" ELF section. */
char LICENSE[] SEC("license") = "Dual MIT/GPL";
/*
 * Perf event array that xdp_ingress_perf() writes observed packets to.
 * Only the map type is declared here; sizes are left to the loader.
 */
struct {
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
} perf_observed_packets SEC(".maps");
/*
 * Ring buffer that xdp_ingress_ring() writes observed packets to.
 * max_entries is 1 << 24 (for ring buffer maps this is the buffer size in bytes).
 */
struct {
__uint(type, BPF_MAP_TYPE_RINGBUF);
__uint(max_entries, 1 << 24);
} ring_observed_packets SEC(".maps");
/*
 * Connection tracking table shared by the TC programs: ingress() stores
 * "original source -> original destination" and egress() looks the entry up by
 * the reply's destination to restore the original addressing.
 *
 * __type() takes a type, not a size: passing sizeof(...) records the type of
 * the sizeof expression (unsigned long) as key/value type in BTF, which only
 * happens to have the same size as struct two_tuple.
 */
struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    __type(key, struct two_tuple);
    __type(value, struct two_tuple);
    __uint(max_entries, 1024);
} conn_track SEC(".maps");
/*
 * TC egress classifier: reverses the destination rewrite done by ingress().
 *
 * For IPv4/TCP packets whose (destination IP, destination port) is a key in
 * conn_track, the source IP and source port are replaced with the stored
 * original destination and the IPv4 header checksum is recomputed.
 * Everything else passes unmodified.
 *
 * Returns TC_ACT_OK in all cases except a truncated TCP header (TC_ACT_SHOT).
 */
SEC("classifier/egress")
int egress(struct __sk_buff *skb) {
    bpf_printk("new packet captured on egress (TC)\n");

    void *data = (void *) (long) skb->data;
    void *data_end = (void *) (long) skb->data_end;

    struct ethhdr *eth = data;
    if ((void *) eth + sizeof(*eth) > data_end) {
        return TC_ACT_OK;
    }

    /*
     * h_proto is in network byte order, so the constant must be converted
     * before comparing. Only IPv4 is accepted because the code below parses
     * the payload as struct iphdr; IPv6 frames pass through untouched.
     */
    if (eth->h_proto != bpf_htons(ETH_P_IP)) {
        return TC_ACT_OK;
    }

    struct iphdr *iph = data + sizeof(*eth);
    if ((void *) iph + sizeof(*iph) > data_end) {
        return TC_ACT_OK;
    }

    /* do not support fragmented packets as L4 headers may be missing */
    if (iph->frag_off & IP_FRAGMENTED) {
        return TC_ACT_OK;
    }

    if (iph->protocol != IPPROTO_TCP) {
        bpf_printk("Packet's not TCP - forwarding");
        return TC_ACT_OK;
    }

    /* NOTE(review): assumes an IPv4 header without options (ihl == 5). */
    struct tcphdr *tcp = (void *) iph + sizeof(*iph);
    if ((void *) tcp + sizeof(*tcp) > data_end) {
        return TC_ACT_SHOT;
    }

    /* designated initializer zeroes _pad, keeping the hash key deterministic */
    struct two_tuple dst = {
        .ip = iph->daddr,
        .port = tcp->dest
    };

    struct two_tuple *orig_src = bpf_map_lookup_elem(&conn_track, &dst);
    if (orig_src == NULL) {
        bpf_printk("No translation found - pass it through");
        return TC_ACT_OK;
    }

    bpf_printk("Restore original source IP");
    iph->saddr = orig_src->ip;
    tcp->source = orig_src->port;
    iph->tos = 7 << 2;

    /* the checksum field must be zero while the header checksum is computed */
    iph->check = 0;
    iph->check = checksum((unsigned short *) iph, sizeof(struct iphdr));
    /* NOTE(review): the TCP checksum is not adjusted for the rewritten address/port. */

    return TC_ACT_OK;
}
/*
 * TC ingress classifier: redirects foreign IPv4/TCP traffic to the local address.
 *
 * For every unfragmented IPv4/TCP packet that is not already addressed to
 * 10.10.1.1, the original (source -> destination) pair is remembered in
 * conn_track, the destination IP is rewritten to 10.10.1.1 and the IPv4 header
 * checksum is recomputed. Everything else passes unmodified.
 *
 * Returns TC_ACT_OK in all cases except a truncated TCP header (TC_ACT_SHOT).
 */
SEC("classifier/ingress")
int ingress(struct __sk_buff *skb) {
    /* 10.10.1.1 in network byte order (the former literal 16845322 on little-endian) */
    const __u32 local_addr = bpf_htonl(0x0A0A0101);

    bpf_printk("new packet captured on ingress (TC)");

    void *data = (void *) (long) skb->data;
    void *data_end = (void *) (long) skb->data_end;

    struct ethhdr *eth = data;
    if ((void *) eth + sizeof(*eth) > data_end) {
        return TC_ACT_OK;
    }

    /*
     * Only IPv4 frames may be parsed as struct iphdr; without this check ARP
     * or IPv6 frames would be interpreted (and possibly rewritten) as IPv4.
     */
    if (eth->h_proto != bpf_htons(ETH_P_IP)) {
        return TC_ACT_OK;
    }

    struct iphdr *iph = data + sizeof(*eth);
    if ((void *) iph + sizeof(*iph) > data_end) {
        return TC_ACT_OK;
    }

    /* do not support fragmented packets as L4 headers may be missing */
    if (iph->frag_off & IP_FRAGMENTED) {
        return TC_ACT_OK;
    }

    if (iph->protocol != IPPROTO_TCP) {
        bpf_printk("Packet's not TCP - forwarding");
        return TC_ACT_OK;
    }

    if (iph->daddr == local_addr) {
        bpf_printk("We're the destination - don't touch it");
        return TC_ACT_OK;
    }

    /* NOTE(review): assumes an IPv4 header without options (ihl == 5). */
    struct tcphdr *tcp = (void *) iph + sizeof(*iph);
    if ((void *) tcp + sizeof(*tcp) > data_end) {
        return TC_ACT_SHOT;
    }

    /* designated initializers zero _pad, keeping the hash key deterministic */
    struct two_tuple src = {
        .ip = iph->saddr,
        .port = tcp->source
    };
    struct two_tuple dst = {
        .ip = iph->daddr,
        .port = tcp->dest
    };

    /* without the entry egress() cannot restore the original source of the reply */
    if (bpf_map_update_elem(&conn_track, &src, &dst, BPF_ANY) != 0) {
        bpf_printk("Failed to track connection");
    }

    bpf_printk("Forward packet to localhost (TC)");
    iph->daddr = local_addr;
    iph->tos = 7 << 2;

    /* the checksum field must be zero while the header checksum is computed */
    iph->check = 0;
    iph->check = checksum((unsigned short *) iph, sizeof(struct iphdr));
    /* NOTE(review): the TCP checksum is not adjusted for the rewritten address. */

    return TC_ACT_OK;
}
/*
 * Parses an XDP frame and fills `pkt` when the packet should be reported.
 *
 * `pkt` must be zero-initialized by the caller. Callers treat a zero source or
 * destination IP as "nothing to report", so the IPs are written last and only
 * for packets that passed every check:
 *   - IPv4 only (IPv6 and other ethertypes pass through unreported),
 *   - unfragmented,
 *   - TCP without the ACK flag, or UDP.
 *
 * Returns the XDP verdict for the frame: XDP_DROP for frames too short to hold
 * an Ethernet or IPv4 header, XDP_PASS otherwise.
 */
static inline enum xdp_action extract_meta(struct xdp_md *ctx, struct observed_packet *pkt) {
    void *data = (void *) (long) ctx->data;
    void *data_end = (void *) (long) ctx->data_end;
    struct ethhdr *eth = data;

    if (data + sizeof(struct ethhdr) > data_end) {
        bpf_printk("Packet apparently not ethernet");
        return XDP_DROP;
    }

    /* the payload is parsed as struct iphdr below, so only IPv4 may continue */
    if (eth->h_proto != bpf_htons(ETH_P_IP)) {
        bpf_printk("Not an IP packet");
        return XDP_PASS;
    }

    struct iphdr *iph = data + sizeof(*eth);
    if ((void *) iph + sizeof(struct iphdr) > data_end) {
        return XDP_DROP;
    }

    /*
     * do not support fragmented packets as L4 headers may be missing;
     * they are passed on unreported (as the TC programs do), not dropped
     */
    if (iph->frag_off & IP_FRAGMENTED) {
        return XDP_PASS;
    }

    /* NOTE(review): the L4 offset assumes an IPv4 header without options (ihl == 5). */
    if (iph->protocol == IPPROTO_TCP) {
        struct tcphdr *tcph = extract_tcp_meta(pkt, (void *) iph, sizeof(struct iphdr), data_end);
        // if ACK flag is set we just pass it through because it belongs to an already established connection
        if (tcph == NULL || tcph->ack) {
            return XDP_PASS;
        }
    } else if (iph->protocol == IPPROTO_UDP) {
        // could also check if we're the source
        if (extract_udp_meta(pkt, (void *) iph, sizeof(struct iphdr), data_end) == NULL) {
            return XDP_PASS;
        }
    } else {
        /* transport_proto can only express TCP/UDP - do not report anything else */
        return XDP_PASS;
    }

    /* written last: non-zero IPs are the caller's signal to emit the event */
    pkt->sourceIp = iph->saddr;
    pkt->destIp = iph->daddr;
    return XDP_PASS;
}
/*
 * XDP program reporting observed packets through the perf event array.
 *
 * Runs extract_meta(); when it left the source or destination IP at zero there
 * is nothing to report and its verdict is returned. Otherwise the packet
 * metadata is pushed to perf_observed_packets and the frame is passed.
 */
SEC("xdp/perf")
int xdp_ingress_perf(struct xdp_md *ctx) {
    /* must be zeroed: extract_meta() may return before writing any field */
    struct observed_packet pkt = {};
    enum xdp_action action = extract_meta(ctx, &pkt);

    if (pkt.destIp == 0 || pkt.sourceIp == 0) {
        return action;
    }

    /* bpf_perf_event_output() returns 0 on success and a negative error otherwise */
    if (bpf_perf_event_output(ctx, &perf_observed_packets, BPF_F_CURRENT_CPU, &pkt, sizeof(pkt)) != 0) {
        bpf_printk("Failed to submit observed packet");
    }

    return XDP_PASS;
}
/*
 * XDP program reporting observed packets through the ring buffer.
 *
 * Runs extract_meta() on a zeroed record; if both IPs were filled in, the
 * record is written to ring_observed_packets and the frame is passed,
 * otherwise the verdict of extract_meta() is returned.
 */
SEC("xdp/ring")
int xdp_ingress_ring(struct xdp_md *ctx) {
    struct observed_packet observed = {};
    enum xdp_action verdict = extract_meta(ctx, &observed);

    if (observed.destIp != 0 && observed.sourceIp != 0) {
        bpf_ringbuf_output(&ring_observed_packets, &observed, sizeof(observed), 0);
        return XDP_PASS;
    }

    return verdict;
}

View file

@ -0,0 +1,17 @@
/*
 * Record describing one observed packet, filled by extract_meta() and handed
 * to user space through the perf event array or the ring buffer.
 */
struct observed_packet {
/* IPv4 addresses copied verbatim from the IP header (no byte swapping) */
__u32 sourceIp;
__u32 destIp;
/* L4 ports, converted with bpf_ntohs() by extract_tcp_meta()/extract_udp_meta() */
__u16 sourcePort;
__u16 destPort;
/* transport protocol tag: TCP == 0, UDP == 1 */
enum {
TCP,
UDP
} transport_proto;
};
/*
 * IP/port pair used as both key and value of the conn_track map.
 * ip and port are stored exactly as read from the packet headers.
 */
struct two_tuple {
__u32 ip;
__u16 port;
/* explicit padding member so the struct has no implicit padding bytes */
__u16 _pad;
};

25
code/ebpf-xdp-tc/go.mod Normal file
View file

@ -0,0 +1,25 @@
module ebpf-xdp-tc
go 1.17
require github.com/DataDog/ebpf-manager v1.0.3
require (
github.com/DataDog/gopsutil v0.0.0-20200624212600-1b53412ef321 // indirect
github.com/StackExchange/wmi v0.0.0-20181212234831-e0a55b97c705 // indirect
github.com/avast/retry-go v3.0.0+incompatible // indirect
github.com/cihub/seelog v0.0.0-20170130134532-f561c5e57575 // indirect
github.com/cilium/ebpf v0.6.3-0.20210917122031-fc2955d2ecee // indirect
github.com/florianl/go-tc v0.3.0 // indirect
github.com/go-ole/go-ole v1.2.4 // indirect
github.com/google/go-cmp v0.5.4 // indirect
github.com/hashicorp/errwrap v1.0.0 // indirect
github.com/hashicorp/go-multierror v1.1.1 // indirect
github.com/josharian/native v0.0.0-20200817173448-b6b71def0850 // indirect
github.com/mdlayher/netlink v1.4.0 // indirect
github.com/shirou/w32 v0.0.0-20160930032740-bb4de0191aa4 // indirect
github.com/vishvananda/netlink v1.1.0 // indirect
github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df // indirect
golang.org/x/net v0.0.0-20210119194325-5f4716e94777 // indirect
golang.org/x/sys v0.0.0-20210921065528-437939a70204 // indirect
)

114
code/ebpf-xdp-tc/go.sum Normal file
View file

@ -0,0 +1,114 @@
github.com/DataDog/ebpf-manager v1.0.3 h1:zeuFyHmP4/m8uqx7LyLHkHKbmrDjKkk34bz324tBOlc=
github.com/DataDog/ebpf-manager v1.0.3/go.mod h1:05Y9FhEyILUdCovBthi5y4KPY8AfUg5EbMNC6RMQXDY=
github.com/DataDog/gopsutil v0.0.0-20200624212600-1b53412ef321 h1:OPAXA+r6yznoxWR5jQ2iTh5CvzIMrdw8AU0uFN2RwEw=
github.com/DataDog/gopsutil v0.0.0-20200624212600-1b53412ef321/go.mod h1:tGQp6XG4XpOyy67WG/YWXVxzOY6LejK35e8KcQhtRIQ=
github.com/StackExchange/wmi v0.0.0-20181212234831-e0a55b97c705 h1:UUppSQnhf4Yc6xGxSkoQpPhb7RVzuv5Nb1mwJ5VId9s=
github.com/StackExchange/wmi v0.0.0-20181212234831-e0a55b97c705/go.mod h1:3eOhrUMpNV+6aFIbp5/iudMxNCF27Vw2OZgy4xEx0Fg=
github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHSxpiH9JdtuBj0=
github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY=
github.com/cihub/seelog v0.0.0-20170130134532-f561c5e57575 h1:kHaBemcxl8o/pQ5VM1c8PVE1PubbNx3mjUr09OqWGCs=
github.com/cihub/seelog v0.0.0-20170130134532-f561c5e57575/go.mod h1:9d6lWj8KzO/fd/NrVaLscBKmPigpZpn5YawRPw+e3Yo=
github.com/cilium/ebpf v0.6.3-0.20210917122031-fc2955d2ecee h1:eg3Xm5uBYJLRDVq750EFFx9CHTOEFIH/MjLNNpyTS3Y=
github.com/cilium/ebpf v0.6.3-0.20210917122031-fc2955d2ecee/go.mod h1:/oI2+1shJiTGAMgl6/RgJr36Eo1jzrRcAWbcXO2usCA=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/florianl/go-tc v0.3.0 h1:qeqQB5kp2lwJP1p/8krLQIuRfkHWpiPPcYr3rhRSaC8=
github.com/florianl/go-tc v0.3.0/go.mod h1:Ni/GTSK8ymDnsRQfL2meJeGmcXy7RFIvchiVHizU76U=
github.com/frankban/quicktest v1.11.3 h1:8sXhOn0uLys67V8EsXLc6eszDs8VXWxL3iRvebPhedY=
github.com/frankban/quicktest v1.11.3/go.mod h1:wRf/ReqHper53s+kmmSZizM8NamnL3IM0I9ntUbOk+k=
github.com/go-ole/go-ole v1.2.4 h1:nNBDSCOigTSiarFpYE9J/KtEA1IOW4CNeqT9TQDqCxI=
github.com/go-ole/go-ole v1.2.4/go.mod h1:XCwSNxSkXRo4vlyPy93sltvi/qJq0jqQhjqQNIwKuxM=
github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.4 h1:L8R9j+yAqZuZjsqh/z+F1NCffTKKLShY6zXTItVIZ8M=
github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/UYA=
github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo=
github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM=
github.com/josharian/native v0.0.0-20200817173448-b6b71def0850 h1:uhL5Gw7BINiiPAo24A2sxkcDI0Jt/sqp1v5xQCniEFA=
github.com/josharian/native v0.0.0-20200817173448-b6b71def0850/go.mod h1:7X/raswPFr05uY3HiLlYeyQntB6OO7E/d2Cu7qoaN2w=
github.com/jsimonetti/rtnetlink v0.0.0-20190606172950-9527aa82566a/go.mod h1:Oz+70psSo5OFh8DBl0Zv2ACw7Esh6pPUphlvZG9x7uw=
github.com/jsimonetti/rtnetlink v0.0.0-20200117123717-f846d4f6c1f4/go.mod h1:WGuG/smIU4J/54PblvSbh+xvCZmpJnFgr3ds6Z55XMQ=
github.com/jsimonetti/rtnetlink v0.0.0-20201009170750-9c6f07d100c1/go.mod h1:hqoO/u39cqLeBLebZ8fWdE96O7FxrAsRYhnVOdgHxok=
github.com/jsimonetti/rtnetlink v0.0.0-20201216134343-bde56ed16391/go.mod h1:cR77jAZG3Y3bsb8hF6fHJbFoyFukLFOkQ98S0pQz3xw=
github.com/jsimonetti/rtnetlink v0.0.0-20201220180245-69540ac93943/go.mod h1:z4c53zj6Eex712ROyh8WI0ihysb5j2ROyV42iNogmAs=
github.com/jsimonetti/rtnetlink v0.0.0-20210122163228-8d122574c736/go.mod h1:ZXpIyOK59ZnN7J0BV99cZUPmsqDRZ3eq5X+st7u/oSA=
github.com/jsimonetti/rtnetlink v0.0.0-20210212075122-66c871082f2b h1:c3NTyLNozICy8B4mlMXemD3z/gXgQzVXZS/HqT+i3do=
github.com/jsimonetti/rtnetlink v0.0.0-20210212075122-66c871082f2b/go.mod h1:8w9Rh8m+aHZIG69YPGGem1i5VzoyRC8nw2kA8B+ik5U=
github.com/kr/pretty v0.2.1 h1:Fmg33tUaq4/8ym9TJN1x7sLJnHVwhP33CNkpYV/7rwI=
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/mdlayher/ethtool v0.0.0-20210210192532-2b88debcdd43 h1:WgyLFv10Ov49JAQI/ZLUkCZ7VJS3r74hwFIGXJsgZlY=
github.com/mdlayher/ethtool v0.0.0-20210210192532-2b88debcdd43/go.mod h1:+t7E0lkKfbBsebllff1xdTmyJt8lH37niI6kwFk9OTo=
github.com/mdlayher/genetlink v1.0.0 h1:OoHN1OdyEIkScEmRgxLEe2M9U8ClMytqA5niynLtfj0=
github.com/mdlayher/genetlink v1.0.0/go.mod h1:0rJ0h4itni50A86M2kHcgS85ttZazNt7a8H2a2cw0Gc=
github.com/mdlayher/netlink v0.0.0-20190409211403-11939a169225/go.mod h1:eQB3mZE4aiYnlUsyGGCOpPETfdQq4Jhsgf1fk3cwQaA=
github.com/mdlayher/netlink v1.0.0/go.mod h1:KxeJAFOFLG6AjpyDkQ/iIhxygIUKD+vcwqcnu43w/+M=
github.com/mdlayher/netlink v1.1.0/go.mod h1:H4WCitaheIsdF9yOYu8CFmCgQthAPIWZmcKp9uZHgmY=
github.com/mdlayher/netlink v1.1.1/go.mod h1:WTYpFb/WTvlRJAyKhZL5/uy69TDDpHHu2VZmb2XgV7o=
github.com/mdlayher/netlink v1.2.0/go.mod h1:kwVW1io0AZy9A1E2YYgaD4Cj+C+GPkU6klXCMzIJ9p8=
github.com/mdlayher/netlink v1.2.1/go.mod h1:bacnNlfhqHqqLo4WsYeXSqfyXkInQ9JneWI68v1KwSU=
github.com/mdlayher/netlink v1.2.2-0.20210123213345-5cc92139ae3e/go.mod h1:bacnNlfhqHqqLo4WsYeXSqfyXkInQ9JneWI68v1KwSU=
github.com/mdlayher/netlink v1.3.0/go.mod h1:xK/BssKuwcRXHrtN04UBkwQ6dY9VviGGuriDdoPSWys=
github.com/mdlayher/netlink v1.4.0 h1:n3ARR+Fm0dDv37dj5wSWZXDKcy+U0zwcXS3zKMnSiT0=
github.com/mdlayher/netlink v1.4.0/go.mod h1:dRJi5IABcZpBD2A3D0Mv/AiX8I9uDEu5oGkAVrekmf8=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/shirou/w32 v0.0.0-20160930032740-bb4de0191aa4 h1:udFKJ0aHUL60LboW/A+DfgoHVedieIzIXE8uylPue0U=
github.com/shirou/w32 v0.0.0-20160930032740-bb4de0191aa4/go.mod h1:qsXQc7+bwAM3Q1u/4XEfrquwF8Lw7D7y5cD8CuHnfIc=
github.com/sirupsen/logrus v1.8.1/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/vishvananda/netlink v1.1.0 h1:1iyaYNBLmP6L0220aDnYQpo1QEV4t4hJ+xEEhhJH8j0=
github.com/vishvananda/netlink v1.1.0/go.mod h1:cTgwzPIzzgDAYoQrMm0EdrjRUBkTqKYppBueQtXaqoE=
github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df h1:OviZH7qLw/7ZovXvuNyL3XQl8UFofeikI1NW1Gypu7k=
github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df/go.mod h1:JP3t17pCcGlemwknint6hfoeCVQrEMVwxRLRjXpq+BU=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20191007182048-72f939374954/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20201010224723-4f7140c49acb/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.0.0-20201216054612-986b41b23924/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20201224014010-6772e930b67b/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20210119194325-5f4716e94777 h1:003p0dJM77cxMSyCPFphvZf/Y5/NXf5fzg6ufd1/Oew=
golang.org/x/net v0.0.0-20210119194325-5f4716e94777/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190411185658-b44545bcd369/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190606203320-7fc4e5ec1444/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190826190057-c7b8b68b1456/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20191008105621-543471e840be/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201009025420-dfb3f7c4e634/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201118182958-a01c418693c7/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201218084310-7d0127a74742/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210110051926-789bb1bd4061/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210123111255-9b0068b26619/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210216163648-f7da38b97c65/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210906170528-6f6e22806c34/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20210921065528-437939a70204 h1:JJhkWtBuTQKyz2bd5WG9H8iUsJRU3En/KRfN8B2RnDs=
golang.org/x/sys v0.0.0-20210921065528-437939a70204/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=

190
code/ebpf-xdp-tc/main.go Normal file
View file

@ -0,0 +1,190 @@
package main
import (
	"bytes"
	"context"
	_ "embed"
	"errors"
	"fmt"
	"log"
	"net/http"
	"os"
	"os/signal"
	"syscall"

	manager "github.com/DataDog/ebpf-manager"
	"github.com/cilium/ebpf"
	"github.com/cilium/ebpf/features"
)
var (
// ebpfTCProgram holds the compiled eBPF object embedded at build time.
//go:embed ebpf/bin/probe.o
ebpfTCProgram []byte
// xdpMonitorPerf is the XDP probe that reports packets via the perf event array.
// main appends it to the manager only when ring buffers are not supported.
xdpMonitorPerf = &manager.Probe{
ProbeIdentificationPair: manager.ProbeIdentificationPair{
EBPFSection: "xdp/perf",
EBPFFuncName: "xdp_ingress_perf",
},
Ifname: "eth0",
NetworkDirection: manager.Ingress,
}
// xdpMonitorRing is the XDP probe that reports packets via the ring buffer.
// main appends it to the manager when ring buffers are supported.
xdpMonitorRing = &manager.Probe{
ProbeIdentificationPair: manager.ProbeIdentificationPair{
EBPFSection: "xdp/ring",
EBPFFuncName: "xdp_ingress_ring",
},
Ifname: "eth0",
NetworkDirection: manager.Ingress,
}
// mgr manages the two TC classifiers on eth0; one of the XDP probes above
// is added to Probes in main before the manager is initialized.
mgr = &manager.Manager{
Probes: []*manager.Probe{
{
ProbeIdentificationPair: manager.ProbeIdentificationPair{
EBPFSection: "classifier/egress",
EBPFFuncName: "egress",
},
Ifname: "eth0",
NetworkDirection: manager.Egress,
},
{
ProbeIdentificationPair: manager.ProbeIdentificationPair{
EBPFSection: "classifier/ingress",
EBPFFuncName: "ingress",
},
Ifname: "eth0",
NetworkDirection: manager.Ingress,
},
},
}
)
type (
// packetMonitorMode selects which reader implementation main creates.
packetMonitorMode uint8
// packetReader is the common interface of the ring buffer and perf event readers:
// Read returns the next observed packet, Close releases the underlying reader.
packetReader interface {
Read() (*Packet, error)
Close() error
}
)
const (
// packetMonitorModeRing (0, the zero value) reads packets from the ring buffer map.
packetMonitorModeRing packetMonitorMode = iota
// packetMonitorModePerfEvent reads packets from the perf event array.
packetMonitorModePerfEvent
)
// main loads and attaches the eBPF programs, starts the demo HTTP server and
// logs observed packets until the process is asked to terminate.
//
// The XDP monitor variant is picked at runtime: the ring buffer based program
// is used when the kernel supports ring buffer maps, the perf event based one
// otherwise. The variant that is not used is excluded from loading.
func main() {
	var monitorMode packetMonitorMode

	// SIGKILL (os.Kill) can never be trapped, so registering it has no effect.
	// SIGTERM is handled instead so a regular stop request still runs the
	// manager cleanup below.
	ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
	defer cancel()

	mgrOpts := manager.Options{
		ExcludedFunctions: nil,
		RLimit:            nil,
	}

	if err := features.HaveMapType(ebpf.RingBuf); err != nil {
		if errors.Is(err, ebpf.ErrNotSupported) {
			log.Println("Falling back to perf event reader")
			mgr.Probes = append(mgr.Probes, xdpMonitorPerf)
			monitorMode = packetMonitorModePerfEvent
			mgrOpts.ExcludedFunctions = append(mgrOpts.ExcludedFunctions, xdpMonitorRing.EBPFFuncName)
		} else {
			log.Fatalf("God knows what happened: %v\n", err)
		}
	} else {
		log.Println("Using fancy new ringbuf reader")
		mgr.Probes = append(mgr.Probes, xdpMonitorRing)
		monitorMode = packetMonitorModeRing
		mgrOpts.ExcludedFunctions = append(mgrOpts.ExcludedFunctions, xdpMonitorPerf.EBPFFuncName)
	}

	if err := mgr.InitWithOptions(bytes.NewReader(ebpfTCProgram), mgrOpts); err != nil {
		log.Fatalf("Failed to init manager: %v", err)
	}

	if err := mgr.Start(); err != nil {
		log.Fatalf("Failed to start manager: %v", err)
	}

	runHTTPServer()

	var (
		reader    packetReader
		readerErr error
	)
	switch monitorMode {
	case packetMonitorModeRing:
		if reader, readerErr = createRingBufReader(mgr); readerErr != nil {
			readerErr = fmt.Errorf("Failed to create ringbuf reader: %w", readerErr)
		}
	case packetMonitorModePerfEvent:
		if reader, readerErr = createPerfEventReader(mgr); readerErr != nil {
			readerErr = fmt.Errorf("Failed to create perf_event reader: %w", readerErr)
		}
	}

	if readerErr != nil {
		// The programs are already attached at this point; detach them before
		// exiting instead of leaving them behind on the interface.
		if err := mgr.Stop(manager.CleanAll); err != nil {
			log.Printf("Failed to stop manager: %v", err)
		}
		log.Fatalf("%v\n", readerErr)
	}

	go logEventsFromReader(ctx, reader)

	<-ctx.Done()

	if err := mgr.Stop(manager.CleanAll); err != nil {
		log.Fatalf("Failed to stop manager: %v", err)
	}
}
// logEventsFromReader logs every packet returned by reader until ctx is done.
//
// Read blocks while no packet is available, so checking ctx between reads is
// not enough to stop: a helper goroutine closes the reader once ctx is
// cancelled, which makes the pending Read return. Read errors that occur while
// ctx is still active are logged and reading continues.
func logEventsFromReader(ctx context.Context, reader packetReader) {
	log.Println("Waiting for received packets")

	go func() {
		<-ctx.Done()
		// Not fatal: the process is shutting down anyway and main still has
		// to stop the manager.
		if err := reader.Close(); err != nil {
			log.Printf("Failed to close reader: %v\n", err)
		}
	}()

	for {
		pkt, err := reader.Read()
		if ctx.Err() != nil {
			// Shutdown requested; an error here is just the closed reader.
			return
		}
		if err != nil {
			log.Printf("Error occurred while reading packet: %v\n", err)
			continue
		}
		log.Println(pkt)
	}
}
// createRingBufReader looks up the "ring_observed_packets" map in mgr and
// wraps it in a ring buffer reader. It fails when the lookup returns an error
// or the map is not present.
func createRingBufReader(mgr *manager.Manager) (packetReader, error) {
	ringMap, found, err := mgr.GetMap("ring_observed_packets")
	if err != nil {
		return nil, err
	}
	if !found {
		return nil, fmt.Errorf("ring_observed_packets map not loaded")
	}
	return NewRingBufReader(ringMap)
}
// createPerfEventReader looks up the "perf_observed_packets" map in mgr and
// wraps it in a perf event reader with a per-CPU buffer size of 8. It fails
// when the lookup returns an error or the map is not present.
func createPerfEventReader(mgr *manager.Manager) (packetReader, error) {
	perfMap, found, err := mgr.GetMap("perf_observed_packets")
	if err != nil {
		return nil, err
	}
	if !found {
		return nil, errors.New("perf_observed_packets map not loaded")
	}
	return NewPerfEventReader(perfMap, 8)
}
// runHTTPServer starts a background HTTP server on 0.0.0.0:80 that answers
// every request with status 200 and "Hello, world!". It returns immediately;
// a serve error other than http.ErrServerClosed is logged from the goroutine.
func runHTTPServer() {
	log.Println("Listening on: 0.0.0.0:80")

	hello := http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		log.Println("Handling request")
		w.WriteHeader(http.StatusOK)
		_, _ = w.Write([]byte("Hello, world!"))
	})

	go func() {
		err := http.ListenAndServe("0.0.0.0:80", hello)
		if err == nil || errors.Is(err, http.ErrServerClosed) {
			return
		}
		log.Printf("Error serving HTTP: %v", err)
	}()
}

View file

@ -0,0 +1,39 @@
package main
import (
"bytes"
"encoding/binary"
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/perf"
)
// NewPerfEventReader creates a PerfEventReader on top of the perf event array m,
// passing perCPUBufferSize through to perf.NewReader. It returns a nil reader
// together with the error when the underlying reader cannot be created.
func NewPerfEventReader(m *ebpf.Map, perCPUBufferSize int) (reader *PerfEventReader, err error) {
	inner, err := perf.NewReader(m, perCPUBufferSize)
	if err != nil {
		return nil, err
	}
	return &PerfEventReader{reader: inner}, nil
}
// PerfEventReader reads observed packets from a perf event array.
type PerfEventReader struct {
	reader *perf.Reader
}

// Read fetches the next perf record and decodes its raw sample as a
// little-endian observedPacket. Errors from the underlying reader or from
// decoding are returned unchanged.
func (r *PerfEventReader) Read() (*Packet, error) {
	rec, err := r.reader.Read()
	if err != nil {
		return nil, err
	}

	var raw observedPacket
	if err = binary.Read(bytes.NewReader(rec.RawSample), binary.LittleEndian, &raw); err != nil {
		return nil, err
	}
	return raw.ToPacket(), nil
}

// Close closes the underlying perf reader.
func (r *PerfEventReader) Close() error {
	return r.reader.Close()
}

View file

@ -0,0 +1,39 @@
package main
import (
"bytes"
"encoding/binary"
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/ringbuf"
)
// NewRingBufReader creates a RingBufReader on top of the ring buffer map m.
// It returns a nil reader together with the error when the underlying reader
// cannot be created.
func NewRingBufReader(m *ebpf.Map) (reader *RingBufReader, err error) {
	inner, err := ringbuf.NewReader(m)
	if err != nil {
		return nil, err
	}
	return &RingBufReader{reader: inner}, nil
}
// RingBufReader reads observed packets from a BPF ring buffer map.
type RingBufReader struct {
	reader *ringbuf.Reader
}

// Read fetches the next ring buffer record and decodes its raw sample as a
// little-endian observedPacket. Errors from the underlying reader or from
// decoding are returned unchanged.
func (r *RingBufReader) Read() (*Packet, error) {
	rec, err := r.reader.Read()
	if err != nil {
		return nil, err
	}

	var raw observedPacket
	if err = binary.Read(bytes.NewReader(rec.RawSample), binary.LittleEndian, &raw); err != nil {
		return nil, err
	}
	return raw.ToPacket(), nil
}

// Close closes the underlying ring buffer reader.
func (r *RingBufReader) Close() error {
	return r.reader.Close()
}

58
code/ebpf-xdp-tc/types.go Normal file
View file

@ -0,0 +1,58 @@
package main
import (
"encoding/binary"
"net"
)
// Protocol identifies the transport protocol of an observed packet.
type Protocol uint8
const (
// ProtocolTCP is 0, matching TCP in the eBPF side's transport_proto enum.
ProtocolTCP Protocol = iota
// ProtocolUDP is 1, matching UDP in the eBPF side's transport_proto enum.
ProtocolUDP
)
// Packet is the decoded, user-facing form of an observed packet,
// produced by observedPacket.ToPacket.
type Packet struct {
SourceIP net.IP
DestIP net.IP
SourcePort uint16
DestPort uint16
Transport Protocol
}
// observedPacket is the raw record decoded with binary.Read from the samples
// emitted by the eBPF programs; its field order follows struct observed_packet.
// NOTE(review): Transport is a single byte here while the C side uses an enum,
// which is presumably wider - decoding then only consumes the enum's first byte; confirm.
type observedPacket struct {
SourceIP uint32
DestIP uint32
SourcePort uint16
DestPort uint16
Transport Protocol
}
// ToPacket converts the raw record into a Packet: both addresses are turned
// into net.IP values via int2ip, ports and transport are copied as they are.
func (p *observedPacket) ToPacket() *Packet {
	pkt := new(Packet)
	pkt.SourceIP = int2ip(p.SourceIP)
	pkt.DestIP = int2ip(p.DestIP)
	pkt.SourcePort = p.SourcePort
	pkt.DestPort = p.DestPort
	pkt.Transport = p.Transport
	return pkt
}
// int2ip returns the 4-byte IPv4 address whose octets are the little-endian
// bytes of nn (lowest byte first).
func int2ip(nn uint32) net.IP {
	octets := make([]byte, net.IPv4len)
	binary.LittleEndian.PutUint32(octets, nn)
	return net.IP(octets)
}
func ip2int(ip net.IP) uint32 {
b := make([]byte, net.IPv4len)
copy(b, ip.To4())
reverse(b)
return binary.BigEndian.Uint32(b)
}
func reverse(input []byte) {
for i := 0; i < len(input)/2; i++ {
input[i], input[len(input)-1-i] = input[len(input)-1-i], input[i]
}
}

View file

@ -2,7 +2,7 @@
<name>containers</name>
<uuid>929b7b7d-bd82-452d-96b7-12f0cf1a4b17</uuid>
<bridge name='conbr0' stp='on' delay='0'/>
<mac address='af:af:13:ed:c6:41'/>
<mac address='00:30:c4:86:c9:1c'/>
<ip address='10.10.1.42' netmask='255.255.255.0'>
<dhcp>
<range start='10.10.1.100' end='10.10.1.150'/>

View file

@ -13,6 +13,7 @@ author = "Peter Kurfer"
style= "auto"
copyCodeButton = true
rssAsSocialIcon = true
custom_css = ["css/custom.css"]
[[menu.main]]
identifier = "about"

334
content/post/go-ebpf-tc.md Normal file
View file

@ -0,0 +1,334 @@
+++
author = "Peter Kurfer"
title = "eBPF traffic control a.k.a tc with Go"
date = "2022-02-24"
description = "Build your own DNAT with eBPF traffic control and Go"
tags = [
"golang",
"ebpf",
"tc"
]
+++
While working on my 'pet project' [INetMock](https://gitlab.com/inetmock/inetmock) I realized early that it would be amazing to be able to monitor traffic that 'flows by' to see not only traffic I could actually handle but also to get some information which high ports were requested.
Unfortunately back then I was only aware of PCAP capturing and it seemed rather complicated to implement what I wanted on top of PCAPs - or at least rather computation intense.
## eBPF to the rescue
At work I'm part of the platform team maintaining a bunch of Kubernetes clusters and this is where I first stumbled upon eBPF when we migrated from Calico to Cilium.
So you might ask what is eBPF?!
Very short answer: [RTFM](https://ebpf.io/what-is-ebpf) :nerd:
eBPF is the abbreviation for 'Extended Berkeley Packet Filter'.
While originally intended to be used for network traffic analysis it got some major upgrades over the past years so that you can now:
* monitor/control syscalls like [Falco](https://sysdig.com/opensource/falco/) is doing it
* implement a very mighty [keylogger](https://arighi.blogspot.com/2018/12/linux-easy-keylogger-with-ebpf.html)
* [build DDoS attack prevention systems](https://blog.cloudflare.com/l4drop-xdp-ebpf-based-ddos-mitigations/)
* [L4 load balancers](https://blog.cloudflare.com/unimog-cloudflares-edge-load-balancer/)
* [Making a firewall using eBPFs and cgroups](https://nfil.dev/coding/security/ebpf-firewall-with-cgroups/)
* ...
After some initial research I figured using XDP (eXpress Data Path) would be best to implement the kind of monitoring I had in mind.
XDP is also used by Cloudflare for their DDoS prevention system and for their load balancer hence what's good enough for them could be good enough for me (irony intended).
So I was looking for some 'examples' a.k.a. code I can copy and adapt to get started quickly.
I stumbled upon a project that monitored the API server traffic in a Kubernetes cluster and this was a lot like what I wanted to do!
At least it looked like it at this point.
It took me a few evenings to get my prototype working and I read a lot more about the topic whenever possible.
Coincidentally we had a firewall/network misconfiguration issue at work at the same time when I tried to get my first prototype live in the staging environment and it hit me:
why not use eBPF/XDP not only for monitoring but also to get rid of `iptables` and build my own 'firewall'! :heart_eyes:
Don't misunderstand me, `iptables` is a perfectly fine and perfectly working solution and it served me well!
But it's always a bit annoying when I start the server and forgot to run the ominous `iptables` command I originally copied from the _INetSim_ docs that takes care of the DNATing and suddenly nothing works.
## 1st approach: XDP
To be honest: I used and configured both SNAT and DNAT often in the past and I had a basic idea how this all works but...this 'tiny bit of knowledge' rapidly crumbled into nothing as soon as I tried to build it myself.
The idea is simple.
In general the communication of a client with an external server looks like this:
{{<mermaid>}}
%%{init: {'theme': 'neutral'}}%%
flowchart LR
client([Client]) --> |IP packet|router{{Router}}
router --> |IP packet|srv>Server]
srv -->|response|router
router-->|response|client
{{< /mermaid>}}
(Ignoring everything that is actually required like DNS, TCP handshakes,...)
What I wanted to achieve looks like this:
{{<mermaid>}}
%%{init: {'theme': 'neutral'}}%%
flowchart LR
client([Client]) --> |IP packet|router{{Router}}
router --> |modified IP packet|router
router-->|faked response|client
{{< /mermaid>}}
I already knew it's possible and even intended to re-route packets with XDP because that's exactly what Cloudflare is doing with Unimog.
I tried to find some more examples on how to redirect traffic and at this point I realized how much I **did/do** not know:
* Do I need to update any checksums?
* If I only modify the IP header, do I also need to fix TCP/UDP checksums?
* Is modifying the IP enough or do I also need to modify the destination MAC address of the ethernet packet?
And I bet there's even more I still haven't even thought about...
It's kind of hard to admit but it actually took me a lot of time to realize why my XDP approach wasn't working and never will work in the setup I have in mind.
One disadvantage of diving directly into the code and focusing on examples is: you've no idea what you're actually doing!
XDP is so great because depending on your hardware it is even possible to execute the program directly on the NIC but one huge disadvantage (for me) is, that it only captures **_ingress_** traffic!
So I perfectly screwed up because my whole traffic was within a single network segment and looked like in the following diagram.
I'm now using a sequence diagram to make things a bit more explicit.
Assuming the following network configuration:
{{< table "center" >}}
| Actor | IP |
| ------ | -------------- |
| Client | 192.168.178.51 |
| Router | 192.168.178.1 |
{{</ table >}}
{{<mermaid>}}
%%{init: {'theme': 'neutral'}}%%
sequenceDiagram
Note over Client,Router: .51:34578 &rarr; 1.2.3.4:80
Client->>Router: TCP SYN
Note over Client,Router: .51:34578 &rarr; 192.168.178.1:80
Router->>Router: Redirect to 192.168.178.1:80
Note over Client,Router: .51:34578 &larr; .1:80
Router->>Client: SYN-ACK
{{< /mermaid>}}
But the `Client` didn't try to connect to `192.168.178.1` hence it won't accept the packet.
While this is kind of obvious it took me quite some time to get my head around this.
Not only but also because it's hard to observe this if you're using XDP because XDP forwarded/modified packets are not included in a PCAP.
Fortunately there's [`xdp-dump`](https://github.com/xdp-project/xdp-tools) to get a better understanding what's going on.
Okay, so I've to maintain a mapping of the original 4-tuple to my modified one to be able to restore the original source after the network stack handled the packet.
eBPF has a few different map types to store data between program invocations (I'll cover that later) so this wasn't a problem.
But now it hit me, with XDP I cannot manipulate egress (outgoing) packets.
So XDP's a dead end for this use case (although perfectly fine for the monitoring!).
## What if XDP is not enough?
Another round of 'research' revealed there are even more points in the network stack already where I could attach eBPF programs.
Every trace point has slightly different options (and therefore each of them makes sense to be there).
There are:
* `BPF_PROG_TYPE_XDP`: earliest possible point to get your hands on ingress traffic, can be used to monitor (and pass) incoming traffic, drop or redirect traffic
* `BPF_PROG_TYPE_SOCKET_FILTER`: drop or trim packets at socket level (much later than with XDP)
* `BPF_PROG_TYPE_CGROUP_SOCK`: much like XDP but within network cgroups
* ...
_A full list of program types can be found in [include/uapi/linux/bpf.h](https://github.com/torvalds/linux/blob/master/include/uapi/linux/bpf.h#L920) in the Linux kernel source._
A good introduction into the different types of eBPF programs can be found [here](https://blogs.oracle.com/linux/post/bpf-a-tour-of-program-types) (can't believe I'm linking an Oracle document).
All of the aforementioned options have in common that they can be used to filter packets but what I needed was an option to _modify egress_ packets.
AFAIK the only available option for this is `tc` (a.k.a. traffic control) which is Linux' QoS system.
Even though you get a lot of 'high level' information about what it is, that it supports (e)BPF and that there's the `tc` CLI to interact with it - also lots of examples how to use the CLI - I could barely find a library or documentation about the API that doesn't require shelling out.
Finally I found DataDog's [ebpf-manager](https://github.com/DataDog/ebpf-manager/) which uses Cilium's [ebpf](https://github.com/cilium/ebpf) and [go-tc](https://github.com/florianl/go-tc) to attach eBPF programs to the `tc` subsystem.
Not only that, it also comes with a pretty handy manager layer that makes working with eBPF a charm.
## `tc` in action
From now I'd like to dig a bit into the source code - all sources can be found [in the repo](https://github.com/baez90/baez90.github.io) under `code/ebpf-xdp-tc`.
The setup for experimenting looks like this:
{{<mermaid>}}
%%{init: {'theme': 'neutral'}}%%
flowchart LR
subgraph libvirtNet [isolated Libvirt network]
vm["Windows VM"] --> container["Podman container"]
end
{{< /mermaid>}}
and is based on my post on how to [join a Podman container to a Libvirt network]({{< relref "libvirt-podman-network-mesh.md" >}}).
The resulting workflow should be looking like so:
{{<mermaid>}}
%%{init: {'theme': 'neutral'}}%%
sequenceDiagram
Client->>Ingress: IP packet
Ingress->>Ingress: Rewrite packet and store original destination
Ingress->>Server: Forward packet
Server->>Egress: Intercept response packet
Egress->>Egress: Restore original destination as source
Egress->>Client: Forward packet
{{< /mermaid>}}
So in short words: the client sends an IP packet to an IP outside of the local network hence it will be sent to the gateway (which happens to be my Podman container).
The eBPF program attached to the `tc` ingress hook takes the incoming packet, rewrites its destination to the local IP of the Podman container and passes the modified packet on.
In my experiment I'm using a simple HTTP server to respond to every HTTP request with a plain text message and a status code `200`.
The network stack processes the packet and replies e.g. with a `SYN-ACK` to the client's IP address.
The eBPF program attached to the `tc` egress hook intercepts the response, restores the original destination as source IP based on the client's IP address and TCP/UDP port and forwards the packet.
### Ingress traffic
Every eBPF program needs a section identifier and has to fulfill some constraints to pass the verifier.
Currently this means for instance that unbounded loops are not allowed to ensure the program has a guaranteed end.
The simplest ingress hook would look like this:
```c
#include <linux/bpf.h>
#include <linux/pkt_cls.h>
// Minimal tc ingress hook: unconditionally returns TC_ACT_SHOT,
// i.e. every packet that reaches the hook is dropped.
SEC("classifier/ingress")
int ingress(struct __sk_buff *skb) {
return TC_ACT_SHOT;
}
```
This program would simply drop every incoming packet but it's a valid program.
The parameter - in this case `struct __sk_buff *skb` - depends on the trace point.
The `skb` is the most powerful one and in theory it's even possible to extract HTTP parameters out of it.
See for example [this article](http://vger.kernel.org/~davem/skb_data.html) for further details.
XDP programs are receiving another parameter that is a bit more generic but also less expensive to initialize.
Either way, you can easily (and rather cheap) extract the different parts of the IP packet just with a few lines of code.
A quick reminder what an IP packet looks like:
{{< figure
src="https://upload.wikimedia.org/wikipedia/commons/3/3b/UDP_encapsulation.svg"
link="https://commons.wikimedia.org/wiki/File:UDP_encapsulation.svg"
target="__blank"
caption="en:User:Cburnett original work, colorization by en:User:Kbrose, [CC BY-SA 3.0](http://creativecommons.org/licenses/by-sa/3.0/), via Wikimedia Commons"
title="UDP encapsulation"
>}}
In my simplified case I assume the `Frame header` will be a `struct ethhdr`, followed by the `struct iphdr` and then either a `struct udphdr` or a `struct tcphdr`.
To satisfy the verifier you've to validate after every cast that you haven't already reached the end of the current `skb` to ensure memory safety which is both a bit annoying and very reassuring because memory issues are avoided right away.
So assuming we just want to 'print' the source and destination address of every packet reaching our ingress hook we would do the following:
```c
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/pkt_cls.h>
// tc ingress hook that logs source and destination address of every IPv4
// packet via bpf_printk and lets all traffic pass (always TC_ACT_OK).
SEC("classifier/ingress")
int ingress(struct __sk_buff *skb) {
    void *data = (void *) (long) skb->data;
    void *data_end = (void *) (long) skb->data_end;

    struct ethhdr *eth = data;
    // too short to contain an ethernet header
    if ((void *) eth + sizeof(*eth) > data_end) {
        return TC_ACT_OK;
    }

    // h_proto is stored in network byte order, hence the constant has to be
    // converted before comparing. Only IPv4 is handled because the payload is
    // interpreted as a struct iphdr below; everything else (ARP, IPv6, ...)
    // just passes.
    if (eth->h_proto != __constant_htons(ETH_P_IP)) {
        return TC_ACT_OK;
    }

    struct iphdr *iph = data + sizeof(*eth);
    // the verifier requires a bounds check before the header is accessed
    if ((void *) iph + sizeof(*iph) > data_end) {
        return TC_ACT_OK;
    }

    bpf_printk("Packet from %d to %d\n", iph->saddr, iph->daddr);
    return TC_ACT_OK;
}
```
That's already a bit more code, isn't it?
So we start by capturing the beginning and the end of the packet.
As mentioned previously it's possible to 'extract' the individual parts of the packet just by casting the right offsets.
Of course some additional sanity checks are necessary to make sure we don't misinterpret anything.
For instance if the current packet is not an IP packet but probably an ARP packet we just let it pass.
What's worth mentioning is the `bpf_printk` macro because it's particularly useful for debugging - it's slower than other options but pretty easy to use.
To get the message we're sending with it you can simply do
```sh
sudo cat /sys/kernel/debug/tracing/trace_pipe
```
and you're good to go!
#### eBPF maps
So remembering the sequence diagram above we are already almost finished regarding the parsing of the required information but how do we store the gathered information?
eBPF comes with a bunch of different maps.
All types can be found in [include/uapi/linux/bpf.h](https://github.com/torvalds/linux/blob/master/include/uapi/linux/bpf.h#L878) or with more details [here](https://prototype-kernel.readthedocs.io/en/latest/bpf/ebpf_maps_types.html).
Some of them are equivalent to ordinary data structures most developers are used to like:
* `BPF_MAP_TYPE_ARRAY` behaves like an ordinary array
* `BPF_MAP_TYPE_HASH` behaves like an ordinary map/dictionary
but others like `BPF_MAP_TYPE_PERF_EVENT_ARRAY` or `BPF_MAP_TYPE_RINGBUF` are rather special.
For now we're focusing on `BPF_MAP_TYPE_HASH` because we can use it to store the `orig-src` &rarr; `orig-dest` mapping.
To store data in a map and load it later on eBPF exposes some [bpf-helpers](https://www.man7.org/linux/man-pages/man7/bpf-helpers.7.html):
* `long bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags)` to store data in a map
* `void *bpf_map_lookup_elem(struct bpf_map *map, const void *key)` to load data
An extension of the previous program could look like so:
```c
/*
... includes
*/
// Mask applied to iph->frag_off to detect fragments. 65343 == 0xFF3F, which
// is 0x3FFF with its two bytes swapped.
// NOTE(review): presumably "more fragments" flag + fragment offset in network
// byte order as seen on a little-endian host - confirm.
#define IP_FRAGMENTED 65343
// IP address and port of one side of a connection; used as both key
// (source) and value (original destination) of the conn_track map
struct two_tuple {
__u32 ip;
__u16 port;
__u16 _pad; // explicit padding: 4 + 2 + 2 bytes keeps the struct size a multiple of 4
};
// Connection tracking table: maps the source (IP + port) of a packet to the
// destination it was originally sent to. Holds at most 1024 entries.
struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    // __type() expects the type itself, not its size
    __type(key, struct two_tuple);
    __type(value, struct two_tuple);
    __uint(max_entries, 1024);
} conn_track SEC(".maps");
// tc ingress hook that records the original destination of every TCP packet
// in conn_track, keyed by the packet's source IP and port.
// Returns TC_ACT_OK for everything except TCP packets whose header does not
// fit into the buffer, which are dropped (TC_ACT_SHOT).
SEC("classifier/ingress")
int ingress(struct __sk_buff *skb) {
    // ... previous logic

    // do not support fragmented packets as L4 headers may be missing
    if (iph->frag_off & IP_FRAGMENTED) {
        return TC_ACT_OK;
    }

    // only TCP is tracked, everything else passes untouched
    if (iph->protocol != IPPROTO_TCP) {
        return TC_ACT_OK;
    }

    // NOTE(review): assumes an IP header without options (fixed sizeof(*iph));
    // confirm whether iph->ihl has to be honoured here.
    struct tcphdr *tcp = (void *) iph + sizeof(*iph);
    if ((void *) tcp + sizeof(*tcp) > data_end) {
        return TC_ACT_SHOT;
    }

    struct two_tuple src = {
        .ip = iph->saddr,
        .port = tcp->source
    };

    struct two_tuple dst = {
        .ip = iph->daddr,
        .port = tcp->dest
    };

    // remember where this source originally wanted to connect to
    bpf_map_update_elem(&conn_track, &src, &dst, 0);

    // a non-void program must return an action on every path
    return TC_ACT_OK;
}
```

View file

@ -1 +1,4 @@
<!--for overriding-->
<!--for overriding-->
{{ range .Site.Params.custom_css -}}
<link rel="stylesheet" href="{{ . | absURL }}">
{{- end }}

View file

@ -0,0 +1,6 @@
{{ $htmlTable := .Inner | markdownify }}
{{ $class := .Get 0 }}
{{ $old := "<table>" }}
{{ $new := printf "<table class=\"%s\">" $class }}
{{ $htmlTable := replace $htmlTable $old $new }}
{{ $htmlTable | safeHTML }}

6
static/css/custom.css Normal file
View file

@ -0,0 +1,6 @@
/* Horizontally centers the element and adds 1.5em of vertical spacing. */
.center {
  margin: 1.5em auto;
}