diff --git a/.gitignore b/.gitignore index b16cc9c..8b57e6b 100644 --- a/.gitignore +++ b/.gitignore @@ -13,4 +13,7 @@ hugo.linux /.hugo_build.lock # IntelliJ project files -.idea/ \ No newline at end of file +.idea/ + +# Vagrant files +.vagrant/ diff --git a/code/ebpf-xdp-tc/.gitignore b/code/ebpf-xdp-tc/.gitignore new file mode 100644 index 0000000..2c71fd0 --- /dev/null +++ b/code/ebpf-xdp-tc/.gitignore @@ -0,0 +1,5 @@ +*.o +CMakeLists.txt + +out/ +cmake-build-debug/ \ No newline at end of file diff --git a/code/ebpf-xdp-tc/Dockerfile b/code/ebpf-xdp-tc/Dockerfile new file mode 100644 index 0000000..679601e --- /dev/null +++ b/code/ebpf-xdp-tc/Dockerfile @@ -0,0 +1,7 @@ +FROM docker.io/alpine:3.15 + +WORKDIR /app + +COPY out/ebpf-xdp-tc ./ + +ENTRYPOINT ["/app/ebpf-xdp-tc"] \ No newline at end of file diff --git a/code/ebpf-xdp-tc/Taskfile.yml b/code/ebpf-xdp-tc/Taskfile.yml new file mode 100644 index 0000000..5b057ee --- /dev/null +++ b/code/ebpf-xdp-tc/Taskfile.yml @@ -0,0 +1,58 @@ +# https://taskfile.dev + +version: '3' + +tasks: + default: + + build-ebpf: + cmds: + - mkdir -p ebpf/bin + - | + clang \ + -Wno-unused-value \ + -Wno-pointer-sign \ + -Wno-compare-distinct-pointer-types \ + -Wunused \ + -Wall \ + -fno-stack-protector \ + -fno-ident \ + -g \ + -O2 -emit-llvm \ + ebpf/main.c \ + -c -o - | llc -march=bpf -mcpu=probe -filetype=obj -o ebpf/bin/probe.o + + build-bin: + deps: + - build-ebpf + env: + GOOS: linux + GOARCH: amd64 + cmds: + - mkdir -p out/ + - go build -o out/ebpf-xdp-tc -trimpath -a -installsuffix=cgo -ldflags "-w -s -linkmode external -extldflags -static" ./ + + build-docker: + deps: + - build-bin + cmds: + - buildah bud -t ebpf-xdp-tc . 
/*
 * Internet (RFC 1071 style) ones'-complement checksum over `bufsz` bytes
 * starting at `buf`.
 *
 * The data is summed as 16-bit words in host representation; a trailing odd
 * byte is added as a single unsigned char. The carry is folded back into the
 * low 16 bits twice and the complement of the result is returned, truncated
 * to 16 bits by the return type.
 */
static inline unsigned short checksum(unsigned short *buf, int bufsz) {
    unsigned long acc = 0;
    int remaining;

    /* consume the buffer one 16-bit word at a time */
    for (remaining = bufsz; remaining > 1; remaining -= 2) {
        acc += *buf++;
    }

    /* odd length: the last byte is added on its own */
    if (remaining == 1) {
        acc += *(unsigned char *) buf;
    }

    /* fold the carries into the low 16 bits (two rounds, like the original) */
    acc = (acc >> 16) + (acc & 0xffff);
    acc = (acc >> 16) + (acc & 0xffff);

    return ~acc;
}
*data_end) { + struct tcphdr *hdr = iph + off; + if ((void *) hdr + sizeof(*hdr) > data_end) { + return NULL; + } + pkt->transport_proto = TCP; + pkt->sourcePort = bpf_ntohs(hdr->source); + pkt->destPort = bpf_ntohs(hdr->dest); + + return hdr; +} + +static inline struct udphdr *extract_udp_meta(struct observed_packet *pkt, void *iph, __u64 off, void *data_end) { + struct udphdr *hdr = iph + off; + if ((void *) hdr + sizeof(*hdr) > data_end) { + return NULL; + } + + pkt->transport_proto = UDP; + pkt->sourcePort = bpf_ntohs(hdr->source); + pkt->destPort = bpf_ntohs(hdr->dest); + return hdr; +} diff --git a/code/ebpf-xdp-tc/ebpf/main.c b/code/ebpf-xdp-tc/ebpf/main.c new file mode 100644 index 0000000..e71feea --- /dev/null +++ b/code/ebpf-xdp-tc/ebpf/main.c @@ -0,0 +1,222 @@ +#include +#include + +#include "helpers.h" + +#define IP_FRAGMENTED 65343 + +char LICENSE[] SEC("license") = "Dual MIT/GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); +} perf_observed_packets SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 1 << 24); +} ring_observed_packets SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, sizeof(struct two_tuple)); + __type(value, sizeof(struct two_tuple)); + __uint(max_entries, 1024); +} conn_track SEC(".maps"); + +SEC("classifier/egress") +int egress(struct __sk_buff *skb) { + bpf_printk("new packet captured on egress (TC)\n"); + void *data = (void *) (long) skb->data; + void *data_end = (void *) (long) skb->data_end; + + struct ethhdr *eth = data; + + if ((void *) eth + sizeof(*eth) > data_end) { + return TC_ACT_OK; + } + + if(eth->h_proto != ETH_P_IP && eth->h_proto != ETH_P_IPV6) { + return TC_ACT_OK; + } + + struct iphdr *iph = data + sizeof(*eth); + if ((void *) iph + sizeof(*iph) > data_end) { + return TC_ACT_OK; + } + + /* do not support fragmented packets as L4 headers may be missing */ + if (iph->frag_off & IP_FRAGMENTED) { + return TC_ACT_OK; + } + + if (iph->protocol != 
IPPROTO_TCP) { + bpf_printk("Packet's not TCP - forwarding"); + return TC_ACT_OK; + } + + struct tcphdr *tcp = (void *) iph + sizeof(*iph); + if ((void *) tcp + sizeof(*tcp) > data_end) { + return TC_ACT_SHOT; + } + + struct two_tuple dst = { + .ip = iph->daddr, + .port = tcp->dest + }; + + struct two_tuple *orig_src = bpf_map_lookup_elem(&conn_track, &dst); + + if (orig_src == NULL) { + bpf_printk("No translation found - pass it through"); + return TC_ACT_OK; + } + + bpf_printk("Restore original source IP"); + + iph->saddr = orig_src->ip; + tcp->source = orig_src->port; + + iph->tos = 7 << 2; + iph->check = 0; + iph->check = checksum((unsigned short *) iph, sizeof(struct iphdr)); + + return TC_ACT_OK; +}; + +SEC("classifier/ingress") +int ingress(struct __sk_buff *skb) { + bpf_printk("new packet captured on ingress (TC)"); + void *data = (void *) (long) skb->data; + void *data_end = (void *) (long) skb->data_end; + + struct ethhdr *eth = data; + + if ((void *) eth + sizeof(*eth) > data_end) { + return TC_ACT_OK; + } + + struct iphdr *iph = data + sizeof(*eth); + if ((void *) iph + sizeof(*iph) > data_end) { + return TC_ACT_OK; + } + + /* do not support fragmented packets as L4 headers may be missing */ + if (iph->frag_off & IP_FRAGMENTED) { + return TC_ACT_OK; + } + + if (iph->protocol != IPPROTO_TCP) { + bpf_printk("Packet's not TCP - forwarding"); + return TC_ACT_OK; + } + + if (iph->daddr == 16845322) { + bpf_printk("We're the destination - don't touch it"); + return TC_ACT_OK; + } + + struct tcphdr *tcp = (void *) iph + sizeof(*iph); + if ((void *) tcp + sizeof(*tcp) > data_end) { + return TC_ACT_SHOT; + } + + struct two_tuple src = { + .ip = iph->saddr, + .port = tcp->source + }; + + struct two_tuple dst = { + .ip = iph->daddr, + .port = tcp->dest + }; + + bpf_map_update_elem(&conn_track, &src, &dst, 0); + + bpf_printk("Forward packet to localhost (TC)"); + iph->daddr = 16845322; + iph->tos = 7 << 2; + iph->check = 0; + iph->check = checksum((unsigned short 
*) iph, sizeof(struct iphdr)); + return TC_ACT_OK; +}; + +static inline enum xdp_action extract_meta(struct xdp_md *ctx, struct observed_packet *pkt) { + void *data = (void *) (long) ctx->data; + void *data_end = (void *) (long) ctx->data_end; + struct ethhdr *eth = data; + __u16 proto; + + if (data + sizeof(struct ethhdr) > data_end) { + bpf_printk("Packet apparently not ethernet"); + return XDP_DROP; + } + + proto = eth->h_proto; + if (proto != bpf_htons(ETH_P_IP) && proto != bpf_htons(ETH_P_IPV6)) { + bpf_printk("Not an IP packet"); + return XDP_PASS; + } + + struct iphdr *iph = data + sizeof(*eth); + if ((void *) iph + sizeof(struct iphdr) > data_end) { + return XDP_DROP; + } + + /* do not support fragmented packets as L4 headers may be missing */ + if (iph->frag_off & IP_FRAGMENTED) { + return XDP_DROP; + } + + pkt->sourceIp = iph->saddr; + pkt->destIp = iph->daddr; + + __u8 ip_proto = iph->protocol; + + if (ip_proto == IPPROTO_TCP) { + struct tcphdr *tcph = extract_tcp_meta(pkt, (void *) iph, sizeof(struct iphdr), data_end); + // if ACK flag is set we just pass it through because it belongs to an already established connection + if (tcph == NULL || tcph->ack) { + return XDP_PASS; + } + } else if (ip_proto == IPPROTO_UDP) { + struct udphdr *udph = extract_udp_meta(pkt, (void *) iph, sizeof(struct iphdr), data_end); + // could also check if we're the source + if (udph == NULL) { + return XDP_PASS; + } + } + + return XDP_PASS; +} + +SEC("xdp/perf") +int xdp_ingress_perf(struct xdp_md *ctx) { + struct observed_packet pkt; + + enum xdp_action action = extract_meta(ctx, &pkt); + + if (pkt.destIp == 0 || pkt.sourceIp == 0) { + return action; + } + + if (!bpf_perf_event_output(ctx, &perf_observed_packets, BPF_F_CURRENT_CPU, &pkt, sizeof(struct observed_packet))) { + bpf_printk("Failed to submit observed packet"); + } + + return XDP_PASS; +} + +SEC("xdp/ring") +int xdp_ingress_ring(struct xdp_md *ctx) { + struct observed_packet pkt = {}; + + enum xdp_action action = 
/*
 * Metadata of a single packet seen by the XDP programs. Instances are filled
 * by extract_meta() and handed to user space through the perf event array or
 * the ring buffer.
 *
 * NOTE(review): the user-space decoder reads this struct field by field, so
 * field order and sizes must stay in sync with it - confirm before changing.
 */
struct observed_packet {
    __u32 sourceIp;   /* copied unchanged from iphdr.saddr (network byte order) */
    __u32 destIp;     /* copied unchanged from iphdr.daddr (network byte order) */
    __u16 sourcePort; /* L4 source port, converted with bpf_ntohs() */
    __u16 destPort;   /* L4 destination port, converted with bpf_ntohs() */
    /* set by extract_tcp_meta()/extract_udp_meta(); keeps its initial value
     * for any other protocol */
    enum {
        TCP,
        UDP
    } transport_proto;
};


/*
 * Address/port pair used as both key and value type of the conn_track map.
 * ip and port are stored exactly as found in the packet headers.
 */
struct two_tuple {
    __u32 ip;
    __u16 port;
    __u16 _pad; /* presumably explicit padding so the struct has no implicit
                 * padding bytes when used as a hash key - TODO confirm */
};
a/code/ebpf-xdp-tc/go.sum b/code/ebpf-xdp-tc/go.sum new file mode 100644 index 0000000..2a07c3e --- /dev/null +++ b/code/ebpf-xdp-tc/go.sum @@ -0,0 +1,114 @@ +github.com/DataDog/ebpf-manager v1.0.3 h1:zeuFyHmP4/m8uqx7LyLHkHKbmrDjKkk34bz324tBOlc= +github.com/DataDog/ebpf-manager v1.0.3/go.mod h1:05Y9FhEyILUdCovBthi5y4KPY8AfUg5EbMNC6RMQXDY= +github.com/DataDog/gopsutil v0.0.0-20200624212600-1b53412ef321 h1:OPAXA+r6yznoxWR5jQ2iTh5CvzIMrdw8AU0uFN2RwEw= +github.com/DataDog/gopsutil v0.0.0-20200624212600-1b53412ef321/go.mod h1:tGQp6XG4XpOyy67WG/YWXVxzOY6LejK35e8KcQhtRIQ= +github.com/StackExchange/wmi v0.0.0-20181212234831-e0a55b97c705 h1:UUppSQnhf4Yc6xGxSkoQpPhb7RVzuv5Nb1mwJ5VId9s= +github.com/StackExchange/wmi v0.0.0-20181212234831-e0a55b97c705/go.mod h1:3eOhrUMpNV+6aFIbp5/iudMxNCF27Vw2OZgy4xEx0Fg= +github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHSxpiH9JdtuBj0= +github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY= +github.com/cihub/seelog v0.0.0-20170130134532-f561c5e57575 h1:kHaBemcxl8o/pQ5VM1c8PVE1PubbNx3mjUr09OqWGCs= +github.com/cihub/seelog v0.0.0-20170130134532-f561c5e57575/go.mod h1:9d6lWj8KzO/fd/NrVaLscBKmPigpZpn5YawRPw+e3Yo= +github.com/cilium/ebpf v0.6.3-0.20210917122031-fc2955d2ecee h1:eg3Xm5uBYJLRDVq750EFFx9CHTOEFIH/MjLNNpyTS3Y= +github.com/cilium/ebpf v0.6.3-0.20210917122031-fc2955d2ecee/go.mod h1:/oI2+1shJiTGAMgl6/RgJr36Eo1jzrRcAWbcXO2usCA= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/florianl/go-tc v0.3.0 h1:qeqQB5kp2lwJP1p/8krLQIuRfkHWpiPPcYr3rhRSaC8= +github.com/florianl/go-tc v0.3.0/go.mod h1:Ni/GTSK8ymDnsRQfL2meJeGmcXy7RFIvchiVHizU76U= +github.com/frankban/quicktest v1.11.3 h1:8sXhOn0uLys67V8EsXLc6eszDs8VXWxL3iRvebPhedY= 
+github.com/frankban/quicktest v1.11.3/go.mod h1:wRf/ReqHper53s+kmmSZizM8NamnL3IM0I9ntUbOk+k= +github.com/go-ole/go-ole v1.2.4 h1:nNBDSCOigTSiarFpYE9J/KtEA1IOW4CNeqT9TQDqCxI= +github.com/go-ole/go-ole v1.2.4/go.mod h1:XCwSNxSkXRo4vlyPy93sltvi/qJq0jqQhjqQNIwKuxM= +github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= +github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.4 h1:L8R9j+yAqZuZjsqh/z+F1NCffTKKLShY6zXTItVIZ8M= +github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/UYA= +github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= +github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo= +github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM= +github.com/josharian/native v0.0.0-20200817173448-b6b71def0850 h1:uhL5Gw7BINiiPAo24A2sxkcDI0Jt/sqp1v5xQCniEFA= +github.com/josharian/native v0.0.0-20200817173448-b6b71def0850/go.mod h1:7X/raswPFr05uY3HiLlYeyQntB6OO7E/d2Cu7qoaN2w= +github.com/jsimonetti/rtnetlink v0.0.0-20190606172950-9527aa82566a/go.mod h1:Oz+70psSo5OFh8DBl0Zv2ACw7Esh6pPUphlvZG9x7uw= +github.com/jsimonetti/rtnetlink v0.0.0-20200117123717-f846d4f6c1f4/go.mod h1:WGuG/smIU4J/54PblvSbh+xvCZmpJnFgr3ds6Z55XMQ= +github.com/jsimonetti/rtnetlink v0.0.0-20201009170750-9c6f07d100c1/go.mod h1:hqoO/u39cqLeBLebZ8fWdE96O7FxrAsRYhnVOdgHxok= +github.com/jsimonetti/rtnetlink v0.0.0-20201216134343-bde56ed16391/go.mod h1:cR77jAZG3Y3bsb8hF6fHJbFoyFukLFOkQ98S0pQz3xw= +github.com/jsimonetti/rtnetlink v0.0.0-20201220180245-69540ac93943/go.mod h1:z4c53zj6Eex712ROyh8WI0ihysb5j2ROyV42iNogmAs= 
+github.com/jsimonetti/rtnetlink v0.0.0-20210122163228-8d122574c736/go.mod h1:ZXpIyOK59ZnN7J0BV99cZUPmsqDRZ3eq5X+st7u/oSA= +github.com/jsimonetti/rtnetlink v0.0.0-20210212075122-66c871082f2b h1:c3NTyLNozICy8B4mlMXemD3z/gXgQzVXZS/HqT+i3do= +github.com/jsimonetti/rtnetlink v0.0.0-20210212075122-66c871082f2b/go.mod h1:8w9Rh8m+aHZIG69YPGGem1i5VzoyRC8nw2kA8B+ik5U= +github.com/kr/pretty v0.2.1 h1:Fmg33tUaq4/8ym9TJN1x7sLJnHVwhP33CNkpYV/7rwI= +github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/mdlayher/ethtool v0.0.0-20210210192532-2b88debcdd43 h1:WgyLFv10Ov49JAQI/ZLUkCZ7VJS3r74hwFIGXJsgZlY= +github.com/mdlayher/ethtool v0.0.0-20210210192532-2b88debcdd43/go.mod h1:+t7E0lkKfbBsebllff1xdTmyJt8lH37niI6kwFk9OTo= +github.com/mdlayher/genetlink v1.0.0 h1:OoHN1OdyEIkScEmRgxLEe2M9U8ClMytqA5niynLtfj0= +github.com/mdlayher/genetlink v1.0.0/go.mod h1:0rJ0h4itni50A86M2kHcgS85ttZazNt7a8H2a2cw0Gc= +github.com/mdlayher/netlink v0.0.0-20190409211403-11939a169225/go.mod h1:eQB3mZE4aiYnlUsyGGCOpPETfdQq4Jhsgf1fk3cwQaA= +github.com/mdlayher/netlink v1.0.0/go.mod h1:KxeJAFOFLG6AjpyDkQ/iIhxygIUKD+vcwqcnu43w/+M= +github.com/mdlayher/netlink v1.1.0/go.mod h1:H4WCitaheIsdF9yOYu8CFmCgQthAPIWZmcKp9uZHgmY= +github.com/mdlayher/netlink v1.1.1/go.mod h1:WTYpFb/WTvlRJAyKhZL5/uy69TDDpHHu2VZmb2XgV7o= +github.com/mdlayher/netlink v1.2.0/go.mod h1:kwVW1io0AZy9A1E2YYgaD4Cj+C+GPkU6klXCMzIJ9p8= +github.com/mdlayher/netlink v1.2.1/go.mod h1:bacnNlfhqHqqLo4WsYeXSqfyXkInQ9JneWI68v1KwSU= +github.com/mdlayher/netlink v1.2.2-0.20210123213345-5cc92139ae3e/go.mod h1:bacnNlfhqHqqLo4WsYeXSqfyXkInQ9JneWI68v1KwSU= +github.com/mdlayher/netlink v1.3.0/go.mod h1:xK/BssKuwcRXHrtN04UBkwQ6dY9VviGGuriDdoPSWys= +github.com/mdlayher/netlink v1.4.0 
h1:n3ARR+Fm0dDv37dj5wSWZXDKcy+U0zwcXS3zKMnSiT0= +github.com/mdlayher/netlink v1.4.0/go.mod h1:dRJi5IABcZpBD2A3D0Mv/AiX8I9uDEu5oGkAVrekmf8= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/shirou/w32 v0.0.0-20160930032740-bb4de0191aa4 h1:udFKJ0aHUL60LboW/A+DfgoHVedieIzIXE8uylPue0U= +github.com/shirou/w32 v0.0.0-20160930032740-bb4de0191aa4/go.mod h1:qsXQc7+bwAM3Q1u/4XEfrquwF8Lw7D7y5cD8CuHnfIc= +github.com/sirupsen/logrus v1.8.1/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= +github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/vishvananda/netlink v1.1.0 h1:1iyaYNBLmP6L0220aDnYQpo1QEV4t4hJ+xEEhhJH8j0= +github.com/vishvananda/netlink v1.1.0/go.mod h1:cTgwzPIzzgDAYoQrMm0EdrjRUBkTqKYppBueQtXaqoE= +github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df h1:OviZH7qLw/7ZovXvuNyL3XQl8UFofeikI1NW1Gypu7k= +github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df/go.mod h1:JP3t17pCcGlemwknint6hfoeCVQrEMVwxRLRjXpq+BU= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net 
v0.0.0-20191007182048-72f939374954/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20201010224723-4f7140c49acb/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.0.0-20201216054612-986b41b23924/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20201224014010-6772e930b67b/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20210119194325-5f4716e94777 h1:003p0dJM77cxMSyCPFphvZf/Y5/NXf5fzg6ufd1/Oew= +golang.org/x/net v0.0.0-20210119194325-5f4716e94777/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190411185658-b44545bcd369/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190606203320-7fc4e5ec1444/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190826190057-c7b8b68b1456/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191008105621-543471e840be/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201009025420-dfb3f7c4e634/go.mod 
h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201118182958-a01c418693c7/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201218084310-7d0127a74742/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210110051926-789bb1bd4061/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210123111255-9b0068b26619/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210216163648-f7da38b97c65/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210906170528-6f6e22806c34/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20210921065528-437939a70204 h1:JJhkWtBuTQKyz2bd5WG9H8iUsJRU3En/KRfN8B2RnDs= +golang.org/x/sys v0.0.0-20210921065528-437939a70204/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= diff --git a/code/ebpf-xdp-tc/main.go b/code/ebpf-xdp-tc/main.go new file mode 100644 index 0000000..df016fe --- /dev/null +++ b/code/ebpf-xdp-tc/main.go @@ -0,0 +1,190 
@@ +package main + +import ( + "bytes" + "context" + _ "embed" + "errors" + "fmt" + "log" + "net/http" + "os" + "os/signal" + + manager "github.com/DataDog/ebpf-manager" + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/features" +) + +var ( + //go:embed ebpf/bin/probe.o + ebpfTCProgram []byte + + xdpMonitorPerf = &manager.Probe{ + ProbeIdentificationPair: manager.ProbeIdentificationPair{ + EBPFSection: "xdp/perf", + EBPFFuncName: "xdp_ingress_perf", + }, + Ifname: "eth0", + NetworkDirection: manager.Ingress, + } + + xdpMonitorRing = &manager.Probe{ + ProbeIdentificationPair: manager.ProbeIdentificationPair{ + EBPFSection: "xdp/ring", + EBPFFuncName: "xdp_ingress_ring", + }, + Ifname: "eth0", + NetworkDirection: manager.Ingress, + } + + mgr = &manager.Manager{ + Probes: []*manager.Probe{ + { + ProbeIdentificationPair: manager.ProbeIdentificationPair{ + EBPFSection: "classifier/egress", + EBPFFuncName: "egress", + }, + Ifname: "eth0", + NetworkDirection: manager.Egress, + }, + { + ProbeIdentificationPair: manager.ProbeIdentificationPair{ + EBPFSection: "classifier/ingress", + EBPFFuncName: "ingress", + }, + Ifname: "eth0", + NetworkDirection: manager.Ingress, + }, + }, + } +) + +type ( + packetMonitorMode uint8 + packetReader interface { + Read() (*Packet, error) + Close() error + } +) + +const ( + packetMonitorModeRing packetMonitorMode = iota + packetMonitorModePerfEvent +) + +func main() { + var monitorMode packetMonitorMode + ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt, os.Kill) + defer cancel() + + mgrOpts := manager.Options{ + ExcludedFunctions: nil, + RLimit: nil, + } + + if err := features.HaveMapType(ebpf.RingBuf); err != nil { + if errors.Is(err, ebpf.ErrNotSupported) { + log.Println("Falling back to perf event reader") + mgr.Probes = append(mgr.Probes, xdpMonitorPerf) + monitorMode = packetMonitorModePerfEvent + mgrOpts.ExcludedFunctions = append(mgrOpts.ExcludedFunctions, xdpMonitorRing.EBPFFuncName) + } else { + 
log.Fatalf("God knows what happened: %v\n", err) + } + } else { + log.Println("Using fancy new ringbuf reader") + mgr.Probes = append(mgr.Probes, xdpMonitorRing) + monitorMode = packetMonitorModeRing + mgrOpts.ExcludedFunctions = append(mgrOpts.ExcludedFunctions, xdpMonitorPerf.EBPFFuncName) + } + + if err := mgr.InitWithOptions(bytes.NewReader(ebpfTCProgram), mgrOpts); err != nil { + log.Fatalf("Failed to init manager: %v", err) + } + + if err := mgr.Start(); err != nil { + log.Fatalf("Failed to start manager: %v", err) + } + + runHTTPServer() + + var reader packetReader + switch monitorMode { + case packetMonitorModeRing: + if r, err := createRingBufReader(mgr); err != nil { + log.Fatalf("Failed to create rinbuf reader: %v\n", err) + } else { + reader = r + } + case packetMonitorModePerfEvent: + if r, err := createPerfEventReader(mgr); err != nil { + log.Fatalf("Failed to create perf_event reader: %v\n", err) + } else { + reader = r + } + } + + go logEventsFromReader(ctx, reader) + + <-ctx.Done() + + if err := mgr.Stop(manager.CleanAll); err != nil { + log.Fatalf("Failed to stop manager: %v", err) + } +} + +func logEventsFromReader(ctx context.Context, reader packetReader) { + log.Println("Waiting for received packets") + defer func() { + if err := reader.Close(); err != nil { + log.Fatalf("Failed to close reader: %v\n", err) + } + }() + for ctx.Err() == nil { + if pkt, err := reader.Read(); err != nil { + log.Printf("Error occurred while reading packet: %v\n", err) + } else { + log.Println(pkt) + } + } +} + +func createRingBufReader(mgr *manager.Manager) (packetReader, error) { + if m, present, err := mgr.GetMap("ring_observed_packets"); err != nil { + return nil, err + } else if !present { + return nil, fmt.Errorf("ring_observed_packets map not loaded") + } else { + return NewRingBufReader(m) + } + +} + +func createPerfEventReader(mgr *manager.Manager) (packetReader, error) { + if m, present, err := mgr.GetMap("perf_observed_packets"); err != nil { + return 
nil, err + } else if !present { + return nil, errors.New("perf_observed_packets map not loaded") + } else { + return NewPerfEventReader(m, 8) + } +} + +func runHTTPServer() { + log.Println("Listening on: 0.0.0.0:80") + go func() { + err := http.ListenAndServe("0.0.0.0:80", http.HandlerFunc(func(writer http.ResponseWriter, request *http.Request) { + log.Println("Handling request") + writer.WriteHeader(200) + _, _ = writer.Write([]byte("Hello, world!")) + })) + + if err != nil { + if errors.Is(err, http.ErrServerClosed) { + return + } + log.Printf("Error serving HTTP: %v", err) + } + }() +} diff --git a/code/ebpf-xdp-tc/perf_event.go b/code/ebpf-xdp-tc/perf_event.go new file mode 100644 index 0000000..d248e1b --- /dev/null +++ b/code/ebpf-xdp-tc/perf_event.go @@ -0,0 +1,39 @@ +package main + +import ( + "bytes" + "encoding/binary" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/perf" +) + +func NewPerfEventReader(m *ebpf.Map, perCPUBufferSize int) (reader *PerfEventReader, err error) { + reader = new(PerfEventReader) + if reader.reader, err = perf.NewReader(m, perCPUBufferSize); err != nil { + return nil, err + } else { + return reader, nil + } +} + +type PerfEventReader struct { + reader *perf.Reader +} + +func (r *PerfEventReader) Read() (*Packet, error) { + var pkt observedPacket + if rec, err := r.reader.Read(); err != nil { + return nil, err + } else { + if err = binary.Read(bytes.NewReader(rec.RawSample), binary.LittleEndian, &pkt); err != nil { + return nil, err + } else { + return pkt.ToPacket(), nil + } + } +} + +func (r *PerfEventReader) Close() error { + return r.reader.Close() +} diff --git a/code/ebpf-xdp-tc/ringbuf.go b/code/ebpf-xdp-tc/ringbuf.go new file mode 100644 index 0000000..49edd47 --- /dev/null +++ b/code/ebpf-xdp-tc/ringbuf.go @@ -0,0 +1,39 @@ +package main + +import ( + "bytes" + "encoding/binary" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/ringbuf" +) + +func NewRingBufReader(m *ebpf.Map) (reader *RingBufReader, err error) 
{ + reader = new(RingBufReader) + if reader.reader, err = ringbuf.NewReader(m); err != nil { + return nil, err + } else { + return reader, nil + } +} + +type RingBufReader struct { + reader *ringbuf.Reader +} + +func (r *RingBufReader) Read() (*Packet, error) { + var pkt observedPacket + if rec, err := r.reader.Read(); err != nil { + return nil, err + } else { + if err = binary.Read(bytes.NewReader(rec.RawSample), binary.LittleEndian, &pkt); err != nil { + return nil, err + } else { + return pkt.ToPacket(), nil + } + } +} + +func (r *RingBufReader) Close() error { + return r.reader.Close() +} diff --git a/code/ebpf-xdp-tc/types.go b/code/ebpf-xdp-tc/types.go new file mode 100644 index 0000000..026f4e4 --- /dev/null +++ b/code/ebpf-xdp-tc/types.go @@ -0,0 +1,58 @@ +package main + +import ( + "encoding/binary" + "net" +) + +type Protocol uint8 + +const ( + ProtocolTCP Protocol = iota + ProtocolUDP +) + +type Packet struct { + SourceIP net.IP + DestIP net.IP + SourcePort uint16 + DestPort uint16 + Transport Protocol +} + +type observedPacket struct { + SourceIP uint32 + DestIP uint32 + SourcePort uint16 + DestPort uint16 + Transport Protocol +} + +func (p *observedPacket) ToPacket() *Packet { + return &Packet{ + SourceIP: int2ip(p.SourceIP), + DestIP: int2ip(p.DestIP), + SourcePort: p.SourcePort, + DestPort: p.DestPort, + Transport: p.Transport, + } +} + +func int2ip(nn uint32) net.IP { + ip := make(net.IP, net.IPv4len) + binary.LittleEndian.PutUint32(ip, nn) + return ip +} + +func ip2int(ip net.IP) uint32 { + b := make([]byte, net.IPv4len) + copy(b, ip.To4()) + reverse(b) + return binary.BigEndian.Uint32(b) +} + +func reverse(input []byte) { + for i := 0; i < len(input)/2; i++ { + input[i], input[len(input)-1-i] = input[len(input)-1-i], input[i] + } +} diff --git a/code/libvirt-podman-network-mesh/containers.xml b/code/libvirt-podman-network-mesh/containers.xml index 03ff291..4c2a884 100644 --- a/code/libvirt-podman-network-mesh/containers.xml +++ 
b/code/libvirt-podman-network-mesh/containers.xml @@ -2,7 +2,7 @@ containers 929b7b7d-bd82-452d-96b7-12f0cf1a4b17 - + diff --git a/config.toml b/config.toml index 5571884..49bb306 100644 --- a/config.toml +++ b/config.toml @@ -13,6 +13,7 @@ author = "Peter Kurfer" style= "auto" copyCodeButton = true rssAsSocialIcon = true +custom_css = ["css/custom.css"] [[menu.main]] identifier = "about" diff --git a/content/post/go-ebpf-tc.md b/content/post/go-ebpf-tc.md new file mode 100644 index 0000000..6b7288f --- /dev/null +++ b/content/post/go-ebpf-tc.md @@ -0,0 +1,334 @@ ++++ +author = "Peter Kurfer" +title = "eBPF traffic control a.k.a tc with Go" +date = "2022-02-24" +description = "Build your own DNAT with eBPF traffic control and Go" +tags = [ + "golang", + "ebpf", + "tc" +] ++++ + +While working on my 'pet project' [INetMock](https://gitlab.com/inetmock/inetmock) I realized early that it would be amazing to be able to monitor traffic that 'flows by' to see not only traffic I could actually handle but also to get some information about which high ports were requested. +Unfortunately, back then I was only aware of PCAP capturing and it seemed rather complicated to implement what I wanted on top of PCAPs - or at least rather computationally intensive. + +## eBPF to the rescue + +At work I'm part of the platform team maintaining a bunch of Kubernetes clusters and this is where I first stumbled upon eBPF when we migrated from Calico to Cilium. +So you might ask: what is eBPF?! +Very short answer: [RTFM](https://ebpf.io/what-is-ebpf) :nerd: + +eBPF is the abbreviation for 'Extended Berkeley Packet Filter'. 
+While originally intended to be used for network traffic analysis it got some major upgrades over the past years so that you can now: + +* monitor/control syscalls like [Falco](https://sysdig.com/opensource/falco/) does +* implement a very mighty [keylogger](https://arighi.blogspot.com/2018/12/linux-easy-keylogger-with-ebpf.html) +* [build DDoS attack prevention systems](https://blog.cloudflare.com/l4drop-xdp-ebpf-based-ddos-mitigations/) +* [L4 load balancers](https://blog.cloudflare.com/unimog-cloudflares-edge-load-balancer/) +* [Making a firewall using eBPFs and cgroups](https://nfil.dev/coding/security/ebpf-firewall-with-cgroups/) +* ... + +After some initial research I figured using XDP (eXpress Data Path) would be best to implement the kind of monitoring I had in mind. +XDP is also used by Cloudflare for their DDoS prevention system and for their load balancer hence what's good enough for them could be good enough for me (irony intended). +So I was looking for some 'examples' a.k.a. code I can copy and adapt to get started quickly. +I stumbled upon a project that monitored the API server traffic in a Kubernetes cluster and this was a lot like what I wanted to do! +At least it looked like it at this point. +It took me a few evenings to get my prototype working and I read a lot more about the topic whenever possible. + +Coincidentally we had a firewall/network misconfiguration issue at work at the same time when I tried to get my first prototype live in the staging environment and it hit me: + why not use eBPF/XDP not only for monitoring but also to get rid of `iptables` and build my own 'firewall'! :heart_eyes: +Don't misunderstand me, `iptables` is a perfectly fine and perfectly working solution and it served me well! +But it's always a bit annoying when I start the server and forget to run the ominous `iptables` command I originally copied from the _INetSim_ docs that takes care of the DNATing and suddenly nothing works. 
+ +## 1st approach: XDP + +To be honest: I used and configured both SNAT and DNAT often in the past and I had a basic idea how this all works but...this 'tiny bit of knowledge' rapidly crumbled into nothing as soon as I tried to build it myself. + +The idea is simple. +In general the communication of a client with an external server looks like this: + +{{< mermaid >}} +%%{init: {'theme': 'neutral'}}%% +flowchart LR + client([Client]) --> |IP packet|router{{Router}} + router --> |IP packet|srv>Server] + srv -->|response|router + router-->|response|client +{{< /mermaid>}} + +(Ignoring everything that is actually required like DNS, TCP handshakes,...) + +What I wanted to achieve looks like this: + +{{< mermaid >}} +%%{init: {'theme': 'neutral'}}%% +flowchart LR + client([Client]) --> |IP packet|router{{Router}} + router --> |modified IP packet|router + router-->|faked response|client +{{< /mermaid>}} + +I already knew it's possible and even intended to re-route packets with XDP because that's exactly what Cloudflare is doing with Unimog. +I tried to find some more examples on how to redirect traffic and at this point I realized how much I **did/do** not know: + +* Do I need to update any checksums? +* If I only modify the IP header, do I also need to fix TCP/UDP checksums? +* Is modifying the IP enough or do I also need to modify the destination MAC address of the ethernet packet? + +And I bet there's even more I still haven't even thought about... + +It's kind of hard to admit but it actually took me a lot of time to realize why my XDP approach wasn't working and will not ever work in the setup I have in mind. +One disadvantage of diving directly into the code and focusing on examples is: you've no idea what you're actually doing! +XDP is so great because depending on your hardware it is even possible to execute the program directly on the NIC but one huge disadvantage (for me) is that it only captures **_ingress_** traffic! 
+ +So I perfectly screwed up because my whole traffic was within a single network segment and looked like the following diagram. +I'm now using a sequence diagram to make things a bit more explicit. + +Assuming the following network configuration: + +{{< table "center" >}} +| Actor | IP | +| ------ | -------------- | +| Client | 192.168.178.51 | +| Router | 192.168.178.1 | +{{< /table >}} + +{{< mermaid >}} +%%{init: {'theme': 'neutral'}}%% +sequenceDiagram + Note over Client,Router: .51:34578 → 1.2.3.4:80 + Client->>Router: TCP SYN + Note over Client,Router: .51:34578 → 192.168.178.1:80 + Router->>Router: Redirect to 192.168.178.1:80 + Note over Client,Router: .51:34578 ← .1:80 + Router->>Client: SYN-ACK +{{< /mermaid>}} + +But the `Client` didn't try to connect to `192.168.178.1` hence it won't accept the packet. +While this is kind of obvious it took me quite some time to get my head around this. +Not least because it's hard to observe this if you're using XDP because XDP forwarded/modified packets are not included in a PCAP. +Fortunately there's [`xdp-dump`](https://github.com/xdp-project/xdp-tools) to get a better understanding of what's going on. + +Okay, so I've to maintain a mapping of the original 4-tuple to my modified one to be able to restore the original source after the network stack handled the packet. +eBPF has a few different map types to store data between program invocations (I'll cover that later) so this wasn't a problem. +But now it hit me, with XDP I cannot manipulate egress (outgoing) packets. +So XDP's a dead end for this use case (although perfectly fine for the monitoring!). + +## What if XDP is not enough? + +Another round of 'research' revealed there are already even more points in the network stack where I could attach eBPF programs. +Every trace point has slightly different options (and therefore makes sense to be there). 
+ +There are: + +* `BPF_PROG_TYPE_XDP`: earliest possible point to get your hands on ingress traffic, can be used to monitor (and pass) incoming traffic, drop or redirect traffic +* `BPF_PROG_TYPE_SOCKET_FILTER`: drop or trim packets at socket level (much later than with XDP) +* `BPF_PROG_TYPE_CGROUP_SOCK`: much like XDP but within network cgroups +* ... + +_A full list of program types can be found in [include/uapi/linux/bpf.h](https://github.com/torvalds/linux/blob/master/include/uapi/linux/bpf.h#L920) in the Linux kernel source. +A good introduction to the different types of eBPF programs can be found [here](https://blogs.oracle.com/linux/post/bpf-a-tour-of-program-types) (can't believe I'm linking an Oracle document)._ + +All of the aforementioned options have in common that they can be used to filter packets but what I needed was an option to _modify egress_ packets. +AFAIK the only available option for this is `tc` (a.k.a. traffic control) which is Linux' QoS system. +Even though you get a lot of 'high level' information about what it is, that it supports (e)BPF and that there's the `tc` CLI to interact with it - and also lots of examples of how to use the CLI - I could barely find a library/documentation for the API that doesn't require shelling out. +Finally I found DataDog's [ebpf-manager](https://github.com/DataDog/ebpf-manager/) which uses Cilium's [ebpf](https://github.com/cilium/ebpf) and [go-tc](https://github.com/florianl/go-tc) to attach eBPF programs to the `tc` subsystem. +Not only that, it also comes with a pretty handy manager layer to make working with eBPF a charm. + +## `tc` in action + +From now on I'd like to dig a bit into the source code - all sources can be found [in the repo](https://github.com/baez90/baez90.github.io) under `code/ebpf-xdp-tc`. 
+The setup for experimenting looks like this: + +{{< mermaid >}} +%%{init: {'theme': 'neutral'}}%% +flowchart LR + subgraph libvirtNet [isolated Libvirt network] + vm["Windows VM"] --> container["Podman container"] + end +{{< /mermaid>}} + +and is based on my post on how to [join a Podman container to a Libvirt network]({{< relref "libvirt-podman-network-mesh.md" >}}). + +The resulting workflow should look like this: + +{{< mermaid >}} +%%{init: {'theme': 'neutral'}}%% +sequenceDiagram + Client->>Ingress: IP packet + Ingress->>Ingress: Rewrite packet and store original destination + Ingress->>Server: Forward packet + Server->>Egress: Intercept response packet + Egress->>Egress: Restore original destination as source + Egress->>Client: Forward packet +{{< /mermaid>}} + +In short: the client sends an IP packet to an IP outside of the local network hence it will be sent to the gateway (which happens to be my Podman container). +The eBPF program attached to the `tc` ingress hook takes the incoming packet, rewrites its destination to the local IP of the Podman container and passes the modified packet on. +In my experiment I'm using a simple HTTP server to respond to every HTTP request with a plain text message and a status code `200`. +The network stack processes the packet and replies e.g. with a `SYN-ACK` to the client's IP address. +The eBPF program attached to the `tc` egress hook intercepts the response, restores the original destination IP as the source IP based on the client's IP address and TCP/UDP port and forwards the packet. + +### Ingress traffic + +Every eBPF program needs a section identifier and has to fulfill some constraints to pass the verifier. +Currently this means for instance that loops are not allowed to ensure the program has a guaranteed end. 
+ +The simplest ingress hook would look like this: + +```c +#include +#include + +SEC("classifier/ingress") +int ingress(struct __sk_buff *skb) { + return TC_ACT_SHOT; +} +``` + +This program would simply drop every incoming packet but it's a valid program. +The parameter - in this case `struct __sk_buff *skb` - depends on the trace point. +The `skb` is the most powerful one and in theory it's even possible to extract HTTP parameters out of it. +See for example [this article](http://vger.kernel.org/~davem/skb_data.html) for further details. +XDP programs receive a different parameter that is a bit more generic but also less expensive to initialize. +Either way, you can easily (and rather cheaply) extract the different parts of the IP packet with just a few lines of code. + +A quick reminder of what an IP packet looks like: + +{{< figure + src="https://upload.wikimedia.org/wikipedia/commons/3/3b/UDP_encapsulation.svg" + link="https://commons.wikimedia.org/wiki/File:UDP_encapsulation.svg" + target="__blank" + caption="en:User:Cburnett original work, colorization by en:User:Kbrose, [CC BY-SA 3.0](http://creativecommons.org/licenses/by-sa/3.0/), via Wikimedia Commons" + title="UDP encapsulation" +>}} + +In my simplified case I assume the `Frame header` will be a `struct ethhdr`, followed by the `struct iphdr` and then either a `struct udphdr` or a `struct tcphdr`. + +To satisfy the verifier you've to validate after every cast that you haven't already reached the end of the current `skb` to ensure memory safety, which is both a bit annoying and quite reassuring because memory issues are avoided right away. 
+So assuming we just want to 'print' the source and destination address of every packet reaching our ingress hook we would do the following: + + +```c +#include +#include +#include +#include +#include + +SEC("classifier/ingress") +int ingress(struct __sk_buff *skb) { + void *data = (void *) (long) skb->data; + void *data_end = (void *) (long) skb->data_end; + + struct ethhdr *eth = data; + + // apparently not an ethernet packet + if ((void *) eth + sizeof(*eth) > data_end) { + return TC_ACT_OK; + } + + // ignore packet that are neither IPv4 nor IPv6 + if(eth->h_proto != ETH_P_IP && eth->h_proto != ETH_P_IPV6) { + return TC_ACT_OK; + } + + struct iphdr *iph = data + sizeof(*eth); + if ((void *) iph + sizeof(*iph) > data_end) { + return TC_ACT_OK; + } + + bpf_printk("Packet from %d to %d\n", iph->saddr, iph->daddr); + + return TC_ACT_OK; +} +``` + +That's already a bit more code, isn't it? +So we start by capturing the beginning and the end of the packet. +As mentioned previously it's possible to 'extract' the individual parts of the packet just by casting the right offsets. +Of course some additional sanity checks are necessary to make sure we don't misinterpret anything. +For instance if the current packet is not an IP packet but probably an ARP packet we just let it pass. +What's worth mentioning is the `bpf_printk` macro because it's particularly useful - it's slower than other options but it's pretty easy to use for debugging. +To get the message we're sending with it you can simply do + +```sh +sudo cat /sys/kernel/debug/tracing/trace_pipe +``` + +and you're good to go! + +#### eBPF maps + +So remembering the sequence diagram above we are already almost finished regarding the parsing of the required information but how do we store the gathered information? +eBPF comes with a bunch of different maps. 
+All types can be found in [include/uapi/linux/bpf.h](https://github.com/torvalds/linux/blob/master/include/uapi/linux/bpf.h#L878) or with more details [here](https://prototype-kernel.readthedocs.io/en/latest/bpf/ebpf_maps_types.html). + +Some of them are equivalent to ordinary data structures most developers are used to, like: + +* `BPF_MAP_TYPE_ARRAY` behaves like an ordinary array +* `BPF_MAP_TYPE_HASH` behaves like an ordinary map/dictionary + +but others like `BPF_MAP_TYPE_PERF_EVENT_ARRAY` or `BPF_MAP_TYPE_RINGBUF` are rather special. +For now we're focusing on `BPF_MAP_TYPE_HASH` because we can use it to store the `orig-src` → `orig-dest` mapping. +To store data in a map and load it later on, eBPF exposes some [bpf-helpers](https://www.man7.org/linux/man-pages/man7/bpf-helpers.7.html): + +* `long bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags)` to store data in a map +* `void *bpf_map_lookup_elem(struct bpf_map *map, const void *key)` to load data + +An extension of the previous program could look like this: + + +```c +/* +... includes +*/ + +#define IP_FRAGMENTED 65343 + +// source IP and port +struct two_tuple { + __u32 ip; + __u16 port; + __u16 _pad; // required to pad the size of the struct to a multiple of 4 +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, sizeof(struct two_tuple)); + __type(value, sizeof(struct two_tuple)); + __uint(max_entries, 1024); +} conn_track SEC(".maps"); + +SEC("classifier/ingress") +int ingress(struct __sk_buff *skb) { + // ... 
previous logic + + // do not support fragmented packets as L4 headers may be missing + if (iph->frag_off & IP_FRAGMENTED) { + return TC_ACT_OK; + } + + if (iph->protocol != IPPROTO_TCP) { + return TC_ACT_OK; + } + + struct tcphdr *tcp = (void *) iph + sizeof(*iph); + if ((void *) tcp + sizeof(*tcp) > data_end) { + return TC_ACT_SHOT; + } + + struct two_tuple src = { + .ip = iph->saddr, + .port = tcp->source + }; + + struct two_tuple dst = { + .ip = iph->daddr, + .port = tcp->dest + }; + + bpf_map_update_elem(&conn_track, &src, &dst, 0); +} +``` \ No newline at end of file diff --git a/layouts/partials/header-extra.html b/layouts/partials/header-extra.html index 0e22c9a..e4a4c1e 100644 --- a/layouts/partials/header-extra.html +++ b/layouts/partials/header-extra.html @@ -1 +1,4 @@ - \ No newline at end of file + +{{ range .Site.Params.custom_css -}} + +{{- end }} \ No newline at end of file diff --git a/layouts/shortcodes/table.html b/layouts/shortcodes/table.html new file mode 100644 index 0000000..c1991d9 --- /dev/null +++ b/layouts/shortcodes/table.html @@ -0,0 +1,6 @@ +{{ $htmlTable := .Inner | markdownify }} +{{ $class := .Get 0 }} +{{ $old := "" }} +{{ $new := printf "
" $class }} +{{ $htmlTable := replace $htmlTable $old $new }} +{{ $htmlTable | safeHTML }} \ No newline at end of file diff --git a/static/css/custom.css b/static/css/custom.css new file mode 100644 index 0000000..d47c800 --- /dev/null +++ b/static/css/custom.css @@ -0,0 +1,6 @@ +.center { + margin-left: auto; + margin-right: auto; + margin-top: 1.5em; + margin-bottom: 1.5em; +} \ No newline at end of file