feat: prepare ephemeral workers

Peter committed 2023-01-21 18:25:18 +01:00
parent 4adf7139c6
commit 7697d2be65
Signed by: prskr
GPG key ID: C1DB5D2E8DB512F9

17 changed files with 701 additions and 51 deletions

.gitignore

@@ -5,3 +5,4 @@
.terraform/
.vaultpw
.vscode/
+.ssh/


@@ -0,0 +1,51 @@
ssh_authorized_keys:
  - ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIKfHZaI0F5GjAcrM8hjWqwMfULDkAZ2TOIBTQtRocg1F
hostname: ${host}
write_files:
  - encoding: ""
    content: |-
      #!/bin/bash
      write_log () {
        local message="$1"
        logger -t "run-cmd" "$message"
        echo "$message"
      }
      write_log "Getting the service using eth1..."
      ETH1=$(connmanctl services | awk '{ print $3 }' | while read -r s1; do connmanctl services $s1 | grep -q "eth1" && echo "$s1"; done)
      write_log "eth1 is bound to: $ETH1"
      write_log "Setting up manual net config..."
      connmanctl config "$ETH1" --ipv4 manual ${node_ip} 255.255.254.0 172.23.2.10
      connmanctl config "$ETH1" --domains k8s.local
      connmanctl config "$ETH1" --ipv6 off
      write_log "Restarting connman..."
      service connman restart
      write_log "$(connmanctl services $ETH1)"
      write_log "Network setup done."
    owner: root:root
    path: /etc/run-cmd.sh
    permissions: '0755'
run_cmd:
  - "/etc/run-cmd.sh"
k3os:
  data_sources:
    - hetzner
    - cdrom
  modules:
    - kvm
    - nvme
  dns_nameservers:
    - 1.1.1.1
    - 1.0.0.1
  ntp_servers:
    - ptbtime1.ptb.de
    - ptbtime2.ptb.de
  server_url: https://172.23.2.10:6443
  password: rancher
  token: "${k3s_token}"
  labels:
    region: ${datacenter}
  k3s_args:
    - agent
    - --node-ip=${node_ip}
    - --kubelet-arg=cloud-provider=external
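
Note: the ${host}, ${node_ip}, ${k3s_token} and ${datacenter} placeholders above are Terraform template variables; they are filled in by the templatefile() call in the provisioning resource further down. A rough way to inspect the rendered cloud-config locally (paths and values here are placeholders, not part of this commit):

# render the template with dummy values via terraform console (hypothetical example)
echo 'templatefile("configs/k3os-config.yml", { host = "worker1-gen2", k3s_token = "dummy", datacenter = "hel1-dc2", node_ip = "172.23.2.41" })' | terraform console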


@@ -40,7 +40,6 @@ resource "hcloud_server" "nodes" {
  }
}

resource "hcloud_server_network" "k8s_internal" {
  for_each = var.vms


@@ -0,0 +1,86 @@
resource "tls_private_key" "provisioning" {
  algorithm = "RSA"
  rsa_bits = 4096
}

resource "hcloud_ssh_key" "provisioning_key" {
  name = "Provisioning key for hcloud cluster"
  public_key = tls_private_key.provisioning.public_key_openssh
}

resource "local_file" "provisioning_key" {
  filename = "${path.module}/.ssh/provisioning_private_key.pem"
  content = tls_private_key.provisioning.private_key_pem
  directory_permission = "0700"
  file_permission = "0400"
}

resource "local_file" "provisioning_key_pub" {
  filename = "${path.module}/.ssh/provisioning_key.pub"
  content = tls_private_key.provisioning.public_key_openssh
  directory_permission = "0700"
  file_permission = "0440"
}

resource "hcloud_server" "machine" {
  for_each = var.k3os_workers

  name = each.key
  server_type = each.value.server_type
  datacenter = "hel1-dc2"
  image = "ubuntu-22.04"
  backups = each.value.backups

  ssh_keys = [
    hcloud_ssh_key.provisioning_key.id,
    hcloud_ssh_key.default.id
  ]

  labels = {
    "node_type" = each.value.node_type
    "cluster" = "icb4dc0.de"
  }

  network {
    network_id = hcloud_network.k8s_net.id
    ip = each.value.private_ip
  }

  public_net {
    ipv4_enabled = true
    ipv6_enabled = false
  }

  # boot into rescue OS
  rescue = "linux64"

  connection {
    host = self.ipv4_address
    private_key = tls_private_key.provisioning.private_key_pem
    timeout = "2m"
  }

  provisioner "file" {
    content = templatefile(
      "${path.module}/configs/k3os-config.yml",
      {
        "host" = "${each.key}"
        "k3s_token" = "${var.k3s_token}"
        "datacenter" = "hel1-dc2"
        "node_ip" = "${each.value.private_ip}"
      }
    )
    destination = "/tmp/k3os-config.yaml"
  }

  provisioner "remote-exec" {
    inline = [
      "set -ex",
      "apt-get install -y grub-pc",
      "cat /tmp/k3os-config.yaml",
      "curl -fsSL --retry-delay 1 --retry 60 --retry-connrefused --retry-max-time 60 --connect-timeout 20 https://raw.githubusercontent.com/rancher/k3os/master/install.sh | bash -s -- --config /tmp/k3os-config.yaml /dev/sda https://github.com/rancher/k3os/releases/download/v0.21.5-k3s2r1/k3os-amd64.iso",
      "reboot"
    ]
    on_failure = continue
  }
}
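
Note: once the remote-exec provisioner finishes and the server reboots from the rescue system into k3os, the agent should join the cluster carrying the region label from the cloud-config. A quick sanity check, assuming kubeconfig access (not part of this commit):

# list the freshly provisioned workers by their region label
kubectl get nodes -l region=hel1-dc2 -o wide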


@@ -2,6 +2,10 @@ variable "hcloud_token" {
  sensitive = true
}

+variable "k3s_token" {
+  sensitive = true
+}
+
variable "vms" {
  type = map(object({
    node_type = string
@@ -11,6 +15,15 @@ variable "vms" {
  }))
}

+variable "k3os_workers" {
+  type = map(object({
+    node_type = string
+    server_type = string
+    backups = bool
+    private_ip = string
+  }))
+}
+
variable "ci_workers" {
  type = map(object({
    node_type = string


@@ -1,3 +1,12 @@
+k3os_workers = {
+  "worker1-gen2" = {
+    backups = false
+    node_type = "worker"
+    server_type = "cpx11"
+    private_ip = "172.23.2.41"
+  }
+}
+
vms = {
  "cp1" = {
    node_type = "control-plane"


@@ -4,7 +4,7 @@
    # - role: gateway-api
    - role: prometheus
    - role: postgres
-   - role: csi
+   - role: hcloud
    - role: minio
    - role: gitea
    - role: concourse


@@ -1,8 +1,8 @@
all:
  vars:
    ansible_user: root
-    extra_server_args: "--node-taint=node-type=master:NoSchedule --tls-san='2a01:4f9:c012:7d4b::1' --tls-san='k8s.icb4dc0.de' --tls-san='127.0.0.1'"
-    extra_agent_args: ""
+    extra_server_args: "--node-taint=node-type=master:NoSchedule --tls-san='2a01:4f9:c012:7d4b::1' --tls-san='k8s.icb4dc0.de' --tls-san='127.0.0.1' --disable-cloud-controller --disable servicelb --kubelet-arg='--cloud-provider=external'"
+    extra_agent_args: "--kubelet-arg='--cloud-provider=external'"
    ansible_ssh_common_args: '-o StrictHostKeyChecking=no'
    systemd_dir: /etc/systemd/system
    master_ip: "172.23.2.10"


@@ -1,29 +0,0 @@
---
- name: Create Hcloud token secret
  kubernetes.core.k8s:
    state: present
    definition:
      apiVersion: v1
      kind: Secret
      metadata:
        name: hcloud-csi
        namespace: kube-system
      data:
        token: "{{ HcloudToken | b64encode }}"

- name: Create temporary file
  ansible.builtin.tempfile:
    state: file
    suffix: temp
  register: csi_manifest_tmp

- name: Download CSI manifest
  ansible.builtin.get_url:
    url: https://raw.githubusercontent.com/hetznercloud/csi-driver/v1.6.0/deploy/kubernetes/hcloud-csi.yml
    dest: "{{ csi_manifest_tmp.path }}"
    mode: '0664'

- name: Deploy CSI driver
  kubernetes.core.k8s:
    state: present
    src: "{{ csi_manifest_tmp.path }}"


@@ -0,0 +1,44 @@
---
- name: Create Hcloud token secret
  kubernetes.core.k8s:
    state: present
    definition:
      apiVersion: v1
      kind: Secret
      metadata:
        name: hcloud
        namespace: kube-system
      data:
        token: "{{ HcloudToken | b64encode }}"
        network: "{{ 'k8s-net' | b64encode }}"

- name: Deploy CSI driver
  kubernetes.core.k8s:
    state: present
    definition: "{{ item }}"
  loop: "{{ lookup('ansible.builtin.template', 'hcloud-csi.yml.j2') | ansible.builtin.from_yaml_all | list }}"

- name: Deploy cloud-controller-manager
  kubernetes.core.k8s:
    state: present
    definition: "{{ item }}"
  loop: "{{ lookup('ansible.builtin.template', 'cloud-controller-manager.yml.j2') | ansible.builtin.from_yaml_all | list }}"

- name: Create CSI controller PodMonitor
  kubernetes.core.k8s:
    state: present
    definition:
      apiVersion: monitoring.coreos.com/v1
      kind: PodMonitor
      metadata:
        name: hcloud-csi-controller
        namespace: kube-system
        labels:
          prometheus: default
      spec:
        selector:
          matchLabels:
            app: hcloud-csi-controller
        podMetricsEndpoints:
          - port: metrics
            path: /
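
Note: both the cloud-controller-manager and the CSI driver read the hcloud Secret created by the first task (API token plus network name). A few hedged post-deploy checks, assuming kubectl access (not part of this commit; labels taken from the manifests below):

# confirm the secret decodes to the expected network name
kubectl -n kube-system get secret hcloud -o jsonpath='{.data.network}' | base64 -d
# confirm the controllers came up
kubectl -n kube-system get pods -l app=hcloud-cloud-controller-manager
kubectl -n kube-system get pods -l app=hcloud-csi-controller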


@@ -0,0 +1,84 @@
# NOTE: this release was tested against kubernetes v1.18.x
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: cloud-controller-manager
  namespace: kube-system
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: system:cloud-controller-manager
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: cluster-admin
subjects:
  - kind: ServiceAccount
    name: cloud-controller-manager
    namespace: kube-system
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: hcloud-cloud-controller-manager
  namespace: kube-system
spec:
  replicas: 1
  revisionHistoryLimit: 2
  selector:
    matchLabels:
      app: hcloud-cloud-controller-manager
  template:
    metadata:
      labels:
        app: hcloud-cloud-controller-manager
    spec:
      serviceAccountName: cloud-controller-manager
      dnsPolicy: Default
      tolerations:
        # this taint is set by all kubelets running `--cloud-provider=external`
        # so we should tolerate it to schedule the cloud controller manager
        - key: "node.cloudprovider.kubernetes.io/uninitialized"
          value: "true"
          effect: "NoSchedule"
        - key: "CriticalAddonsOnly"
          operator: "Exists"
        # cloud controller manager should be able to run on masters
        - key: "node-role.kubernetes.io/master"
          effect: NoSchedule
        - key: "node-role.kubernetes.io/control-plane"
          effect: NoSchedule
        - key: "node.kubernetes.io/not-ready"
          effect: "NoSchedule"
      containers:
        - image: hetznercloud/hcloud-cloud-controller-manager:v1.13.2
          name: hcloud-cloud-controller-manager
          command:
            - "/bin/hcloud-cloud-controller-manager"
            - "--cloud-provider=hcloud"
            - "--leader-elect=false"
            - "--allow-untagged-cloud"
            - "--allocate-node-cidrs=false"
          resources:
            requests:
              cpu: 100m
              memory: 50Mi
          env:
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: HCLOUD_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hcloud
                  key: token
            - name: HCLOUD_NETWORK
              valueFrom:
                secretKeyRef:
                  name: hcloud
                  key: network
      priorityClassName: system-cluster-critical


@@ -0,0 +1,392 @@
---
allowVolumeExpansion: true
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  annotations:
    storageclass.kubernetes.io/is-default-class: "true"
  name: hcloud-volumes
provisioner: csi.hetzner.cloud
volumeBindingMode: WaitForFirstConsumer
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: hcloud-csi-controller
  namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: hcloud-csi-controller
rules:
  - apiGroups:
      - ""
    resources:
      - persistentvolumes
    verbs:
      - get
      - list
      - watch
      - update
      - patch
  - apiGroups:
      - ""
    resources:
      - nodes
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - csi.storage.k8s.io
    resources:
      - csinodeinfos
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - storage.k8s.io
    resources:
      - csinodes
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - storage.k8s.io
    resources:
      - volumeattachments
    verbs:
      - get
      - list
      - watch
      - update
      - patch
  - apiGroups:
      - storage.k8s.io
    resources:
      - volumeattachments/status
    verbs:
      - patch
  - apiGroups:
      - ""
    resources:
      - secrets
    verbs:
      - get
      - list
  - apiGroups:
      - ""
    resources:
      - persistentvolumes
    verbs:
      - get
      - list
      - watch
      - create
      - delete
      - patch
  - apiGroups:
      - ""
    resources:
      - persistentvolumeclaims
      - persistentvolumeclaims/status
    verbs:
      - get
      - list
      - watch
      - update
      - patch
  - apiGroups:
      - storage.k8s.io
    resources:
      - storageclasses
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - ""
    resources:
      - events
    verbs:
      - list
      - watch
      - create
      - update
      - patch
  - apiGroups:
      - snapshot.storage.k8s.io
    resources:
      - volumesnapshots
    verbs:
      - get
      - list
  - apiGroups:
      - snapshot.storage.k8s.io
    resources:
      - volumesnapshotcontents
    verbs:
      - get
      - list
  - apiGroups:
      - ""
    resources:
      - pods
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - ""
    resources:
      - events
    verbs:
      - get
      - list
      - watch
      - create
      - update
      - patch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: hcloud-csi-controller
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: hcloud-csi-controller
subjects:
  - kind: ServiceAccount
    name: hcloud-csi-controller
    namespace: kube-system
---
apiVersion: v1
kind: Service
metadata:
  labels:
    app: hcloud-csi
  name: hcloud-csi-controller-metrics
  namespace: kube-system
spec:
  ports:
    - name: metrics
      port: 9189
      targetPort: metrics
  selector:
    app: hcloud-csi-controller
---
apiVersion: v1
kind: Service
metadata:
  labels:
    app: hcloud-csi
  name: hcloud-csi-node-metrics
  namespace: kube-system
spec:
  ports:
    - name: metrics
      port: 9189
      targetPort: metrics
  selector:
    app: hcloud-csi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: hcloud-csi-controller
  namespace: kube-system
spec:
  replicas: 1
  selector:
    matchLabels:
      app: hcloud-csi-controller
  template:
    metadata:
      labels:
        app: hcloud-csi-controller
    spec:
      containers:
        - image: k8s.gcr.io/sig-storage/csi-attacher:v3.2.1
          name: csi-attacher
          volumeMounts:
            - mountPath: /run/csi
              name: socket-dir
        - image: k8s.gcr.io/sig-storage/csi-resizer:v1.2.0
          name: csi-resizer
          volumeMounts:
            - mountPath: /run/csi
              name: socket-dir
        - args:
            - --feature-gates=Topology=true
            - --default-fstype=ext4
          image: k8s.gcr.io/sig-storage/csi-provisioner:v2.2.2
          name: csi-provisioner
          volumeMounts:
            - mountPath: /run/csi
              name: socket-dir
        - command:
            - /bin/hcloud-csi-driver-controller
          env:
            - name: CSI_ENDPOINT
              value: unix:///run/csi/socket
            - name: METRICS_ENDPOINT
              value: 0.0.0.0:9189
            - name: ENABLE_METRICS
              value: "true"
            - name: KUBE_NODE_NAME
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: spec.nodeName
            - name: HCLOUD_TOKEN
              valueFrom:
                secretKeyRef:
                  key: token
                  name: hcloud
          image: hetznercloud/hcloud-csi-driver:2.1.0
          imagePullPolicy: Always
          livenessProbe:
            failureThreshold: 5
            httpGet:
              path: /healthz
              port: healthz
            initialDelaySeconds: 10
            periodSeconds: 2
            timeoutSeconds: 3
          name: hcloud-csi-driver
          ports:
            - containerPort: 9189
              name: metrics
            - containerPort: 9808
              name: healthz
              protocol: TCP
          volumeMounts:
            - mountPath: /run/csi
              name: socket-dir
        - image: k8s.gcr.io/sig-storage/livenessprobe:v2.3.0
          imagePullPolicy: Always
          name: liveness-probe
          volumeMounts:
            - mountPath: /run/csi
              name: socket-dir
      serviceAccountName: hcloud-csi-controller
      volumes:
        - emptyDir: {}
          name: socket-dir
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  labels:
    app: hcloud-csi
  name: hcloud-csi-node
  namespace: kube-system
spec:
  selector:
    matchLabels:
      app: hcloud-csi
  template:
    metadata:
      labels:
        app: hcloud-csi
    spec:
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: instance.hetzner.cloud/is-root-server
                    operator: NotIn
                    values:
                      - "true"
      containers:
        - args:
            - --kubelet-registration-path=/var/lib/kubelet/plugins/csi.hetzner.cloud/socket
          image: k8s.gcr.io/sig-storage/csi-node-driver-registrar:v2.2.0
          name: csi-node-driver-registrar
          volumeMounts:
            - mountPath: /run/csi
              name: plugin-dir
            - mountPath: /registration
              name: registration-dir
        - command:
            - /bin/hcloud-csi-driver-node
          env:
            - name: CSI_ENDPOINT
              value: unix:///run/csi/socket
            - name: METRICS_ENDPOINT
              value: 0.0.0.0:9189
            - name: ENABLE_METRICS
              value: "true"
          image: hetznercloud/hcloud-csi-driver:2.1.0
          imagePullPolicy: Always
          livenessProbe:
            failureThreshold: 5
            httpGet:
              path: /healthz
              port: healthz
            initialDelaySeconds: 10
            periodSeconds: 2
            timeoutSeconds: 3
          name: hcloud-csi-driver
          ports:
            - containerPort: 9189
              name: metrics
            - containerPort: 9808
              name: healthz
              protocol: TCP
          securityContext:
            privileged: true
          volumeMounts:
            - mountPath: /var/lib/kubelet
              mountPropagation: Bidirectional
              name: kubelet-dir
            - mountPath: /run/csi
              name: plugin-dir
            - mountPath: /dev
              name: device-dir
        - image: k8s.gcr.io/sig-storage/livenessprobe:v2.3.0
          imagePullPolicy: Always
          name: liveness-probe
          volumeMounts:
            - mountPath: /run/csi
              name: plugin-dir
      tolerations:
        - effect: NoExecute
          operator: Exists
        - effect: NoSchedule
          operator: Exists
        - key: CriticalAddonsOnly
          operator: Exists
      volumes:
        - hostPath:
            path: /var/lib/kubelet
            type: Directory
          name: kubelet-dir
        - hostPath:
            path: /var/lib/kubelet/plugins/csi.hetzner.cloud/
            type: DirectoryOrCreate
          name: plugin-dir
        - hostPath:
            path: /var/lib/kubelet/plugins_registry/
            type: Directory
          name: registration-dir
        - hostPath:
            path: /dev
            type: Directory
          name: device-dir
---
apiVersion: storage.k8s.io/v1
kind: CSIDriver
metadata:
  name: csi.hetzner.cloud
spec:
  attachRequired: true
  podInfoOnMount: true
  volumeLifecycleModes:
    - Persistent
  fsGroupPolicy: File
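
Note: the StorageClass above is marked as the cluster default and uses WaitForFirstConsumer binding, so a claim stays Pending until a pod schedules onto a node. A minimal smoke test (hypothetical, not part of this commit):

# create a small PVC against the default hcloud-volumes class
kubectl apply -f - <<'EOF'
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: csi-smoke-test
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 10Gi
  storageClassName: hcloud-volumes
EOF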


@@ -7,7 +7,7 @@ After=network-online.target
Type=notify
ExecStartPre=-/sbin/modprobe br_netfilter
ExecStartPre=-/sbin/modprobe overlay
-ExecStart=/usr/local/bin/k3s agent --server https://{{ master_ip }}:6443 --node-ip {{ k8s_ip}} --token {{ hostvars[groups['control_plane'][0]]['token'] }} {{ extra_agent_args | default("") }}
+ExecStart=/usr/local/bin/k3s agent --server https://{{ master_ip }}:6443 --node-ip {{ k8s_ip}} --kubelet-arg="provider-id=hcloud://{{ vm_id }}" --token {{ hostvars[groups['control_plane'][0]]['token'] }} {{ extra_agent_args | default("") }}
KillMode=process
Delegate=yes
# Having non-zero Limit*s causes performance problems due to accounting overhead


@@ -34,5 +34,5 @@
    name: minio
    chart_ref: minio/minio
    release_namespace: minio
-   chart_version: 5.0.0
+   chart_version: 5.0.4
    release_values: "{{ lookup('ansible.builtin.file', 'values.minio.yaml') | from_yaml }}"


@@ -33,5 +33,5 @@
    name: prometheus
    chart_ref: prometheus-community/kube-prometheus-stack
    release_namespace: observability-system
-   chart_version: 43.2.0
+   chart_version: 43.2.1
    release_values: "{{ lookup('ansible.builtin.template', 'values.yaml.j2') | from_yaml }}"