events: Delete Node on instance termination
dustin/dynk8s-provisioner/pipeline/head This commit looks good Details

The Cluster Autoscaler does not delete the Node resource in Kubernetes
after it terminates an instance:

> It does not delete the Node object from Kubernetes. Cleaning up Node
> objects corresponding to terminated instances is the responsibility of
> the cloud node controller, which can run as part of
> kube-controller-manager or cloud-controller-manager.

On-premises clusters are probably not running the Cloud Controller
Manager, so Node resources are liable to be left behind after a
scale-down event.

To keep unused Node resources from accumulating, the
*dynk8s-provisioner* will now delete the Node resource associated with
an EC2 instance when it receives a state-change event indicating the
instance has been terminated.  To identify the correct Node, it compares
the value of the `providerID` field of each existing node with the
instance ID mentioned in the event.  An exact match is not possible,
since the provider ID includes the availability zone of the instance,
which is not included in the event; however, instance IDs are unique
enough that this "should" never be an issue.
master
Dustin 2022-10-11 20:00:24 -05:00
parent d85f314a8b
commit cd920418aa
2 changed files with 34 additions and 2 deletions

View File

@ -7,7 +7,7 @@ use log::{debug, error};
use crate::k8s::{ use crate::k8s::{
assign_wireguard_config, create_bootstrap_token, delete_bootstrap_tokens, assign_wireguard_config, create_bootstrap_token, delete_bootstrap_tokens,
unassign_wireguard_config, delete_node, unassign_wireguard_config,
}; };
use crate::model::events::*; use crate::model::events::*;
@ -24,6 +24,7 @@ use crate::model::events::*;
/// When an instance is terminated: /// When an instance is terminated:
/// 1. Any WireGuard configs assigned to the instance are unassigned /// 1. Any WireGuard configs assigned to the instance are unassigned
/// 2. All bootstrap tokens for the instance are deleted /// 2. All bootstrap tokens for the instance are deleted
/// 3. The Kubernetes Node resource for the instance is deleted
pub async fn on_ec2_instance_state_change(evt: Ec2InstanceStateChange) { pub async fn on_ec2_instance_state_change(evt: Ec2InstanceStateChange) {
debug!("EC2 instance {} is now {}", &evt.instance_id, &evt.state); debug!("EC2 instance {} is now {}", &evt.instance_id, &evt.state);
if evt.state == "running" { if evt.state == "running" {
@ -53,5 +54,11 @@ pub async fn on_ec2_instance_state_change(evt: Ec2InstanceStateChange) {
&evt.instance_id, e &evt.instance_id, e
); );
} }
if let Err(e) = delete_node(&evt.instance_id).await {
error!(
"Failed to delete node for instance {}: {}",
&evt.instance_id, e
);
}
} }
} }

View File

@ -3,7 +3,7 @@ use std::collections::btree_map::BTreeMap;
use chrono::offset::Utc; use chrono::offset::Utc;
use chrono::{DateTime, Duration}; use chrono::{DateTime, Duration};
use k8s_openapi::api::core::v1::{ConfigMap, Secret}; use k8s_openapi::api::core::v1::{ConfigMap, Node, Secret};
use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta; use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta;
use kube::core::params::{ListParams, Patch, PatchParams, PostParams}; use kube::core::params::{ListParams, Patch, PatchParams, PostParams};
use kube::{Api, Client}; use kube::{Api, Client};
@ -385,6 +385,31 @@ pub async fn get_kubeconfig<I: AsRef<str>>(
} }
} }
/// Delete the node representing an EC2 instance
///
/// When an EC2 instance is terminated, it is permanently offline.  If the
/// instance was a member of the cluster, it may have a Node resource still
/// present in Kubernetes.  This object needs to be deleted; neither the
/// Cluster Autoscaler nor Kubernetes itself will do this.
///
/// Every Node in the cluster is listed, and each one whose `providerID`
/// begins with `aws:///` and whose final `/`-separated path segment is
/// exactly `instance_id` is deleted.  The segments before the instance ID
/// (presumably the availability zone) are ignored, since the caller only
/// knows the instance ID.
///
/// An empty `instance_id` never identifies a node, so nothing is listed or
/// deleted in that case.
///
/// # Errors
///
/// Returns a [`kube::Error`] if the client cannot be constructed, the Node
/// list cannot be retrieved, or a delete request fails.  Processing stops at
/// the first failed delete.
pub async fn delete_node<I: AsRef<str>>(
    instance_id: I,
) -> Result<(), kube::Error> {
    let instance_id = instance_id.as_ref();
    // Guard against a blank ID: a suffix-style comparison against "" would
    // otherwise select every AWS-backed node in the cluster.
    if instance_id.is_empty() {
        return Ok(());
    }
    let client = Client::try_default().await?;
    let nodes: Api<Node> = Api::all(client);
    for node in nodes.list(&Default::default()).await? {
        if let (Some(name), Some(spec)) = (node.metadata.name, node.spec) {
            if let Some(pid) = spec.provider_id {
                // Compare the whole trailing path segment rather than a raw
                // string suffix, so an ID that merely happens to be the tail
                // of another instance's ID cannot match.
                let is_target = pid
                    .strip_prefix("aws:///")
                    .and_then(|rest| rest.rsplit('/').next())
                    .map_or(false, |id| id == instance_id);
                if is_target {
                    info!("Deleting node {}", &name);
                    nodes.delete(&name, &Default::default()).await?;
                }
            }
        }
    }
    Ok(())
}
/// Retrieve the bootstrap token assigned to an EC2 instance /// Retrieve the bootstrap token assigned to an EC2 instance
async fn get_bootstrap_token<I: AsRef<str>>( async fn get_bootstrap_token<I: AsRef<str>>(
instance_id: I, instance_id: I,