Have a kubernetes cluster running on AWS that was setup many years ago using Kops.
The master node instance had an issue that required replacement, which resulted in a failure to bring up the replacement node due to a missing AMI.
Original AMI was Kops 1.8 with k8s-1.8-debian-jessie-amd64-hvm-ebs-2017-12-02 (ami-06a57e7e)
Have tried multiple different versions but so far unable to bring up a working master.
Secondary issue is the APT repositories from when this cluster was created no longer exist. https://www.docker.com/blog/changes-dockerproject-org-apt-yum-repositories/
Which gives this error during nodeup
813 executor.go:145] No progress made, sleeping before retrying 1 failed task(s)
813 executor.go:103] Tasks: 70 done / 81 total; 1 can run
813 executor.go:178] Executing task "Package/docker-engine": Package: docker-engine
813 package.go:142] Listing installed packages: dpkg-query -f ${db:Status-Abbrev}${Version}\n -W docker-engine
813 package.go:267] Installing package "docker-engine" (dependencies: [])
813 files.go:103] Hash did not match for "/var/cache/nodeup/packages/docker-engine": actual=sha1:da39a3ee5e6b4b0d3255bfef95601890afd80709 vs expected=sha1:19296514610aa2e5efddade5222cafae7894a689
813 http.go:77] Downloading "http://apt.dockerproject.org/repo/pool/main/d/docker-engine/docker-engine_1.13.1-0~debian-stretch_amd64.deb"
813 executor.go:130] error running task "Package/docker-engine" (6m59s remaining to succeed): error doing HTTP fetch of "http://apt.dockerproject.org/repo/pool/main/d/docker-engine/docker-engine_1.13.1-0~debian-stretch_amd64.deb": Get http://apt.dockerproject.org/repo/pool/main/d/docker-engine/docker-engine_1.13.1-0~debian-stretch_amd64.deb: dial tcp: lookup apt.dockerproject.org on 172.20.0.2:53: no such host
813 executor.go:145] No progress made, sleeping before retrying 1 failed task(s)
Have tried later releases of this AMI collection (eg. ami-0d44dd5803abceeb3 k8s-1.16-debian-stretch-amd64-hvm-ebs-2021-02-05) with no success.
Changing version number here sometimes has different results, but no success
NODEUP_URL=https://kubeupv2.s3.amazonaws.com/kops/1.8.0/linux/amd64/nodeup
On some later versions started hitting this error.
got error running nodeup (will retry in 30s): error building loader: unable to found certificate: "ca"
Which was resolved with this suggestion https://github.com/kubernetes/kops/issues/4546 and https://gist.github.com/fredsted/034dbf2f8b1117c37a4add0efeea7029
And then hit this error, which I couldn't find a solution to.
got error running nodeup (will retry in 30s): error building loader: error finding containerd version
Any ideas on how I can launch a working master node that will resume control of the current cluster? Nodes are still running and the S3 state store is intact, but I'm worried a node restart will bring the whole production site down.
Original userdata script below.
#!/bin/bash
# Copyright 2016 The Kubernetes Authors All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -o errexit
set -o nounset
set -o pipefail
NODEUP_URL=https://kubeupv2.s3.amazonaws.com/kops/1.8.0/linux/amd64/nodeup
NODEUP_HASH=
function ensure-install-dir() {
INSTALL_DIR="/var/cache/kubernetes-install"
# On ContainerOS, we install to /var/lib/toolbox install (because of noexec)
if [[ -d /var/lib/toolbox ]]; then
INSTALL_DIR="/var/lib/toolbox/kubernetes-install"
fi
mkdir -p ${INSTALL_DIR}
cd ${INSTALL_DIR}
}
# Retry a download until we get it. Takes a hash and a set of URLs.
#
# $1 is the sha1 of the URL. Can be "" if the sha1 is unknown.
# $2+ are the URLs to download.
download-or-bust() {
local -r hash="$1"
shift 1
urls=( $* )
while true; do
for url in "${urls[@]}"; do
local file="${url##*/}"
rm -f "${file}"
if [[ $(which curl) ]]; then
if ! curl -f --ipv4 -Lo "${file}" --connect-timeout 20 --retry 6 --retry-delay 10 "${url}"; then
echo "== Failed to curl ${url}. Retrying. =="
break
fi
elif [[ $(which wget ) ]]; then
if ! wget --inet4-only -O "${file}" --connect-timeout=20 --tries=6 --wait=10 "${url}"; then
echo "== Failed to wget ${url}. Retrying. =="
break
fi
else
echo "== Could not find curl or wget. Retrying. =="
break
fi
if [[ -n "${hash}" ]] && ! validate-hash "${file}" "${hash}"; then
echo "== Hash validation of ${url} failed. Retrying. =="
else
if [[ -n "${hash}" ]]; then
echo "== Downloaded ${url} (SHA1 = ${hash}) =="
else
echo "== Downloaded ${url} =="
fi
return
fi
done
echo "All downloads failed; sleeping before retrying"
sleep 60
done
}
validate-hash() {
local -r file="$1"
local -r expected="$2"
local actual
actual=$(sha1sum ${file} | awk '{ print $1 }') || true
if [[ "${actual}" != "${expected}" ]]; then
echo "== ${file} corrupted, sha1 ${actual} doesn't match expected ${expected} =="
return 1
fi
}
function split-commas() {
echo $1 | tr "," "\n"
}
function try-download-release() {
# TODO(zmerlynn): Now we REALLY have no excuse not to do the reboot
# optimization.
local -r nodeup_urls=( $(split-commas "${NODEUP_URL}") )
local -r nodeup_filename="${nodeup_urls[0]##*/}"
if [[ -n "${NODEUP_HASH:-}" ]]; then
local -r nodeup_hash="${NODEUP_HASH}"
else
# TODO: Remove?
echo "Downloading sha1 (not found in env)"
download-or-bust "" "${nodeup_urls[@]/%/.sha1}"
local -r nodeup_hash=$(cat "${nodeup_filename}.sha1")
fi
echo "Downloading nodeup (${nodeup_urls[@]})"
download-or-bust "${nodeup_hash}" "${nodeup_urls[@]}"
chmod +x nodeup
}
function download-release() {
# In case of failure checking integrity of release, retry.
until try-download-release; do
sleep 15
echo "Couldn't download release. Retrying..."
done
echo "Running nodeup"
# We can't run in the foreground because of https://github.com/docker/docker/issues/23793
( cd ${INSTALL_DIR}; ./nodeup --install-systemd-unit --conf=${INSTALL_DIR}/kube_env.yaml --v=8 )
}
####################################################################################
/bin/systemd-machine-id-setup || echo "failed to set up ensure machine-id configured"
echo "== nodeup node config starting =="
ensure-install-dir
cat > cluster_spec.yaml << '__EOF_CLUSTER_SPEC'
cloudConfig: null
docker:
bridge: ""
ipMasq: false
ipTables: false
logDriver: json-file
logLevel: warn
logOpt:
- max-size=10m
- max-file=5
storage: overlay,aufs
version: 1.13.1
encryptionConfig: null
kubeAPIServer:
address: 127.0.0.1
admissionControl:
- Initializers
- NamespaceLifecycle
- LimitRanger
- ServiceAccount
- PersistentVolumeLabel
- DefaultStorageClass
- DefaultTolerationSeconds
- NodeRestriction
- Priority
- ResourceQuota
allowPrivileged: true
anonymousAuth: false
apiServerCount: 1
authorizationMode: AlwaysAllow
cloudProvider: aws
etcdServers:
- http://127.0.0.1:4001
etcdServersOverrides:
- /events#http://127.0.0.1:4002
image: gcr.io/google_containers/kube-apiserver:v1.8.4
insecurePort: 8080
kubeletPreferredAddressTypes:
- InternalIP
- Hostname
- ExternalIP
logLevel: 2
requestheaderAllowedNames:
- aggregator
requestheaderExtraHeaderPrefixes:
- X-Remote-Extra-
requestheaderGroupHeaders:
- X-Remote-Group
requestheaderUsernameHeaders:
- X-Remote-User
securePort: 443
serviceClusterIPRange: 100.64.0.0/13
storageBackend: etcd2
kubeControllerManager:
allocateNodeCIDRs: true
attachDetachReconcileSyncPeriod: 1m0s
cloudProvider: aws
clusterCIDR: 100.96.0.0/11
clusterName: myclustername.k8s.local
configureCloudRoutes: true
image: gcr.io/google_containers/kube-controller-manager:v1.8.4
leaderElection:
leaderElect: true
logLevel: 2
useServiceAccountCredentials: true
kubeProxy:
clusterCIDR: 100.96.0.0/11
cpuRequest: 100m
featureGates: null
hostnameOverride: '@aws'
image: gcr.io/google_containers/kube-proxy:v1.8.4
logLevel: 2
kubeScheduler:
image: gcr.io/google_containers/kube-scheduler:v1.8.4
leaderElection:
leaderElect: true
logLevel: 2
kubelet:
allowPrivileged: true
cgroupRoot: /
cloudProvider: aws
clusterDNS: 100.64.0.10
clusterDomain: cluster.local
enableDebuggingHandlers: true
evictionHard: memory.available<100Mi,nodefs.available<10%,nodefs.inodesFree<5%,imagefs.available<10%,imagefs.inodesFree<5%
featureGates:
ExperimentalCriticalPodAnnotation: "true"
hostnameOverride: '@aws'
kubeconfigPath: /var/lib/kubelet/kubeconfig
logLevel: 2
networkPluginMTU: 9001
networkPluginName: kubenet
nonMasqueradeCIDR: 100.64.0.0/10
podInfraContainerImage: gcr.io/google_containers/pause-amd64:3.0
podManifestPath: /etc/kubernetes/manifests
requireKubeconfig: true
masterKubelet:
allowPrivileged: true
cgroupRoot: /
cloudProvider: aws
clusterDNS: 100.64.0.10
clusterDomain: cluster.local
enableDebuggingHandlers: true
evictionHard: memory.available<100Mi,nodefs.available<10%,nodefs.inodesFree<5%,imagefs.available<10%,imagefs.inodesFree<5%
featureGates:
ExperimentalCriticalPodAnnotation: "true"
hostnameOverride: '@aws'
kubeconfigPath: /var/lib/kubelet/kubeconfig
logLevel: 2
networkPluginMTU: 9001
networkPluginName: kubenet
nonMasqueradeCIDR: 100.64.0.0/10
podInfraContainerImage: gcr.io/google_containers/pause-amd64:3.0
podManifestPath: /etc/kubernetes/manifests
registerSchedulable: false
requireKubeconfig: true
__EOF_CLUSTER_SPEC
cat > ig_spec.yaml << '__EOF_IG_SPEC'
kubelet: null
nodeLabels:
kops.k8s.io/instancegroup: master-us-west-2b
taints: null
__EOF_IG_SPEC
cat > kube_env.yaml << '__EOF_KUBE_ENV'
Assets:
- 125993c220d1a9b5b60ad20a867a0e7cda63e64c@https://storage.googleapis.com/kubernetes-release/release/v1.8.4/bin/linux/amd64/kubelet
- 8e2314db816b9b4465c5f713c1152cb0603db15e@https://storage.googleapis.com/kubernetes-release/release/v1.8.4/bin/linux/amd64/kubectl
- 1d9788b0f5420e1a219aad2cb8681823fc515e7c@https://storage.googleapis.com/kubernetes-release/network-plugins/cni-0799f5732f2a11b329d9e3d51b9c8f2e3759f2ff.tar.gz
- f62360d3351bed837ae3ffcdee65e9d57511695a@https://kubeupv2.s3.amazonaws.com/kops/1.8.0/linux/amd64/utils.tar.gz
ClusterName: myclustername.k8s.local
ConfigBase: s3://myclustername-state-store/myclustername.k8s.local
InstanceGroupName: master-us-west-2b
Tags:
- _automatic_upgrades
- _aws
- _kubernetes_master
channels:
- s3://myclustername-state-store/myclustername.k8s.local/addons/bootstrap-channel.yaml
protokubeImage:
hash: 1b972e92520b3cafd576893ae3daeafdd1bc9ffd
name: protokube:1.8.0
source: https://kubeupv2.s3.amazonaws.com/kops/1.8.0/images/protokube.tar.gz
__EOF_KUBE_ENV
download-release
echo "== nodeup node config done =="