Common Operations¶
Quick reference for frequent cluster tasks.
Pods¶
# List all pods across namespaces (show status)
kubectl get pods -A
# Show pods whose status is not Running or Completed (note: Running pods that are not Ready are not shown)
kubectl get pods -A | grep -Ev 'Running|Completed'
# Logs from a pod (follow)
kubectl logs -n <namespace> <pod> -f
# Logs from previous crashed container
kubectl logs -n <namespace> <pod> --previous
# Exec into a pod
kubectl exec -n <namespace> -it <pod> -- /bin/sh
# Describe pod (events, resource requests, node assignment)
kubectl describe pod -n <namespace> <pod>
# Force-delete a stuck Terminating pod
kubectl delete pod -n <namespace> <pod> --force --grace-period=0
Deployments / StatefulSets¶
# Restart a deployment (rolling restart, no downtime)
kubectl rollout restart deployment -n <namespace> <name>
# Watch rollout progress
kubectl rollout status deployment -n <namespace> <name>
# Scale up / down
kubectl scale deployment -n <namespace> <name> --replicas=1
# Get recent events for a deployment
kubectl describe deployment -n <namespace> <name>
Nodes¶
# List nodes with IPs and roles
kubectl get nodes -o wide
# Check node resource usage
kubectl top nodes
# Drain node before maintenance
kubectl drain <node> --ignore-daemonsets --delete-emptydir-data
# Re-enable scheduling after maintenance
kubectl uncordon <node>
# Describe node (conditions, capacity, pods running)
kubectl describe node <node>
ArgoCD¶
# Force sync an app (from CLI)
argocd app sync <app-name>
# Or trigger a hard refresh via kubectl (the annotation makes Argo CD re-compare against Git immediately; a sync follows only if auto-sync is enabled)
kubectl annotate application -n argocd <app-name> \
argocd.argoproj.io/refresh=hard --overwrite
# Check app health
argocd app get <app-name>
# List all apps and their sync status
argocd app list
Via the UI: argocd.zaroz.cloud → click the app → Sync.
Certificates¶
# List all certificates (all namespaces)
kubectl get certificate -A
# Watch a certificate being issued
kubectl get certificate -n <namespace> <name> -w
# Inspect why a cert is stuck
kubectl describe certificate -n <namespace> <name>
kubectl describe certificaterequest -n <namespace>
kubectl describe order -n <namespace>
kubectl describe challenge -n <namespace>
# Check cert-manager logs
kubectl logs -n cert-manager deploy/cert-manager -f
Helm¶
# List installed releases
helm list -A
# Upgrade a release with updated values
helm upgrade -n <namespace> <release> <chart> -f values.yaml
# Check history
helm history -n <namespace> <release>
# Roll back to the previous revision (append a revision number from `helm history` to pick a specific one)
helm rollback -n <namespace> <release>
Secrets¶
# Create a secret from literal values
kubectl create secret generic <name> -n <namespace> \
--from-literal=key=value
# View decoded secret
kubectl get secret -n <namespace> <name> -o jsonpath='{.data}' \
| jq 'map_values(@base64d)'
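# Update a secret value (values under .data must be base64-encoded; with GNU base64 add -w0 for long values to prevent line wrapping)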
kubectl patch secret -n <namespace> <name> \
-p '{"data":{"key":"'$(echo -n "newvalue" | base64)'"}}'
Namespaces¶
# Create a namespace
kubectl create namespace <name>
# Add a label (e.g. customer namespace)
kubectl label namespace <name> zaroz.cloud/managed=true
# List all namespaces with labels
kubectl get namespaces --show-labels
# Delete a namespace (deletes all resources inside it)
kubectl delete namespace <name>
MetalLB¶
# List IP address pools
kubectl get ipaddresspools -n metallb-system
# See which services have LoadBalancer IPs assigned
kubectl get svc -A --field-selector spec.type=LoadBalancer
# Check MetalLB speaker logs (L2 ARP)
kubectl logs -n metallb-system ds/speaker -f
Calico / Network Policies¶
# List GlobalNetworkPolicies
kubectl get globalnetworkpolicies
# Test network isolation from a customer namespace
kubectl run curl-test --image=curlimages/curl --restart=Never -n <customer-ns> -- \
curl -s --max-time 3 https://1.1.1.1
kubectl logs curl-test -n <customer-ns>
kubectl delete pod curl-test -n <customer-ns>
# Check WireGuard keys (all nodes should have one)
kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.metadata.annotations.projectcalico\.org/WireguardPublicKey}{"\n"}{end}'
# Calico node status (BGP, WireGuard peers)
kubectl exec -n calico-system ds/calico-node -- calico-node -show-status
Storage (NFS)¶
# List PVCs
kubectl get pvc -A
# List PVs (check reclaim policy and status)
kubectl get pv
# Describe a stuck PVC
kubectl describe pvc -n <namespace> <name>
# Check NFS provisioner logs
kubectl logs -n default deploy/nfs-provisioner-nfs-subdir-external-provisioner -f
Cluster Health¶
# etcd member list
kubectl exec -n kube-system -it etcd-<node> -- \
etcdctl --cacert /etc/kubernetes/pki/etcd/ca.crt \
--cert /etc/kubernetes/pki/etcd/peer.crt \
--key /etc/kubernetes/pki/etcd/peer.key \
member list
# API server reachability
kubectl cluster-info
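# Component status (the ComponentStatus API is deprecated since Kubernetes v1.19; results may be incomplete or misleading)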
kubectl get componentstatuses