Common Operations

Quick reference for frequent cluster tasks.

Pods

# List all pods across namespaces (show status)
kubectl get pods -A

# Show only non-healthy pods.
# Filters on the STATUS column (4th column with -A) instead of grepping the
# whole line, so a pod or namespace whose *name* contains "Running" or
# "Completed" is not hidden by accident. The header row is kept.
# Note: a pod in phase Running with unready containers (READY 0/1) still
# counts as healthy here; check the READY column for those.
kubectl get pods -A | awk 'NR == 1 || ($4 != "Running" && $4 != "Completed")'

# Logs from a pod (follow)
kubectl logs -n <namespace> <pod> -f

# Logs from previous crashed container
kubectl logs -n <namespace> <pod> --previous

# Exec into a pod
kubectl exec -n <namespace> -it <pod> -- /bin/sh

# Describe pod (events, resource requests, node assignment)
kubectl describe pod -n <namespace> <pod>

# Force-delete a stuck Terminating pod.
# Caution: this removes the pod object from the API immediately without
# waiting for the kubelet to confirm the containers have stopped. Use it only
# when a normal delete is genuinely stuck.
kubectl delete pod -n <namespace> <pod> --force --grace-period=0

Deployments / StatefulSets

# The commands below target a Deployment; substitute `statefulset` for
# `deployment` to run the same operation against a StatefulSet.

# Restart a deployment (rolling restart, no downtime)
kubectl rollout restart deployment -n <namespace> <name>

# Watch rollout progress
kubectl rollout status deployment -n <namespace> <name>

# Scale up / down (set <count> to the desired number of replicas)
kubectl scale deployment -n <namespace> <name> --replicas=<count>

# Get recent events for a deployment
kubectl describe deployment -n <namespace> <name>

Nodes

# Node overview, including IPs and roles
kubectl get nodes --output=wide

# Resource usage per node
kubectl top nodes

# Evict workloads from a node ahead of maintenance
kubectl drain <node> --delete-emptydir-data --ignore-daemonsets

# Make the node schedulable again once maintenance is done
kubectl uncordon <node>

# Full node details (conditions, capacity, pods running)
kubectl describe node <node>

ArgoCD

# Force sync an app (from CLI)
argocd app sync <app-name>

# Or force a hard refresh via kubectl. The annotation makes Argo CD
# regenerate the manifests (bypassing its cache) and recompute the diff
# immediately. It does not sync by itself: a sync only follows if the app has
# automated sync enabled.
# The resource is fully qualified so it cannot be confused with another CRD
# that is also called "application".
kubectl annotate applications.argoproj.io -n argocd <app-name> \
  argocd.argoproj.io/refresh=hard --overwrite

# Check app health
argocd app get <app-name>

# List all apps and their sync status
argocd app list

Via the UI: argocd.zaroz.cloud → click the app → Sync.

Certificates

# Every Certificate resource, across all namespaces
kubectl get certificate --all-namespaces

# Follow a certificate while it is being issued
kubectl get certificate --namespace <namespace> <name> --watch

# Find out why issuance is stuck
kubectl describe certificate --namespace <namespace> <name>
kubectl describe certificaterequest --namespace <namespace>
kubectl describe order --namespace <namespace>
kubectl describe challenge --namespace <namespace>

# Stream the cert-manager logs
kubectl logs --namespace cert-manager deployment/cert-manager --follow

Helm

# Installed releases in every namespace
helm list --all-namespaces

# Apply updated values to an existing release
helm upgrade --namespace <namespace> <release> <chart> --values values.yaml

# Revision history of a release
helm history --namespace <namespace> <release>

# Revert to the previous revision
helm rollback --namespace <namespace> <release>

Secrets

# Create a secret from literal values
kubectl create secret generic <name> -n <namespace> \
  --from-literal=key=value

# View decoded secret.
# Uses -o json so jq always receives valid JSON (the output format of
# -o jsonpath='{.data}' is not guaranteed to be JSON for map values).
kubectl get secret -n <namespace> <name> -o json \
  | jq '.data | map_values(@base64d)'

# Update a secret value.
# stringData takes the plain-text value and the API server base64-encodes it,
# which avoids hand-built JSON with an unquoted $(...) and the line wrapping
# that `base64` applies to long values.
kubectl patch secret -n <namespace> <name> \
  -p '{"stringData":{"key":"newvalue"}}'

Namespaces

# New namespace
kubectl create namespace <name>

# Label it (e.g. to mark a customer namespace)
kubectl label namespace/<name> zaroz.cloud/managed=true

# All namespaces together with their labels
kubectl get namespace --show-labels

# Remove a namespace (this deletes all resources inside it)
kubectl delete namespace/<name>

MetalLB

# IP address pools defined for MetalLB
kubectl get ipaddresspools --namespace metallb-system

# Services that have LoadBalancer IPs assigned
kubectl get services --all-namespaces --field-selector spec.type=LoadBalancer

# Follow the MetalLB speaker logs (L2 ARP)
kubectl logs --namespace metallb-system daemonset/speaker --follow

Calico / Network Policies

# List GlobalNetworkPolicies
kubectl get globalnetworkpolicies

# Test network isolation from a customer namespace.
# --rm -i attaches to the pod, prints curl's output and deletes the pod once it
# exits. This removes the race between `kubectl run` and a separate
# `kubectl logs` (the pod may not have started or finished yet) and leaves no
# test pod behind if a step is skipped.
kubectl run curl-test --image=curlimages/curl --restart=Never --rm -i \
  -n <customer-ns> -- \
  curl -s --max-time 3 https://1.1.1.1

# Check WireGuard keys (all nodes should have one)
kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.metadata.annotations.projectcalico\.org/WireguardPublicKey}{"\n"}{end}'

# Calico node status (BGP, WireGuard peers)
kubectl exec -n calico-system ds/calico-node -- calico-node -show-status

Storage (NFS)

# PersistentVolumeClaims in every namespace
kubectl get persistentvolumeclaims --all-namespaces

# PersistentVolumes (check reclaim policy and status)
kubectl get persistentvolumes

# Inspect a PVC that is stuck
kubectl describe persistentvolumeclaim --namespace <namespace> <name>

# Follow the NFS provisioner logs
kubectl logs --namespace default deployment/nfs-provisioner-nfs-subdir-external-provisioner --follow

Cluster Health

# etcd member list.
# No -it: the command is non-interactive, and requesting a TTY only produces a
# warning when no terminal is attached (scripts, CI).
kubectl exec -n kube-system etcd-<node> -- \
  etcdctl --cacert /etc/kubernetes/pki/etcd/ca.crt \
          --cert /etc/kubernetes/pki/etcd/peer.crt \
          --key /etc/kubernetes/pki/etcd/peer.key \
          member list

# API server reachability
kubectl cluster-info

# Control-plane health: per-check readiness reported by the API server
# (including its etcd connection). Use this instead of
# `kubectl get componentstatuses`, which is deprecated since Kubernetes v1.19.
kubectl get --raw='/readyz?verbose'