#!/bin/bash
#
# Nagios-style health check for NVMe disks, based on `nvme smart-log`.
#
# Every NVMe disk listed by `lsblk` is checked. One status line is printed
# per disk, most severe first, and the exit code reflects the worst result:
#   0 OK, 1 WARNING, 2 CRITICAL, 3 UNKNOWN
# Command-line arguments are ignored.
#
# Requires: lsblk, nvme (nvme-cli), awk.

# Nagios plugin exit codes.
readonly STATE_OK=0
readonly STATE_WARNING=1
readonly STATE_CRITICAL=2
readonly STATE_UNKNOWN=3

# Warning thresholds.
readonly PERCENTAGE_USED_WARN=90   # percent of rated endurance consumed
readonly TEMPERATURE_WARN=70       # degrees Celsius

#######################################
# Print the raw value of one "name : value" line of a SMART log.
# The field name must match exactly, so e.g. "temperature" does not pick up
# other lines that merely contain that word.
# Arguments: $1 - field name, $2 - SMART log text
# Outputs:   the value with surrounding blanks removed (nothing if absent)
#######################################
smart_field() {
  local field=$1 log=$2
  awk -v key="$field" '
    {
      sep = index($0, ":")
      if (sep == 0) next
      name = substr($0, 1, sep - 1)
      gsub(/^[ \t]+|[ \t]+$/, "", name)
      if (name != key) next
      value = substr($0, sep + 1)
      gsub(/^[ \t]+|[ \t]+$/, "", value)
      print value
      exit
    }
  ' <<<"$log"
}

#######################################
# Print the leading decimal integer of a value ("35 C" -> 35, "3%" -> 3,
# "1,234" -> 1234).
# Arguments: $1 - raw field value
# Returns:   non-zero (and prints nothing) if the value has no leading integer
#######################################
leading_int() {
  local value=${1//,/}
  local -r int_prefix='^-?[0-9]+'
  [[ $value =~ $int_prefix ]] || return 1
  printf '%s\n' "${BASH_REMATCH[0]}"
}

#######################################
# Succeed when the argument is zero in decimal or hex notation (0, 0x0).
#######################################
is_zero() {
  local -r zero='^(0[xX])?0+$'
  [[ $1 =~ $zero ]]
}

#######################################
# Check a single NVMe disk.
# Arguments: $1 - device name as printed by lsblk (e.g. nvme0n1)
# Outputs:   one status line on stdout
# Returns:   the Nagios state for this disk
#######################################
check_device() {
  local device=$1
  local smart_log

  # LC_ALL=C asks for locale-neutral output so counters are not printed with
  # locale-specific thousands separators.
  if ! smart_log=$(LC_ALL=C nvme smart-log "/dev/${device}" 2>/dev/null); then
    echo "CRITICAL: Failed to retrieve SMART log for $device."
    return "$STATE_CRITICAL"
  fi

  local critical_warning percentage_used temperature
  local media_errors num_err_log_entries
  critical_warning=$(smart_field critical_warning "$smart_log")
  critical_warning=${critical_warning%%[[:space:]]*}   # keep the first token only
  percentage_used=$(leading_int "$(smart_field percentage_used "$smart_log")")
  temperature=$(leading_int "$(smart_field temperature "$smart_log")")
  media_errors=$(leading_int "$(smart_field media_errors "$smart_log")")
  num_err_log_entries=$(leading_int "$(smart_field num_err_log_entries "$smart_log")")

  # Refuse to report a verdict from values that could not be parsed; a failed
  # numeric test would otherwise fall through to the OK branch.
  # critical_warning may be hex (nvme-cli appears to print non-zero values
  # as e.g. 0x4), so both notations are accepted.
  local -r hex_or_dec='^(0[xX][0-9a-fA-F]+|[0-9]+)$'
  if [[ ! $critical_warning =~ $hex_or_dec ]] \
    || [[ -z $percentage_used || -z $temperature ]] \
    || [[ -z $media_errors || -z $num_err_log_entries ]]; then
    echo "UNKNOWN: Could not parse SMART log for $device."
    return "$STATE_UNKNOWN"
  fi

  # `[ ... ]` is used for the numeric tests on purpose: unlike (( )) and
  # [[ ]], it reads operands with leading zeros as decimal, not octal.
  if ! is_zero "$critical_warning"; then
    echo "CRITICAL: Critical warning detected on $device."
    return "$STATE_CRITICAL"
  elif [ "$percentage_used" -ge "$PERCENTAGE_USED_WARN" ]; then
    echo "WARNING: $device is at $percentage_used% of its lifespan."
    return "$STATE_WARNING"
  elif [ "$temperature" -ge "$TEMPERATURE_WARN" ]; then
    echo "WARNING: $device temperature is high (${temperature}C)."
    return "$STATE_WARNING"
  elif [ "$media_errors" -ne 0 ]; then
    # Reported with the WARNING exit code, so the label says WARNING too.
    echo "WARNING: Media errors detected on $device. media_error : $media_errors $num_err_log_entries"
    return "$STATE_WARNING"
  elif [ "$num_err_log_entries" -ne 0 ]; then
    echo "WARNING: Error log entries detected on $device. num_err_log_entries : $num_err_log_entries"
    return "$STATE_WARNING"
  fi

  echo "OK: $device is healthy (Used: $percentage_used%, Temp: ${temperature}C, $media_errors $num_err_log_entries)."
  return "$STATE_OK"
}

#######################################
# Discover NVMe disks, check each one and exit with the worst state.
#######################################
main() {
  # One array element per disk, so several NVMe disks are handled separately
  # instead of being glued into a single multi-line device name.
  local -a devices=()
  mapfile -t devices < <(lsblk -dn -o NAME | grep '^nvme')
  if (( ${#devices[@]} == 0 )); then
    echo "UNKNOWN: No NVMe device specified."
    exit "$STATE_UNKNOWN"
  fi

  if ! command -v nvme &>/dev/null; then
    echo "CRITICAL: nvme-cli is not installed."
    exit "$STATE_CRITICAL"
  fi

  local device message state
  local -a critical=() unknown=() warning=() ok=()
  for device in "${devices[@]}"; do
    message=$(check_device "$device")
    state=$?
    case "$state" in
      "$STATE_OK") ok+=("$message") ;;
      "$STATE_WARNING") warning+=("$message") ;;
      "$STATE_CRITICAL") critical+=("$message") ;;
      *) unknown+=("$message") ;;
    esac
  done

  # Most severe line first: monitoring systems typically show the first line
  # of plugin output as the status text.
  local -a report=("${critical[@]}" "${unknown[@]}" "${warning[@]}" "${ok[@]}")
  printf '%s\n' "${report[@]}"

  if (( ${#critical[@]} > 0 )); then
    exit "$STATE_CRITICAL"
  elif (( ${#unknown[@]} > 0 )); then
    exit "$STATE_UNKNOWN"
  elif (( ${#warning[@]} > 0 )); then
    exit "$STATE_WARNING"
  fi
  exit "$STATE_OK"
}

main "$@"