[CentOS] NVidia, again

Wed Mar 26 21:17:47 UTC 2014
Paul Heinlein <heinlein at madboa.com>

Mark,

Here's the init script I use on our 3-card CUDA box. In particular, 
note the mknod calls in createnodes(), which create the /dev/nvidia* 
device nodes and might be at issue in your situation. 
(Sorry about line breaks; you may have to guess in a couple of spots.)

----- %< -----
#!/bin/bash
#
# Startup/shutdown script for nVidia CUDA
#
# chkconfig: 345 80 20
# description: Startup/shutdown script for nVidia CUDA
#
# =====================================================

# Source function library.

. /etc/init.d/functions

# Name of the kernel module this script loads and unloads.
DRIVER="nvidia"
# Exit status reported when the script finishes; stays 0 until a step fails.
RETVAL="0"

# Create /dev nodes for nvidia devices
#
# Counts the NVIDIA "3D controller" and "VGA compatible controller" entries
# reported by lspci and creates one /dev/nvidiaN character device (major 195,
# minor N) per controller, followed by the /dev/nvidiactl control node
# (major 195, minor 255). All nodes are created with mode 666.
#
# A node that already exists as a character device is left alone, so calling
# this a second time (or after something else has created the nodes) does not
# fail with "File exists".
#
# Globals: RETVAL (written) - 0 on success, else the failing mknod's status
# Returns: 0 on success, otherwise the exit status of the mknod that failed.
#          The function returns instead of exiting so that the caller can
#          still print its failure status before bailing out.
function createnodes() {
   local count i node

   # grep -c prints 0 when nothing matches, so the loop below is then skipped.
   count=$(/sbin/lspci | grep -i NVIDIA \
      | grep -c -E "3D controller|VGA compatible controller")

   for ((i = 0; i < count; i++)); do
      node=/dev/nvidia$i
      # Skip nodes that are already in place.
      [ -c "$node" ] && continue
      mknod -m 666 "$node" c 195 "$i"
      RETVAL=$?
      [ "$RETVAL" = 0 ] || return "$RETVAL"
   done

   if [ ! -c /dev/nvidiactl ]; then
      mknod -m 666 /dev/nvidiactl c 195 255
      RETVAL=$?
      [ "$RETVAL" = 0 ] || return "$RETVAL"
   fi

   RETVAL=0
   return 0
}

# Delete every /dev entry whose name begins with "nvidia".
# Returns the exit status of rm.
function removenodes() {
   local -a targets
   # Same glob as a direct "rm -f /dev/nvidia*"; with -f, rm stays quiet
   # and succeeds when there is nothing to delete.
   targets=(/dev/nvidia*)
   rm -f "${targets[@]}"
}

# Start daemon
#
# Runs three steps, printing a status line for each:
#   1. load the $DRIVER kernel module with modprobe;
#   2. create the CUDA /dev entries via createnodes;
#   3. ask nvidia-smi to enable persistence mode.
# Steps 1 and 2 are fatal: on failure the script exits with that step's
# status. Step 3 is best-effort and never changes RETVAL.
#
# success/failure are the status-reporting helpers, presumably provided by
# the sourced /etc/init.d/functions. The real command's exit status is
# captured explicitly rather than taken from those helpers.
#
# Globals: DRIVER (read), RETVAL (written)
function start() {
   echo -n $"Loading $DRIVER kernel module: "
   if modprobe "$DRIVER"; then
      RETVAL=0
      success
   else
      RETVAL=$?
      failure
   fi
   echo
   [ "$RETVAL" = 0 ] || exit "$RETVAL"

   echo -n $"Initializing CUDA /dev entries: "
   if createnodes; then
      RETVAL=0
      success
   else
      RETVAL=$?
      failure
   fi
   echo
   [ "$RETVAL" = 0 ] || exit "$RETVAL"

   # this can fail without stopping the entire script
   echo -n $"Setting persistence mode: "
   if /usr/bin/nvidia-smi -pm 1; then
      success
   else
      failure
   fi
   # Terminate the status line, as the other steps do.
   echo
}

# Stop daemon
#
# Force-unloads the $DRIVER kernel module with rmmod -f, then removes the
# CUDA /dev entries via removenodes, printing a status line for each step.
# Either step failing makes the script exit with that step's status; if the
# module cannot be unloaded the /dev entries are left in place.
#
# success/failure are the status-reporting helpers, presumably provided by
# the sourced /etc/init.d/functions. The real command's exit status is
# captured explicitly rather than taken from those helpers.
#
# Globals: DRIVER (read), RETVAL (written)
function stop() {
   echo -n $"Unloading $DRIVER kernel module: "
   if rmmod -f "$DRIVER"; then
      RETVAL=0
      success
   else
      RETVAL=$?
      failure
   fi
   echo
   [ "$RETVAL" = 0 ] || exit "$RETVAL"

   echo -n $"Removing CUDA /dev entries: "
   if removenodes; then
      RETVAL=0
      success
   else
      RETVAL=$?
      failure
   fi
   echo
   [ "$RETVAL" = 0 ] || exit "$RETVAL"
}

# Dispatch on the action given as the first argument. Anything other than
# start, stop or restart prints the usage line and sets a status of 1.
if [ "$1" = "start" ]; then
   start
elif [ "$1" = "stop" ]; then
   stop
elif [ "$1" = "restart" ]; then
   stop
   start
else
   echo $"Usage: $0 {start|stop|restart}"
   RETVAL=1
fi
exit $RETVAL
----- %< -----

-- 
Paul Heinlein
heinlein at madboa.com
45°38' N, 122°6' W