Configuration

ganglia

Management Node

/* This configuration is as close to 2.5.x default behavior as possible
The values closely match ./gmond/metric.h definitions in 2.5.x */
globals {
  daemonize = yes
  setuid = yes
  user = ganglia
  debug_level = 0
  max_udp_msg_len = 1472
  mute = no
  deaf = no
  allow_extra_data = yes
  host_dmax = 86400 /*secs. Expires (removes from web interface) hosts in 1 day */
  host_tmax = 20 /*secs */
  cleanup_threshold = 300 /*secs */
  gexec = no
  # By default gmond will use reverse DNS resolution when displaying your hostname
  # Uncommenting the following value will override that behavior.
  # override_hostname = "mywebserver.domain.com"
  # If you are not using multicast this value should be set to something other than 0.
  # Otherwise, if you restart the aggregator gmond you will get empty graphs. 60 seconds is reasonable.
  send_metadata_interval = 60 /*secs */

}

/*
 * The cluster attributes specified will be used as part of the <CLUSTER>
 * tag that will wrap all hosts collected by this instance.
 */
cluster {
  name = "OpenHPC"
  owner = "unspecified"
  latlong = "unspecified"
  url = "unspecified"
}

/* The host section describes attributes of the host, like the location */
host {
  location = "unspecified"
}

/* Feel free to specify as many udp_send_channels as you like.  Gmond
   used to only support having a single channel */
udp_send_channel {
  # bind_hostname = yes # Highly recommended, soon to be default.
                        # This option tells gmond to use a source address
                        # that resolves to the machine's hostname.  Without
                        # this, the metrics may appear to come from any
                        # interface and the DNS names associated with
                        # those IPs will be used to create the RRDs.
  # management node's hostname
  host = mgt
  port = 8649
  ttl = 10
}

/* You can specify as many udp_recv_channels as you like as well. */
udp_recv_channel {
  port = 8649
  retry_bind = true
  # Size of the UDP buffer. If you are handling lots of metrics you really
  # should bump it up to e.g. 10MB or even higher.
  buffer = 10485760
}

/* You can specify as many tcp_accept_channels as you like to share
   an xml description of the state of the cluster */
tcp_accept_channel {
  # Bind to the loopback interface so the XML is served only locally
  bind = 127.0.0.1
  port = 8649
  # If you want to gzip XML output
  gzip_output = no
}

/* Channel to receive sFlow datagrams */
# udp_recv_channel {
#   port = 6343
# }

/* Optional sFlow settings */
# sflow {
#  udp_port = 6343
#  accept_vm_metrics = yes
#  accept_jvm_metrics = yes
#  multiple_jvm_instances = no
#  accept_http_metrics = yes
#  multiple_http_instances = no
#  accept_memcache_metrics = yes
#  multiple_memcache_instances = no
# }

/* Each metrics module that is referenced by gmond must be specified and
   loaded. If the module has been statically linked with gmond, it does
   not require a load path. However all dynamically loadable modules must
   include a load path. */
modules {
  module {
    name = "core_metrics"
  }
  module {
    name = "cpu_module"
    path = "modcpu.so"
  }
  module {
    name = "disk_module"
    path = "moddisk.so"
  }
  module {
    name = "load_module"
    path = "modload.so"
  }
  module {
    name = "mem_module"
    path = "modmem.so"
  }
  module {
    name = "net_module"
    path = "modnet.so"
  }
  module {
    name = "proc_module"
    path = "modproc.so"
  }
  module {
    name = "sys_module"
    path = "modsys.so"
  }
}

/* The old internal 2.5.x metric array has been replaced by the following
   collection_group directives.  What follows is the default behavior for
   collecting and sending metrics that is as close to 2.5.x behavior as
   possible. */

/* This collection group will cause a heartbeat (or beacon) to be sent every
   20 seconds.  In the heartbeat is the GMOND_STARTED data which expresses
   the age of the running gmond. */
collection_group {
  collect_once = yes
  time_threshold = 20
  metric {
    name = "heartbeat"
  }
}

/* This collection group will send general info about this host */
collection_group {
  collect_every = 60
  time_threshold = 60
  metric {
    name = "cpu_num"
    title = "CPU Count"
  }
  metric {
    name = "cpu_speed"
    title = "CPU Speed"
  }
  metric {
    name = "mem_total"
    title = "Memory Total"
  }
  metric {
    name = "swap_total"
    title = "Swap Space Total"
  }
  metric {
    name = "boottime"
    title = "Last Boot Time"
  }
  metric {
    name = "machine_type"
    title = "Machine Type"
  }
  metric {
    name = "os_name"
    title = "Operating System"
  }
  metric {
    name = "os_release"
    title = "Operating System Release"
  }
  metric {
    name = "location"
    title = "Location"
  }
}

/* This collection group will send the status of gexecd for this host
   every 300 secs.*/
/* Unlike 2.5.x the default behavior is to report gexecd OFF. */
collection_group {
  collect_once = yes
  time_threshold = 300
  metric {
    name = "gexec"
    title = "Gexec Status"
  }
}

/* This collection group will collect the CPU status info every 20 secs.
   The time threshold is set to 90 seconds.  In honesty, this
   time_threshold could be set significantly higher to reduce
   unnecessary network chatter. */
collection_group {
  collect_every = 20
  time_threshold = 90
  /* CPU status */
  metric {
    name = "cpu_user"
    value_threshold = "1.0"
    title = "CPU User"
  }
  metric {
    name = "cpu_system"
    value_threshold = "1.0"
    title = "CPU System"
  }
  metric {
    name = "cpu_idle"
    value_threshold = "5.0"
    title = "CPU Idle"
  }
  metric {
    name = "cpu_nice"
    value_threshold = "1.0"
    title = "CPU Nice"
  }
  metric {
    name = "cpu_aidle"
    value_threshold = "5.0"
    title = "CPU aidle"
  }
  metric {
    name = "cpu_wio"
    value_threshold = "1.0"
    title = "CPU wio"
  }
  metric {
    name = "cpu_steal"
    value_threshold = "1.0"
    title = "CPU steal"
  }
  /* The next two metrics are optional if you want more detail...
     ... since they are accounted for in cpu_system.
  metric {
    name = "cpu_intr"
    value_threshold = "1.0"
    title = "CPU intr"
  }
  metric {
    name = "cpu_sintr"
    value_threshold = "1.0"
    title = "CPU sintr"
  }
  */
}

collection_group {
  collect_every = 20
  time_threshold = 90
  /* Load Averages */
  metric {
    name = "load_one"
    value_threshold = "1.0"
    title = "One Minute Load Average"
  }
  metric {
    name = "load_five"
    value_threshold = "1.0"
    title = "Five Minute Load Average"
  }
  metric {
    name = "load_fifteen"
    value_threshold = "1.0"
    title = "Fifteen Minute Load Average"
  }
}

/* This group collects the number of running and total processes */
collection_group {
  collect_every = 80
  time_threshold = 180
  metric {
    name = "proc_run"
    value_threshold = "1.0"
    title = "Total Running Processes"
  }
  metric {
    name = "proc_total"
    value_threshold = "1.0"
    title = "Total Processes"
  }
}

/* This collection group grabs the volatile memory metrics every 40 secs and
   sends them at least every 180 secs.  This time_threshold can be increased
   significantly to reduce unneeded network traffic. */
collection_group {
  collect_every = 40
  time_threshold = 180
  metric {
    name = "mem_free"
    value_threshold = "1024.0"
    title = "Free Memory"
  }
  metric {
    name = "mem_shared"
    value_threshold = "1024.0"
    title = "Shared Memory"
  }
  metric {
    name = "mem_buffers"
    value_threshold = "1024.0"
    title = "Memory Buffers"
  }
  metric {
    name = "mem_cached"
    value_threshold = "1024.0"
    title = "Cached Memory"
  }
  metric {
    name = "swap_free"
    value_threshold = "1024.0"
    title = "Free Swap Space"
  }
}

collection_group {
  collect_every = 40
  time_threshold = 300
  metric {
    name = "bytes_out"
    value_threshold = 4096
    title = "Bytes Sent"
  }
  metric {
    name = "bytes_in"
    value_threshold = 4096
    title = "Bytes Received"
  }
  metric {
    name = "pkts_in"
    value_threshold = 256
    title = "Packets Received"
  }
  metric {
    name = "pkts_out"
    value_threshold = 256
    title = "Packets Sent"
  }
}

/* Different from the 2.5.x default since the old config made no sense */
collection_group {
  collect_every = 180
  time_threshold = 180
  metric {
    name = "disk_total"
    value_threshold = 1.0
    title = "Total Disk Space"
  }
}

collection_group {
  collect_every = 40
  time_threshold = 180
  metric {
    name = "disk_free"
    value_threshold = 1.0
    title = "Disk Space Available"
  }
  metric {
    name = "part_max_used"
    value_threshold = 1.0
    title = "Maximum Disk Space Used"
  }
}

include ("/etc/ganglia/conf.d/*.conf")
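
Because the tcp_accept_channel above is bound to 127.0.0.1, the cluster XML can only be read from the management node itself. As a quick sanity check (a sketch; any TCP client, e.g. telnet, works equally well), dump the first lines of the XML locally:

nc 127.0.0.1 8649 | head -n 20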

Other Node

/* This configuration is as close to 2.5.x default behavior as possible
The values closely match ./gmond/metric.h definitions in 2.5.x */
globals {
  daemonize = yes
  setuid = yes
  user = ganglia
  debug_level = 0
  max_udp_msg_len = 1472
  mute = no
  deaf = no
  allow_extra_data = yes
  host_dmax = 86400 /*secs. Expires (removes from web interface) hosts in 1 day */
  host_tmax = 20 /*secs */
  cleanup_threshold = 300 /*secs */
  gexec = no
  # By default gmond will use reverse DNS resolution when displaying your hostname
  # Uncommenting the following value will override that behavior.
  # override_hostname = "mywebserver.domain.com"
  # If you are not using multicast this value should be set to something other than 0.
  # Otherwise, if you restart the aggregator gmond you will get empty graphs. 60 seconds is reasonable.
  send_metadata_interval = 60 /*secs */

}

/*
 * The cluster attributes specified will be used as part of the <CLUSTER>
 * tag that will wrap all hosts collected by this instance.
 */
cluster {
  name = "OpenHPC"
  owner = "unspecified"
  latlong = "unspecified"
  url = "unspecified"
}

/* The host section describes attributes of the host, like the location */
host {
  location = "unspecified"
}

/* Feel free to specify as many udp_send_channels as you like.  Gmond
   used to only support having a single channel */
udp_send_channel {
  # bind_hostname = yes # Highly recommended, soon to be default.
                        # This option tells gmond to use a source address
                        # that resolves to the machine's hostname.  Without
                        # this, the metrics may appear to come from any
                        # interface and the DNS names associated with
                        # those IPs will be used to create the RRDs.
  # management node's hostname
  host = mgt
  port = 8649
  ttl = 10
}

/* You can specify as many udp_recv_channels as you like as well. */
udp_recv_channel {
  port = 8649
  retry_bind = true
  # Size of the UDP buffer. If you are handling lots of metrics you really
  # should bump it up to e.g. 10MB or even higher.
  #buffer = 10485760
}

/* You can specify as many tcp_accept_channels as you like to share
   an xml description of the state of the cluster */
tcp_accept_channel {
  # Bind to the loopback interface so the XML is served only locally
  bind = 127.0.0.1
  port = 8649
  # If you want to gzip XML output
  gzip_output = no
}

/* Channel to receive sFlow datagrams */
# udp_recv_channel {
#   port = 6343
# }

/* Optional sFlow settings */
# sflow {
#  udp_port = 6343
#  accept_vm_metrics = yes
#  accept_jvm_metrics = yes
#  multiple_jvm_instances = no
#  accept_http_metrics = yes
#  multiple_http_instances = no
#  accept_memcache_metrics = yes
#  multiple_memcache_instances = no
# }

/* Each metrics module that is referenced by gmond must be specified and
   loaded. If the module has been statically linked with gmond, it does
   not require a load path. However all dynamically loadable modules must
   include a load path. */
modules {
  module {
    name = "core_metrics"
  }
  module {
    name = "cpu_module"
    path = "modcpu.so"
  }
  module {
    name = "disk_module"
    path = "moddisk.so"
  }
  module {
    name = "load_module"
    path = "modload.so"
  }
  module {
    name = "mem_module"
    path = "modmem.so"
  }
  module {
    name = "net_module"
    path = "modnet.so"
  }
  module {
    name = "proc_module"
    path = "modproc.so"
  }
  module {
    name = "sys_module"
    path = "modsys.so"
  }
}

/* The old internal 2.5.x metric array has been replaced by the following
   collection_group directives.  What follows is the default behavior for
   collecting and sending metrics that is as close to 2.5.x behavior as
   possible. */

/* This collection group will cause a heartbeat (or beacon) to be sent every
   20 seconds.  In the heartbeat is the GMOND_STARTED data which expresses
   the age of the running gmond. */
collection_group {
  collect_once = yes
  time_threshold = 20
  metric {
    name = "heartbeat"
  }
}

/* This collection group will send general info about this host */
collection_group {
  collect_every = 60
  time_threshold = 60
  metric {
    name = "cpu_num"
    title = "CPU Count"
  }
  metric {
    name = "cpu_speed"
    title = "CPU Speed"
  }
  metric {
    name = "mem_total"
    title = "Memory Total"
  }
  metric {
    name = "swap_total"
    title = "Swap Space Total"
  }
  metric {
    name = "boottime"
    title = "Last Boot Time"
  }
  metric {
    name = "machine_type"
    title = "Machine Type"
  }
  metric {
    name = "os_name"
    title = "Operating System"
  }
  metric {
    name = "os_release"
    title = "Operating System Release"
  }
  metric {
    name = "location"
    title = "Location"
  }
}

/* This collection group will send the status of gexecd for this host
   every 300 secs.*/
/* Unlike 2.5.x the default behavior is to report gexecd OFF. */
collection_group {
  collect_once = yes
  time_threshold = 300
  metric {
    name = "gexec"
    title = "Gexec Status"
  }
}

/* This collection group will collect the CPU status info every 20 secs.
   The time threshold is set to 90 seconds.  In honesty, this
   time_threshold could be set significantly higher to reduce
   unnecessary network chatter. */
collection_group {
  collect_every = 20
  time_threshold = 90
  /* CPU status */
  metric {
    name = "cpu_user"
    value_threshold = "1.0"
    title = "CPU User"
  }
  metric {
    name = "cpu_system"
    value_threshold = "1.0"
    title = "CPU System"
  }
  metric {
    name = "cpu_idle"
    value_threshold = "5.0"
    title = "CPU Idle"
  }
  metric {
    name = "cpu_nice"
    value_threshold = "1.0"
    title = "CPU Nice"
  }
  metric {
    name = "cpu_aidle"
    value_threshold = "5.0"
    title = "CPU aidle"
  }
  metric {
    name = "cpu_wio"
    value_threshold = "1.0"
    title = "CPU wio"
  }
  metric {
    name = "cpu_steal"
    value_threshold = "1.0"
    title = "CPU steal"
  }
  /* The next two metrics are optional if you want more detail...
     ... since they are accounted for in cpu_system.
  metric {
    name = "cpu_intr"
    value_threshold = "1.0"
    title = "CPU intr"
  }
  metric {
    name = "cpu_sintr"
    value_threshold = "1.0"
    title = "CPU sintr"
  }
  */
}

collection_group {
  collect_every = 20
  time_threshold = 90
  /* Load Averages */
  metric {
    name = "load_one"
    value_threshold = "1.0"
    title = "One Minute Load Average"
  }
  metric {
    name = "load_five"
    value_threshold = "1.0"
    title = "Five Minute Load Average"
  }
  metric {
    name = "load_fifteen"
    value_threshold = "1.0"
    title = "Fifteen Minute Load Average"
  }
}

/* This group collects the number of running and total processes */
collection_group {
  collect_every = 80
  time_threshold = 180
  metric {
    name = "proc_run"
    value_threshold = "1.0"
    title = "Total Running Processes"
  }
  metric {
    name = "proc_total"
    value_threshold = "1.0"
    title = "Total Processes"
  }
}

/* This collection group grabs the volatile memory metrics every 40 secs and
   sends them at least every 180 secs.  This time_threshold can be increased
   significantly to reduce unneeded network traffic. */
collection_group {
  collect_every = 40
  time_threshold = 180
  metric {
    name = "mem_free"
    value_threshold = "1024.0"
    title = "Free Memory"
  }
  metric {
    name = "mem_shared"
    value_threshold = "1024.0"
    title = "Shared Memory"
  }
  metric {
    name = "mem_buffers"
    value_threshold = "1024.0"
    title = "Memory Buffers"
  }
  metric {
    name = "mem_cached"
    value_threshold = "1024.0"
    title = "Cached Memory"
  }
  metric {
    name = "swap_free"
    value_threshold = "1024.0"
    title = "Free Swap Space"
  }
}

collection_group {
  collect_every = 40
  time_threshold = 300
  metric {
    name = "bytes_out"
    value_threshold = 4096
    title = "Bytes Sent"
  }
  metric {
    name = "bytes_in"
    value_threshold = 4096
    title = "Bytes Received"
  }
  metric {
    name = "pkts_in"
    value_threshold = 256
    title = "Packets Received"
  }
  metric {
    name = "pkts_out"
    value_threshold = 256
    title = "Packets Sent"
  }
}

/* Different from the 2.5.x default since the old config made no sense */
collection_group {
  collect_every = 180
  time_threshold = 180
  metric {
    name = "disk_total"
    value_threshold = 1.0
    title = "Total Disk Space"
  }
}

collection_group {
  collect_every = 40
  time_threshold = 180
  metric {
    name = "disk_free"
    value_threshold = 1.0
    title = "Disk Space Available"
  }
  metric {
    name = "part_max_used"
    value_threshold = 1.0
    title = "Maximum Disk Space Used"
  }
}

include ("/etc/ganglia/conf.d/*.conf")
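
After editing gmond.conf on a node, restart the daemon for the change to take effect (the systemd unit name gmond is typical of the ganglia-gmond package, but may vary by distribution):

systemctl restart gmond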

slurm

slurm.conf

#
# Example slurm.conf file. Please run configurator.html
# (in doc/html) to build a configuration file customized
# for your environment.
#
#
# slurm.conf file generated by configurator.html.
#
# See the slurm.conf man page for more information.
#
ClusterName=mycluster
ControlMachine=c031
GresTypes=gpu,gpu_mem
# ControlAddr=
# BackupController=
# BackupAddr=
#
SlurmUser=slurm
# SlurmdUser=root
SlurmctldPort=6817
SlurmdPort=6818
AuthType=auth/munge
# JobCredentialPrivateKey=
# JobCredentialPublicCertificate=
StateSaveLocation=/var/spool/slurm/ctld
SlurmdSpoolDir=/var/spool/slurm/d
SwitchType=switch/none
MpiDefault=none
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmdPidFile=/var/run/slurmd.pid
# ProctrackType=proctrack/pgid
# PluginDir=
# FirstJobId=
ReturnToService=2
# MaxJobCount=
# PlugStackConfig=
# PropagatePrioProcess=
# PropagateResourceLimits=
# PropagateResourceLimitsExcept=
# Prolog=
# Epilog=
# SrunProlog=
# SrunEpilog=
# TaskProlog=
# TaskEpilog=
ProctrackType=proctrack/pgid
TaskPlugin=task/affinity
# TrackWCKey=no
# TreeWidth=50
# TmpFS=
# UsePAM=
#
# TIMERS
SlurmctldTimeout=300
SlurmdTimeout=300
InactiveLimit=0
MinJobAge=300
KillWait=30
Waittime=0
#
# SCHEDULING
SchedulerType=sched/backfill
# SchedulerAuth=
# SchedulerPort=
# SchedulerRootFilter=
SelectType=select/cons_res
SelectTypeParameters=CR_CPU
FastSchedule=1
# PriorityType=priority/multifactor
# PriorityDecayHalfLife=14-0
# PriorityUsageResetPeriod=14-0
# PriorityWeightFairshare=100000
# PriorityWeightAge=1000
# PriorityWeightPartition=10000
# PriorityWeightJobSize=1000
# PriorityMaxAge=1-0
#
# LOGGING
SlurmctldDebug=3
SlurmctldLogFile=/var/log/slurmctld.log
SlurmdDebug=3
SlurmdLogFile=/var/log/slurmd.log
JobCompType=jobcomp/filetxt
JobCompLoc=/var/lib/slurmjobcom
#
# ACCOUNTING
JobAcctGatherType=jobacct_gather/linux
JobAcctGatherFrequency=30
#
# AccountingStorageType=accounting_storage/filetxt
# AccountingStorageHost=
# AccountingStorageLoc=
# AccountingStoragePass=
# AccountingStorageUser=
#
# COMPUTE NODES
# DefMemPerCPU=100
EnforcePartLimits=ALL
NodeName=c031 Gres=gpu:4,gpu_mem:10000 CPUs=28 RealMemory=200000 State=UNKNOWN
NodeName=c032 Gres=gpu:4,gpu_mem:10000 CPUs=28 RealMemory=200000 State=UNKNOWN
PartitionName=compute Nodes=c0[31-32] Default=YES  MaxTime=INFINITE State=UP
PartitionName=compute1 Nodes=c0[31-32] Default=NO  MaxTime=INFINITE State=UP

This file needs to be modified as follows:

  1. Cluster Name

ClusterName=mycluster
  2. Management Node Name

ControlMachine=c031
  3. GPU Scheduling

This entry is used when the cluster contains GPU nodes. If there are no GPU nodes, delete this entry.

GresTypes=gpu,gpu_mem
  4. Cluster Node Definitions

NodeName=c031 Gres=gpu:4,gpu_mem:10000 CPUs=28 RealMemory=200000 State=UNKNOWN
NodeName=c032 Gres=gpu:4,gpu_mem:10000 CPUs=28 RealMemory=200000 State=UNKNOWN
NodeName

The name of the node

Gres

Generic resources on the node: gpu is the number of GPUs and gpu_mem is the GPU memory size (if this is not a GPU node, delete the Gres content; see the non-GPU example below)

CPUs

The number of CPUs in the node

RealMemory

The node memory size (unit: MB)
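
For example, a node without GPUs would be defined with the Gres attribute removed entirely (the node name c033 is hypothetical):

NodeName=c033 CPUs=28 RealMemory=200000 State=UNKNOWN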

  5. Partition Definitions

The user selects a partition when submitting a job; if none is specified, the default partition is used (see the sbatch example after the definitions below).

PartitionName=compute Nodes=c0[31-32] Default=YES  MaxTime=INFINITE State=UP
PartitionName=compute1 Nodes=c0[31-32] Default=NO  MaxTime=INFINITE State=UP
PartitionName

The name of the partition

Nodes

The nodes in the partition

Default

Whether this partition is the default partition
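
For example, a job can be sent to a specific partition at submission time (the job script name job.sh is hypothetical); without the --partition option it is routed to the default partition:

sbatch --partition=compute1 job.sh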

  6. EnforcePartLimits Definition

With the configuration below, a job whose resource request exceeds what the partition can provide is rejected with an error at submission time; without it, the job would remain pending in the queue (see the example below).

EnforcePartLimits=ALL
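
For example, a request that exceeds what any node in the partition offers, such as asking for more memory than the 200000 MB RealMemory configured above, should fail immediately at submission rather than sit pending (the job script name job.sh is hypothetical):

sbatch --partition=compute --mem=300000 job.sh
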
  7. For more details on how to configure slurm.conf, please refer to the official Slurm documentation.

gres.conf

Note

  • Copy this file to the /etc/slurm directory on GPU nodes; non-GPU nodes do not need this file.

  • The file describes the GPUs and GPU memory present on a GPU node. The content of gres.conf may vary from node to node.

  • The Count attribute of the gpu_mem setting gives the amount of GPU memory per GPU (unit: MB).

Name=gpu File=/dev/nvidia[0-3]
Name=gpu_mem Count=10000
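
With these GRES definitions and the GresTypes line in slurm.conf in place, jobs can request GPUs and GPU memory explicitly (a sketch; the job script name job.sh is hypothetical):

sbatch --partition=compute --gres=gpu:2,gpu_mem:5000 job.sh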