• add new plugin
    
    slurm.conf:
    JobSubmitPlugins=job_submit/require_timelimit
    
    file in: 
    PluginDir=/usr/lib64/slurm
    
     job_submit_require_timelimit.so
    
    
    
  • Reason=Node unexpectedly rebooted
    after reboot node stays down: 
      Reason=Node unexpectedly rebooted
    
    scontrol update nodename=trestles-9-22 state=idle reason=""
    
    slurm.conf:
    
    ReturnToService=2
    
    
  • topology: switch configuration
    
    [2014-12-17T16:25:55.480] TOPOLOGY: warning -- no switch can reach all nodes through its descendants.
    
    Do not use route/topology
    
    
    Three-dimension Topology
    Listing the leaf switches with their nodes 
    
    SwitchName=s1 Nodes=comet-01-[01-72] LinkSpeed=58720256
    SwitchName=s2 Nodes=comet-02-[01-72] LinkSpeed=58720256
    
    no job will span leaf switches without a common parent.
    
    
    but:
    
    SwitchName=s1-01 Level=0 LinkSpeed=1 Nodes=comet-01-[54,56-72] 
    SwitchName=s1-02 Level=0 LinkSpeed=1 Nodes=comet-01-[37-53,55] 
    SwitchName=s1-03 Level=0 LinkSpeed=1 Nodes=comet-01-[18,20-36] 
    SwitchName=s1-04 Level=0 LinkSpeed=1 Nodes=comet-01-[01-17,19] 
    SwitchName=s1 Level=1 LinkSpeed=1 Nodes=comet-01-[01-72] Switches=s1-[01-04] 
    
    
    
    
  • layout
    [2014-12-17T16:25:55.335] layouts: no layout to initialize
    [2014-12-17T16:26:39.774] layouts: loading entities/relations information
    
    
    
    
  • node name configuration
    
    after error: find_node_record: lookup failure for comet-01-01
    
    
    # sinfo
    PARTITION AVAIL  TIMELIMIT  NODES  STATE NODELIST
    compute*     up   infinite      1   idle hpc-0-5
    compute*     up   infinite   1320   down comet-01-[10-64],comet-02-[10-64],comet-03-[10-64].....
    compute*     up   infinite    482    unk comet-01-[01-09,65-72],comet-02-[01-09,65-72].....
    
    
    remove all files in /var/slurm/slurm.state
    
    PARTITION AVAIL  TIMELIMIT  NODES  STATE NODELIST
    compute*     up   infinite   1802    unk comet-01-[01-72],comet-02-[01-72].....
    gpu          up   infinite    146    unk comet-28-[01-72],comet-29-[01-72]
    bigmem       up   infinite     40    unk comet-30-[01-20],comet-31-[01-20]
    
    
    
  • sacct
    default start time is "today 00:00:00"
    
    to display previous records use the -S/-E flags
    
    sacct man pages:   -S, --starttime
    Valid time formats are...
    
                     HH:MM[:SS] [AM|PM]
                     MMDD[YY] or MM/DD[/YY] or MM.DD[.YY]
                     MM/DD[/YY]-HH:MM[:SS]
                     YYYY-MM-DD[THH:MM[:SS]]
                  
                  
                  
    sacct -S 12/24  --allocations -o "JobId,Start,End,State,User,Group,Account,JobName,Partition,\
        Submit,Eligible,ReqMem,NodeList,NNodes,TimeLimit,DerivedExitCode,\
        ExitCode,CPUTime,MaxPages,MaxVMSize,Elapsed"
                  
    
    
    jobs displayed start on 11/21
           
           JobID               Start                 End      State      User     Group    Account    JobName  
    ------------ ------------------- ------------------- ---------- --------- --------- ---------- ---------- 
    2            2014-11-21T14:42:53 2014-11-21T16:43:07    TIMEOUT      jane     testg       test       test    
    3            2014-11-21T14:42:57 2014-11-21T16:43:07    TIMEOUT      jane     testg       test       test    
    4            2014-11-21T14:42:57 2014-11-21T15:41:39  COMPLETED      jane     testg       test       test    
                  
                  
    
    only if the starttime is specified in the very same format as the
    records does sacct show the correct lines:
    
    sacct -S 2014-11-24   --allocations -o
    "JobId,Start,End,State,User,Group,Account,JobName,Partition,\
        Submit,Eligible,ReqMem,NodeList,NNodes,TimeLimit,DerivedExitCode,\
        ExitCode,CPUTime,MaxPages,MaxVMSize,Elapsed"
        
           
           JobID               Start                 End      State      User     Group    Account    JobName  
    ------------ ------------------- ------------------- ---------- --------- --------- ---------- ---------- 
    10           2014-11-24T16:36:21 2014-11-24T17:35:02  COMPLETED      jane     testg       test       test    
    11           2014-11-24T16:36:21 2014-11-24T18:36:21    TIMEOUT      jane     testg       test       test    
    ....
    
    sacct per job: 
    
    $ sacct -S 2015-10-05 -j 1097598_267 
           JobID    JobName  Partition    Account  AllocCPUS      State ExitCode 
    ------------ ---------- ---------- ---------- ---------- ---------- -------- 
    1097598_267  Frenkel-L+    compute     mia152         24 CANCELLED+      0:0 
    1097598_267+      batch                mia152         24  CANCELLED     0:15 
    
    
    sacct per job array:
    
    $ sacct -S 2015-10-05 -j 1097598
    
           JobID    JobName  Partition    Account  AllocCPUS      State ExitCode 
    ------------ ---------- ---------- ---------- ---------- ---------- -------- 
    1097598_999  Frenkel-L+    compute     mia152         24  COMPLETED      0:0 
    1097598_999+      batch                mia152         24  COMPLETED      0:0 
    1097598_0    Frenkel-L+    compute     mia152         24    TIMEOUT      1:0 
    1097598_0.b+      batch                mia152         24  CANCELLED     0:15 
    1097598_1    Frenkel-L+    compute     mia152         24  COMPLETED      0:0 
    
    
    
    
  • slurm scripts on node:
    /var/spool/slurmd/job00093/slurm_script
    
    
  • add 411 files to "Service Node"
    check for group in the 411 node configuration:
    
    # rocks report host config411 hpc-0-5
    file name="/etc/411.conf" perms="0600" owner="root:root"
    master url="http://10.1.10.1:372/411.d/"/               
    appliance: service-node                        
    group: Service_Node
    
    
    add to 411:
    
    mkdir /var/411/groups/Service_Node
    
    
    make groups
    ## Service_Node Group
    
    all: /etc/411.d/Service_Node/etc.slurm.slurmdbd..conf 
    
    /etc/411.d/Service_Node/etc.slurm.slurmdbd..conf:: /var/411/groups/Service_Node/etc/slurm/slurmdbd.conf
            /opt/rocks/sbin/411put --group=Service_Node --chroot=/var/411/groups/Service_Node $?
    
    
    
    make
    
    generates the 411 Wrote: /etc/411.d/Service_Node/etc.slurm.slurmdbd..conf
    
    
    
    
  • account funds
    
    Account - 'comet':Description='comet':Organization='sdsc':Fairshare=1:GrpCPUMins=600
    sacctmgr: Parent - 'comet'
    sacctmgr: User - 'hocks':DefaultAccount='comet':Fairshare=1:QOS='normal'
    sacctmgr: User - 'nicki':DefaultAccount='comet':Fairshare=1:QOS='normal'
    sacctmgr: User - 'tanner':DefaultAccount='comet':Fairshare=1:QOS='normal'
    
    
    show:
    
    # sbank balance statement
    User           Usage |        Account     Usage | Account Limit Available (CPU hrs)
    ---------- --------- + -------------- --------- + ------------- ---------
    
    hocks             38 |          COMET        53 |            10       -43
    nicki              7 |          COMET        53 |            10       -43
    tanner             8 |          COMET        53 |            10       -43
    
    
    
    BUT:
    
    sacctmgr: Account - 'comet':Description='comet':Organization='sdsc':Fairshare=1:GrpCPUMins=600
    sacctmgr: Parent - 'comet'
    sacctmgr: User - 'hocks':DefaultAccount='comet':Fairshare=999:GrpCPUMins=3600:QOS='normal'
    sacctmgr: User - 'nicki':DefaultAccount='comet':Fairshare=1:GrpCPUMins=600
    sacctmgr: User - 'tanner':DefaultAccount='comet':Fairshare=1:GrpCPUMins=3600
    
    
    show:
    
    # sbank balance statement
    User           Usage |        Account     Usage | Account Limit Available (CPU hrs)
    ---------- --------- + -------------- --------- + ------------- ---------
    
    hocks             38 |          COMET        53 |            60         7
    nicki              7 |          COMET        53 |            60         7
    tanner             8 |          COMET        53 |            60         7
    
    
    DO NOT use GrpCPUMins for user!!!!! 
    
    
    
  • sacct: error: Problem talking to the database: Connection refused
    
    /etc/slurm.conf:
    
    change
    AccountingStorageHost=127.0.0.1
    
    to: 
    AccountingStorageHost=hpc-0-5
    
    
    
  • slurm error: authentication: credential expired
    
    munge credential expired ( in munge log as well as slurmctld.log)
    
    --: synchronize clock on node
    
    
    
  • job rejected: invalid feature
    
    
    #SBATCH   lines anywhere in the script are interpreted!!!!!
    
    #!/bin/bash -l
    ....
    #SBATCH -t 1:00:00
    #SBATCH -A comet
    
    
    code
    
    exit
    
    #SBATCH --cpus-per-task=1
    #SBATCH --constraint=gtx680
    
    
    
    
  • job exit code 256 (2:0)
    # sjobexitmod -l 28   
           JobID    Account   NNodes        NodeList      State ExitCode DerivedExitCode        Comment 
    ------------ ---------- -------- --------------- ---------- -------- --------------- -------------- 
    28               (null)        1         hpc-0-6     FAILED      2:0             0:0                
    6                 comet        1         hpc-0-5     FAILED      1:0             0:0                
    
    
    
    sched (slurmctl): job_complete for JobId=28 successful, exit code=512
    slurm (slurmd)  : sending REQUEST_COMPLETE_BATCH_SCRIPT, error:0 status 256
    sched (slurmctl): job_complete for JobId=6 successful, exit code=256
    
    
    Job submitted from frontend: 
    
    (from srun command: ) 
    slurmstepd: couldn't chdir to `/state/partition1/home/hocks': No such file or directory: going to /tmp instead
    
    on node: 
    Could not open stdout file /state/partition1/home/hocks/slurm5.out: No such file or directory
    
    ----: cd /home/hocks and submit job from there
    
    
  • job exit code 256 (1:0)
    /var/log/slurm/slurmctld.log
    [2014-10-09T12:14:44.417] sched: Allocate JobId=23 NodeList=hpc-0-[4-5] #CPUs=2
    [2014-10-09T12:14:44.446] completing job 23 status 256
    [2014-10-09T12:14:44.600] DEBUG: Dump job_resources: nhosts 2 cb 0,8
    [2014-10-09T12:14:44.601] sched: job_complete for JobId=23 successful, exit code=256
    
    
    # sjobexitmod -l 23
           JobID    Account   NNodes        NodeList      State ExitCode DerivedExitCode        Comment 
    ------------ ---------- -------- --------------- ---------- -------- --------------- -------------- 
    23                comet        2     hpc-0-[4-5]     FAILED      1:0             0:0                
    
    # sacct -X -j 23 -o JobID,NNodes,State,ExitCode,DerivedExitcode,Comment 
    
           JobID   NNodes      State ExitCode DerivedExitCode        Comment 
    ------------ -------- ---------- -------- --------------- -------------- 
    23                  2     FAILED      1:0             0:0                
    
    Job submitted from hpcdev !!!!!!! 
    
    
    
  • job exit codes
    
    POSIX compliant:
    Exit codes 129-255 represent jobs terminated by Unix signals.
    % perl -le 'print 271 & 127'
    
    
    137    0:9   exit code 0, signal SIGKILL (-9)
    139    0:11  exit code 0, SEG FAULT (11) 
    256    1:0   submit from wrong machine
    512    2:0   home filesystem not found
    
    
    
  • modify job exit information
    
    
    Modify Comment: sjobexitmod: 
    
    > sjobexitmod -e 49 -r "out of memory" 23
    
     You are not running a supported accounting_storage plugin
    (accounting_storage/filetxt).
    Only 'accounting_storage/slurmdbd' and 'accounting_storage/mysql' are supported.
    
    
    
  • client slurmd failure: Zero Bytes were transmitted
    
    [2014-10-06T15:43:48.907] Gathering cpu frequency information for 8 cpus
    [2014-10-06T15:43:48.908] slurmd version 14.03.7 started
    [2014-10-06T15:43:48.909] slurmd started on Mon, 06 Oct 2014 15:43:48 -0700
    [2014-10-06T15:43:48.909] CPUs=8 Boards=1 Sockets=2 Cores=4 Threads=1 Memory=24151 TmpDisk=39426 Uptime=2018
    [2014-10-06T15:43:48.921] error: slurm_receive_msg: Zero Bytes were transmitted or received
    
    
    munge key permission or owner:
    
    /etc/munge/
    399   399  1024 Oct  6 15:10 munge.key
    
    
    restart munge and slurm: 
    
    service munge restart
    service slurm restart
    
    
  • topology
    scontrol: error: Parsing error at unrecognized key: SwitchName
    
    plugin: /usr/lib64/slurm/topology_tree.so
    
    topology.conf is a separate file, not to be included in slurm.conf
    
    
  • slurm restart
    [root@hpc-0-4 ~]# service slurm stop   
    stopping slurmd:                                           [  OK  ]
    slurmd is stopped
    [root@hpc-0-4 ~]# ps -ef|grep slurm
    root      5922     1  0 13:18 ?        00:00:00 slurmstepd: [7]     
    hocks     5933  5922  0 13:18 ?        00:00:00 /bin/bash -l /var/spool/slurmd/job00007/slurm_script
    root      6005     1  0 13:18 ?        00:00:00 slurmstepd: [9]     
    hocks     6009  6005  0 13:18 ?        00:00:00 /bin/bash -l /var/spool/slurmd/job00009/slurm_script
    root      6026     1  0 13:18 ?        00:00:00 slurmstepd: [10]    
    hocks     6035  6026  0 13:18 ?        00:00:00 /bin/bash -l /var/spool/slurmd/job00010/slurm_script
    root      6056     1  0 13:18 ?        00:00:00 slurmstepd: [11]    
    hocks     6070  6056  0 13:18 ?        00:00:00 /bin/bash -l /var/spool/slurmd/job00011/slurm_script
    root      6429  6329  0 13:41 pts/0    00:00:00 grep slurm
    [root@hpc-0-4 ~]# service slurm start
    starting slurmd:                                           [  OK  ]
    [root@hpc-0-4 ~]# ps -ef|grep slu
    root      5922     1  0 13:18 ?        00:00:00 slurmstepd: [7]     
    hocks     5933  5922  0 13:18 ?        00:00:00 /bin/bash -l /var/spool/slurmd/job00007/slurm_script
    root      6005     1  0 13:18 ?        00:00:00 slurmstepd: [9]     
    hocks     6009  6005  0 13:18 ?        00:00:00 /bin/bash -l /var/spool/slurmd/job00009/slurm_script
    root      6026     1  0 13:18 ?        00:00:00 slurmstepd: [10]    
    hocks     6035  6026  0 13:18 ?        00:00:00 /bin/bash -l /var/spool/slurmd/job00010/slurm_script
    root      6056     1  0 13:18 ?        00:00:00 slurmstepd: [11]    
    hocks     6070  6056  0 13:18 ?        00:00:00 /bin/bash -l /var/spool/slurmd/job00011/slurm_script
    root      6445     1  0 13:41 ?        00:00:00 /usr/sbin/slurmd
    root      6453  6329  0 13:41 pts/0    00:00:00 grep slu
    
    
  • slurm command : Zero Bytes were transmitted or received
    # squeue
    squeue: error: slurm_receive_msg: Zero Bytes were transmitted or received
    slurm_load_jobs error: Zero Bytes were transmitted or received
    
    munge not running, or munge.key file updated without a restart
    
    # service munge restart
    
    
    May be needed on server machine as well. 
    
    
  • Unable to contact slurm controller
    
    $ sbatch sbatch 
    sbatch: error: Batch job submission failed: Unable to contact slurm controller (connect failure)
    
    check: scontrol show config | grep ControlAddr
    
    
    
  • Protocol authentication error
    [2014-10-01T12:15:03.841] error: Munge decode failed: Invalid credential
    [2014-10-01T12:15:03.842] error: authentication: Invalid credential 
    [2014-10-01T12:15:03.842] error: slurm_receive_msg: Protocol authentication error
    
    
    munge-devel missing:
    
    yum install munge-devel
    
    
  • slurm bash update
    A) If you update a login node before compute nodes jobs will fail as
    John describes.
    
    B) If you update a compute node when there are jobs queued under the
    previous bash then they will fail when they run there (also cannot find
    modules, even though a prologue of ours sets BASH_ENV to force the env
    vars to get set).
    
    
    Our way to (hopefully safely) upgrade our x86-64 clusters was:
    
    0) Note that our slurmctld runs on the cluster management node which is
    separate to the login nodes and not accessible to users.
    
    1) Kick all the users off the login nodes, update bash, reboot them
    (ours come back with nologin enabled to stop users getting back on
    before we're ready).
    
    2) Set all partitions down to stop new jobs starting
    
    3) Move all compute nodes to an "old" partition
    
    4) Move all queued (pending) jobs to the "old" partition
    
    5) Update bash on any idle nodes and move them back to our "main"
    (default) partition
    
    6) Set an AllowGroups on the "old" partition so users can't submit jobs
    to it by accident.
    
    7) Let users back onto the login nodes.
    
    8) Set partitions back to "up" to start jobs going again.
    
    
    
    
  • Requested node configuration is not available
    sbatch: error: Batch job submission failed: Requested node configuration is not available
    
    check node configuration:
    
    $ scontrol show nodes
    CPUAlloc=0 CPUErr=0 CPUTot=8 CPULoad=0.00 Features=batch
       Gres=(null)
       NodeAddr=10.1.1.251 NodeHostName=hpc-0-6 Version=14.03
       OS=Linux RealMemory=1 AllocMem=0 Sockets=8 Boards=1
    
                ^^^^^^^^^^^^^^^^^^^^^^^^^    no memory!!!
    
    
    compute log shows:
    [2014-10-01T12:25:54.157] Node configuration differs from hardware: CPUs=8:8(hw) Boards=1:1(hw) SocketsPerBoard=8:2(hw) CoresPerSocket=1:4(hw) ThreadsPerCore=1:1(hw)
    
    
    set node configuration in slurm.conf (nodenames.conf): 
    CPUs=8 SocketsPerBoard=2 CoresPerSocket=4 ThreadsPerCore=1 RealMemory=24151 TmpDisk=39426
    
    
    
    
  • Requested node configuration is not available
    
    #SBATCH --nodes=1-1
    #SBATCH --ntasks=2
    #SBATCH --cpus-per-task=1
    
    
    [2014-09-23T14:58:03.833] Job 152 priority: 0.00 + 0.00 + 208.33 + 10.00 + 0.00 - 0 = 218.33
    [2014-09-23T14:58:03.834] cons_res: select_p_job_test: job 152 node_req 1 mode 1
    [2014-09-23T14:58:03.834] cons_res: select_p_job_test: min_n 1 max_n 1 req_n 1 avail_n 2
    [2014-09-23T14:58:03.834] node:hpc-0-4 cpus:8 c:4 s:2 t:1 mem:24151 a_mem:0 state:1
    [2014-09-23T14:58:03.834] gres/gpu: state for hpc-0-4
    [2014-09-23T14:58:03.834]   gres_cnt found:0 configured:0 avail:0 alloc:0
    [2014-09-23T14:58:03.834]   gres_bit_alloc:NULL
    [2014-09-23T14:58:03.834] node:hpc-0-5 cpus:8 c:4 s:2 t:1 mem:24151 a_mem:0 state:0
    [2014-09-23T14:58:03.834] gres/gpu: state for hpc-0-5
    [2014-09-23T14:58:03.834]   gres_cnt found:TBD configured:0 avail:0 alloc:0
    [2014-09-23T14:58:03.834]   gres_bit_alloc:NULL
    [2014-09-23T14:58:03.834] node:hpc-0-6 cpus:8 c:4 s:2 t:1 mem:24151 a_mem:0 state:1
    [2014-09-23T14:58:03.834] gres/gpu: state for hpc-0-6
    [2014-09-23T14:58:03.834]   gres_cnt found:0 configured:0 avail:0 alloc:0
    [2014-09-23T14:58:03.834]   gres_bit_alloc:NULL
    [2014-09-23T14:58:03.834] part:CLUSTER rows:1 pri:1 
    [2014-09-23T14:58:03.834] part:compute rows:4 pri:1 
    [2014-09-23T14:58:03.834]   row0: num_jobs 2: bitmap: 0-7,16-23
    [2014-09-23T14:58:03.834]   row1: num_jobs 0: bitmap: [no row_bitmap]
    [2014-09-23T14:58:03.834]   row2: num_jobs 0: bitmap: [no row_bitmap]
    [2014-09-23T14:58:03.834]   row3: num_jobs 0: bitmap: [no row_bitmap]
    [2014-09-23T14:58:03.834] part:gpu rows:4 pri:1000 
    [2014-09-23T14:58:03.834] part:large rows:4 pri:1 
    [2014-09-23T14:58:03.834] cons_res: cr_job_test: evaluating job 152 on 2 nodes
    [2014-09-23T14:58:03.834] cons_res: _can_job_run_on_node: 8 cpus on hpc-0-4(1), mem 0/24151
    [2014-09-23T14:58:03.834] cons_res: _can_job_run_on_node: 8 cpus on hpc-0-6(1), mem 0/24151
    [2014-09-23T14:58:03.834] cons_res: eval_nodes:0 consec c=8 n=1 b=0 e=0 r=-1
    [2014-09-23T14:58:03.834] cons_res: eval_nodes:1 consec c=8 n=1 b=2 e=2 r=-1
    [2014-09-23T14:58:03.834] cons_res: cr_job_test: test 0 fail: insufficient resources
    
    
    works with 
    #SBATCH --ntasks-per-node=2
    
    
    
  • job pending
    with
    
    JobState=PENDING Reason=AssociationJobLimit
    
    no command to run the job. you can eventually change job priority with
    scontrol update job=JOBID priority=....
    
    
  • job_submit
    
    --
    -- Check for unlimited memory requests
    --
       if job_desc.pn_min_memory == 0 then
          log_info("slurm_job_submit: job from uid %d invalid memory request 
    MaxMemPerNode", job_desc.user_id)
          return 2044 -- signal ESLURM_INVALID_TASK_MEMORY
       end
    
    
  • scontrol update NodeName=hpc-0-5 State=RESUME
    
    node : State=IDLE*   no slurm daemon running
    
    
  • sbank sbatch flags not supported
    
    $ sbank submit --array=1-4 -J Array ./sleepme 86400
    flags:WARN getopt: unrecognized option '--array=1-4'
    getopt: invalid option -- 'J'
     -- 'Array' './sleepme' '86400'
    flags:FATAL unable to parse provided options with getopt.
    
    
  • sbank array jobs counted 1
    
    
    $ sbank submit -s sbatch
    log: Getting balance for hocks
    User           Usage |        Account     Usage | Account Limit Available (CPU hrs)
    ---------- --------- + -------------- --------- + ------------- ---------
    
    hocks *            4 |           TEST         4 |         3,600     3,596
    log: Checking script before submitting
    warn: no account specified in the script, using default: test
    Current balance      =      3,596
    Requested hours      =          1
    Expected balance     =      3,595
    log: sbatch'ing the script
    Submitted batch job 65
    
    
    
    
    
  • sbank no reservations
    [hocks@hpcdev-005 ~]$ sbank submit -s sbatch
    User           Usage |        Account     Usage | Account Limit Available (CPU hrs)
    ---------- --------- + -------------- --------- + ------------- ---------
    
    hocks *            6 |           TEST         6 |         3,600     3,594
    Current balance      =      3,594
    Requested hours      =          1
    Expected balance     =      3,593
    Submitted batch job 70
    
    [hocks@hpcdev-005 ~]$ sbank submit -s sbatch
    log: Getting balance for hocks
    User           Usage |        Account     Usage | Account Limit Available (CPU hrs)
    ---------- --------- + -------------- --------- + ------------- ---------
    
    hocks *            6 |           TEST         6 |         3,600     3,594
    Current balance      =      3,594
    Requested hours      =          1
    Expected balance     =      3,593
    Submitted batch job 71
    
    [hocks@hpcdev-005 ~]$ sbank submit -s sbatch
    User           Usage |        Account     Usage | Account Limit Available (CPU hrs)
    ---------- --------- + -------------- --------- + ------------- ---------
    
    hocks *            6 |           TEST         6 |         3,600     3,594
    Current balance      =      3,594
    Requested hours      =          1
    Expected balance     =      3,593
    Submitted batch job 82
    
    
    
  • scontrol show nodes
    
    
    8 jobs running but not listed
    
    NodeName=hpc-0-4 Arch=x86_64 CoresPerSocket=4
       CPUAlloc=8 CPUErr=0 CPUTot=8 CPULoad=0.93 Features=rack-0,8CPUs
       Gres=(null)
       NodeAddr=10.1.1.253 NodeHostName=hpc-0-4 Version=14.03
       OS=Linux RealMemory=24151 AllocMem=0 Sockets=2 Boards=1
       State=ALLOCATED ThreadsPerCore=1 TmpDisk=39426 Weight=20488104
       BootTime=2014-05-07T15:54:24 SlurmdStartTime=2014-05-07T16:49:43
       CurrentWatts=0 LowestJoules=0 ConsumedJoules=0
       ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
    
    hpc-0-4
        state = busy
        np = 8
        properties = rack-0,8CPUs
        ntype = cluster
        status = rectime=1399584257,state=busy,slurmstate=allocated,size=40372224kb:40372224kb,ncpus=8,boards=1,sockets=
    2,cores=4,threads=1,availmem=24151mb,opsys=linux,arch=x86_64
    
    
    
    $ squeue
                 JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
                  65_1    hpcdev     test    hocks  R       0:05      1 hpc-0-4
                  65_2    hpcdev     test    hocks  R       0:05      1 hpc-0-4
                  65_3    hpcdev     test    hocks  R       0:05      1 hpc-0-4
                  65_4    hpcdev     test    hocks  R       0:05      1 hpc-0-4
                  61_1    hpcdev    Array    hocks  R       3:27      1 hpc-0-4
                  61_2    hpcdev    Array    hocks  R       3:27      1 hpc-0-4
                  61_3    hpcdev    Array    hocks  R       3:27      1 hpc-0-4
                  61_4    hpcdev    Array    hocks  R       3:27      1 hpc-0-4
    
    
    
    slurm jobID:
    
    JobId=61 ArrayJobId=61 ArrayTaskId=1 Name=Array
    JobId=62 ArrayJobId=61 ArrayTaskId=2 Name=Array
    JobId=63 ArrayJobId=61 ArrayTaskId=3 Name=Array
    JobId=64 ArrayJobId=61 ArrayTaskId=4 Name=Array
    
    
    
  • Reason=batch job complete failure
    
    [2014-05-07T15:32:30.110] [38] pam_setcred: Failure setting user credentials
    [2014-05-07T15:32:30.110] [38] error in pam_setup
    [2014-05-07T15:32:30.110] [38] pam_close_session: Cannot make/remove an entry for the specified session
    [2014-05-07T15:32:30.116] [38] job_manager exiting abnormally, rc = 4020
    [2014-05-07T15:32:30.116] [38] sending REQUEST_COMPLETE_BATCH_SCRIPT, error:4020 status -1
    [2014-05-07T15:32:30.241] [38] done with job
    
    
    
  • sbatch: error: Batch job submission failed: More processors requested than permitted
    
    scontrol show partition
    
    PartitionName=batch
       ....
       Nodes=(null) TotalCPUs=0 TotalNodes=0
    
    
    
  • slurm configuration
    
    [2014-05-06T12:08:23.649] error: Node hpc-0-4 appears to have a different slurm.conf than the slurmctld.  This could
     cause issues with communication and functionality.  Please review both files and make sure they are the same.  If t
    his is expected ignore, and set DebugFlags=NO_CONF_HASH in your slurm.conf.
    
    
    
  • partition change
    
    changes in the partition table need a slurm restart! All running jobs will be killed
    
    
  • slurm roll rocks compile
    Makefile to avoid rocks dummy .spec file:
    
    
    # Don't re-import Rules-linux-centos.mk
    __RULES_LINUX_CENTOS_MK = yes
    
    REDHAT.ROOT = $(CURDIR)/../../
    
    -include $(ROCKSROOT)/etc/Rules.mk
    include Rules.mk
    
    ifeq ($(REDHAT.ROOT),)
    REDHAT.ROOT     = /usr/src/redhat
    endif
    ifeq ($(REDHAT.VAR),)
    REDHAT.VAR      = /var
    endif
    
    REDHAT.SOURCES  = $(REDHAT.ROOT)/SOURCES
    REDHAT.SPECS    = $(REDHAT.ROOT)/SPECS
    REDHAT.BUILD    = $(REDHAT.ROOT)/BUILD
    REDHAT.RPMS     = $(REDHAT.ROOT)/RPMS
    REDHAT.SRPMS    = $(REDHAT.ROOT)/SRPMS
    
    ifneq ($(RPM.BUILDROOT),)
    BUILDROOT = $(RPM.BUILDROOT)
    else
    BUILDROOT = $(shell pwd)/$(NAME).buildroot
    endif
    
    HOME    = $(CURDIR)
    
    .PHONY: $(HOME)/.rpmmacros
    $(HOME)/.rpmmacros:
            rm -f $@
            @echo "%_topdir $(REDHAT.ROOT)" > $@
            @echo "%_buildrootdir $(BUILDROOT)" >> $@
            @echo "%buildroot $(BUILDROOT)" >> $@
            @echo "%_var    $(REDHAT.VAR)" >> $@
            @echo "%debug_package   %{nil}" >> $@
    
    rpm: $(HOME)/.rpmmacros
            rpmbuild -ta $(NAME)-$(VERSION).$(TARBALL_POSTFIX)
    
    clean::
            rm -f $(HOME)/.rpmmacros
    
    
    
    version.mk
    
    NAME            = slurm
    VERSION         = 14.03.7
    TARBALL_POSTFIX = tar.bz2