#!/usr/local/bin/perl5
# bmonitor v2.12 for LSF 
# Christian Rossi (rossi@loria.fr)
# Centre Charles Hermite/LORIA (http://cch.loria.fr/LSF/bmonitor) - Nancy - France
# License for bmonitor : GNU General Public License
# (http://www.gnu.org/copyleft/gpl.html)
# v2.0  2000/03/20
# v2.01 2000/04/25
# v2.02 2000/04/27
# v2.1  2000/06/05
# v2.11 2000/07/07 new f_nb_proc
# v2.12 2000/09/05 modification of f_nb_proc

my $VERSION = 2.12;

system("clear");

# $first is "true" during the first pass only.  On that pass every job line
# is printed as soon as it is ready; on later passes the lines are collected
# in $p_lignes and shown in one go after a "clear", so the previous display
# stays on screen while bjobs -l and bhist are being run.
#
# The working variables are deliberately left as package globals: the f_*
# subroutines of this file read @bjobsl_out, @bhist_out, $nb_proc,
# $cpu_time_in_sec and $run_time_in_sec directly, and $first, $date and
# $old_job_count must survive from one pass to the next.
$first = "true";

while (1) {

    # counter for the not pending jobs of this pass
    $job_count = 0;

    # default : bjobs -u all ; otherwise the command line is handed to bjobs
    if (!@ARGV) {
        open(BJOBS, "bjobs -u all  2>&1|")
            or die "bmonitor: cannot run bjobs: $!\n";
    }
    else {
        open(BJOBS, "bjobs @ARGV 2>&1|")
            or die "bmonitor: cannot run bjobs: $!\n";
    }

    # if no job: show the message of bjobs and exit
    $first_ligne = <BJOBS>;
    if ($first_ligne =~ /No.*job found/) {
        print "$first_ligne";
        close(BJOBS);
        exit;
    }

    # other possible display
    #print "JOBID USER     STAT  QUEUE     PROC MEM  SWAP CPUTIME RUNTIME RUNLIM  EFF    HOG  EXECHOST  SUBMITTIME\n";

    $p_first_ligne  = "CPU  MEM  SWAP CPUTIM  JOB   USER    STAT    QUEUE    RUNTIM RUNLIM   EFF    HOG  EXECHOST  SUBMIT_TIME\n";
    $p_second_ligne = "-------------------------------------------------------------------------------------------------------\n";

    # first pass: print now; later passes: start the display buffer
    if ($first ne "true") {
        $p_lignes = "$p_first_ligne" . "$p_second_ligne";
    }
    else {
        print $p_first_ligne;
        print $p_second_ligne;
    }

    @lignes_jobs = <BJOBS>;
    close(BJOBS);

    # lines of bjobs
    foreach $ligne (@lignes_jobs) {

        # Only lines that begin with a job number describe a job.  Any
        # number of digits is accepted so that job IDs below 1000 are shown.
        next unless $ligne =~ /^[0-9]+/;

        ($jobid, $user, $stat, $queue, $from_host, $exec_host) = split(/ +/, $ligne);

        # Decrease the time to wait for each not pending job and print it.
        # This is done after the split so that the status tested is the one
        # of the current job.
        if (($first ne "true") && ($stat ne "PEND")) {
            $old_job_count = 0 if $old_job_count < 0;
            $p_old_job_count = sprintf("\rbmonitor $VERSION - %s - Update in %ss ", $date, $old_job_count);
            syswrite(STDOUT, "$p_old_job_count", 55);
            $old_job_count = $old_job_count - 1;
        }

        # detailed description of the job, parsed by the f_* subroutines
        open(BJOBSL, "bjobs -l $jobid |");
        @bjobsl_out = <BJOBSL>;
        close(BJOBSL);

        if ($stat ne "PEND") {

            # run bhist only if the job is not pending
            open(BHIST, "bhist $jobid|");
            @bhist_out = <BHIST>;
            close(BHIST);

            $job_count = $job_count + 1;

            # submit time: rest of the bjobs line after the first 67 characters
            $submit_time = $ligne;
            $submit_time =~ s/^.{67}//;
            chop($submit_time);

            # number of proc (must be known before f_efficasity is called)
            $nb_proc = f_nb_proc($jobid);

            # memory and swap
            ($mem, $swap) = f_mem_swap($jobid);

            # cpu time
            ($cpu_time_in_sec, $cpu_time_hour, $cpu_time_min) = f_cpu_time($jobid);

            # run time
            ($run_time_in_sec, $run_time_hour, $run_time_min) = f_run_time($jobid);

            # run limit
            ($run_limit_in_sec, $run_limit_hour, $run_limit_min) = f_run_limit($jobid);

            # eff (100 * cpu_time / (nb_proc * run_time))
            $efficasity = f_efficasity($jobid);

            # hog factor (100 * run_time / total_time)
            $hog_factor = f_hog_factor($jobid);
        }
        else {

            # submit time
            # NOTE(review): removing the last 13 characters here looks copied
            # from a job-name extraction and may blank the field for pending
            # jobs; kept unchanged, confirm against real bjobs output.
            $submit_time = $ligne;
            $submit_time =~ s/^.{67}//;
            $submit_time =~ s/.{13}$//;
            chop($submit_time);

            # number of proc
            $nb_proc = f_nb_proc($jobid);

            # no memory, swap, cpu time, run time, eff or hog for a pending job
            $mem           = "";
            $swap          = "";
            $cpu_time_hour = "";
            $cpu_time_min  = "";
            $run_time_hour = "";
            $run_time_min  = "";
            $efficasity    = "";
            $hog_factor    = "";

            # run limit
            ($run_limit_in_sec, $run_limit_hour, $run_limit_min) = f_run_limit($jobid);
        }

        # format the values with sprintf
        $p_jobid     = sprintf("%5s",   $jobid);
        $p_user      = sprintf("%-8s",  $user);
        $p_stat      = sprintf("%-5s",  $stat);
        $p_queue     = sprintf("%-10s", $queue);
        $p_nb_proc   = sprintf("%2s",   $nb_proc);
        $p_mem       = sprintf("%5.0f", $mem);
        $p_swap      = sprintf("%5.0f", $swap);
        $p_run_limit = sprintf("%3s:%02d", $run_limit_hour, $run_limit_min);

        if ($stat ne "PEND") {
            $p_cpu_time   = sprintf("%3s:%02d", $cpu_time_hour, $cpu_time_min);
            $p_run_time   = sprintf("%3s:%02d", $run_time_hour, $run_time_min);
            $p_exec_host  = sprintf("%-8s", $exec_host);
            $p_efficasity = sprintf("%5.1f%%", $efficasity);
            $p_hog_factor = sprintf("%5.1f%%", $hog_factor);
        }
        else {
            # blank columns of the same width for a pending job
            $p_cpu_time   = sprintf("%6s", "");
            $p_run_time   = sprintf("%6s", "");
            $p_exec_host  = sprintf("%8s", "");
            $p_efficasity = sprintf("%6s", "");
            $p_hog_factor = sprintf("%6s", "");
        }

        $lignes = sprintf("%s %s %s %s %s %s %s %s %s %s %s %s %s %s\n",
            $p_nb_proc, $p_mem, $p_swap, $p_cpu_time, $p_jobid, $p_user,
            $p_stat, $p_queue, $p_run_time, $p_run_limit, $p_efficasity,
            $p_hog_factor, $p_exec_host, $submit_time);

        if ($first ne "true") {
            $p_lignes = "$p_lignes" . "$lignes";
        }
        else {
            print $lignes;
        }

    }    # foreach line of bjobs

    if ($first eq "false") {
        # display all the jobs in one time
        system("clear");
        print "$p_lignes";
    }

    # next display
    $first = "false";

    # print bhosts and lsload
    print "\n";
    system("bhosts");
    print "\n";
    system("lsload");
    print "\n";

    # date: 2000/03/21 10:44:46
    $date = `date '+%Y/%m/%d %H:%M:%S'`;
    chomp($date);

    # The user waits for about 50 sec:
    # wait time = sleep time + time to run bjobs -l and bhist (counted as
    # one second per not pending job).  With more than 50 such jobs the
    # sleep time is clamped to 0 instead of keeping the value of an earlier
    # pass.
    $old_job_count = 50;
    $sleep_time    = 50 - $job_count;
    $sleep_time    = 0 if $sleep_time < 0;

    # print the date and the delay before update
    for ($t = 0; $t <= $sleep_time; $t++) {
        $p_old_job_count = sprintf("\rbmonitor $VERSION - %s - Update in %ss ", $date, $old_job_count);
        syswrite(STDOUT, "$p_old_job_count", 55);
        $old_job_count = $old_job_count - 1;
        sleep(1);
    }

}    # while 1

# other possible display
#print "$p_jobid $p_user $p_stat $p_queue $p_nb_proc $p_mem $p_swap $p_cpu_time $p_run_time $p_run_limit $p_efficasity $p_hog_factor $p_exec_host $submit_time\n"

########################################################################################
########################################################################################
## functions for bmonitor                                                             ##
########################################################################################
########################################################################################

######################################################################
# f_efficasity
# efficiency of the current job, in percent:
#   100 * cpu_time / (nb_proc * run_time)
# Reads the package globals $nb_proc, $cpu_time_in_sec and
# $run_time_in_sec; the arguments are ignored.
# Returns the percentage as a string, limited to 999.9.  999.9 is also
# returned when nb_proc * run_time is not greater than 0.
######################################################################

sub f_efficasity {

    # Lexical scratch values: nothing is written to package variables, so
    # the $l_run_time / $l_cpu_time globals of other subs are not touched.
    my $proc_seconds = $nb_proc * $run_time_in_sec;

    # no usable run time: report the maximum
    return "999.9" unless $proc_seconds > 0;

    my $percent = 100 * $cpu_time_in_sec / $proc_seconds;
    $percent = 999.9 if $percent > 999.9;

    return "$percent";
}

######################################
# f_mem_swap
# memory and swap of the current job
# Parses the first line of @bjobsl_out (output of bjobs -l $jobid)
# that contains "MEM: "; the arguments are ignored.
# Returns ($mem, $swap) expressed in Mbytes; a value that is not
# found is returned as "0".
######################################

sub f_mem_swap {
    my ($mem, $mem_unit, $swap, $swap_unit);

    foreach my $line (@bjobsl_out) {
        next unless $line =~ /MEM: /;

        # Take the value and unit that follow each label instead of
        # counting words, so a missing leading space does not shift them.
        ($mem,  $mem_unit)  = $line =~ /MEM:\s+(\S+)(?:\s+(\S+))?/;
        ($swap, $swap_unit) = $line =~ /SWAP:\s+(\S+)(?:\s+(\S+))?/;
        last;
    }

    # The units are local to this call: a job without a MEM line never
    # inherits the unit of the job handled before it.
    return (_f_mem_to_mbytes($mem, $mem_unit), _f_mem_to_mbytes($swap, $swap_unit));
}

# Convert one "value unit" pair to Mbytes.  The unit is compared by its
# prefix, so "Kbytes;" and "Kbytes" (last field of the line) are both
# accepted.  A value with any other unit is returned unchanged.
sub _f_mem_to_mbytes {
    my ($value, $unit) = @_;

    return "0" if !defined($value) || $value eq "";
    $unit = "" unless defined $unit;

    return $value / 1024        if $unit =~ /^Kbytes/;
    return $value * 1024        if $unit =~ /^Gbytes/;
    return $value * 1024 * 1024 if $unit =~ /^Tbytes/;
    return $value;
}

#####################################################
# f_cpu_time
# cpu time of the current job
# Takes the 7th space-separated field of the first line of
# @bjobsl_out (output of bjobs -l $jobid) that contains "seconds";
# the arguments are ignored.
# Returns ($cpu_time, $hour, $min):
#   $cpu_time: total cpu time in seconds ("" when no line is found)
#   $hour, $min: hours and minutes of the cpu time
#####################################################

sub f_cpu_time {

    # Lexical variables: the $l_* package variables used by the other
    # subs of this file are not touched.
    my $cpu_time = "";

    foreach my $line (@bjobsl_out) {
        next unless $line =~ /seconds/;
        my @word = split(/ +/, $line);
        $cpu_time = $word[6];
        last;
    }

    # conversion in hh:mm (the remaining seconds are dropped)
    my $min  = 0;
    my $hour = 0;

    if ($cpu_time >= 60) {
        $min = int($cpu_time / 60);
    }
    if ($min >= 60) {
        $hour = int($min / 60);
        $min  = $min - ($hour * 60);
    }

    return ($cpu_time, $hour, $min);
}

#######################################################
# f_run_time
# run time of the current job
# Uses the first line of @bhist_out (output of bhist $jobid) that
# begins with a job number; the arguments are ignored.
# Returns ($run_time, $hour, $min):
#   $run_time: total run time in seconds ("" when no line is found)
#   $hour, $min: hours and minutes of the run time
#######################################################

sub f_run_time {

    # Lexical variables: the $l_* package variables used by the other
    # subs of this file are not touched.
    my $run_time = "";

    foreach my $line (@bhist_out) {

        # any number of digits, so that small job IDs are accepted too
        next unless $line =~ /^[0-9]+/;

        # work on a copy so that @bhist_out is not modified
        my $states = $line;

        # drop the first 25 characters (job id, user, job name): the job
        # name may contain spaces and would shift the fields
        $states =~ s/.{25}//;

        my @word = split(/ +/, $states);
        $run_time = $word[3];
        last;
    }

    # conversion in hh:mm (the remaining seconds are dropped)
    my $min  = 0;
    my $hour = 0;

    if ($run_time >= 60) {
        $min = int($run_time / 60);
    }
    if ($min >= 60) {
        $hour = int($min / 60);
        $min  = $min - ($hour * 60);
    }

    return ($run_time, $hour, $min);
}

###########################################################
# f_run_limit
# run limit of the current job
# Uses the first line of @bjobsl_out (output of bjobs -l $jobid)
# that matches "<digit> min of"; the arguments are ignored.
# Returns ($run_limit, $hour, $min):
#   $run_limit: limit in MINUTES as printed by bjobs
#               ("" when no limit line is found)
#   $hour, $min: hours and minutes of the limit
# users must ask for a run limit (else it's a cpu limit)
###########################################################

sub f_run_limit {

    # Lexical variables: the $l_* package variables used by the other
    # subs of this file are not touched.
    my $run_limit = "";

    foreach my $line (@bjobsl_out) {
        next unless $line =~ /[0-9] min of/;

        my @word = split(/ +/, $line);
        $run_limit = $word[1];

        # if cpu limit and run limit are both on the line, the second
        # value (6th field) is used
        if ($word[5] =~ /[0-9]/) {
            $run_limit = $word[5];
        }
        last;
    }

    my $hour = 0;
    my $min  = 0;

    if ($run_limit >= 60) {
        $hour = int($run_limit / 60);
        $min  = $run_limit - ($hour * 60);
    }
    else {
        $min = int($run_limit);
    }

    return ($run_limit, $hour, $min);
}

########################################################
# f_nb_proc
# number of processors of the job
# Joins the lines of the paragraph of @bjobsl_out (output of
# bjobs -l $jobid) that starts with "Submitted from host" and ends
# at the next empty line, then returns the word found just before
# "Processors" in it.  Returns 1 when nothing is found.
# The arguments are ignored and @bjobsl_out is left unmodified.
##########################################################

sub f_nb_proc {
    my $paragraph    = "";
    my $in_paragraph = 0;

    foreach my $original (@bjobsl_out) {

        # Work on a copy: the loop variable is an alias of the array
        # element, and the other f_* subs parse @bjobsl_out afterwards.
        my $line = $original;

        # not an empty line and in the good paragraph
        if ($in_paragraph && $line =~ /[\w\d]/) {
            # delete spaces at the beginning
            $line =~ s/^\s+//;
            $line =~ s/,/, /g;
            # delete the end of line
            chomp($line);
            # the lines are glued without separator, so keep "Processors"
            # apart from the first word of the next line
            $line .= " " if $line =~ /Processors$/;
            # add each line of the paragraph
            $paragraph .= $line;
        }
        # empty line: end of the good paragraph
        elsif ($in_paragraph && $line =~ /^$/) {
            $in_paragraph = 0;
        }
        # enter in the good paragraph
        elsif (!$in_paragraph && $line =~ /Submitted from host/) {
            chomp($line);
            $line =~ s/,/, /g;
            # same rule as for the following lines of the paragraph
            $line .= " " if $line =~ /Processors$/;
            $paragraph    = $line;
            $in_paragraph = 1;
        }
    }

    # search the number of processors in the paragraph
    my $nb_cpu    = "";
    my $prev_word = "";    # reset at each call, never inherited from a previous job
    foreach my $word (split(/ +/, $paragraph)) {
        if ($word eq "Processors") {
            $nb_cpu = $prev_word;
            last;
        }
        # number glued to the word, for example "4Processors"
        if ($word =~ /Processors/) {
            $nb_cpu = $word;
            $nb_cpu =~ s/Processors//;
            last;
        }
        $prev_word = $word;
    }

    return $nb_cpu ? $nb_cpu : 1;
}

########################################################
# f_hog_factor
# hog factor = 100 * run time / turnaround time
# turnaround time = PEND + PSUSP + RUN + USUSP + SSUSP
# Uses the first line of @bhist_out (output of bhist $jobid) that
# begins with a job number; the arguments are ignored.
# Returns the percentage, limited to 999.9.  999.9 is also returned
# when the total time is 0 or is not found.
########################################################

sub f_hog_factor {

    # Lexical variables: the $l_* package variables used by the other
    # subs of this file are not touched.
    my $run_time   = "";
    my $total_time = "";

    foreach my $line (@bhist_out) {

        # any number of digits, so that small job IDs are accepted too
        next unless $line =~ /^[0-9]+/;

        # work on a copy so that @bhist_out is not modified
        my $states = $line;

        # drop the first 25 characters (job id, user, job name): the job
        # name may contain spaces and would shift the fields
        $states =~ s/.{25}//;

        my @word = split(/ +/, $states);
        $run_time   = $word[3];
        $total_time = $word[7];
        last;
    }

    # $total_time = 0 (or unknown): report the maximum
    return 999.9 unless defined($total_time) && $total_time != 0;

    my $hog_factor = 100 * $run_time / $total_time;
    $hog_factor = 999.9 if $hog_factor > 999.9;

    return $hog_factor;
}

#
# 
#########################################################################

__END__

=head1 NAME

bmonitor - display information about LSF jobs and hosts

=head1 DESCRIPTION

bmonitor is a Perl script to monitor LSF jobs.
Every minute the script shows useful information for each job.
This script uses LSF (Load Sharing Facility).

=head1 README

bmonitor is a Perl script to monitor LSF jobs.
Every minute the script shows the following information for each job:

    CPU : number of processors requested by the user
    MEM : memory used by the job (MB)
    SWAP : swap used by the job (MB)
    CPUTIM : cpu time of the job (hh:mm)
    JOB : identification number of the job
    USER : user login
    STAT : status of the job (PEND, PSUSP, USUSP, SSUSP, RUN)
    QUEUE : name of the queue
    RUNTIM : run time, time spent by the job in RUN status (hh:mm)
    RUNLIM : maximum run time requested by the user (hh:mm)
    EFF : cpu time / (run time * number of proc)
    HOG : run time / total time the job has spent in LSF
    EXECHOST : execution host
    SUBMIT_TIME : date of submission

The options of bjobs can be used with bmonitor.

This Perl script is available from http://cch.loria.fr/LSF/bmonitor/
and is distributed under the GNU General Public License.

For more information, send mail to Christian.Rossi@loria.fr.

Centre Charles Hermite/LORIA  - Nancy - France 
http://cch.loria.fr/ 
http://www.loria.fr/

=head1 AUTHOR

Christian Rossi <rossi@loria.fr>

=pod OSNAMES

Unix

=pod SCRIPT CATEGORIES

UNIX/System_administration

=cut