Upload
ngotruc
View
215
Download
0
Embed Size (px)
Citation preview
BENCHMARK RECIPE
1
Deploying GATK Best Practices Pipeline
Below are the script files that accompany, and are documented in, the Infrastructure for
Deploying GATK Best Practices Pipeline paper.
____________________________________________________________________________
GATK Best Practices Pipeline_README
The following scripts are designed to take the same arguments so that test runs stay consistent. For
single-threaded baseline analysis with no optimizations, run the data_collection_gatk_best_practices.pl
script with NumThreads set to "1".
For both thread-level and process-level parallelism analysis, use the
data_collection_gatk_best_practices_optimized.pl script with NumThreads set as proposed in the paper.
____________________________________________________________________________________________
Data Collection Script for GATK Best Practices Pipeline (wgs_end2end_data_collection_gatk_best_practices.pl)
#!/usr/bin/perl
# Command-line contract: the first five positional arguments are mandatory;
# the three collect_stats arguments are only used when profiling is enabled.
if (scalar(@ARGV) < 5) {
    # The usage text is assembled from single-line pieces: a line break inside
    # the quoted string would be printed verbatim in the middle of a sentence.
    die("Usage: SampleName NumThreads InputDataDirectory TempOutputDirectory profiling \n"
      . "[if profiling is enabled, then the following is required]: collectstatspath interval stats \n");
}
my $sample = $ARGV[0];
my $numThreads = $ARGV[1];
my $inDataDir = $ARGV[2];
my $tmpDir = "".$ARGV[3];
my $profiling = $ARGV[4]; #by default profiling is turned ON if invoked from the workflow profiler
# arguments for collect_stats
my $collectstatspath = $ARGV[5];
my $interval = $ARGV[6]; # by default sampling interval is 30s from the workflow profiler.
my $stats = $ARGV[7];
#my $numLanes =$ARGV[1];
# Full invocation, recorded later in the processing log.
my $called = "$0 @ARGV";
my $numLanes = 0;
# Prefix shared by the log file names and the profiler run name.
my $sampleprefix = $sample.'_'.$numThreads.'T';
BENCHMARK RECIPE
2
Data Collection Script for GATK Best Practices Pipeline (cont)
# INPUT FASTQ FILES
#OTHER FORMATS FOR FQ: "_1.fastq.gz; #"_1.fastq";
# Parenthesised so that BOTH variables are declared lexically; without the
# parentheses "my" applies to $fqFile1 only and $fqFile2 becomes a package global.
my ($fqFile1, $fqFile2);
# Paths are built by plain concatenation, so $inDataDir has to end with a
# path separator for these names to resolve inside that directory.
$fqFile1 = $inDataDir.$sample."_1.fq";
$fqFile2 = $inDataDir.$sample."_2.fq";
# Pipeline executables and its directories
### SPECIFY PATH IN THE VARIABLES BELOW ###
my $toolsDir = '/PATH/TO/TOOLS_DIR';
my $homosapiensrefgenomeDir = '/PATH/TO/REF';

# TOOLS -- every tool sits in its own sub-directory of $toolsDir.
my $bwaDir    = $toolsDir . '/bwa';
my $bwa       = $bwaDir . '/bwa';
my $gatkDir   = $toolsDir . '/gatk-protected/target';
my $gatk      = $gatkDir . '/GenomeAnalysisTK.jar';
my $picardDir = $toolsDir . '/picard/dist';
my $picard    = $picardDir . '/picard.jar';

# HOMOSAPIENSREFGENOME -- reference FASTA and the files shipped next to it.
my $refgenomeFastaFile = $homosapiensrefgenomeDir . '/Homo_sapiens_assembly19.fasta';
my $refgenomeBwtFile   = $refgenomeFastaFile . '.bwt';
my $dbSNPvcf           = $homosapiensrefgenomeDir . '/Homo_sapiens_assembly19.dbsnp.vcf';
my $dbSNPindel         = $homosapiensrefgenomeDir . '/Homo_sapiens_assembly19.known_indels.vcf';

# EXOME TARGET INTERVALS
my $exome_targets_intervals = $homosapiensrefgenomeDir . '/nexterarapidcapture_exome_uniqueintervals.bed';

# Both directories supplied on the command line must already exist.
-d $inDataDir
    or die("Error: The InputDataDirectory $inDataDir doesn't exist\n");
-d $tmpDir
    or die("Error: The TempOutputDirectory $tmpDir doesn't exist\n");
# Output file names for each stage of the pipeline.
# All names are formed by concatenating onto $tmpDir, so $tmpDir has to end
# with a path separator for the files to land inside that directory.
my $baseName = $sample;
my $baseNameLane = $baseName.'_'.$numLanes.'L_'.$numThreads.'T';
my $bwamem_samFile = $tmpDir.$baseNameLane.".sam";
my $sort_bamFile = $tmpDir.$baseName."_sorted.bam";
my $duplicateMetricsFile = $tmpDir.$baseName."_dup.metrics";
my $bamDupRemFile = $tmpDir.$baseName."_dupRem.bam";
my $bamRealignFile = $tmpDir.$baseName."_realign.bam";
my $realnInterval = $tmpDir.$baseName."_realn.intervals";
my $finalBam = $tmpDir.$baseName."_final.bam";
my $HCvcf = $tmpDir.$baseName."_HaplotypeCaller.vcf";
my $genomeImportFile = $refgenomeFastaFile.".fai";
#ADD the relevant platform
# The "\\t" sequences stay as a literal backslash-t in the string handed to bwa.
my $readGroupHeader = "\@RG\\tID:$baseNameLane\\tLB:$baseName\\tSM:$baseName\\tPL:PLATFORM";
my $recalOut = $tmpDir .$baseName."_recal.grp";
# When true, run_and_log only records commands without executing them.
my $dryRun = 0;
# Execution context written to the log header.
my $pwd = `pwd`;
chomp $pwd;
my $host = `hostname`;
chomp $host;
my $uname = `whoami`;
chomp $uname;
# Start of the whole run; turned into an elapsed time at the end of the script.
my $runningTime = time;
my $commandsfile = $tmpDir.$uname."_".$sampleprefix."_processing.log";
my $outputfile = $tmpDir.$uname."_".$sampleprefix."_output.log";
# Three-argument open keeps the file name from being parsed as a mode, and the
# result is checked so a missing log is reported instead of silently ignored.
# The LOG bareword handle is kept because the rest of the script prints to it.
open(LOG, '>', $commandsfile)
    or die("Error: cannot open log file $commandsfile for writing: $!\n");
BENCHMARK RECIPE
3
Data Collection Script for GATK Best Practices Pipeline (cont)
# Log header. $version is never assigned anywhere in this script, so fall back
# to a visible placeholder instead of interpolating an undefined value.
my $versionLabel = defined($version) ? $version : 'unknown';
print LOG "#$called (version $versionLabel) in $pwd on $host.\n";
print LOG "#Started at ".`date +"%F %T"`."\n";
print LOG "#temporary files created in $tmpDir\n";
# State shared with run_and_log: start time of the command currently being
# timed, and a flag that stays 0 until the first command has been logged.
my $procTime = time;
my $procFlag = 0;
# run_and_log(COMMAND [, SKIP])
#
# Appends output redirection to COMMAND, writes it to the LOG handle together
# with a timestamp and the wall-clock time elapsed since the previous call,
# and runs it through the shell.
#
#   COMMAND  shell command line to execute.
#   SKIP     optional; when supplied, a true value suppresses execution and a
#            false value forces it, overriding the file-level $dryRun flag.
#
# Dies (after closing LOG) when the command cannot be started, is terminated
# by a signal, or exits with a non-zero status.
sub run_and_log {
    my $command = $_[0];
    my $execute = !$dryRun;
    my $exitValue = 0; #a command we don't run is considered successful
    my $redirect;
    if (@_ > 1) {
        $execute = !$_[1];
    }
    #several of the programs like to output to STDERR, so we link that to log file
    $redirect = "1>>$outputfile 2>&1";
    #that is because we redirect STDOUT in many cases, so let's not mess with it.
    $redirect = "2>>$outputfile" if $command =~ m/>/;
    # $redirect = "" if $command =~ m/>/;
    $command = $command." ".$redirect;

    # The first call has no predecessor to time; every later call reports how
    # long the previous command took before restarting the clock.
    if ($procFlag == 0) {
        $procFlag++;
    } else {
        $procTime = time - $procTime;
        printf LOG "#Processing Time %02d:%02d:%02d\n",
            int($procTime / 3600), int(($procTime % 3600) / 60), int($procTime % 60);
        $procTime = time;
    }

    print LOG "#not run\n" if !$execute;
    print LOG "#".`date +"%F %T"`;
    print LOG $command."\n";

    if ($execute) {
        # system() returns the raw wait status, not the exit code: decode it so
        # the log shows the value the command actually exited with.
        my $waitStatus = system($command);
        if ($waitStatus == -1) {
            $exitValue = -1;                        # command could not be started
        } elsif ($waitStatus & 127) {
            $exitValue = 128 + ($waitStatus & 127); # killed by a signal (shell convention)
        } else {
            $exitValue = $waitStatus >> 8;          # normal exit code
        }
    }

    ##If the command failed, we want to stop it here.
    if ($exitValue != 0) {
        my $error = "Command failed with return value $exitValue : $command \n";
        print LOG $error;
        close LOG;
        die $error;
    }
}
# Start_profiling(TAG)
#
# Launches the statistics collector in the background for one pipeline stage,
# labelling the collected data with TAG. Does nothing unless the file-level
# $profiling flag is true.
sub Start_profiling {
    my $stageLabel = shift;
    return unless $profiling;
    my $collector = "$collectstatspath $stats -d $interval -td $tmpDir -n $sampleprefix -tag $stageLabel -l 5 -u 1 -s 600 &";
    system($collector);
}
# Stop_Profiling()
#
# When the file-level $profiling flag is true, invokes the collector named by
# $collectstatspath with --kill-all. Takes no arguments; does nothing when
# profiling is off.
sub Stop_Profiling {
BENCHMARK RECIPE
4
Data Collection Script for GATK Best Practices Pipeline (cont)
if ($profiling) {
system("$collectstatspath --kill-all");
}
}
# Pipeline stages from alignment through base recalibration.
# Each entry is [ stage tag, shell command ]. Every command is written as one
# logical line: a line break inside the string would split it into separate
# shell commands (or detach an option name from its leading dash).
my @stagesThroughRecalibration = (
    [ 'BwaMem',
      "$bwa mem -t $numThreads -Ma -R '$readGroupHeader' $refgenomeFastaFile $fqFile1 $fqFile2 > $bwamem_samFile" ],
    [ 'SortSam',
      "java -Xmx8g -jar $picard SortSam I=$bwamem_samFile O=$sort_bamFile SO=coordinate CREATE_INDEX=true" ],
    [ 'MarkDuplicates',
      "java -Xmx8g -jar $picard MarkDuplicates I=$sort_bamFile O=$bamDupRemFile"
      . " M=$duplicateMetricsFile CREATE_INDEX=true TMP_DIR=$tmpDir" ],
    [ 'RealignerTargetCreator',
      "java -Xmx8g -jar $gatk -T RealignerTargetCreator -nt $numThreads -R $refgenomeFastaFile"
      . " -o $realnInterval -known:indels,vcf $dbSNPindel -I $bamDupRemFile" ],
    [ 'IndelRealigner',
      "java -Xmx8g -Djava.io.tmpdir=$tmpDir -jar $gatk -T IndelRealigner -R $refgenomeFastaFile"
      . " -targetIntervals $realnInterval -known:indels,vcf $dbSNPindel -I $bamDupRemFile"
      . " -o $bamRealignFile --filter_bases_not_stored" ],
    [ 'BaseRecalibrator',
      "java -Xmx4g -jar $gatk -T BaseRecalibrator -I $bamRealignFile -R $refgenomeFastaFile"
      . " -knownSites:mask,vcf $dbSNPvcf -o $recalOut" ],
);

# Every stage follows the same protocol: start the profiler, announce the stage
# on STDOUT, run the command, stop the profiler, then pause for 60 seconds
# before the next stage.
for my $stage (@stagesThroughRecalibration) {
    my ($stageName, $stageCommand) = @{$stage};
    Start_profiling($stageName);
    print "$stageName\n";
    run_and_log($stageCommand);
    Stop_Profiling();
    sleep(60);
}
BENCHMARK RECIPE
5
Stages of GATK Best Practices Pipeline (cont)
# Final pipeline stages: apply the recalibration, then call variants.
# Each entry is [ stage tag, shell command ]; commands are kept on one logical
# line because a line break inside the string would split the shell command.
my @finalStages = (
    [ 'PrintReads',
      "java -Xmx8g -jar $gatk -T PrintReads -R $refgenomeFastaFile -I $bamRealignFile"
      . " -BQSR $recalOut -o $finalBam" ],
    [ 'HaplotypeCaller',
      "java -Xmx8g -jar $gatk -T HaplotypeCaller -R $refgenomeFastaFile -I $finalBam -o $HCvcf"
      . " -ERC GVCF --variant_index_type LINEAR --variant_index_parameter 128000" ],
);

# Same per-stage protocol as the earlier stages: profile, announce, run, stop
# profiling, pause for 60 seconds.
for my $stage (@finalStages) {
    my ($stageName, $stageCommand) = @{$stage};
    Start_profiling($stageName);
    print "$stageName\n";
    run_and_log($stageCommand);
    Stop_Profiling();
    sleep(60);
}

# Total wall-clock time of the run, written as HH:MM:SS.
$runningTime = time - $runningTime;
printf LOG "#done in %02d:%02d:%02d\n",
    int($runningTime / 3600), int(($runningTime % 3600) / 60), int($runningTime % 60);
# Close the log explicitly so buffered lines are flushed before exiting.
close LOG;
exit 0;
Data Collection Script for GATK Best Practices Pipeline Optimized (wgs_end2end_data_collection_gatk_best_practices_optimized.pl)
#!/usr/bin/perl
# Command-line contract: the first five positional arguments are mandatory;
# the three collect_stats arguments are only used when profiling is enabled.
if (scalar(@ARGV) < 5) {
    # The usage text is assembled from single-line pieces: a line break inside
    # the quoted string would be printed verbatim in the middle of a sentence.
    die("Usage: SampleName NumThreads InputDataDirectory TempOutputDirectory profiling \n"
      . "[if profiling is enabled, then the following is required]: collectstatspath interval stats \n");
}
my $sample = $ARGV[0];
my $numThreads = $ARGV[1];
my $inDataDir = $ARGV[2];
my $tmpDir = "".$ARGV[3];
my $profiling = $ARGV[4]; #by default profiling is turned ON if invoked from the workflow profiler
# arguments for collect_stats
my $collectstatspath = $ARGV[5];
my $interval = $ARGV[6]; # by default sampling interval is 30s from the workflow profiler.
my $stats = $ARGV[7];
#my $numLanes =$ARGV[1];
# Full invocation, recorded later in the processing log.
my $called = "$0 @ARGV";
my $numLanes = 0;
# Prefix shared by the log file names and the profiler run name.
my $sampleprefix = $sample.'_'.$numThreads.'T';
BENCHMARK RECIPE
6
Data Collection Script for GATK Best Practices Pipeline Optimized (cont)
# INPUT FASTQ FILES
#OTHER FORMATS FOR FQ: "_1.fastq.gz; #"_1.fastq";
# Parenthesised so that BOTH variables are declared lexically; without the
# parentheses "my" applies to $fqFile1 only and $fqFile2 becomes a package global.
my ($fqFile1, $fqFile2);
# Paths are built by plain concatenation, so $inDataDir has to end with a
# path separator for these names to resolve inside that directory.
$fqFile1 = $inDataDir.$sample."_1.fq";
$fqFile2 = $inDataDir.$sample."_2.fq";
# Pipeline executables and its directories
### SPECIFY PATH IN THE VARIABLES BELOW ###
my $toolsDir = '/PATH/TO/TOOLS_DIR';
#SCALA_SCRIPTS: Scripts to be passed to Queue.jar
my $QueueBroadBestPracticesDir = '/PATH/TO/SCALA_SCRIPTS';
my $homosapiensrefgenomeDir = '/PATH/TO/REF';

# TOOLS -- every tool sits in its own sub-directory of $toolsDir, except
# Queue.jar, which is taken from the Scala scripts directory.
my $bwaDir     = $toolsDir . '/bwa';
my $bwa        = $bwaDir . '/bwa';
my $gatkDir    = $toolsDir . '/gatk-protected/target';
my $gatk       = $gatkDir . '/GenomeAnalysisTK.jar';
my $gatk_queue = $QueueBroadBestPracticesDir . '/Queue.jar';
my $picardDir  = $toolsDir . '/picard/dist';
my $picard     = $picardDir . '/picard.jar';

# HOMOSAPIENSREFGENOME -- reference FASTA and the files shipped next to it.
my $refgenomeFastaFile = $homosapiensrefgenomeDir . '/Homo_sapiens_assembly19.fasta';
my $refgenomeBwtFile   = $refgenomeFastaFile . '.bwt';
my $dbSNPvcf           = $homosapiensrefgenomeDir . '/Homo_sapiens_assembly19.dbsnp.vcf';
my $dbSNPindel         = $homosapiensrefgenomeDir . '/Homo_sapiens_assembly19.known_indels.vcf';

# QUEUE -- one Scala script per stage that is run through Queue.jar.
my $nfs_ExampleIndelRealigner   = $QueueBroadBestPracticesDir . '/ExampleIndelRealigner.scala';
my $nfs_ExampleBaseRecalibrator = $QueueBroadBestPracticesDir . '/ExampleBaseRecalibrator.scala';
my $nfs_ExamplePrintReads       = $QueueBroadBestPracticesDir . '/ExamplePrintReads.scala';
my $nfs_ExampleHaplotypeCaller  = $QueueBroadBestPracticesDir . '/ExampleHaplotypeCaller.scala';
# REMOVE QUEUE RELATED FOLDERS AFTER COMPLETION
# The working directory comes from the shell's `pwd` ("cwd" is not a shell
# command, so the original backtick call yielded an empty string). The trailing
# newline is removed so the value can be used as a path prefix.
my $cwd = `pwd`;
chomp $cwd;
print "$cwd\n";
my $queueDir = $cwd . "/.queue";
print "$queueDir\n";
# Glob patterns handed to "rm" after each Queue stage; the "/" keeps the job
# report pattern inside the working directory.
my $jobReport = $cwd . "/*.jobreport.txt";
my $ExampleDir = $tmpDir . "/Example*";
# Both directories supplied on the command line must already exist.
-d $inDataDir
    or die("Error: The InputDataDirectory $inDataDir doesn't exist\n");
-d $tmpDir
    or die("Error: The TempOutputDirectory $tmpDir doesn't exist\n");

# Output file names for each stage of the pipeline
my $baseName             = $sample;
my $baseNameLane         = join('_', $baseName, $numLanes . 'L', $numThreads . 'T');
my $bwamem_samFile       = "$tmpDir$baseNameLane.sam";
my $sort_bamFile         = $tmpDir . $baseName . '_sorted.bam';
my $duplicateMetricsFile = $tmpDir . $baseName . '_dup.metrics';
BENCHMARK RECIPE
7
Data Collection Script for GATK Best Practices Pipeline Optimized (cont)
# Remaining per-stage output files. All names are formed by concatenating onto
# $tmpDir, so $tmpDir has to end with a path separator.
my $bamDupRemFile = $tmpDir.$baseName."_dupRem.bam";
my $bamRealignFile = $tmpDir.$baseName."_realign.bam";
my $realnInterval = $tmpDir.$baseName."_realn.intervals";
my $finalBam = $tmpDir.$baseName."_final.bam";
my $HCvcf = $tmpDir.$baseName."_HaplotypeCaller.vcf";
my $genomeImportFile = $refgenomeFastaFile.".fai";
#ADD the relevant platform
# The "\\t" sequences stay as a literal backslash-t in the string handed to bwa.
my $readGroupHeader = "\@RG\\tID:$baseNameLane\\tLB:$baseName\\tSM:$baseName\\tPL:PLATFORM";
my $recalOut = $tmpDir .$baseName."_recal.grp";
# When true, run_and_log only records commands without executing them.
my $dryRun = 0;
# Execution context written to the log header.
my $pwd = `pwd`;
chomp $pwd;
my $host = `hostname`;
chomp $host;
my $uname = `whoami`;
chomp $uname;
# Start of the whole run; turned into an elapsed time at the end of the script.
my $runningTime = time;
my $commandsfile = $tmpDir.$uname."_".$sampleprefix."_processing.log";
my $outputfile = $tmpDir.$uname."_".$sampleprefix."_output.log";
# Three-argument open keeps the file name from being parsed as a mode, and the
# result is checked so a missing log is reported instead of silently ignored.
# The LOG bareword handle is kept because the rest of the script prints to it.
open(LOG, '>', $commandsfile)
    or die("Error: cannot open log file $commandsfile for writing: $!\n");
# $version is never assigned anywhere in this script, so fall back to a
# visible placeholder instead of interpolating an undefined value.
my $versionLabel = defined($version) ? $version : 'unknown';
print LOG "#$called (version $versionLabel) in $pwd on $host.\n";
print LOG "#Started at ".`date +"%F %T"`."\n";
print LOG "#temporary files created in $tmpDir\n";
# State shared with run_and_log: start time of the command currently being
# timed, and a flag that stays 0 until the first command has been logged.
my $procTime = time;
my $procFlag = 0;
# run_and_log(COMMAND [, SKIP])
#
# Appends output redirection to COMMAND, writes it to the LOG handle together
# with a timestamp and the time elapsed since the previous call, and runs it
# with system().
#
#   COMMAND  shell command line to execute.
#   SKIP     optional; when supplied, a true value suppresses execution and a
#            false value forces it, overriding the file-level $dryRun flag.
#
# Closes LOG and dies when system() returns a non-zero value.
sub run_and_log {
my $command = $_[0];
my $execute = !$dryRun;
my $exitValue = 0; #a command we don't run is considered successful
my $redirect;
# A second argument, when present, decides whether the command is executed.
if (@_ >1){
$execute=!$_[1];
}
#several of the programs like to output to STDERR, so we link that to log file
$redirect = "1>>$outputfile 2>&1";
#that is because we redirect STDOUT in many cases, so let's not mess with it.
# Any ">" anywhere in the command counts as an existing STDOUT redirect.
$redirect = "2>>$outputfile" if $command =~ m/>/;
# $redirect = "" if $command =~ m/>/;
$command = $command." ".$redirect;
# The first call only sets the flag; every later call logs how long the
# previous command took and restarts the clock.
if ($procFlag == 0){
$procFlag++;
} else {
$procTime = time - $procTime;
printf LOG "#Processing Time %02d:%02d:%02d\n",int($procTime /3600),int(($procTime % 3600)
/60),int($procTime %60);
$procTime = time;
BENCHMARK RECIPE
8
Data Collection Script for GATK Best Practices Pipeline Optimized (cont)
}
print LOG "#not run\n" if !$execute;
print LOG "#".`date +"%F %T"`;
print LOG $command."\n";
# NOTE(review): system() returns the raw wait status, so the value reported
# below is not the command's exit code (an exit code of 1 shows up as 256).
$exitValue = system($command) if $execute;
#necessary if we use `` instead of system()
#$exitValue = $? >>8;
##If the command failed, we want to stop it here.
if ($exitValue != 0){
my $error = "Command failed with return value $exitValue : $command \n";
print LOG $error;
close LOG;
die $error;
}
}
# Start_profiling(TAG)
#
# Launches the statistics collector in the background for one pipeline stage,
# labelling the collected data with TAG. Does nothing unless the file-level
# $profiling flag is true.
sub Start_profiling {
    my $stageLabel = shift;
    return unless $profiling;
    my $collector = "$collectstatspath $stats -d $interval -td $tmpDir -n $sampleprefix -tag $stageLabel -l 5 -u 1 -s 600 &";
    system($collector);
}
# Stop_Profiling()
#
# Asks the statistics collector to terminate all of its running instances.
# Does nothing unless the file-level $profiling flag is true.
sub Stop_Profiling {
    return unless $profiling;
    my $killCommand = "$collectstatspath --kill-all";
    system($killCommand);
}
# Alignment, sorting and duplicate marking.
# Each entry is [ stage tag, shell command ]. Every command is written as one
# logical line: a line break inside the string would split it into separate
# shell commands.
my @alignmentStages = (
    [ 'BwaMem',
      "$bwa mem -t $numThreads -Ma -R '$readGroupHeader' $refgenomeFastaFile $fqFile1 $fqFile2 > $bwamem_samFile" ],
    [ 'SortSam',
      "java -Dsamjdk.try_use_intel_deflater=true -jar $picard SortSam I=$bwamem_samFile"
      . " O=$sort_bamFile SO=coordinate CREATE_INDEX=true TMP_DIR=$tmpDir" ],
    [ 'MarkDuplicates',
      "java -Dsamjdk.try_use_intel_deflater=true -jar $picard MarkDuplicates I=$sort_bamFile"
      . " O=$bamDupRemFile M=$duplicateMetricsFile CREATE_INDEX=true TMP_DIR=$tmpDir" ],
);

# Every stage follows the same protocol: start the profiler, announce the stage
# on STDOUT, run the command, stop the profiler, then pause for 60 seconds.
for my $stage (@alignmentStages) {
    my ($stageName, $stageCommand) = @{$stage};
    Start_profiling($stageName);
    print "$stageName\n";
    run_and_log($stageCommand);
    Stop_Profiling();
    sleep(60);
}
BENCHMARK RECIPE
9
Data Collection Script for GATK Best Practices Pipeline Optimized (cont)
# Realignment, recalibration and variant calling.
# Each entry is [ stage tag, shell command, cleanup flag ]. The cleanup flag
# marks the stages launched through Queue.jar, after which the Queue working
# files are removed. Commands are kept on one logical line because a line
# break inside the string would split the shell command.
my @queueStages = (
    [ 'RealignerTargetCreator',
      "java -jar $gatk -T RealignerTargetCreator -nt $numThreads -R $refgenomeFastaFile"
      . " -o $realnInterval -known:indels,vcf $dbSNPindel -I $bamDupRemFile",
      0 ],
    [ 'IndelRealigner',
      "java -Djava.io.tmpdir=$tmpDir -jar $gatk_queue -R $refgenomeFastaFile -I $bamDupRemFile"
      . " -indels $dbSNPindel -S $nfs_ExampleIndelRealigner -l DEBUG -run -jobRunner CMPShell",
      1 ],
    [ 'BaseRecalibrator',
      "java -Djava.io.tmpdir=$tmpDir -jar $gatk_queue -I $bamRealignFile -R $refgenomeFastaFile"
      . " -D $dbSNPvcf -S $nfs_ExampleBaseRecalibrator -l DEBUG -run -jobRunner CMPShell",
      1 ],
    [ 'PrintReads',
      "java -Djava.io.tmpdir=$tmpDir -jar $gatk_queue -R $refgenomeFastaFile -I $bamRealignFile"
      . " -B $recalOut -l DEBUG -S $nfs_ExamplePrintReads -run -jobRunner CMPShell",
      1 ],
    [ 'HaplotypeCaller',
      "java -Djava.io.tmpdir=$tmpDir -jar $gatk_queue -R $refgenomeFastaFile -I $finalBam"
      . " -l DEBUG -S $nfs_ExampleHaplotypeCaller -run -jobRunner CMPShell",
      1 ],
);

# Per-stage protocol: start the profiler, announce the stage on STDOUT, run
# the command, stop the profiler, remove Queue leftovers where flagged, then
# pause for 60 seconds.
for my $stage (@queueStages) {
    my ($stageName, $stageCommand, $removeQueueFiles) = @{$stage};
    Start_profiling($stageName);
    print "$stageName\n";
    run_and_log($stageCommand);
    Stop_Profiling();
    if ($removeQueueFiles) {
        system("rm -rf $queueDir; rm -r $jobReport; rm -rf $ExampleDir");
    }
    sleep(60);
}

# Total wall-clock time of the run, written as HH:MM:SS.
$runningTime = time - $runningTime;
printf LOG "#done in %02d:%02d:%02d\n",
    int($runningTime / 3600), int(($runningTime % 3600) / 60), int($runningTime % 60);
# Close the log explicitly so buffered lines are flushed before exiting.
close LOG;
exit 0;
_____
BENCHMARK RECIPE
10
CMPShell_README.scala
Steps to run Queue using CMPShell:
1) Create a 'cmpshell' directory containing CMPShellJobRunner.scala and CMPShellJobManager.scala
at: scala/src/org/broadinstitute/sting/queue/engine/cmpshell
2) Recompile Queue.jar so that it includes 'CMPShell'.
3) Pass 'CMPShell' as the argument to -jobRunner when running Queue.jar.
CMPShellJobRunner.scala
// Scala keywords are lower-case: "Package"/"Import" do not compile.
package org.broadinstitute.sting.queue.engine.shell
import org.broadinstitute.sting.queue.function.CommandLineFunction
import org.broadinstitute.sting.queue.engine.{RunnerStatus, CommandLineJobRunner}
import java.util.Date
import org.broadinstitute.sting.utils.Utils
import org.broadinstitute.sting.utils.runtime.{ProcessSettings, OutputStreamSettings, ProcessController}
import java.util.concurrent.{Executors, ExecutorService}
import Array._
/**
* Runs jobs one at a time locally
* @param function Command to run.
*/
class CMPShellJobRunner(val function: CommandLineFunction, pool: ExecutorService, creationCount: Int) extends
CommandLineJobRunner {
// Controller on the thread that started the job
private var controller: ProcessController = null
/**
* Builds an "ssh <host> [environment ;] sh <job script>" command line, marks the
* job as RUNNING and hands the actual execution to the shared thread pool.
* The pooled task records host, start and done times in the run info and sets
* the final status to DONE on exit value 0, FAILED otherwise.
*/
def start() {
// Host passed to ssh; presumably the machine this runner executes on -- confirm
// against Utils.resolveHostname.
var executionMachineName = Utils.resolveHostname()
val commandSep = Array(";")
val sshCommandLine = Array("ssh", executionMachineName)
val scriptCommandLine = Array("sh", jobScript.getAbsolutePath)
var inputCommandLine = Array[String]()
// Optional prefix placed between the ssh part and the script invocation.
// NOTE(review): the names are concatenated with no separator and then split on
// spaces -- confirm the expected shape of jobEnvironmentNames.
if (!function.jobEnvironmentNames.isEmpty) {
inputCommandLine = function.jobEnvironmentNames.mkString("").split(" ")
inputCommandLine = concat(inputCommandLine, commandSep);
}
val commandLine = concat(sshCommandLine, inputCommandLine, scriptCommandLine)
val stdoutSettings = new OutputStreamSettings
val stderrSettings = new OutputStreamSettings
// With no dedicated error file, the merge flag passed to ProcessSettings is true.
val mergeError = (function.jobErrorFile == null)
stdoutSettings.setOutputFile(function.jobOutputFile, true)
if (function.jobErrorFile != null)
stderrSettings.setOutputFile(function.jobErrorFile, true)
if (logger.isDebugEnabled) {
stdoutSettings.printStandard(true)
stderrSettings.printStandard(true)
}
BENCHMARK RECIPE
11
CMPShellJobRunner.scala (cont)
val processSettings = new ProcessSettings(
commandLine, mergeError, function.commandDirectory, null,
null, stdoutSettings, stderrSettings)
updateJobRun(processSettings)
updateStatus(RunnerStatus.RUNNING)
// The Thread object is only used as a Runnable: it is never started directly,
// it is submitted to the pool below.
val runIt = new Thread( new Runnable {
def run() {
getRunInfo.exechosts = executionMachineName
getRunInfo.startTime = new Date()
// The controller is obtained inside run(), i.e. on the thread executing the job.
controller = ProcessController.getThreadLocal
val exitStatus = controller.exec(processSettings).getExitValue
getRunInfo.doneTime = new Date()
updateStatus(if (exitStatus == 0) RunnerStatus.DONE else RunnerStatus.FAILED)
}
})
pool.execute( runIt)
}
/**
* Always reports this runner as updated.
* @return true, unconditionally.
*/
def updateJobStatus(): Boolean = true
/**
* Possibly invoked from a shutdown thread, find and
* stop the controller from the originating thread.
* A failure to destroy the process is logged, including its cause,
* and is not propagated to the caller.
*/
def tryStop() {
  // Assumes that after being set the job may be
  // reassigned but will not be reset back to null
  if (controller != null) {
    try {
      controller.tryDestroy()
    } catch {
      // Typed pattern: an untyped "case e =>" also swallows fatal JVM errors
      // and is reported by the compiler as a catch-all.
      case e: Exception =>
        logger.error("Unable to kill shell job: " + function.description + ": " + e)
    }
  }
}
}
CMPShellJobManager.scala
package org.broadinstitute.sting.queue.engine.shell
import org.broadinstitute.sting.queue.function.CommandLineFunction
import org.broadinstitute.sting.queue.engine.CommandLineJobManager
BENCHMARK RECIPE
12
CMPShellJobManager.scala (cont)
import java.util.concurrent.{Executors, ExecutorService}
/**
 * Job manager that creates CMPShellJobRunner instances and lets them share a
 * single fixed-size thread pool, created in init() and shut down in exit().
 */
class CMPShellJobManager extends CommandLineJobManager[CMPShellJobRunner] {
  // Number of worker threads in the shared pool.
  private val poolSize = 64

  // Shared executor; null until init() has been called.
  protected var pool: ExecutorService = null

  // Number of runners created since the last init().
  var creationCount: Int = 0

  def runnerType = classOf[CMPShellJobRunner]

  /**
   * Creates a runner for the given function, passing it the shared pool and
   * its creation sequence number.
   */
  def create(function: CommandLineFunction) = {
    creationCount += 1
    new CMPShellJobRunner(function, pool, creationCount)
  }

  /** Resets the creation counter and allocates a fresh thread pool. */
  override def init() {
    creationCount = 0
    pool = Executors.newFixedThreadPool(poolSize)
  }

  /** Shuts the pool down; a no-op when init() was never called. */
  override def exit() {
    if (pool != null)
      pool.shutdown()
  }

  /** Returns the subset of runners whose updateJobStatus() reports true. */
  override def updateStatus(runners: Set[CMPShellJobRunner]) = {
    runners.filter(_.updateJobStatus())
  }

  /** Asks every given runner to stop its job. */
  override def tryStop(runners: Set[CMPShellJobRunner]) { runners.foreach(_.tryStop()) }
}
For more information: • Intel Life Sciences code optimizations: www.intel.com/healthcare/optimizecode
• GATK optimizations & reference architecture:
http://www.intel.com/content/www/us/en/healthcare-it/solutions/genomicscode-gatk.html
• GATK Best Practices Guide: https://www.broadinstitute.org/gatk/guide/best-practices.php
Copyright © 2016 Intel Corporation. All rights reserved. Intel, the Intel logo are trademarks of Intel Corporation in the U.S. and other countries. * Other names and brands may be claimed as the property of others. 02/16