Request an interactive session:
srun -N 1 -n 1 --pty bash -i
Load software within the session by spack load [software]. Recommended text editors include Visual Studio Code and Sublime.
To use R, load the R module and type R in the interactive session to initiate an R session:
spack load r@4.4.0
R
For parallel computation with the parallel and foreach packages, see Quick Instructions of Parallel Computation. Save plots to a file with ggplot2::ggsave("temp.pdf").
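As a minimal sketch of running R non-interactively (hypothetical: it uses R's built-in mtcars dataset and assumes the ggplot2 package is installed in your R library):
spack load r@4.4.0
Rscript -e 'library(ggplot2);
  p <- ggplot(mtcars, aes(x = mpg, y = wt)) + geom_point();
  ggsave("temp.pdf", plot = p)'   # write the plot to temp.pdf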
To use Python, type python in the interactive session to initiate a Python session. To manage packages with conda, load miniconda:
spack load miniconda3@24.3.0
Run conda init; source ~/.bashrc; once before using conda for the first time. Common conda commands:
conda create -n "myenv"              # create a new environment
conda env list                       # list available environments
conda activate ~/.conda/envs/myenv   # activate an environment
conda install -n myenv [package]     # install a package into myenv
conda deactivate                     # deactivate the current environment
conda remove -n myenv --all          # remove an environment and all its packages
conda -h                             # show conda help
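For example, a minimal end-to-end session (the environment name, Python version, and numpy are all illustrative):
spack load miniconda3@24.3.0
conda create -n myenv python=3.11 -y   # create an environment with a chosen Python
conda activate ~/.conda/envs/myenv
conda install -n myenv numpy -y        # install an example package
python -c "import numpy; print(numpy.__version__)"   # verify the installation
conda deactivate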
PLINK is a powerful genetic data analysis tool for converting data formats, performing QC, calculating MAF and HWE p-values, computing kinship matrices and top principal components, and conducting single-variant genetic association studies. Its common data format is the BED/BIM/FAM file set.
spack load plink2@2.00a4.3
plink2 -h
spack unload plink2@2.00a4.3
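A minimal sketch of these common PLINK2 tasks (mydata.bed/.bim/.fam is a hypothetical file set):
spack load plink2@2.00a4.3
plink2 --bfile mydata --freq --hardy --out mydata_qc        # allele frequencies (MAF) and HWE p-values
plink2 --bfile mydata --pca 10 --out mydata_pca             # top 10 principal components
plink2 --bfile mydata --make-king-table --out mydata_king   # pairwise kinship estimates
spack unload plink2@2.00a4.3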
BCFTools is a fast tool for manipulating sorted/bgzipped/tabixed VCF files of genotype data.
spack load tabix@2013-12-16
spack load bcftools@1.19
bcftools -h
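A minimal sketch of typical usage (genotype.vcf.gz is a hypothetical bgzipped, position-sorted VCF):
tabix -p vcf genotype.vcf.gz                           # build a tabix index
bcftools view -r chr1:1-1000000 genotype.vcf.gz        # extract a genomic region (requires the index)
bcftools stats genotype.vcf.gz > genotype.stats.txt    # summary statistics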
BEDTools utilities are a Swiss-army knife of tools that allow one to intersect, merge, count, complement, and shuffle genomic intervals from multiple files in widely used genomic file formats such as BAM, BED, GFF/GTF, and VCF. While each individual tool is designed to do a relatively simple task (e.g., intersect two interval files), quite sophisticated analyses can be conducted by combining multiple bedtools operations on the UNIX command line.
spack load bedtools2@2.31.1
bedtools -h
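For example, a short sketch combining two operations (a.bed and b.bed are hypothetical interval files):
bedtools intersect -a a.bed -b b.bed > overlap.bed                   # intervals in a.bed that overlap b.bed
sort -k1,1 -k2,2n overlap.bed | bedtools merge -i stdin > merged.bed # merge overlapping intervals (input must be sorted)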
Submit batch jobs with sbatch. Create an example bash script by touch star_genome_create.sh and write the following commands into this file (see the example shell script in /home/jyang51/yangfss2/public/ExampleScripts/star_genome_create.sh on HGCC):
#!/bin/bash
#SBATCH --job-name=STAR_genomeGenerate
#SBATCH --nodes=1
#SBATCH --mem=128G ## request memory
#SBATCH --cpus-per-task=16 ## request cpu/cores
#SBATCH --array=1 ## a single-task array job; remove this line if not submitting an array job
#SBATCH --time=24:00:00 ## specify job running time for 24 hrs
#SBATCH --output=./SLURM_OUT/%x_%A_%a.out ## save slurm output
#SBATCH --error=./SLURM_OUT/%x_%A_%a.err ## save slurm errors
spack load miniconda3@24.3.0
conda activate ~/.conda/envs/myenv
### Create genome index
echo Running STAR genomeGenerate ...
STAR --runThreadN 16 --runMode genomeGenerate \
--genomeDir /home/jyang51/yangfss2/public/GenomeIndex/star_indexes/hg38 \
--genomeFastaFiles /home/jyang51/yangfss2/public/GenomeIndex/iGenome/hg38_2021/genome.fa \
--sjdbGTFfile /home/jyang51/yangfss2/public/GenomeIndex/iGenome/hg38_2021/gencode.v46.basic.annotation.gtf \
--sjdbOverhang 150
conda deactivate
exit
This script loads a conda environment providing STAR for mapping RNAseq data. Use echo to print log messages or variable contents to debug and check the job status. Make a bash script executable with chmod 755, e.g., chmod 755 phase.sh. Submit the job by:
sbatch /home/jyang51/yangfss2/public/ExampleScripts/star_genome_create.sh
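After submission, monitor the job with standard slurm commands (12345 is a hypothetical job id):
squeue -u $USER    # list your pending and running jobs
sacct -j 12345     # show state and accounting information for a job
scancel 12345      # cancel a job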
/scratch space for I/O intensive jobs
To use the /scratch space to improve computation efficiency by avoiding heavy I/O on the shared file system, the bash script needs to be updated to include commands creating a temporary directory ${TMPDIR} under the /scratch space (84TB, shared):
#!/bin/bash
# Generate tmp directory name
TMP_NAME=`/usr/bin/mktemp -u XXXXXXXX`
# Create tmp directory
# TMPDIR="/scratch/${SLURM_JOB_ID}_${TMP_NAME}" # include slurm job id in the name
TMPDIR="/scratch/${TMP_NAME}"
echo $TMPDIR
mkdir -p "$TMPDIR"
# Copy input data files into the temporary directory
rsync /home/jyang51/yangfss2/public/ExampleData/Sample_ID.txt ${TMPDIR}/
# Run the following command under the temporary directory
cd ${TMPDIR}/
paste Sample_ID.txt Sample_ID.txt > output_sample_ID_2.txt
# Copy results back to hard disk
rsync ${TMPDIR}/output_sample_ID_2.txt /home/jyang51/yangfss2/public/ExampleData/
# Remove temporary directory
rm -f -r ${TMPDIR}
exit
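As an optional safeguard (a sketch, not part of the example script above), a trap can be registered right after mkdir so that ${TMPDIR} is removed even if a later command fails:
trap 'rm -rf "${TMPDIR}"' EXIT   # clean up the temporary directory on any exit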
Copy input files into the temporary directory and results back out with rsync, and run the analysis commands with input data files under the temporary directory.
Array jobs are a convenient way to submit multiple repetitive jobs that differ only by one input variable, e.g., 10,000 repetitive simulations, or association studies of all 20K genome-wide genes. The following example command submits an array job that runs the same bash script star_sbatch.sh 52 times, where the only difference is the slurm array task id $SLURM_ARRAY_TASK_ID used to look up the corresponding sample ID:
sbatch /home/jyang51/yangfss2/public/ExampleScripts/star_sbatch.sh /home/jyang51/yangfss2/public/ExampleData
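The --array range can also be overridden at submission time, which is handy for reruns (the %N suffix throttles how many tasks run concurrently):
sbatch --array=3,7 /home/jyang51/yangfss2/public/ExampleScripts/star_sbatch.sh /home/jyang51/yangfss2/public/ExampleData     # rerun only tasks 3 and 7
sbatch --array=1-52%5 /home/jyang51/yangfss2/public/ExampleScripts/star_sbatch.sh /home/jyang51/yangfss2/public/ExampleData  # at most 5 tasks at a time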
star_sbatch.sh has the following contents:
#!/bin/bash
#SBATCH --job-name=MAP_STAR
#SBATCH --nodes=1
#SBATCH --mem=64G
#SBATCH --cpus-per-task=8
#SBATCH --array=1-52
#SBATCH --time=24:00:00
#SBATCH --output=./SLURM_OUT/%x_%A_%a.out
#SBATCH --error=./SLURM_OUT/%x_%A_%a.err
#### Print the task id.
echo "My SLURM_ARRAY_TASK_ID: " $SLURM_ARRAY_TASK_ID
#### Take output directory from first input argument
output_dir=$1
echo "Output file directory: $output_dir".
## define data directory
data_dir=/home/jyang51/yangfss2/projects/Bill_Li_RNAseq/BC_Biomarker_Normal_Biopsy/LBI13454-118133
# Generate tmp directory name
TMP_NAME=`/usr/bin/mktemp -u XXXXXXXX`
# Create tmp directory
# TMPDIR="/scratch/${SLURM_JOB_ID}_${TMP_NAME}" # include slurm job id in the name
TMPDIR="/scratch/${TMP_NAME}"
echo $TMPDIR
mkdir -p "$TMPDIR"
cd $TMPDIR
## Determine sample ID
sample=$(head -n ${SLURM_ARRAY_TASK_ID} /home/jyang51/yangfss2/public/ExampleData/Sample_ID.txt | tail -n1)
## Copy raw fastq files to the temp directory under /scratch
rsync ${data_dir}/RawData/H7JNJDSXC_s1_1_SM_${sample}.fastq.gz ${TMPDIR}/
rsync ${data_dir}/RawData/H7JNJDSXC_s1_2_SM_${sample}.fastq.gz ${TMPDIR}/
###### The following commands run 52 times to map 52 samples;
## the sample ID is determined by the given Array_Task_ID from 1 to 52;
## one input variable (the output directory) is taken by this script.
### Use STAR 2.7.11a: either load the STAR module by spack
# spack load star@2.7.11a
### Or install STAR by `conda install bioconda::star` into your virtual environment, and activate that environment:
spack load miniconda3@24.3.0
conda activate ~/.conda/envs/myenv
## create temp directory to save map output files
mkdir -p ${TMPDIR}/MAP_OUT/
# Allow use of 64GB memory; set the thread count to the number of requested CPUs
n_threads=${SLURM_CPUS_PER_TASK}
STAR --genomeDir /home/jyang51/yangfss2/public/GenomeIndex/star_indexes/hg38/ \
--runThreadN ${n_threads} \
--limitBAMsortRAM 64000000000 --genomeLoad NoSharedMemory \
--readFilesIn ${TMPDIR}/H7JNJDSXC_s1_1_SM_${sample}.fastq.gz ${TMPDIR}/H7JNJDSXC_s1_2_SM_${sample}.fastq.gz \
--outFileNamePrefix ${TMPDIR}/MAP_OUT/${sample}. \
--readFilesCommand zcat \
--sjdbInsertSave All \
--quantMode TranscriptomeSAM GeneCounts \
--outSAMtype BAM SortedByCoordinate \
--outSAMunmapped None \
--outSAMattributes Standard
## Copy output files to the output directory
rsync -av ${TMPDIR}/MAP_OUT/ ${output_dir}/
conda deactivate
rm -fr ${TMPDIR}/
exit
The sample ID is read from /home/jyang51/yangfss2/public/ExampleData/Sample_ID.txt using the slurm array task id $SLURM_ARRAY_TASK_ID, and all intermediate files are written under /scratch/.
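Before submitting I/O heavy jobs, check the remaining space in the shared /scratch area:
df -h /scratch    # show total, used, and available space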