--- title: "Long non-coding RNA discovery pipeline" author: "Zach Bengtsson" date: "`r format(Sys.time(), '%d %B, %Y')`" format: html: toc: true toc-depth: 2 html-math-method: katex css: styles.css theme: sandstone editor: markdown: wrap: 72 --- Zach's attempt First for Apulchra with new genome ```{r setup, include=FALSE} knitr::opts_chunk$set( echo = TRUE, # Display code chunks eval = FALSE, # Evaluate code chunks warning = FALSE, # Hide warnings message = FALSE, # Hide messages fig.width = 6, # Set plot width in inches fig.height = 4, # Set plot height in inches fig.align = "center", # Align plots to the center comment = "" # Prevents appending '##' to beginning of lines in code output ) ``` #Variables ```{r setup, include=FALSE} # Global R options knitr::opts_chunk$set(echo = TRUE) # Define key paths and tool directories DATA_DIR <- "../data/01.6-lncRNA-pipeline" OUTPUT_DIR <- "../output/01.6-lncRNA-pipeline" THREADS <- "14" FASTQ_SOURCE <- "https://gannet.fish.washington.edu/Atumefaciens/20230519-E5_coral-fastqc-fastp-multiqc-RNAseq/A_pulchra/trimmed/" FASTQ_SUFFIX <- "fastq.gz" GENOME_SOURCE <- "https://gannet.fish.washington.edu/seashell/bu-github/deep-dive-expression/D-Apul/data/Apulcra-genome.fa" GTF_SOURCE <- "https://gannet.fish.washington.edu/seashell/bu-github/deep-dive-expression/D-Apul/data/Apulchra-genome.gtf" GFF_SOURCE <- "https://gannet.fish.washington.edu/seashell/bu-github/deep-dive-expression/D-Apul/data/Apulcra-genome.gff" GFFPATTERN <- 'class_code "u"|class_code "x"|class_code "o"|class_code "i"' #RAVEN HISAT2_DIR <- "/home/shared/hisat2-2.2.1/" SAMTOOLS_DIR <- "/home/shared/samtools-1.12/" STRINGTIE_DIR <- "/home/shared/stringtie-2.2.1.Linux_x86_64" GFFCOMPARE_DIR <- "/home/shared/gffcompare-0.12.6.Linux_x86_64" BEDTOOLS_DIR <- "/home/shared/bedtools2/bin" CPC2_DIR <- "/home/shared/CPC2_standalone-1.0.1/bin" CONDA_PATH <- "/opt/anaconda/anaconda3/bin" GENOME_FASTA <- file.path(DATA_DIR, "genome.fasta") GENOME_GTF <- file.path(DATA_DIR, "genome.gtf") GENOME_GFF <- file.path(DATA_DIR, "genome.gff") FASTQ_DIR <- file.path(DATA_DIR, "fastq") GENOME_INDEX <- file.path(OUTPUT_DIR, "genome.index") # Export these as environment variables for bash chunks. Sys.setenv( THREADS = THREADS, DATA_DIR = DATA_DIR, FASTQ_SOURCE = FASTQ_SOURCE, FASTQ_SUFFIX = FASTQ_SUFFIX, OUTPUT_DIR = OUTPUT_DIR, GENOME_SOURCE = GENOME_SOURCE, GTF_SOURCE = GTF_SOURCE, GFF_SOURCE = GFF_SOURCE, HISAT2_DIR = HISAT2_DIR, SAMTOOLS_DIR = SAMTOOLS_DIR, STRINGTIE_DIR = STRINGTIE_DIR, GFFCOMPARE_DIR = GFFCOMPARE_DIR, BEDTOOLS_DIR = BEDTOOLS_DIR, CPC2_DIR = CPC2_DIR, CONDA_PATH = CONDA_PATH, GENOME_FASTA = GENOME_FASTA, GENOME_GTF = GENOME_GTF, GENOME_GFF = GENOME_GFF, FASTQ_DIR = FASTQ_DIR, GENOME_INDEX = GENOME_INDEX ) ``` ```{r mkdir, engine='bash'} mkdir -p "${DATA_DIR}" mkdir -p "${OUTPUT_DIR}" ``` # Download Genome and Reads for Hisat ```{r grab fastq, engine='bash'} wget -nv -r \ --no-directories --no-parent \ -P ${FASTQ_DIR} \ -A "*${FASTQ_SUFFIX}" ${FASTQ_SOURCE} ``` ```{r ls, engine='bash'} ls ${FASTQ_DIR} ``` ```{r fasta, engine='bash'} curl -o "${GENOME_FASTA}" "${GENOME_SOURCE}" ``` ```{r gtf, engine='bash'} curl -o "${GENOME_GTF}" "${GTF_SOURCE}" ``` ```{r gff, engine='bash'} curl -o "${GENOME_GFF}" "${GFF_SOURCE}" ``` ```{r file check, engine='bash'} output_fasta=$(head -1 "${GENOME_FASTA}") output_gff=$(head -2 "${GENOME_GFF}") output_gtf=$(head -1 "${GENOME_GTF}") if [[ "$output_fasta" == *html* || "$output_gff" == *html* || "$output_gtf" == *html* ]]; then echo "FAIL - FFS you downloaded a HTML not and genome feature file!" else echo "$output_fasta" echo "$output_gff" echo "$output_gtf" fi ``` # HISAT ```{r, engine='bash'} "${HISAT2_DIR}hisat2_extract_exons.py" "${GENOME_GTF}" > "${OUTPUT_DIR}/exon.txt" "${HISAT2_DIR}hisat2_extract_splice_sites.py" "${GENOME_GTF}" > "${OUTPUT_DIR}/splice_sites.txt" ``` ```{r, engine='bash'} "${HISAT2_DIR}hisat2-build" \ -p "${THREADS}" \ "${GENOME_FASTA}" \ "${GENOME_INDEX}" \ --exon "${OUTPUT_DIR}/exon.txt" \ --ss "${OUTPUT_DIR}/splice_sites.txt" \ 2> "${OUTPUT_DIR}/hisat2-build_stats.txt" ``` ```{bash} # Loop over every file ending in .fastq.gz that contains "_R2_" for r2 in "${FASTQ_DIR}"/*_R2_*."${FASTQ_SUFFIX}"; do # Get the basename (filename without path) base=$(basename "$r2") # Derive a sample name by taking everything before "_R2_" sample="${base%%_R2_*}" # Construct the corresponding R1 filename by replacing "_R2_" with "_R1_" r1="${r2/_R2_/_R1_}" # Define the output SAM file name using the sample name output="${OUTPUT_DIR}/${sample}.sam" # Run hisat2 with the paired-end files "${HISAT2_DIR}hisat2" \ -x "${GENOME_INDEX}" \ -p "${THREADS}" \ -1 "$r1" \ -2 "$r2" \ -S "$output" done ``` ## convert SAM to BAM ```{r convertSAM, engine='bash'} for samfile in "${OUTPUT_DIR}/${sample}"*.sam; do bamfile="${samfile%.sam}.bam" sorted_bamfile="${samfile%.sam}.sorted.bam" # Convert SAM to BAM "${SAMTOOLS_DIR}samtools" view -bS -@ "${THREADS}" "$samfile" > "$bamfile" # Sort BAM "${SAMTOOLS_DIR}samtools" sort -@ "${THREADS}" "$bamfile" -o "$sorted_bamfile" # Index sorted BAM "${SAMTOOLS_DIR}samtools" index -@ "${THREADS}" "$sorted_bamfile" done ``` # StringTie ```{r stringtie, engine='bash'} find "${OUTPUT_DIR}" -name "*sorted.bam" \ | xargs -n 1 basename -s .sorted.bam | xargs -I{} \ "${STRINGTIE_DIR}/stringtie" \ -p "${THREADS}" \ -G "${GENOME_GFF}" \ -o "${OUTPUT_DIR}/{}.gtf" \ "${OUTPUT_DIR}/{}.sorted.bam" ``` ```{r merge, engine='bash'} "${STRINGTIE_DIR}/stringtie" \ --merge \ -G "${GENOME_GFF}" \ -o "${OUTPUT_DIR}/stringtie_merged.gtf" \ "${OUTPUT_DIR}/"*.gtf ``` #GFFCOMPARE ```{r gffcomp, engine='bash'} "${GFFCOMPARE_DIR}/gffcompare" \ -r "${GENOME_GFF}" \ -o "${OUTPUT_DIR}/gffcompare_merged" \ "${OUTPUT_DIR}/stringtie_merged.gtf" ``` ```{r viewgff, engine='bash'} head -1 "${OUTPUT_DIR}"/gffcompare_merged* ``` ```{r gff filter, engine='bash'} awk '$3 == "transcript" && $1 !~ /^#/' "${OUTPUT_DIR}/gffcompare_merged.annotated.gtf" | \ grep -E "${GFFPATTERN}" | \ awk '($5 - $4 > 199) || ($4 - $5 > 199)' > "${OUTPUT_DIR}/lncRNA_candidates.gtf" ``` # Bedtools ```{r fasta, engine='bash'} "${BEDTOOLS_DIR}"/bedtools getfasta \ -fi "${GENOME_FASTA}" \ -bed "${OUTPUT_DIR}/lncRNA_candidates.gtf" \ -fo "${OUTPUT_DIR}/lncRNA_candidates.fasta" \ -name -split ``` ```{r, engine='bash', eval=TRUE, echo=FALSE} fgrep -c ">" ${OUTPUT_DIR}/lncRNA_candidates.fasta head ${OUTPUT_DIR}/lncRNA_candidates.fasta ``` #CPC2 ```{r cpc2, engine='bash'} eval "$(/opt/anaconda/anaconda3/bin/conda shell.bash hook)" python /home/shared/CPC2_standalone-1.0.1/bin/CPC2.py \ -i "${OUTPUT_DIR}/lncRNA_candidates.fasta" \ -o "${OUTPUT_DIR}/CPC2" ``` ```{r} install.packages("reticulate") library(reticulate) ``` ```{r} np <- import("numpy") print(np$`__version__`) # Get the file path of numpy's __init__.py (or similar) numpy_location <- np$`__file__` print(numpy_location) ``` #Filter ```{r filCPC, engine='bash'} awk '$8 == "noncoding" {print $1}' "${OUTPUT_DIR}/CPC2.txt" > "${OUTPUT_DIR}/noncoding_transcripts_ids.txt" ``` Subsetting fasta ```{r ssfasta, engine='bash'} "${SAMTOOLS_DIR}samtools" faidx "${OUTPUT_DIR}/lncRNA_candidates.fasta" \ -r "${OUTPUT_DIR}/noncoding_transcripts_ids.txt" \ > "${OUTPUT_DIR}/lncRNA.fasta" ``` ```{r, engine='bash'} head -2 "${OUTPUT_DIR}/lncRNA.fasta" fgrep -c ">" ${OUTPUT_DIR}/lncRNA.fasta ``` ```{r lncRNAgtf, engine='bash'} # Define input and output file paths using the OUTPUT_DIR variable input="${OUTPUT_DIR}/noncoding_transcripts_ids.txt" output="${OUTPUT_DIR}/lncRNA.bed" # Process each line of the input file while IFS= read -r line; do # Remove "transcript::" from the line line="${line//transcript::/}" # Split the line by ':' to get the chromosome and position string IFS=':' read -r chromosome pos <<< "$line" # Split the position string by '-' to separate start and end positions IFS='-' read -r start end <<< "$pos" # Convert the start position to 0-based by subtracting 1 start=$((start - 1)) # Write the chromosome, updated start, and end positions to the output file (tab-separated) printf "%s\t%s\t%s\n" "$chromosome" "$start" "$end" done < "$input" > "$output" ``` ```{r renamegtf, engine='bash'} awk 'BEGIN{OFS="\t"; count=1} {printf "%s\t.\tlncRNA\t%d\t%d\t.\t+\t.\tgene_id \"lncRNA_%03d\";\n", $1, $2, $3, count++;}' "${OUTPUT_DIR}/lncRNA.bed" \ > "${OUTPUT_DIR}/lncRNA.gtf" ``` # Summary Table ```{r summary, engine='bash'} tf_file="${OUTPUT_DIR}/lncRNA.gtf" awk ' BEGIN { total_entries = 0; min_length = 1e9; max_length = 0; sum_length = 0; } # Skip comment lines /^#/ { next } { if (NF < 9) next; total_entries++; start = $4; end = $5; gene_length = end - start + 1; if (gene_length < min_length) min_length = gene_length; if (gene_length > max_length) max_length = gene_length; sum_length += gene_length; feature[$3]++; chrom[$1]++; # Use two-argument match() and then extract the gene_id manually. if (match($9, /gene_id "[^"]+"/)) { gene_str = substr($9, RSTART, RLENGTH); # Remove the "gene_id " prefix and the quotes. gsub(/gene_id "/, "", gene_str); gsub(/"/, "", gene_str); genes[gene_str] = 1; } } END { avg_length = (total_entries > 0) ? sum_length / total_entries : 0; unique_gene_count = 0; for (g in genes) unique_gene_count++; print "Basic GTF File Statistics:"; print "--------------------------"; print "Total entries: " total_entries; print "Unique genes: " unique_gene_count; print "Min gene length: " min_length; print "Max gene length: " max_length; printf("Average gene length: %.2f\n", avg_length); print "\nFeature counts:"; for (f in feature) { print " " f ": " feature[f]; } print "\nChromosome counts:"; for (c in chrom) { print " " c ": " chrom[c]; } } ' "$tf_file" ``` ```{python} # Open the input file and the output file with open('../output/01.6-lncRNA-pipeline/noncoding_transcripts_ids.txt', 'r') as infile, open('../output/01.6-lncRNA-pipeline/Apul_lncRNA.bed', 'w') as outfile: # Process each line in the input file for line in infile: # Remove 'transcript::' and then split the line by ':' to extract the relevant parts parts = line.strip().replace('transcript::', '').split(':') chromosome = parts[0] # Split the position part by '-' to get start and end positions start, end = parts[1].split('-') # BED format requires the start position to be 0-based # Convert the start position to 0-based by subtracting 1 start = str(int(start) - 1) # Write the chromosome, start, and end positions to the output file # Separate each field with a tab character outfile.write(f'{chromosome}\t{start}\t{end}\n') # After running this script, 'output.bed' will contain the converted data in BED format. ``` ```{r, engine='bash'} awk 'BEGIN{OFS="\t"; count=1} {printf "%s\t.\tlncRNA\t%d\t%d\t.\t+\t.\tgene_id \"lncRNA_%03d\";\n", $1, $2, $3, count++;}' ../output/01.6-lncRNA-pipeline/Apul_lncRNA.bed \ > ../output/01.6-lncRNA-pipeline/Apul_lncRNAs.gtf ``` # Expression Matrix ```{bash} awk 'BEGIN{OFS="\t"} $2>=0 && $3>$2 {print $1,"converted","lncRNA",$2+1,$3,".","+",".","gene_id \"lncRNA_"NR"\";"}' \ ../output/01.6-lncRNA-pipeline/Apul_lncRNA.bed \ > ../output/01.6-lncRNA-pipeline/Apul_lncRNA_fixed.gtf ``` ```{r, engine='bash'} /home/shared/subread-2.0.5-Linux-x86_64/bin/featureCounts \ -T 42 \ -a ../output/01.6-lncRNA-pipeline/Apul_lncRNA_fixed.gtf \ -o ../output/01.6-lncRNA-pipeline/Apul_counts.txt \ -t lncRNA \ -g gene_id \ -p \ ../output/01.6-lncRNA-pipeline/*sorted.bam ``` For Peve #Variables ```{r setup, include=FALSE} # Global R options knitr::opts_chunk$set(echo = TRUE) # Define key paths and tool directories DATA_DIR <- "../data/01.61-lncRNA-pipeline" OUTPUT_DIR <- "../output/01.61-lncRNA-pipeline" THREADS <- "14" FASTQ_SOURCE <- "https://gannet.fish.washington.edu/Atumefaciens/20230519-E5_coral-fastqc-fastp-multiqc-RNAseq/P_evermanni/trimmed/" FASTQ_SUFFIX <- "fastq.gz" GENOME_SOURCE <- "https://gannet.fish.washington.edu/seashell/snaps/Porites_evermanni_v1.fa" GTF_SOURCE <- "https://raw.githubusercontent.com/urol-e5/deep-dive-expression/b0290e08af4eaeed30d74a758965debef6111801/E-Peve/data/Porites_evermanni_validated.gtf" GFF_SOURCE <- "https://raw.githubusercontent.com/urol-e5/deep-dive-expression/b0290e08af4eaeed30d74a758965debef6111801/E-Peve/data/Porites_evermanni_validated.gff3" GFFPATTERN <- 'class_code "u"|class_code "x"|class_code "o"|class_code "i"' #RAVEN HISAT2_DIR <- "/home/shared/hisat2-2.2.1/" SAMTOOLS_DIR <- "/home/shared/samtools-1.12/" STRINGTIE_DIR <- "/home/shared/stringtie-2.2.1.Linux_x86_64" GFFCOMPARE_DIR <- "/home/shared/gffcompare-0.12.6.Linux_x86_64" BEDTOOLS_DIR <- "/home/shared/bedtools2/bin" CPC2_DIR <- "/home/shared/CPC2_standalone-1.0.1/bin" CONDA_PATH <- "/opt/anaconda/anaconda3/bin" GENOME_FASTA <- file.path(DATA_DIR, "genome.fasta") GENOME_GTF <- file.path(DATA_DIR, "genome.gtf") GENOME_GFF <- file.path(DATA_DIR, "genome.gff") FASTQ_DIR <- file.path(DATA_DIR, "fastq") GENOME_INDEX <- file.path(OUTPUT_DIR, "genome.index") # Export these as environment variables for bash chunks. Sys.setenv( THREADS = THREADS, DATA_DIR = DATA_DIR, FASTQ_SOURCE = FASTQ_SOURCE, FASTQ_SUFFIX = FASTQ_SUFFIX, OUTPUT_DIR = OUTPUT_DIR, GENOME_SOURCE = GENOME_SOURCE, GTF_SOURCE = GTF_SOURCE, GFF_SOURCE = GFF_SOURCE, HISAT2_DIR = HISAT2_DIR, SAMTOOLS_DIR = SAMTOOLS_DIR, STRINGTIE_DIR = STRINGTIE_DIR, GFFCOMPARE_DIR = GFFCOMPARE_DIR, BEDTOOLS_DIR = BEDTOOLS_DIR, CPC2_DIR = CPC2_DIR, CONDA_PATH = CONDA_PATH, GENOME_FASTA = GENOME_FASTA, GENOME_GTF = GENOME_GTF, GENOME_GFF = GENOME_GFF, FASTQ_DIR = FASTQ_DIR, GENOME_INDEX = GENOME_INDEX ) ``` ```{r mkdir, engine='bash'} mkdir -p "${DATA_DIR}" mkdir -p "${OUTPUT_DIR}" ``` # Download Genome and Reads for Hisat ```{r grab fastq, engine='bash'} wget -nv -r \ --no-directories --no-parent \ -P ${FASTQ_DIR} \ -A "*${FASTQ_SUFFIX}" ${FASTQ_SOURCE} ``` ```{r ls, engine='bash'} ls ${FASTQ_DIR} ``` ```{r fasta, engine='bash'} curl -o "${GENOME_FASTA}" "${GENOME_SOURCE}" ``` ```{r gtf, engine='bash'} curl -o "${GENOME_GTF}" "${GTF_SOURCE}" ``` ```{r gff, engine='bash'} curl -o "${GENOME_GFF}" "${GFF_SOURCE}" ``` ```{r file check, engine='bash'} output_fasta=$(head -1 "${GENOME_FASTA}") output_gff=$(head -2 "${GENOME_GFF}") output_gtf=$(head -1 "${GENOME_GTF}") if [[ "$output_fasta" == *html* || "$output_gff" == *html* || "$output_gtf" == *html* ]]; then echo "FAIL - FFS you downloaded a HTML not and genome feature file!" else echo "$output_fasta" echo "$output_gff" echo "$output_gtf" fi ``` # HISAT ```{r, engine='bash'} "${HISAT2_DIR}/hisat2_extract_exons.py" "${GENOME_GTF}" > "${OUTPUT_DIR}/exon.txt" "${HISAT2_DIR}/hisat2_extract_splice_sites.py" "${GENOME_GTF}" > "${OUTPUT_DIR}/splice_sites.txt" ``` ```{r, engine='bash'} "${HISAT2_DIR}hisat2-build" \ -p "${THREADS}" \ "${GENOME_FASTA}" \ "${GENOME_INDEX}" \ --exon "${OUTPUT_DIR}/exon.txt" \ --ss "${OUTPUT_DIR}/splice_sites.txt" \ 2> "${OUTPUT_DIR}/hisat2-build_stats.txt" ``` ```{bash} # Loop over every file ending in .fastq.gz that contains "_R2_" for r2 in "${FASTQ_DIR}"/*_R2_*."${FASTQ_SUFFIX}"; do # Get the basename (filename without path) base=$(basename "$r2") # Derive a sample name by taking everything before "_R2_" sample="${base%%_R2_*}" # Construct the corresponding R1 filename by replacing "_R2_" with "_R1_" r1="${r2/_R2_/_R1_}" # Define the output SAM file name using the sample name output="${OUTPUT_DIR}/${sample}.sam" # Run hisat2 with the paired-end files "${HISAT2_DIR}hisat2" \ -x "${GENOME_INDEX}" \ -p "${THREADS}" \ -1 "$r1" \ -2 "$r2" \ -S "$output" done ``` ## convert SAM to BAM ```{r convertSAM, engine='bash'} for samfile in "${OUTPUT_DIR}/${sample}"*.sam; do bamfile="${samfile%.sam}.bam" sorted_bamfile="${samfile%.sam}.sorted.bam" # Convert SAM to BAM "${SAMTOOLS_DIR}samtools" view -bS -@ "${THREADS}" "$samfile" > "$bamfile" # Sort BAM "${SAMTOOLS_DIR}samtools" sort -@ "${THREADS}" "$bamfile" -o "$sorted_bamfile" # Index sorted BAM "${SAMTOOLS_DIR}samtools" index -@ "${THREADS}" "$sorted_bamfile" done ``` # StringTie ```{r stringtie, engine='bash'} find "${OUTPUT_DIR}" -name "*sorted.bam" \ | xargs -n 1 basename -s .sorted.bam | xargs -I{} \ "${STRINGTIE_DIR}/stringtie" \ -p "${THREADS}" \ -G "${GENOME_GFF}" \ -o "${OUTPUT_DIR}/{}.gtf" \ "${OUTPUT_DIR}/{}.sorted.bam" ``` ```{r merge, engine='bash'} "${STRINGTIE_DIR}/stringtie" \ --merge \ -G "${GENOME_GFF}" \ -o "${OUTPUT_DIR}/stringtie_merged.gtf" \ "${OUTPUT_DIR}/"*.gtf ``` #GFFCOMPARE ```{r gffcomp, engine='bash'} "${GFFCOMPARE_DIR}/gffcompare" \ -r "${GENOME_GFF}" \ -o "${OUTPUT_DIR}/gffcompare_merged" \ "${OUTPUT_DIR}/stringtie_merged.gtf" ``` ```{r viewgff, engine='bash'} head -1 "${OUTPUT_DIR}"/gffcompare_merged* ``` ```{r gff filter, engine='bash'} awk '$3 == "transcript" && $1 !~ /^#/' "${OUTPUT_DIR}/gffcompare_merged.annotated.gtf" | \ grep -E "${GFFPATTERN}" | \ awk '($5 - $4 > 199) || ($4 - $5 > 199)' > "${OUTPUT_DIR}/lncRNA_candidates.gtf" ``` # Bedtools ```{r fasta, engine='bash'} "${BEDTOOLS_DIR}"/bedtools getfasta \ -fi "${GENOME_FASTA}" \ -bed "${OUTPUT_DIR}/lncRNA_candidates.gtf" \ -fo "${OUTPUT_DIR}/lncRNA_candidates.fasta" \ -name -split ``` ```{r, engine='bash', eval=TRUE, echo=FALSE} fgrep -c ">" ${OUTPUT_DIR}/lncRNA_candidates.fasta head ${OUTPUT_DIR}/lncRNA_candidates.fasta ``` #CPC2 ```{r cpc2, engine='bash'} eval "$(/opt/anaconda/anaconda3/bin/conda shell.bash hook)" python /home/shared/CPC2_standalone-1.0.1/bin/CPC2.py \ -i "${OUTPUT_DIR}/lncRNA_candidates.fasta" \ -o "${OUTPUT_DIR}/CPC2" ``` ```{r} install.packages("reticulate") library(reticulate) ``` ```{r} np <- import("numpy") print(np$`__version__`) # Get the file path of numpy's __init__.py (or similar) numpy_location <- np$`__file__` print(numpy_location) ``` #Filter ```{r filCPC, engine='bash'} awk '$8 == "noncoding" {print $1}' "${OUTPUT_DIR}/CPC2.txt" > "${OUTPUT_DIR}/noncoding_transcripts_ids.txt" ``` Subsetting fasta ```{r ssfasta, engine='bash'} "${SAMTOOLS_DIR}samtools" faidx "${OUTPUT_DIR}/lncRNA_candidates.fasta" \ -r "${OUTPUT_DIR}/noncoding_transcripts_ids.txt" \ > "${OUTPUT_DIR}/lncRNA.fasta" ``` ```{r, engine='bash'} head -2 "${OUTPUT_DIR}/lncRNA.fasta" fgrep -c ">" ${OUTPUT_DIR}/lncRNA.fasta ``` ```{r lncRNAgtf, engine='bash'} # Define input and output file paths using the OUTPUT_DIR variable input="${OUTPUT_DIR}/noncoding_transcripts_ids.txt" output="${OUTPUT_DIR}/lncRNA.bed" # Process each line of the input file while IFS= read -r line; do # Remove "transcript::" from the line line="${line//transcript::/}" # Split the line by ':' to get the chromosome and position string IFS=':' read -r chromosome pos <<< "$line" # Split the position string by '-' to separate start and end positions IFS='-' read -r start end <<< "$pos" # Convert the start position to 0-based by subtracting 1 start=$((start - 1)) # Write the chromosome, updated start, and end positions to the output file (tab-separated) printf "%s\t%s\t%s\n" "$chromosome" "$start" "$end" done < "$input" > "$output" ``` ```{r renamegtf, engine='bash'} awk 'BEGIN{OFS="\t"; count=1} {printf "%s\t.\tlncRNA\t%d\t%d\t.\t+\t.\tgene_id \"lncRNA_%03d\";\n", $1, $2, $3, count++;}' "${OUTPUT_DIR}/lncRNA.bed" \ > "${OUTPUT_DIR}/lncRNA.gtf" ``` # Summary Table ```{r summary, engine='bash'} tf_file="${OUTPUT_DIR}/lncRNA.gtf" awk ' BEGIN { total_entries = 0; min_length = 1e9; max_length = 0; sum_length = 0; } # Skip comment lines /^#/ { next } { if (NF < 9) next; total_entries++; start = $4; end = $5; gene_length = end - start + 1; if (gene_length < min_length) min_length = gene_length; if (gene_length > max_length) max_length = gene_length; sum_length += gene_length; feature[$3]++; chrom[$1]++; # Use two-argument match() and then extract the gene_id manually. if (match($9, /gene_id "[^"]+"/)) { gene_str = substr($9, RSTART, RLENGTH); # Remove the "gene_id " prefix and the quotes. gsub(/gene_id "/, "", gene_str); gsub(/"/, "", gene_str); genes[gene_str] = 1; } } END { avg_length = (total_entries > 0) ? sum_length / total_entries : 0; unique_gene_count = 0; for (g in genes) unique_gene_count++; print "Basic GTF File Statistics:"; print "--------------------------"; print "Total entries: " total_entries; print "Unique genes: " unique_gene_count; print "Min gene length: " min_length; print "Max gene length: " max_length; printf("Average gene length: %.2f\n", avg_length); print "\nFeature counts:"; for (f in feature) { print " " f ": " feature[f]; } print "\nChromosome counts:"; for (c in chrom) { print " " c ": " chrom[c]; } } ' "$tf_file" ``` ```{python} # Open the input file and the output file with open('../output/01.61-lncRNA-pipeline/noncoding_transcripts_ids.txt', 'r') as infile, open('../output/01.61-lncRNA-pipeline/Peve_lncRNA.bed', 'w') as outfile: # Process each line in the input file for line in infile: # Remove 'transcript::' and then split the line by ':' to extract the relevant parts parts = line.strip().replace('transcript::', '').split(':') chromosome = parts[0] # Split the position part by '-' to get start and end positions start, end = parts[1].split('-') # BED format requires the start position to be 0-based # Convert the start position to 0-based by subtracting 1 start = str(int(start) - 1) # Write the chromosome, start, and end positions to the output file # Separate each field with a tab character outfile.write(f'{chromosome}\t{start}\t{end}\n') # After running this script, 'output.bed' will contain the converted data in BED format. ``` Skip to expression matrix step ```{r, engine='bash'} awk 'BEGIN{OFS="\t"; count=1} {printf "%s\t.\tlncRNA\t%d\t%d\t.\t+\t.\tgene_id \"lncRNA_%03d\";\n", $1, $2, $3, count++;}' ../output/01.6-lncRNA-pipeline/Apul_lncRNA.bed \ > ../output/01.6-lncRNA-pipeline/Apul_lncRNAs.gtf ``` # Expression Matrix ```{bash} awk 'BEGIN{OFS="\t"} $2>=0 && $3>$2 {print $1,"converted","lncRNA",$2+1,$3,".","+",".","gene_id \"lncRNA_"NR"\";"}' \ ../output/01.61-lncRNA-pipeline/Peve_lncRNA.bed \ > ../output/01.61-lncRNA-pipeline/Peve_lncRNA_fixed.gtf ``` ```{r, engine='bash'} /home/shared/subread-2.0.5-Linux-x86_64/bin/featureCounts \ -T 42 \ -a ../output/01.61-lncRNA-pipeline/Peve_lncRNA_fixed.gtf \ -o ../output/01.61-lncRNA-pipeline/Peve_counts.txt \ -t lncRNA \ -g gene_id \ -p \ ../output/01.61-lncRNA-pipeline/*sorted.bam ``` For Ptuh #Variables ```{r setup, include=FALSE} # Global R options knitr::opts_chunk$set(echo = TRUE) # Define key paths and tool directories DATA_DIR <- "../data/01.62-lncRNA-pipeline" OUTPUT_DIR <- "../output/01.62-lncRNA-pipeline" THREADS <- "14" FASTQ_SOURCE <- "https://gannet.fish.washington.edu/Atumefaciens/20230519-E5_coral-fastqc-fastp-multiqc-RNAseq/P_meandrina/trimmed/" FASTQ_SUFFIX <- "fastq.gz" GENOME_SOURCE <- "https://owl.fish.washington.edu/halfshell/genomic-databank/Pocillopora_meandrina_HIv1.assembly.fasta" GTF_SOURCE <- "https://raw.githubusercontent.com/urol-e5/deep-dive-expression/f62c6d01e04ef0007f2f53af84181481d64d29c1/F-Ptuh/data/Pocillopora_meandrina_HIv1.genes-validated.gtf" GFF_SOURCE <- "https://raw.githubusercontent.com/urol-e5/deep-dive-expression/f62c6d01e04ef0007f2f53af84181481d64d29c1/F-Ptuh/data/Pocillopora_meandrina_HIv1.genes-validated.gff3" GFFPATTERN <- 'class_code "u"|class_code "x"|class_code "o"|class_code "i"' #RAVEN HISAT2_DIR <- "/home/shared/hisat2-2.2.1/" SAMTOOLS_DIR <- "/home/shared/samtools-1.12/" STRINGTIE_DIR <- "/home/shared/stringtie-2.2.1.Linux_x86_64" GFFCOMPARE_DIR <- "/home/shared/gffcompare-0.12.6.Linux_x86_64" BEDTOOLS_DIR <- "/home/shared/bedtools2/bin" CPC2_DIR <- "/home/shared/CPC2_standalone-1.0.1/bin" CONDA_PATH <- "/opt/anaconda/anaconda3/bin" GENOME_FASTA <- file.path(DATA_DIR, "genome.fasta") GENOME_GTF <- file.path(DATA_DIR, "genome.gtf") GENOME_GFF <- file.path(DATA_DIR, "genome.gff") FASTQ_DIR <- file.path(DATA_DIR, "fastq") GENOME_INDEX <- file.path(OUTPUT_DIR, "genome.index") # Export these as environment variables for bash chunks. Sys.setenv( THREADS = THREADS, DATA_DIR = DATA_DIR, FASTQ_SOURCE = FASTQ_SOURCE, FASTQ_SUFFIX = FASTQ_SUFFIX, OUTPUT_DIR = OUTPUT_DIR, GENOME_SOURCE = GENOME_SOURCE, GTF_SOURCE = GTF_SOURCE, GFF_SOURCE = GFF_SOURCE, HISAT2_DIR = HISAT2_DIR, SAMTOOLS_DIR = SAMTOOLS_DIR, STRINGTIE_DIR = STRINGTIE_DIR, GFFCOMPARE_DIR = GFFCOMPARE_DIR, BEDTOOLS_DIR = BEDTOOLS_DIR, CPC2_DIR = CPC2_DIR, CONDA_PATH = CONDA_PATH, GENOME_FASTA = GENOME_FASTA, GENOME_GTF = GENOME_GTF, GENOME_GFF = GENOME_GFF, FASTQ_DIR = FASTQ_DIR, GENOME_INDEX = GENOME_INDEX ) ``` ```{r mkdir, engine='bash'} mkdir -p "${DATA_DIR}" mkdir -p "${OUTPUT_DIR}" ``` # Download Genome and Reads for Hisat ```{r grab fastq, engine='bash'} wget -nv -r \ --no-directories --no-parent \ -P ${FASTQ_DIR} \ -A "*${FASTQ_SUFFIX}" ${FASTQ_SOURCE} ``` ```{r ls, engine='bash'} ls ${FASTQ_DIR} ``` ```{r fasta, engine='bash'} curl -o "${GENOME_FASTA}" "${GENOME_SOURCE}" ``` ```{r gtf, engine='bash'} curl -o "${GENOME_GTF}" "${GTF_SOURCE}" ``` ```{r gff, engine='bash'} curl -o "${GENOME_GFF}" "${GFF_SOURCE}" ``` ```{r file check, engine='bash'} output_fasta=$(head -1 "${GENOME_FASTA}") output_gff=$(head -2 "${GENOME_GFF}") output_gtf=$(head -1 "${GENOME_GTF}") if [[ "$output_fasta" == *html* || "$output_gff" == *html* || "$output_gtf" == *html* ]]; then echo "FAIL - FFS you downloaded a HTML not and genome feature file!" else echo "$output_fasta" echo "$output_gff" echo "$output_gtf" fi ``` # HISAT ```{r, engine='bash'} "${HISAT2_DIR}/hisat2_extract_exons.py" "${GENOME_GTF}" > "${OUTPUT_DIR}/exon.txt" "${HISAT2_DIR}/hisat2_extract_splice_sites.py" "${GENOME_GTF}" > "${OUTPUT_DIR}/splice_sites.txt" ``` ```{r, engine='bash'} "${HISAT2_DIR}hisat2-build" \ -p "${THREADS}" \ "${GENOME_FASTA}" \ "${GENOME_INDEX}" \ --exon "${OUTPUT_DIR}/exon.txt" \ --ss "${OUTPUT_DIR}/splice_sites.txt" \ 2> "${OUTPUT_DIR}/hisat2-build_stats.txt" ``` ```{bash} # Loop over every file ending in .fastq.gz that contains "_R2_" for r2 in "${FASTQ_DIR}"/*_R2_*."${FASTQ_SUFFIX}"; do # Get the basename (filename without path) base=$(basename "$r2") # Derive a sample name by taking everything before "_R2_" sample="${base%%_R2_*}" # Construct the corresponding R1 filename by replacing "_R2_" with "_R1_" r1="${r2/_R2_/_R1_}" # Define the output SAM file name using the sample name output="${OUTPUT_DIR}/${sample}.sam" # Run hisat2 with the paired-end files "${HISAT2_DIR}hisat2" \ -x "${GENOME_INDEX}" \ -p "${THREADS}" \ -1 "$r1" \ -2 "$r2" \ -S "$output" done ``` ## convert SAM to BAM ```{r convertSAM, engine='bash'} for samfile in "${OUTPUT_DIR}/${sample}"*.sam; do bamfile="${samfile%.sam}.bam" sorted_bamfile="${samfile%.sam}.sorted.bam" # Convert SAM to BAM "${SAMTOOLS_DIR}samtools" view -bS -@ "${THREADS}" "$samfile" > "$bamfile" # Sort BAM "${SAMTOOLS_DIR}samtools" sort -@ "${THREADS}" "$bamfile" -o "$sorted_bamfile" # Index sorted BAM "${SAMTOOLS_DIR}samtools" index -@ "${THREADS}" "$sorted_bamfile" done ``` # StringTie ```{r stringtie, engine='bash'} find "${OUTPUT_DIR}" -name "*sorted.bam" \ | xargs -n 1 basename -s .sorted.bam | xargs -I{} \ "${STRINGTIE_DIR}/stringtie" \ -p "${THREADS}" \ -G "${GENOME_GFF}" \ -o "${OUTPUT_DIR}/{}.gtf" \ "${OUTPUT_DIR}/{}.sorted.bam" ``` ```{r merge, engine='bash'} "${STRINGTIE_DIR}/stringtie" \ --merge \ -G "${GENOME_GFF}" \ -o "${OUTPUT_DIR}/stringtie_merged.gtf" \ "${OUTPUT_DIR}/"*.gtf ``` #GFFCOMPARE ```{r gffcomp, engine='bash'} "${GFFCOMPARE_DIR}/gffcompare" \ -r "${GENOME_GFF}" \ -o "${OUTPUT_DIR}/gffcompare_merged" \ "${OUTPUT_DIR}/stringtie_merged.gtf" ``` ```{r viewgff, engine='bash'} head -1 "${OUTPUT_DIR}"/gffcompare_merged* ``` ```{r gff filter, engine='bash'} awk '$3 == "transcript" && $1 !~ /^#/' "${OUTPUT_DIR}/gffcompare_merged.annotated.gtf" | \ grep -E "${GFFPATTERN}" | \ awk '($5 - $4 > 199) || ($4 - $5 > 199)' > "${OUTPUT_DIR}/lncRNA_candidates.gtf" ``` # Bedtools ```{r fasta, engine='bash'} "${BEDTOOLS_DIR}"/bedtools getfasta \ -fi "${GENOME_FASTA}" \ -bed "${OUTPUT_DIR}/lncRNA_candidates.gtf" \ -fo "${OUTPUT_DIR}/lncRNA_candidates.fasta" \ -name -split ``` ```{r, engine='bash', eval=TRUE, echo=FALSE} fgrep -c ">" ${OUTPUT_DIR}/lncRNA_candidates.fasta head ${OUTPUT_DIR}/lncRNA_candidates.fasta ``` #CPC2 ```{r cpc2, engine='bash'} eval "$(/opt/anaconda/anaconda3/bin/conda shell.bash hook)" python /home/shared/CPC2_standalone-1.0.1/bin/CPC2.py \ -i "${OUTPUT_DIR}/lncRNA_candidates.fasta" \ -o "${OUTPUT_DIR}/CPC2" ``` ```{r} install.packages("reticulate") library(reticulate) ``` ```{r} np <- import("numpy") print(np$`__version__`) # Get the file path of numpy's __init__.py (or similar) numpy_location <- np$`__file__` print(numpy_location) ``` #Filter ```{r filCPC, engine='bash'} awk '$8 == "noncoding" {print $1}' "${OUTPUT_DIR}/CPC2.txt" > "${OUTPUT_DIR}/noncoding_transcripts_ids.txt" ``` Subsetting fasta ```{r ssfasta, engine='bash'} "${SAMTOOLS_DIR}samtools" faidx "${OUTPUT_DIR}/lncRNA_candidates.fasta" \ -r "${OUTPUT_DIR}/noncoding_transcripts_ids.txt" \ > "${OUTPUT_DIR}/lncRNA.fasta" ``` ```{r, engine='bash'} head -2 "${OUTPUT_DIR}/lncRNA.fasta" fgrep -c ">" ${OUTPUT_DIR}/lncRNA.fasta ``` ```{r lncRNAgtf, engine='bash'} # Define input and output file paths using the OUTPUT_DIR variable input="${OUTPUT_DIR}/noncoding_transcripts_ids.txt" output="${OUTPUT_DIR}/lncRNA.bed" # Process each line of the input file while IFS= read -r line; do # Remove "transcript::" from the line line="${line//transcript::/}" # Split the line by ':' to get the chromosome and position string IFS=':' read -r chromosome pos <<< "$line" # Split the position string by '-' to separate start and end positions IFS='-' read -r start end <<< "$pos" # Convert the start position to 0-based by subtracting 1 start=$((start - 1)) # Write the chromosome, updated start, and end positions to the output file (tab-separated) printf "%s\t%s\t%s\n" "$chromosome" "$start" "$end" done < "$input" > "$output" ``` ```{r renamegtf, engine='bash'} awk 'BEGIN{OFS="\t"; count=1} {printf "%s\t.\tlncRNA\t%d\t%d\t.\t+\t.\tgene_id \"lncRNA_%03d\";\n", $1, $2, $3, count++;}' "${OUTPUT_DIR}/lncRNA.bed" \ > "${OUTPUT_DIR}/lncRNA.gtf" ``` # Summary Table ```{r summary, engine='bash'} tf_file="${OUTPUT_DIR}/lncRNA.gtf" awk ' BEGIN { total_entries = 0; min_length = 1e9; max_length = 0; sum_length = 0; } # Skip comment lines /^#/ { next } { if (NF < 9) next; total_entries++; start = $4; end = $5; gene_length = end - start + 1; if (gene_length < min_length) min_length = gene_length; if (gene_length > max_length) max_length = gene_length; sum_length += gene_length; feature[$3]++; chrom[$1]++; # Use two-argument match() and then extract the gene_id manually. if (match($9, /gene_id "[^"]+"/)) { gene_str = substr($9, RSTART, RLENGTH); # Remove the "gene_id " prefix and the quotes. gsub(/gene_id "/, "", gene_str); gsub(/"/, "", gene_str); genes[gene_str] = 1; } } END { avg_length = (total_entries > 0) ? sum_length / total_entries : 0; unique_gene_count = 0; for (g in genes) unique_gene_count++; print "Basic GTF File Statistics:"; print "--------------------------"; print "Total entries: " total_entries; print "Unique genes: " unique_gene_count; print "Min gene length: " min_length; print "Max gene length: " max_length; printf("Average gene length: %.2f\n", avg_length); print "\nFeature counts:"; for (f in feature) { print " " f ": " feature[f]; } print "\nChromosome counts:"; for (c in chrom) { print " " c ": " chrom[c]; } } ' "$tf_file" ``` ```{python} # Open the input file and the output file with open('../output/01.62-lncRNA-pipeline/noncoding_transcripts_ids.txt', 'r') as infile, open('../output/01.62-lncRNA-pipeline/Ptuh_lncRNA.bed', 'w') as outfile: # Process each line in the input file for line in infile: # Remove 'transcript::' and then split the line by ':' to extract the relevant parts parts = line.strip().replace('transcript::', '').split(':') chromosome = parts[0] # Split the position part by '-' to get start and end positions start, end = parts[1].split('-') # BED format requires the start position to be 0-based # Convert the start position to 0-based by subtracting 1 start = str(int(start) - 1) # Write the chromosome, start, and end positions to the output file # Separate each field with a tab character outfile.write(f'{chromosome}\t{start}\t{end}\n') # After running this script, 'output.bed' will contain the converted data in BED format. ``` Skip to expression matrix step ```{r, engine='bash'} awk 'BEGIN{OFS="\t"; count=1} {printf "%s\t.\tlncRNA\t%d\t%d\t.\t+\t.\tgene_id \"lncRNA_%03d\";\n", $1, $2, $3, count++;}' ../output/01.6-lncRNA-pipeline/Apul_lncRNA.bed \ > ../output/01.6-lncRNA-pipeline/Apul_lncRNAs.gtf ``` # Expression Matrix ```{bash} awk 'BEGIN{OFS="\t"} $2>=0 && $3>$2 {print $1,"converted","lncRNA",$2+1,$3,".","+",".","gene_id \"lncRNA_"NR"\";"}' \ ../output/01.61-lncRNA-pipeline/Peve_lncRNA.bed \ > ../output/01.61-lncRNA-pipeline/Peve_lncRNA_fixed.gtf ``` ```{r, engine='bash'} /home/shared/subread-2.0.5-Linux-x86_64/bin/featureCounts \ -T 42 \ -a ../output/01.61-lncRNA-pipeline/Peve_lncRNA_fixed.gtf \ -o ../output/01.61-lncRNA-pipeline/Peve_counts.txt \ -t lncRNA \ -g gene_id \ -p \ ../output/01.61-lncRNA-pipeline/*sorted.bam ```