--- title: "18-Apul-piRNA-proTRAC" author: "Javier Rodriguez-Casariego" date: "`r Sys.Date()`" output: html_document --- ```{r, engine='bash'} # 1) Remove redundant reads from trimmed fastqs, keep reads between 25 and 35 nt (filtering out siRNA and mature miRNA), remove low complexity reads and change the format for small RNA alignment for f in /scratch/jeirinlo/jrodr979/sncRNA_E5_corals/20240513_PIWI_pipeline/APUL/*.fq do perl NGSToolbox/TBr2_collapse.pl -i ${f} -o ${f}.collapsed perl NGSToolbox/TBr2_length-filter.pl -i ${f}.collapsed -o ${f}.collapsed.filt -min 25 -max 35 perl NGSToolbox/TBr2_duster.pl -i ${f}.collapsed.filt done ``` ```{r, engine='bash'} # concatenate reads in a single file cat /scratch/jeirinlo/jrodr979/sncRNA_E5_corals/20240513_PIWI_pipeline/APUL/*.collapsed.filt.no-dust > /scratch/jeirinlo/jrodr979/sncRNA_E5_corals/20240513_PIWI_pipeline/APUL_filt_merged.fq ``` ```{r, engine='bash'} # Filter annotated rRNA, tRNA, snRNAs, miRNAs, and snoRNAs reads (based in annotations on gff file non of these are piwi or related. ## build a reference dataset of annotated non-codingRNAs grep $'\tsnoRNA\t' GCF_013753865.1_Amil_v2.1_genomic.gff > AMIL.GFFannotation.snoRNA.gff grep $'\tsnRNA\t' GCF_013753865.1_Amil_v2.1_genomic.gff > AMIL.GFFannotation.snRNA.gff grep $'\ttRNA\t' GCF_013753865.1_Amil_v2.1_genomic.gff > AMIL.GFFannotation.tRNA.gff grep $'\trRNA\t' GCF_013753865.1_Amil_v2.1_genomic.gff > AMIL.GFFannotation.rRNA.gff bedtools getfasta -fi GCF_013753865.1_Amil_v2.1_genomic.fna -bed AMIL.GFFannotation.snoRNA.gff -fo AMIL_snoRNA.fasta bedtools getfasta -fi GCF_013753865.1_Amil_v2.1_genomic.fna -bed AMIL.GFFannotation.snRNA.gff -fo AMIL_snRNA.fasta bedtools getfasta -fi GCF_013753865.1_Amil_v2.1_genomic.fna -bed AMIL.GFFannotation.tRNA.gff -fo AMIL_tRNA.fasta bedtools getfasta -fi GCF_013753865.1_Amil_v2.1_genomic.fna -bed AMIL.GFFannotation.rRNA.gff -fo AMIL_rRNA.fasta grep -v '^#' APUL_ShortStackOut.gff3.gff3 | cut -s -f 3 | sort | uniq -c | sort -rn grep $'\tsiRNA' APUL_ShortStackOut.gff3 > APUL.GFFannotation.siRNA.gff grep $'\tMIRNA_hairpin\t' APUL_ShortStackOut.gff3 > APUL.GFFannotation.precursor_miRNA.gff bedtools getfasta -fi GCF_013753865.1_Amil_v2.1_genomic.fna -bed APUL.GFFannotation.siRNA.gff -fo APUL_siRNA.fasta bedtools getfasta -fi GCF_013753865.1_Amil_v2.1_genomic.fna -bed APUL.GFFannotation.precursor_miRNA.gff -fo APUL_precursor_miRNA.fasta cat *fasta > APUL_otherSmallRNAs.fasta ``` ```{r, engine='bash'} ## use sortmerna to clear reads that match the fasta sortmerna \ --ref /scratch/jeirinlo/jrodr979/sncRNA_E5_corals/genomes/APUL/APUL_otherSmallRNAs.filt.fasta \ --reads /scratch/jeirinlo/jrodr979/sncRNA_E5_corals/20240513_PIWI_pipeline/APUL_filt_merged.fq \ --workdir /scratch/jeirinlo/jrodr979/sncRNA_E5_corals/20240514_sortMerna/APUL/ \ --fastx \ --other # about 1-3% of reads mapped to known small RNAs. Move unmapped reads to work folder. ``` ```{r, engine='bash'} # align "clean" reads to the genome cp 20240514_sortMerna/APUL/other.fq 20240513_PIWI_pipeline/APUL_merged_preproc # simplify fasta headers on genome to include only the scaffold or chr name. proTRAC dont go well with long headers sed 's/^>\([^ ]*\).*/>\1/' GCF_013753865.1_Amil_v2.1_genomic.fna > GCF_013753865.1_Amil_v2.1_genomic.fna perl code/NGSToolbox/sRNAmapper.pl \ -input 20240513_PIWI_pipeline/APUL_merged_preproc \ -genome /scratch/jeirinlo/jrodr979/sncRNA_E5_corals/genomes/APUL/GCF_013753865.1_Amil_v2.1_genomic.fna \ -alignments best ``` ```{r, engine='bash'} # define origin for multiple mapping reads using reallocate perl code/NGSToolbox/reallocate.pl APUL_merged_preproc.map 10000 1000 b 0 ``` ```{r, engine='bash'} # run proTRAC using the gene tracks and the RepeatMasker output perl ../code/NGSToolbox/proTRAC_2.4.2.pl \ -map APUL_merged_preproc.map.weighted-10000-1000-b-0 \ -genome ../genomes/APUL/GCF_013753865.1_Amil_v2.1_genomic.fna \ -repeatmasker ../genomes/APUL/GCF_013753865.1_Amil_v2.1_genomic.fna.out \ -geneset ../genomes/APUL/genomic.gtf # Output folder is proTRAC_APUL_merged_preproc.map.weighted-10000-1000-b-0_2024y5m20d9h58m12s ``` ```{r, engine='bash'} for f in *.no-dust do mkdir /scratch/jeirinlo/javirodr/sncRNA_E5_corals/20240617_ProTRAC_individual_samples/APUL/sortmerna/${f/-S1-TP2-fastp-adapters-polyG-31bp-merged.fq.collapsed.filt.no-dust} sortmerna \ --ref /scratch/jeirinlo/javirodr/sncRNA_E5_corals/genomes/APUL/APUL_otherSmallRNAs.fasta \ --reads /scratch/jeirinlo/javirodr/sncRNA_E5_corals/20240617_ProTRAC_individual_samples/APUL/${f} \ --workdir /scratch/jeirinlo/javirodr/sncRNA_E5_corals/20240617_ProTRAC_individual_samples/APUL/sortmerna/${f/-S1-TP2-fastp-adapters-polyG-31bp-merged.fq.collapsed.filt.no-dust} \ --fastx \ --other done for f in *preproc do perl /scratch/jeirinlo/javirodr/sncRNA_E5_corals/code/NGSToolbox/sRNAmapper.pl \ -input ${f} \ -genome /scratch/jeirinlo/javirodr/sncRNA_E5_corals/genomes/APUL/GCF_013753865.1_Amil_v2.1_genomic.fna \ -alignments best done for f in *map do perl /scratch/jeirinlo/javirodr/sncRNA_E5_corals/code/NGSToolbox/reallocate.pl ${f} 10000 1000 b 0 done for f in *.weighted-10000-1000-b-0 do perl /scratch/jeirinlo/javirodr/sncRNA_E5_corals/code/NGSToolbox/proTRAC_2.4.4.pl \ -map ${f} \ -genome /scratch/jeirinlo/javirodr/sncRNA_E5_corals/genomes/APUL/GCF_013753865.1_Amil_v2.1_genomic.fna \ -repeatmasker /scratch/jeirinlo/javirodr/sncRNA_E5_corals/genomes/APUL/GCF_013753865.1_Amil_v2.1_genomic.fna.out \ -geneset /scratch/jeirinlo/javirodr/sncRNA_E5_corals/genomes/APUL/genomic.gtf done # Parse proTRAC output into a 5-bedfile with cluster coordinates, a summary file with general stats, and a .tab file with other cluster characteristics # use the parse_proTRAC.pl script developed by https://github.com/agata-sm/ # create a results folder and move result.table there mkdir proTRAC_results_APUL for f in proTRAC_sRNA-ACR-1* do cp ${f}/results.table proTRAC_results_APUL/results.table.${f/_preproc.map.weighted-10000-100*} done for f in proTRAC_results_APUL/results.table* do perl /scratch/jeirinlo/javirodr/sncRNA_E5_corals/code/parse_proTRAC.pl \ --infile ${f} \ --outdir proTRAC_results_APUL/ done # Merge clusters across samples. To get a good representation of clusters I selected clusters which are present in at least two replicates, and the overlap is by at least 50% of the cluster length and is reciprocal (i.e. A overlaps 50% of B and B overlaps 50% of A): # Using bedtools first: FILES=(proTRAC_results_APUL/*.proTRAC.bed) NUM_FILES=${#FILES[@]} # Loop over each file for (( i=0; i<$NUM_FILES; i++ )); do for (( j=i+1; j<$NUM_FILES; j++ )); do FILE1=${FILES[$i]} FILE2=${FILES[$j]} intersectBed -a $FILE1 -b $FILE2 -f 0.5 -r > proTRAC_results_APUL/${i}_${j}_merged.bed done done bedops -m *_merged.bed > APUL.merged.clusters.bed # Run ping pong ID on all mapped reads for f in *.weighted-10000-1000-b-0 do perl /scratch/jeirinlo/javirodr/sncRNA_E5_corals/code/NGSToolbox/TBr2_pingpong.pl \ -i ${f} \ -o ${f}.pp done ```