# Required Programs fastqc multiqc fastp samtools hisat2 stringtie # RNASeq Files #### File location PolyA ##### P. acuta ```/RAID_STORAGE2/hputnam/20191104_HoloInt/30-274849628``` 1041_R1_001.fastq.gz 1041_R2_001.fastq.gz 1471_R1_001.fastq.gz 1471_R2_001.fastq.gz 1637_R1_001.fastq.gz 1637_R2_001.fastq.gz ##### M. capitata ```/RAID_STORAGE2/hputnam/20200404_HoloInt_Batch3``` 1101_R1_001.fastq.gz 1101_R2_001.fastq.gz 1548_R1_001.fastq.gz 1548_R2_001.fastq.gz 1628_R1_001.fastq.gz 1628_R2_001.fastq.gz #### Sample Plug IDs PA_1041 PA_1471 PA_1637 MC_1101 MC_1548 MC_1628 # Main Folder ### Obtain Mcapitata Genome and GFF `wget http://cyanophora.rutgers.edu/montipora/Mcap.genome_assembly.fa.gz` `wget http://cyanophora.rutgers.edu/montipora/Mcap.GFFannotation.gff` ### Use script to fix the gff format issues introduced ### from the combination of AUGUSTUS and GeMoMa genes Fix_GeMoMa.R #Load Mcap gene gff Mcap.gff <- read.csv(file="genome-feature-files/Mcap.GFFannotation.gff.1", header=FALSE, sep="\t") #rename columns colnames(Mcap.gff) <- c("scaffold", "Gene.Predict", "id", "gene.start","gene.stop", "pos1", "pos2","pos3", "gene") Mcap.gff$gene <- gsub("_cds[0123456789]", ".t1", Mcap.gff$gene) #change the second part of the GeMoMa genes from cds to t.1 to match Augustus Mcap.gff$gene <-sub("(;[^;]+);.*", "\\1", Mcap.gff$gene) #remove everything after the second ; in the gene column Mcap.gff$gene <- gsub("Parent=", " gene_id ", Mcap.gff$gene) #remove ID= from GeMoMa genes #If id ==CDS replace ID= with transcript_id, else replace with nothing Mcap.gff <- Mcap.gff %>% mutate(gene = ifelse(Gene.Predict %in% c("GeMoMa") & id == "CDS" , gsub("ID=", "transcript_id ", gene), gsub("ID=", "", gene))) #If id ==gene remove everything after the ; else replace with nothing Mcap.gff <- Mcap.gff %>% mutate(gene = ifelse(Gene.Predict %in% c("GeMoMa") & id == "gene" , gsub(";.*", "", gene), gsub("ID=", "", gene))) # sub to add quotes around the transcript name Mcap.gff$gene <- gsub("transcript_id ", "transcript_id \"", Mcap.gff$gene) Mcap.gff$gene <- gsub(";", "\";", Mcap.gff$gene) #add quotes before the gene_id Mcap.gff$gene <- gsub("gene_id ", "gene_id \"", Mcap.gff$gene) #If id ==CDS add "; at the end else replace with nothing Mcap.gff <- Mcap.gff %>% mutate(gene = ifelse(id == "CDS" , paste0(gene, "\";"), paste0(gene, ""))) #save file write.table(Mcap.gff, file="/Users/hputnam/MyProjects/Meth_Compare/genome-feature-files/Mcap.GFFannotation.fixed.gff", sep="\t", col.names = FALSE, row.names=FALSE, quote=FALSE) ### Obtain Pacuta Genome and GFF `scp -P 2292 /Users/hputnam/Downloads/Pacuta_genome/Pocillopora_acuta_genome_v1.fasta hputnam@kitt.uri.edu:/home/hputnam/Meth_Compare_RNASeq` `scp -P 2292 /Users/hputnam/Downloads/Pacuta_genome/Structural_annotation_abintio.gff hputnam@kitt.uri.edu:/home/hputnam/Meth_Compare_RNASeq` `mv Structural_annotation_abintio.gff Pact_Structural_annotation_abintio.gff` `sed 's/cov/_cov/' Pact_Structural_annotation_abintio.gff > Pact.GFFannotation.gff` # Link Raw from RAID2 #### File location PolyA ```/RAID_STORAGE2/hputnam/20191104_HoloInt/30-274849628``` # QC raw files `fastqc *.fastq.gz` `multiqc .` # Trimming and QC "1041" "1101" "1471" "1548" "1628" "1637" ``` sh -c 'for file in "1041" "1101" "1471" "1548" "1628" "1637" do fastp \ --in1 ${file}_R1_001.fastq.gz \ --in2 ${file}_R2_001.fastq.gz \ --out1 ${file}_R1_clean.fastq.gz \ --out2 ${file}_R2_clean.fastq.gz \ --failed_out cleaned_reads/${file}_failed.txt \ --qualified_quality_phred 20 \ --unqualified_percent_limit 10 \ --length_required 50 detect_adapter_for_pe \ --cut_right cut_right_window_size 5 cut_right_mean_quality 20 done' ``` # QC trimmed files fastqc 1637*clean.fastq.gz multiqc . # Download and view files scp -r -P 2292 hputnam@kitt.uri.edu:/home/hputnam/Meth_Compare_RNASeq/multiqc_report.html /Users/hputnam/MyProjects/Meth_Compare/RNASeq/PolyA # Align Reads to Reference Genome *HISAT2 is a fast and sensitive alignment program for mapping next-generation DNA and RNA sequencing reads to a reference genome.* ## Index the reference genome Index the reference genome in the reference directory. ++HISAT2-build Alignment Arguments Used++: - - name of reference files - - basename of index files to write - -f - reference file is a FASTA file ``` hisat2-build Mcap.genome_assembly.fa Mcap_ref hisat2-build Pocillopora_acuta_genome_v1.fasta Pact_ref ``` # Alignment of clean reads to the reference genome ### Aligning paired end reads rna strandedness - RF for Illumina https://github.com/griffithlab/rnaseq_tutorial/blob/master/manuscript/supplementary_tables/supplementary_table_5.md ## Mcap Mapping for PolyA "1101" "1548" "1628" ` sh -c 'for i in "1101" do hisat2 -p 8 --dta -q -x Mcap_ref \ --rna-strandness RF \ -1 ${i}_R1_clean.fastq.gz \ -2 ${i}_R2_clean.fastq.gz -S ${i}.sam done' ` ## Pact Mapping for PolyA "1041" "1471" "1637" ` sh -c 'for i in "1637" do hisat2 -p 8 --dta -q -x Pact_ref \ --rna-strandness RF \ -1 ${i}_R1_clean.fastq.gz \ -2 ${i}_R2_clean.fastq.gz -S ${i}.sam done' ` # Convert and sort Sam to BAM ## RiboDep ## PolyA "1041" "1471" "1637" "1101" "1548" "1628" ` sh -c 'for i in "1637" do samtools sort -@ 8 -o ${i}.bam ${i}.sam done' ` # Remove Sam ` rm *.sam ` # Assemble aligned reads and quantify transcripts --- *StringTie is a fast and highly efficient assembler of RNA-Seq alignments into potential transcripts.* - Merged-GTF guided assembly without novel transcript discovery - Compilation of GTF-files into gene and transcript count matrices For our assembly, we will have to run StringTie. This run is necessary in order to compile our GTF files into gene count matrices that we will need for our differential expression analysis, because the StringTie script that compiles the GTF files ```prepDE.py``` only runs if the ```-e``` option is "on" during the previous assembly. # Reference-guided assembly to genes only ++StringTie Arguments Used++: - -p - Specify number of processers - -G - Specify annotation file - -o - Name of output file - -e - only estimate the abundance of given reference transcripts (requires -G) #Mcap PolyA Ignore Novel ` stringtie 1101.bam -p 8 -A 1101_gene_abund.tab -e -G Mcap.GFFannotation.fixed.gff -o 1101.gtf stringtie 1548.bam -p 8 -A 1548_gene_abund.tab -e -G Mcap.GFFannotation.fixed.gff -o 1548.gtf stringtie 1628.bam -p 8 -A 1628_gene_abund.tab -e -G Mcap.GFFannotation.fixed.gff -o 1628.gtf ` #Pact PolyA Ignore Novel ` stringtie 1041.bam -p 8 -A 1041_gene_abund.tab -e -G Pact.GFFannotation.gff -o 1041.gtf stringtie 1471.bam -p 8 -A 1471_gene_abund.tab -e -G Pact.GFFannotation.gff -o 1471.gtf stringtie 1637.bam -p 8 -A 1637_gene_abund.tab -e -G Pact.GFFannotation.gff -o 1637.gtf ` #### Compilation of GTF-files into gene and transcript count matrices The StringTie program includes a script, ```prepDE.py``` that compiles your assembly files into gene and transcript count matrices. This script requires as input the list of sample names and their full file paths, (sample_list.txt). . Run ```prepDE.py``` to merge assembled files together into a DESeq2-friendly version. ++StringTie prepDE.py Arguments Used++: - -i - Specify that input is a TXT file - -g - Require output gene count file, default name is ```gene_count_matrix.csv``` ``` # Mcap PolyA nano Mcap_sample_list.txt 1101 1101.gtf 1548 1548.gtf 1628 1628.gtf ` prepDE.py -i Mcap_sample_list.txt -g Mcap_gene_count_matrix.csv ` # Pact PolyA nano Pact_sample_list.txt 1041 1041.gtf 1471 1471.gtf 1637 1637.gtf ` prepDE.py -i Pact_sample_list.txt -g Pact_gene_count_matrix.csv ` #download mapping and counts matrices scp -r -P 2292 hputnam@kitt.uri.edu:/home/hputnam/Meth_Compare_RNASeq/*.gtf /Users/hputnam/MyProjects/Meth_Compare/RNASeq/PolyA scp -r -P 2292 hputnam@kitt.uri.edu:/home/hputnam/Meth_Compare_RNASeq/*.csv /Users/hputnam/MyProjects/Meth_Compare/RNASeq/PolyA scp -r -P 2292 hputnam@kitt.uri.edu:/home/hputnam/Meth_Compare_RNASeq/*gene_abund.tab /Users/hputnam/MyProjects/Meth_Compare/RNASeq/PolyA