---
title: "lncRNA Comparison"
author: Steven Roberts
date: "`r format(Sys.time(), '%d %B, %Y')`"
output:
  github_document:
    toc: true
    toc_depth: 3
    number_sections: true
    html_preview: true
  html_document:
    theme: readable
    highlight: zenburn
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
---

```{r setup, include=FALSE}
library(knitr)
library(tidyverse)
library(kableExtra)
library(DESeq2)
library(pheatmap)
library(RColorBrewer)
library(data.table)
library(DT)
library(formattable)
library(Biostrings)
library(spaa)
library(tm)
knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks
  eval = FALSE,        # Do NOT evaluate chunks by default; chunks opt in with eval=TRUE
  warning = FALSE,     # Hide warnings
  message = FALSE,     # Hide messages
  fig.width = 6,       # Set plot width in inches
  fig.height = 4,      # Set plot height in inches
  fig.align = "center" # Align plots to the center
)
```

## grabbing 3 fastas..

```
../../D-Apul/output/05.33-lncRNA-discovery/Apul_lncRNA_candidates.fasta
../../E-Peve/output/05-lncRNA-discovery/Peve_lncRNA_candidates.fasta
../../F-Pmea/output/02-lncRNA-discovery/Pmea_lncRNA_candidates.fasta
```

```{r, engine='bash', eval=TRUE}
head ../../D-Apul/output/05.33-lncRNA-discovery/Apul_lncRNA_candidates.fasta
```

```{r, engine='bash', eval=TRUE}
head ../../E-Peve/output/05-lncRNA-discovery/Peve_lncRNA_candidates.fasta
```

```{r, engine='bash', eval=TRUE}
head ../../F-Pmea/output/02-lncRNA-discovery/Pmea_lncRNA_candidates.fasta
```

## length distribution

```{r, eval=TRUE}
# Draw a histogram of sequence lengths.
#
# lengths_df: data frame with a numeric `Length` column (one row per sequence).
# label:      text shown as the plot subtitle so the per-species plots can be
#             told apart.
# Returns the ggplot object (printed automatically when called at top level).
plot_length_histogram <- function(lengths_df, label) {
  ggplot(lengths_df, aes(x = Length)) +
    geom_histogram(binwidth = 1, color = "grey", fill = "blue", alpha = 0.75) +
    labs(
      title = "Histogram of Sequence Lengths",
      subtitle = label,
      x = "Sequence Length",
      y = "Frequency"
    ) +
    theme_minimal()
}
```

```{r, eval=TRUE}
# Apul: read the candidate FASTA and collect one length per sequence
fasta_file <- "../../D-Apul/output/05.33-lncRNA-discovery/Apul_lncRNA_candidates.fasta"
sequences <- readDNAStringSet(fasta_file)
sequence_lengths <- width(sequences)
sequence_lengths_df <- data.frame(Length = sequence_lengths)

plot_length_histogram(sequence_lengths_df, "Apul")
```

```{r, eval=TRUE}
# Peve: read the candidate FASTA and collect one length per sequence
fasta_file <- "../../E-Peve/output/05-lncRNA-discovery/Peve_lncRNA_candidates.fasta"
sequences <- readDNAStringSet(fasta_file)
sequence_lengths <- width(sequences)
sequence_lengths_df <- data.frame(Length = sequence_lengths)

plot_length_histogram(sequence_lengths_df, "Peve")
```

```{r, eval=TRUE}
# Pmea: read the candidate FASTA and collect one length per sequence
fasta_file <- "../../F-Pmea/output/02-lncRNA-discovery/Pmea_lncRNA_candidates.fasta"
sequences <- readDNAStringSet(fasta_file)
sequence_lengths <- width(sequences)
sequence_lengths_df <- data.frame(Length = sequence_lengths)

plot_length_histogram(sequence_lengths_df, "Pmea")
```

## counts

```{r, engine='bash', eval=TRUE}
# Count FASTA records: one header line starting with ">" per sequence.
# (grep -c replaces the obsolescent fgrep.)
grep -c "^>" ../../D-Apul/output/05.33-lncRNA-discovery/Apul_lncRNA_candidates.fasta
grep -c "^>" ../../E-Peve/output/05-lncRNA-discovery/Peve_lncRNA_candidates.fasta
grep -c "^>" ../../F-Pmea/output/02-lncRNA-discovery/Pmea_lncRNA_candidates.fasta
```

## Peve Count matrix

### avg expression distribution

https://raw.githubusercontent.com/zbengt/coral-lncRNA/main/ouput/peve_lncRNA.isoform.counts.matrix

```{r, engine='bash'}
# --fail makes curl exit non-zero on an HTTP error instead of saving the
# error page as if it were the count matrix.
mkdir -p ../data
cd ../data || exit 1
curl --fail -O https://raw.githubusercontent.com/zbengt/coral-lncRNA/main/ouput/peve_lncRNA.isoform.counts.matrix
```

```{r, eval=TRUE}
pevect <- read.csv("../data/peve_lncRNA.isoform.counts.matrix", sep = '\t')
```

```{r, eval=TRUE}
# Mean of columns 2-5 for every row, computed in one vectorised call.
# NOTE(review): assumes columns 2:5 are the numeric sample columns - confirm
# against the matrix header.
pevect %>%
  mutate(avg = rowMeans(pevect[, 2:5])) %>%
  # Restrict to the 0-100 window explicitly. Using xlim() here would set scale
  # limits, which silently discards the histogram bars that touch the limits
  # (including the bar holding the zero / low-expression rows).
  filter(avg <= 100) %>%
  ggplot(aes(x = avg)) +
  geom_histogram(
    binwidth = 1,
    boundary = 0,  # bins are [0,1], (1,2], ... so none straddles the axis edge
    fill = "blue",
    color = "white",
    alpha = 0.7
  ) +
  coord_cartesian(xlim = c(0, 100)) +
  labs(title = "Histogram of Average Column",
       x = "Average Expression Value",
       y = "Frequency") +
  theme_minimal()
```

## blast comparison

```{r, engine='bash'}
mkdir -p ../data/blast
/home/shared/ncbi-blast-2.11.0+/bin/makeblastdb \
-in ../data/apul_bedtools_lncRNAs.fasta \
-dbtype nucl \
-out ../data/blast/apul_bedtools_lncRNAs
```

```{r, engine='bash'}
mkdir -p ../data/blast
/home/shared/ncbi-blast-2.11.0+/bin/makeblastdb \
-in ../data/peve_bedtools_lncRNAs.fasta \
-dbtype nucl \
-out ../data/blast/peve_bedtools_lncRNAs
```

```{r, engine='bash'}
mkdir -p ../data/blast
/home/shared/ncbi-blast-2.11.0+/bin/makeblastdb \
-in ../data/pmea_bedtools_lncRNAs.fasta \
-dbtype nucl \
-out ../data/blast/pmea_bedtools_lncRNAs
```

apul_bedtools_lncRNAs
peve_bedtools_lncRNAs
pmea_bedtools_lncRNAs

```{r, engine='bash'}
# Apul queries against the Peve database; tabular output (-outfmt 6).
# NOTE(review): -max_target_seqs 1 may not guarantee the single best hit -
# confirm against the BLAST+ documentation if "best hit" semantics matter.
mkdir -p ../output
/home/shared/ncbi-blast-2.11.0+/bin/blastn \
-task blastn \
-query ../data/apul_bedtools_lncRNAs.fasta \
-db ../data/blast/peve_bedtools_lncRNAs \
-out ../output/apul_peve_blastn.tab \
-evalue 1E-40 \
-num_threads 40 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6

wc -l ../output/apul_peve_blastn.tab
```

```{r, engine='bash'}
# Apul queries against the Pmea database; tabular output (-outfmt 6).
mkdir -p ../output
/home/shared/ncbi-blast-2.11.0+/bin/blastn \
-task blastn \
-query ../data/apul_bedtools_lncRNAs.fasta \
-db ../data/blast/pmea_bedtools_lncRNAs \
-out ../output/apul_pmea_blastn.tab \
-evalue 1E-40 \
-num_threads 40 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6

wc -l ../output/apul_pmea_blastn.tab
```

```{r, engine='bash'}
# Peve queries against the Pmea database; tabular output (-outfmt 6).
mkdir -p ../output
/home/shared/ncbi-blast-2.11.0+/bin/blastn \
-task blastn \
-query ../data/peve_bedtools_lncRNAs.fasta \
-db ../data/blast/pmea_bedtools_lncRNAs \
-out ../output/peve_pmea_blastn.tab \
-evalue 1E-40 \
-num_threads 40 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6

wc -l ../output/peve_pmea_blastn.tab
```

```{r, engine='bash', eval=TRUE}
wc -l ../output/*tab
```