curl https://gannet.fish.washington.edu/seashell/snaps/Gadus_macrocephalus.coding.gene.V1.cds \
\
-k > ../data/Gadus_macrocephalus.coding.gene.V1.cds
Exploring what fasta file
head -3 ../data/Gadus_macrocephalus.coding.gene.V1.cds
## >Gma_1G0000010.1 locus=chr1:81612:97483:+ len:2343
## ATGCCTGTGAACGCGCGGGACCGGACAGTGCTGGGGCGTTTCCCCGGGGTCACGCTGGAA
## CCGGTGGAGGAGGAGGTGGAGGAGGAGGAGGAGGTGGAAGAGGACCAGGTGGAGCGAGGC
echo "How many sequences are there?"
grep -c ">" ../data/Gadus_macrocephalus.coding.gene.V1.cds
## How many sequences are there?
## 23843
# Read FASTA file
<- "../data/Gadus_macrocephalus.coding.gene.V1.cds" # Replace with the name of your FASTA file
fasta_file <- readDNAStringSet(fasta_file)
sequences
# Calculate sequence lengths
<- width(sequences)
sequence_lengths
# Create a data frame
<- data.frame(Length = sequence_lengths)
sequence_lengths_df
# Plot histogram using ggplot2
ggplot(sequence_lengths_df, aes(x = Length)) +
geom_histogram(binwidth = 1, color = "grey", fill = "blue", alpha = 0.75) +
labs(title = "Histogram of Sequence Lengths",
x = "Sequence Length",
y = "Frequency") +
theme_minimal()
# Read FASTA file
<- "../data/Gadus_macrocephalus.coding.gene.V1.cds"
fasta_file <- readDNAStringSet(fasta_file)
sequences
# Calculate base composition
<- alphabetFrequency(sequences, baseOnly = TRUE)
base_composition
# Convert to data frame and reshape for ggplot2
<- as.data.frame(base_composition)
base_composition_df $ID <- rownames(base_composition_df)
base_composition_df<- reshape2::melt(base_composition_df, id.vars = "ID", variable.name = "Base", value.name = "Count")
base_composition_melted
# Plot base composition bar chart using ggplot2
ggplot(base_composition_melted, aes(x = Base, y = Count, fill = Base)) +
geom_bar(stat = "identity", position = "dodge", color = "black") +
labs(title = "Base Composition",
x = "Base",
y = "Count") +
theme_minimal() +
scale_fill_manual(values = c("A" = "green", "C" = "blue", "G" = "yellow", "T" = "red"))
# Read FASTA file
<- "../data/Gadus_macrocephalus.coding.gene.V1.cds"
fasta_file <- readDNAStringSet(fasta_file)
sequences
# Count CG motifs in each sequence
<- function(sequence) {
count_cg_motifs <- "CG"
cg_motif return(length(gregexpr(cg_motif, sequence, fixed = TRUE)[[1]]))
}
<- sapply(sequences, count_cg_motifs)
cg_motifs_counts
# Create a data frame
<- data.frame(CG_Count = cg_motifs_counts)
cg_motifs_counts_df
# Plot CG motifs distribution using ggplot2
ggplot(cg_motifs_counts_df, aes(x = CG_Count)) +
geom_histogram(binwidth = 1, color = "black", fill = "blue", alpha = 0.75) +
labs(title = "Distribution of CG Motifs",
x = "Number of CG Motifs",
y = "Frequency") +
theme_minimal()
This is from here picur reviewe sequences I named based on the identify of the database given
cd ../data
curl -O https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz
mv uniprot_sprot.fasta.gz uniprot_sprot_r2023_04.fasta.gz
gunzip -k uniprot_sprot_r2023_04.fasta.gz
mkdir ../blastdb
/home/shared/ncbi-blast-2.11.0+/bin/makeblastdb \
\
-in ../data/uniprot_sprot_r2023_04.fasta \
-dbtype prot -out ../blastdb/uniprot_sprot_r2023_04
/home/shared/ncbi-blast-2.11.0+/bin/blastx \
\
-query ../data/Gadus_macrocephalus.coding.gene.V1.cds \
-db ../blastdb/uniprot_sprot_r2023_04 \
-out ../output/03-transcriptome-annotation/Gm.cds-uniprot_blastx.tab \
-evalue 1E-20 \
-num_threads 20 \
-max_target_seqs 1 -outfmt 6
head -2 ../output/03-transcriptome-annotation/Gm.cds-uniprot_blastx.tab
## Gma_1G0000010.1 sp|P22735|TGM1_HUMAN 50.659 683 318 7 328 2334 109 786 0.0 688
## Gma_1G0000020.1 sp|Q9JI35|HRH3_CAVPO 54.684 395 160 4 136 1266 50 443 1.98e-140 411
echo "Number of lines in output"
wc -l ../output/03-transcriptome-annotation/Gm.cds-uniprot_blastx.tab
## Number of lines in output
## 22481 ../output/03-transcriptome-annotation/Gm.cds-uniprot_blastx.tab
tr '|' '\t' < ../output/03-transcriptome-annotation/Gm.cds-uniprot_blastx.tab \
> ../output/03-transcriptome-annotation/Gm.cds-uniprot_blastx_sep.tab
head -1 ../output/03-transcriptome-annotation/Gm.cds-uniprot_blastx_sep.tab
## Gma_1G0000010.1 sp P22735 TGM1_HUMAN 50.659 683 318 7 328 2334 109 786 0.0 688
<- read.csv("../output/03-transcriptome-annotation/Gm.cds-uniprot_blastx_sep.tab", sep = '\t', header = FALSE)
bltabl
<- read.csv("https://gannet.fish.washington.edu/seashell/snaps/uniprot_table_r2023_01.tab", sep = '\t', header = TRUE) spgo
datatable(head(bltabl), options = list(scrollX = TRUE, scrollY = "400px", scrollCollapse = TRUE, paging = FALSE))
datatable(head(spgo), options = list(scrollX = TRUE, scrollY = "400px", scrollCollapse = TRUE, paging = FALSE))
datatable(
left_join(bltabl, spgo, by = c("V3" = "Entry")) %>%
select(V1, V3, V13, Protein.names, Organism, Gene.Ontology..biological.process., Gene.Ontology.IDs)
# %>% mutate(V1 = str_replace_all(V1,pattern = "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed", replacement = "Ab"))
)